improve MW scraper

2023-09-05 14:00:29 +00:00 · 2023-09-05 14:00:29 +00:00 · 43665cff6d
commit 43665cff6d
parent 874d4c744a
9 changed files with 147 additions and 104 deletions
--- a/MW_thesaurus.py
+++ b/MW_thesaurus.py
@@ -1,6 +1,6 @@
 from itertools import chain

-from dict_dl import Queue, WordParser, ot, rb, uq, uqall
+from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text


 class MWThesaurusParser(WordParser):
@@ -12,64 +12,39 @@ class MWThesaurusParser(WordParser):
    def thes(self):
        thes = {}
        for i in range(1, 10):
-            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
-                for se in chain(
-                    entry.findall(".//div[@class='sb no-sn']"),
-                    entry.findall(".//div[@class='sb has-num']"),
+            for j in range(1, 10):
+                for entry in self.root.findall(
+                    f".//div[@id='thesaurus-entry-{i}-{j}']"
                ):
-                    for e in se.findall(".//span[@class='dt']"):
-                        examples = [ot(li) for li in e.findall(".//li")]
-                        for ul in e.findall(".//ul"):
-                            ul.clear()
-                        d = ot(e)
-                        thes[d] = {"examples": examples}
-                    thes[d]["synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["synonyms"].extend([
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ])
-                    thes[d]["near synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near synonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["near antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near antonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
+                    d = ""
+                    for e in entry.findall(".//span[@class='dt']"):
+                        d = only_first_text(e)
+                    thes[d] = {}
+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near synonyms"] = thes[d].get(
+                                "near synonyms", []
+                            ) + [ot(e)]

+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near antonyms"] = thes[d].get(
+                                "near antonyms", []
+                            ) + [ot(e)]
        return thes

    @property
@@ -92,13 +67,15 @@ class MWThesaurusParser(WordParser):
        assert (
            self.type or self.thes
        ), f"{self.time} {self.word}: type or definitions came back empty..."
-        return uqall({self.word: self.thes | {"type": self.type}})
+        return uqall(
+            {self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time}
+        )


-w = MWThesaurusParser("augur")
+# w = MWThesaurusParser("content")
 # print(w.neighbours)
-print(w.todict())
-exit()
+# print(w.todict())
+# exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()
--- a/analysis.py
+++ b/analysis.py
@@ -1,12 +1,25 @@
+from dict_dl import FullDictionary
+
 # import matplotlib.pyplot as plt
 # from PIL import Image
 # from wordcloud import STOPWORDS, WordCloud

-d = FullDictionary("en_merriam_webster/", "_mw.json")
+d = FullDictionary("en_MerriamWebster/", "_MW.json")
 # d = Dictionary("en_MW_thesaurus/", "_mwt.json")
 # d = Dictionary("de_duden/", "_duden.json")
 print(f"{d.readtime:.06f}")

+print(
+    sorted(
+        [
+            k
+            for k in d
+            if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
+            and len(k) > 2
+            and k[-1] not in string.ascii_uppercase
+        ]
+    )
+)
 # print([k for k in d if not all([c in string.ascii_letters for c in k])])
 print([k for k in d if "?" in k])
 exit()
--- a/d.py
+++ b/d.py
@@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))

 print(f"||||||||{query}||||||||")
 for k, v in d[query].items():
-    print(k,v)
+    print(k, v)
 #     if k != "type":
 #         table = Table(title=k)
 #         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
--- a/dict_dl.py
+++ b/dict_dl.py
@@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str:
    xml_str = re.sub(r"<>", "-->", xml_str)
    xml_str = remove_tag(xml_str, "head")
    # xml_str = remove_tag(xml_str)
-    with open("test.html", "w") as f:
-        f.write(xml_str)
+    # with open("test.html", "w") as f:
+    #     f.write(xml_str)
    return xml_str


--- a/duden.py
+++ b/duden.py
@@ -96,10 +96,10 @@ class DudenParser(WordParser):
        )


-# d = DudenParser("hinfallen")
-# print(d.neighbours)
-# print(d.todict())
-# exit()
+d = DudenParser("hineintauchen")
+print(d.neighbours)
+print(d.todict())
+exit()

 q = Queue(DudenParser, "de_Duden/", "_D.json")
 q.loadDB()
--- a/main.py
+++ b/main.py
@@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False):
        if pw:
            # ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
            ps = [
-                "".join([w.capitalize() for i,w in enumerate(p) if i > 0])
+                "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])
                + random_char()
                + f"{random.randint(0,999):03d}"
                for p in phrases
--- a/merriam_webster.py
+++ b/merriam_webster.py
@@ -17,11 +17,11 @@ class MerriamWebsterParser(WordParser):
                        definitions[ot(d)] = [
                            ot(ex) for ex in e.findall("./span[@class!='dtText']")
                        ]
-        if not definitions: # british spelling...
+        if not definitions:  # british spelling...
            for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
                for e in entry.findall(".//span[@class='cxl']"):
                    words = [ot(d) for d in entry.findall(".//a")]
-                    definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
+                    definitions[f'{ot(e)} {", ".join(words)}'] = []
        return cw(definitions)

    @property
--- a/mw_scraper.py
+++ b/mw_scraper.py
@@ -39,11 +39,6 @@ class Pronunciation(SQLModel, table=True):


 engine = create_engine(db_pass)
-SQLModel.metadata.create_all(engine)
-
-html_session = HTMLSession()
-
-QUEUE = {line.strip() for line in open("queue.db", "rt")}


 def add_word(word):
@@ -78,6 +73,7 @@ def add_word(word):
                .where(Sense.word == _word)
                .where(Sense.word_class == _class)
            ).one_or_none()
+
            if results:
                sense = results
            else:
@@ -97,14 +93,26 @@ def add_word(word):
                    for sents in dt.find("span.sents"):
                        _example.append(sents.text)
                    _examples.append("; ".join(_example))
-                session.add(
-                    Description(
-                        word=word.word,
-                        sense_id=sense.id,
-                        description="; ".join(_desc),
-                        examples=_examples,
+
+                _final_description = "; ".join(_desc)
+                results = session.exec(
+                    select(Description).where(
+                        Description.word == word.word,
+                        Description.description == _final_description,
                    )
-                )
+                ).one_or_none()
+                if results:
+                    continue
+                else:
+                    session.add(
+                        Description(
+                            word=word.word,
+                            sense_id=sense.id,
+                            description=_final_description,
+                            examples=_examples,
+                        )
+                    )
+
            for pron in c.find(
                "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
            ):
@@ -117,18 +125,28 @@ def add_word(word):
    return links


-def present():
+def presently_available():
    with Session(engine) as session:
        return session.exec(select(Word.word)).unique()


-while True:
-    try:
-        QUEUE |= add_word(random.choice(list(QUEUE)))
-        QUEUE -= set(present())
-        print(len(QUEUE))
-        sleep(random.random() * 5)
-    except KeyboardInterrupt:
-        with open("queue.db", "wt") as f:
-            f.write("\n".join(list(QUEUE)))
-        exit(0)
+if __name__ == "__main__":
+    SQLModel.metadata.create_all(engine)
+    html_session = HTMLSession()
+
+    QUEUE = {line.strip() for line in open("queue.db", "rt")}
+
+    while True:
+        try:
+            if len(QUEUE) < 20:
+                exit()
+            next_word = random.choice(list(QUEUE))
+            already_present = set(presently_available())
+            print(next_word, len(QUEUE), len(already_present))
+            QUEUE |= add_word(next_word)
+            QUEUE -= already_present | {next_word}
+            sleep(random.random() * 5)
+        except KeyboardInterrupt:
+            with open("queue.db", "wt") as f:
+                f.write("\n".join(list(QUEUE)))
+            exit(0)
--- a/t.py
+++ b/t.py
@@ -1,25 +1,60 @@
 #!/bin/python
+"""search the Merriam Webster Thesaurus with ease"""
+import argparse
 import os
-import sys
 from itertools import zip_longest

 from rich.console import Console
 from rich.table import Table

 from dict_dl import DictFile
+import string

-if len(sys.argv) < 2:
-    query = next(sys.stdin).strip()
-else:
-    query = sys.argv[1].strip()
-prefix = query[:3]
+letters = string.ascii_lowercase
+prefix_length = 3
+unusual = (
+    lambda prefix: not all([c in letters for c in prefix.lower()])
+    or len(prefix) < prefix_length
+)

+parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus")
+parser.add_argument("-p", "--preview", action="store_true", help="FZF preview")
+parser.add_argument("query", type=str, help="query")
+
+args = parser.parse_args()
+prefix = args.query[:prefix_length].lower()
+
+if unusual(prefix):
+    prefix = "_" * prefix_length
 d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))

-print(f"||||||||{query}||||||||")
+if args.preview:
+    for k, v in d[args.query].items():
+        if k == "type":
+            word_type = k
+        else:
+            syns = v["synonyms"]
+            nsyns = v["related" if "related" in v else "near synonyms"]
+            nants = v["near antonyms"]
+            ants = v["antonyms"]
+            print(f"> {k}")
+            if syns:
+                print("  SYNONYMS\n  ", ", ".join(syns))
+            if nsyns:
+                print("  NEAR SYNONYMS\n  ", ", ".join(nsyns))
+            if nants:
+                print("  NEAR ANTONYMS\n  ", ", ".join(nants))
+            if ants:
+                print("  ANTONYMS\n  ", ", ".join(ants))
+        print()
+    exit()
+
+print(f"||||||||{args.query}||||||||")
 print()
-for k, v in d[query].items():
-    if k != "type":
+for k, v in d[args.query].items():
+    if k == "type":
+        word_type = k
+    else:
        table = Table(title=k)
        table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
        table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)