state of the parser

2022-11-15 11:41:33 +00:00 · 2022-11-15 11:41:33 +00:00 · f65a677cb4
commit f65a677cb4
parent 9b33e9add8
4 changed files with 77 additions and 22 deletions
--- a/MW_thesaurus.py
+++ b/MW_thesaurus.py
@@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall
 class MWThesaurusParser(WordParser):
    def __init__(self, word):
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
-        super().__init__(word, url_prefix)
+        super().__init__(word, url_prefix, clean=True)

    @property
    def thes(self):
@@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser):
    @property
    def type(self):
        types = set()
-        for e in self.root.findall(
-            ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
-        ):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
            types.add(rb(ot(e), "(", ")"))
        return sorted(types)

@@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser):
        return uqall({self.word: self.thes | {"type": self.type}})


-# w = MWThesaurusParser("coffining")
-# print(w.todict())
-# exit()
+w = MWThesaurusParser("augur")
+# print(w.neighbours)
+print(w.todict())
+exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()
--- a/d.py
+++ b/d.py
@@ -0,0 +1,36 @@
+#!/bin/python
+import os
+import sys
+from itertools import zip_longest
+
+from rich.console import Console
+from rich.table import Table
+
+from dict_dl import DictFile
+
+if len(sys.argv) < 2:
+    query = next(sys.stdin).strip()
+else:
+    query = sys.argv[1].strip()
+prefix = query[:3]
+
+d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
+
+print(f"||||||||{query}||||||||")
+for k, v in d[query].items():
+    print(k,v)
+#     if k != "type":
+#         table = Table(title=k)
+#         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("antonyms", justify="center", style="cyan", no_wrap=True)
+#         syns = v["synonyms"]
+#         nsyns = v["related" if "related" in v else "near synonyms"]
+#         ants = v["near antonyms"]
+#         nants = v["antonyms"]
+#         for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""):
+#             table.add_row(s, ns, na, a)
+
+#         console = Console()
+#         console.print(table)
--- a/dict_dl.py
+++ b/dict_dl.py
@@ -106,18 +106,24 @@ def only_text(e):
    return " ".join(all_text(e))


-def url2str(url: str) -> str:
+def url2str(url: str, clean=True) -> str:
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
    }
    # bad_html = requests.get(url, headers=headers)
    bad_html = requests.get(url)
-    tree = BeautifulSoup(bad_html.text, features="lxml")
-    xml_str = str(tree)
+    if clean:
+        tree = BeautifulSoup(bad_html.text, features="html.parser")
+        xml_str = str(tree)
+    else:
+        xml_str = bad_html.text
+
+    xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
+    xml_str = re.sub(r"<>", "-->", xml_str)
    xml_str = remove_tag(xml_str, "head")
-    xml_str = remove_tag(xml_str)
-    # with open("test.html", "w") as f:
-    #     f.write(xml_str)
+    # xml_str = remove_tag(xml_str)
+    with open("test.html", "w") as f:
+        f.write(xml_str)
    return xml_str


@@ -134,11 +140,11 @@ class WordParser:
    - self.neighbours = words found on the site
    - self.todict() = returning a dict with the parsed info"""

-    def __init__(self, word, url_prefix):
+    def __init__(self, word, url_prefix, clean=True):
        self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.word = uq(word)
        self.url = f"{url_prefix}{word}"
-        self.xml_string = url2str(self.url)
+        self.xml_string = url2str(self.url, clean=clean)
        self.root = ET.fromstring(self.xml_string)


@@ -149,6 +155,7 @@ class FileSet(set):
            super().__init__({line.strip() for line in open(self.file, "r")})
        else:
            super()
+        self -= {""}

    def load(self):
        if os.path.isfile(self.file):
@@ -156,10 +163,13 @@ class FileSet(set):
        else:
            super()

-    def save(self):
+    def save(self, sort=False):
        if self:
            with open(self.file, "w") as f:
-                f.write("\n".join([w for w in self if w]))
+                if sort:
+                    f.write("\n".join([w for w in sorted(self) if w]))
+                else:
+                    f.write("\n".join([w for w in self if w]))

    def append(self):
        if self and os.path.isfile(self.file):
@@ -215,8 +225,8 @@ class Queue:
        )

    def wait(self):
-        if int(time.strftime("%M")) % 10 == 0:# cron job
-            self.words.save()
+        if int(time.time()) % 10 == 0:  # cron job
+            self.words.save(sort=True)
            self.queue.save()
        self.time_exponent = abs(self.time_exponent)
        a = self.time_base**self.time_exponent
@@ -226,7 +236,11 @@ class Queue:
    def loadDB(self):
        for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
            with open(db_file, "r") as f:
-                self.words |= set(json.load(f).keys())
+                try:
+                    self.words |= set(json.load(f).keys())
+                except json.decoder.JSONDecodeError:
+                    print(db_file, " corrupted")
+                    exit()

    def pick_random(self):
        self.redo.load()
--- a/merriam_webster.py
+++ b/merriam_webster.py
@@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser):
                        definitions[ot(d)] = [
                            ot(ex) for ex in e.findall("./span[@class!='dtText']")
                        ]
+        if not definitions: # british spelling...
+            for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
+                for e in entry.findall(".//span[@class='cxl']"):
+                    words = [ot(d) for d in entry.findall(".//a")]
+                    definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
        return cw(definitions)

    @property
@@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser):
    @property
    def type(self):
        types = set()
-        for e in self.root.findall(".//*[@class='fl']"):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
            types.add(rb(ot(e), "(", ")"))
        return sorted(types)

@@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser):
        )


-# testword = "revivalist"
+# testword = "optimize"
 # d = MerriamWebsterParser(testword)
+# # print(d.definitions)
 # print(d.neighbours)
 # word_dict = d.todict()
 # for k, v in word_dict[testword].items():