From f65a677cb4e726f2d77d249523814865614df02a Mon Sep 17 00:00:00 2001
From: julius <julius@ju1ius.xyz>
Date: Tue, 15 Nov 2022 11:41:33 +0000
Subject: [PATCH] WIP parser state: clean flag, d.py lookup, sorted saves

---
 MW_thesaurus.py    | 13 ++++++-------
 d.py               | 36 ++++++++++++++++++++++++++++++++++++
 dict_dl.py         | 40 +++++++++++++++++++++++++++-------------
 merriam_webster.py | 10 ++++++++--
 4 files changed, 77 insertions(+), 22 deletions(-)
 create mode 100755 d.py

diff --git a/MW_thesaurus.py b/MW_thesaurus.py
index 33fb8422..5ab3227c 100644
--- a/MW_thesaurus.py
+++ b/MW_thesaurus.py
@@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall
 class MWThesaurusParser(WordParser):
     def __init__(self, word):
         url_prefix = "https://www.merriam-webster.com/thesaurus/"
-        super().__init__(word, url_prefix)
+        super().__init__(word, url_prefix, clean=True)
 
     @property
     def thes(self):
@@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser):
     @property
     def type(self):
         types = set()
-        for e in self.root.findall(
-            ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
-        ):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
             types.add(rb(ot(e), "(", ")"))
         return sorted(types)
 
@@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser):
         return uqall({self.word: self.thes | {"type": self.type}})
 
 
-# w = MWThesaurusParser("coffining")
-# print(w.todict())
-# exit()
+w = MWThesaurusParser("augur")
+# print(w.neighbours)
+print(w.todict())
+exit()  # NOTE(review): debug stop; nothing below this line runs while it is active
 
 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()
diff --git a/d.py b/d.py
new file mode 100755
index 00000000..9b6d58e9
--- /dev/null
+++ b/d.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""Print the entry stored for a word in the en_MerriamWebster JSON files."""
+import os
+import sys
+from itertools import zip_longest
+
+from rich.console import Console
+from rich.table import Table
+
+from dict_dl import DictFile
+
+# Word to look up: first CLI argument, else the first line of stdin.
+query = (sys.argv[1] if len(sys.argv) > 1 else sys.stdin.readline()).strip()
+if not query:
+    sys.exit("usage: d.py WORD  (or pipe WORD on stdin)")
+prefix = query[:3]  # the file name below uses the word's first three characters
+d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
+
+print(f"||||||||{query}||||||||")
+for k, v in d[query].items():
+    print(k, v)
+#     if k != "type":
+#         table = Table(title=k)
+#         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True)
+#         table.add_column("antonyms", justify="center", style="cyan", no_wrap=True)
+#         syns = v["synonyms"]
+#         nsyns = v["related" if "related" in v else "near synonyms"]
+#         ants = v["near antonyms"]
+#         nants = v["antonyms"]
+#         for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""):
+#             table.add_row(s, ns, na, a)
+
+#         console = Console()
+#         console.print(table)
diff --git a/dict_dl.py b/dict_dl.py
index 4fd74f19..27326f31 100644
--- a/dict_dl.py
+++ b/dict_dl.py
@@ -106,18 +106,24 @@ def only_text(e):
     return " ".join(all_text(e))
 
 
-def url2str(url: str) -> str:
+def url2str(url: str, clean: bool = True) -> str:
+    """Fetch *url* and return its markup (BeautifulSoup-normalized if *clean*)."""
     headers = {
         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
     }
     # bad_html = requests.get(url, headers=headers)
     bad_html = requests.get(url)
-    tree = BeautifulSoup(bad_html.text, features="lxml")
-    xml_str = str(tree)
+    if clean:
+        xml_str = str(BeautifulSoup(bad_html.text, features="html.parser"))
+    else:
+        xml_str = bad_html.text
+
+    # NOTE(review): this also overwrites the character on each side of the "--".
+    xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
+    xml_str = xml_str.replace("<>", "-->")
     xml_str = remove_tag(xml_str, "head")
-    xml_str = remove_tag(xml_str)
-    # with open("test.html", "w") as f:
-    #     f.write(xml_str)
+    with open("test.html", "w", encoding="utf-8") as f:  # debug dump, overwritten per call
+        f.write(xml_str)
     return xml_str
 
 
@@ -134,11 +140,11 @@ class WordParser:
     - self.neighbours = words found on the site
     - self.todict() = returning a dict with the parsed info"""
 
-    def __init__(self, word, url_prefix):
+    def __init__(self, word, url_prefix, clean: bool = True):
         self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
         self.word = uq(word)
         self.url = f"{url_prefix}{word}"
-        self.xml_string = url2str(self.url)
+        self.xml_string = url2str(self.url, clean=clean)  # network fetch; clean is passed through
         self.root = ET.fromstring(self.xml_string)
 
 
@@ -149,6 +155,7 @@ class FileSet(set):
             super().__init__({line.strip() for line in open(self.file, "r")})
         else:
             super()
+        self -= {""}
 
     def load(self):
         if os.path.isfile(self.file):
@@ -156,10 +163,13 @@ class FileSet(set):
         else:
             super()
 
-    def save(self):
+    def save(self, sort: bool = False) -> None:
+        """Write the words to self.file, one per line; no-op when the set is empty."""
         if self:
             with open(self.file, "w") as f:
-                f.write("\n".join([w for w in self if w]))
+                # Sets iterate in arbitrary order; sort=True writes sorted lines.
+                words = sorted(self) if sort else self
+                f.write("\n".join(w for w in words if w))
 
     def append(self):
         if self and os.path.isfile(self.file):
@@ -215,8 +225,8 @@ class Queue:
         )
 
     def wait(self):
-        if int(time.strftime("%M")) % 10 == 0:# cron job
-            self.words.save()
+        if int(time.time()) % 10 == 0:  # cron job
+            self.words.save(sort=True)
             self.queue.save()
         self.time_exponent = abs(self.time_exponent)
         a = self.time_base**self.time_exponent
@@ -226,7 +236,11 @@ class Queue:
     def loadDB(self):
+        """Add the keys of every ``*{suffix}`` JSON file in dir_prefix to self.words."""
         for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
             with open(db_file, "r") as f:
-                self.words |= set(json.load(f).keys())
+                try:
+                    self.words |= set(json.load(f).keys())
+                except json.JSONDecodeError as err:  # corrupt file: abort, exit status 1
+                    raise SystemExit(f"{db_file} corrupted: {err}") from err
 
     def pick_random(self):
         self.redo.load()
diff --git a/merriam_webster.py b/merriam_webster.py
index 722d9b18..a6a950e6 100644
--- a/merriam_webster.py
+++ b/merriam_webster.py
@@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser):
                         definitions[ot(d)] = [
                             ot(ex) for ex in e.findall("./span[@class!='dtText']")
                         ]
+        if not definitions: # british spelling...
+            for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
+                for e in entry.findall(".//span[@class='cxl']"):
+                    words = [ot(d) for d in entry.findall(".//a")]
+                    definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
         return cw(definitions)
 
     @property
@@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser):
     @property
     def type(self):
         types = set()
-        for e in self.root.findall(".//*[@class='fl']"):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
             types.add(rb(ot(e), "(", ")"))
         return sorted(types)
 
@@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser):
         )
 
 
-# testword = "revivalist"
+# testword = "optimize"
 # d = MerriamWebsterParser(testword)
+# # print(d.definitions)
 # print(d.neighbours)
 # word_dict = d.todict()
 # for k, v in word_dict[testword].items():