From f65a677cb4e726f2d77d249523814865614df02a Mon Sep 17 00:00:00 2001 From: julius Date: Tue, 15 Nov 2022 11:41:33 +0000 Subject: [PATCH] state of the parser --- MW_thesaurus.py | 13 ++++++------- d.py | 36 ++++++++++++++++++++++++++++++++++++ dict_dl.py | 40 +++++++++++++++++++++++++++------------- merriam_webster.py | 10 ++++++++-- 4 files changed, 77 insertions(+), 22 deletions(-) create mode 100755 d.py diff --git a/MW_thesaurus.py b/MW_thesaurus.py index 33fb8422..5ab3227c 100644 --- a/MW_thesaurus.py +++ b/MW_thesaurus.py @@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall class MWThesaurusParser(WordParser): def __init__(self, word): url_prefix = "https://www.merriam-webster.com/thesaurus/" - super().__init__(word, url_prefix) + super().__init__(word, url_prefix, clean=True) @property def thes(self): @@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser): @property def type(self): types = set() - for e in self.root.findall( - ".//div[@class='row entry-header thesaurus']//span[@class='fl']" - ): + for e in self.root.findall(".//a[@class='important-blue-link']"): types.add(rb(ot(e), "(", ")")) return sorted(types) @@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser): return uqall({self.word: self.thes | {"type": self.type}}) -# w = MWThesaurusParser("coffining") -# print(w.todict()) -# exit() +w = MWThesaurusParser("augur") +# print(w.neighbours) +print(w.todict()) +exit() q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json") q.loadDB() diff --git a/d.py b/d.py new file mode 100755 index 00000000..9b6d58e9 --- /dev/null +++ b/d.py @@ -0,0 +1,36 @@ +#!/bin/python +import os +import sys +from itertools import zip_longest + +from rich.console import Console +from rich.table import Table + +from dict_dl import DictFile + +if len(sys.argv) < 2: + query = next(sys.stdin).strip() +else: + query = sys.argv[1].strip() +prefix = query[:3] + +d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json")) + +print(f"||||||||{query}||||||||") +for k, v in d[query].items(): + print(k,v) +# if k != "type": +# table = Table(title=k) +# table.add_column("synonyms", justify="center", style="cyan", no_wrap=True) +# table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True) +# table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True) +# table.add_column("antonyms", justify="center", style="cyan", no_wrap=True) +# syns = v["synonyms"] +# nsyns = v["related" if "related" in v else "near synonyms"] +# ants = v["near antonyms"] +# nants = v["antonyms"] +# for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""): +# table.add_row(s, ns, na, a) + +# console = Console() +# console.print(table) diff --git a/dict_dl.py b/dict_dl.py index 4fd74f19..27326f31 100644 --- a/dict_dl.py +++ b/dict_dl.py @@ -106,18 +106,24 @@ def only_text(e): return " ".join(all_text(e)) -def url2str(url: str) -> str: +def url2str(url: str, clean=True) -> str: headers = { "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36" } # bad_html = requests.get(url, headers=headers) bad_html = requests.get(url) - tree = BeautifulSoup(bad_html.text, features="lxml") - xml_str = str(tree) + if clean: + tree = BeautifulSoup(bad_html.text, features="html.parser") + xml_str = str(tree) + else: + xml_str = bad_html.text + + xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str) + xml_str = re.sub(r"<>", "-->", xml_str) xml_str = remove_tag(xml_str, "head") - xml_str = remove_tag(xml_str) - # with open("test.html", "w") as f: - # f.write(xml_str) + # xml_str = remove_tag(xml_str) + with open("test.html", "w") as f: + f.write(xml_str) return xml_str @@ -134,11 +140,11 @@ class WordParser: - self.neighbours = words found on the site - self.todict() = returning a dict with the parsed info""" - def __init__(self, word, url_prefix): + def __init__(self, word, url_prefix, clean=True): self.time = datetime.now().strftime("%Y%m%d-%H%M%S") self.word = uq(word) self.url = f"{url_prefix}{word}" - self.xml_string = url2str(self.url) + self.xml_string = url2str(self.url, clean=clean) self.root = ET.fromstring(self.xml_string) @@ -149,6 +155,7 @@ class FileSet(set): super().__init__({line.strip() for line in open(self.file, "r")}) else: super() + self -= {""} def load(self): if os.path.isfile(self.file): @@ -156,10 +163,13 @@ class FileSet(set): else: super() - def save(self): + def save(self, sort=False): if self: with open(self.file, "w") as f: - f.write("\n".join([w for w in self if w])) + if sort: + f.write("\n".join([w for w in sorted(self) if w])) + else: + f.write("\n".join([w for w in self if w])) def append(self): if self and os.path.isfile(self.file): @@ -215,8 +225,8 @@ class Queue: ) def wait(self): - if int(time.strftime("%M")) % 10 == 0:# cron job - self.words.save() + if int(time.time()) % 10 == 0: # cron job + self.words.save(sort=True) self.queue.save() self.time_exponent = abs(self.time_exponent) a = self.time_base**self.time_exponent @@ -226,7 +236,11 @@ class Queue: def loadDB(self): for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"): with open(db_file, "r") as f: - self.words |= set(json.load(f).keys()) + try: + self.words |= set(json.load(f).keys()) + except json.decoder.JSONDecodeError: + print(db_file, " corrupted") + exit() def pick_random(self): self.redo.load() diff --git a/merriam_webster.py b/merriam_webster.py index 722d9b18..a6a950e6 100644 --- a/merriam_webster.py +++ b/merriam_webster.py @@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser): definitions[ot(d)] = [ ot(ex) for ex in e.findall("./span[@class!='dtText']") ] + if not definitions: # british spelling... + for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"): + for e in entry.findall(".//span[@class='cxl']"): + words = [ot(d) for d in entry.findall(".//a")] + definitions[f'{ot(e)} {", ".join(words)}'] = [ ] return cw(definitions) @property @@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser): @property def type(self): types = set() - for e in self.root.findall(".//*[@class='fl']"): + for e in self.root.findall(".//a[@class='important-blue-link']"): types.add(rb(ot(e), "(", ")")) return sorted(types) @@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser): ) -# testword = "revivalist" +# testword = "optimize" # d = MerriamWebsterParser(testword) +# # print(d.definitions) # print(d.neighbours) # word_dict = d.todict() # for k, v in word_dict[testword].items():