diff --git a/MW_thesaurus.py b/MW_thesaurus.py index 5ab3227c..51838c8a 100644 --- a/MW_thesaurus.py +++ b/MW_thesaurus.py @@ -1,6 +1,6 @@ from itertools import chain -from dict_dl import Queue, WordParser, ot, rb, uq, uqall +from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text class MWThesaurusParser(WordParser): @@ -12,64 +12,39 @@ class MWThesaurusParser(WordParser): def thes(self): thes = {} for i in range(1, 10): - for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"): - for se in chain( - entry.findall(".//div[@class='sb no-sn']"), - entry.findall(".//div[@class='sb has-num']"), + for j in range(1, 10): + for entry in self.root.findall( + f".//div[@id='thesaurus-entry-{i}-{j}']" ): - for e in se.findall(".//span[@class='dt']"): - examples = [ot(li) for li in e.findall(".//li")] - for ul in e.findall(".//ul"): - ul.clear() - d = ot(e) - thes[d] = {"examples": examples} - thes[d]["synonyms"] = [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] - thes[d]["synonyms"].extend([ - ot(li) - for li in se.findall( - ".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ]) - thes[d]["near synonyms"] = [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] - thes[d]["near synonyms"].extend( - [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] - ) - thes[d]["near antonyms"] = [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] - thes[d]["near antonyms"].extend( - [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] - ) - thes[d]["antonyms"] = [ - ot(li) - for li in se.findall( - ".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a" - ) - ] + d = "" + for e in entry.findall(".//span[@class='dt']"): + d = only_first_text(e) + thes[d] = {} + for relev in [4, 3]: + for e in entry.findall( + f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']" + ): + thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)] + for relev in [2, 1]: + for e in entry.findall( + f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']" + ): + thes[d]["near synonyms"] = thes[d].get( + "near synonyms", [] + ) + [ot(e)] + for relev in [4, 3]: + for e in entry.findall( + f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']" + ): + thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)] + for relev in [2, 1]: + for e in entry.findall( + f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']" + ): + thes[d]["near antonyms"] = thes[d].get( + "near antonyms", [] + ) + [ot(e)] return thes @property @@ -92,13 +67,15 @@ class MWThesaurusParser(WordParser): assert ( self.type or self.thes ), f"{self.time} {self.word}: type or definitions came back empty..." - return uqall({self.word: self.thes | {"type": self.type}}) + return uqall( + {self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time} + ) -w = MWThesaurusParser("augur") +# w = MWThesaurusParser("content") # print(w.neighbours) -print(w.todict()) -exit() +# print(w.todict()) +# exit() q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json") q.loadDB() diff --git a/analysis.py b/analysis.py index 875c2bac..22742b93 100644 --- a/analysis.py +++ b/analysis.py @@ -1,12 +1,25 @@ +from dict_dl import FullDictionary + # import matplotlib.pyplot as plt # from PIL import Image # from wordcloud import STOPWORDS, WordCloud -d = FullDictionary("en_merriam_webster/", "_mw.json") +d = FullDictionary("en_MerriamWebster/", "_MW.json") # d = Dictionary("en_MW_thesaurus/", "_mwt.json") # d = Dictionary("de_duden/", "_duden.json") print(f"{d.readtime:.06f}") +print( + sorted( + [ + k + for k in d + if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()]) + and len(k) > 2 + and k[-1] not in string.ascii_uppercase + ] + ) +) # print([k for k in d if not all([c in string.ascii_letters for c in k])]) print([k for k in d if "?" in k]) exit() diff --git a/d.py b/d.py old mode 100755 new mode 100644 index 9b6d58e9..e8b365c1 --- a/d.py +++ b/d.py @@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json")) print(f"||||||||{query}||||||||") for k, v in d[query].items(): - print(k,v) + print(k, v) # if k != "type": # table = Table(title=k) # table.add_column("synonyms", justify="center", style="cyan", no_wrap=True) diff --git a/dict_dl.py b/dict_dl.py index 27326f31..a21ca7e6 100644 --- a/dict_dl.py +++ b/dict_dl.py @@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str: xml_str = re.sub(r"<>", "-->", xml_str) xml_str = remove_tag(xml_str, "head") # xml_str = remove_tag(xml_str) - with open("test.html", "w") as f: - f.write(xml_str) + # with open("test.html", "w") as f: + # f.write(xml_str) return xml_str diff --git a/duden.py b/duden.py index f51f2742..368aaf45 100644 --- a/duden.py +++ b/duden.py @@ -96,10 +96,10 @@ class DudenParser(WordParser): ) -# d = DudenParser("hinfallen") -# print(d.neighbours) -# print(d.todict()) -# exit() +d = DudenParser("hineintauchen") +print(d.neighbours) +print(d.todict()) +exit() q = Queue(DudenParser, "de_Duden/", "_D.json") q.loadDB() diff --git a/main.py b/main.py index 04516e56..ac57a0d2 100644 --- a/main.py +++ b/main.py @@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False): if pw: # ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ] ps = [ - "".join([w.capitalize() for i,w in enumerate(p) if i > 0]) + "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)]) + random_char() + f"{random.randint(0,999):03d}" for p in phrases diff --git a/merriam_webster.py b/merriam_webster.py index a2f0a655..b60c14b4 100644 --- a/merriam_webster.py +++ b/merriam_webster.py @@ -17,11 +17,11 @@ class MerriamWebsterParser(WordParser): definitions[ot(d)] = [ ot(ex) for ex in e.findall("./span[@class!='dtText']") ] - if not definitions: # british spelling... + if not definitions: # british spelling... for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"): for e in entry.findall(".//span[@class='cxl']"): words = [ot(d) for d in entry.findall(".//a")] - definitions[f'{ot(e)} {", ".join(words)}'] = [ ] + definitions[f'{ot(e)} {", ".join(words)}'] = [] return cw(definitions) @property diff --git a/mw_scraper.py b/mw_scraper.py index 11b45a7a..a9efbd57 100644 --- a/mw_scraper.py +++ b/mw_scraper.py @@ -39,11 +39,6 @@ class Pronunciation(SQLModel, table=True): engine = create_engine(db_pass) -SQLModel.metadata.create_all(engine) - -html_session = HTMLSession() - -QUEUE = {line.strip() for line in open("queue.db", "rt")} def add_word(word): @@ -78,6 +73,7 @@ def add_word(word): .where(Sense.word == _word) .where(Sense.word_class == _class) ).one_or_none() + if results: sense = results else: @@ -97,14 +93,26 @@ def add_word(word): for sents in dt.find("span.sents"): _example.append(sents.text) _examples.append("; ".join(_example)) - session.add( - Description( - word=word.word, - sense_id=sense.id, - description="; ".join(_desc), - examples=_examples, + + _final_description = "; ".join(_desc) + results = session.exec( + select(Description).where( + Description.word == word.word, + Description.description == _final_description, ) - ) + ).one_or_none() + if results: + continue + else: + session.add( + Description( + word=word.word, + sense_id=sense.id, + description=_final_description, + examples=_examples, + ) + ) + for pron in c.find( "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item" ): @@ -117,18 +125,28 @@ def add_word(word): return links -def present(): +def presently_available(): with Session(engine) as session: return session.exec(select(Word.word)).unique() -while True: - try: - QUEUE |= add_word(random.choice(list(QUEUE))) - QUEUE -= set(present()) - print(len(QUEUE)) - sleep(random.random() * 5) - except KeyboardInterrupt: - with open("queue.db", "wt") as f: - f.write("\n".join(list(QUEUE))) - exit(0) +if __name__ == "__main__": + SQLModel.metadata.create_all(engine) + html_session = HTMLSession() + + QUEUE = {line.strip() for line in open("queue.db", "rt")} + + while True: + try: + if len(QUEUE) < 20: + exit() + next_word = random.choice(list(QUEUE)) + already_present = set(presently_available()) + print(next_word, len(QUEUE), len(already_present)) + QUEUE |= add_word(next_word) + QUEUE -= already_present | {next_word} + sleep(random.random() * 5) + except KeyboardInterrupt: + with open("queue.db", "wt") as f: + f.write("\n".join(list(QUEUE))) + exit(0) diff --git a/t.py b/t.py index fe9feb0b..653f5fe7 100755 --- a/t.py +++ b/t.py @@ -1,25 +1,60 @@ #!/bin/python +"""search the Merriam Webster Thesaurus with ease""" +import argparse import os -import sys from itertools import zip_longest from rich.console import Console from rich.table import Table from dict_dl import DictFile +import string -if len(sys.argv) < 2: - query = next(sys.stdin).strip() -else: - query = sys.argv[1].strip() -prefix = query[:3] +letters = string.ascii_lowercase +prefix_length = 3 +unusual = ( + lambda prefix: not all([c in letters for c in prefix.lower()]) + or len(prefix) < prefix_length +) +parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus") +parser.add_argument("-p", "--preview", action="store_true", help="FZF preview") +parser.add_argument("query", type=str, help="query") + +args = parser.parse_args() +prefix = args.query[:prefix_length].lower() + +if unusual(prefix): + prefix = "_" * prefix_length d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json")) -print(f"||||||||{query}||||||||") +if args.preview: + for k, v in d[args.query].items(): + if k == "type": + word_type = k + else: + syns = v["synonyms"] + nsyns = v["related" if "related" in v else "near synonyms"] + nants = v["near antonyms"] + ants = v["antonyms"] + print(f"> {k}") + if syns: + print(" SYNONYMS\n ", ", ".join(syns)) + if nsyns: + print(" NEAR SYNONYMS\n ", ", ".join(nsyns)) + if nants: + print(" NEAR ANTONYMS\n ", ", ".join(nants)) + if ants: + print(" ANTONYMS\n ", ", ".join(ants)) + print() + exit() + +print(f"||||||||{args.query}||||||||") print() -for k, v in d[query].items(): - if k != "type": +for k, v in d[args.query].items(): + if k == "type": + word_type = k + else: table = Table(title=k) table.add_column("synonyms", justify="center", style="cyan", no_wrap=True) table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)