"""Scrape Merriam-Webster thesaurus pages using the dict_dl helpers."""

from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text

# One row per output key: (key in the result, class suffix of the scored list
# to search, lozenge colour levels to accept). Rows and levels are listed in
# the order in which results are collected, which fixes the key order and the
# word order of the produced dictionaries.
# NOTE(review): the colour level presumably encodes relevance (4 = strongest);
# confirm against the page markup.
_RELATION_GROUPS = (
    ("synonyms", "sim-list-scored", (4, 3)),
    ("near synonyms", "sim-list-scored", (2, 1)),
    ("antonyms", "opp-list-scored", (4, 3)),
    ("near antonyms", "opp-list-scored", (2, 1)),
)


def _collect_entry(entry, thes):
    """Add the related words found in one thesaurus entry element to *thes*.

    Every ``span.dt`` inside *entry* registers a (fresh, empty) bucket in
    *thes* keyed by ``only_first_text`` of that span; the related words of the
    entry are then filed under the key of the last such span. If the entry has
    no ``span.dt`` at all, the words are filed under the empty-string key.
    """
    definition = ""
    for dt in entry.findall(".//span[@class='dt']"):
        definition = only_first_text(dt)
        # A repeated definition text deliberately starts over with an empty
        # bucket, exactly as a plain assignment would.
        thes[definition] = {}

    for label, list_class, levels in _RELATION_GROUPS:
        for level in levels:
            xpath = (
                f".//span[@class='thes-list {list_class}']"
                f"//span[@class='lozenge color-{level}']"
            )
            for lozenge in entry.findall(xpath):
                word = ot(lozenge)
                # Buckets are created lazily so an entry without a definition
                # span no longer fails with KeyError, while entries without any
                # related words still leave no empty-string key behind.
                thes.setdefault(definition, {}).setdefault(label, []).append(word)


class MWThesaurusParser(WordParser):
    """Parser for a single word's page on the Merriam-Webster thesaurus.

    Relies on attributes supplied by ``WordParser`` (``root``, ``word``,
    ``time``).
    NOTE(review): ``root`` is assumed to be the parsed page as an element tree
    supporting ``findall`` with XPath-like expressions -- confirm in dict_dl.
    """

    def __init__(self, word):
        """Initialise the base parser for *word* with the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix, clean=True)

    @property
    def thes(self):
        """Return ``{definition: {relation: [words, ...]}}`` for the page.

        Entries are looked up by id ``thesaurus-entry-<i>-<j>`` with both
        indices running from 1 to 9. Relation keys are ``"synonyms"``,
        ``"near synonyms"``, ``"antonyms"`` and ``"near antonyms"``; a key is
        only present when at least one word was found for it.
        """
        thes = {}
        for i in range(1, 10):
            for j in range(1, 10):
                entry_xpath = f".//div[@id='thesaurus-entry-{i}-{j}']"
                for entry in self.root.findall(entry_xpath):
                    _collect_entry(entry, thes)
        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated labels of the page's type links.

        Each ``a.important-blue-link`` element is passed through ``ot`` and
        then ``rb(..., "(", ")")``.
        NOTE(review): ``rb`` presumably strips the bracketed part of the link
        text -- confirm in dict_dl.
        """
        links = self.root.findall(".//a[@class='important-blue-link']")
        return sorted({rb(ot(link), "(", ")") for link in links})

    @property
    def neighbours(self):
        """Return the set of words this page links to under ``/thesaurus/``.

        The word is the last path segment of the link target with any
        ``#fragment`` or ``?query`` removed, passed through ``uq``.
        NOTE(review): ``uq`` presumably URL-unquotes -- confirm in dict_dl.
        """
        neighbours = set()
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href", "")
            if not href.startswith("/thesaurus/"):
                continue
            link = href.split("/")[-1].split("#")[0].split("?")[0]
            neighbours.add(uq(link))
        return neighbours

    def todict(self):
        """Return the scraped data for this word, passed through ``uqall``.

        The result maps the word to its thesaurus data and additionally holds
        the keys ``"type"`` and ``"time_of_retrieval"``.

        Raises:
            AssertionError: if both the type list and the thesaurus data are
                empty, i.e. nothing usable was scraped.
        """
        # Evaluate each property once: both rescan the document on every
        # access (``thes`` issues 81 tree searches).
        types = self.type
        thes = self.thes
        if not (types or thes):
            # Raised explicitly (not via ``assert``) so the check also runs
            # under ``python -O``; the exception type stays AssertionError.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall(
            {self.word: thes} | {"type": types, "time_of_retrieval": self.time}
        )


# Manual check of a single word (kept disabled):
# w = MWThesaurusParser("content")
# print(w.neighbours)
# print(w.todict())
# exit()

q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
q.loadDB()

# Manual retrieval of one specific entry (kept disabled):
# q.add_word("pretty much")
# exit()

# Keep calling the queue without an explicit word, indefinitely.
# NOTE(review): presumably add_word() picks the next pending word itself --
# confirm in dict_dl.Queue.
while True:
    q.add_word()