from itertools import chain

from dict_dl import Queue, WordParser, ot, rb, uq, uqall

# Output key -> class suffixes of the "thes-list" spans whose links are
# merged (in this order) under that key.
_THES_LIST_CLASSES = {
    "synonyms": ("syn-list", "phrase-list"),
    "near synonyms": ("rel-list", "sim-list"),
    "near antonyms": ("near-list", "opp-list"),
    "antonyms": ("ant-list",),
}


def _list_items(section, list_class):
    """Return ``ot(a)`` for every link in one ``thes-list`` span of *section*."""
    return [
        ot(link)
        for link in section.findall(
            f".//span[@class='thes-list {list_class}']"
            "/div[@class='thes-list-content synonyms_list']//li//a"
        )
    ]


class MWThesaurusParser(WordParser):
    """Parser for a Merriam-Webster thesaurus page of a single word.

    ``self.root``, ``self.word`` and ``self.time`` are used here but not
    defined in this file; presumably ``WordParser`` provides them (confirm
    against ``dict_dl``).
    """

    def __init__(self, word):
        """Set up the parser for *word* with the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix)

    @property
    def thes(self):
        """Return a dict mapping each sense's definition text to its word lists.

        Every value is a dict with the keys ``"examples"``, ``"synonyms"``,
        ``"near synonyms"``, ``"near antonyms"`` and ``"antonyms"``.
        Entries ``thesaurus-entry-1`` through ``thesaurus-entry-9`` are
        scanned.

        NOTE: this property calls ``ul.clear()`` on elements of
        ``self.root``, so it is not idempotent. A second read is expected to
        find the example lists already emptied, so callers should read it
        once and reuse the result.
        """
        thes = {}
        for i in range(1, 10):
            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
                for se in chain(
                    entry.findall(".//div[@class='sb no-sn']"),
                    entry.findall(".//div[@class='sb has-num']"),
                ):
                    for e in se.findall(".//span[@class='dt']"):
                        # Collect the examples first, then empty their lists
                        # so they are not part of the definition text below.
                        examples = [ot(li) for li in e.findall(".//li")]
                        for ul in e.findall(".//ul"):
                            ul.clear()
                        d = ot(e)
                        sense = {"examples": examples}
                        for key, list_classes in _THES_LIST_CLASSES.items():
                            sense[key] = list(
                                chain.from_iterable(
                                    _list_items(se, list_class)
                                    for list_class in list_classes
                                )
                            )
                        # A repeated definition text overwrites the earlier
                        # sense, as before.
                        thes[d] = sense
        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated labels from the entry headers.

        Each label is ``rb(ot(e), "(", ")")`` for a ``span.fl`` element;
        ``rb`` presumably strips parenthesised text (confirm in ``dict_dl``).
        """
        types = set()
        for e in self.root.findall(
            ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
        ):
            types.add(rb(ot(e), "(", ")"))
        return sorted(types)

    @property
    def neighbours(self):
        """Return the set of words linked from the page via ``/thesaurus/`` URLs.

        For each matching ``href`` the last path segment is taken, any
        ``#fragment`` and ``?query`` are cut off, and the rest is passed
        through ``uq`` (presumably URL-unquoting; confirm in ``dict_dl``).
        """
        neighbours = set()
        for e in self.root.findall(".//a"):
            href = e.attrib.get("href", "")
            if href.startswith("/thesaurus/"):
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    def todict(self):
        """Return ``uqall({word: thes | {"type": type}})`` for this word.

        Raises:
            AssertionError: if both ``type`` and ``thes`` are empty.
        """
        # Read each property exactly once: ``thes`` mutates the parsed tree,
        # so reading it a second time would return senses without examples.
        thes = self.thes
        word_type = self.type
        if not (word_type or thes):
            # Raised explicitly so the check is not stripped under ``-O``.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall({self.word: thes | {"type": word_type}})


# Manual check of a single word (kept disabled):
# w = MWThesaurusParser("coffining")
# print(w.todict())
# exit()

q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
q.loadDB()
# q.add_word("pretty much")
# exit()

# Runs until interrupted or until add_word() raises.
while True:
    q.add_word()