"""Scrape word entries from merriam-webster.com using the dict_dl helpers.

Running this module starts an endless download loop (see bottom of file).
"""

from dict_dl import Queue, WordParser, cw, ot, rb, uq, uqall


class MerriamWebsterParser(WordParser):
    """Parser for a single Merriam-Webster dictionary page.

    The properties below query ``self.root`` (provided by ``WordParser``;
    presumably the parsed HTML tree of the page -- confirm in dict_dl) with
    XPath-style expressions.

    NOTE(review): ``ot``, ``rb``, ``cw``, ``uq`` and ``uqall`` come from
    ``dict_dl`` and are not visible here.  From their usage they appear to
    be "element text", "remove brackets", "clean", "unquote" and
    "unquote recursively" -- confirm against dict_dl before relying on it.
    """

    def __init__(self, word):
        """Create a parser for *word* under the Merriam-Webster URL prefix."""
        url_prefix = "https://www.merriam-webster.com/dictionary/"
        super().__init__(word, url_prefix)

    @property
    def definitions(self):
        """Map each definition text to the texts of its sibling spans.

        Falls back to cross-reference entries (``span.cxl``) of the first
        dictionary entry when no regular definition was found.
        """
        definitions = {}
        # Entries are addressed by id "dictionary-entry-1" .. "-19".
        for i in range(1, 20):
            for entry in self.root.findall(f".//div[@id='dictionary-entry-{i}']"):
                for dt in entry.findall(".//span[@class='dt']"):
                    for text in dt.findall("./span[@class='dtText']"):
                        # Every non-dtText child span of the same <span class="dt">
                        # is stored alongside the definition text.
                        definitions[ot(text)] = [
                            ot(ex) for ex in dt.findall("./span[@class!='dtText']")
                        ]

        if not definitions:
            # british spelling...
            for entry in self.root.findall(".//div[@id='dictionary-entry-1']"):
                for ref in entry.findall(".//span[@class='cxl']"):
                    # NOTE(review): links are collected from the whole entry,
                    # not only from this cross-reference span -- confirm intent.
                    words = [ot(a) for a in entry.findall(".//a")]
                    joined = ", ".join(words)
                    definitions[f"{ot(ref)} {joined}"] = []

        return cw(definitions)

    @property
    def neighbours(self):
        """Return the set of words linked via ``/dictionary/...`` hrefs."""
        neighbours = set()
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href")
            if href is not None and href.startswith("/dictionary/"):
                # Keep the last path segment, without fragment or query string.
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    @property
    def pronounciation(self):
        """Return the distinct texts of all ``span.mw.no-badge`` elements.

        Duplicates are dropped while keeping first-seen (document) order, so
        the result is stable between runs.
        """
        found = [ot(e) for e in self.root.findall(".//span[@class='mw no-badge']")]
        return list(dict.fromkeys(found))

    @property
    def type(self):
        """Return the sorted, distinct texts of the ``important-blue-link`` anchors."""
        types = {
            rb(ot(e), "(", ")")
            for e in self.root.findall(".//a[@class='important-blue-link']")
        }
        return sorted(types)

    @property
    def examples(self):
        """Collect example sentences from the in-sentences and on-web sections."""
        queries = [
            ".//*[@class='in-sentences']/span",
            ".//*[@class='in-sentences read-more-content-hint-container']/span",
            ".//*[@class='on-web']/span",
            ".//*[@class='on-web read-more-content-hint-container']/span",
        ]
        examples = [
            # Join all text fragments of the span (including nested tags).
            " ".join(span.itertext())
            for query in queries
            for span in self.root.findall(query)
        ]
        return cw(examples)

    @property
    def first_known_use(self):
        """Map each first-known-use text to its function label ("" if unlabeled)."""
        uses = {}
        for section in self.root.findall(
            ".//div[@class='first-known-content-section']"
        ):
            for use in section.findall(".//p[@class='ety-sl pb-3']"):
                uses[ot(use)] = ""
            # Pair labels with uses positionally; zip stops at the shorter list.
            for label, use in zip(
                section.findall(".//p[@class='function-label']"),
                section.findall(".//p[@class='ety-sl']"),
            ):
                uses[ot(use)] = rb(ot(label), "(", ")")
        return cw(uses)

    @property
    def synonym_discussion(self):
        """Return the text of the last synonym-discussion block, or ""."""
        syndisc = ""
        for e in self.root.findall(".//div[@id='synonym-discussion-anchor']"):
            syndisc = ot(e)
        return syndisc

    @property
    def history_and_etymology(self):
        """Map each etymology text to its function label ("" if unlabeled)."""
        hande = {}
        for section in self.root.findall(
            ".//div[@class='etymology-content-section']"
        ):
            for ety in section.findall(".//p[@class='et']"):
                hande[ot(ety)] = ""
            # Labeled etymologies overwrite the "" placeholder set above.
            for label, ety in zip(
                section.findall(".//p[@class='function-label']"),
                section.findall(".//p[@class='et']"),
            ):
                hande[ot(ety)] = rb(ot(label), "(", ")")
        return cw(hande)

    @property
    def synonyms(self):
        """Return the link texts of the first list under ``synonyms-anchor``."""
        uls = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if uls:
            return [next(link.itertext()) for link in uls[0].findall(".//a")]
        return []

    @property
    def antonyms(self):
        """Return the link texts of the second list under ``synonyms-anchor``."""
        uls = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if len(uls) > 1:
            return [next(link.itertext()) for link in uls[1].findall(".//a")]
        return []

    def todict(self):
        """Return ``{word: {...}}`` with the scraped fields, passed through ``uqall``.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
        """
        # Evaluate the (DOM-traversing) properties once and reuse the results.
        word_types = self.type
        definitions = self.definitions
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``
        # while callers still see the same exception type.
        if not (word_types or definitions):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall(
            {
                self.word: {
                    "type": word_types,
                    "definitions": definitions,
                    "pronounciation": self.pronounciation,
                    # "synonyms": self.synonyms,
                    # "antonyms": self.antonyms,
                    # "synonym_discussion": self.synonym_discussion,
                    "examples": self.examples,
                    "history_and_etymology": self.history_and_etymology,
                    "first_known_use": self.first_known_use,
                    "time_of_retrieval": self.time,
                }
            }
        )


# testword = "domicile"
# d = MerriamWebsterParser(testword)
# # print(d.definitions)
# print(d.neighbours)
# word_dict = d.todict()
# for k, v in word_dict[testword].items():
#     print(f"### {k} ###\n", v)
# exit()

# Driver: hand the parser class to the dict_dl queue and process words forever.
q = Queue(MerriamWebsterParser, "en_MerriamWebster/", "_MW.json")
q.loadDB()
while True:
    q.add_word()