import re
from time import time

from dict_dl import Queue, WordParser, cw, ot, rb


class MerriamWebsterParser(WordParser):
    """Extract dictionary data for one word from a Merriam-Webster page.

    Every property queries ``self.root`` with ElementTree-style path
    expressions.  ``self.root``, ``self.word`` and ``self.time`` are not set
    in this class -- presumably ``WordParser.__init__`` provides them; verify
    against ``dict_dl``.

    NOTE(review): ``cw``, ``ot`` and ``rb`` are opaque helpers imported from
    ``dict_dl``.  From their use here ``ot`` turns an element into text,
    ``rb`` takes a text plus the delimiters "(" and ")", and ``cw``
    post-processes a finished collection -- confirm the exact semantics in
    ``dict_dl`` before relying on them.
    """

    def __init__(self, word):
        """Create a parser for ``word`` using the Merriam-Webster URL prefix."""
        url_prefix = "https://www.merriam-webster.com/dictionary/"
        super().__init__(word, url_prefix)

    def _labelled_paragraphs(self, anchor_id, paragraph_class):
        """Map each paragraph text under an anchor div to its function label.

        Every ``<p class=paragraph_class>`` below ``<div id=anchor_id>`` first
        gets an empty label; paragraphs are then paired positionally with the
        ``<p class='function-label'>`` elements of the same div, and paired
        ones receive ``rb(ot(label), "(", ")")`` as their value.
        """
        labelled = {}
        for anchor in self.root.findall(f".//div[@id='{anchor_id}']"):
            paragraphs = anchor.findall(f".//p[@class='{paragraph_class}']")
            # Default first, so paragraphs without a matching label still
            # appear in the result.
            for paragraph in paragraphs:
                labelled[ot(paragraph)] = ""
            labels = anchor.findall(".//p[@class='function-label']")
            for label, paragraph in zip(labels, paragraphs):
                labelled[ot(paragraph)] = rb(ot(label), "(", ")")
        return cw(labelled)

    def _synonym_section_links(self, index):
        """Return the link texts of the ``index``-th list in the synonyms box.

        Looks at the ``<ul>`` children of ``<div id='synonyms-anchor'>`` and
        returns the first text chunk of every ``<a>`` inside the selected
        list, or ``[]`` when that list does not exist.
        """
        lists = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if len(lists) <= index:
            return []
        return [next(link.itertext()) for link in lists[index].findall(".//a")]

    @property
    def definitions(self):
        """Definition texts mapped to the texts of their sibling spans.

        Scans ``dictionary-entry-1`` .. ``dictionary-entry-19``.  For each
        ``span.dt`` the key is the text of a direct ``span.dtText`` child and
        the value lists the texts of the direct span children whose class is
        not ``dtText``.  A repeated definition text overwrites the earlier
        entry.
        """
        definitions = {}
        for i in range(1, 20):
            for entry in self.root.findall(f".//div[@id='dictionary-entry-{i}']"):
                for sense in entry.findall(".//span[@class='dt']"):
                    # NOTE: the "!=" predicate needs Python >= 3.10 when
                    # self.root is an xml.etree.ElementTree element.
                    extras = sense.findall("./span[@class!='dtText']")
                    for text in sense.findall("./span[@class='dtText']"):
                        definitions[ot(text)] = [ot(extra) for extra in extras]
        return cw(definitions)

    @property
    def neighbors(self):
        """Texts of linked elements carrying ``data-ind`` 4 or 6.

        Only elements with an ``href`` attribute that does not contain
        "thesaurus" are used; their text chunks are joined with spaces and
        stripped.
        """
        neighbors = []
        for query in (".//*[@data-ind='4']", ".//*[@data-ind='6']"):
            for element in self.root.findall(query):
                href = element.attrib.get("href")
                if href is not None and "thesaurus" not in href:
                    neighbors.append(" ".join(element.itertext()).strip())
        return cw(neighbors)

    @property
    def pronounciation(self):
        """Distinct texts of all ``class='pr'`` elements, in document order.

        ``dict.fromkeys`` removes duplicates while keeping first-seen order,
        so the result is reproducible between runs (a plain ``set`` is not).
        """
        texts = (ot(element) for element in self.root.findall(".//*[@class='pr']"))
        return list(dict.fromkeys(texts))

    @property
    def type(self):
        """Sorted distinct ``class='fl'`` texts, each passed through ``rb``."""
        found = {
            rb(ot(element), "(", ")")
            for element in self.root.findall(".//*[@class='fl']")
        }
        return sorted(found)

    @property
    def examples(self):
        """Space-joined texts of the spans in the example-sentence sections."""
        queries = [
            ".//*[@class='in-sentences']/span",
            ".//*[@class='in-sentences read-more-content-hint-container']/span",
            ".//*[@class='on-web']/span",
            ".//*[@class='on-web read-more-content-hint-container']/span",
        ]
        examples = [
            " ".join(element.itertext())
            for query in queries
            for element in self.root.findall(query)
        ]
        return cw(examples)

    @property
    def first_known_use(self):
        """``ety-sl`` paragraphs under ``first-known-anchor`` with labels."""
        return self._labelled_paragraphs("first-known-anchor", "ety-sl")

    @property
    def synonym_discussion(self):
        """Text of the last ``synonym-discussion-anchor`` div, or ``""``."""
        syndisc = ""
        for element in self.root.findall(".//div[@id='synonym-discussion-anchor']"):
            syndisc = ot(element)
        return syndisc

    @property
    def history_and_etymology(self):
        """``et`` paragraphs under ``etymology-anchor`` with their labels."""
        return self._labelled_paragraphs("etymology-anchor", "et")

    @property
    def synonyms(self):
        """Link texts of the first list in the synonyms box (or ``[]``)."""
        return self._synonym_section_links(0)

    @property
    def antonyms(self):
        """Link texts of the second list in the synonyms box (or ``[]``)."""
        return self._synonym_section_links(1)

    def todict(self):
        """Return ``{word: {field: value, ...}}`` with every extracted field.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
                Raised explicitly rather than via ``assert`` so the check
                also runs under ``python -O``.
        """
        # Evaluate each property once; every access walks the tree again.
        word_types = self.type
        definitions = self.definitions
        if not (word_types or definitions):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return {
            self.word: {
                "type": word_types,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "antonyms": self.antonyms,
                "synonym_discussion": self.synonym_discussion,
                "examples": self.examples,
                "history_and_etymology": self.history_and_etymology,
                "first_known_use": self.first_known_use,
                "time_of_retrieval": self.time,
            }
        }


# Manual smoke test, kept disabled:
# testword = "sound"
# d = MerriamWebsterParser(testword)
# word_dict = d.todict()
# for k, v in word_dict[testword].items():
#     print(f"### {k} ###\n", v)
# exit()

# Script part: runs on import and loops forever, calling add_word() repeatedly.
q = Queue(MerriamWebsterParser, "en_merriam_webster/", "_mw.json", prefix_length=2)
q.loadDB()
while True:
    q.add_word()