from dict_dl import Queue, WordParser, cw, ot, uqall


class DudenParser(WordParser):
    """Extract the individual fields of one Duden entry page.

    Entries are looked up under ``https://www.duden.de/rechtschreibung/``.

    ``self.root``, ``self.word`` and ``self.time`` are never assigned in this
    class; presumably ``WordParser`` provides them (with ``root`` being the
    parsed page tree) -- confirm against ``dict_dl``. The helpers ``ot``,
    ``cw`` and ``uqall`` are imported from ``dict_dl`` as well; their exact
    semantics are not visible from this file.
    """

    def __init__(self, word):
        """Create a parser for *word* using the Duden URL prefix."""
        url_prefix = "https://www.duden.de/rechtschreibung/"
        super().__init__(word, url_prefix)

    @property
    def definitions(self):
        """Map every definition text to the list of its example texts.

        Two page layouts are handled, in this order: a ``bedeutungen``
        section whose definitions are ``enumeration__text`` divs, and a
        ``bedeutung`` section whose definitions are plain ``<p>`` elements.

        Returns:
            ``cw`` applied to a dict ``{ot(definition): [ot(example), ...]}``.
            Definitions that get no note list paired with them map to ``[]``.
        """
        # (section selector, selector of the definition nodes inside it)
        layouts = (
            (".//div[@id='bedeutungen']", ".//div[@class='enumeration__text']"),
            (".//div[@id='bedeutung']", ".//p"),
        )
        note_list_xpath = ".//ul[@class='note__list']"

        defs = {}
        for section_xpath, text_xpath in layouts:
            for section in self.root.findall(section_xpath):
                text_nodes = section.findall(text_xpath)
                # Register every definition first so that the ones left
                # without a note list by the pairing below still show up.
                for node in text_nodes:
                    defs[ot(node)] = []
                # NOTE(review): zip pairs the i-th definition with the i-th
                # note list of the section. If an earlier definition has no
                # examples, later note lists shift onto the wrong
                # definition -- verify against real pages.
                note_lists = section.findall(note_list_xpath)
                for node, examples in zip(text_nodes, note_lists):
                    defs[ot(node)] = [ot(li) for li in examples.findall(".//li")]
        return cw(defs)

    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` element, trimmed.

        The first and last character of ``ot(element)`` are dropped
        (presumably the brackets enclosing the IPA string -- confirm).
        Returns ``[]`` when the page has no such element.
        """
        for e in self.root.findall(".//span[@class='ipa']"):
            return ot(e)[1:-1]
        return []

    @property
    def neighbours(self):
        """Collect the entry names linked from this page.

        Every ``<a>`` whose ``href`` contains ``/rechtschreibung/``
        contributes the last path segment of that link with any
        ``#fragment`` removed. The list is passed through ``cw``.
        """
        neighbours = []
        for e in self.root.findall(".//a"):
            href = e.attrib.get("href", "")
            if "/rechtschreibung/" in href:
                neighbours.append(href.split("/")[-1].split("#")[0])
        return cw(neighbours)

    @property
    def wendungen(self):
        """Collect the list items of the idioms/proverbs note blocks.

        Only ``dl.note`` elements whose text contains
        "Wendungen, Redensarten, Sprichwörter" are considered.
        """
        wends = []
        for n in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in ot(n):
                wends.extend(ot(li) for li in n.findall(".//li"))
        return cw(wends)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val``.

        Returns ``[]`` when the page has no such element.
        """
        for e in self.root.findall(".//dd[@class='tuple__val']"):
            return " ".join(e.itertext())
        return []

    @property
    def history_and_etymology(self):
        """Return the text fragments of the first ``<p>`` under ``herkunft``.

        The fragments are passed through ``cw``. When the page has no such
        paragraph the method falls off the end and returns ``None``.
        """
        for e in self.root.findall(".//div[@id='herkunft']//p"):
            return cw(list(e.itertext()))

    @property
    def synonyms(self):
        """Collect the text fragments of all lexeme links under ``synonyme``."""
        syns = []
        for e in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(e.itertext())
        return cw(syns)

    def todict(self):
        """Bundle all extracted fields into ``{word: {...}}``.

        Returns:
            ``uqall`` applied to a dict with the single key ``self.word``.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
        """
        # Each property re-walks the tree on every access, so read the two
        # that are needed for both the check and the result only once.
        entry_type = self.type
        definitions = self.definitions

        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if not (entry_type or definitions):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )

        return uqall(
            {
                self.word: {
                    "type": entry_type,
                    "definitions": definitions,
                    "pronounciation": self.pronounciation,
                    "synonyms": self.synonyms,
                    "history_and_etymology": self.history_and_etymology,
                    "wendungen": self.wendungen,
                    "time_of_retrieval": self.time,
                }
            }
        )


# Parse a single entry, print its neighbours and its dict form, then stop.
d = DudenParser("hineintauchen")
print(d.neighbours)
print(d.todict())
exit()

# NOTE(review): everything below is unreachable while the exit() above is in
# place; remove that call to run the queue loop.
q = Queue(DudenParser, "de_Duden/", "_D.json")
q.loadDB()
while True:
    q.add_word()