dict_dl/duden.py

from dict_dl import WordParser, Queue, cw, ot, uqall

class DudenParser(WordParser):
    """Extract dictionary data for a single word from a duden.de entry page.

    Every property below queries ``self.root`` with XPath expressions.
    ``self.root``, ``self.word`` and ``self.time`` are not assigned in this
    class; presumably ``WordParser`` provides them -- confirm in ``dict_dl``.
    The helpers ``ot``, ``cw`` and ``uqall`` are imported from ``dict_dl``;
    their exact behaviour is defined there, not here.
    """

    def __init__(self, word):
        """Initialise the base parser with *word* and the Duden URL prefix."""
        super().__init__(word, "https://www.duden.de/rechtschreibung/")

    @property
    def definitions(self):
        """Map each definition text to a list of example texts.

        Two page layouts are handled: a ``div#bedeutungen`` section whose
        definitions are ``div.enumeration__text`` elements, and a
        ``div#bedeutung`` section whose definitions are ``<p>`` elements.
        Both feed the same dictionary, which is passed through ``cw``.
        """
        # (section XPath, XPath of the definition elements inside it)
        layouts = (
            (".//div[@id='bedeutungen']", ".//div[@class='enumeration__text']"),
            (".//div[@id='bedeutung']", ".//p"),
        )
        defs = {}
        for section_path, heading_path in layouts:
            for section in self.root.findall(section_path):
                headings = [ot(h) for h in section.findall(heading_path)]
                example_lists = section.findall(".//ul[@class='note__list']")
                # Every definition gets an entry, even without examples.
                for heading in headings:
                    defs[heading] = []
                # NOTE(review): definitions and example lists are paired
                # purely by position. If a definition without examples
                # precedes one that has them, the examples would land on the
                # wrong definition -- verify against real page markup.
                for heading, examples in zip(headings, example_lists):
                    defs[heading] = [ot(li) for li in examples.findall(".//li")]
        return cw(defs)

    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` element, or ``[]``.

        The first and last character of the text are dropped (presumably
        the brackets enclosing the IPA string -- confirm).
        """
        for span in self.root.findall(".//span[@class='ipa']"):
            return ot(span)[1:-1]
        # NOTE(review): empty list rather than an empty string when absent.
        return []

    @property
    def neighbours(self):
        """Collect the last path segment of every ``/rechtschreibung/`` link.

        Any ``#fragment`` suffix is cut off. The collected list is passed
        through ``cw``.
        """
        links = []
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href")
            if href is None or "/rechtschreibung/" not in href:
                continue
            links.append(href.split("/")[-1].split("#")[0])
        return cw(links)

    @property
    def wendungen(self):
        """Collect ``<li>`` texts of the idioms/proverbs ``dl.note`` blocks.

        A block qualifies when its text contains the Duden heading
        "Wendungen, Redensarten, Sprichwörter".
        """
        wends = []
        for note in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in ot(note):
                wends.extend(ot(li) for li in note.findall(".//li"))
        return cw(wends)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val``.

        Returns ``[]`` when the page has no such element.
        """
        for value in self.root.findall(".//dd[@class='tuple__val']"):
            return " ".join(value.itertext())
        return []

    @property
    def history_and_etymology(self):
        """Return the text fragments of the first ``<p>`` under ``div#herkunft``.

        The fragments are passed through ``cw``. Returns ``None`` when the
        page has no such paragraph.
        """
        for paragraph in self.root.findall(".//div[@id='herkunft']//p"):
            return cw(list(paragraph.itertext()))
        return None

    @property
    def synonyms(self):
        """Collect the text fragments of all lexeme links under ``div#synonyme``."""
        syns = []
        for link in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(link.itertext())
        return cw(syns)

    def todict(self):
        """Bundle all extracted fields into ``{word: {...}}`` via ``uqall``.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
        """
        # Compute once: each property access re-walks the document tree.
        word_type = self.type
        definitions = self.definitions
        if not (word_type or definitions):
            # Raised explicitly (not via an ``assert`` statement) so the
            # check is not stripped under ``python -O``; the exception type
            # and message are the same as before.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall({
            self.word: {
                "type": word_type,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "history_and_etymology": self.history_and_etymology,
                "wendungen": self.wendungen,
                "time_of_retrieval": self.time,
            }
        })

# d = DudenParser("hinfallen")
# print(d.neighbours)
# print(d.todict())
# exit()

# Module-level driver: runs as soon as this file is executed or imported.
# The two string arguments are presumably the storage directory and the
# file-name suffix for saved entries -- confirm in dict_dl.Queue.
q = Queue(DudenParser, "de_duden/", "_duden.json")
# Presumably restores previously saved state before crawling -- confirm.
q.loadDB()

# No exit condition is visible here: the loop ends only if add_word() raises
# or the process is interrupted.
while True:
    q.add_word()
add MWThesaurus 2022-07-08 10:43:24 +00:00			`from dict_dl import WordParser, Queue, cw, ot, uqall`
Initial commit 2022-07-06 11:06:37 +00:00
			`class DudenParser(WordParser):`
			`def __init__(self, word):`
			`url_prefix= "https://www.duden.de/rechtschreibung/"`
			`super().__init__(word, url_prefix)`

			`@property`
			`def definitions(self):`
			`defs = {}`
			`texts = (e for e in self.root.findall(".//div[@id='bedeutungen']"))`
			`for e in texts:`
duden: "-" -> "_" 2022-07-07 15:54:11 +00:00			`for d in e.findall(".//div[@class='enumeration__text']"):`
			`defs[ot(d)] = []`
Initial commit 2022-07-06 11:06:37 +00:00			`for d, examples in zip(`
			`e.findall(".//div[@class='enumeration__text']"),`
			`e.findall(".//ul[@class='note__list']"),`
			`):`
			`defs[ot(d)] = [ot(li) for li in examples.findall(".//li")]`

			`texts = (e for e in self.root.findall(".//div[@id='bedeutung']"))`
			`for e in texts:`
			`for d in e.findall(".//p"):`
duden: "-" -> "_" 2022-07-07 15:54:11 +00:00			`defs[ot(d)] = []`
Initial commit 2022-07-06 11:06:37 +00:00			`for d, examples in zip(`
			`e.findall(".//p"), e.findall(".//ul[@class='note__list']")`
			`):`
duden: "-" -> "_" 2022-07-07 15:54:11 +00:00			`defs[ot(d)] = [ot(li) for li in examples.findall(".//li")]`
Initial commit 2022-07-06 11:06:37 +00:00
			`return cw(defs)`

			`@property`
			`def pronounciation(self):`
			`for e in self.root.findall(".//span[@class='ipa']"):`
			`ipa = ot(e)[1:-1]`
			`return ipa`
			`return []`

			`@property`
fix saving of snafu and redo 2022-07-07 17:08:30 +00:00			`def neighbours(self):`
			`neighbours = []`
			`for e in self.root.findall(".//a"):`
			`if "href" in e.attrib and "/rechtschreibung/" in e.attrib["href"]:`
			`link = e.attrib["href"].split("/")[-1].split("#")[0]`
			`neighbours.append(link)`
			`return cw(neighbours)`
Initial commit 2022-07-06 11:06:37 +00:00
			`@property`
			`def wendungen(self):`
			`wends = []`
			`for n in self.root.findall(".//dl[@class='note']"):`
			`if "Wendungen, Redensarten, Sprichwörter" in ot(n):`
			`wends.extend([ot(li) for li in n.findall(".//li")])`
			`return cw(wends)`

			`@property`
			`def type(self):`
			`for t in (`
			`" ".join([l for l in e.itertext()])`
			`for e in self.root.findall(".//dd[@class='tuple__val']")`
			`):`
			`return t`
			`return []`

			`@property`
			`def history_and_etymology(self):`
			`for e in self.root.findall(".//div[@id='herkunft']//p"):`
			`return cw([l for l in e.itertext()])`

			`@property`
			`def synonyms(self):`
			`syns = []`
			`for e in self.root.findall(`
			`".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"`
			`):`
			`syns.extend([l for l in e.itertext()])`
			`return cw(syns)`

			`def todict(self):`
			`assert (`
			`self.type or self.definitions`
			`), f"{self.time} {self.word}: type or definitions came back empty..."`
add MWThesaurus 2022-07-08 10:43:24 +00:00			`return uqall({`
Initial commit 2022-07-06 11:06:37 +00:00			`self.word: {`
			`"type": self.type,`
			`"definitions": self.definitions,`
			`"pronounciation": self.pronounciation,`
			`"synonyms": self.synonyms,`
			`"history_and_etymology": self.history_and_etymology,`
			`"wendungen": self.wendungen,`
			`"time_of_retrieval": self.time,`
			`}`
add MWThesaurus 2022-07-08 10:43:24 +00:00			`})`
Initial commit 2022-07-06 11:06:37 +00:00
fix saving of snafu and redo 2022-07-07 17:08:30 +00:00			`# d = DudenParser("hinfallen")`
			`# print(d.neighbours)`
			`# print(d.todict())`
			`# exit()`
Initial commit 2022-07-06 11:06:37 +00:00
			`q = Queue(DudenParser, "de_duden/", "_duden.json")`
			`q.loadDB()`

			`while True:`
			`q.add_word()`