106 lines
3.2 KiB
Python
106 lines
3.2 KiB
Python
from dict_dl import Queue, WordParser, cw, ot, uqall
|
|
|
|
|
|
class DudenParser(WordParser):
    """Scrape one entry of duden.de's "Rechtschreibung" dictionary.

    Each property runs an element query against ``self.root`` for one
    section of the entry page.  ``self.root``, ``self.word`` and
    ``self.time`` are not assigned in this class; presumably
    ``WordParser.__init__`` provides them (TODO confirm against dict_dl).
    The helpers ``ot``, ``cw`` and ``uqall`` are imported from ``dict_dl``;
    their exact behaviour is not visible here.
    """

    def __init__(self, word):
        """Initialise the base parser with *word* and the Duden URL prefix."""
        url_prefix = "https://www.duden.de/rechtschreibung/"
        super().__init__(word, url_prefix)

    @property
    def definitions(self):
        """Map every definition text to the list of its example texts.

        Two page layouts are read, in this order: ``div#bedeutungen`` whose
        definitions are ``div.enumeration__text`` elements, and
        ``div#bedeutung`` whose definitions are ``<p>`` elements.  The
        resulting dict is passed through ``cw`` before being returned.
        """
        # (section query, definition query inside that section)
        layouts = (
            (".//div[@id='bedeutungen']", ".//div[@class='enumeration__text']"),
            (".//div[@id='bedeutung']", ".//p"),
        )
        defs = {}
        for section_query, text_query in layouts:
            for section in self.root.findall(section_query):
                keys = [ot(node) for node in section.findall(text_query)]
                note_lists = section.findall(".//ul[@class='note__list']")
                # First pass: register every definition so that those
                # without an example list still appear with [].
                for key in keys:
                    defs[key] = []
                # Second pass: attach examples.
                # NOTE(review): the pairing is purely positional (i-th
                # definition with i-th note list), so it presumably
                # misaligns when an earlier definition has no note list --
                # confirm against the live markup.
                for key, notes in zip(keys, note_lists):
                    defs[key] = [ot(li) for li in notes.findall(".//li")]
        return cw(defs)

    # The misspelling "pronounciation" is part of the public interface (and
    # of the key emitted by todict), so it is kept as-is.
    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` minus its outer characters.

        The first and last character are sliced off (presumably the
        enclosing brackets).  Returns ``[]`` when the page has no such span.
        """
        span = next(iter(self.root.findall(".//span[@class='ipa']")), None)
        # Compare against None explicitly: an element may be falsy.
        if span is None:
            return []
        return ot(span)[1:-1]

    @property
    def neighbours(self):
        """Collect the last path segment of every ``/rechtschreibung/`` link.

        Any ``#fragment`` is cut off the segment.  The list is passed
        through ``cw`` before being returned.
        """
        links = []
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href", "")
            if "/rechtschreibung/" in href:
                links.append(href.split("/")[-1].split("#")[0])
        return cw(links)

    @property
    def wendungen(self):
        """Collect the list items of every idioms/proverbs note.

        A ``dl.note`` element counts when its text contains the heading
        "Wendungen, Redensarten, Sprichwörter".  The list is passed through
        ``cw`` before being returned.
        """
        phrases = []
        for note in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in ot(note):
                phrases.extend(ot(li) for li in note.findall(".//li"))
        return cw(phrases)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val``.

        Returns ``[]`` when the page has no such element.
        """
        cell = next(iter(self.root.findall(".//dd[@class='tuple__val']")), None)
        if cell is None:
            return []
        return " ".join(cell.itertext())

    @property
    def history_and_etymology(self):
        """Return the text fragments of the first ``div#herkunft`` paragraph.

        The fragments are passed through ``cw``.  Returns ``[]`` when the
        page has no such paragraph, matching the miss value of the other
        "first match" properties (the method used to fall off the end and
        return ``None`` implicitly).
        """
        paragraph = next(
            iter(self.root.findall(".//div[@id='herkunft']//p")), None
        )
        if paragraph is None:
            return []
        return cw(list(paragraph.itertext()))

    @property
    def synonyms(self):
        """Collect the text fragments of every lexeme link under ``div#synonyme``.

        The list is passed through ``cw`` before being returned.
        """
        syns = []
        for link in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(link.itertext())
        return cw(syns)

    def todict(self):
        """Return ``{word: {...fields...}}`` passed through ``uqall``.

        Raises:
            AssertionError: when both ``type`` and ``definitions`` are empty.
        """
        # Each property re-scans the tree, so evaluate these two only once.
        word_type = self.type
        definitions = self.definitions
        # Kept as an assert so the raised exception type does not change
        # for callers; note that asserts are skipped under ``python -O``.
        assert (
            word_type or definitions
        ), f"{self.time} {self.word}: type or definitions came back empty..."
        return uqall({
            self.word: {
                "type": word_type,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "history_and_etymology": self.history_and_etymology,
                "wendungen": self.wendungen,
                "time_of_retrieval": self.time,
            }
        })
|
|
|
|
# d = DudenParser("hinfallen")
|
|
# print(d.neighbours)
|
|
# print(d.todict())
|
|
# exit()
|
|
|
|
# Crawl driver.  The two string arguments are handed to Queue unchanged
# (presumably a storage directory and a file-name suffix -- TODO confirm
# against dict_dl.Queue).
q = Queue(DudenParser, "de_duden/", "_duden.json")
q.loadDB()

# No exit condition: words are processed one per iteration until the
# process is interrupted or add_word() raises.
while True:
    q.add_word()
|