dict_dl/duden.py

105 lines
3.2 KiB
Python
Raw Normal View History

2022-07-08 10:43:24 +00:00
from dict_dl import WordParser, Queue, cw, ot, uqall
2022-07-06 11:06:37 +00:00
class DudenParser(WordParser):
    """Extract dictionary data for one word from its duden.de entry page.

    Every property queries ``self.root`` with ElementTree-style path
    expressions. ``self.root``, ``self.word`` and ``self.time`` are not set
    in this class; presumably ``WordParser`` provides them -- confirm in
    ``dict_dl``. ``ot``, ``cw`` and ``uqall`` are helpers imported from
    ``dict_dl`` whose exact behaviour is not visible here.
    """

    def __init__(self, word):
        """Initialise the parser for *word* with the duden.de URL prefix."""
        url_prefix = "https://www.duden.de/rechtschreibung/"
        super().__init__(word, url_prefix)

    def _collect_definitions(self, container_xpath, text_xpath):
        """Map definition texts to example lists for one kind of container.

        For every element matching *container_xpath*, each element matching
        *text_xpath* becomes a key (via ``ot``) whose value is a list of
        example strings.
        """
        found = {}
        for container in self.root.findall(container_xpath):
            texts = container.findall(text_xpath)
            note_lists = container.findall(".//ul[@class='note__list']")
            # Register every definition first so that those left without a
            # paired example list still appear, with an empty list.
            for text in texts:
                found[ot(text)] = []
            # NOTE(review): the pairing is purely positional (n-th text with
            # n-th note list inside the container). If a definition has no
            # example list, later lists shift onto earlier definitions.
            for text, examples in zip(texts, note_lists):
                found[ot(text)] = [ot(li) for li in examples.findall(".//li")]
        return found

    @property
    def definitions(self):
        """Return ``cw`` applied to a dict of definition text -> examples.

        Entries are read from ``div#bedeutungen`` containers first and from
        ``div#bedeutung`` containers second; on a key collision the value
        from the second pass wins.
        """
        defs = {}
        defs.update(
            self._collect_definitions(
                ".//div[@id='bedeutungen']",
                ".//div[@class='enumeration__text']",
            )
        )
        defs.update(self._collect_definitions(".//div[@id='bedeutung']", ".//p"))
        return cw(defs)

    @property
    def pronounciation(self):
        """Return the first IPA string, or ``[]`` if the page has none.

        The first and last characters of the extracted text are dropped
        (presumably enclosing brackets -- confirm against the page markup).
        """
        ipa_spans = self.root.findall(".//span[@class='ipa']")
        if not ipa_spans:
            return []
        return ot(ipa_spans[0])[1:-1]

    @property
    def neighbours(self):
        """Return ``cw`` applied to the entry slugs linked from this page.

        A slug is the last path segment of any ``<a href>`` containing
        ``/rechtschreibung/``, with a trailing ``#fragment`` removed.
        """
        neighbours = []
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href")
            if href is None or "/rechtschreibung/" not in href:
                continue
            neighbours.append(href.split("/")[-1].split("#")[0])
        return cw(neighbours)

    @property
    def wendungen(self):
        """Return ``cw`` applied to the idiom/proverb list items.

        Only ``dl.note`` elements whose text contains
        "Wendungen, Redensarten, Sprichwörter" contribute items.
        """
        wends = []
        for note in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in ot(note):
                wends.extend(ot(li) for li in note.findall(".//li"))
        return cw(wends)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val``.

        Returns ``[]`` when no such element exists.
        """
        values = self.root.findall(".//dd[@class='tuple__val']")
        if not values:
            return []
        return " ".join(values[0].itertext())

    @property
    def history_and_etymology(self):
        """Return ``cw`` applied to the text pieces of the first origin paragraph.

        The paragraph is the first ``<p>`` below ``div#herkunft``; ``None``
        is returned when there is none.
        """
        paragraphs = self.root.findall(".//div[@id='herkunft']//p")
        if not paragraphs:
            return None
        return cw(list(paragraphs[0].itertext()))

    @property
    def synonyms(self):
        """Return ``cw`` applied to the text pieces of all synonym links.

        Links are ``<a data-duden-ref-type='lexeme'>`` below ``div#synonyme``.
        """
        syns = []
        for link in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(link.itertext())
        return cw(syns)

    def todict(self):
        """Return ``uqall`` applied to ``{word: {field: value, ...}}``.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
        """
        # Each property access re-walks the tree, so evaluate these once and
        # reuse the results for both the sanity check and the output.
        word_type = self.type
        definitions = self.definitions
        # NOTE(review): ``assert`` is stripped under ``python -O``; kept as is
        # because callers may rely on catching AssertionError.
        assert (
            word_type or definitions
        ), f"{self.time} {self.word}: type or definitions came back empty..."
        return uqall(
            {
                self.word: {
                    "type": word_type,
                    "definitions": definitions,
                    "pronounciation": self.pronounciation,
                    "synonyms": self.synonyms,
                    "history_and_etymology": self.history_and_etymology,
                    "wendungen": self.wendungen,
                    "time_of_retrieval": self.time,
                }
            }
        )
2022-07-06 11:06:37 +00:00
2022-07-07 17:08:30 +00:00
# d = DudenParser("hinfallen")
# print(d.neighbours)
# print(d.todict())
# exit()
2022-07-06 11:06:37 +00:00
# Crawl driver: build a queue that uses DudenParser, load its existing state,
# then keep processing words. The second and third arguments are presumably a
# storage directory and a file suffix -- confirm against dict_dl.Queue.
q = Queue(DudenParser, "de_duden/", "_duden.json")
q.loadDB()
# NOTE(review): this runs at import time and the loop has no exit condition
# here; it ends only if add_word() raises or exits the process. Consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
while True:
    q.add_word()