dict_dl/de_duden/dp.py

199 lines
6.2 KiB
Python
Raw Normal View History

2022-07-06 11:06:37 +00:00
import json
import re
import time
from datetime import datetime
from xml.etree import ElementTree as ET
import requests
from bs4 import BeautifulSoup
def remove_tag(string, tag="script"):
    """Strip every ``<tag ...>...</tag>`` element, content included, from *string*.

    This is a plain text scan, not an HTML parser: each opening ``<tag`` is
    paired with the first ``</tag>`` that follows it, and nested elements of
    the same tag are not balanced. Matching is case-sensitive.

    Args:
        string: Markup to clean.
        tag: Name of the element to drop; defaults to ``"script"``.

    Returns:
        A copy of *string* without the matched elements. An opening tag that
        has no closing tag after it is left untouched.
    """
    name = re.escape(tag)
    # The lookahead demands a real tag boundary after the name, so removing
    # "head" never starts at "<header>". DOTALL lets the lazy ".*?" span the
    # line breaks inside an element.
    element = re.compile(rf"<{name}(?=[\s/>]).*?</{name}>", re.DOTALL)
    return element.sub("", string)
def all_text(e):
    """Collect all text beneath element *e*, whitespace-normalised.

    The fragments yielded by ``e.itertext()`` are joined with single spaces
    and handed to ``clear_whitespace``, which wraps a plain string in a list;
    the result is therefore a one-item list, not a bare string.
    """
    joined = " ".join(e.itertext())
    return clear_whitespace(joined)
def only_text(e):
    """Return the whitespace-normalised text beneath element *e* as one string."""
    # all_text() hands back a list of cleaned strings; flatten it again.
    fragments = all_text(e)
    return " ".join(fragments)
def clear_whitespace(data):
    """Collapse whitespace runs in every string held by *data*, in place.

    Lists and dicts are walked recursively. Each string value has its runs of
    whitespace replaced by a single space and is stripped at both ends.
    Dict keys and values of any other type are left as they are.

    Args:
        data: A list, a dict, or a single string. A single string is wrapped
            in a new one-item list before processing.

    Returns:
        The same (mutated) list or dict, or the one-item list built around a
        string argument.

    Raises:
        TypeError: If *data* is not a list, dict or str.
    """
    if isinstance(data, str):
        # Wrap a bare string so it goes through the same loop as a list.
        data = [data]
    elif not isinstance(data, (list, dict)):
        raise TypeError("can only traverse list or dict")

    entries = data.items() if isinstance(data, dict) else enumerate(data)
    for key, item in entries:
        if isinstance(item, str):
            # Overwriting an existing key/index is safe while iterating.
            data[key] = re.sub(r"[\n\t\s]+", " ", item).strip()
        elif isinstance(item, (list, dict)):
            clear_whitespace(item)
    return data
def url2str(url: str, timeout: float = 30.0) -> str:
    """Download *url* and return its markup without ``<head>`` and ``<script>`` elements.

    The response body is parsed with BeautifulSoup's ``lxml`` parser and
    serialised back to a string before the two kinds of element are cut out
    with ``remove_tag``. The HTTP status code is not checked.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned markup as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
    }
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    bad_html = requests.get(url, headers=headers, timeout=timeout)
    tree = BeautifulSoup(bad_html.text, features="lxml")
    xml_str = str(tree)
    xml_str = remove_tag(xml_str, "head")
    xml_str = remove_tag(xml_str)
    return xml_str
class WordParser:
    """Scrape one duden.de dictionary entry and expose its parts as properties.

    Constructing an instance downloads and parses the page immediately. Every
    property queries the parsed tree again on each access; nothing is cached.

    ``__init__`` sets ``time`` (retrieval timestamp string), ``word``, ``url``,
    ``xml_string`` (the cleaned markup) and ``root`` (the parsed tree).
    """

    def __init__(self, word):
        """Fetch and parse the entry page for *word*.

        Args:
            word: Last path segment of the entry URL on duden.de.

        Raises:
            xml.etree.ElementTree.ParseError: If the cleaned page is not
                well-formed XML.
        """
        self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.word = word
        self.url = f"https://www.duden.de/rechtschreibung/{word}"
        self.xml_string = url2str(self.url)
        self.root = ET.fromstring(self.xml_string)

    @property
    def definitions(self):
        """Map each definition text to a list of its example strings.

        Two layouts are handled: a ``div`` with id ``bedeutungen`` and a
        ``div`` with id ``bedeutung``.
        """
        defs = {}
        for section in self.root.findall(".//div[@id='bedeutungen']"):
            # NOTE(review): definitions and example lists are paired purely by
            # position, and zip() drops whatever is left over on the longer
            # side -- confirm this matches the page markup.
            for d, examples in zip(
                section.findall(".//div[@class='enumeration__text']"),
                section.findall(".//ul[@class='note__list']"),
            ):
                defs[only_text(d)] = [
                    only_text(li) for li in examples.findall(".//li")
                ]
        for section in self.root.findall(".//div[@id='bedeutung']"):
            # Each paragraph is keyed by its first text fragment. A paragraph
            # without any text yields None and is skipped instead of letting
            # StopIteration escape from next().
            keys = [next(p.itertext(), None) for p in section.findall(".//p")]
            for key in keys:
                if key is not None:
                    defs[key] = []
            for key, examples in zip(
                keys, section.findall(".//ul[@class='note__list']")
            ):
                if key is None:
                    continue
                defs[key] = clear_whitespace(
                    " ".join(examples.itertext()).split("\n")
                )
        return clear_whitespace(defs)

    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` element, or ``[]`` if none exists.

        The attribute name keeps its historical spelling because it is part
        of the public interface and of the ``todict`` output.
        """
        first = self.root.find(".//span[@class='ipa']")
        if first is None:
            return []
        # Drops the first and last character -- presumably the delimiters
        # printed around the IPA string; confirm against the page.
        return only_text(first)[1:-1]

    @property
    def neighbors(self):
        """Return the entry slugs of all links marked ``data-duden-ref-type='lexeme'``."""
        neighbors = []
        for e in self.root.findall(".//a[@data-duden-ref-type='lexeme']"):
            # Keep only the last path segment, without any "#fragment".
            link = e.attrib["href"].split("/")[-1].split("#")[0]
            neighbors.append(link)
        return clear_whitespace(neighbors)

    @property
    def wendungen(self):
        """Return the list items of every ``dl.note`` block headed as idioms/proverbs."""
        wends = []
        for n in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in only_text(n):
                wends.extend([only_text(li) for li in n.findall(".//li")])
        return clear_whitespace(wends)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val`` element, or ``[]``."""
        first = self.root.find(".//dd[@class='tuple__val']")
        if first is None:
            return []
        return " ".join(first.itertext())

    @property
    def history_and_etymology(self):
        """Return the cleaned text fragments of the first ``<p>`` under ``div#herkunft``.

        Returns ``None`` when the page has no such paragraph.
        """
        first = self.root.find(".//div[@id='herkunft']//p")
        if first is None:
            return None
        return clear_whitespace(list(first.itertext()))

    @property
    def synonyms(self):
        """Return the text of every lexeme link inside ``div#synonyme``."""
        syns = []
        for e in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(e.itertext())
        return clear_whitespace(syns)

    def todict(self):
        """Bundle all scraped fields into ``{word: {...}}``.

        Raises:
            AssertionError: If both ``type`` and ``definitions`` are empty.
        """
        word_type = self.type
        definitions = self.definitions
        if not (word_type or definitions):
            # Raised explicitly rather than through an `assert` statement so
            # the check still runs under `python -O`; the exception type is
            # kept for callers that already catch AssertionError.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return {
            self.word: {
                "type": word_type,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "history_and_etymology": self.history_and_etymology,
                "wendungen": self.wendungen,
                "time_of_retrieval": self.time,
            }
        }
if __name__ == "__main__":
    # Manual smoke test: performs a live request to duden.de for a sample
    # entry and prints whatever the `pronounciation` property extracts.
    # To inspect or persist a whole entry instead, use `parser.todict()`,
    # e.g. together with `json.dump`.
    parser = WordParser("Triage")
    print(parser.pronounciation)