199 lines
6.2 KiB
Python
199 lines
6.2 KiB
Python
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime
|
|
from xml.etree import ElementTree as ET
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def remove_tag(string, tag="script"):
    """Strip every ``<tag ...>...</tag>`` span from a markup string.

    Matching is purely textual and case-sensitive: each span runs from an
    opening ``<tag`` to the first ``</tag>`` that follows it, so nested
    elements of the same name are not balanced.

    Args:
        string: Markup to clean.
        tag: Element name whose spans (tags and content) are removed.

    Returns:
        The markup with all complete spans removed. An opener that has no
        closer after it, or a closer that has no opener before it, is left
        in place.
    """
    # The lookahead requires the element name to end right after ``tag`` so
    # that e.g. ``tag="head"`` does not start a span at ``<header>``.
    opening = re.compile(rf"<{re.escape(tag)}(?=[\s/>])")
    closing = f"</{tag}>"

    # Single left-to-right pass: collect the text between spans instead of
    # re-scanning (and recursing on) a freshly sliced string per removal.
    kept = []
    cursor = 0
    while True:
        start = opening.search(string, cursor)
        if start is None:
            break
        end = string.find(closing, start.end())
        if end == -1:
            # No closer anywhere after this opener: nothing left to remove.
            break
        kept.append(string[cursor:start.start()])
        cursor = end + len(closing)
    kept.append(string[cursor:])
    return "".join(kept)
|
|
|
|
def all_text(e):
    """Collect all text below ``e`` and normalize its whitespace.

    Args:
        e: Object exposing ``itertext()`` (ElementTree elements in this
            file's usage).

    Returns:
        Whatever ``clear_whitespace`` returns for the space-joined text; for
        a ``str`` argument that is a one-element list holding the cleaned
        string.
    """
    joined = ' '.join(e.itertext())
    return clear_whitespace(joined)
|
|
def only_text(e):
    """Return the whitespace-normalized text below ``e`` as a single string.

    ``all_text`` hands back a list of cleaned strings; this helper joins
    that list with single spaces so callers get a plain ``str``.
    """
    fragments = all_text(e)
    return ' '.join(fragments)
|
|
|
|
def clear_whitespace(data):
    """Collapse whitespace runs in every string held by ``data``.

    Each string value has runs of whitespace replaced by one space and is
    stripped at both ends. Lists and dicts are modified in place and
    nested lists/dicts are processed recursively; values of other types
    are left alone. Dict keys are never modified.

    Args:
        data: A list, a dict, or a single string.

    Returns:
        The same list/dict object that was passed in. A ``str`` argument is
        wrapped first, so the result is a new one-element list.

    Raises:
        TypeError: If ``data`` is not a list, dict or str.
    """
    if isinstance(data, str):
        # Treat a bare string as a one-item list so the loop below applies.
        data = [data]

    if isinstance(data, dict):
        entries = data.items()
    elif isinstance(data, list):
        entries = enumerate(data)
    else:
        raise TypeError("can only traverse list or dict")

    for key, item in entries:
        if isinstance(item, str):
            # Only values are reassigned, so the container size is stable
            # while it is being iterated.
            data[key] = re.sub(r"[\n\t\s]+", " ", item).strip()
        elif isinstance(item, (list, dict)):
            clear_whitespace(item)
    return data
|
|
|
|
|
|
def url2str(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and return its markup without head and script spans.

    The response body is parsed with BeautifulSoup (``lxml`` feature) and
    serialized back to a string before ``remove_tag`` strips the ``head``
    and ``script`` spans from it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The re-serialized markup with ``<head>`` and ``<script>`` spans
        removed.

    Raises:
        requests.exceptions.RequestException: On connection problems or
            when the timeout elapses.
    """
    headers = {
        # Desktop browser user agent sent with the request.
        "user-agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
        )
    }
    # NOTE(review): the HTTP status is not checked, so error pages are
    # parsed like any other response.
    response = requests.get(url, headers=headers, timeout=timeout)
    tree = BeautifulSoup(response.text, features="lxml")
    xml_str = str(tree)
    xml_str = remove_tag(xml_str, "head")
    return remove_tag(xml_str)
|
|
|
|
|
|
class WordParser:
    """Scrape one duden.de dictionary entry and expose its parts as properties.

    Constructing an instance downloads
    ``https://www.duden.de/rechtschreibung/<word>`` through ``url2str`` and
    parses the result with ElementTree. Every property queries that tree
    again on each access; nothing is cached.
    """

    def __init__(self, word):
        """Fetch and parse the entry page for ``word``.

        Args:
            word: Entry name as it appears in the duden.de URL path.

        Raises:
            xml.etree.ElementTree.ParseError: If the fetched markup is not
                well-formed XML.
        """
        # Retrieval timestamp (naive local time), reported by ``todict``.
        self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.word = word
        self.url = f"https://www.duden.de/rechtschreibung/{word}"
        self.xml_string = url2str(self.url)
        self.root = ET.fromstring(self.xml_string)

    @property
    def definitions(self):
        """Map each definition text to a list of its example strings.

        Two containers are read: ``div#bedeutungen`` and ``div#bedeutung``
        (presumably the multi-meaning and single-meaning page layouts).

        Returns:
            dict: Definition text -> list of example strings, passed
            through ``clear_whitespace`` (values only).
        """
        defs = {}

        for section in self.root.findall(".//div[@id='bedeutungen']"):
            # NOTE(review): definition texts and example lists are paired
            # purely by position; a definition without examples would shift
            # every later pairing.
            pairs = zip(
                section.findall(".//div[@class='enumeration__text']"),
                section.findall(".//ul[@class='note__list']"),
            )
            for meaning, examples in pairs:
                defs[only_text(meaning)] = [
                    only_text(li) for li in examples.findall(".//li")
                ]

        for section in self.root.findall(".//div[@id='bedeutung']"):
            paragraphs = section.findall(".//p")
            # Register every paragraph first so a definition without an
            # example list still shows up (with an empty list).
            for paragraph in paragraphs:
                key = next(paragraph.itertext(), None)
                if key is not None:
                    defs[key] = []
            for paragraph, examples in zip(
                paragraphs, section.findall(".//ul[@class='note__list']")
            ):
                # A paragraph without any text has no usable key; skip it
                # instead of letting ``next`` raise StopIteration.
                key = next(paragraph.itertext(), None)
                if key is None:
                    continue
                defs[key] = clear_whitespace(
                    " ".join(examples.itertext()).split("\n")
                )

        return clear_whitespace(defs)

    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` minus its end characters.

        Returns:
            str: The cleaned text with its first and last character dropped
            (presumably the enclosing brackets), or ``[]`` when the page
            has no such element.
        """
        ipa = self.root.find(".//span[@class='ipa']")
        # Compare against None: an Element without children is falsy.
        if ipa is None:
            return []
        return only_text(ipa)[1:-1]

    @property
    def neighbors(self):
        """List the entry names referenced by lexeme links on the page.

        Returns:
            list[str]: Last path segment of each link target, with any
            ``#fragment`` removed.
        """
        neighbors = []
        for anchor in self.root.findall(".//a[@data-duden-ref-type='lexeme']"):
            href = anchor.get("href")
            if href is None:
                # An anchor without a target has nothing to contribute.
                continue
            neighbors.append(href.split("/")[-1].split("#")[0])
        return clear_whitespace(neighbors)

    @property
    def wendungen(self):
        """List the items of the idioms/sayings note blocks.

        Returns:
            list[str]: Text of every ``li`` inside each ``dl.note`` whose
            text contains "Wendungen, Redensarten, Sprichwörter".
        """
        wends = []
        for note in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in only_text(note):
                wends.extend(only_text(li) for li in note.findall(".//li"))
        return clear_whitespace(wends)

    @property
    def type(self):
        """Return the space-joined text of the first ``dd.tuple__val``.

        Returns:
            str: The joined text fragments (whitespace is not normalized),
            or ``[]`` when the page has no such element.
        """
        first = self.root.find(".//dd[@class='tuple__val']")
        if first is None:
            return []
        return " ".join(first.itertext())

    @property
    def history_and_etymology(self):
        """Return the text fragments of the first paragraph in ``div#herkunft``.

        Returns:
            list[str] | None: Whitespace-cleaned fragments, or ``None``
            when the page has no such paragraph.
        """
        paragraph = self.root.find(".//div[@id='herkunft']//p")
        if paragraph is None:
            return None
        return clear_whitespace(list(paragraph.itertext()))

    @property
    def synonyms(self):
        """List the text of every lexeme link inside ``div#synonyme``.

        Returns:
            list[str]: Whitespace-cleaned link texts.
        """
        syns = []
        for anchor in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(anchor.itertext())
        return clear_whitespace(syns)

    def todict(self):
        """Bundle the scraped fields into ``{word: {...}}``.

        Returns:
            dict: One key (the word) mapping to the fields ``type``,
            ``definitions``, ``pronounciation``, ``synonyms``,
            ``history_and_etymology``, ``wendungen`` and
            ``time_of_retrieval``.

        Raises:
            AssertionError: If both ``type`` and ``definitions`` are empty.
        """
        # Each property walks the tree on access, so evaluate them once.
        word_type = self.type
        definitions = self.definitions
        if not (word_type or definitions):
            # Raised explicitly (same exception type as before) so the
            # check is not stripped when Python runs with -O.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return {
            self.word: {
                "type": word_type,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "history_and_etymology": self.history_and_etymology,
                "wendungen": self.wendungen,
                "time_of_retrieval": self.time,
            }
        }
|
|
|
|
|
|
if __name__ == "__main__":
    # Offline-debugging scratch kept for reference: dump the fetched markup
    # to disk once, then re-parse it from the file instead of the network.
    # xml_string = url2str("https://www.duden.de/rechtschreibung/Hora_Stunde_Gebet")
    # with open("test.html", "w") as f:
    #     f.write(xml_string)
    # with open("test.html", "r") as f:
    #     xml_string = "".join(f.readlines())
    # root = ET.fromstring(xml_string)

    # Manual smoke test (hits the network): look up one word and print the
    # value of its ``pronounciation`` property.
    entry = WordParser("Triage")
    print(entry.pronounciation)
|
|
# for k,v in w.definitions.items():
|
|
# print(f"{k}: \n{v}")
|
|
|
|
# with open("g_duden.json", "w") as f:
|
|
# json.dump(w.todict(), f)
|
|
|
|
# w = WordParser("Sunday")
|
|
# for e in w.root.findall(".//span[@class='dtText']"):
|
|
# print(" ".join([l for l in e.itertext()]))
|
|
# print(w.todict())
|
|
|
|
# mw |= w.todict()
|
|
# exit()
|
|
# with open("mw.json", "w") as f:
|
|
# json.dump(mw, f)
|
|
|
|
# xml_string = url2str("https://www.merriam-webster.com/dictionary/mechanical")
|
|
|
|
|
|
# from lxml import objectify
|
|
|
|
|
|
# def xml_to_dict(xml_str):
|
|
# """Convert xml to dict, using lxml v3.4.2 xml processing library, see http://lxml.de/"""
|
|
|
|
# def xml_to_dict_recursion(xml_object):
|
|
# dict_object = xml_object.__dict__
|
|
# if not dict_object: # if empty dict returned
|
|
# return xml_object
|
|
# for key, value in dict_object.items():
|
|
# dict_object[key] = xml_to_dict_recursion(value)
|
|
# return dict_object
|
|
|
|
# return xml_to_dict_recursion(objectify.fromstring(xml_str))
|