"""Scrape dictionary entries from duden.de into plain Python data structures."""

import re
from datetime import datetime
from xml.etree import ElementTree as ET

import requests
from bs4 import BeautifulSoup

# Any run of whitespace (spaces, tabs, newlines, ...) collapses to one space.
_WHITESPACE_RE = re.compile(r"\s+")

# Browser-like user agent sent with every page request.
_USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/102.0.5005.115 Safari/537.36"
)


def remove_tag(string, tag="script"):
    """Return *string* with every ``<tag ...>...</tag>`` element removed.

    Args:
        string: Markup to clean.
        tag: Element name to strip, without angle brackets.

    Returns:
        The markup without the matched elements. An opening tag that has no
        closing tag is left untouched.
    """
    name = re.escape(tag)
    # The lookahead requires the tag name to end right there, so removing
    # "head" never swallows "<header>". DOTALL lets the element body span
    # several lines; the lazy ".*?" stops at the first closing tag.
    element = re.compile(
        rf"<{name}(?=[\s/>])[^>]*>.*?</{name}\s*>",
        re.DOTALL,
    )
    return element.sub("", string)


def all_text(e):
    """Return the whitespace-normalized text of *e* and its descendants.

    The result is a one-element list, because ``clear_whitespace`` wraps a
    bare string in a list.
    """
    return clear_whitespace(" ".join(e.itertext()))


def only_text(e):
    """Return the whitespace-normalized text of *e* and its descendants as a str."""
    return " ".join(all_text(e))


def clear_whitespace(data):
    """Collapse whitespace runs in every string held by *data*, in place.

    Lists and dicts are traversed recursively and mutated. Dict keys are not
    touched, only values. Non-string leaves are left as they are.

    Args:
        data: A list, a dict, or a single string.

    Returns:
        The same (mutated) list or dict. A bare string is returned wrapped in
        a one-element list; existing callers rely on that shape.

    Raises:
        TypeError: If *data* is not a list, dict or str.
    """
    if isinstance(data, str):
        data = [data]

    if isinstance(data, list):
        entries = enumerate(data)
    elif isinstance(data, dict):
        entries = data.items()
    else:
        raise TypeError(
            f"can only traverse list, dict or str, got {type(data).__name__}"
        )

    for key, value in entries:
        if isinstance(value, (list, dict)):
            clear_whitespace(value)
        elif isinstance(value, str):
            # Re-assigning an existing key/index does not resize the
            # container, so this is safe while iterating over it.
            data[key] = _WHITESPACE_RE.sub(" ", value).strip()
    return data


def url2str(url: str, timeout: float = 30.0) -> str:
    """Download *url* and return its markup without ``<head>`` and ``<script>``.

    The page is run through BeautifulSoup (lxml parser) and serialized again
    before the elements are stripped.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The cleaned markup as a string.
    """
    response = requests.get(
        url, headers={"user-agent": _USER_AGENT}, timeout=timeout
    )
    soup = BeautifulSoup(response.text, features="lxml")
    markup = remove_tag(str(soup), "head")
    return remove_tag(markup)


class WordParser:
    """Fetch the Duden page for one word and expose parts of it as properties.

    Attributes set by the constructor:
        time: Retrieval timestamp formatted as ``%Y%m%d-%H%M%S`` (local time).
        word: The word as passed in.
        url: The page URL built from ``word``.
        xml_string: Cleaned page markup returned by ``url2str``.
        root: ``xml.etree`` element parsed from ``xml_string``.
    """

    def __init__(self, word):
        """Download and parse the page for *word* (performs a network request)."""
        self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.word = word
        self.url = f"https://www.duden.de/rechtschreibung/{word}"
        self.xml_string = url2str(self.url)
        self.root = ET.fromstring(self.xml_string)

    @property
    def definitions(self):
        """Map each definition text to a list of its example strings."""
        defs = {}

        # Pages with an element whose id is "bedeutungen".
        # NOTE(review): definitions and example lists are paired purely by
        # position; if a definition has no example list the pairing after it
        # shifts. Confirm against the page structure.
        for section in self.root.findall(".//div[@id='bedeutungen']"):
            pairs = zip(
                section.findall(".//div[@class='enumeration__text']"),
                section.findall(".//ul[@class='note__list']"),
            )
            for meaning, examples in pairs:
                defs[only_text(meaning)] = [
                    only_text(li) for li in examples.findall(".//li")
                ]

        # Pages with an element whose id is "bedeutung".
        for section in self.root.findall(".//div[@id='bedeutung']"):
            paragraphs = section.findall(".//p")
            example_lists = section.findall(".//ul[@class='note__list']")

            # Every paragraph becomes a definition, by default without examples.
            for paragraph in paragraphs:
                key = next(paragraph.itertext(), None)
                if key is not None:
                    defs[key] = []

            for paragraph, examples in zip(paragraphs, example_lists):
                # A paragraph without any text has no usable key; skip it
                # instead of letting StopIteration escape the property.
                key = next(paragraph.itertext(), None)
                if key is None:
                    continue
                defs[key] = clear_whitespace(
                    " ".join(examples.itertext()).split("\n")
                )

        return clear_whitespace(defs)

    @property
    def pronounciation(self):
        """Return the text of the first ``span.ipa`` element, or ``[]`` if absent."""
        span = self.root.find(".//span[@class='ipa']")
        if span is None:
            return []
        # Drops the first and last character; presumably the brackets around
        # the IPA transcription (verify against a live page).
        return only_text(span)[1:-1]

    @property
    def neighbors(self):
        """Return the last URL path segment of every lexeme link on the page."""
        found = []
        for anchor in self.root.findall(".//a[@data-duden-ref-type='lexeme']"):
            href = anchor.get("href")
            if href is None:
                # An anchor without a target carries no word reference.
                continue
            # Keep the last path segment and cut off any "#fragment".
            found.append(href.split("/")[-1].split("#")[0])
        return clear_whitespace(found)

    @property
    def wendungen(self):
        """Return list items of notes titled "Wendungen, Redensarten, Sprichwörter"."""
        wends = []
        for note in self.root.findall(".//dl[@class='note']"):
            if "Wendungen, Redensarten, Sprichwörter" in only_text(note):
                wends.extend(only_text(li) for li in note.findall(".//li"))
        return clear_whitespace(wends)

    @property
    def type(self):
        """Return the joined text of the first ``dd.tuple__val``, or ``[]`` if absent."""
        first = self.root.find(".//dd[@class='tuple__val']")
        if first is None:
            return []
        return " ".join(first.itertext())

    @property
    def history_and_etymology(self):
        """Return text fragments of the first ``<p>`` under ``#herkunft``, or None."""
        paragraph = self.root.find(".//div[@id='herkunft']//p")
        if paragraph is None:
            return None
        return clear_whitespace(list(paragraph.itertext()))

    @property
    def synonyms(self):
        """Return the text of every lexeme link inside the ``#synonyme`` section."""
        syns = []
        for link in self.root.findall(
            ".//div[@id='synonyme']//a[@data-duden-ref-type='lexeme']"
        ):
            syns.extend(link.itertext())
        return clear_whitespace(syns)

    def todict(self):
        """Return all parsed fields as ``{word: {field: value, ...}}``.

        Raises:
            AssertionError: If both the word type and the definitions are empty.
        """
        # Parse once and reuse for both the sanity check and the result.
        word_type = self.type
        definitions = self.definitions
        if not (word_type or definitions):
            # Raised explicitly so the check also runs under ``python -O``,
            # where ``assert`` statements are stripped.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return {
            self.word: {
                "type": word_type,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "history_and_etymology": self.history_and_etymology,
                "wendungen": self.wendungen,
                "time_of_retrieval": self.time,
            }
        }


if __name__ == "__main__":
    w = WordParser("Triage")
    print(w.pronounciation)