# dict_dl/MW_thesaurus.py
# 2023-11-20 17:49:47 +01:00
#
# 87 lines
# 3.1 KiB
# Python

from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text
class MWThesaurusParser(WordParser):
    """Parser for a single Merriam-Webster thesaurus page.

    The page for ``word`` is addressed under
    ``https://www.merriam-webster.com/thesaurus/``; fetching and parsing are
    delegated to ``WordParser``. The properties below read ``self.root`` and
    ``todict`` additionally reads ``self.word`` and ``self.time`` -- these are
    presumably set up by ``WordParser.__init__`` (not visible here; confirm).
    """

    def __init__(self, word):
        """Initialise the parser for ``word`` using the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix, clean=True)

    @property
    def thes(self):
        """Return the thesaurus entries found on the page.

        Returns:
            dict: maps each definition text (first text of a ``span.dt``) to a
            dict with any of the keys ``"synonyms"``, ``"near synonyms"``,
            ``"antonyms"`` and ``"near antonyms"``, each holding a list of
            words. A key is only present when at least one word was found.

        Entries are looked up by id ``thesaurus-entry-{i}-{j}`` for ``i`` and
        ``j`` in 1..9, in that order. Words scored with relevance colors 4 and
        3 are filed as (syn/ant)onyms, colors 2 and 1 as "near" ones.
        """
        # (output key, class of the scored word list, relevance colors in
        # scan order). The order of this table fixes the key order of each
        # per-definition dict.
        groups = (
            ("synonyms", "thes-list sim-list-scored", (4, 3)),
            ("near synonyms", "thes-list sim-list-scored", (2, 1)),
            ("antonyms", "thes-list opp-list-scored", (4, 3)),
            ("near antonyms", "thes-list opp-list-scored", (2, 1)),
        )
        thes = {}
        for i in range(1, 10):
            for j in range(1, 10):
                for entry in self.root.findall(
                    f".//div[@id='thesaurus-entry-{i}-{j}']"
                ):
                    # Fallback key when the entry carries no definition span.
                    d = ""
                    for e in entry.findall(".//span[@class='dt']"):
                        d = only_first_text(e)
                        # A definition span always (re)starts its own bucket;
                        # with several spans the last one receives the words.
                        thes[d] = {}
                    for key, list_class, relevances in groups:
                        for relev in relevances:
                            path = (
                                f".//span[@class='{list_class}']"
                                f"//span[@class='lozenge color-{relev}']"
                            )
                            for e in entry.findall(path):
                                # setdefault on the outer dict keeps an entry
                                # without a definition span from raising
                                # KeyError: its words land under "".
                                bucket = thes.setdefault(d, {})
                                bucket.setdefault(key, []).append(ot(e))
        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated word types linked on the page.

        Each ``a.important-blue-link`` element is passed through ``ot`` and
        ``rb(..., "(", ")")`` before being collected.
        """
        links = self.root.findall(".//a[@class='important-blue-link']")
        return sorted({rb(ot(e), "(", ")") for e in links})

    @property
    def neighbours(self):
        """Return the set of thesaurus words this page links to.

        For every ``<a>`` whose ``href`` starts with ``/thesaurus/`` the last
        path segment is taken, any ``#fragment`` and ``?query`` are cut off,
        and the result is passed through ``uq``.
        """
        neighbours = set()
        for e in self.root.findall(".//a"):
            href = e.attrib.get("href", "")
            if href.startswith("/thesaurus/"):
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    def todict(self):
        """Return the parsed page as a dict passed through ``uqall``.

        The dict maps ``self.word`` to the thesaurus entries and adds the keys
        ``"type"`` and ``"time_of_retrieval"``.

        Raises:
            AssertionError: if both the word types and the thesaurus entries
                are empty. Raised explicitly so the check is not stripped
                when Python runs with ``-O``.
        """
        # Evaluate each property once: both rescan the whole document.
        types = self.type
        thes = self.thes
        if not (types or thes):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        # NOTE(review): if self.word is literally "type" or
        # "time_of_retrieval", the right-hand dict overwrites the entries
        # stored under the word. Left as is, since fixing it would change the
        # output schema.
        return uqall(
            {self.word: thes} | {"type": types, "time_of_retrieval": self.time}
        )
# Manual smoke test, kept commented out: parse one word, print its neighbours
# and its dict form, then stop before the queue below is started.
# w = MWThesaurusParser("content")
# print(w.neighbours)
# print(w.todict())
# exit()

# Set up the download queue for this parser. The two string arguments are
# presumably an output directory and a file-name suffix -- TODO confirm
# against Queue, which is defined in dict_dl and not visible here.
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
q.loadDB()

# Manual one-off, kept commented out: add a single word and stop.
# q.add_word("pretty much")
# exit()

# No exit condition of its own: the loop only ends if q.add_word() raises or
# the process is terminated from outside.
while True:
    q.add_word()