2022-07-08 10:43:24 +00:00
|
|
|
from itertools import chain
|
|
|
|
|
2023-09-05 14:00:29 +00:00
|
|
|
from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text
|
2022-07-08 10:43:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
class MWThesaurusParser(WordParser):
    """Parser for the Merriam-Webster thesaurus page of a single word.

    ``self.root``, ``self.word`` and ``self.time`` are not set in this class;
    they are expected to be provided by ``WordParser``.
    """

    # (result key, scored-list CSS class, lozenge colour numbers in output order)
    _CATEGORIES = (
        ("synonyms", "sim-list-scored", (4, 3)),
        ("near synonyms", "sim-list-scored", (2, 1)),
        ("antonyms", "opp-list-scored", (4, 3)),
        ("near antonyms", "opp-list-scored", (2, 1)),
    )

    def __init__(self, word):
        """Set up the parser for *word* under the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix, clean=True)

    @staticmethod
    def _scored_words(entry, list_class, relevances):
        """Return ``ot(...)`` of every lozenge span in *entry* for the given
        scored-list class, grouped by colour number in the order given."""
        return [
            ot(lozenge)
            for relev in relevances
            for lozenge in entry.findall(
                f".//span[@class='thes-list {list_class}']"
                f"//span[@class='lozenge color-{relev}']"
            )
        ]

    @property
    def thes(self):
        """Collect the thesaurus entries of the page.

        Returns:
            A dict keyed by the definition text (the result of
            ``only_first_text`` on the entry's ``dt`` span).  Each value is a
            dict that may contain the keys ``"synonyms"``, ``"near synonyms"``,
            ``"antonyms"`` and ``"near antonyms"``, each mapping to a list of
            words.  A key is only present when at least one word was found.
        """
        thes = {}
        # Entries are addressed by id "thesaurus-entry-<i>-<j>", i and j in 1..9.
        for i in range(1, 10):
            for j in range(1, 10):
                for entry in self.root.findall(
                    f".//div[@id='thesaurus-entry-{i}-{j}']"
                ):
                    definition = ""
                    for dt in entry.findall(".//span[@class='dt']"):
                        definition = only_first_text(dt)
                        thes[definition] = {}
                    for key, list_class, relevances in self._CATEGORIES:
                        words = self._scored_words(entry, list_class, relevances)
                        if not words:
                            continue
                        # setdefault: an entry without a 'dt' span has no
                        # bucket yet; create it instead of failing on lookup.
                        bucket = thes.setdefault(definition, {})
                        bucket.setdefault(key, []).extend(words)
        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated list of ``rb(ot(link), "(", ")")``
        over all ``a.important-blue-link`` elements of the page."""
        return sorted(
            {
                rb(ot(link), "(", ")")
                for link in self.root.findall(".//a[@class='important-blue-link']")
            }
        )

    @property
    def neighbours(self):
        """Return the set of words linked from this page.

        Every ``<a>`` whose ``href`` starts with ``/thesaurus/`` contributes
        its last path segment, with any ``#fragment`` and ``?query`` removed
        and then passed through ``uq``.
        """
        hrefs = (anchor.attrib.get("href", "") for anchor in self.root.findall(".//a"))
        return {
            uq(href.split("/")[-1].split("#")[0].split("?")[0])
            for href in hrefs
            if href.startswith("/thesaurus/")
        }

    def todict(self):
        """Return the parsed page as a dict passed through ``uqall``.

        The dict maps the word to its thesaurus entries and additionally
        carries the keys ``"type"`` and ``"time_of_retrieval"``.

        Raises:
            AssertionError: if both ``type`` and ``thes`` are empty.
        """
        # Each property re-scans the whole tree, so evaluate them only once.
        types = self.type
        thes = self.thes
        if not (types or thes):
            # Raised explicitly so the check is not stripped under ``python -O``.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        # "type"/"time_of_retrieval" come last so they win if the word collides.
        return uqall(
            {self.word: thes, "type": types, "time_of_retrieval": self.time}
        )
|
2022-07-08 10:43:24 +00:00
|
|
|
|
|
|
|
|
2023-09-05 14:00:29 +00:00
|
|
|
# w = MWThesaurusParser("content")
|
2022-11-15 11:41:33 +00:00
|
|
|
# print(w.neighbours)
|
2023-09-05 14:00:29 +00:00
|
|
|
# print(w.todict())
|
|
|
|
# exit()
|
2022-07-08 10:43:24 +00:00
|
|
|
|
2022-07-10 04:35:09 +00:00
|
|
|
# Work queue for MWThesaurusParser; constructed with the "en_MWThesaurus/"
# and "_MWT.json" arguments.
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
q.loadDB()

# Manual single-word run, kept for debugging:
# q.add_word("pretty much")
# exit()

# Keep calling add_word() without arguments; the loop has no exit condition
# of its own.
while True:
    q.add_word()
|