111 lines
4.1 KiB
Python
111 lines
4.1 KiB
Python
from copy import deepcopy
from itertools import chain

from dict_dl import Queue, WordParser, ot, rb, uq, uqall
|
|
|
|
|
|
class MWThesaurusParser(WordParser):
    """Extract thesaurus data from a Merriam-Webster thesaurus page.

    The page address is built from the prefix
    ``https://www.merriam-webster.com/thesaurus/`` and the word. This class
    only reads ``self.root``, ``self.word`` and ``self.time``; these are
    presumably provided by ``WordParser`` -- confirm against ``dict_dl``.
    """

    # Entries are looked up by id: thesaurus-entry-1 .. thesaurus-entry-9.
    _MAX_ENTRIES = 9

    # Output key -> second CSS class of the ``thes-list`` spans feeding it.
    # Insertion order is the key order of every definition's result dict.
    _RELATION_LISTS = {
        "synonyms": ("syn-list", "phrase-list"),
        "near synonyms": ("rel-list", "sim-list"),
        "near antonyms": ("near-list", "opp-list"),
        "antonyms": ("ant-list",),
    }

    def __init__(self, word):
        """Set up the parser for *word* using the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix, clean=True)

    def _sense_blocks(self):
        """Yield every sense ``div`` of every thesaurus entry on the page.

        Within one entry the un-numbered senses (``sb no-sn``) come before
        the numbered ones (``sb has-num``).
        """
        for index in range(1, self._MAX_ENTRIES + 1):
            entry_path = f".//div[@id='thesaurus-entry-{index}']"
            for entry in self.root.findall(entry_path):
                yield from chain(
                    entry.findall(".//div[@class='sb no-sn']"),
                    entry.findall(".//div[@class='sb has-num']"),
                )

    @staticmethod
    def _split_definition(dt):
        """Return ``(definition_text, examples)`` for one ``dt`` span.

        The examples are the ``li`` items below the span. The definition
        text is taken with every ``ul`` emptied so the examples are not
        repeated inside it.
        """
        examples = [ot(li) for li in dt.findall(".//li")]
        # Empty the lists on a copy: clearing them on the live tree would
        # make the next read of the page come back without any examples.
        scratch = deepcopy(dt)
        for ul in scratch.findall(".//ul"):
            ul.clear()
        return ot(scratch), examples

    @staticmethod
    def _linked_words(sense, list_classes):
        """Collect the link texts of the given ``thes-list`` variants.

        Results are concatenated in the order of *list_classes*.
        """
        words = []
        for list_class in list_classes:
            path = (
                f".//span[@class='thes-list {list_class}']"
                "/div[@class='thes-list-content synonyms_list']//li//a"
            )
            words.extend(ot(anchor) for anchor in sense.findall(path))
        return words

    @property
    def thes(self):
        """Map each definition text to its examples and related words.

        Every value is a dict with the keys ``examples``, ``synonyms``,
        ``near synonyms``, ``near antonyms`` and ``antonyms``, each holding
        a list. If the same definition text occurs more than once, the
        last occurrence wins. Reading this property does not modify
        ``self.root``.
        """
        senses = {}
        for sense in self._sense_blocks():
            for dt in sense.findall(".//span[@class='dt']"):
                definition, examples = self._split_definition(dt)
                related = {
                    key: self._linked_words(sense, list_classes)
                    for key, list_classes in self._RELATION_LISTS.items()
                }
                senses[definition] = {"examples": examples, **related}
        return senses

    @property
    def type(self):
        """Return the sorted, de-duplicated labels of the page.

        Labels are read from the ``important-blue-link`` anchors and passed
        through ``rb(..., "(", ")")`` (presumably drops a parenthesised
        part -- confirm against ``dict_dl``).
        """
        anchors = self.root.findall(".//a[@class='important-blue-link']")
        return sorted({rb(ot(anchor), "(", ")") for anchor in anchors})

    @property
    def neighbours(self):
        """Return the set of words linked from this page.

        Only anchors whose ``href`` starts with ``/thesaurus/`` count. The
        word is the last path segment with any ``#fragment`` and
        ``?query`` removed, passed through ``uq``.
        """
        found = set()
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href", "")
            if not href.startswith("/thesaurus/"):
                continue
            last_segment = href.split("/")[-1]
            link = last_segment.split("#")[0].split("?")[0]
            found.add(uq(link))
        return found

    def todict(self):
        """Return ``uqall({word: definitions plus a "type" entry})``.

        Raises:
            AssertionError: if both the type list and the definitions are
                empty.
        """
        # Evaluate each property once; both walk the whole page.
        types = self.type
        senses = self.thes
        assert (
            types or senses
        ), f"{self.time} {self.word}: type or definitions came back empty..."
        return uqall({self.word: senses | {"type": types}})
|
|
|
|
|
|
# Word looked up by the single-word debug run. Set to None to run the
# crawler instead.
DEBUG_WORD = "augur"


def main():
    """Run the debug lookup, or the crawler when ``DEBUG_WORD`` is None.

    Debug mode parses one word, prints its ``todict()`` result and returns.
    Crawler mode builds a ``Queue`` for this parser, loads its database and
    calls ``add_word()`` in an endless loop; it only stops on an exception
    or an interrupt.
    """
    if DEBUG_WORD is not None:
        w = MWThesaurusParser(DEBUG_WORD)
        print(w.todict())
        return

    q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
    q.loadDB()
    while True:
        q.add_word()


# Guarded so that importing this module neither hits the network nor
# terminates the importing process.
if __name__ == "__main__":
    main()
|