dict_dl/MW_thesaurus.py

68 lines
2.9 KiB
Python
Raw Normal View History

2022-07-08 10:43:24 +00:00
from itertools import chain
2022-07-10 02:48:14 +00:00
from dict_dl import Queue, WordParser, ot, rb, uq, uqall
2022-07-08 10:43:24 +00:00
class MWThesaurusParser(WordParser):
    """Parser for a single Merriam-Webster thesaurus page.

    The page for ``word`` is addressed below
    ``https://www.merriam-webster.com/thesaurus/``.  This class reads
    ``self.root``, ``self.word`` and ``self.time``; none of them is set
    here, so they are presumably provided by ``WordParser`` (confirm
    against the base class).
    """

    def __init__(self, word):
        """Create a parser for ``word`` using the thesaurus URL prefix."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix)

    @staticmethod
    def _list_words(sense, list_class):
        """Return ``ot(a)`` for every link in one word list of ``sense``.

        ``list_class`` is the second CSS class of the wrapping
        ``<span class='thes-list ...'>`` element, e.g. ``"syn-list"``.
        """
        xpath = (
            f".//span[@class='thes-list {list_class}']"
            "/div[@class='thes-list-content synonyms_list']//li//a"
        )
        return [ot(a) for a in sense.findall(xpath)]

    @property
    def thes(self):
        """Collect the thesaurus senses found on the page.

        Returns a dict keyed by ``ot()`` of each sense's ``dt`` span.
        Every value is a dict with the keys ``"examples"``,
        ``"synonyms"``, ``"near synonyms"``, ``"near antonyms"`` and
        ``"antonyms"`` (in that order), each mapping to a list.

        NOTE: this property mutates ``self.root``.  The ``<ul>`` example
        lists are removed from each ``dt`` span before its text is
        taken, so a second access no longer finds any examples.  Read it
        once and keep the result.
        """
        thes = {}
        for i in range(1, 10):
            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
                senses = chain(
                    entry.findall(".//div[@class='sb no-sn']"),
                    entry.findall(".//div[@class='sb has-num']"),
                )
                for sense in senses:
                    for dt in sense.findall(".//span[@class='dt']"):
                        examples = [ot(li) for li in dt.findall(".//li")]
                        # Drop the example lists so they do not end up in
                        # the text taken from the span below.
                        for ul in dt.findall(".//ul"):
                            dt.remove(ul)
                        definition = ot(dt)
                        thes[definition] = {
                            "examples": examples,
                            "synonyms": self._list_words(sense, "syn-list"),
                            "near synonyms": (
                                self._list_words(sense, "rel-list")
                                + self._list_words(sense, "sim-list")
                            ),
                            "near antonyms": (
                                self._list_words(sense, "near-list")
                                + self._list_words(sense, "opp-list")
                            ),
                            "antonyms": self._list_words(sense, "ant-list"),
                        }
        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated labels from the entry headers.

        Each label is ``rb(ot(span), "(", ")")`` for the ``fl`` spans
        inside the thesaurus entry-header rows.
        """
        types = set()
        for e in self.root.findall(
            ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
        ):
            types.add(rb(ot(e), "(", ")"))
        return sorted(types)

    @property
    def neighbours(self):
        """Return the set of words linked via ``/thesaurus/...`` hrefs.

        For every such link the last path segment is taken, with any
        ``#fragment`` and ``?query`` cut off, and passed through ``uq``.
        """
        neighbours = set()
        for e in self.root.findall(".//a"):
            href = e.attrib.get("href")
            if href is not None and href.startswith("/thesaurus/"):
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    def todict(self):
        """Return ``uqall({word: thes | {"type": type}})`` for this page.

        Raises:
            AssertionError: if both ``type`` and ``thes`` are empty.
        """
        # Evaluate each property exactly once: ``thes`` strips nodes from
        # the tree while it runs, so a second evaluation would come back
        # without examples.  ``thes`` is read before ``type`` so that
        # ``type`` sees the tree in the same state as it always has for
        # the returned value.
        thes = self.thes
        word_type = self.type
        if not (word_type or thes):
            # Raised explicitly (not via ``assert``) so the check still
            # runs under ``python -O``.
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall({self.word: thes | {"type": word_type}})
# Manual smoke test for a single word; uncomment to run it instead of
# the crawl below:
# w = MWThesaurusParser("coffining")
# print(w.todict())
# exit()

# Crawl driver: build the work queue for this parser, load its stored
# state, then keep adding words until the process is stopped or
# ``add_word`` raises.
q = Queue(
    MWThesaurusParser,
    "en_MW_thesaurus/",
    "_mwt.json",
    prefix_length=2,
)
q.loadDB()
while True:
    q.add_word()