2022-07-08 10:43:24 +00:00
|
|
|
from dict_dl import Queue, WordParser, cw, ot, uq, rb, uqall
|
2022-07-06 11:06:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
class MerriamWebsterParser(WordParser):
    """Extract dictionary data for one word from a Merriam-Webster page.

    Every property queries ``self.root`` with ElementTree-style path
    expressions. ``self.root``, ``self.word`` and ``self.time`` are not set
    in this class; presumably ``WordParser.__init__`` provides them (the
    parsed page, the headword and the retrieval time) -- confirm in
    ``dict_dl``.

    NOTE(review): ``ot``, ``rb``, ``cw``, ``uq`` and ``uqall`` come from
    ``dict_dl`` and are opaque here. From their use they look like "text of
    an element", "remove bracketed part", "clean up a container", "unquote a
    string" and "unquote a whole structure" -- verify before relying on it.
    """

    def __init__(self, word):
        """Initialise the parser for ``word``.

        Args:
            word: Headword to look up. It is handed to ``WordParser``
                together with the Merriam-Webster dictionary URL prefix.
        """
        url_prefix = "https://www.merriam-webster.com/dictionary/"
        super().__init__(word, url_prefix)

    def _labelled_paragraphs(self, anchor_id, paragraph_class):
        """Collect ``<p>`` texts of one page section, keyed to their label.

        Shared implementation of ``first_known_use`` and
        ``history_and_etymology``.

        Args:
            anchor_id: ``id`` attribute of the ``<div>`` holding the section.
            paragraph_class: ``class`` attribute of the ``<p>`` elements
                whose text becomes the dictionary keys.

        Returns:
            ``cw`` applied to a dict mapping ``ot(paragraph)`` to
            ``rb(ot(label), "(", ")")`` of the function-label paired with it
            by position, or to ``""`` when there is no label for it.
        """
        collected = {}
        for section in self.root.findall(f".//div[@id='{anchor_id}']"):
            paragraphs = section.findall(f".//p[@class='{paragraph_class}']")
            # Default every paragraph to an empty label first: zip() below
            # stops at the shorter sequence, so unlabelled paragraphs would
            # otherwise be dropped.
            for paragraph in paragraphs:
                collected[ot(paragraph)] = ""
            labels = section.findall(".//p[@class='function-label']")
            for label, paragraph in zip(labels, paragraphs):
                collected[ot(paragraph)] = rb(ot(label), "(", ")")
        return cw(collected)

    def _thesaurus_links(self, position):
        """Return link texts from the ``position``-th list of the synonyms box.

        Shared implementation of ``synonyms`` (position 0) and ``antonyms``
        (position 1).

        Args:
            position: Zero-based index into the ``<ul>`` children of the
                ``synonyms-anchor`` ``<div>``.

        Returns:
            The first text fragment of every ``<a>`` inside that list, in
            document order; ``[]`` when the page has no such list.
        """
        lists = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if len(lists) <= position:
            return []
        words = []
        for anchor in lists[position].findall(".//a"):
            # An anchor without any text yields nothing from itertext();
            # skip it instead of letting StopIteration abort the whole entry.
            first_text = next(anchor.itertext(), None)
            if first_text is not None:
                words.append(first_text)
        return words

    @property
    def definitions(self):
        """Map each definition text to the texts of its accompanying spans.

        Looks at the ``<div>`` elements with ids ``dictionary-entry-1``
        through ``dictionary-entry-19``. Inside every ``dt`` span, each
        direct ``dtText`` child becomes a key; its value lists ``ot`` of the
        direct child spans selected by ``@class!='dtText'``. A definition
        text that occurs again overwrites the earlier value.

        Returns:
            ``cw`` applied to that dict.
        """
        definitions = {}
        # Entry ids are numbered from 1; pages with 20 or more entries are
        # only read up to entry 19.
        for entry_no in range(1, 20):
            entry_query = f".//div[@id='dictionary-entry-{entry_no}']"
            for entry in self.root.findall(entry_query):
                for sense in entry.findall(".//span[@class='dt']"):
                    for text_span in sense.findall("./span[@class='dtText']"):
                        definitions[ot(text_span)] = [
                            ot(extra)
                            for extra in sense.findall("./span[@class!='dtText']")
                        ]
        return cw(definitions)

    @property
    def neighbours(self):
        """Return the set of words this page links to.

        Every ``<a>`` whose ``href`` starts with ``/dictionary/`` counts.
        The word is the last path segment with any ``#fragment`` and
        ``?query`` cut off, passed through ``uq``.
        """
        neighbours = set()
        for anchor in self.root.findall(".//a"):
            href = anchor.attrib.get("href", "")
            if not href.startswith("/dictionary/"):
                continue
            last_segment = href.split("/")[-1]
            link = last_segment.split("#")[0].split("?")[0]
            neighbours.add(uq(link))
        return neighbours

    @property
    def pronounciation(self):
        """Return the distinct pronunciation texts of the page.

        The texts are ``ot`` of every element with class ``pr``. Duplicates
        are removed while keeping the order of first appearance, so the
        result is the same on every run (a plain ``set`` round-trip would
        order the strings arbitrarily).
        """
        found = (ot(element) for element in self.root.findall(".//*[@class='pr']"))
        return list(dict.fromkeys(found))

    @property
    def type(self):
        """Return the sorted, distinct functional labels of the page.

        Each label is ``rb(ot(element), "(", ")")`` of an element with
        class ``fl``.
        """
        return sorted(
            {
                rb(ot(element), "(", ")")
                for element in self.root.findall(".//*[@class='fl']")
            }
        )

    @property
    def examples(self):
        """Return example sentences from the in-sentences and on-web boxes.

        For every ``<span>`` directly below one of the four example
        containers, all text fragments of the span are joined with single
        spaces. Results are grouped by container query, in the order the
        queries are listed.

        Returns:
            ``cw`` applied to the list of joined sentences.
        """
        queries = [
            ".//*[@class='in-sentences']/span",
            ".//*[@class='in-sentences read-more-content-hint-container']/span",
            ".//*[@class='on-web']/span",
            ".//*[@class='on-web read-more-content-hint-container']/span",
        ]
        examples = [
            " ".join(span.itertext())
            for query in queries
            for span in self.root.findall(query)
        ]
        return cw(examples)

    @property
    def first_known_use(self):
        """Return the "first known use" statements with their labels.

        Reads ``<p class='ety-sl'>`` inside the ``first-known-anchor``
        ``<div>``; each statement maps to its function-label or to ``""``.
        """
        return self._labelled_paragraphs("first-known-anchor", "ety-sl")

    @property
    def synonym_discussion(self):
        """Return ``ot`` of the synonym-discussion section, or ``""``.

        When several ``synonym-discussion-anchor`` ``<div>`` elements exist,
        the last one wins.
        """
        syndisc = ""
        for section in self.root.findall(".//div[@id='synonym-discussion-anchor']"):
            syndisc = ot(section)
        return syndisc

    @property
    def history_and_etymology(self):
        """Return the etymology statements with their labels.

        Reads ``<p class='et'>`` inside the ``etymology-anchor`` ``<div>``;
        each statement maps to its function-label or to ``""``.
        """
        return self._labelled_paragraphs("etymology-anchor", "et")

    @property
    def synonyms(self):
        """Return link texts of the first list in the synonyms box."""
        return self._thesaurus_links(0)

    @property
    def antonyms(self):
        """Return link texts of the second list in the synonyms box."""
        return self._thesaurus_links(1)

    def todict(self):
        """Bundle everything extracted for the word into one dictionary.

        Returns:
            ``uqall`` applied to ``{self.word: {...}}`` where the inner dict
            holds the keys ``type``, ``definitions``, ``pronounciation``,
            ``synonyms``, ``antonyms``, ``synonym_discussion``,
            ``examples``, ``history_and_etymology``, ``first_known_use``
            and ``time_of_retrieval``.

        Raises:
            AssertionError: If both ``type`` and ``definitions`` are empty.
        """
        # Each property walks the tree again on every access; read the two
        # that are needed for both the check and the result only once.
        word_types = self.type
        definitions = self.definitions
        # Kept as an assert so the raised exception type stays the same for
        # callers; note that running Python with -O skips this check.
        assert (
            word_types or definitions
        ), f"{self.time} {self.word}: type or definitions came back empty..."
        return uqall({
            self.word: {
                "type": word_types,
                "definitions": definitions,
                "pronounciation": self.pronounciation,
                "synonyms": self.synonyms,
                "antonyms": self.antonyms,
                "synonym_discussion": self.synonym_discussion,
                "examples": self.examples,
                "history_and_etymology": self.history_and_etymology,
                "first_known_use": self.first_known_use,
                "time_of_retrieval": self.time,
            }
        })
|
2022-07-06 11:06:37 +00:00
|
|
|
|
|
|
|
|
2022-07-08 10:43:24 +00:00
|
|
|
# testword = "revivalist"
|
2022-07-07 17:08:30 +00:00
|
|
|
# d = MerriamWebsterParser(testword)
|
|
|
|
# print(d.neighbours)
|
|
|
|
# word_dict = d.todict()
|
|
|
|
# for k, v in word_dict[testword].items():
|
|
|
|
# print(f"### {k} ###\n", v)
|
|
|
|
# exit()
|
2022-07-06 11:06:37 +00:00
|
|
|
|
|
|
|
# Crawl driver: build a queue that parses words with MerriamWebsterParser.
# NOTE(review): the two string arguments look like an output directory and a
# file-name suffix, and prefix_length like a sharding width -- confirm
# against dict_dl.Queue.
q = Queue(
    MerriamWebsterParser,
    "en_merriam_webster/",
    "_mw.json",
    prefix_length=2,
)
q.loadDB()

# There is no exit condition here; the loop only ends when add_word() raises
# or the process is stopped from outside.
while True:
    q.add_word()
|