dict_dl/merriam_webster.py

149 lines
5.0 KiB
Python
Raw Normal View History

2022-07-10 03:16:16 +00:00
from dict_dl import Queue, WordParser, cw, ot, rb, uq, uqall
2022-07-06 11:06:37 +00:00
class MerriamWebsterParser(WordParser):
    """Extract dictionary data for one word from a merriam-webster.com page.

    Every property queries ``self.root`` with XPath-style ``findall`` calls.
    ``self.root``, ``self.word`` and ``self.time`` are not set in this class;
    presumably they are provided by ``WordParser`` -- verify against dict_dl.
    The helpers ``ot``, ``cw``, ``rb``, ``uq`` and ``uqall`` are imported from
    ``dict_dl``; their exact semantics are not visible in this file.
    """

    # Highest ``dictionary-entry-<n>`` div id that is probed for definitions.
    _MAX_ENTRIES = 19

    def __init__(self, word):
        """Initialise the base parser with the Merriam-Webster URL prefix."""
        url_prefix = "https://www.merriam-webster.com/dictionary/"
        super().__init__(word, url_prefix)

    def _labelled_paragraphs(self, section_query, plain_query, labelled_query):
        """Collect paragraphs of a page section into a ``{text: label}`` dict.

        Within every element matching *section_query*, each paragraph matching
        *plain_query* is stored with an empty label.  Afterwards the
        ``function-label`` paragraphs are paired positionally (``zip``) with
        the paragraphs matching *labelled_query*; each pair stores the label
        (passed through ``rb(..., "(", ")")``) under the paragraph's text,
        overwriting an earlier empty label for the same text.
        """
        collected = {}
        for section in self.root.findall(section_query):
            for paragraph in section.findall(plain_query):
                collected[ot(paragraph)] = ""
            labels = section.findall(".//p[@class='function-label']")
            paragraphs = section.findall(labelled_query)
            for label, paragraph in zip(labels, paragraphs):
                collected[ot(paragraph)] = rb(ot(label), "(", ")")
        return collected

    @property
    def definitions(self):
        """Map each definition text to the texts of its sibling spans.

        Entries ``dictionary-entry-1`` .. ``dictionary-entry-19`` are scanned.
        If nothing is found, the first entry's ``cxl`` spans are used instead
        (the original comment marks this as the "british spelling" case), each
        yielding one key with an empty list.  The result is passed through
        ``cw`` before being returned.
        """
        definitions = {}
        for i in range(1, self._MAX_ENTRIES + 1):
            for entry in self.root.findall(f".//div[@id='dictionary-entry-{i}']"):
                for e in entry.findall(".//span[@class='dt']"):
                    for d in e.findall("./span[@class='dtText']"):
                        # NOTE(review): the ``!=`` predicate needs Python 3.10+
                        # if root is a stdlib ElementTree element -- confirm.
                        definitions[ot(d)] = [
                            ot(ex) for ex in e.findall("./span[@class!='dtText']")
                        ]
        if not definitions:  # british spelling...
            for entry in self.root.findall(".//div[@id='dictionary-entry-1']"):
                for e in entry.findall(".//span[@class='cxl']"):
                    # Link texts are taken from the whole entry, not only
                    # from inside the ``cxl`` span.
                    words = [ot(d) for d in entry.findall(".//a")]
                    definitions[f'{ot(e)} {", ".join(words)}'] = []
        return cw(definitions)

    @property
    def neighbours(self):
        """Return the set of words linked via ``/dictionary/...`` hrefs.

        For every anchor whose ``href`` starts with ``/dictionary/`` the last
        path segment is taken, any ``#fragment`` or ``?query`` is cut off, and
        the result is passed through ``uq`` before being added to the set.
        """
        neighbours = set()
        for e in self.root.findall(".//a"):
            href = e.attrib.get("href", "")
            if href.startswith("/dictionary/"):
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    @property
    def pronounciation(self):
        """Return the distinct texts of all ``mw no-badge`` spans.

        Duplicates are dropped while keeping first-seen (document) order, so
        the result is deterministic between runs.
        """
        prs = [ot(e) for e in self.root.findall(".//span[@class='mw no-badge']")]
        return list(dict.fromkeys(prs))

    @property
    def type(self):
        """Return the sorted, de-duplicated texts of ``important-blue-link`` anchors.

        Each text is passed through ``rb(..., "(", ")")`` first.
        """
        types = set()
        for e in self.root.findall(".//a[@class='important-blue-link']"):
            types.add(rb(ot(e), "(", ")"))
        return sorted(types)

    @property
    def examples(self):
        """Return example sentences from the ``in-sentences``/``on-web`` blocks.

        The text fragments of every matching span are joined with single
        spaces; the resulting list is passed through ``cw``.
        """
        queries = [
            ".//*[@class='in-sentences']/span",
            ".//*[@class='in-sentences read-more-content-hint-container']/span",
            ".//*[@class='on-web']/span",
            ".//*[@class='on-web read-more-content-hint-container']/span",
        ]
        examples = []
        for query in queries:
            for e in self.root.findall(query):
                examples.append(" ".join(e.itertext()))
        return cw(examples)

    @property
    def first_known_use(self):
        """Return ``{text: label}`` for the ``first-known-content-section`` divs.

        ``ety-sl pb-3`` paragraphs get an empty label; ``ety-sl`` paragraphs
        are paired with the section's ``function-label`` paragraphs.  The
        result is passed through ``cw``.
        """
        uses = self._labelled_paragraphs(
            ".//div[@class='first-known-content-section']",
            ".//p[@class='ety-sl pb-3']",
            ".//p[@class='ety-sl']",
        )
        return cw(uses)

    @property
    def synonym_discussion(self):
        """Return ``ot`` of the last ``synonym-discussion-anchor`` div, or ``""``."""
        syndisc = ""
        for e in self.root.findall(".//div[@id='synonym-discussion-anchor']"):
            syndisc = ot(e)
        return syndisc

    @property
    def history_and_etymology(self):
        """Return ``{text: label}`` for the ``etymology-content-section`` divs.

        Every ``et`` paragraph is first stored with an empty label and then,
        where a ``function-label`` paragraph exists at the same position,
        re-stored with that label.  The result is passed through ``cw``.
        """
        hande = self._labelled_paragraphs(
            ".//div[@class='etymology-content-section']",
            ".//p[@class='et']",
            ".//p[@class='et']",
        )
        return cw(hande)

    @property
    def synonyms(self):
        """Return the first text fragment of each link in the first synonyms list.

        The first ``<ul>`` directly under ``div#synonyms-anchor`` is used; an
        empty list is returned when there is none.
        """
        uls = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if uls:
            return [next(a.itertext()) for a in uls[0].findall(".//a")]
        return []

    @property
    def antonyms(self):
        """Return the first text fragment of each link in the second synonyms list.

        The second ``<ul>`` directly under ``div#synonyms-anchor`` is used; an
        empty list is returned when fewer than two lists exist.
        """
        uls = self.root.findall(".//div[@id='synonyms-anchor']/ul")
        if len(uls) > 1:
            return [next(a.itertext()) for a in uls[1].findall(".//a")]
        return []

    def todict(self):
        """Assemble the scraped fields into ``{word: {...}}`` and apply ``uqall``.

        Raises:
            AssertionError: if both ``type`` and ``definitions`` are empty.
                Raised explicitly so the check is not removed under
                ``python -O``.
        """
        # Each property re-walks the tree, so evaluate them only once.
        word_type = self.type
        definitions = self.definitions
        if not (word_type or definitions):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall(
            {
                self.word: {
                    "type": word_type,
                    "definitions": definitions,
                    "pronounciation": self.pronounciation,
                    # "synonyms": self.synonyms,
                    # "antonyms": self.antonyms,
                    # "synonym_discussion": self.synonym_discussion,
                    "examples": self.examples,
                    "history_and_etymology": self.history_and_etymology,
                    "first_known_use": self.first_known_use,
                    "time_of_retrieval": self.time,
                }
            }
        )
2022-07-06 11:06:37 +00:00
# testword = "domicile"
# d = MerriamWebsterParser(testword)
# # print(d.definitions)
# print(d.neighbours)
# word_dict = d.todict()
# for k, v in word_dict[testword].items():
# print(f"### {k} ###\n", v)
# exit()
2022-07-06 11:06:37 +00:00
2022-07-10 04:35:09 +00:00
# Crawl driver: runs as soon as this module is executed or imported.
# NOTE(review): the two string arguments look like a storage directory and a
# file-name suffix -- confirm against dict_dl.Queue.
q = Queue(
    MerriamWebsterParser,
    "en_MerriamWebster/",
    "_MW.json",
)
q.loadDB()

# Keep adding words indefinitely; this loop has no exit condition of its own.
while True:
    q.add_word()