improve MW scraper

julius 2023-09-05 14:00:29 +00:00
parent 874d4c744a
commit 43665cff6d
9 changed files with 147 additions and 104 deletions


@@ -1,6 +1,6 @@
 from itertools import chain
-from dict_dl import Queue, WordParser, ot, rb, uq, uqall
+from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text


 class MWThesaurusParser(WordParser):
@@ -12,64 +12,39 @@ class MWThesaurusParser(WordParser):
     def thes(self):
         thes = {}
         for i in range(1, 10):
-            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
-                for se in chain(
-                    entry.findall(".//div[@class='sb no-sn']"),
-                    entry.findall(".//div[@class='sb has-num']"),
-                ):
-                    for e in se.findall(".//span[@class='dt']"):
-                        examples = [ot(li) for li in e.findall(".//li")]
-                        for ul in e.findall(".//ul"):
-                            ul.clear()
-                        d = ot(e)
-                        thes[d] = {"examples": examples}
-                    thes[d]["synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["synonyms"].extend([
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ])
-                    thes[d]["near synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near synonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["near antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near antonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
+            for j in range(1, 10):
+                for entry in self.root.findall(
+                    f".//div[@id='thesaurus-entry-{i}-{j}']"
+                ):
+                    d = ""
+                    for e in entry.findall(".//span[@class='dt']"):
+                        d = only_first_text(e)
+                        thes[d] = {}
+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near synonyms"] = thes[d].get(
+                                "near synonyms", []
+                            ) + [ot(e)]
+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near antonyms"] = thes[d].get(
+                                "near antonyms", []
+                            ) + [ot(e)]
         return thes

     @property
@@ -92,13 +67,15 @@ class MWThesaurusParser(WordParser):
         assert (
             self.type or self.thes
         ), f"{self.time} {self.word}: type or definitions came back empty..."
-        return uqall({self.word: self.thes | {"type": self.type}})
+        return uqall(
+            {self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time}
+        )

-w = MWThesaurusParser("augur")
+# w = MWThesaurusParser("content")
 # print(w.neighbours)
-print(w.todict())
-exit()
+# print(w.todict())
+# exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()


@@ -1,12 +1,25 @@
 from dict_dl import FullDictionary

 # import matplotlib.pyplot as plt
 # from PIL import Image
 # from wordcloud import STOPWORDS, WordCloud

-d = FullDictionary("en_merriam_webster/", "_mw.json")
+d = FullDictionary("en_MerriamWebster/", "_MW.json")
 # d = Dictionary("en_MW_thesaurus/", "_mwt.json")
 # d = Dictionary("de_duden/", "_duden.json")
 print(f"{d.readtime:.06f}")
+print(
+    sorted(
+        [
+            k
+            for k in d
+            if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
+            and len(k) > 2
+            and k[-1] not in string.ascii_uppercase
+        ]
+    )
+)
 # print([k for k in d if not all([c in string.ascii_letters for c in k])])
 print([k for k in d if "?" in k])
 exit()
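The new print block lists abbreviation-like keys: entries with no vowels or underscores, longer than two characters, and not ending in an uppercase letter. One caveat: the hunk references string.ascii_uppercase, so an import string line is presumably present elsewhere in the file; it is not visible in this hunk. A tiny illustration of the filter on made-up keys:

# The new key filter on sample data (the real keys come from FullDictionary);
# "import string" is assumed to exist elsewhere in the file.
import string

keys = ["tsk", "DVD", "jynx", "crwth", "by", "_mw_meta", "MP3s"]
print(
    sorted(
        k
        for k in keys
        if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
        and len(k) > 2
        and k[-1] not in string.ascii_uppercase
    )
)
# -> ['MP3s', 'crwth', 'jynx', 'tsk']   ("DVD" ends uppercase, "by" is too short)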

d.py Executable file → Normal file

@@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
 print(f"||||||||{query}||||||||")
 for k, v in d[query].items():
-    print(k,v)
+    print(k, v)
     # if k != "type":
     #     table = Table(title=k)
     #     table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)


@@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str:
     xml_str = re.sub(r"<>", "-->", xml_str)
     xml_str = remove_tag(xml_str, "head")
     # xml_str = remove_tag(xml_str)
-    with open("test.html", "w") as f:
-        f.write(xml_str)
+    # with open("test.html", "w") as f:
+    #     f.write(xml_str)
     return xml_str


@@ -96,10 +96,10 @@ class DudenParser(WordParser):
 )

-# d = DudenParser("hinfallen")
-# print(d.neighbours)
-# print(d.todict())
-# exit()
+d = DudenParser("hineintauchen")
+print(d.neighbours)
+print(d.todict())
+exit()

 q = Queue(DudenParser, "de_Duden/", "_D.json")
 q.loadDB()


@@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False):
     if pw:
         # ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
         ps = [
-            "".join([w.capitalize() for i,w in enumerate(p) if i > 0])
+            "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])
             + random_char()
             + f"{random.randint(0,999):03d}"
             for p in phrases
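The one-line fix changes what the join produces: the old comprehension filtered on i > 0 and so dropped the first word entirely, while the new conditional keeps it lowercase and capitalizes the rest, yielding camelCase passphrases. A quick illustration with a made-up phrase (random_char() and the digit suffix left out):

# Before/after of the join fix on a sample three-word phrase.
p = ["quiet", "river", "stone"]

old = "".join([w.capitalize() for i, w in enumerate(p) if i > 0])
new = "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])

print(old)  # RiverStone      <- the first word was silently dropped
print(new)  # quietRiverStone <- camelCase keeps every word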


@@ -21,7 +21,7 @@ class MerriamWebsterParser(WordParser):
         for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
             for e in entry.findall(".//span[@class='cxl']"):
                 words = [ot(d) for d in entry.findall(".//a")]
-                definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
+                definitions[f'{ot(e)} {", ".join(words)}'] = []
         return cw(definitions)

     @property


@@ -39,11 +39,6 @@ class Pronunciation(SQLModel, table=True):
 engine = create_engine(db_pass)

-SQLModel.metadata.create_all(engine)
-html_session = HTMLSession()
-QUEUE = {line.strip() for line in open("queue.db", "rt")}

 def add_word(word):
@@ -78,6 +73,7 @@ def add_word(word):
             .where(Sense.word == _word)
             .where(Sense.word_class == _class)
         ).one_or_none()
+
         if results:
             sense = results
         else:
@@ -97,14 +93,26 @@ def add_word(word):
                 for sents in dt.find("span.sents"):
                     _example.append(sents.text)
                 _examples.append("; ".join(_example))
-            session.add(
-                Description(
-                    word=word.word,
-                    sense_id=sense.id,
-                    description="; ".join(_desc),
-                    examples=_examples,
-                )
-            )
+            _final_description = "; ".join(_desc)
+            results = session.exec(
+                select(Description).where(
+                    Description.word == word.word,
+                    Description.description == _final_description,
+                )
+            ).one_or_none()
+            if results:
+                continue
+            else:
+                session.add(
+                    Description(
+                        word=word.word,
+                        sense_id=sense.id,
+                        description=_final_description,
+                        examples=_examples,
+                    )
+                )
             for pron in c.find(
                 "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
             ):
@@ -117,16 +125,26 @@ def add_word(word):
     return links

-def present():
+def presently_available():
     with Session(engine) as session:
         return session.exec(select(Word.word)).unique()

-while True:
-    try:
-        QUEUE |= add_word(random.choice(list(QUEUE)))
-        QUEUE -= set(present())
-        print(len(QUEUE))
-        sleep(random.random() * 5)
-    except KeyboardInterrupt:
-        with open("queue.db", "wt") as f:
+if __name__ == "__main__":
+    SQLModel.metadata.create_all(engine)
+    html_session = HTMLSession()
+    QUEUE = {line.strip() for line in open("queue.db", "rt")}
+    while True:
+        try:
+            if len(QUEUE) < 20:
+                exit()
+            next_word = random.choice(list(QUEUE))
+            already_present = set(presently_available())
+            print(next_word, len(QUEUE), len(already_present))
+            QUEUE |= add_word(next_word)
+            QUEUE -= already_present | {next_word}
+            sleep(random.random() * 5)
+        except KeyboardInterrupt:
+            with open("queue.db", "wt") as f:

t.py

@@ -1,25 +1,60 @@
 #!/bin/python
+"""search the Merriam Webster Thesaurus with ease"""
+import argparse
 import os
-import sys
 from itertools import zip_longest

 from rich.console import Console
 from rich.table import Table

 from dict_dl import DictFile
+import string

-if len(sys.argv) < 2:
-    query = next(sys.stdin).strip()
-else:
-    query = sys.argv[1].strip()
-prefix = query[:3]
+letters = string.ascii_lowercase
+prefix_length = 3
+unusual = (
+    lambda prefix: not all([c in letters for c in prefix.lower()])
+    or len(prefix) < prefix_length
+)
+
+parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus")
+parser.add_argument("-p", "--preview", action="store_true", help="FZF preview")
+parser.add_argument("query", type=str, help="query")
+args = parser.parse_args()
+
+prefix = args.query[:prefix_length].lower()
+if unusual(prefix):
+    prefix = "_" * prefix_length

 d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))
+if args.preview:
+    for k, v in d[args.query].items():
+        if k == "type":
+            word_type = k
+        else:
+            syns = v["synonyms"]
+            nsyns = v["related" if "related" in v else "near synonyms"]
+            nants = v["near antonyms"]
+            ants = v["antonyms"]
+            print(f"> {k}")
+            if syns:
+                print(" SYNONYMS\n ", ", ".join(syns))
+            if nsyns:
+                print(" NEAR SYNONYMS\n ", ", ".join(nsyns))
+            if nants:
+                print(" NEAR ANTONYMS\n ", ", ".join(nants))
+            if ants:
+                print(" ANTONYMS\n ", ", ".join(ants))
+            print()
+    exit()

-print(f"||||||||{query}||||||||")
+print(f"||||||||{args.query}||||||||")
 print()
-for k, v in d[query].items():
-    if k != "type":
+for k, v in d[args.query].items():
+    if k == "type":
+        word_type = k
+    else:
         table = Table(title=k)
         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
         table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)