improve MW scraper
This commit is contained in:
parent
874d4c744a
commit
43665cff6d
@ -1,6 +1,6 @@
|
|||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
|
||||||
from dict_dl import Queue, WordParser, ot, rb, uq, uqall
|
from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text
|
||||||
|
|
||||||
|
|
||||||
class MWThesaurusParser(WordParser):
|
class MWThesaurusParser(WordParser):
|
||||||
@ -12,64 +12,39 @@ class MWThesaurusParser(WordParser):
|
|||||||
def thes(self):
|
def thes(self):
|
||||||
thes = {}
|
thes = {}
|
||||||
for i in range(1, 10):
|
for i in range(1, 10):
|
||||||
for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
|
for j in range(1, 10):
|
||||||
for se in chain(
|
for entry in self.root.findall(
|
||||||
entry.findall(".//div[@class='sb no-sn']"),
|
f".//div[@id='thesaurus-entry-{i}-{j}']"
|
||||||
entry.findall(".//div[@class='sb has-num']"),
|
|
||||||
):
|
):
|
||||||
for e in se.findall(".//span[@class='dt']"):
|
d = ""
|
||||||
examples = [ot(li) for li in e.findall(".//li")]
|
for e in entry.findall(".//span[@class='dt']"):
|
||||||
for ul in e.findall(".//ul"):
|
d = only_first_text(e)
|
||||||
ul.clear()
|
thes[d] = {}
|
||||||
d = ot(e)
|
for relev in [4, 3]:
|
||||||
thes[d] = {"examples": examples}
|
for e in entry.findall(
|
||||||
thes[d]["synonyms"] = [
|
f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
|
||||||
ot(li)
|
):
|
||||||
for li in se.findall(
|
thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)]
|
||||||
".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
for relev in [2, 1]:
|
||||||
)
|
for e in entry.findall(
|
||||||
]
|
f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
|
||||||
thes[d]["synonyms"].extend([
|
):
|
||||||
ot(li)
|
thes[d]["near synonyms"] = thes[d].get(
|
||||||
for li in se.findall(
|
"near synonyms", []
|
||||||
".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
) + [ot(e)]
|
||||||
)
|
|
||||||
])
|
|
||||||
thes[d]["near synonyms"] = [
|
|
||||||
ot(li)
|
|
||||||
for li in se.findall(
|
|
||||||
".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
thes[d]["near synonyms"].extend(
|
|
||||||
[
|
|
||||||
ot(li)
|
|
||||||
for li in se.findall(
|
|
||||||
".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
thes[d]["near antonyms"] = [
|
|
||||||
ot(li)
|
|
||||||
for li in se.findall(
|
|
||||||
".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
thes[d]["near antonyms"].extend(
|
|
||||||
[
|
|
||||||
ot(li)
|
|
||||||
for li in se.findall(
|
|
||||||
".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
thes[d]["antonyms"] = [
|
|
||||||
ot(li)
|
|
||||||
for li in se.findall(
|
|
||||||
".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
for relev in [4, 3]:
|
||||||
|
for e in entry.findall(
|
||||||
|
f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
|
||||||
|
):
|
||||||
|
thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)]
|
||||||
|
for relev in [2, 1]:
|
||||||
|
for e in entry.findall(
|
||||||
|
f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
|
||||||
|
):
|
||||||
|
thes[d]["near antonyms"] = thes[d].get(
|
||||||
|
"near antonyms", []
|
||||||
|
) + [ot(e)]
|
||||||
return thes
|
return thes
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -92,13 +67,15 @@ class MWThesaurusParser(WordParser):
|
|||||||
assert (
|
assert (
|
||||||
self.type or self.thes
|
self.type or self.thes
|
||||||
), f"{self.time} {self.word}: type or definitions came back empty..."
|
), f"{self.time} {self.word}: type or definitions came back empty..."
|
||||||
return uqall({self.word: self.thes | {"type": self.type}})
|
return uqall(
|
||||||
|
{self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
w = MWThesaurusParser("augur")
|
# w = MWThesaurusParser("content")
|
||||||
# print(w.neighbours)
|
# print(w.neighbours)
|
||||||
print(w.todict())
|
# print(w.todict())
|
||||||
exit()
|
# exit()
|
||||||
|
|
||||||
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
|
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
|
||||||
q.loadDB()
|
q.loadDB()
|
||||||
|
15
analysis.py
15
analysis.py
@ -1,12 +1,25 @@
|
|||||||
|
from dict_dl import FullDictionary
|
||||||
|
|
||||||
# import matplotlib.pyplot as plt
|
# import matplotlib.pyplot as plt
|
||||||
# from PIL import Image
|
# from PIL import Image
|
||||||
# from wordcloud import STOPWORDS, WordCloud
|
# from wordcloud import STOPWORDS, WordCloud
|
||||||
|
|
||||||
d = FullDictionary("en_merriam_webster/", "_mw.json")
|
d = FullDictionary("en_MerriamWebster/", "_MW.json")
|
||||||
# d = Dictionary("en_MW_thesaurus/", "_mwt.json")
|
# d = Dictionary("en_MW_thesaurus/", "_mwt.json")
|
||||||
# d = Dictionary("de_duden/", "_duden.json")
|
# d = Dictionary("de_duden/", "_duden.json")
|
||||||
print(f"{d.readtime:.06f}")
|
print(f"{d.readtime:.06f}")
|
||||||
|
|
||||||
|
print(
|
||||||
|
sorted(
|
||||||
|
[
|
||||||
|
k
|
||||||
|
for k in d
|
||||||
|
if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
|
||||||
|
and len(k) > 2
|
||||||
|
and k[-1] not in string.ascii_uppercase
|
||||||
|
]
|
||||||
|
)
|
||||||
|
)
|
||||||
# print([k for k in d if not all([c in string.ascii_letters for c in k])])
|
# print([k for k in d if not all([c in string.ascii_letters for c in k])])
|
||||||
print([k for k in d if "?" in k])
|
print([k for k in d if "?" in k])
|
||||||
exit()
|
exit()
|
||||||
|
2
d.py
Executable file → Normal file
2
d.py
Executable file → Normal file
@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
|
|||||||
|
|
||||||
print(f"||||||||{query}||||||||")
|
print(f"||||||||{query}||||||||")
|
||||||
for k, v in d[query].items():
|
for k, v in d[query].items():
|
||||||
print(k,v)
|
print(k, v)
|
||||||
# if k != "type":
|
# if k != "type":
|
||||||
# table = Table(title=k)
|
# table = Table(title=k)
|
||||||
# table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
|
# table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
|
||||||
|
@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str:
|
|||||||
xml_str = re.sub(r"<>", "-->", xml_str)
|
xml_str = re.sub(r"<>", "-->", xml_str)
|
||||||
xml_str = remove_tag(xml_str, "head")
|
xml_str = remove_tag(xml_str, "head")
|
||||||
# xml_str = remove_tag(xml_str)
|
# xml_str = remove_tag(xml_str)
|
||||||
with open("test.html", "w") as f:
|
# with open("test.html", "w") as f:
|
||||||
f.write(xml_str)
|
# f.write(xml_str)
|
||||||
return xml_str
|
return xml_str
|
||||||
|
|
||||||
|
|
||||||
|
8
duden.py
8
duden.py
@ -96,10 +96,10 @@ class DudenParser(WordParser):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# d = DudenParser("hinfallen")
|
d = DudenParser("hineintauchen")
|
||||||
# print(d.neighbours)
|
print(d.neighbours)
|
||||||
# print(d.todict())
|
print(d.todict())
|
||||||
# exit()
|
exit()
|
||||||
|
|
||||||
q = Queue(DudenParser, "de_Duden/", "_D.json")
|
q = Queue(DudenParser, "de_Duden/", "_D.json")
|
||||||
q.loadDB()
|
q.loadDB()
|
||||||
|
2
main.py
2
main.py
@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False):
|
|||||||
if pw:
|
if pw:
|
||||||
# ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
|
# ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
|
||||||
ps = [
|
ps = [
|
||||||
"".join([w.capitalize() for i,w in enumerate(p) if i > 0])
|
"".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])
|
||||||
+ random_char()
|
+ random_char()
|
||||||
+ f"{random.randint(0,999):03d}"
|
+ f"{random.randint(0,999):03d}"
|
||||||
for p in phrases
|
for p in phrases
|
||||||
|
@ -17,11 +17,11 @@ class MerriamWebsterParser(WordParser):
|
|||||||
definitions[ot(d)] = [
|
definitions[ot(d)] = [
|
||||||
ot(ex) for ex in e.findall("./span[@class!='dtText']")
|
ot(ex) for ex in e.findall("./span[@class!='dtText']")
|
||||||
]
|
]
|
||||||
if not definitions: # british spelling...
|
if not definitions: # british spelling...
|
||||||
for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
|
for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
|
||||||
for e in entry.findall(".//span[@class='cxl']"):
|
for e in entry.findall(".//span[@class='cxl']"):
|
||||||
words = [ot(d) for d in entry.findall(".//a")]
|
words = [ot(d) for d in entry.findall(".//a")]
|
||||||
definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
|
definitions[f'{ot(e)} {", ".join(words)}'] = []
|
||||||
return cw(definitions)
|
return cw(definitions)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -39,11 +39,6 @@ class Pronunciation(SQLModel, table=True):
|
|||||||
|
|
||||||
|
|
||||||
engine = create_engine(db_pass)
|
engine = create_engine(db_pass)
|
||||||
SQLModel.metadata.create_all(engine)
|
|
||||||
|
|
||||||
html_session = HTMLSession()
|
|
||||||
|
|
||||||
QUEUE = {line.strip() for line in open("queue.db", "rt")}
|
|
||||||
|
|
||||||
|
|
||||||
def add_word(word):
|
def add_word(word):
|
||||||
@ -78,6 +73,7 @@ def add_word(word):
|
|||||||
.where(Sense.word == _word)
|
.where(Sense.word == _word)
|
||||||
.where(Sense.word_class == _class)
|
.where(Sense.word_class == _class)
|
||||||
).one_or_none()
|
).one_or_none()
|
||||||
|
|
||||||
if results:
|
if results:
|
||||||
sense = results
|
sense = results
|
||||||
else:
|
else:
|
||||||
@ -97,14 +93,26 @@ def add_word(word):
|
|||||||
for sents in dt.find("span.sents"):
|
for sents in dt.find("span.sents"):
|
||||||
_example.append(sents.text)
|
_example.append(sents.text)
|
||||||
_examples.append("; ".join(_example))
|
_examples.append("; ".join(_example))
|
||||||
session.add(
|
|
||||||
Description(
|
_final_description = "; ".join(_desc)
|
||||||
word=word.word,
|
results = session.exec(
|
||||||
sense_id=sense.id,
|
select(Description).where(
|
||||||
description="; ".join(_desc),
|
Description.word == word.word,
|
||||||
examples=_examples,
|
Description.description == _final_description,
|
||||||
)
|
)
|
||||||
)
|
).one_or_none()
|
||||||
|
if results:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
session.add(
|
||||||
|
Description(
|
||||||
|
word=word.word,
|
||||||
|
sense_id=sense.id,
|
||||||
|
description=_final_description,
|
||||||
|
examples=_examples,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
for pron in c.find(
|
for pron in c.find(
|
||||||
"span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
|
"span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
|
||||||
):
|
):
|
||||||
@ -117,18 +125,28 @@ def add_word(word):
|
|||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def present():
|
def presently_available():
|
||||||
with Session(engine) as session:
|
with Session(engine) as session:
|
||||||
return session.exec(select(Word.word)).unique()
|
return session.exec(select(Word.word)).unique()
|
||||||
|
|
||||||
|
|
||||||
while True:
|
if __name__ == "__main__":
|
||||||
try:
|
SQLModel.metadata.create_all(engine)
|
||||||
QUEUE |= add_word(random.choice(list(QUEUE)))
|
html_session = HTMLSession()
|
||||||
QUEUE -= set(present())
|
|
||||||
print(len(QUEUE))
|
QUEUE = {line.strip() for line in open("queue.db", "rt")}
|
||||||
sleep(random.random() * 5)
|
|
||||||
except KeyboardInterrupt:
|
while True:
|
||||||
with open("queue.db", "wt") as f:
|
try:
|
||||||
f.write("\n".join(list(QUEUE)))
|
if len(QUEUE) < 20:
|
||||||
exit(0)
|
exit()
|
||||||
|
next_word = random.choice(list(QUEUE))
|
||||||
|
already_present = set(presently_available())
|
||||||
|
print(next_word, len(QUEUE), len(already_present))
|
||||||
|
QUEUE |= add_word(next_word)
|
||||||
|
QUEUE -= already_present | {next_word}
|
||||||
|
sleep(random.random() * 5)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
with open("queue.db", "wt") as f:
|
||||||
|
f.write("\n".join(list(QUEUE)))
|
||||||
|
exit(0)
|
||||||
|
53
t.py
53
t.py
@ -1,25 +1,60 @@
|
|||||||
#!/bin/python
|
#!/bin/python
|
||||||
|
"""search the Merriam Webster Thesaurus with ease"""
|
||||||
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
|
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
|
|
||||||
from dict_dl import DictFile
|
from dict_dl import DictFile
|
||||||
|
import string
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
letters = string.ascii_lowercase
|
||||||
query = next(sys.stdin).strip()
|
prefix_length = 3
|
||||||
else:
|
unusual = (
|
||||||
query = sys.argv[1].strip()
|
lambda prefix: not all([c in letters for c in prefix.lower()])
|
||||||
prefix = query[:3]
|
or len(prefix) < prefix_length
|
||||||
|
)
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus")
|
||||||
|
parser.add_argument("-p", "--preview", action="store_true", help="FZF preview")
|
||||||
|
parser.add_argument("query", type=str, help="query")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
prefix = args.query[:prefix_length].lower()
|
||||||
|
|
||||||
|
if unusual(prefix):
|
||||||
|
prefix = "_" * prefix_length
|
||||||
d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))
|
d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))
|
||||||
|
|
||||||
print(f"||||||||{query}||||||||")
|
if args.preview:
|
||||||
|
for k, v in d[args.query].items():
|
||||||
|
if k == "type":
|
||||||
|
word_type = k
|
||||||
|
else:
|
||||||
|
syns = v["synonyms"]
|
||||||
|
nsyns = v["related" if "related" in v else "near synonyms"]
|
||||||
|
nants = v["near antonyms"]
|
||||||
|
ants = v["antonyms"]
|
||||||
|
print(f"> {k}")
|
||||||
|
if syns:
|
||||||
|
print(" SYNONYMS\n ", ", ".join(syns))
|
||||||
|
if nsyns:
|
||||||
|
print(" NEAR SYNONYMS\n ", ", ".join(nsyns))
|
||||||
|
if nants:
|
||||||
|
print(" NEAR ANTONYMS\n ", ", ".join(nants))
|
||||||
|
if ants:
|
||||||
|
print(" ANTONYMS\n ", ", ".join(ants))
|
||||||
|
print()
|
||||||
|
exit()
|
||||||
|
|
||||||
|
print(f"||||||||{args.query}||||||||")
|
||||||
print()
|
print()
|
||||||
for k, v in d[query].items():
|
for k, v in d[args.query].items():
|
||||||
if k != "type":
|
if k == "type":
|
||||||
|
word_type = k
|
||||||
|
else:
|
||||||
table = Table(title=k)
|
table = Table(title=k)
|
||||||
table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
|
table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
|
||||||
table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
|
table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user