Compare commits


2 Commits

SHA1       Message                            Date
47130f30d7 explicit ARRAY type for SQLAlchemy 2023-11-20 15:52:07 +00:00
43665cff6d improve MW scraper                 2023-09-05 14:00:29 +00:00
9 changed files with 150 additions and 106 deletions
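
Note on the first commit ("explicit ARRAY type for SQLAlchemy"): SQLAlchemy cannot map a bare list[str] annotation to a column type on its own, which is why the Description model below gains an explicit sa_column. A minimal sketch of the pattern, using a hypothetical model name that is not part of this repo:

# sketch.py -- hypothetical example, not a file in this diff
from typing import List, Optional
from sqlmodel import ARRAY, Column, Field, SQLModel, String

class Note(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    # ARRAY is a Postgres-specific type; the explicit Column tells
    # SQLAlchemy how to store the Python list.
    tags: List[str] = Field(sa_column=Column(ARRAY(String)))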

View File

@@ -1,6 +1,6 @@
 from itertools import chain
-from dict_dl import Queue, WordParser, ot, rb, uq, uqall
+from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text


 class MWThesaurusParser(WordParser):
@@ -12,64 +12,39 @@ class MWThesaurusParser(WordParser):
     def thes(self):
         thes = {}
         for i in range(1, 10):
-            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
-                for se in chain(
-                    entry.findall(".//div[@class='sb no-sn']"),
-                    entry.findall(".//div[@class='sb has-num']"),
-                ):
-                    for e in se.findall(".//span[@class='dt']"):
-                        examples = [ot(li) for li in e.findall(".//li")]
-                        for ul in e.findall(".//ul"):
-                            ul.clear()
-                        d = ot(e)
-                        thes[d] = {"examples": examples}
-                    thes[d]["synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["synonyms"].extend([
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ])
-                    thes[d]["near synonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near synonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["near antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
-                    thes[d]["near antonyms"].extend(
-                        [
-                            ot(li)
-                            for li in se.findall(
-                                ".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                            )
-                        ]
-                    )
-                    thes[d]["antonyms"] = [
-                        ot(li)
-                        for li in se.findall(
-                            ".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"
-                        )
-                    ]
+            for j in range(1, 10):
+                for entry in self.root.findall(
+                    f".//div[@id='thesaurus-entry-{i}-{j}']"
+                ):
+                    d = ""
+                    for e in entry.findall(".//span[@class='dt']"):
+                        d = only_first_text(e)
+                    thes[d] = {}
+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near synonyms"] = thes[d].get(
+                                "near synonyms", []
+                            ) + [ot(e)]
+                    for relev in [4, 3]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)]
+                    for relev in [2, 1]:
+                        for e in entry.findall(
+                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
+                        ):
+                            thes[d]["near antonyms"] = thes[d].get(
+                                "near antonyms", []
+                            ) + [ot(e)]
         return thes

     @property
@@ -92,13 +67,15 @@ class MWThesaurusParser(WordParser):
         assert (
             self.type or self.thes
         ), f"{self.time} {self.word}: type or definitions came back empty..."
-        return uqall({self.word: self.thes | {"type": self.type}})
+        return uqall(
+            {self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time}
+        )

-w = MWThesaurusParser("augur")
+# w = MWThesaurusParser("content")
 # print(w.neighbours)
-print(w.todict())
-exit()
+# print(w.todict())
+# exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()

View File

@@ -1,12 +1,25 @@
+from dict_dl import FullDictionary
 # import matplotlib.pyplot as plt
 # from PIL import Image
 # from wordcloud import STOPWORDS, WordCloud

-d = FullDictionary("en_merriam_webster/", "_mw.json")
+d = FullDictionary("en_MerriamWebster/", "_MW.json")
 # d = Dictionary("en_MW_thesaurus/", "_mwt.json")
 # d = Dictionary("de_duden/", "_duden.json")
 print(f"{d.readtime:.06f}")
+print(
+    sorted(
+        [
+            k
+            for k in d
+            if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
+            and len(k) > 2
+            and k[-1] not in string.ascii_uppercase
+        ]
+    )
+)
 # print([k for k in d if not all([c in string.ascii_letters for c in k])])
 print([k for k in d if "?" in k])
 exit()
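
The new print(sorted(...)) block hunts for vowel-free headwords. A self-contained rerun of the same filter on made-up keys (note the string import, which the hunk itself does not show being added):

import string

keys = ["nth", "tsk", "DVD", "cat", "by"]
print(sorted(
    k for k in keys
    if not any(c in ["a", "e", "i", "o", "u", "_"] for c in k.lower())
    and len(k) > 2
    and k[-1] not in string.ascii_uppercase
))
# ['nth', 'tsk'] -- DVD ends in an uppercase letter, cat has a vowel, by is too short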

d.py (2 changes, Executable file → Normal file)
View File

@@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
 print(f"||||||||{query}||||||||")

 for k, v in d[query].items():
-    print(k,v)
+    print(k, v)
     # if k != "type":
     #     table = Table(title=k)
     #     table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)

View File

@@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str:
         xml_str = re.sub(r"<>", "-->", xml_str)
         xml_str = remove_tag(xml_str, "head")
         # xml_str = remove_tag(xml_str)
-    with open("test.html", "w") as f:
-        f.write(xml_str)
+    # with open("test.html", "w") as f:
+    #     f.write(xml_str)
     return xml_str

View File

@@ -96,10 +96,10 @@ class DudenParser(WordParser):
         )

-# d = DudenParser("hinfallen")
-# print(d.neighbours)
-# print(d.todict())
-# exit()
+d = DudenParser("hineintauchen")
+print(d.neighbours)
+print(d.todict())
+exit()

 q = Queue(DudenParser, "de_Duden/", "_D.json")
 q.loadDB()

View File

@@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False):
     if pw:
         # ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
         ps = [
-            "".join([w.capitalize() for i,w in enumerate(p) if i > 0])
+            "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])
             + random_char()
             + f"{random.randint(0,999):03d}"
             for p in phrases
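
This one-line fix changes the output, not just the style: the old comprehension silently dropped the first word of every phrase. A quick check with made-up words:

p = ["correct", "horse", "battery"]
print("".join([w.capitalize() for i, w in enumerate(p) if i > 0]))        # HorseBattery (first word lost)
print("".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])) # correctHorseBattery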

View File

@@ -17,11 +17,11 @@ class MerriamWebsterParser(WordParser):
                     definitions[ot(d)] = [
                         ot(ex) for ex in e.findall("./span[@class!='dtText']")
                     ]
-        if not definitions: # british spelling...
+        if not definitions:  # british spelling...
             for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
                 for e in entry.findall(".//span[@class='cxl']"):
                     words = [ot(d) for d in entry.findall(".//a")]
-                    definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
+                    definitions[f'{ot(e)} {", ".join(words)}'] = []
         return cw(definitions)

     @property

View File

@@ -1,11 +1,12 @@
 import random
+from typing import List
 import re
 from pathlib import Path
 from time import sleep
 from urllib.parse import urlparse

 from requests_html import HTMLSession
-from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select
+from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select, Column, ARRAY, String

 with open("db.secrets", "r") as f:
     db_pass = f.readline().strip()
@@ -28,7 +29,7 @@ class Description(SQLModel, table=True):
     word: str | None = Field(default=None, foreign_key="word.word")
     sense_id: int | None = Field(default=None, foreign_key="sense.id")
     description: str
-    examples: list[str] | None
+    examples: List[str] | None = Field(sa_column=Column(ARRAY(String)))


 class Pronunciation(SQLModel, table=True):
@@ -39,11 +40,6 @@ class Pronunciation(SQLModel, table=True):

 engine = create_engine(db_pass)
-SQLModel.metadata.create_all(engine)
-
-html_session = HTMLSession()
-
-QUEUE = {line.strip() for line in open("queue.db", "rt")}

 def add_word(word):
@@ -78,6 +74,7 @@ def add_word(word):
                     .where(Sense.word == _word)
                     .where(Sense.word_class == _class)
                 ).one_or_none()
+
                 if results:
                     sense = results
                 else:
@@ -97,14 +94,26 @@ def add_word(word):
                     for sents in dt.find("span.sents"):
                         _example.append(sents.text)
                     _examples.append("; ".join(_example))
-                session.add(
-                    Description(
-                        word=word.word,
-                        sense_id=sense.id,
-                        description="; ".join(_desc),
-                        examples=_examples,
-                    )
-                )
+
+                _final_description = "; ".join(_desc)
+                results = session.exec(
+                    select(Description).where(
+                        Description.word == word.word,
+                        Description.description == _final_description,
+                    )
+                ).one_or_none()
+                if results:
+                    continue
+                else:
+                    session.add(
+                        Description(
+                            word=word.word,
+                            sense_id=sense.id,
+                            description=_final_description,
+                            examples=_examples,
+                        )
+                    )
             for pron in c.find(
                 "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
             ):
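
The replacement above is the classic select-then-insert (get-or-create) guard, presumably because no unique constraint backs the table. A generic sketch of the same idea in SQLModel, with a hypothetical helper name:

from sqlmodel import Session, select

def get_or_create(session: Session, model, **filters):
    # Look the row up first; only stage an insert when nothing matches.
    stmt = select(model)
    for field, value in filters.items():
        stmt = stmt.where(getattr(model, field) == value)
    row = session.exec(stmt).one_or_none()
    if row is None:
        row = model(**filters)
        session.add(row)
    return row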
@@ -117,18 +126,28 @@ def add_word(word):
     return links

-def present():
+def presently_available():
     with Session(engine) as session:
         return session.exec(select(Word.word)).unique()

-while True:
-    try:
-        QUEUE |= add_word(random.choice(list(QUEUE)))
-        QUEUE -= set(present())
-        print(len(QUEUE))
-        sleep(random.random() * 5)
-    except KeyboardInterrupt:
-        with open("queue.db", "wt") as f:
-            f.write("\n".join(list(QUEUE)))
-        exit(0)
+if __name__ == "__main__":
+    SQLModel.metadata.create_all(engine)
+    html_session = HTMLSession()
+
+    QUEUE = {line.strip() for line in open("queue.db", "rt")}
+
+    while True:
+        try:
+            if len(QUEUE) < 20:
+                exit()
+            next_word = random.choice(list(QUEUE))
+            already_present = set(presently_available())
+            print(next_word, len(QUEUE), len(already_present))
+            QUEUE |= add_word(next_word)
+            QUEUE -= already_present | {next_word}
+            sleep(random.random() * 5)
+        except KeyboardInterrupt:
+            with open("queue.db", "wt") as f:
+                f.write("\n".join(list(QUEUE)))
+            exit(0)
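
Moving SQLModel.metadata.create_all, the HTMLSession, and the queue-file read under the main guard keeps imports side-effect free: the table models can now be imported elsewhere without opening a database connection or reading queue.db. The same effect, whether inlined as above or factored out (a common variant, sketched with an assumed main() name):

def main() -> None:
    ...  # create tables, open the HTTP session, run the scrape loop

if __name__ == "__main__":
    main()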

t.py (53 changes)
View File

@@ -1,25 +1,60 @@
 #!/bin/python
+"""search the Merriam Webster Thesaurus with ease"""
+import argparse
 import os
-import sys
 from itertools import zip_longest

 from rich.console import Console
 from rich.table import Table

 from dict_dl import DictFile
+import string

-if len(sys.argv) < 2:
-    query = next(sys.stdin).strip()
-else:
-    query = sys.argv[1].strip()
-prefix = query[:3]
+letters = string.ascii_lowercase
+prefix_length = 3
+unusual = (
+    lambda prefix: not all([c in letters for c in prefix.lower()])
+    or len(prefix) < prefix_length
+)
+
+parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus")
+parser.add_argument("-p", "--preview", action="store_true", help="FZF preview")
+parser.add_argument("query", type=str, help="query")
+args = parser.parse_args()
+
+prefix = args.query[:prefix_length].lower()
+if unusual(prefix):
+    prefix = "_" * prefix_length

 d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))

-print(f"||||||||{query}||||||||")
+if args.preview:
+    for k, v in d[args.query].items():
+        if k == "type":
+            word_type = k
+        else:
+            syns = v["synonyms"]
+            nsyns = v["related" if "related" in v else "near synonyms"]
+            nants = v["near antonyms"]
+            ants = v["antonyms"]
+            print(f"> {k}")
+            if syns:
+                print(" SYNONYMS\n ", ", ".join(syns))
+            if nsyns:
+                print(" NEAR SYNONYMS\n ", ", ".join(nsyns))
+            if nants:
+                print(" NEAR ANTONYMS\n ", ", ".join(nants))
+            if ants:
+                print(" ANTONYMS\n ", ", ".join(ants))
+            print()
+    exit()
+
+print(f"||||||||{args.query}||||||||")
 print()
-for k, v in d[query].items():
-    if k != "type":
+for k, v in d[args.query].items():
+    if k == "type":
+        word_type = k
+    else:
         table = Table(title=k)
         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
         table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)