Compare commits


No commits in common. "47130f30d7c3f6fac4b5a8c6144568c0a3379c43" and "874d4c744a7e8852dab8733b903b6ea1f98518d3" have entirely different histories.

9 changed files with 106 additions and 150 deletions


@@ -1,6 +1,6 @@
 from itertools import chain
-from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text
+from dict_dl import Queue, WordParser, ot, rb, uq, uqall


 class MWThesaurusParser(WordParser):
@@ -12,39 +12,64 @@ class MWThesaurusParser(WordParser):
     def thes(self):
         thes = {}
         for i in range(1, 10):
-            for j in range(1, 10):
-                for entry in self.root.findall(
-                    f".//div[@id='thesaurus-entry-{i}-{j}']"
-                ):
-                    d = ""
-                    for e in entry.findall(".//span[@class='dt']"):
-                        d = only_first_text(e)
-                    thes[d] = {}
-                    for relev in [4, 3]:
-                        for e in entry.findall(
-                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
-                        ):
-                            thes[d]["synonyms"] = thes[d].get("synonyms", []) + [ot(e)]
-                    for relev in [2, 1]:
-                        for e in entry.findall(
-                            f".//span[@class='thes-list sim-list-scored']//span[@class='lozenge color-{relev}']"
-                        ):
-                            thes[d]["near synonyms"] = thes[d].get(
-                                "near synonyms", []
-                            ) + [ot(e)]
-                    for relev in [4, 3]:
-                        for e in entry.findall(
-                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
-                        ):
-                            thes[d]["antonyms"] = thes[d].get("antonyms", []) + [ot(e)]
-                    for relev in [2, 1]:
-                        for e in entry.findall(
-                            f".//span[@class='thes-list opp-list-scored']//span[@class='lozenge color-{relev}']"
-                        ):
-                            thes[d]["near antonyms"] = thes[d].get(
-                                "near antonyms", []
-                            ) + [ot(e)]
+            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
+                for se in chain(
+                    entry.findall(".//div[@class='sb no-sn']"),
+                    entry.findall(".//div[@class='sb has-num']"),
+                ):
+                    for e in se.findall(".//span[@class='dt']"):
+                        examples = [ot(li) for li in e.findall(".//li")]
+                        for ul in e.findall(".//ul"):
+                            ul.clear()
+                        d = ot(e)
+                        thes[d] = {"examples": examples}
+                        thes[d]["synonyms"] = [
+                            ot(li)
+                            for li in se.findall(
+                                ".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                            )
+                        ]
+                        thes[d]["synonyms"].extend([
+                            ot(li)
+                            for li in se.findall(
+                                ".//span[@class='thes-list phrase-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                            )
+                        ])
+                        thes[d]["near synonyms"] = [
+                            ot(li)
+                            for li in se.findall(
+                                ".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                            )
+                        ]
+                        thes[d]["near synonyms"].extend(
+                            [
+                                ot(li)
+                                for li in se.findall(
+                                    ".//span[@class='thes-list sim-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                                )
+                            ]
+                        )
+                        thes[d]["near antonyms"] = [
+                            ot(li)
+                            for li in se.findall(
+                                ".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                            )
+                        ]
+                        thes[d]["near antonyms"].extend(
+                            [
+                                ot(li)
+                                for li in se.findall(
+                                    ".//span[@class='thes-list opp-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                                )
+                            ]
+                        )
+                        thes[d]["antonyms"] = [
+                            ot(li)
+                            for li in se.findall(
+                                ".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"
+                            )
+                        ]
         return thes

     @property
@@ -67,15 +92,13 @@ class MWThesaurusParser(WordParser):
         assert (
             self.type or self.thes
         ), f"{self.time} {self.word}: type or definitions came back empty..."
-        return uqall(
-            {self.word: self.thes} | {"type": self.type, "time_of_retrieval": self.time}
-        )
+        return uqall({self.word: self.thes | {"type": self.type}})


-# w = MWThesaurusParser("content")
+w = MWThesaurusParser("augur")
 # print(w.neighbours)
-# print(w.todict())
-# exit()
+print(w.todict())
+exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()


@@ -1,25 +1,12 @@
 from dict_dl import FullDictionary

 # import matplotlib.pyplot as plt
 # from PIL import Image
 # from wordcloud import STOPWORDS, WordCloud

-d = FullDictionary("en_MerriamWebster/", "_MW.json")
+d = FullDictionary("en_merriam_webster/", "_mw.json")
 # d = Dictionary("en_MW_thesaurus/", "_mwt.json")
 # d = Dictionary("de_duden/", "_duden.json")
 print(f"{d.readtime:.06f}")
-print(
-    sorted(
-        [
-            k
-            for k in d
-            if not any([c in ["a", "e", "i", "o", "u", "_"] for c in k.lower()])
-            and len(k) > 2
-            and k[-1] not in string.ascii_uppercase
-        ]
-    )
-)
 # print([k for k in d if not all([c in string.ascii_letters for c in k])])
 print([k for k in d if "?" in k])
 exit()

d.py Normal file → Executable file

@@ -18,7 +18,7 @@ d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
 print(f"||||||||{query}||||||||")
 for k, v in d[query].items():
-    print(k, v)
+    print(k,v)
 # if k != "type":
 # table = Table(title=k)
 # table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)


@@ -122,8 +122,8 @@ def url2str(url: str, clean=True) -> str:
     xml_str = re.sub(r"<>", "-->", xml_str)
     xml_str = remove_tag(xml_str, "head")
     # xml_str = remove_tag(xml_str)
-    # with open("test.html", "w") as f:
-    #     f.write(xml_str)
+    with open("test.html", "w") as f:
+        f.write(xml_str)
     return xml_str


@@ -96,10 +96,10 @@ class DudenParser(WordParser):
 )

-d = DudenParser("hineintauchen")
-print(d.neighbours)
-print(d.todict())
-exit()
+# d = DudenParser("hinfallen")
+# print(d.neighbours)
+# print(d.todict())
+# exit()

 q = Queue(DudenParser, "de_Duden/", "_D.json")
 q.loadDB()


@@ -80,7 +80,7 @@ def phrases(n: int = 4, nouns: int = 1, adjs: int = 2, pw: bool = False):
     if pw:
         # ps = [ "".join(p)[:-1] for p in [ [word + char for word, char in zip(p, [random_char() for w in p])] for p in phrases ] ]
         ps = [
-            "".join([w.capitalize() if i > 0 else w for i, w in enumerate(p)])
+            "".join([w.capitalize() for i,w in enumerate(p) if i > 0])
             + random_char()
             + f"{random.randint(0,999):03d}"
             for p in phrases


@@ -21,7 +21,7 @@ class MerriamWebsterParser(WordParser):
         for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
             for e in entry.findall(".//span[@class='cxl']"):
                 words = [ot(d) for d in entry.findall(".//a")]
-                definitions[f'{ot(e)} {", ".join(words)}'] = []
+                definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
         return cw(definitions)

     @property


@@ -1,12 +1,11 @@
 import random
-from typing import List
 import re
 from pathlib import Path
 from time import sleep
 from urllib.parse import urlparse

 from requests_html import HTMLSession
-from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select, Column, ARRAY, String
+from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select

 with open("db.secrets", "r") as f:
     db_pass = f.readline().strip()
@@ -29,7 +28,7 @@ class Description(SQLModel, table=True):
     word: str | None = Field(default=None, foreign_key="word.word")
     sense_id: int | None = Field(default=None, foreign_key="sense.id")
     description: str
-    examples: List[str] | None = Field(sa_column=Column(ARRAY(String)))
+    examples: list[str] | None

 class Pronunciation(SQLModel, table=True):
@@ -40,6 +39,11 @@ class Pronunciation(SQLModel, table=True):
 engine = create_engine(db_pass)
+SQLModel.metadata.create_all(engine)
+
+html_session = HTMLSession()
+
+QUEUE = {line.strip() for line in open("queue.db", "rt")}

 def add_word(word):
@@ -74,7 +78,6 @@ def add_word(word):
             .where(Sense.word == _word)
             .where(Sense.word_class == _class)
         ).one_or_none()
-
         if results:
             sense = results
         else:
@@ -94,26 +97,14 @@ def add_word(word):
                     for sents in dt.find("span.sents"):
                         _example.append(sents.text)
                     _examples.append("; ".join(_example))
-                _final_description = "; ".join(_desc)
-                results = session.exec(
-                    select(Description).where(
-                        Description.word == word.word,
-                        Description.description == _final_description,
-                    )
-                ).one_or_none()
-                if results:
-                    continue
-                else:
-                    session.add(
-                        Description(
-                            word=word.word,
-                            sense_id=sense.id,
-                            description=_final_description,
-                            examples=_examples,
-                        )
-                    )
+                session.add(
+                    Description(
+                        word=word.word,
+                        sense_id=sense.id,
+                        description="; ".join(_desc),
+                        examples=_examples,
+                    )
+                )
         for pron in c.find(
             "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
         ):
@@ -126,26 +117,16 @@ def add_word(word):
     return links


-def presently_available():
+def present():
     with Session(engine) as session:
         return session.exec(select(Word.word)).unique()


-if __name__ == "__main__":
-    SQLModel.metadata.create_all(engine)
-    html_session = HTMLSession()
-    QUEUE = {line.strip() for line in open("queue.db", "rt")}
-    while True:
-        try:
-            if len(QUEUE) < 20:
-                exit()
-            next_word = random.choice(list(QUEUE))
-            already_present = set(presently_available())
-            print(next_word, len(QUEUE), len(already_present))
-            QUEUE |= add_word(next_word)
-            QUEUE -= already_present | {next_word}
-            sleep(random.random() * 5)
-        except KeyboardInterrupt:
-            with open("queue.db", "wt") as f:
+while True:
+    try:
+        QUEUE |= add_word(random.choice(list(QUEUE)))
+        QUEUE -= set(present())
+        print(len(QUEUE))
+        sleep(random.random() * 5)
+    except KeyboardInterrupt:
+        with open("queue.db", "wt") as f:

t.py

@@ -1,60 +1,25 @@
 #!/bin/python
-"""search the Merriam Webster Thesaurus with ease"""
-
-import argparse
 import os
+import sys
 from itertools import zip_longest

 from rich.console import Console
 from rich.table import Table

 from dict_dl import DictFile
-import string
-
-letters = string.ascii_lowercase
-prefix_length = 3
-unusual = (
-    lambda prefix: not all([c in letters for c in prefix.lower()])
-    or len(prefix) < prefix_length
-)
-
-parser = argparse.ArgumentParser(description="Merriam Webster Thesaurus")
-parser.add_argument("-p", "--preview", action="store_true", help="FZF preview")
-parser.add_argument("query", type=str, help="query")
-args = parser.parse_args()
-
-prefix = args.query[:prefix_length].lower()
-if unusual(prefix):
-    prefix = "_" * prefix_length

+if len(sys.argv) < 2:
+    query = next(sys.stdin).strip()
+else:
+    query = sys.argv[1].strip()
+prefix = query[:3]
 d = DictFile(os.path.expandvars(f"$DICT_DL/en_MWThesaurus/{prefix}_MWT.json"))
-
-if args.preview:
-    for k, v in d[args.query].items():
-        if k == "type":
-            word_type = k
-        else:
-            syns = v["synonyms"]
-            nsyns = v["related" if "related" in v else "near synonyms"]
-            nants = v["near antonyms"]
-            ants = v["antonyms"]
-            print(f"> {k}")
-            if syns:
-                print(" SYNONYMS\n ", ", ".join(syns))
-            if nsyns:
-                print(" NEAR SYNONYMS\n ", ", ".join(nsyns))
-            if nants:
-                print(" NEAR ANTONYMS\n ", ", ".join(nants))
-            if ants:
-                print(" ANTONYMS\n ", ", ".join(ants))
-            print()
-    exit()
-
-print(f"||||||||{args.query}||||||||")
+print(f"||||||||{query}||||||||")
 print()
-for k, v in d[args.query].items():
-    if k == "type":
-        word_type = k
-    else:
+for k, v in d[query].items():
+    if k != "type":
         table = Table(title=k)
         table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
         table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)