state of the parser
This commit is contained in:
parent
9b33e9add8
commit
f65a677cb4
@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall
|
||||
class MWThesaurusParser(WordParser):
|
||||
def __init__(self, word):
    """Create a parser for the Merriam-Webster thesaurus page of *word*.

    The base-class initialiser is called exactly once, with ``clean=True``.
    (The base class downloads the page while initialising, so a second call
    would mean a second request.)
    """
    url_prefix = "https://www.merriam-webster.com/thesaurus/"
    super().__init__(word, url_prefix, clean=True)
|
||||
|
||||
@property
|
||||
def thes(self):
|
||||
@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser):
|
||||
@property
def type(self):
    """Return the sorted, de-duplicated type labels found on the page.

    Every ``<a class="important-blue-link">`` element below ``self.root`` is
    reduced to its text with ``ot`` and passed through ``rb(..., "(", ")")``
    (presumably stripping a parenthesised part -- confirm against ``rb``).
    """
    links = self.root.findall(".//a[@class='important-blue-link']")
    # A set drops duplicates; sorted() makes the result order deterministic.
    return sorted({rb(ot(link), "(", ")") for link in links})
|
||||
|
||||
@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser):
|
||||
return uqall({self.word: self.thes | {"type": self.type}})
|
||||
|
||||
|
||||
# w = MWThesaurusParser("coffining")
|
||||
# print(w.todict())
|
||||
# exit()
|
||||
w = MWThesaurusParser("augur")
|
||||
# print(w.neighbours)
|
||||
print(w.todict())
|
||||
exit()
|
||||
|
||||
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
|
||||
q.loadDB()
|
||||
|
36
d.py
Executable file
36
d.py
Executable file
@ -0,0 +1,36 @@
|
||||
#!/bin/python
|
||||
import os
|
||||
import sys
|
||||
from itertools import zip_longest
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from dict_dl import DictFile
|
||||
|
||||
# Take the word to look up from the first command-line argument, falling back
# to the first line of standard input when no argument is given.
if len(sys.argv) < 2:
    # The "" default keeps an empty stdin from raising a bare StopIteration.
    query = next(sys.stdin, "").strip()
else:
    query = sys.argv[1].strip()
if not query:
    sys.exit("usage: d.py WORD   (or pipe WORD on standard input)")

# The database file is selected by the first three characters of the query.
prefix = query[:3]

d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))

print(f"||||||||{query}||||||||")
try:
    entry = d[query]
except KeyError:
    # NOTE(review): assumes DictFile raises KeyError for unknown words like a
    # plain dict does -- confirm against dict_dl.DictFile.
    sys.exit(f"{query!r} not found in {prefix}_MW.json")

for k, v in entry.items():
    print(k, v)
    # Disabled table rendering, kept for reference.
    # NOTE(review): `ants` / `nants` below are read from the opposite keys
    # ("near antonyms" / "antonyms"); fix before re-enabling.
    # if k != "type":
    #     table = Table(title=k)
    #     table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("antonyms", justify="center", style="cyan", no_wrap=True)
    #     syns = v["synonyms"]
    #     nsyns = v["related" if "related" in v else "near synonyms"]
    #     ants = v["near antonyms"]
    #     nants = v["antonyms"]
    #     for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""):
    #         table.add_row(s, ns, na, a)
    #
    #     console = Console()
    #     console.print(table)
|
40
dict_dl.py
40
dict_dl.py
@ -106,18 +106,24 @@ def only_text(e):
|
||||
return " ".join(all_text(e))
|
||||
|
||||
|
||||
def url2str(url: str, clean=True) -> str:
    """Download *url* and return its markup, patched up for XML parsing.

    Args:
        url: Address of the page to fetch.
        clean: When true (the default) the response text is round-tripped
            through BeautifulSoup's ``html.parser`` before patching; when
            false the raw response text is patched directly.

    Returns:
        The patched markup as a string.

    Side effects:
        Performs an HTTP GET and overwrites ``test.html`` in the current
        working directory with the returned markup (debugging aid).
    """
    # NOTE: these headers are prepared but not sent; the request below goes
    # out with the library's default headers.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
    }
    # bad_html = requests.get(url, headers=headers)
    bad_html = requests.get(url)
    if clean:
        xml_str = str(BeautifulSoup(bad_html.text, features="html.parser"))
    else:
        xml_str = bad_html.text

    # Every "--" that is neither preceded by "!" nor followed by ">" is
    # replaced, together with its two neighbouring characters, by "-->"
    # (presumably to repair malformed comment terminators -- confirm).
    xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
    xml_str = re.sub(r"<>", "-->", xml_str)
    xml_str = remove_tag(xml_str, "head")
    # xml_str = remove_tag(xml_str)
    # Explicit UTF-8: scraped pages routinely contain characters that the
    # locale's default encoding cannot represent.
    with open("test.html", "w", encoding="utf-8") as f:
        f.write(xml_str)
    return xml_str
|
||||
|
||||
|
||||
@ -134,11 +140,11 @@ class WordParser:
|
||||
- self.neighbours = words found on the site
|
||||
- self.todict() = returning a dict with the parsed info"""
|
||||
|
||||
def __init__(self, word, url_prefix, clean=True):
    """Fetch the page for *word* and parse it into an element tree.

    Args:
        word: The word to look up; it is appended verbatim to *url_prefix*.
        url_prefix: Base URL of the dictionary site.
        clean: Forwarded unchanged to ``url2str``.

    Sets ``self.time`` (creation timestamp, ``YYYYmmdd-HHMMSS``),
    ``self.word`` (``uq(word)``), ``self.url``, ``self.xml_string`` and
    ``self.root``. The page is downloaded exactly once.
    """
    self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
    self.word = uq(word)
    self.url = f"{url_prefix}{word}"
    self.xml_string = url2str(self.url, clean=clean)
    self.root = ET.fromstring(self.xml_string)
|
||||
|
||||
|
||||
@ -149,6 +155,7 @@ class FileSet(set):
|
||||
super().__init__({line.strip() for line in open(self.file, "r")})
|
||||
else:
|
||||
super()
|
||||
self -= {""}
|
||||
|
||||
def load(self):
|
||||
if os.path.isfile(self.file):
|
||||
@ -156,10 +163,13 @@ class FileSet(set):
|
||||
else:
|
||||
super()
|
||||
|
||||
def save(self, sort=False):
    """Write the non-empty entries of this set to ``self.file``, one per line.

    Args:
        sort: When true the entries are written in sorted order; otherwise
            they are written in the set's iteration order.

    Nothing is written (and an existing file is left untouched) when the set
    is empty. The content is written exactly once and the file does not end
    with a trailing newline.
    """
    if not self:
        return
    entries = [w for w in self if w]
    if sort:
        entries.sort()
    with open(self.file, "w") as f:
        f.write("\n".join(entries))
|
||||
|
||||
def append(self):
|
||||
if self and os.path.isfile(self.file):
|
||||
@ -215,8 +225,8 @@ class Queue:
|
||||
)
|
||||
|
||||
def wait(self):
|
||||
if int(time.strftime("%M")) % 10 == 0:# cron job
|
||||
self.words.save()
|
||||
if int(time.time()) % 10 == 0: # cron job
|
||||
self.words.save(sort=True)
|
||||
self.queue.save()
|
||||
self.time_exponent = abs(self.time_exponent)
|
||||
a = self.time_base**self.time_exponent
|
||||
@ -226,7 +236,11 @@ class Queue:
|
||||
def loadDB(self):
    """Add the top-level keys of every database file to ``self.words``.

    Every file in ``self.dir_prefix`` whose name ends with ``self.suffix`` is
    read as JSON, and its top-level keys are merged into ``self.words``.

    Raises:
        SystemExit: If a database file does not contain valid JSON. The exit
            message names the offending file and the process exits with a
            non-zero status, so a corrupted database is visible to the
            calling shell or cron job.
    """
    for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
        with open(db_file, "r") as f:
            try:
                entries = json.load(f)
            except json.JSONDecodeError as err:
                # `exit()` is an interactive helper injected by `site`;
                # raising SystemExit works everywhere and reports failure.
                raise SystemExit(f"{db_file}  corrupted") from err
        self.words |= set(entries.keys())
|
||||
|
||||
def pick_random(self):
|
||||
self.redo.load()
|
||||
|
@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser):
|
||||
definitions[ot(d)] = [
|
||||
ot(ex) for ex in e.findall("./span[@class!='dtText']")
|
||||
]
|
||||
if not definitions: # british spelling...
|
||||
for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
|
||||
for e in entry.findall(".//span[@class='cxl']"):
|
||||
words = [ot(d) for d in entry.findall(".//a")]
|
||||
definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
|
||||
return cw(definitions)
|
||||
|
||||
@property
|
||||
@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser):
|
||||
@property
def type(self):
    """Return the sorted, de-duplicated type labels found on the page.

    Every ``<a class="important-blue-link">`` element below ``self.root`` is
    reduced to its text with ``ot`` and passed through ``rb(..., "(", ")")``
    (presumably stripping a parenthesised part -- confirm against ``rb``).
    """
    links = self.root.findall(".//a[@class='important-blue-link']")
    # A set drops duplicates; sorted() makes the result order deterministic.
    return sorted({rb(ot(link), "(", ")") for link in links})
|
||||
|
||||
@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser):
|
||||
)
|
||||
|
||||
|
||||
# testword = "revivalist"
|
||||
# testword = "optimize"
|
||||
# d = MerriamWebsterParser(testword)
|
||||
# # print(d.definitions)
|
||||
# print(d.neighbours)
|
||||
# word_dict = d.todict()
|
||||
# for k, v in word_dict[testword].items():
|
||||
|
Loading…
Reference in New Issue
Block a user