state of the parser

This commit is contained in:
julius 2022-11-15 11:41:33 +00:00
parent 9b33e9add8
commit f65a677cb4
4 changed files with 77 additions and 22 deletions

View File

@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall
class MWThesaurusParser(WordParser):
def __init__(self, word):
url_prefix = "https://www.merriam-webster.com/thesaurus/"
super().__init__(word, url_prefix)
super().__init__(word, url_prefix, clean=True)
@property
def thes(self):
@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser):
@property
def type(self):
types = set()
for e in self.root.findall(
".//div[@class='row entry-header thesaurus']//span[@class='fl']"
):
for e in self.root.findall(".//a[@class='important-blue-link']"):
types.add(rb(ot(e), "(", ")"))
return sorted(types)
@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser):
return uqall({self.word: self.thes | {"type": self.type}})
# w = MWThesaurusParser("coffining")
# print(w.todict())
# exit()
w = MWThesaurusParser("augur")
# print(w.neighbours)
print(w.todict())
exit()
q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
q.loadDB()

36
d.py Executable file
View File

@ -0,0 +1,36 @@
#!/bin/python
import os
import sys
from itertools import zip_longest
from rich.console import Console
from rich.table import Table
from dict_dl import DictFile
# Look up one word in the local Merriam-Webster database and print its entry.
# The word comes from the first command-line argument or, when none is given,
# from the first line of standard input.
if len(sys.argv) < 2:
    # readline() returns "" at EOF, whereas next(sys.stdin) would raise
    # StopIteration when nothing is piped in.
    query = sys.stdin.readline().strip()
else:
    query = sys.argv[1].strip()

if not query:
    # Without a word there is no meaningful file name or key to look up.
    sys.exit("usage: d.py WORD  (or pipe the word on stdin)")

# The database file name is built from the first three characters of the word.
prefix = query[:3]
d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))

print(f"||||||||{query}||||||||")
for k, v in d[query].items():
    print(k, v)
    # Disabled rich-table rendering, kept for reference.
    # NOTE(review): `ants` reads "near antonyms" and `nants` reads "antonyms",
    # which looks swapped relative to the column order - confirm before
    # re-enabling.
    # if k != "type":
    #     table = Table(title=k)
    #     table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True)
    #     table.add_column("antonyms", justify="center", style="cyan", no_wrap=True)
    #     syns = v["synonyms"]
    #     nsyns = v["related" if "related" in v else "near synonyms"]
    #     ants = v["near antonyms"]
    #     nants = v["antonyms"]
    #     for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""):
    #         table.add_row(s, ns, na, a)
    #     console = Console()
    #     console.print(table)

View File

@ -106,18 +106,24 @@ def only_text(e):
return " ".join(all_text(e))
def url2str(url: str) -> str:
def url2str(url: str, clean=True) -> str:
headers = {
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
}
# bad_html = requests.get(url, headers=headers)
bad_html = requests.get(url)
tree = BeautifulSoup(bad_html.text, features="lxml")
if clean:
tree = BeautifulSoup(bad_html.text, features="html.parser")
xml_str = str(tree)
else:
xml_str = bad_html.text
xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
xml_str = re.sub(r"<>", "-->", xml_str)
xml_str = remove_tag(xml_str, "head")
xml_str = remove_tag(xml_str)
# with open("test.html", "w") as f:
# f.write(xml_str)
# xml_str = remove_tag(xml_str)
with open("test.html", "w") as f:
f.write(xml_str)
return xml_str
@ -134,11 +140,11 @@ class WordParser:
- self.neighbours = words found on the site
- self.todict() = returning a dict with the parsed info"""
def __init__(self, word, url_prefix):
def __init__(self, word, url_prefix, clean=True):
self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
self.word = uq(word)
self.url = f"{url_prefix}{word}"
self.xml_string = url2str(self.url)
self.xml_string = url2str(self.url, clean=clean)
self.root = ET.fromstring(self.xml_string)
@ -149,6 +155,7 @@ class FileSet(set):
super().__init__({line.strip() for line in open(self.file, "r")})
else:
super()
self -= {""}
def load(self):
if os.path.isfile(self.file):
@ -156,9 +163,12 @@ class FileSet(set):
else:
super()
def save(self):
def save(self, sort=False):
if self:
with open(self.file, "w") as f:
if sort:
f.write("\n".join([w for w in sorted(self) if w]))
else:
f.write("\n".join([w for w in self if w]))
def append(self):
@ -215,8 +225,8 @@ class Queue:
)
def wait(self):
if int(time.strftime("%M")) % 10 == 0:# cron job
self.words.save()
if int(time.time()) % 10 == 0: # cron job
self.words.save(sort=True)
self.queue.save()
self.time_exponent = abs(self.time_exponent)
a = self.time_base**self.time_exponent
@ -226,7 +236,11 @@ class Queue:
def loadDB(self):
    """Add the top-level keys of every database file to ``self.words``.

    Every file in ``self.dir_prefix`` whose name matches ``*{self.suffix}``
    is parsed as JSON and its keys are merged into ``self.words``.

    Raises:
        SystemExit: with status 1 when a file is not valid JSON. The name
            of the offending file is printed first.
    """
    for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
        with open(db_file, "r") as f:
            # Only the parse can raise JSONDecodeError, so keep the try minimal.
            try:
                entries = json.load(f)
            except json.decoder.JSONDecodeError as err:
                print(db_file, " corrupted")
                # The site-provided exit() is meant for interactive use and
                # would report success (status 0); signal the failure instead.
                raise SystemExit(1) from err
        self.words |= set(entries.keys())
def pick_random(self):
self.redo.load()

View File

@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser):
definitions[ot(d)] = [
ot(ex) for ex in e.findall("./span[@class!='dtText']")
]
if not definitions: # british spelling...
for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
for e in entry.findall(".//span[@class='cxl']"):
words = [ot(d) for d in entry.findall(".//a")]
definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
return cw(definitions)
@property
@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser):
@property
def type(self):
types = set()
for e in self.root.findall(".//*[@class='fl']"):
for e in self.root.findall(".//a[@class='important-blue-link']"):
types.add(rb(ot(e), "(", ")"))
return sorted(types)
@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser):
)
# testword = "revivalist"
# testword = "optimize"
# d = MerriamWebsterParser(testword)
# # print(d.definitions)
# print(d.neighbours)
# word_dict = d.todict()
# for k, v in word_dict[testword].items():