state of the parser

parent 9b33e9add8
commit f65a677cb4
@@ -6,7 +6,7 @@ from dict_dl import Queue, WordParser, ot, rb, uq, uqall
 class MWThesaurusParser(WordParser):
     def __init__(self, word):
         url_prefix = "https://www.merriam-webster.com/thesaurus/"
-        super().__init__(word, url_prefix)
+        super().__init__(word, url_prefix, clean=True)

     @property
     def thes(self):
@@ -75,9 +75,7 @@ class MWThesaurusParser(WordParser):
     @property
     def type(self):
         types = set()
-        for e in self.root.findall(
-            ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
-        ):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
             types.add(rb(ot(e), "(", ")"))
         return sorted(types)

@@ -97,9 +95,10 @@ class MWThesaurusParser(WordParser):
         return uqall({self.word: self.thes | {"type": self.type}})


-# w = MWThesaurusParser("coffining")
-# print(w.todict())
-# exit()
+w = MWThesaurusParser("augur")
+# print(w.neighbours)
+print(w.todict())
+exit()

 q = Queue(MWThesaurusParser, "en_MWThesaurus/", "_MWT.json")
 q.loadDB()
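
Note (editor's sketch, not part of the commit): the new clean=True argument is forwarded through WordParser.__init__ to url2str() in dict_dl.py (see the hunks below), so each parser chooses whether the fetched HTML is re-serialized through BeautifulSoup before the tag-repair passes run.

    # hedged sketch of the call chain; "augur" is the commit's own test word
    w = MWThesaurusParser("augur")   # -> WordParser(..., clean=True)
                                     # -> url2str(self.url, clean=True)
    # a hypothetical raw variant would call:
    # super().__init__(word, url_prefix, clean=False)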

d.py (new executable file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/python
+import os
+import sys
+from itertools import zip_longest
+
+from rich.console import Console
+from rich.table import Table
+
+from dict_dl import DictFile
+
+if len(sys.argv) < 2:
+    query = next(sys.stdin).strip()
+else:
+    query = sys.argv[1].strip()
+prefix = query[:3]
+
+d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
+
+print(f"||||||||{query}||||||||")
+for k, v in d[query].items():
+    print(k,v)
+    # if k != "type":
+    #     table = Table(title=k)
+    #     table.add_column("synonyms", justify="center", style="cyan", no_wrap=True)
+    #     table.add_column("near synonyms", justify="center", style="cyan", no_wrap=True)
+    #     table.add_column("near antonyms", justify="center", style="cyan", no_wrap=True)
+    #     table.add_column("antonyms", justify="center", style="cyan", no_wrap=True)
+    #     syns = v["synonyms"]
+    #     nsyns = v["related" if "related" in v else "near synonyms"]
+    #     ants = v["near antonyms"]
+    #     nants = v["antonyms"]
+    #     for s, ns, na, a in zip_longest(syns, nsyns, nants, ants, fillvalue=""):
+    #         table.add_row(s, ns, na, a)
+
+# console = Console()
+# console.print(table)
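
Note (editor's sketch, not part of the commit): d.py takes the query word from argv[1], or from the first line of stdin when no argument is given ("./d.py augur" and "echo augur | ./d.py" should behave the same), and the first three letters of the query select the JSON shard. The equivalent lookup from Python, assuming $DICT_DL is set:

    import os
    from dict_dl import DictFile

    query = "augur"                    # example word
    prefix = query[:3]                 # "aug" -> aug_MW.json shard
    d = DictFile(os.path.expandvars(f"$DICT_DL/en_MerriamWebster/{prefix}_MW.json"))
    for k, v in d[query].items():      # assumes DictFile supports dict-style indexing
        print(k, v)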
dict_dl.py (40 changed lines)
@@ -106,18 +106,24 @@ def only_text(e):
     return " ".join(all_text(e))


-def url2str(url: str) -> str:
+def url2str(url: str, clean=True) -> str:
     headers = {
         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
     }
     # bad_html = requests.get(url, headers=headers)
     bad_html = requests.get(url)
-    tree = BeautifulSoup(bad_html.text, features="lxml")
-    xml_str = str(tree)
+    if clean:
+        tree = BeautifulSoup(bad_html.text, features="html.parser")
+        xml_str = str(tree)
+    else:
+        xml_str = bad_html.text
+
+    xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
+    xml_str = re.sub(r"<>", "-->", xml_str)
     xml_str = remove_tag(xml_str, "head")
-    xml_str = remove_tag(xml_str)
-    # with open("test.html", "w") as f:
-    #     f.write(xml_str)
+    # xml_str = remove_tag(xml_str)
+    with open("test.html", "w") as f:
+        f.write(xml_str)
     return xml_str


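Note (editor's illustration, not part of the commit): the two re.sub calls added above are a lossy heuristic that rewrites malformed comment endings and stray "<>" into valid "-->" closers so the page parses as XML downstream; the [^!] guard leaves "<!--" openers alone, and the matched neighbour characters are consumed by the replacement:

    import re

    s = "<!-- note -- more <>"           # hypothetical broken fragment
    s = re.sub(r"[^!]--[^>]", "-->", s)  # " -- " -> "-->" (neighbours eaten)
    s = re.sub(r"<>", "-->", s)          # "<>" -> "-->"
    print(s)                             # <!-- note-->more -->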
@@ -134,11 +140,11 @@ class WordParser:
     - self.neighbours = words found on the site
     - self.todict() = returning a dict with the parsed info"""

-    def __init__(self, word, url_prefix):
+    def __init__(self, word, url_prefix, clean=True):
         self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
         self.word = uq(word)
         self.url = f"{url_prefix}{word}"
-        self.xml_string = url2str(self.url)
+        self.xml_string = url2str(self.url, clean=clean)
         self.root = ET.fromstring(self.xml_string)


@@ -149,6 +155,7 @@ class FileSet(set):
             super().__init__({line.strip() for line in open(self.file, "r")})
         else:
             super()
+        self -= {""}

     def load(self):
         if os.path.isfile(self.file):
@@ -156,10 +163,13 @@ class FileSet(set):
         else:
             super()

-    def save(self):
+    def save(self, sort=False):
         if self:
             with open(self.file, "w") as f:
-                f.write("\n".join([w for w in self if w]))
+                if sort:
+                    f.write("\n".join([w for w in sorted(self) if w]))
+                else:
+                    f.write("\n".join([w for w in self if w]))

     def append(self):
         if self and os.path.isfile(self.file):
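
Note (editor's sketch, not part of the commit): with the new flag, save(sort=True) writes a deterministic, alphabetically sorted list, which keeps the word files stable across the periodic saves. A minimal sketch, assuming the constructor takes the backing file path:

    fs = FileSet("words.txt")     # hypothetical path
    fs |= {"zebra", "augur", ""}
    fs.save(sort=True)            # file now holds "augur\nzebra";
                                  # empty strings are filtered out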
@@ -215,8 +225,8 @@ class Queue:
         )

     def wait(self):
-        if int(time.strftime("%M")) % 10 == 0:# cron job
-            self.words.save()
+        if int(time.time()) % 10 == 0:  # cron job
+            self.words.save(sort=True)
             self.queue.save()
             self.time_exponent = abs(self.time_exponent)
             a = self.time_base**self.time_exponent
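Note (editor's observation): the guard's meaning changes here. int(time.strftime("%M")) % 10 == 0 holds for one full minute out of every ten, whereas int(time.time()) % 10 == 0 holds for roughly one second out of every ten, so the save branch now fires far more frequently:

    import time
    time.strftime("%M")        # minute of the hour, e.g. "37"
    int(time.time()) % 10      # epoch seconds mod 10, cycles every 10 s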
@@ -226,7 +236,11 @@ class Queue:
     def loadDB(self):
         for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
             with open(db_file, "r") as f:
-                self.words |= set(json.load(f).keys())
+                try:
+                    self.words |= set(json.load(f).keys())
+                except json.decoder.JSONDecodeError:
+                    print(db_file, " corrupted")
+                    exit()

     def pick_random(self):
         self.redo.load()
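
Note (editor's illustration, not part of the commit): json.decoder.JSONDecodeError is what json.load raises on a truncated shard, so the new guard names the offending file before exiting instead of dying with an unlocated traceback:

    import json

    with open("broken.json", "w") as f:   # hypothetical corrupted shard
        f.write('{"augur": ')             # truncated JSON
    try:
        with open("broken.json") as f:
            json.load(f)
    except json.decoder.JSONDecodeError:
        print("broken.json", " corrupted")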
@@ -16,6 +16,11 @@ class MerriamWebsterParser(WordParser):
             definitions[ot(d)] = [
                 ot(ex) for ex in e.findall("./span[@class!='dtText']")
             ]
+        if not definitions:  # british spelling...
+            for entry in self.root.findall(f".//div[@id='dictionary-entry-1']"):
+                for e in entry.findall(".//span[@class='cxl']"):
+                    words = [ot(d) for d in entry.findall(".//a")]
+                    definitions[f'{ot(e)} {", ".join(words)}'] = [ ]
         return cw(definitions)

     @property
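Note (editor's reading, hedged): the fallback fires when a page yields no definitions, which happens for cross-reference-only entries such as British spellings; the span with class 'cxl' carries the cross-reference label and the anchors carry the target words, so a page saying "chiefly British spelling of" with a link to "color" would be recorded roughly as:

    # hypothetical result for such a page:
    # definitions['chiefly British spelling of color'] = []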
@@ -37,7 +42,7 @@ class MerriamWebsterParser(WordParser):
     @property
     def type(self):
         types = set()
-        for e in self.root.findall(".//*[@class='fl']"):
+        for e in self.root.findall(".//a[@class='important-blue-link']"):
             types.add(rb(ot(e), "(", ")"))
         return sorted(types)

@@ -126,8 +131,9 @@ class MerriamWebsterParser(WordParser):
         )


-# testword = "revivalist"
+# testword = "optimize"
 # d = MerriamWebsterParser(testword)
+# # print(d.definitions)
 # print(d.neighbours)
 # word_dict = d.todict()
 # for k, v in word_dict[testword].items():