dict_dl/dict_dl.py

307 lines
8.4 KiB
Python
Raw Normal View History

2022-07-06 11:06:37 +00:00
import json
import os
2022-07-06 11:06:37 +00:00
import random
import re
import string
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import unquote
2022-07-06 11:06:37 +00:00
from xml.etree import ElementTree as ET
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
2022-07-06 11:06:37 +00:00
letters = string.ascii_lowercase

# short alias: percent-decode a URL-encoded string
uq = unquote


def uqall(data):
    """Recursively percent-decode every string in *data*, in place.

    Accepts a list, a dict, or a bare string (which is wrapped into a
    single-element list, so the return value is always a list or dict).
    Dict keys containing '%' are re-keyed to their decoded form before
    the values are processed. Returns *data* for convenience.
    """
    if isinstance(data, str):
        data = [data]
    if isinstance(data, list):
        items = enumerate(data)
    elif isinstance(data, dict):
        # decode keys first so the value pass below sees the final keys
        for key in list(data.keys()):
            if "%" in key:
                data[uq(key)] = data.pop(key)
        items = data.items()
    else:
        raise TypeError("can only traverse list or dict")
    for key, value in items:
        if isinstance(value, (list, dict)):
            uqall(value)
        elif isinstance(value, str) and "%" in value:
            data[key] = uq(value)
    return data
def clear_whitespace(data):
    """Recursively collapse whitespace runs in every string of *data*, in place.

    Accepts a list, a dict, or a bare string (wrapped into a one-element
    list, so the result is always a list or dict). Each string has runs of
    whitespace replaced by a single space and is stripped at both ends.
    Returns *data* for convenience.
    """
    if isinstance(data, str):
        data = [data]
    if isinstance(data, list):
        items = enumerate(data)
    elif isinstance(data, dict):
        items = data.items()
    else:
        raise TypeError("can only traverse list or dict")
    for key, value in items:
        if isinstance(value, (list, dict)):
            clear_whitespace(value)
        elif isinstance(value, str):
            data[key] = re.sub(r"[\n\t\s]+", " ", value).strip()
    return data
2022-07-06 11:06:37 +00:00
def randtime(a, b, k=0):
    """Uniform random float(s) in [a, b].

    With k == 0 (default) return a single sample; otherwise return a
    list of k samples.
    """
    if not k:
        return random.uniform(a, b)
    return [random.uniform(a, b) for _ in range(k)]
def remove_between(string, a, b):
    """Remove every span starting at marker *a* through the end of marker *b*.

    Recurses until no complete a...b pair remains, then returns the
    remainder stripped of surrounding whitespace.

    Fix: the original used 0 as the "not found" sentinel for both marker
    positions, so a marker *a* sitting at index 0 was never detected and
    the leading span survived. None sentinels make index 0 a valid match.
    """
    otag_pos = None
    ctag_pos = None
    for i in range(len(string)):
        if string[i : i + len(a)] == a:
            otag_pos = i
        elif string[i : i + len(b)] == b:
            ctag_pos = i + len(b)
        if otag_pos is not None and ctag_pos is not None:
            # cut the span out and rescan the shorter string
            return remove_between(string[:otag_pos] + string[ctag_pos:], a, b)
    return string.strip()
2022-07-06 11:06:37 +00:00
def remove_tag(string, tag="script"):
    """Remove every <tag ...>...</tag> element from *string* (default: script).

    Recurses until no complete element remains; unlike remove_between the
    result is NOT stripped of surrounding whitespace.

    Fix: the original used 0 as the "not found" sentinel, so an element
    opening at index 0 of the string was never removed. None sentinels
    make index 0 a valid match.
    """
    otag = f"<{tag}"
    ctag = f"</{tag}>"
    otag_pos = None
    ctag_pos = None
    for i in range(len(string)):
        if string[i : i + len(otag)] == otag:
            otag_pos = i
        elif string[i : i + len(ctag)] == ctag:
            ctag_pos = i + len(ctag)
        if otag_pos is not None and ctag_pos is not None:
            # cut the element out and rescan the shorter string
            return remove_tag(string[:otag_pos] + string[ctag_pos:], tag)
    return string
def all_text(e):
    """Return all text under element *e*, whitespace-normalized.

    Note: clear_whitespace wraps a bare string, so the result is a
    single-element list, not a plain string (see only_text).
    """
    return clear_whitespace(" ".join(e.itertext()))
2022-07-08 10:43:24 +00:00
def only_first_text(e):
    """Return only the first text fragment of element *e*, whitespace-normalized.

    Raises StopIteration if *e* yields no text at all.
    """
    return clear_whitespace([next(e.itertext())])[0]
2022-07-06 11:06:37 +00:00
2022-07-08 10:43:24 +00:00
def only_text(e):
    """Return all text under element *e* joined into one normalized string."""
    return " ".join(all_text(e))
2022-07-06 11:06:37 +00:00
2022-11-15 11:41:33 +00:00
def url2str(url: str, clean=True) -> str:
    """Download *url* and return its markup as a string.

    When *clean* is true, the response is round-tripped through
    BeautifulSoup's html.parser to normalize the markup. Malformed
    comment endings are then patched up and the whole <head> element is
    removed, so the result can be fed to ElementTree.

    NOTE(review): the result is also written to ./test.html on every
    call — looks like a leftover debugging aid; confirm before removing.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36"
    }
    # NOTE(review): `headers` is currently unused — the active request
    # below does not send it (the headers variant is commented out).
    # bad_html = requests.get(url, headers=headers)
    bad_html = requests.get(url)
    if clean:
        tree = BeautifulSoup(bad_html.text, features="html.parser")
        xml_str = str(tree)
    else:
        xml_str = bad_html.text
    # patch malformed comment terminators; NOTE(review): the first
    # substitution also consumes the characters around "--" — verify
    # that losing those characters is intended.
    xml_str = re.sub(r"[^!]--[^>]", "-->", xml_str)
    xml_str = re.sub(r"<>", "-->", xml_str)
    # drop the entire <head> element
    xml_str = remove_tag(xml_str, "head")
    # xml_str = remove_tag(xml_str)
    with open("test.html", "w") as f:
        f.write(xml_str)
    return xml_str
# aliases — short names for the helpers above
rb = remove_between
cw = clear_whitespace
ot = only_text
at = all_text
oft = only_first_text
2022-07-06 11:06:37 +00:00
class WordParser:
    """Base class for per-dictionary word page parsers.

    WordParser needs additional methods to work with Queue:

    - self.neighbours = words found on the site

    - self.todict() = returning a dict with the parsed info"""

    def __init__(self, word, url_prefix, clean=True):
        # fetch timestamp, e.g. "20220706-110637"
        self.time = datetime.now().strftime("%Y%m%d-%H%M%S")
        # percent-decoded form of the requested word
        self.word = uq(word)
        # the raw (still encoded) word is appended to the site prefix
        self.url = f"{url_prefix}{word}"
        # page markup fetched over the network (see url2str)
        self.xml_string = url2str(self.url, clean=clean)
        # parsed element tree; raises ET.ParseError on invalid markup,
        # which Queue.add_word treats as a "snafu"
        self.root = ET.fromstring(self.xml_string)
class FileSet(set):
    """A set of stripped text lines backed by a one-line-per-entry file.

    Fixes: files are now opened via context managers (the originals were
    never closed), and the meaningless bare `super()` statements were
    replaced with an explicit empty-set initialization.
    """

    def __init__(self, file):
        self.file = file  # path of the backing text file
        if os.path.isfile(self.file):
            with open(self.file, "r") as f:
                super().__init__(line.strip() for line in f)
        else:
            super().__init__()
        # drop blanks produced by empty lines in the file
        self.discard("")

    def load(self):
        """Merge the current file contents into the set (no-op if missing)."""
        if os.path.isfile(self.file):
            with open(self.file, "r") as f:
                self.update(line.strip() for line in f)

    def save(self, sort=False):
        """Write the set to the file, one entry per line; skipped when empty."""
        if self:
            with open(self.file, "w") as f:
                entries = sorted(self) if sort else self
                f.write("\n".join(w for w in entries if w))

    def append(self):
        """Merge the on-disk entries into the set, then persist the union."""
        if self and os.path.isfile(self.file):
            with open(self.file, "r") as f:
                self |= {line.strip() for line in f}
        self.save()
2022-07-06 11:06:37 +00:00
class DictFile(dict):
    """A dict persisted as a JSON file.

    Fix: the bare `super()` statement in the missing-file branch was a
    no-op; it is now an explicit empty-dict initialization.
    """

    def __init__(self, file):
        self.file = file  # path of the backing JSON file
        if os.path.isfile(self.file):
            with open(self.file, "r") as f:
                super().__init__(json.load(f))
        else:
            super().__init__()

    def save(self):
        """Write the dict to the file as compact, indented JSON."""
        with open(self.file, "w") as f:
            json.dump(self, f, separators=(",", ":"), indent=2, sort_keys=False)
class FullDictionary(dict):
    """All JSON dictionary parts under *dir_prefix* merged into one dict.

    Attributes:
        dir_prefix: directory (or prefix path) that is globbed for parts.
        suffix: filename suffix identifying the parts, e.g. "_MW.json".
        readtime: seconds spent reading and merging all parts.

    Fix: replaced `self.__dict__.update(locals())`, which also stored a
    self-referential `self.self` attribute, with explicit assignments.
    """

    def __init__(self, dir_prefix, suffix):
        self.dir_prefix = dir_prefix
        self.suffix = suffix
        full_dict = {}
        start = time.time()
        for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
            with open(db_file, "r") as f:
                full_dict |= json.load(f)
        self.readtime = time.time() - start
        super().__init__(full_dict)
2022-07-06 11:06:37 +00:00
class Queue:
    """Crawl scheduler: picks words, fetches them via *Parser*, persists results.

    Parser must follow the WordParser contract (self.neighbours and
    self.todict()). Parsed entries are sharded into DictFile parts named
    by the word's first `prefix_length` letters; crawl state lives in the
    FileSets words/queue/snafus/redo under *dir_prefix*.

    Fixes: explicit attribute assignment instead of
    `self.__dict__.update(locals())` (which also stored a self-referential
    `self.self`); `p is None` instead of `p == None`; the `unusual`
    lambda attribute is now a proper method (same call interface).
    """

    def __init__(
        self,
        Parser,
        dir_prefix,
        suffix,
        time_base=1.01,
        time_exponent=10,
        prefix_length=3,
    ):
        self.Parser = Parser
        self.dir_prefix = dir_prefix
        self.suffix = suffix
        # sleep interval is time_base ** time_exponent seconds (see wait)
        self.time_base = time_base
        self.time_exponent = time_exponent
        self.prefix_length = prefix_length
        self.words = FileSet(f"{dir_prefix}words")  # already fetched
        self.queue = FileSet(f"{dir_prefix}queue")  # still to fetch
        self.snafus = FileSet(f"{dir_prefix}snafus")  # permanently failed
        self.redo = FileSet(f"{dir_prefix}redo")  # manual re-fetch requests

    def unusual(self, prefix):
        """True if *prefix* is too short or contains non-ASCII-letter chars."""
        return (
            not all(c in letters for c in prefix.lower())
            or len(prefix) < self.prefix_length
        )

    def wait(self):
        """Persist state occasionally and sleep a randomized interval."""
        if int(time.time()) % 10 == 0:  # cron job
            self.words.save(sort=True)
        self.queue.save()
        # repeated queue hits drive the exponent negative; clamp it back
        self.time_exponent = abs(self.time_exponent)
        a = self.time_base**self.time_exponent
        b = self.time_base ** (self.time_exponent * 3)
        time.sleep(randtime(a, b))

    def loadDB(self):
        """Seed self.words with every key already stored in the DB parts."""
        for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
            with open(db_file, "r") as f:
                try:
                    self.words |= set(json.load(f).keys())
                except json.decoder.JSONDecodeError:
                    print(db_file, " corrupted")
                    exit()

    def pick_random(self):
        """Pick the next word: from the queue if possible, else a known word.

        Falling back to a known word slows the crawl down (exponent +1);
        a live queue speeds it up (exponent -20, clamped by wait()).
        """
        self.redo.load()
        self.queue -= self.words
        self.queue -= self.snafus
        self.queue |= self.redo
        if len(self.queue) < 1:
            p = random.choice(list(self.words))
            self.time_exponent += 1
        else:
            p = random.choice(list(self.queue))
            self.time_exponent -= 20
        return p

    def add_word(self, p=None):
        """Fetch word *p* (or a random pick), store it, and update the queues.

        AssertionError/ParseError mark the word as a snafu; ConnectionError
        backs off by raising the sleep exponent.
        """
        if p is None:
            p = self.pick_random()
        try:
            w = self.Parser(p)  # fetch new word
            print(p)
            word_dict = w.todict()
            # shard by the word's lowercased prefix; oddballs go to "___"
            prefix = p[: self.prefix_length].lower()
            if self.unusual(prefix):
                prefix = "_" * self.prefix_length
            dict_part = DictFile(f"{self.dir_prefix}{prefix}{self.suffix}")
            dict_part |= word_dict
            dict_part.save()
            del dict_part
            self.words |= set(word_dict.keys())
            self.queue |= set(w.neighbours)
            self.queue -= {p}
            self.redo -= {p}
            self.redo.save()
            self.wait()
        except (
            AssertionError,
            ET.ParseError,
        ):
            self.queue.save()
            print("snafu... ", p)
            self.redo -= {p}
            self.redo.save()
            self.snafus |= {p}
            self.snafus.append()
            self.wait()
        except ConnectionError:
            # network trouble: save progress and back off exponentially
            self.queue.save()
            self.time_exponent += 1
            self.wait()
2022-07-07 15:54:11 +00:00
if __name__ == "__main__":
    # Fix: this called the undefined name `collect_words` (NameError at
    # runtime). FullDictionary matches the (dir_prefix, suffix) call
    # signature and provides the `readtime` attribute referenced below.
    d = FullDictionary("en_MerriamWebster/", "_MW.json")
    print(len(set(d)))
    # print(d.readtime)
    time.sleep(3)
    print("del")
    del d
    time.sleep(3)