Hold the dictionary in memory only partially
This commit is contained in:
parent
609ac67fc5
commit
2aa80f99e8
63
dict_dl.py
63
dict_dl.py
@ -5,13 +5,17 @@ import string
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import unquote as uq
|
||||
from urllib.parse import unquote
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import ConnectionError
|
||||
|
||||
# def uq(s):
|
||||
# return unquote(s).split("?")[0]
|
||||
uq = unquote
|
||||
|
||||
|
||||
def uqall(data):
|
||||
if isinstance(data, list):
|
||||
@ -153,7 +157,18 @@ class FileSet(set):
|
||||
self.save()
|
||||
|
||||
|
||||
class Dictionary(dict):
|
||||
class DictFile(dict):
    """Dictionary persisted as a JSON file.

    Construction reads the JSON document at *file* into this dict;
    ``save`` serializes the current contents back to the same path.
    """

    def __init__(self, file):
        self.file = file  # path the dict was loaded from and saves back to
        with open(self.file, "r") as fh:
            contents = json.load(fh)
        super().__init__(contents)

    def save(self):
        """Serialize this dict back to ``self.file`` as compact, indented JSON."""
        with open(self.file, "w") as fh:
            json.dump(self, fh, separators=(",", ":"), indent=2, sort_keys=False)
|
||||
|
||||
|
||||
class FullDictionary(dict):
|
||||
def __init__(self, dir_prefix, suffix):
|
||||
self.__dict__.update(locals())
|
||||
full_dict = {}
|
||||
@ -177,7 +192,7 @@ class Queue:
|
||||
):
|
||||
self.__dict__.update(locals())
|
||||
self.letters = string.ascii_lowercase
|
||||
self.full_dict = {}
|
||||
self.words = set()
|
||||
self.queue = FileSet(f"{dir_prefix}queue")
|
||||
self.snafus = FileSet(f"{dir_prefix}snafus")
|
||||
self.redo = FileSet(f"{dir_prefix}redo")
|
||||
@ -190,57 +205,41 @@ class Queue:
|
||||
def loadDB(self):
    """Load every on-disk DB shard into memory.

    Merges each ``*<suffix>`` JSON file under ``dir_prefix`` into
    ``self.full_dict``, then runs ``uqall`` over the merged result.
    """
    for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
        with open(db_file, "r") as f:
            # dict |= merges the shard; later shards win on key clashes
            self.full_dict |= json.load(f)
    # uqall presumably URL-unquotes keys/values recursively — defined
    # earlier in this file; applied once after all shards are merged
    self.full_dict = uqall(self.full_dict)
|
||||
|
||||
def updateDB(self, pick):
    """Write the shard of ``self.full_dict`` sharing *pick*'s prefix to disk.

    The shard file is ``{dir_prefix}{prefix}{suffix}``. Words whose
    first ``prefix_length`` characters are not all ASCII letters go to
    the catch-all ``"_" * prefix_length`` shard. All keys written are
    also recorded in ``self.words``.
    """
    prefix = pick[: self.prefix_length].lower()
    if all(c.lower() in self.letters for c in prefix):
        # shard: every entry sharing this alphabetical prefix
        c_db = {
            k: v
            for k, v in self.full_dict.items()
            if k[: self.prefix_length].lower() == prefix
        }
    else:
        # catch-all shard: entries whose prefix has any non-letter char
        c_db = {
            k: v
            for k, v in self.full_dict.items()
            if any(c.lower() not in self.letters for c in k[: self.prefix_length])
        }
        prefix = "_" * self.prefix_length

    with open(f"{self.dir_prefix}{prefix}{self.suffix}", "w") as f:  # save DB
        json.dump(c_db, f, separators=(",", ":"), indent=2, sort_keys=False)
    # BUG FIX: the original did ``self.words |= set(json.load(f).keys())``
    # on the handle opened in "w" mode, which raises io.UnsupportedOperation
    # (write-only stream is not readable). The keys are already in c_db.
    self.words |= set(c_db)
|
||||
|
||||
def add_word(self):
|
||||
self.redo.load()
|
||||
self.queue -= set(self.full_dict.keys())
|
||||
self.queue -= self.words
|
||||
self.queue -= self.snafus
|
||||
self.queue |= self.redo
|
||||
len_queue = len(self.queue) # actual queue
|
||||
p = random.choice(list(self.queue))
|
||||
try:
|
||||
start_parsing = time.time()
|
||||
prefix = p[: self.prefix_length].lower()
|
||||
w = self.Parser(p) # fetch new word
|
||||
word_dict = w.todict()
|
||||
start_db_stuff = time.time()
|
||||
self.full_dict |= word_dict
|
||||
|
||||
print(
|
||||
f"{p} | "
|
||||
f"{len(self.full_dict)} words collected, "
|
||||
f"{len(self.words)} words collected, "
|
||||
f"{len_queue} words waiting in queue"
|
||||
# f", {start_db_stuff-start_parsing:.06f}s"
|
||||
# f"/{time.time() - start_db_stuff:.06f}s"
|
||||
)
|
||||
|
||||
start_db_stuff = time.time()
|
||||
dict_part = DictFile(f"{self.dir_prefix}{prefix}{self.suffix}")
|
||||
dict_part |= word_dict
|
||||
dict_part.save()
|
||||
del dict_part
|
||||
|
||||
self.words |= set(word_dict.keys())
|
||||
|
||||
self.queue |= set(w.neighbours)
|
||||
self.queue -= {p}
|
||||
self.redo -= {p}
|
||||
self.redo.save()
|
||||
self.updateDB(p)
|
||||
self.wait()
|
||||
except (
|
||||
AssertionError,
|
||||
@ -261,3 +260,5 @@ class Queue:
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc smoke test: load the Merriam-Webster crawl queue and one
    # DB shard, then dump the shard to stdout.
    # NOTE(review): ``f`` is never used after construction — presumably
    # FileSet.__init__ has the side effect of reading/creating the queue
    # file; confirm before removing.
    f = FileSet("en_merriam_webster/queue")
    d = DictFile("en_merriam_webster/ab_mw.json")
    print(d)
|
||||
|
Loading…
Reference in New Issue
Block a user