Hold only part of the dictionary in memory

This commit is contained in:
julius 2022-07-10 02:38:37 +00:00
parent 609ac67fc5
commit 2aa80f99e8

View File

@ -5,13 +5,17 @@ import string
import time import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from urllib.parse import unquote as uq from urllib.parse import unquote
from xml.etree import ElementTree as ET from xml.etree import ElementTree as ET
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError from requests.exceptions import ConnectionError
# def uq(s):
# return unquote(s).split("?")[0]
uq = unquote
def uqall(data): def uqall(data):
if isinstance(data, list): if isinstance(data, list):
@ -153,7 +157,18 @@ class FileSet(set):
self.save() self.save()
class DictFile(dict):
    """A ``dict`` whose contents are loaded from, and saved to, one JSON file.

    The path is kept in ``self.file``.  Nothing is written automatically:
    callers mutate the mapping and then call :meth:`save`.
    """

    def __init__(self, file):
        """Populate the mapping from the JSON document stored at *file*.

        Args:
            file: Path of the backing JSON file.

        A path that does not exist yet yields an empty mapping instead of
        raising, so that a brand-new part can be filled and then created by
        the first :meth:`save`.  Malformed JSON still raises
        ``json.JSONDecodeError``.
        """
        super().__init__()
        self.file = file
        try:
            with open(self.file, "r", encoding="utf-8") as f:
                self.update(json.load(f))
        except FileNotFoundError:
            # No part has been written for this path so far; start empty.
            pass

    def save(self):
        """Write the current contents back to ``self.file`` as JSON."""
        with open(self.file, "w", encoding="utf-8") as f:
            json.dump(self, f, separators=(",", ":"), indent=2, sort_keys=False)
class FullDictionary(dict):
def __init__(self, dir_prefix, suffix): def __init__(self, dir_prefix, suffix):
self.__dict__.update(locals()) self.__dict__.update(locals())
full_dict = {} full_dict = {}
@ -177,7 +192,7 @@ class Queue:
): ):
self.__dict__.update(locals()) self.__dict__.update(locals())
self.letters = string.ascii_lowercase self.letters = string.ascii_lowercase
self.full_dict = {} self.words = set()
self.queue = FileSet(f"{dir_prefix}queue") self.queue = FileSet(f"{dir_prefix}queue")
self.snafus = FileSet(f"{dir_prefix}snafus") self.snafus = FileSet(f"{dir_prefix}snafus")
self.redo = FileSet(f"{dir_prefix}redo") self.redo = FileSet(f"{dir_prefix}redo")
@ -190,57 +205,41 @@ class Queue:
def loadDB(self):
    """Collect the top-level keys of every stored dictionary file.

    Every file directly inside ``self.dir_prefix`` whose name ends with
    ``self.suffix`` is parsed as JSON and its keys are added to
    ``self.words``.  The parsed values are not retained.
    """
    for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
        with open(db_file, "r", encoding="utf-8") as f:
            # Iterating a mapping yields its keys, so no intermediate
            # set has to be built for each file.
            self.words.update(json.load(f))
def updateDB(self, pick):
    """Rewrite the on-disk dictionary file that *pick* falls into.

    Entries of ``self.full_dict`` are grouped by the first
    ``self.prefix_length`` characters of their key, lower-cased.

    * If every character of *pick*'s prefix is in ``self.letters``, all
      entries sharing that prefix are written to
      ``{dir_prefix}{prefix}{suffix}``.
    * Otherwise all entries whose own prefix contains at least one
      character outside ``self.letters`` are written together to the file
      named with ``"_" * prefix_length`` in place of the prefix.

    Args:
        pick: The key whose file should be regenerated.
    """
    n = self.prefix_length
    letters = self.letters
    prefix = pick[:n].lower()

    if all(c.lower() in letters for c in prefix):
        c_db = {
            k: v for k, v in self.full_dict.items() if k[:n].lower() == prefix
        }
    else:
        c_db = {
            k: v
            for k, v in self.full_dict.items()
            if any(c.lower() not in letters for c in k[:n])
        }
        # All non-alphabetic prefixes share one catch-all file.
        prefix = "_" * n

    with open(f"{self.dir_prefix}{prefix}{self.suffix}", "w", encoding="utf-8") as f:
        json.dump(c_db, f, separators=(",", ":"), indent=2, sort_keys=False)
def add_word(self): def add_word(self):
self.redo.load() self.redo.load()
self.queue -= set(self.full_dict.keys()) self.queue -= self.words
self.queue -= self.snafus self.queue -= self.snafus
self.queue |= self.redo self.queue |= self.redo
len_queue = len(self.queue) # actual queue len_queue = len(self.queue) # actual queue
p = random.choice(list(self.queue)) p = random.choice(list(self.queue))
try: try:
start_parsing = time.time() start_parsing = time.time()
prefix = p[: self.prefix_length].lower()
w = self.Parser(p) # fetch new word w = self.Parser(p) # fetch new word
word_dict = w.todict() word_dict = w.todict()
start_db_stuff = time.time()
self.full_dict |= word_dict
print( print(
f"{p} | " f"{p} | "
f"{len(self.full_dict)} words collected, " f"{len(self.words)} words collected, "
f"{len_queue} words waiting in queue" f"{len_queue} words waiting in queue"
# f", {start_db_stuff-start_parsing:.06f}s" # f", {start_db_stuff-start_parsing:.06f}s"
# f"/{time.time() - start_db_stuff:.06f}s" # f"/{time.time() - start_db_stuff:.06f}s"
) )
start_db_stuff = time.time()
dict_part = DictFile(f"{self.dir_prefix}{prefix}{self.suffix}")
dict_part |= word_dict
dict_part.save()
del dict_part
self.words |= set(word_dict.keys())
self.queue |= set(w.neighbours) self.queue |= set(w.neighbours)
self.queue -= {p} self.queue -= {p}
self.redo -= {p} self.redo -= {p}
self.redo.save() self.redo.save()
self.updateDB(p)
self.wait() self.wait()
except ( except (
AssertionError, AssertionError,
@ -261,3 +260,5 @@ class Queue:
if __name__ == "__main__": if __name__ == "__main__":
f = FileSet("en_merriam_webster/queue") f = FileSet("en_merriam_webster/queue")
d = DictFile("en_merriam_webster/ab_mw.json")
print(d)