From 2aa80f99e8e2238100b4277acb1ccd478c9fcee5 Mon Sep 17 00:00:00 2001
From: julius
Date: Sun, 10 Jul 2022 02:38:37 +0000
Subject: [PATCH] hold dictionary in memory only partially

---
 dict_dl.py | 63 +++++++++++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/dict_dl.py b/dict_dl.py
index 74dab2d5..5ecccad3 100644
--- a/dict_dl.py
+++ b/dict_dl.py
@@ -5,13 +5,17 @@ import string
 import time
 from datetime import datetime
 from pathlib import Path
-from urllib.parse import unquote as uq
+from urllib.parse import unquote
 from xml.etree import ElementTree as ET
 
 import requests
 from bs4 import BeautifulSoup
 from requests.exceptions import ConnectionError
 
+# def uq(s):
+#     return unquote(s).split("?")[0]
+uq = unquote
+
 
 def uqall(data):
     if isinstance(data, list):
@@ -153,7 +157,18 @@ class FileSet(set):
         self.save()
 
 
-class Dictionary(dict):
+class DictFile(dict):
+    def __init__(self, file):
+        self.file = file
+        with open(self.file, "r") as f:
+            super().__init__(json.load(f))
+
+    def save(self):
+        with open(self.file, "w") as f:
+            json.dump(self, f, separators=(",", ":"), indent=2, sort_keys=False)
+
+
+class FullDictionary(dict):
     def __init__(self, dir_prefix, suffix):
         self.__dict__.update(locals())
         full_dict = {}
@@ -177,7 +192,7 @@ class Queue:
     ):
         self.__dict__.update(locals())
         self.letters = string.ascii_lowercase
-        self.full_dict = {}
+        self.words = set()
         self.queue = FileSet(f"{dir_prefix}queue")
         self.snafus = FileSet(f"{dir_prefix}snafus")
         self.redo = FileSet(f"{dir_prefix}redo")
@@ -190,57 +205,41 @@ class Queue:
     def loadDB(self):
         for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
             with open(db_file, "r") as f:
-                self.full_dict |= json.load(f)
-        self.full_dict = uqall(self.full_dict)
-
-    def updateDB(self, pick):
-        start = time.time()
-
-        prefix = pick[: self.prefix_length].lower()
-        if all([c.lower() in self.letters for c in prefix]):
-            c_db = {
-                k: v
-                for k, v in self.full_dict.items()
-                if k[: self.prefix_length].lower() == prefix
-            }
-        else:
-            c_db = {
-                k: v
-                for k, v in self.full_dict.items()
-                if any([c.lower() not in self.letters for c in k[: self.prefix_length]])
-            }
-            prefix = "_" * self.prefix_length
-
-        with open(f"{self.dir_prefix}{prefix}{self.suffix}", "w") as f:  # save DB
-            json.dump(c_db, f, separators=(",", ":"), indent=2, sort_keys=False)
+                self.words |= set(json.load(f).keys())
 
     def add_word(self):
         self.redo.load()
-        self.queue -= set(self.full_dict.keys())
+        self.queue -= self.words
         self.queue -= self.snafus
         self.queue |= self.redo
         len_queue = len(self.queue)  # actual queue
         p = random.choice(list(self.queue))
         try:
             start_parsing = time.time()
+            prefix = p[: self.prefix_length].lower()
             w = self.Parser(p)  # fetch new word
             word_dict = w.todict()
-            start_db_stuff = time.time()
 
-            self.full_dict |= word_dict
             print(
                 f"{p} | "
-                f"{len(self.full_dict)} words collected, "
+                f"{len(self.words)} words collected, "
                 f"{len_queue} words waiting in queue"
                 # f", {start_db_stuff-start_parsing:.06f}s"
                 # f"/{time.time() - start_db_stuff:.06f}s"
             )
 
+            start_db_stuff = time.time()
+            dict_part = DictFile(f"{self.dir_prefix}{prefix}{self.suffix}")
+            dict_part |= word_dict
+            dict_part.save()
+            del dict_part
+
+            self.words |= set(word_dict.keys())
+
             self.queue |= set(w.neighbours)
             self.queue -= {p}
             self.redo -= {p}
             self.redo.save()
-            self.updateDB(p)
             self.wait()
         except (
             AssertionError,
@@ -261,3 +260,5 @@ class Queue:
 
 if __name__ == "__main__":
     f = FileSet("en_merriam_webster/queue")
+    d = DictFile("en_merriam_webster/ab_mw.json")
+    print(d)