hold dictionary in memory only partially
This commit is contained in:
parent
609ac67fc5
commit
2aa80f99e8
63
dict_dl.py
63
dict_dl.py
@ -5,13 +5,17 @@ import string
|
|||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import unquote as uq
|
from urllib.parse import unquote
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from requests.exceptions import ConnectionError
|
from requests.exceptions import ConnectionError
|
||||||
|
|
||||||
|
# def uq(s):
|
||||||
|
# return unquote(s).split("?")[0]
|
||||||
|
uq = unquote
|
||||||
|
|
||||||
|
|
||||||
def uqall(data):
|
def uqall(data):
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
@ -153,7 +157,18 @@ class FileSet(set):
|
|||||||
self.save()
|
self.save()
|
||||||
|
|
||||||
|
|
||||||
class Dictionary(dict):
|
class DictFile(dict):
    """A dict whose contents are loaded from, and saved to, one JSON file.

    The path is kept in ``self.file``. Nothing is written automatically;
    call ``save()`` to persist the current contents.
    """

    def __init__(self, file):
        """Populate the dict from the JSON object stored in *file*.

        Args:
            file: Path of the JSON file backing this dict.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        self.file = file
        try:
            with open(self.file, "r", encoding="utf-8") as f:
                content = json.load(f)
        except FileNotFoundError:
            # A file that has not been written yet is an empty dictionary,
            # not an error: a later save() creates it. Invalid JSON still
            # raises, so a damaged file is never silently overwritten.
            content = {}
        super().__init__(content)

    def save(self):
        """Write the current contents to ``self.file`` as JSON, replacing it."""
        with open(self.file, "w", encoding="utf-8") as f:
            json.dump(self, f, separators=(",", ":"), indent=2, sort_keys=False)
|
||||||
|
|
||||||
|
|
||||||
|
class FullDictionary(dict):
|
||||||
def __init__(self, dir_prefix, suffix):
|
def __init__(self, dir_prefix, suffix):
|
||||||
self.__dict__.update(locals())
|
self.__dict__.update(locals())
|
||||||
full_dict = {}
|
full_dict = {}
|
||||||
@ -177,7 +192,7 @@ class Queue:
|
|||||||
):
|
):
|
||||||
self.__dict__.update(locals())
|
self.__dict__.update(locals())
|
||||||
self.letters = string.ascii_lowercase
|
self.letters = string.ascii_lowercase
|
||||||
self.full_dict = {}
|
self.words = set()
|
||||||
self.queue = FileSet(f"{dir_prefix}queue")
|
self.queue = FileSet(f"{dir_prefix}queue")
|
||||||
self.snafus = FileSet(f"{dir_prefix}snafus")
|
self.snafus = FileSet(f"{dir_prefix}snafus")
|
||||||
self.redo = FileSet(f"{dir_prefix}redo")
|
self.redo = FileSet(f"{dir_prefix}redo")
|
||||||
@ -190,57 +205,41 @@ class Queue:
|
|||||||
def loadDB(self):
|
def loadDB(self):
|
||||||
for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
|
for db_file in Path(self.dir_prefix).glob(f"*{self.suffix}"):
|
||||||
with open(db_file, "r") as f:
|
with open(db_file, "r") as f:
|
||||||
self.full_dict |= json.load(f)
|
self.words |= set(json.load(f).keys())
|
||||||
self.full_dict = uqall(self.full_dict)
|
|
||||||
|
|
||||||
def updateDB(self, pick):
|
|
||||||
start = time.time()
|
|
||||||
|
|
||||||
prefix = pick[: self.prefix_length].lower()
|
|
||||||
if all([c.lower() in self.letters for c in prefix]):
|
|
||||||
c_db = {
|
|
||||||
k: v
|
|
||||||
for k, v in self.full_dict.items()
|
|
||||||
if k[: self.prefix_length].lower() == prefix
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
c_db = {
|
|
||||||
k: v
|
|
||||||
for k, v in self.full_dict.items()
|
|
||||||
if any([c.lower() not in self.letters for c in k[: self.prefix_length]])
|
|
||||||
}
|
|
||||||
prefix = "_" * self.prefix_length
|
|
||||||
|
|
||||||
with open(f"{self.dir_prefix}{prefix}{self.suffix}", "w") as f: # save DB
|
|
||||||
json.dump(c_db, f, separators=(",", ":"), indent=2, sort_keys=False)
|
|
||||||
|
|
||||||
def add_word(self):
|
def add_word(self):
|
||||||
self.redo.load()
|
self.redo.load()
|
||||||
self.queue -= set(self.full_dict.keys())
|
self.queue -= self.words
|
||||||
self.queue -= self.snafus
|
self.queue -= self.snafus
|
||||||
self.queue |= self.redo
|
self.queue |= self.redo
|
||||||
len_queue = len(self.queue) # actual queue
|
len_queue = len(self.queue) # actual queue
|
||||||
p = random.choice(list(self.queue))
|
p = random.choice(list(self.queue))
|
||||||
try:
|
try:
|
||||||
start_parsing = time.time()
|
start_parsing = time.time()
|
||||||
|
prefix = p[: self.prefix_length].lower()
|
||||||
w = self.Parser(p) # fetch new word
|
w = self.Parser(p) # fetch new word
|
||||||
word_dict = w.todict()
|
word_dict = w.todict()
|
||||||
start_db_stuff = time.time()
|
|
||||||
self.full_dict |= word_dict
|
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"{p} | "
|
f"{p} | "
|
||||||
f"{len(self.full_dict)} words collected, "
|
f"{len(self.words)} words collected, "
|
||||||
f"{len_queue} words waiting in queue"
|
f"{len_queue} words waiting in queue"
|
||||||
# f", {start_db_stuff-start_parsing:.06f}s"
|
# f", {start_db_stuff-start_parsing:.06f}s"
|
||||||
# f"/{time.time() - start_db_stuff:.06f}s"
|
# f"/{time.time() - start_db_stuff:.06f}s"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
start_db_stuff = time.time()
|
||||||
|
dict_part = DictFile(f"{self.dir_prefix}{prefix}{self.suffix}")
|
||||||
|
dict_part |= word_dict
|
||||||
|
dict_part.save()
|
||||||
|
del dict_part
|
||||||
|
|
||||||
|
self.words |= set(word_dict.keys())
|
||||||
|
|
||||||
self.queue |= set(w.neighbours)
|
self.queue |= set(w.neighbours)
|
||||||
self.queue -= {p}
|
self.queue -= {p}
|
||||||
self.redo -= {p}
|
self.redo -= {p}
|
||||||
self.redo.save()
|
self.redo.save()
|
||||||
self.updateDB(p)
|
|
||||||
self.wait()
|
self.wait()
|
||||||
except (
|
except (
|
||||||
AssertionError,
|
AssertionError,
|
||||||
@ -261,3 +260,5 @@ class Queue:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
f = FileSet("en_merriam_webster/queue")
|
f = FileSet("en_merriam_webster/queue")
|
||||||
|
d = DictFile("en_merriam_webster/ab_mw.json")
|
||||||
|
print(d)
|
||||||
|
Loading…
Reference in New Issue
Block a user