Compare commits

...

2 Commits

Author SHA1 Message Date
julius
b8b8e42b8e
autoflake 2023-11-20 17:49:47 +01:00
julius
52238ad9ae
browse through the MW dictionary 2023-11-20 17:48:21 +01:00
6 changed files with 38 additions and 8 deletions

View File

@ -1,4 +1,3 @@
from itertools import chain
from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text from dict_dl import Queue, WordParser, ot, rb, uq, uqall, only_first_text

3
d.py
View File

@ -1,10 +1,7 @@
#!/bin/python #!/bin/python
import os import os
import sys import sys
from itertools import zip_longest
from rich.console import Console
from rich.table import Table
from dict_dl import DictFile from dict_dl import DictFile

View File

@ -1,8 +1,7 @@
import random import random
from pathlib import Path
import uvicorn import uvicorn
from fastapi import FastAPI, Response from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from sqlmodel import Session, select, text from sqlmodel import Session, select, text

View File

@ -1,5 +1,4 @@
from dict_dl import Queue, WordParser, cw, ot, rb, uq, uqall from dict_dl import Queue, WordParser, cw, ot, rb, uq, uqall
import itertools
class MerriamWebsterParser(WordParser): class MerriamWebsterParser(WordParser):

36
mw_browser.py Normal file
View File

@ -0,0 +1,36 @@
import random
from pathlib import Path
from time import sleep
from string import ascii_lowercase
from urllib.parse import urlparse
from requests_html import HTMLSession
# Sections of the Merriam-Webster browse index to visit: the 26 lowercase
# letters plus three extra section names used verbatim in the URL.
letters = list(ascii_lowercase) + ["BIO", "GEO", "0-9"]

# Plain-text word store, one entry per line. Create it on first run so the
# read below never fails on a missing file.
db_file = "ALL.db"
if not Path(db_file).is_file():
    Path(db_file).touch()

html_session = HTMLSession()
url_stem = "https://www.merriam-webster.com"

# Every browse-page link seen so far; used to avoid fetching a page twice.
links = set()

# Load previously collected words. The context manager closes the handle
# (the bare open() in a comprehension leaked it), and blank lines are skipped
# so an empty string never ends up in the word set.
with open(db_file, "rt") as f:
    words = {line.strip() for line in f if line.strip()}

for letter in letters:
    # Reuse url_stem instead of repeating the host name.
    url = f"{url_stem}/browse/dictionary/{letter}"
    r = html_session.get(url)
    letter_links = {
        p.attrs["href"] for p in r.html.find(f'a[href*="browse/dictionary/{letter}/"]')
    }

    # Crawl only pages not visited for an earlier section; sorted() makes the
    # crawl order (and the printed log) reproducible between runs.
    for link in sorted(letter_links - links):
        print(link)
        r = html_session.get(url_stem + link)
        # NOTE(review): the substring selector below also matches the
        # "/browse/dictionary/..." pagination links on the page, so their last
        # path segment is stored as a word too - confirm whether that is wanted.
        words |= {
            Path(urlparse(word.attrs["href"]).path).name
            for word in r.html.find('a[href*="/dictionary/"]')
        }
        # Random 0-2 s pause between requests.
        sleep(random.random() * 2)
    links |= letter_links

    # Checkpoint after each section so an interrupted run keeps its progress.
    # Sorted output keeps the file stable and diff-friendly.
    with open(db_file, "wt") as f:
        f.write("\n".join(sorted(words)))

View File

@ -6,7 +6,7 @@ from time import sleep
from urllib.parse import urlparse from urllib.parse import urlparse
from requests_html import HTMLSession from requests_html import HTMLSession
from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select, Column, ARRAY, String from sqlmodel import Field, Session, SQLModel, create_engine, select, Column, ARRAY, String
with open("db.secrets", "r") as f: with open("db.secrets", "r") as f:
db_pass = f.readline().strip() db_pass = f.readline().strip()