dict_dl/mw_scraper.py

154 lines
5.0 KiB
Python
Raw Normal View History

2023-04-12 06:47:38 +00:00
import random
2023-11-20 15:52:07 +00:00
from typing import List
2023-04-12 06:47:38 +00:00
import re
from pathlib import Path
from time import sleep
from urllib.parse import urlparse
from requests_html import HTMLSession
2023-11-20 15:52:07 +00:00
from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select, Column, ARRAY, String
2023-04-12 06:47:38 +00:00
# Read the first line of ``db.secrets``. The stripped value is handed straight
# to ``create_engine``, so it is used as the full engine URL.
with open("db.secrets", "r") as f:
    db_pass = f.readline()
db_pass = db_pass.strip()
class Word(SQLModel, table=True):
    """Headword table: one row per distinct word (``word`` is unique)."""

    id: int | None = Field(primary_key=True, default=None)
    word: str = Field(unique=True, index=True)
class Sense(SQLModel, table=True):
    """A headword paired with one word class, as scraped by ``add_word``."""

    id: int | None = Field(primary_key=True, default=None)
    # References the headword text (``word.word``), not the numeric ``word.id``.
    word: str = Field(foreign_key="word.word", default=None)
    word_class: str | None
class Description(SQLModel, table=True):
    """One definition text for a word sense, with its usage examples."""

    id: int | None = Field(primary_key=True, default=None)
    word: str | None = Field(foreign_key="word.word", default=None)
    sense_id: int | None = Field(foreign_key="sense.id", default=None)
    description: str
    # Explicit SQLAlchemy column: examples are stored as an ARRAY of strings.
    examples: List[str] | None = Field(sa_column=Column(ARRAY(String)))
2023-04-12 06:47:38 +00:00
class Pronunciation(SQLModel, table=True):
    """One pronunciation string recorded for a word sense."""

    id: int | None = Field(primary_key=True, default=None)
    word: str | None = Field(foreign_key="word.word", default=None)
    # Holds a ``sense.id`` value (the column is named ``sense`` here).
    sense: int | None = Field(foreign_key="sense.id", default=None)
    pronunciation: str
# Module-wide database engine; every ``Session(engine)`` in this file uses it.
# ``db_pass`` (first line of ``db.secrets``) is passed as the engine URL.
engine = create_engine(db_pass)
def add_word(word):
    """Scrape one Merriam-Webster page and store its entries in the database.

    Fetches ``https://www.merriam-webster.com/dictionary/<word>`` through the
    module-level ``html_session`` (which is only created when this file runs
    as a script). For every ``div.entry-word-section-container`` on the page
    it gets or creates the ``Word`` and ``Sense`` rows, then adds the
    ``Description`` and ``Pronunciation`` rows that are not stored yet.

    Entries without a headword or without a part-of-speech heading are
    reported on stdout and skipped.

    Args:
        word: Headword to look up; interpolated into the URL as-is.

    Returns:
        set[str]: The last path component of every ``/dictionary/`` link
        found inside the processed entries.
    """
    url = f"https://www.merriam-webster.com/dictionary/{word}"
    r = html_session.get(url)
    # NOTE: JavaScript rendering (r.html.render()) is disabled; only the
    # HTML returned by the request is parsed.
    links = set()

    with Session(engine) as session:
        for entry in r.html.find("div.entry-word-section-container"):
            headword_el = entry.find("h1.hword,p.hword", first=True)
            if headword_el is None:
                # Without a headword there is nothing to key the rows on.
                # Skipping avoids an uncaught AttributeError on ``.text``.
                print("fail: entry without headword")
                continue
            _word = headword_el.text

            class_el = entry.find("h2.parts-of-speech", first=True)
            if class_el is None:
                print(f"fail: {_word}")
                continue
            # Strip any parenthesised part of the heading. The raw string
            # keeps "\(" from being treated as an invalid string escape.
            _class = re.sub(r"\(.*\)", "", class_el.text).strip()

            links |= {
                Path(urlparse(link.attrs["href"]).path).name
                for link in entry.find('a[href*="/dictionary/"]')
            }

            # Get-or-create the headword row. A separate local name is used
            # so the ``word`` parameter is not rebound to a model instance.
            word_row = session.exec(
                select(Word).where(Word.word == _word)
            ).one_or_none()
            if word_row is None:
                word_row = Word(word=_word)
                session.add(word_row)

            # Get-or-create the (headword, word class) sense row.
            sense = session.exec(
                select(Sense)
                .where(Sense.word == _word)
                .where(Sense.word_class == _class)
            ).one_or_none()
            if sense is None:
                sense = Sense(word=_word, word_class=_class)
                session.add(sense)

            # Commit now so a newly inserted sense has its primary key
            # before child rows reference it.
            session.commit()
            sense_id = sense.id

            for sn in entry.find("div.sense"):
                _desc = [
                    dttext.text
                    for dt in sn.find("span.dt")
                    for dttext in dt.find("span.dtText")
                ]
                # NOTE(review): examples are collected once per sense, as a
                # sibling of the definition-text loop; confirm this matches
                # the intended nesting.
                _examples = [
                    "; ".join(sents.text for sents in thread.find("span.sents"))
                    for thread in sn.find("div.sub-content-thread")
                ]

                _final_description = "; ".join(_desc)
                # ``first()`` is enough for an existence check and, unlike
                # ``one_or_none()``, does not raise when the table already
                # holds duplicate rows.
                stored_description = session.exec(
                    select(Description).where(
                        Description.word == _word,
                        Description.description == _final_description,
                    )
                ).first()
                if stored_description is None:
                    session.add(
                        Description(
                            word=_word,
                            sense_id=sense_id,
                            description=_final_description,
                            examples=_examples,
                        )
                    )

            for pron in entry.find(
                "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
            ):
                # De-duplicate the same way descriptions are, so scraping
                # an entry again does not pile up identical rows.
                stored_pronunciation = session.exec(
                    select(Pronunciation).where(
                        Pronunciation.word == _word,
                        Pronunciation.sense == sense_id,
                        Pronunciation.pronunciation == pron.text,
                    )
                ).first()
                if stored_pronunciation is None:
                    session.add(
                        Pronunciation(
                            word=_word, sense=sense_id, pronunciation=pron.text
                        )
                    )

            session.commit()

    return links
2023-09-05 14:00:29 +00:00
def presently_available():
    """Return every headword currently stored in the ``word`` table.

    Returns:
        list[str]: The distinct ``Word.word`` values, fully fetched so the
        caller can iterate them after the session has been closed.
    """
    with Session(engine) as session:
        # Fetch the rows while the session is still open. Returning the lazy
        # result object would leave the caller iterating it after the
        # ``with`` block has already closed the session.
        return session.exec(select(Word.word)).unique().all()
2023-09-05 14:00:29 +00:00
if __name__ == "__main__":
    # Crawl driver: repeatedly pick a random queued word, scrape it, and grow
    # the queue with the words its page links to, until fewer than 20 words
    # remain or the user interrupts. ``html_session`` and ``QUEUE`` stay
    # module-level globals because ``add_word`` reads ``html_session``.
    SQLModel.metadata.create_all(engine)
    html_session = HTMLSession()

    # Close the file handle after reading and ignore blank lines, which
    # would otherwise become an empty-string "word" in the queue.
    with open("queue.db", "rt") as queue_file:
        QUEUE = {line.strip() for line in queue_file if line.strip()}

    try:
        while len(QUEUE) >= 20:
            next_word = random.choice(list(QUEUE))
            already_present = set(presently_available())
            print(next_word, len(QUEUE), len(already_present))
            QUEUE |= add_word(next_word)
            # Drop everything already stored, plus the word just processed.
            QUEUE -= already_present | {next_word}
            # Random pause of up to 5 seconds between requests.
            sleep(random.random() * 5)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the crawl; exit quietly.
        pass
    finally:
        # Persist the queue on every exit path (interrupt, drained queue, or
        # an unexpected error) so crawl progress is not lost.
        with open("queue.db", "wt") as f:
            f.write("\n".join(QUEUE))