New implementation with SQLModel
This commit is contained in:
parent
0c11d07523
commit
5b9076eeb5
134
mw_scraper.py
Normal file
134
mw_scraper.py
Normal file
@ -0,0 +1,134 @@
|
||||
import random
import re
from pathlib import Path
from time import sleep
from urllib.parse import urlparse

from requests_html import HTMLSession
from sqlmodel import (
    JSON,
    Column,
    Field,
    Relationship,
    Session,
    SQLModel,
    create_engine,
    select,
)
|
||||
|
||||
# Read the first line of db.secrets. The value is handed unchanged to
# create_engine() further down, so despite the name `db_pass` it is used
# as the complete database URL, not just a password.
with open("db.secrets", "r") as f:
    db_pass = f.readline().strip()
|
||||
|
||||
|
||||
class Word(SQLModel, table=True):
    """Table of dictionary headwords."""

    # Surrogate primary key; defaults to None when the object is created.
    id: int | None = Field(default=None, primary_key=True)
    # Headword text. Indexed and unique; it is the target of the
    # "word.word" foreign keys declared by the other models in this file.
    word: str = Field(index=True, unique=True)
|
||||
|
||||
|
||||
class Sense(SQLModel, table=True):
    """Table pairing a headword with one of its word classes."""

    # Surrogate primary key; defaults to None when the object is created.
    id: int | None = Field(default=None, primary_key=True)
    # Headword this sense belongs to (foreign key to Word.word).
    # NOTE(review): annotated `str` but defaults to None, unlike the
    # `str | None` used by the sibling models -- confirm the intended
    # nullability.
    word: str = Field(default=None, foreign_key="word.word")

    # Part-of-speech label as scraped from the page (may be None).
    word_class: str | None
|
||||
|
||||
|
||||
class Description(SQLModel, table=True):
    """Table of definition texts, one row per scraped sense block."""

    # Surrogate primary key; defaults to None when the object is created.
    id: int | None = Field(default=None, primary_key=True)
    # Headword the description belongs to (foreign key to Word.word).
    word: str | None = Field(default=None, foreign_key="word.word")
    # Owning Sense row (foreign key to Sense.id).
    sense_id: int | None = Field(default=None, foreign_key="sense.id")
    # Definition text; the scraper joins multiple fragments with "; ".
    description: str
    # Example sentences. A list has no automatic SQLAlchemy column type, so
    # the column is declared explicitly as JSON; otherwise the model either
    # fails to build or ends up with a string column that cannot hold a list.
    examples: list[str] | None = Field(default=None, sa_column=Column(JSON))
|
||||
|
||||
|
||||
class Pronunciation(SQLModel, table=True):
    """Table of pronunciation strings scraped for a headword and sense."""

    # Surrogate primary key; defaults to None when the object is created.
    id: int | None = Field(default=None, primary_key=True)
    # Headword the pronunciation belongs to (foreign key to Word.word).
    word: str | None = Field(default=None, foreign_key="word.word")
    # Owning Sense row (foreign key to Sense.id). Note the column is named
    # `sense` here, whereas Description calls the equivalent column `sense_id`.
    sense: int | None = Field(default=None, foreign_key="sense.id")
    # Pronunciation text exactly as it appears on the page.
    pronunciation: str
|
||||
|
||||
|
||||
# Database engine built from the string read from db.secrets.
engine = create_engine(db_pass)
# Create the tables for every model registered on SQLModel.metadata.
SQLModel.metadata.create_all(engine)

# Single HTTP session reused for every dictionary page request.
html_session = HTMLSession()
|
||||
|
||||
# Words still waiting to be scraped, loaded from queue.db (one per line).
# The file is opened in a `with` block so the handle is closed right away,
# and blank lines are skipped so an empty string never becomes a queue entry
# (it would request the bare /dictionary/ URL).
with open("queue.db", "rt") as f:
    QUEUE = {entry for line in f if (entry := line.strip())}
|
||||
|
||||
|
||||
def _sense_texts(sn):
    """Return ``(description, examples)`` extracted from one ``div.sense`` element."""
    # Every definition fragment of the sense, in page order.
    desc_parts = [
        dttext.text
        for dt in sn.find("span.dt")
        for dttext in dt.find("span.dtText")
    ]
    # One entry per example thread; sentences within a thread are joined.
    examples = [
        "; ".join(sents.text for sents in thread.find("span.sents"))
        for thread in sn.find("div.sub-content-thread")
    ]
    return "; ".join(desc_parts), examples


def add_word(word):
    """Scrape the Merriam-Webster page for ``word`` and store its entries.

    For every ``div.entry-word-section-container`` on the page this stores
    the headword (``Word``), its part of speech (``Sense``), one
    ``Description`` per ``div.sense`` and one ``Pronunciation`` per
    pronunciation element. Existing ``Word`` and ``Sense`` rows are reused.
    Containers without a headword or a part-of-speech element are reported
    with a ``fail:`` line and skipped.

    Args:
        word: Path segment appended to the dictionary URL.

    Returns:
        set[str]: Last path component of every ``/dictionary/`` link found
        inside the stored entries, i.e. candidate words for later crawling.
    """
    url = f"https://www.merriam-webster.com/dictionary/{word}"
    # The page is parsed as served; no JavaScript rendering is performed.
    r = html_session.get(url)
    links = set()

    with Session(engine) as session:
        for c in r.html.find("div.entry-word-section-container"):
            # find(..., first=True) yields None when nothing matches, so
            # check explicitly instead of crashing on `.text`.
            headword = c.find("h1.hword,p.hword", first=True)
            if headword is None:
                print(f"fail: {word}")
                continue
            _word = headword.text

            pos = c.find("h2.parts-of-speech", first=True)
            if pos is None:
                print(f"fail: {_word}")
                continue
            # Drop any parenthesised part of the label. Raw string so that
            # "\(" is a regex escape, not an invalid string escape.
            _class = re.sub(r"\(.*\)", "", pos.text).strip()

            links |= {
                Path(urlparse(link.attrs["href"]).path).name
                for link in c.find('a[href*="/dictionary/"]')
            }

            # Get-or-create the headword row. A separate local name is used
            # so the `word` argument is not overwritten by a model object.
            word_row = session.exec(
                select(Word).where(Word.word == _word)
            ).one_or_none()
            if word_row is None:
                word_row = Word(word=_word)
                session.add(word_row)

            # Get-or-create the (headword, word class) sense row.
            sense = session.exec(
                select(Sense)
                .where(Sense.word == _word)
                .where(Sense.word_class == _class)
            ).one_or_none()
            if sense is None:
                sense = Sense(word=word_row.word, word_class=_class)
                session.add(sense)

            # Commit before creating child rows so a newly created sense
            # has its primary key available as `sense.id`.
            session.commit()

            for sn in c.find("div.sense"):
                description, examples = _sense_texts(sn)
                session.add(
                    Description(
                        word=word_row.word,
                        sense_id=sense.id,
                        description=description,
                        examples=examples,
                    )
                )

            for pron in c.find(
                "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
            ):
                session.add(
                    Pronunciation(
                        word=word_row.word, sense=sense.id, pronunciation=pron.text
                    )
                )

            session.commit()

    return links
|
||||
|
||||
|
||||
def present():
    """Return the distinct headwords currently stored in the ``Word`` table.

    The rows are fetched into a list while the session is still open, so
    the caller never receives a lazy result tied to an already-closed
    session.

    Returns:
        list[str]: Every distinct ``Word.word`` value.
    """
    with Session(engine) as session:
        return list(session.exec(select(Word.word)).unique())
|
||||
|
||||
|
||||
# Crawl loop: pick a random queued word, scrape it, add the links found on
# its page to the queue, then drop every word that is already stored.
# The loop ends when the queue is empty (random.choice would raise on an
# empty sequence) or on Ctrl-C. The remaining queue is written back to
# queue.db on *any* exit, including unexpected errors, so progress is kept.
# NOTE(review): a word whose page never yields a stored headword matching it
# stays in the queue and can be picked again -- confirm whether such words
# should be dropped or retried.
try:
    while QUEUE:
        QUEUE |= add_word(random.choice(list(QUEUE)))
        QUEUE -= set(present())
        print(len(QUEUE))
        # Random pause of up to five seconds between requests.
        sleep(random.random() * 5)
except KeyboardInterrupt:
    # Ctrl-C is the normal way to stop; fall through to the save below and
    # let the script finish with exit status 0.
    pass
finally:
    with open("queue.db", "wt") as f:
        f.write("\n".join(QUEUE))
|
Loading…
Reference in New Issue
Block a user