"""Crawl Merriam-Webster dictionary pages and store the entries via SQLModel.

Run as a script, the module repeatedly picks a random word from the queue
loaded from ``queue.db``, scrapes its dictionary page, stores what it finds,
and adds every dictionary link seen on the page back to the queue.  The queue
is written back to ``queue.db`` when the crawl stops.
"""
import random
import re
from pathlib import Path
from time import sleep
from urllib.parse import urlparse

from requests_html import HTMLSession
from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select

# The first line of ``db.secrets`` is handed straight to ``create_engine``
# below, i.e. it is the full database URL (credentials included).
with open("db.secrets", "r") as f:
    db_pass = f.readline().strip()


class Word(SQLModel, table=True):
    """A headword; ``word`` is unique and is the target of the foreign keys."""

    id: int | None = Field(default=None, primary_key=True)
    word: str = Field(index=True, unique=True)


class Sense(SQLModel, table=True):
    """One (headword, part of speech) combination."""

    id: int | None = Field(default=None, primary_key=True)
    word: str = Field(default=None, foreign_key="word.word")

    word_class: str | None


class Description(SQLModel, table=True):
    """Definition text of one ``div.sense`` element, with its example sentences."""

    id: int | None = Field(default=None, primary_key=True)
    word: str | None = Field(default=None, foreign_key="word.word")
    sense_id: int | None = Field(default=None, foreign_key="sense.id")
    description: str
    # NOTE(review): no explicit column type is given for this list field;
    # confirm it maps as intended on the target database (an explicit
    # ``sa_column`` may be required) -- TODO confirm.
    examples: list[str] | None


class Pronunciation(SQLModel, table=True):
    """One pronunciation string scraped for a sense."""

    id: int | None = Field(default=None, primary_key=True)
    word: str | None = Field(default=None, foreign_key="word.word")
    sense: int | None = Field(default=None, foreign_key="sense.id")
    pronunciation: str


engine = create_engine(db_pass)
SQLModel.metadata.create_all(engine)

html_session = HTMLSession()

# Words still to be scraped, one per line in ``queue.db``.  The file is closed
# deterministically and blank lines are ignored so that an empty string never
# ends up being requested as a word.
with open("queue.db", "rt") as queue_file:
    QUEUE = {entry for entry in map(str.strip, queue_file) if entry}

# Strips a parenthesised suffix from the part-of-speech heading text.
_PARENTHESISED = re.compile(r"\(.*\)")


def _ensure_word(session, headword):
    """Add a ``Word`` row for *headword* to *session* unless one already exists."""
    existing = session.exec(select(Word).where(Word.word == headword)).one_or_none()
    if existing is None:
        session.add(Word(word=headword))


def _get_or_create_sense(session, headword, word_class):
    """Return the ``Sense`` for (*headword*, *word_class*), adding it if missing."""
    sense = session.exec(
        select(Sense)
        .where(Sense.word == headword)
        .where(Sense.word_class == word_class)
    ).one_or_none()
    if sense is None:
        sense = Sense(word=headword, word_class=word_class)
        session.add(sense)
    return sense


def add_word(word):
    """Scrape the dictionary page for *word* and store its entries.

    For every entry container on the page the headword and its part of speech
    are stored (reusing existing rows), followed by one ``Description`` per
    ``div.sense`` element and one ``Pronunciation`` per pronunciation item.
    Entries lacking a headword or a part-of-speech heading are reported with a
    ``fail:`` message and skipped.

    Args:
        word: The word to look up; it is inserted verbatim into the page URL.

    Returns:
        The set of final path components of all ``/dictionary/`` links found
        in the stored entries, i.e. candidate words to scrape next.
    """
    url = f"https://www.merriam-webster.com/dictionary/{word}"
    # The page is parsed as served; JavaScript rendering is not performed.
    response = html_session.get(url)
    links = set()

    with Session(engine) as session:
        for entry in response.html.find("div.entry-word-section-container"):
            # ``find(..., first=True)`` yields None when nothing matches, so
            # both lookups are checked explicitly instead of letting a missing
            # headword raise AttributeError and abort the whole crawl.
            headword_el = entry.find("h1.hword,p.hword", first=True)
            if headword_el is None:
                print(f"fail: {word}")
                continue
            headword = headword_el.text

            class_el = entry.find("h2.parts-of-speech", first=True)
            if class_el is None:
                print(f"fail: {headword}")
                continue
            word_class = _PARENTHESISED.sub("", class_el.text).strip()

            links |= {
                Path(urlparse(link.attrs["href"]).path).name
                for link in entry.find('a[href*="/dictionary/"]')
            }

            _ensure_word(session, headword)
            sense = _get_or_create_sense(session, headword, word_class)
            # Commit now so that a newly created sense has its primary key
            # assigned before the rows below reference ``sense.id``.
            session.commit()

            for sense_el in entry.find("div.sense"):
                description = "; ".join(
                    text_el.text
                    for dt in sense_el.find("span.dt")
                    for text_el in dt.find("span.dtText")
                )
                examples = [
                    "; ".join(sent.text for sent in thread.find("span.sents"))
                    for thread in sense_el.find("div.sub-content-thread")
                ]
                session.add(
                    Description(
                        word=headword,
                        sense_id=sense.id,
                        description=description,
                        examples=examples,
                    )
                )

            for pron in entry.find(
                "span.prons-entries-list-inline div.prons-entry-list-item,a.prons-entry-list-item"
            ):
                session.add(
                    Pronunciation(
                        word=headword, sense=sense.id, pronunciation=pron.text
                    )
                )
        session.commit()
    return links


def present():
    """Return a list of every headword currently stored in the ``Word`` table.

    The rows are materialised while the session is still open, so the caller
    never iterates a result whose session has already been closed.
    """
    with Session(engine) as session:
        return session.exec(select(Word.word)).unique().all()


def _save_queue():
    """Write the pending queue back to ``queue.db``, one word per line."""
    with open("queue.db", "wt") as queue_file:
        queue_file.write("\n".join(sorted(QUEUE)))


def main():
    """Crawl until the queue is empty or the user interrupts with Ctrl-C.

    The queue is persisted on every exit path, including unexpected errors,
    so progress is not lost when a request or database call fails.
    """
    try:
        # NOTE(review): a word whose page never yields a matching headword is
        # not removed by ``present()`` and therefore stays queued -- confirm
        # whether such words should be dropped after an attempt.
        while QUEUE:
            QUEUE.update(add_word(random.choice(list(QUEUE))))
            QUEUE.difference_update(present())
            print(len(QUEUE))
            # Random pause of up to five seconds between requests.
            sleep(random.random() * 5)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop; fall through and exit cleanly.
        pass
    finally:
        _save_queue()


if __name__ == "__main__":
    main()