"""Browse through the Merriam-Webster (MW) dictionary and collect its entries.

For every section of the "browse" index (the letters a-z plus the "BIO",
"GEO" and "0-9" sections) the script fetches the section page, follows the
browse links found on it, and harvests the last path segment of every
``/dictionary/<entry>`` link.  The collected entries are merged with those
already stored in ``db_file`` (one entry per line) and written back after
each section, so an interrupted run keeps the progress made so far.
"""

import random
from pathlib import Path, PurePosixPath
from string import ascii_lowercase
from time import sleep
from urllib.parse import urljoin, urlparse

# Sections of the browse index: a-z plus the three special sections.
letters = list(ascii_lowercase) + ["BIO", "GEO", "0-9"]
# Plain-text store of collected entries, one per line.
db_file = "ALL.db"
url_stem = "https://www.merriam-webster.com"


def word_from_href(href):
    """Return the entry name encoded in a ``/dictionary/<entry>`` link.

    Args:
        href: The ``href`` attribute of an anchor; may be absolute or
            site-relative.

    Returns:
        The last path segment when the link points directly below
        ``/dictionary/``; ``None`` for anything else.  In particular
        ``/browse/dictionary/...`` navigation links are rejected, because
        they contain the substring ``/dictionary/`` too and would otherwise
        be stored as bogus entries such as ``"2"`` or ``"a"``.
    """
    # URL paths always use forward slashes, independent of the host OS.
    path = PurePosixPath(urlparse(href).path)
    if path.parent != PurePosixPath("/dictionary"):
        return None
    return path.name or None


def load_words(path=db_file):
    """Read the stored entries from *path*.

    Args:
        path: Location of the one-entry-per-line text file.

    Returns:
        A set of the stripped, non-empty lines; an empty set when the file
        does not exist yet.
    """
    try:
        with open(path, encoding="utf-8") as f:
            # Skip blank lines so the empty string is never treated as a word.
            return {stripped for line in f if (stripped := line.strip())}
    except FileNotFoundError:
        return set()


def save_words(words, path=db_file):
    """Write *words* to *path*, one per line, in sorted order.

    Sorting makes the file content deterministic between runs instead of
    depending on set iteration order.

    Args:
        words: Iterable of entry strings.
        path: Destination text file; overwritten if it exists.
    """
    Path(path).write_text(
        "".join(f"{word}\n" for word in sorted(words)), encoding="utf-8"
    )


def main():
    """Crawl every browse section and persist the entries found."""
    # Imported here so the pure helpers above stay importable without the
    # third-party dependency being installed.
    from requests_html import HTMLSession

    html_session = HTMLSession()
    words = load_words()
    visited = set()  # browse links already fetched during this run

    for letter in letters:
        index = html_session.get(f"{url_stem}/browse/dictionary/{letter}")
        links = {
            anchor.attrs["href"]
            for anchor in index.html.find(
                f'a[href*="browse/dictionary/{letter}/"]'
            )
        }
        # Only fetch links not seen before; re-requesting pages that were
        # already harvested wastes requests and sleep time.
        for link in sorted(links - visited):
            print(link)
            # urljoin copes with both site-relative and absolute hrefs.
            page = html_session.get(urljoin(url_stem, link))
            for anchor in page.html.find('a[href*="/dictionary/"]'):
                word = word_from_href(anchor.attrs["href"])
                if word is not None:
                    words.add(word)
            visited.add(link)
            # Random pause of up to two seconds between page requests.
            sleep(random.random() * 2)
        # Checkpoint after every section.
        save_words(words)


if __name__ == "__main__":
    main()