"""Collect dictionary entry names from merriam-webster.com's browse pages.

For every index code in ``letters`` the script fetches
``/browse/dictionary/<code>``, follows each link on that page whose href
contains ``browse/dictionary/<code>/``, and records the last path segment
of every ``/dictionary/<entry>`` link found on the followed pages.  The
collected names are merged with those already stored in ``db_file`` (one
name per line) and written back after each index code, so an interrupted
run keeps its progress.
"""
import random
from pathlib import Path
from time import sleep
from string import ascii_lowercase
from urllib.parse import urljoin, urlparse

from requests_html import HTMLSession

# Index codes appended to /browse/dictionary/ on the site.
letters = list(ascii_lowercase) + ["BIO", "GEO", "0-9"]
# Plain-text store: one collected entry name per line.
db_file = "ALL.db"
url_stem = "https://www.merriam-webster.com"

# Only hrefs whose path starts with this prefix are treated as entries.
# A plain substring match on "/dictionary/" would also accept
# "/browse/dictionary/<code>/<page>" links and store page numbers as words.
_ENTRY_PREFIX = "/dictionary/"
_ENTRY_SELECTOR = 'a[href*="/dictionary/"]'


def _load_words(path):
    """Return the set of non-empty, stripped lines stored in *path*.

    The file is created empty when it does not exist yet.
    """
    db_path = Path(path)
    if not db_path.is_file():
        db_path.touch()
    with db_path.open("rt", encoding="utf-8") as handle:
        return {word for word in map(str.strip, handle) if word}


def _save_words(path, words):
    """Write *words* to *path*, one per line, sorted.

    The data goes to a sibling temporary file first and is then moved over
    *path*, so an interruption mid-write cannot truncate the existing file.
    """
    db_path = Path(path)
    tmp_path = db_path.with_name(db_path.name + ".tmp")
    tmp_path.write_text("\n".join(sorted(words)), encoding="utf-8")
    tmp_path.replace(db_path)


def _entry_name(href):
    """Return the last path segment of a ``/dictionary/...`` href, else None."""
    path = urlparse(href).path
    if not path.startswith(_ENTRY_PREFIX):
        return None
    tail = path[len(_ENTRY_PREFIX):].rstrip("/")
    return tail.rsplit("/", 1)[-1] or None


def _fetch(session, url):
    """GET *url* and return the response, or None on a non-OK HTTP status.

    Sleeps for a random 0-2 seconds after every request to throttle the crawl.
    """
    response = session.get(url)
    sleep(random.random() * 2)
    if not response.ok:
        print(f"skipping {url}: HTTP {response.status_code}")
        return None
    return response


def main():
    """Crawl every index code in ``letters`` and update ``db_file``."""
    words = _load_words(db_file)
    session = HTMLSession()
    visited = set()  # browse-page hrefs already fetched during this run

    for letter in letters:
        # The static HTML is parsed as fetched; the original script left
        # its r.html.render() call commented out.
        index = _fetch(session, f"{url_stem}/browse/dictionary/{letter}")
        if index is None:
            continue

        selector = f'a[href*="browse/dictionary/{letter}/"]'
        found = {anchor.attrs["href"] for anchor in index.html.find(selector)}

        # Fetch each browse page once, in a stable order.
        for link in sorted(found - visited):
            print(link)
            visited.add(link)
            # urljoin accepts both site-relative and absolute hrefs.
            page = _fetch(session, urljoin(url_stem, link))
            if page is None:
                continue
            for anchor in page.html.find(_ENTRY_SELECTOR):
                name = _entry_name(anchor.attrs["href"])
                if name:
                    words.add(name)

        # Checkpoint after every index code so progress survives a crash.
        _save_words(db_file, words)


if __name__ == "__main__":
    main()