"""Collect Merriam-Webster dictionary entry names into a local text file.

For every section of the site's browse index the script downloads the
section page, follows each sub-page link found on it, and stores the last
URL path component of every dictionary-entry link in ``ALL.db`` (one entry
per line).  Entries already present in the file are kept, so repeated runs
only ever add to it.  The file is rewritten after each section so an
interrupted run keeps the sections finished so far.
"""

import random
from pathlib import Path
from string import ascii_lowercase
from time import sleep
from urllib.parse import urlparse

from requests_html import HTMLSession

# Sections of the browse index to crawl: "a".."z" plus three extra sections.
letters = list(ascii_lowercase) + ["BIO", "GEO", "0-9"]

# Plain-text store, one entry per line; created empty on the first run so the
# read below always has a file to open.
db_file = "ALL.db"
if not Path(db_file).is_file():
    Path(db_file).touch()

html_session = HTMLSession()
url_stem = "https://www.merriam-webster.com"


def _entry_names(hrefs):
    """Return the last path component of every dictionary-entry href.

    The CSS selector used by the crawl is a substring match on
    "/dictionary/", so it also picks up the "/browse/dictionary/..."
    navigation links; those are skipped here, otherwise page numbers and
    section names would be stored as if they were entries.
    """
    names = set()
    for href in hrefs:
        path = urlparse(href).path
        if "/browse/dictionary/" in path:
            continue
        name = Path(path).name
        if name:
            names.add(name)
    return names


# Every sub-page link seen so far, across all sections.
links = set()

# Load the entries from previous runs.  The context manager closes the
# handle, and blank lines are dropped so "" never becomes an entry.  UTF-8 is
# pinned so reading and writing do not depend on the platform locale.
with open(db_file, "rt", encoding="utf-8") as f:
    words = {entry for entry in (line.strip() for line in f) if entry}

for letter in letters:
    url = f"https://www.merriam-webster.com/browse/dictionary/{letter}"
    r = html_session.get(url)
    # Links are read from the HTML as served; the JavaScript rendering step
    # (r.html.render()) was left disabled by the original author.
    section_links = {
        p.attrs["href"]
        for p in r.html.find(f'a[href*="browse/dictionary/{letter}/"]')
    }

    # Only visit pages that have not been fetched yet: `links` accumulates
    # across sections, and re-downloading earlier sections' pages adds
    # nothing but extra requests.
    unseen_links = section_links - links
    links |= section_links

    for link in sorted(unseen_links):
        print(link)
        r = html_session.get(url_stem + link)
        words |= _entry_names(
            word.attrs["href"]
            for word in r.html.find('a[href*="/dictionary/"]')
        )
        # Random pause of up to two seconds between page requests.
        sleep(random.random() * 2)

    # Checkpoint after each section; sorted so the file is stable between
    # runs instead of following set iteration order.
    with open(db_file, "wt", encoding="utf-8") as f:
        f.write("\n".join(sorted(words)))