browse through the MW dictionary
This commit is contained in:
parent
47130f30d7
commit
52238ad9ae
36
mw_browser.py
Normal file
36
mw_browser.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
from time import sleep
|
||||||
|
from string import ascii_lowercase
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from requests_html import HTMLSession
|
||||||
|
|
||||||
|
# Sections of the Merriam-Webster "browse" index to walk: one per lowercase
# letter plus the three extra section names used in the browse URLs.
letters = list(ascii_lowercase) + ["BIO", "GEO", "0-9"]

# Plain-text word store, one entry per line.
db_file = "ALL.db"
# touch() creates the file when missing and leaves an existing one alone,
# so no separate is_file() check is required.
Path(db_file).touch()

html_session = HTMLSession()
url_stem = "https://www.merriam-webster.com"

# Every browse sub-page href discovered so far (used to avoid re-fetching).
links = set()

# Seed with the entries saved by a previous run; the context manager makes
# sure the file handle is closed before the file is rewritten further down.
with open(db_file, "rt") as f:
    words = {line.strip() for line in f}
# Blank lines in the store would otherwise be kept as an empty "word".
words.discard("")

for letter in letters:
    url = f"{url_stem}/browse/dictionary/{letter}"
    r = html_session.get(url)
    # NOTE(review): r.html.render() was left disabled by the author, so the
    # links are taken from the static HTML only.
    found = {
        p.attrs["href"] for p in r.html.find(f'a[href*="browse/dictionary/{letter}/"]')
    }
    # Only visit links that have not been fetched for an earlier section.
    new_links = found - links
    links |= new_links

    for link in sorted(new_links):
        print(link)
        r = html_session.get(url_stem + link)
        # The stored entry is the last path segment of each matching href.
        # NOTE(review): the substring selector presumably also matches
        # "/browse/dictionary/..." navigation links, whose last segment would
        # then be stored as a word -- confirm whether that is intended.
        words |= {
            Path(urlparse(word.attrs["href"]).path).name
            for word in r.html.find('a[href*="/dictionary/"]')
        }
        # Random pause of up to two seconds between page requests.
        sleep(random.random() * 2)

    # Checkpoint after each section so an interrupted run keeps its progress;
    # sorting makes the file content reproducible between runs.
    with open(db_file, "wt") as f:
        f.write("\n".join(sorted(words)))
|
Loading…
Reference in New Issue
Block a user