# dict_dl/MW_thesaurus.py

from itertools import chain

from dict_dl import Queue, WordParser, cw, oft, ot, uq, rb, uqall


class MWThesaurusParser(WordParser):
    """Parser for the Merriam-Webster thesaurus page of a single word.

    The page is addressed as ``<url_prefix><word>``; fetching and parsing is
    left to ``WordParser``, which is expected to expose the parsed document
    as ``self.root`` (an element supporting ``findall`` with XPath-like
    paths) as well as ``self.word`` and ``self.time``.
    """

    # Result key -> CSS class suffix of the ``thes-list`` span holding the
    # corresponding word list. Order matters: it is the key order of every
    # definition's dictionary.
    _WORD_LISTS = (
        ("synonyms", "syn-list"),
        ("related", "rel-list"),
        ("near antonyms", "near-list"),
        ("antonyms", "ant-list"),
    )

    def __init__(self, word):
        """Create a parser for *word* on the Merriam-Webster thesaurus."""
        url_prefix = "https://www.merriam-webster.com/thesaurus/"
        super().__init__(word, url_prefix)

    @staticmethod
    def _remove_lists(element):
        """Detach every ``<ul>`` below *element*, at any nesting depth.

        ``Element.remove`` only accepts direct children, so each ``<ul>`` is
        removed from its actual parent instead of from *element* itself.
        """
        parent_of = {
            child: parent for parent in element.iter() for child in parent
        }
        for ul in element.findall(".//ul"):
            parent_of[ul].remove(ul)

    @property
    def thes(self):
        """Return the thesaurus data of the page keyed by definition text.

        Each value is a dict with the keys ``examples``, ``synonyms``,
        ``related``, ``near antonyms`` and ``antonyms``, all lists of the
        text that ``ot`` extracts from the matching elements.

        NOTE: the example lists are stripped from the parsed tree while the
        definition text is extracted, so a second access on the same
        instance yields empty ``examples``. Read this property once and keep
        the result.
        """
        thes = {}
        # Entries on the page are numbered thesaurus-entry-1 .. -9.
        for i in range(1, 10):
            for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):
                for se in chain(
                    entry.findall(".//div[@class='sb no-sn']"),
                    entry.findall(".//div[@class='sb has-num']"),
                ):
                    # Reset per sense block so that a block without any
                    # definition cannot overwrite the previous block's lists.
                    d = None
                    for e in se.findall(".//span[@class='dt']"):
                        # Collect the examples first, then drop their lists
                        # so they do not end up in the definition text.
                        examples = [ot(li) for li in e.findall(".//li")]
                        self._remove_lists(e)
                        d = ot(e)
                        thes[d] = {"examples": examples}
                    if d is None:
                        continue
                    # The word lists belong to the whole sense block; they
                    # are attached to its last definition.
                    for key, css in self._WORD_LISTS:
                        thes[d][key] = [
                            ot(a)
                            for a in se.findall(
                                f".//span[@class='thes-list {css}']"
                                "/div[@class='thes-list-content synonyms_list']//li//a"
                            )
                        ]

        return thes

    @property
    def type(self):
        """Return the sorted, de-duplicated labels found in the entry headers.

        Each label is the ``fl`` span text passed through ``rb`` with the
        delimiters ``"("`` and ``")"`` (presumably stripping a parenthesised
        part; see ``dict_dl.rb``).
        """
        types = {
            rb(ot(e), "(", ")")
            for e in self.root.findall(
                ".//div[@class='row entry-header thesaurus']//span[@class='fl']"
            )
        }
        return sorted(types)

    @property
    def neighbours(self):
        """Return the set of words linked from this page via ``/thesaurus/``.

        For every such link the last path segment is taken, any ``#fragment``
        or ``?query`` is cut off, and the result is passed through ``uq``.
        """
        neighbours = set()
        for e in self.root.findall(".//a"):
            href = e.get("href", "")
            if href.startswith("/thesaurus/"):
                link = href.split("/")[-1].split("#")[0].split("?")[0]
                neighbours.add(uq(link))
        return neighbours

    def todict(self):
        """Return ``{word: definitions | {"type": types}}`` passed through ``uqall``.

        Raises:
            AssertionError: if neither a type nor any definition was found.
        """
        # Evaluate each property exactly once: ``thes`` mutates the parsed
        # tree, so a second evaluation would lose the examples.
        types = self.type
        thes = self.thes
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped when Python runs with -O.
        if not (types or thes):
            raise AssertionError(
                f"{self.time} {self.word}: type or definitions came back empty..."
            )
        return uqall({self.word: thes | {"type": types}})


# w = MWThesaurusParser("coffining")
# print(w.todict())
# exit()

# Crawl driver, executed at module level (also when this file is imported).
# The Queue is built around MWThesaurusParser; the remaining arguments are
# presumably the output directory, the file-name suffix and the length of the
# word prefix used to shard the output files -- confirm against dict_dl.Queue.
q = Queue(MWThesaurusParser, "en_MW_thesaurus/", "_mwt.json", prefix_length=2)
# Presumably restores previously stored state before crawling -- TODO confirm.
q.loadDB()

# Runs forever: the loop has no exit condition of its own, so it only stops
# when add_word() raises or the process is interrupted.
while True:
    q.add_word()
add MWThesaurus 2022-07-08 10:43:24 +00:00			`from itertools import chain`

			`from dict_dl import Queue, WordParser, cw, oft, ot, uq, rb, uqall`


			`class MWThesaurusParser(WordParser):`
			`def __init__(self, word):`
			`url_prefix = "https://www.merriam-webster.com/thesaurus/"`
			`super().__init__(word, url_prefix)`

			`@property`
			`def thes(self):`
			`thes = {}`
			`for i in range(1, 10):`
			`for entry in self.root.findall(f".//div[@id='thesaurus-entry-{i}']"):`
			`for se in chain(`
			`entry.findall(".//div[@class='sb no-sn']"),`
			`entry.findall(".//div[@class='sb has-num']"),`
			`):`
			`for e in se.findall(".//span[@class='dt']"):`
			`examples = [ot(li) for li in e.findall(".//li")]`
			`[e.remove(ul) for ul in e.findall(".//ul")]`
			`d = ot(e)`
			`thes[d] = {"examples": examples}`
			`thes[d]["synonyms"] = [`
			`ot(li)`
			`for li in se.findall(`
			`".//span[@class='thes-list syn-list']/div[@class='thes-list-content synonyms_list']//li//a"`
			`)`
			`]`
			`thes[d]["related"] = [`
			`ot(li)`
			`for li in se.findall(`
			`".//span[@class='thes-list rel-list']/div[@class='thes-list-content synonyms_list']//li//a"`
			`)`
			`]`
			`thes[d]["near antonyms"] = [`
			`ot(li)`
			`for li in se.findall(`
			`".//span[@class='thes-list near-list']/div[@class='thes-list-content synonyms_list']//li//a"`
			`)`
			`]`
			`thes[d]["antonyms"] = [`
			`ot(li)`
			`for li in se.findall(`
			`".//span[@class='thes-list ant-list']/div[@class='thes-list-content synonyms_list']//li//a"`
			`)`
			`]`

			`return thes`

			`@property`
			`def type(self):`
			`types = set()`
			`for e in self.root.findall(`
			`".//div[@class='row entry-header thesaurus']//span[@class='fl']"`
			`):`
			`types.add(rb(ot(e), "(", ")"))`
			`return sorted(types)`

			`@property`
			`def neighbours(self):`
			`neighbours = set()`
			`for e in self.root.findall(".//a"):`
			`if "href" in e.attrib and e.attrib["href"].startswith("/thesaurus/"):`
			`link = e.attrib["href"].split("/")[-1].split("#")[0].split("?")[0]`
			`neighbours.add(uq(link))`
			`return neighbours`

			`def todict(self):`
			`assert (`
			`self.type or self.thes`
			`), f"{self.time} {self.word}: type or definitions came back empty..."`
			`return uqall({self.word: self.thes \| {"type": self.type}})`


			`# w = MWThesaurusParser("coffining")`
			`# print(w.todict())`
			`# exit()`

			`q = Queue(MWThesaurusParser, "en_MW_thesaurus/", "_mwt.json", prefix_length=2)`
			`q.loadDB()`

			`while True:`
			`q.add_word()`