Files
typst-documentation-fetcher/typst-documentation-fetcher.py
2025-10-03 20:15:49 +03:00

68 lines
1.8 KiB
Python

# /// script
# requires-python = ">=3.13"
# dependencies = [
# "bs4",
# "requests",
# ]
# ///
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import requests
from bs4 import BeautifulSoup
def main() -> None:
    """Download every Typst reference page, strip its chrome, and save it.

    Pipeline: sitemap -> reference URLs -> fetch -> strip -> write, with
    each stage fanned out over a small thread pool (I/O-bound work).
    """
    sitemap = requests.get("https://typst.app/sitemap-0.xml")
    root = ET.fromstring(sitemap.content)
    page_urls = []
    for entry in root:
        loc = entry[0].text
        # Keep only documentation pages under /reference/.
        if isinstance(loc, str) and "/reference/" in loc:
            page_urls.append(loc)
    with ThreadPoolExecutor(max_workers=8) as executor:
        fetched = list(executor.map(fetch_page, page_urls))
        stripped = list(executor.map(strip_page, fetched))
        list(executor.map(write_files, stripped))
def fetch_page(url: str) -> tuple[str, str]:
    """Fetch *url* and return ``(url, response_body_text)``.

    Raises:
        RuntimeError: if the HTTP request fails or times out; the
            underlying requests exception is chained as ``__cause__``.
    """
    print(f"downloading {url}...")
    try:
        # Keep the try body minimal: only the call that can raise.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # Chain the cause so the traceback shows the real network error
        # (the original `raise RuntimeError(e)` discarded the chain context).
        raise RuntimeError(f"failed to fetch {url}") from e
    print(f"{url} successfully fetched!")
    return (url, response.text)
def strip_page(url_page: tuple[str, str]) -> tuple[str, str]:
    """Strip chrome (header/nav/footer/head/script) and all tag attributes.

    Takes ``(url, raw_html)`` and returns ``(url, cleaned_html)``.
    """
    url, raw_html = url_page
    print(f"stripping trash from {url}")
    soup = BeautifulSoup(raw_html, "html.parser")
    # Drop whole subtrees that are navigation/boilerplate, not content.
    for node in soup(["header", "nav", "footer", "head", "script"]):
        node.decompose()
    # Wipe every remaining tag's attributes (classes, ids, styles, ...).
    for node in soup.find_all(True):
        node.attrs = {}
    return (url, str(soup))
def write_files(url_page: tuple[str, str]) -> None:
    """Write a stripped docs page to ``typst-docs/<slug>.html``.

    The slug is the URL path after ``/reference/`` with ``/`` replaced by
    ``_``; the reference root page is saved as ``index.html``.
    """
    url, page = url_page
    # Keep the path segment after "/reference/", dropping the trailing "/".
    slug = url[url.rfind("/reference/") + len("/reference/") : -1]
    if slug == "":
        slug = "index"
    filename = "typst-docs/" + slug.replace("/", "_") + ".html"
    # Ensure the output directory exists — the original open() crashed
    # with FileNotFoundError when typst-docs/ was missing.
    Path("typst-docs").mkdir(parents=True, exist_ok=True)
    # The source's "(unknown)" placeholders were extraction garbage; these
    # messages clearly carried the target filename.
    print(f"writing to {filename}")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(page)
    print(f"success with {filename}!")
if __name__ == "__main__":
main()