Files
typst-documentation-fetcher/typst-documentation-fetcher.py
2025-10-03 20:15:49 +03:00

68 lines
1.8 KiB
Python

# /// script
# requires-python = ">=3.13"
# dependencies = [
# "bs4",
# "requests",
# ]
# ///
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import requests
from bs4 import BeautifulSoup
def main() -> None:
    """Download every Typst reference page, strip its chrome, and save it.

    Pipeline: sitemap -> reference URLs -> fetch -> strip -> write, with
    each stage fanned out over a small thread pool (I/O-bound work).
    """
    sitemap = requests.get("https://typst.app/sitemap-0.xml")
    root = ET.fromstring(sitemap.content)
    page_urls = []
    for entry in root:
        loc = entry[0].text
        # Keep only documentation pages under /reference/.
        if isinstance(loc, str) and "/reference/" in loc:
            page_urls.append(loc)
    with ThreadPoolExecutor(max_workers=8) as executor:
        fetched = list(executor.map(fetch_page, page_urls))
        stripped = list(executor.map(strip_page, fetched))
        list(executor.map(write_files, stripped))
def fetch_page(url: str) -> tuple[str, str]:
    """Fetch *url* and return ``(url, response_body_text)``.

    Raises:
        RuntimeError: if the HTTP request fails or times out; the
            underlying requests exception is chained as ``__cause__``.
    """
    print(f"downloading {url}...")
    try:
        # Keep the try body minimal: only the call that can raise.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        # Chain the cause so the traceback shows the real network error
        # (the original `raise RuntimeError(e)` discarded the chain context).
        raise RuntimeError(f"failed to fetch {url}") from e
    print(f"{url} successfully fetched!")
    return (url, response.text)
def strip_page(url_page: tuple[str, str]) -> tuple[str, str]:
    """Strip chrome (header/nav/footer/head/script) and all tag attributes.

    Takes ``(url, raw_html)`` and returns ``(url, cleaned_html)``.
    """
    url, raw_html = url_page
    print(f"stripping trash from {url}")
    soup = BeautifulSoup(raw_html, "html.parser")
    # Drop whole subtrees that are navigation/boilerplate, not content.
    for node in soup(["header", "nav", "footer", "head", "script"]):
        node.decompose()
    # Wipe every remaining tag's attributes (classes, ids, styles, ...).
    for node in soup.find_all(True):
        node.attrs = {}
    return (url, str(soup))
def write_files(url_page: tuple[str, str]) -> None:
    """Write a stripped docs page to ``typst-docs/<slug>.html``.

    The slug is the URL path after ``/reference/`` with ``/`` replaced by
    ``_``; the reference root page is saved as ``index.html``.
    """
    url, page = url_page
    # Keep the path segment after "/reference/", dropping the trailing "/".
    slug = url[url.rfind("/reference/") + len("/reference/") : -1]
    if slug == "":
        slug = "index"
    filename = "typst-docs/" + slug.replace("/", "_") + ".html"
    # Ensure the output directory exists — the original open() crashed
    # with FileNotFoundError when typst-docs/ was missing.
    Path("typst-docs").mkdir(parents=True, exist_ok=True)
    # The source's "(unknown)" placeholders were extraction garbage; these
    # messages clearly carried the target filename.
    print(f"writing to {filename}")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(page)
    print(f"success with {filename}!")
if __name__ == "__main__":
main()