68 lines
1.8 KiB
Python
68 lines
1.8 KiB
Python
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "bs4",
#     "requests",
# ]
# ///
|
|
|
|
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
def main() -> None:
    """Download the Typst reference pages listed in the sitemap and save stripped copies.

    Fetches ``https://typst.app/sitemap-0.xml``, keeps every URL that contains
    ``/reference/``, and then runs three stages on a pool of 8 threads:
    ``fetch_page`` -> ``strip_page`` -> ``write_files``. Each stage finishes
    for all pages before the next one starts.

    Raises:
        requests.RequestException: If the sitemap request fails, times out,
            or returns an HTTP error status.
        xml.etree.ElementTree.ParseError: If the sitemap is not well-formed XML.
        RuntimeError: Propagated from ``fetch_page`` when a page download fails.
    """
    # Same timeout as fetch_page so a stalled connection cannot hang the script.
    sitemap = requests.get("https://typst.app/sitemap-0.xml", timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page as XML.
    sitemap.raise_for_status()
    root = ET.fromstring(sitemap.content)

    page_urls = []
    for entry in root:
        # "{*}loc" matches <loc> in any namespace and at any child position,
        # so entries without children or with <loc> not first are handled.
        location = (entry.findtext("{*}loc") or "").strip()
        if "/reference/" in location:
            page_urls.append(location)

    with ThreadPoolExecutor(max_workers=8) as executor:
        fetched = list(executor.map(fetch_page, page_urls))
        stripped = list(executor.map(strip_page, fetched))
        # list() drains the lazy result iterator so that an exception raised
        # while writing a file surfaces here instead of being dropped.
        list(executor.map(write_files, stripped))
|
|
|
|
|
|
def fetch_page(url: str) -> tuple[str, str]:
    """Download one page and return it paired with its URL.

    Args:
        url: Address of the page to download.

    Returns:
        ``(url, html)`` where ``html`` is the decoded response body.

    Raises:
        RuntimeError: If the request fails, hits the 10-second timeout, or
            the server answers with an HTTP error status. The underlying
            ``requests`` exception is attached as ``__cause__``.
    """
    print(f"downloading {url}...")
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures so an error page is never
        # reported as fetched and written out as documentation.
        response.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"failed to fetch {url}: {e}") from e
    print(f"{url} successfully fetched!")
    return (url, response.text)
|
|
|
|
|
|
def strip_page(url_page: tuple[str, str]) -> tuple[str, str]:
    """Reduce a downloaded page to bare, attribute-free markup.

    Args:
        url_page: ``(url, html)`` pair; the URL is only logged and passed through.

    Returns:
        ``(url, cleaned_html)`` where ``header``, ``nav``, ``footer``, ``head``
        and ``script`` elements have been removed and every remaining tag has
        had all of its attributes dropped.
    """
    source_url, html = url_page
    print(f"stripping trash from {source_url}")

    document = BeautifulSoup(html, "html.parser")

    # Remove page chrome and scripts together with everything inside them.
    unwanted = document.find_all(["header", "nav", "footer", "head", "script"])
    for element in unwanted:
        element.decompose()

    # Drop classes, ids, inline styles, etc. from whatever is left.
    for element in document.find_all(True):
        element.attrs = {}

    cleaned = str(document)
    return (source_url, cleaned)
|
|
|
|
|
|
def write_files(url_page: tuple[str, str]) -> None:
    """Write one page to ``typst-docs/<name>.html``.

    ``<name>`` is the part of the URL after the last ``/reference/`` with
    slashes replaced by underscores; the reference root itself becomes
    ``index``. The output directory is created if it does not exist.

    Args:
        url_page: ``(url, html)`` pair. The URL is expected to contain
            ``/reference/``; a trailing slash is optional.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    url, page = url_page

    # Take everything after the last "/reference/" and trim the surrounding
    # slashes, so the name is the same with or without a trailing "/" (a
    # fixed "[:-1]" slice would chop a real character when it is absent).
    slug = url.rpartition("/reference/")[2].strip("/")
    if not slug:
        slug = "index"

    filename = "typst-docs/" + slug.replace("/", "_") + ".html"

    # Create the output directory on demand; exist_ok keeps this safe when
    # several worker threads reach this point at the same time.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        print(f"writing to {filename}")
        f.write(page)
    print(f"success with {filename}!")
|
|
|
|
|
|
# Script entry point: scrape only when executed directly, never on import.
if __name__ == "__main__":
    main()
|