67 lines
1.8 KiB
TypeScript
67 lines
1.8 KiB
TypeScript
import { XMLParser } from "fast-xml-parser";
|
|
import * as cheerio from 'cheerio';
|
|
|
|
/** A single documentation page as handled by this script. */
type Page = {
  /** Absolute URL the page was downloaded from. */
  url: string;
  /** HTML markup of the page. */
  contents: string;
};
|
|
|
|
/**
 * Entry point: reads the Typst sitemap, keeps the URLs that contain
 * "/reference/", then downloads, cleans and saves each of those pages.
 *
 * @throws Error when the sitemap request does not return a successful status.
 */
const main = async (): Promise<void> => {
  const sitemapLink = "https://typst.app/sitemap-0.xml";

  const sitemapResp = await fetch(sitemapLink);
  if (!sitemapResp.ok) {
    // Parsing an error page as XML would only fail later with a confusing message.
    throw new Error(
      `failed to fetch sitemap ${sitemapLink}: ${sitemapResp.status} ${sitemapResp.statusText}`,
    );
  }
  const sitemap = await sitemapResp.text();

  const parser = new XMLParser();
  const result = parser.parse(sitemap);

  // The parser yields a plain object instead of an array when the sitemap
  // has exactly one <url> entry, and nothing at all when it has none.
  const rawEntries: unknown = result?.urlset?.url ?? [];
  const entries: unknown[] = Array.isArray(rawEntries) ? rawEntries : [rawEntries];

  const urls = entries
    .map((entry) => (entry as { loc?: unknown } | null)?.loc)
    .filter(
      (loc): loc is string => typeof loc === "string" && loc.includes("/reference/"),
    );

  // Each stage finishes for every page before the next stage starts.
  const pages = await Promise.all(urls.map((url) => fetchPage(url)));
  const cleanPages = await Promise.all(pages.map((page) => stripPage(page)));
  await Promise.all(cleanPages.map((page) => savePage(page)));
};
|
|
|
|
/**
 * Downloads one page and returns it together with its URL.
 *
 * @param url - Absolute URL of the page to download.
 * @returns The URL and the response body as text.
 * @throws Error when the server answers with a non-successful status, so an
 *   error page is never stored as if it were documentation.
 */
async function fetchPage(url: string): Promise<Page> {
  console.log(`downloading ${url}...`);

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `failed to download ${url}: ${response.status} ${response.statusText}`,
    );
  }

  return { url, contents: await response.text() };
}
|
|
|
|
/** Tag names whose elements are removed from every page by `stripPage`. */
const unneededElements: string[] = [
  "header",
  "nav",
  "footer",
  "head",
  "script",
];
|
|
|
|
/**
 * Removes every element listed in `unneededElements` from a page's markup.
 *
 * @param page - Page whose HTML should be cleaned; it is not modified.
 * @returns A new page with the same URL and the cleaned, re-serialized HTML.
 */
async function stripPage(page: Page): Promise<Page> {
  console.log(`cleaning page ${page.url}`);

  const $ = cheerio.load(page.contents);
  for (const tag of unneededElements) {
    $(tag).remove();
  }

  // Build a fresh object instead of overwriting the caller's page in place.
  return { ...page, contents: $.html() };
}
|
|
|
|
/**
 * Writes a page to `typst-docs/<name>.html`.
 *
 * The name is built from the part of the URL after "/reference/": path
 * segments are joined with "_", and an empty path becomes "index". For
 * example ".../reference/text/raw/" is written to "typst-docs/text_raw.html".
 *
 * @param page - Page to write; `page.url` must contain "/reference/".
 * @throws Error when the URL has no "/reference/" part to derive a name from.
 */
async function savePage(page: Page): Promise<void> {
  const marker = "/reference/";
  const refIdx = page.url.indexOf(marker);
  if (refIdx === -1) {
    throw new Error(
      `cannot derive a file name: "${page.url}" does not contain "${marker}"`,
    );
  }

  // Dropping empty segments handles URLs with and without a trailing slash,
  // so the last character of the name is never cut off by accident.
  const segments = page.url
    .substring(refIdx + marker.length)
    .split("/")
    .filter((segment) => segment !== "");
  const baseName = segments.length > 0 ? segments.join("_") : "index";
  const fileName = `${baseName}.html`;

  console.log(`writing ${fileName}`);
  await Bun.write(`typst-docs/${fileName}`, page.contents);
  console.log(`wrote ${fileName} to disk!`);
}
|
|
|
|
// Start the scraper; the returned promise is intentionally not awaited here.
void main();
|