Eric Bower
·
09 Mar 23
scrape.ts
1import { marked } from "../deps.ts";
2import type { Resource } from "../types.ts";
3import { createResource } from "../entities.ts";
4
// Markdown documents to scrape. Each one is downloaded and parsed
// independently; the results are merged into a single list before saving.
const URLS = [
  "https://raw.githubusercontent.com/rockerBOO/awesome-neovim/main/README.md",
];

// Script entry point: fetch and parse every URL in parallel, merge the
// per-document resource lists, then persist them. Any failure along the way
// is reported on stderr via console.error.
Promise.all(URLS.map((url) => fetchMarkdown(url).then(processMarkdown)))
  // `flat()` merges the per-URL arrays without spreading every element into a
  // single `push(...)` call, which can exceed the engine's argument limit for
  // very large lists.
  .then((perUrl) => saveScrapeData(perUrl.flat()))
  .catch(console.error);
19
/**
 * Downloads the document at `url` and resolves with its body as text.
 *
 * A non-2xx response is treated as a failure rather than returned as content,
 * so an error page is never handed to the markdown parser as if it were the
 * requested document.
 *
 * @param url - URL of the markdown file to download.
 * @returns The response body decoded as text.
 * @throws Error when the response status is not OK. Errors raised by `fetch`
 *   itself (e.g. network failures) propagate unchanged.
 */
async function fetchMarkdown(url: string): Promise<string> {
  const response = await fetch(url);
  if (!response.ok) {
    // The body will not be read, so cancel it to release the underlying stream.
    await response.body?.cancel();
    throw new Error(
      `failed to fetch ${url}: ${response.status} ${response.statusText}`,
    );
  }
  return await response.text();
}
25
/**
 * Converts a heading into a tag slug.
 *
 * The heading is trimmed and lowercased first, so the two special-cased
 * headings are recognised regardless of letter case or surrounding
 * whitespace. Every other heading has each run of whitespace replaced by a
 * single hyphen.
 *
 * @param tag - Heading text to convert.
 * @returns The tag slug for that heading.
 */
function sanitizeTag(tag: string): string {
  // `toLowerCase` (not `toLocaleLowerCase`) keeps the slug independent of the
  // host machine's locale.
  const normalized = tag.trim().toLowerCase();

  switch (normalized) {
    case "(requires neovim 0.5)":
      return "neovim-0.5";
    case "tree-sitter supported colorscheme":
      return "treesitter-colorschemes";
    default:
      return normalized.replace(/\s+/g, "-");
  }
}
33
/**
 * Extracts GitHub repositories from a markdown document.
 *
 * The top-level tokens produced by `marked.lexer` are walked in order. Headings
 * deeper than level 1 maintain a trail of the current section (lowercased);
 * every list found under that trail contributes one resource per entry whose
 * first inline token is a link to `github.com/<owner>/<repo>`. The resource is
 * tagged with the sanitized heading trail.
 *
 * Entries are skipped when:
 * - any heading in the current trail is on the deny-list,
 * - the entry has no inline tokens or its first inline token has no `href`,
 * - the link is not an http(s) URL on github.com with both an owner and a
 *   repository segment (gists, other hosts and owner-only links are ignored
 *   instead of producing malformed records or throwing).
 *
 * @param text - Markdown source to scan.
 * @returns The resources found, in document order.
 */
function processMarkdown(text: string): Resource[] {
  // Sections whose lists are not scraped.
  const deniedHeadings = ["contents", "vim", "ui", "wishlist", "resource"];
  // Captures <owner> and <repo>; anything after the repo segment (sub-path,
  // query string, fragment) is ignored.
  const githubRepo =
    /^https?:\/\/(?:www\.)?github\.com\/([^\/?#]+)\/([^\/?#]+)/i;

  const resources: Resource[] = [];
  let headings: string[] = [];

  // Tokens are handled as `any`: only a handful of fields are probed and each
  // access below is guarded where the field may be absent.
  // deno-lint-ignore no-explicit-any
  const tokens: any[] = marked.lexer(text);

  for (const token of tokens) {
    if (token.type === "heading" && token.depth > 1) {
      // Drop the trail entries at this depth or deeper, then record this
      // heading. Level-2 headings are the root of the trail.
      headings = headings.slice(0, token.depth - 2);
      // Locale-independent lowercasing so the deny-list comparison does not
      // depend on the host locale.
      headings.push(token.text.toLowerCase());
      continue;
    }

    if (token.type !== "list") continue;
    if (headings.some((heading) => deniedHeadings.includes(heading))) continue;

    for (const item of token.items ?? []) {
      for (const child of item.tokens ?? []) {
        // Only the first inline token of an entry is considered; it is
        // expected to be the link to the project.
        const href = child.tokens?.[0]?.href;
        if (typeof href !== "string") continue;

        const match = githubRepo.exec(href);
        if (match === null) continue;
        const [, username, repo] = match;

        // Build the resource only once the link is known to be usable; each
        // resource gets its own tags array.
        const resource = createResource({ tags: headings.map(sanitizeTag) });
        resource.username = username;
        resource.repo = repo;
        resources.push(resource);
      }
    }
  }

  return resources;
}
88
/**
 * Writes the scraped resources to `./data/scrape.json` as pretty-printed JSON
 * of the form `{ "resources": [...] }`, ordered by username and then by repo.
 *
 * The input array is not modified; sorting is done on a copy.
 *
 * @param resources - Resources to persist.
 * @returns A promise that settles once the file has been written.
 */
async function saveScrapeData(resources: Resource[]): Promise<void> {
  // `Array.prototype.sort` sorts in place, so copy first to leave the caller's
  // array in its original order.
  const sorted = [...resources].sort((a, b) => {
    if (a.username === b.username) {
      return a.repo.localeCompare(b.repo);
    }
    return a.username.localeCompare(b.username);
  });

  const json = JSON.stringify({ resources: sorted }, null, 2);
  await Deno.writeTextFile("./data/scrape.json", json);
}