repos / neovimcraft

website that makes it easy to find neovim plugins
git clone https://github.com/neurosnap/neovimcraft.git

neovimcraft / src / scripts
Eric Bower · 09 Mar 23

scrape.ts

 1import { marked } from "../deps.ts";
 2import type { Resource } from "../types.ts";
 3import { createResource } from "../entities.ts";
 4
 5const URLS = [
 6  "https://raw.githubusercontent.com/rockerBOO/awesome-neovim/main/README.md",
 7];
 8
 9Promise.all(URLS.map((url) => fetchMarkdown(url).then(processMarkdown)))
10  .then((resources) => {
11    const flatten = resources.reduce((acc, r) => {
12      acc.push(...r);
13      return acc;
14    }, []);
15    return flatten;
16  })
17  .then(saveScrapeData)
18  .catch(console.error);
19
/**
 * Downloads the document at `url` and resolves with its body as text.
 *
 * @param url URL of the markdown document to download.
 * @returns The response body decoded as text.
 * @throws Error when the server answers with a non-2xx status, so that the
 *   body of an error page is never handed to the markdown parser.
 */
async function fetchMarkdown(url: string): Promise<string> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `failed to fetch ${url}: ${response.status} ${response.statusText}`,
    );
  }
  return await response.text();
}
25
/**
 * Turns a heading into a tag slug.
 *
 * Two headings are mapped to fixed tags (matched exactly, before any
 * normalisation). Every other heading is lower-cased and each whitespace
 * character is replaced by a single `-`.
 */
function sanitizeTag(tag: string) {
  switch (tag) {
    case "(requires neovim 0.5)":
      return "neovim-0.5";
    case "tree-sitter supported colorscheme":
      return "treesitter-colorschemes";
    default: {
      const lowered = tag.toLocaleLowerCase();
      return lowered.replace(/\s/g, "-");
    }
  }
}
33
/** Headings whose lists are skipped entirely (hardcoded deny-list). */
const DENIED_HEADINGS = ["contents", "vim", "ui", "wishlist", "resource"];

/**
 * Matches `http(s)://github.com/<owner>/<repo>` (optionally with `www.`) and
 * captures owner and repo. Each segment stops at `/`, `?` or `#`, so deeper
 * paths, query strings and fragments never leak into the repo name.
 */
const GITHUB_REPO_URL =
  /^https?:\/\/(?:www\.)?github\.com\/([^/?#]+)\/([^/?#]+)/i;

/**
 * Extracts owner and repository name from a GitHub repository link.
 *
 * @returns `undefined` for anything that is not a github.com link with both
 *   an owner and a repo segment (other hosts, `gist.github.com`, profile-only
 *   links, scheme-less or malformed URLs).
 */
function parseGithubRepo(
  link: string,
): { username: string; repo: string } | undefined {
  const match = GITHUB_REPO_URL.exec(link);
  if (!match) return undefined;
  return { username: match[1], repo: match[2] };
}

/**
 * Collects one resource per GitHub repository link found in the list items of
 * a markdown document.
 *
 * Each resource is tagged with the sanitized trail of headings (depth 2 and
 * deeper) under which its list appears. Lists under a deny-listed heading and
 * entries that are not github.com repository links are skipped.
 *
 * @param text Markdown source to scan.
 * @returns The resources in document order.
 */
function processMarkdown(text: string): Resource[] {
  const resources: Resource[] = [];
  // Lower-cased trail of the headings enclosing the current position.
  let headings: string[] = [];

  marked.lexer(text).forEach((token: any) => {
    if (token.type === "heading" && token.depth > 1) {
      // A depth-N heading truncates the trail to N-2 entries and is then
      // appended; depth-1 headings never take part in tagging.
      headings = headings.slice(0, token.depth - 2);
      headings.push(token.text.toLocaleLowerCase());
      return;
    }

    if (token.type !== "list") return;

    // The heading trail cannot change inside a list, so decide once per list.
    if (headings.some((heading) => DENIED_HEADINGS.includes(heading))) return;

    for (const item of token.items) {
      for (const child of item.tokens ?? []) {
        // The first inline token of an entry is expected to be its link.
        const link = child.tokens?.[0]?.href;
        if (typeof link !== "string") continue;

        const parsed = parseGithubRepo(link);
        if (!parsed) continue;

        // Build a fresh tag array per resource so that entries never share
        // one mutable array.
        const resource = createResource({ tags: headings.map(sanitizeTag) });
        resource.username = parsed.username;
        resource.repo = parsed.repo;
        resources.push(resource);
      }
    }
  });

  return resources;
}
88
/**
 * Writes the scraped resources to `./data/scrape.json` as pretty-printed JSON
 * of the form `{ "resources": [...] }`.
 *
 * Resources are ordered by username, then by repo, so the output does not
 * depend on the order in which they were scraped. The sort runs on a copy, so
 * the caller's array is left untouched.
 *
 * @param resources Resources to persist.
 */
async function saveScrapeData(resources: Resource[]): Promise<void> {
  const sorted = [...resources].sort((a, b) => {
    if (a.username === b.username) {
      return a.repo.localeCompare(b.repo);
    }
    return a.username.localeCompare(b.username);
  });
  const json = JSON.stringify({ resources: sorted }, null, 2);
  await Deno.writeTextFile("./data/scrape.json", json);
}