-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscrapeLinks.ts
66 lines (62 loc) · 1.89 KB
/
scrapeLinks.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import path from 'path'
/**
 * Runs `regex` over `input` and collects one value per match.
 *
 * The pattern is copied before use, so the caller's `lastIndex` is neither
 * read nor modified, and a pattern without the `g` flag is treated as if it
 * had one (a non-global pattern would otherwise return the same match forever).
 *
 * @param input - Text to search.
 * @param regex - Pattern to apply to every position of `input`.
 * @param iteratee - Maps a match array to the value to collect; defaults to
 *   capture group 1. Falsy results are dropped.
 * @returns The truthy iteratee results, in match order.
 */
const regexMap = (
  input: string,
  regex: RegExp,
  iteratee = (x: string[]) => x[1]
): string[] => {
  // Work on a private global copy: guarantees the scan starts at index 0,
  // always advances, and leaves the caller's regex state alone.
  const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`
  const matcher = new RegExp(regex.source, flags)

  const results: string[] = []
  let match = matcher.exec(input)
  while (match !== null) {
    const result = iteratee(match)
    if (result) results.push(result)
    // A zero-length match does not move lastIndex; step past it manually so
    // the loop cannot spin on the same position.
    if (match[0] === '') matcher.lastIndex += 1
    match = matcher.exec(input)
  }
  return results
}
/**
 * Extracts link targets from a piece of text, choosing the extraction
 * strategy from the extension of `filePath`.
 *
 * - `.md` / `.mdx`: inline Markdown links, reference-style link definitions
 *   and `href="..."` attributes, in that order. A leading `/static` path
 *   segment is removed from each result.
 * - `.html`: `href="..."` attributes.
 * - `.json`: see the review note on that case.
 * - anything else: bare `http(s)://` and `www.` URLs.
 *
 * @param filePath - Path the content came from; only its extension is used.
 * @param content - Text to scan.
 * @returns The link targets found; always an array, never `null`.
 */
const scrapeFromString: (filePath: string, content: string) => string[] = (
  filePath,
  content
) => {
  switch (path.extname(filePath)) {
    case '.md':
    case '.mdx': {
      const staticPrefix = '/static'
      // Inline links: [text](target) or [text](<target>), optionally followed
      // by a quoted title. Group 1 is the <>-wrapped target, group 2 the bare one.
      const mdLinks = regexMap(
        content,
        /\[.*?\]\((?:<((?:\(.*?\)|.)*?)>|((?:\(.*?\)|.)*?))(?: ["'].*?["'])?\)/gm,
        x => x[2] || x[1]
      )
      // Reference definitions: "[label]: target", where the target may sit on
      // the following line. The pattern has a single capture group.
      const mdRefLinks = regexMap(
        content,
        /\[.*\]:[^\S\r\n]*\n?[^\S\r\n]*((?:\/|https?:\/\/|www)\S*)/gm,
        x => x[1]
      )
      const hrefLinks = regexMap(content, /href="(.*?)"/gm)
      return mdLinks
        .concat(mdRefLinks)
        .concat(hrefLinks)
        .filter(Boolean)
        .map(link =>
          // Strip the prefix only when it is a whole path segment, so that
          // e.g. "/statics/a" is not mangled into "s/a".
          link === staticPrefix || link.startsWith(`${staticPrefix}/`)
            ? link.slice(staticPrefix.length)
            : link
        )
    }
    case '.html':
      return regexMap(content, /href="(.*?)"/gm)
    case '.json':
      // NOTE(review): this pattern has no capture group, and no iteratee is
      // passed here, so if regexMap's default reads group 1 this case yields
      // an empty list. Confirm whether JSON scraping is intentionally disabled
      // or the pattern lost its body.
      return regexMap(content, /"(?:(?:https?:)?\/\/)?(?:)"/gm)
    default:
      // credit to https://urlregex.com/, but modified to only hit http/s protocol
      return regexMap(
        content,
        /(((https?:\/\/)[A-Za-z0-9.-]+|(?:www\.)[A-Za-z0-9.-]+)((?:\/[+~%/.\w\-_]*)?\??(?:[-+=&;%@.\w_]*)#?(?:[.!/\\\w]*))?)/gm,
        x => x[0]
      )
  }
}
/**
 * Default link scraper: collects the links found in a file's content.
 *
 * `content` may be the whole text or a list of text chunks; each chunk is
 * passed to `scrapeFromString` together with `filePath`, and the results are
 * joined in chunk order. Empty entries and links starting with `#` are
 * removed from the final list.
 *
 * @param args.filePath - Path of the file being scanned.
 * @param args.content - The text to scan, as one string or an array of strings.
 * @returns The remaining links, in the order they were scraped.
 */
const defaultScrapeLinks: (args: {
  filePath: string
  content: string | string[]
}) => string[] = ({ filePath, content }) => {
  // Treat a single string as a one-element list so both shapes share a path.
  const chunks = Array.isArray(content) ? content : [content]
  const scraped = chunks.reduce<string[]>(
    (collected, chunk) => collected.concat(scrapeFromString(filePath, chunk)),
    []
  )
  return scraped.filter(link => !(!link || link.startsWith('#')))
}
export default defaultScrapeLinks