-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlinkfollower.js
86 lines (74 loc) · 2.89 KB
/
linkfollower.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/** User-Agent sent with every request: identifies as Edge 126 on Windows 10 (x64). */
const edgeOnWin10 =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.2592.102';

/**
 * Regex source (passed as a string to String.prototype.match) that recognises a
 * zero-delay meta refresh, e.g. `content="0; url=https://example.com">`.
 * Capture group 3 holds the target URL; a successful match has 5 entries.
 */
const metaRefreshPattern = '(CONTENT|content)=["\']0;[ ]*(URL|url)=(.*?)(["\']*>)';

/** Upper bound on the number of hops followed for one starting URL. */
const MAX_REDIRECT_DEPTH = 10;

/**
 * Options handed to every fetch() call.
 * `redirect: 'manual'` keeps fetch from following redirects itself so each hop
 * can be inspected individually.
 * NOTE(review): `follow` is not a standard fetch option - presumably kept for
 * node-fetch compatibility; confirm before removing.
 */
const fetchOptions = {
  redirect: 'manual',
  follow: 0,
  headers: {
    'User-Agent': edgeOnWin10,
    Accept: 'text/html',
  },
};
/**
 * Follows a chain of redirects starting at `url`, one hop at a time.
 *
 * Each hop's result object (as produced by `visit`) is yielded. The chain ends
 * when a result is not a redirect, when an error is thrown, or after
 * MAX_REDIRECT_DEPTH hops.
 *
 * Note: the error and "max depth" outcomes are delivered as the generator's
 * *return* value (the `value` of the final `{ done: true }` result). They are
 * not yielded, so a plain `for await...of` loop does not see them.
 *
 * @param {URL|string} url - Address to start from.
 * @yields {object} The result of visiting each hop.
 * @returns {{url: *, status: string}|undefined} A terminal status object on
 *   error or when the depth limit is hit; `undefined` on a normal finish.
 */
export default async function* startFollowing(url) {
  let target = url;

  for (let visits = 0; visits < MAX_REDIRECT_DEPTH; visits++) {
    try {
      const result = await visit(target);
      const shouldContinue = result.redirect;
      // Advance before yielding so a failure reported afterwards refers to the next hop.
      target = shouldContinue ? new URL(result.redirectUrl) : null;
      yield result;
      if (!shouldContinue) {
        return;
      }
    } catch (err) {
      return { url: target, status: `${err}` };
    }
  }

  return { url: target, status: `Max redirect depth of ${MAX_REDIRECT_DEPTH} exceeded` };
}
/**
 * Requests `url` once, without following redirects, and classifies the response.
 *
 * Possible results:
 *  - HTTP redirect with a Location header:
 *      `{ url, redirect: true, status, redirectUrl }`
 *  - HTTP redirect without a usable Location header:
 *      `{ status: '<url> responded with status <code> but no location header' }`
 *  - 200 whose body contains a client-side redirect found by one of `extractors`:
 *      `{ url, redirect: true, status: '200 + extracted', redirectUrl }`
 *  - any other response (including a plain 200):
 *      `{ url, redirect: false, status }`
 *  - any thrown error (network failure, invalid URL, ...):
 *      `{ status: <error message> }`
 *
 * This function never rejects; failures are reported through `status`.
 *
 * @param {URL|string} url - Address to request.
 * @returns {Promise<object>} One of the result shapes listed above.
 */
const visit = async url => {
  try {
    const response = await fetch(url, fetchOptions);
    const { status } = response;

    if (isRedirect(status)) {
      // headers.get() yields null when the header is absent; guard it so the
      // explicit "no location header" result is returned instead of a TypeError.
      // A single trailing slash is stripped from the target, as before.
      const locationHeader = response.headers.get('location')?.replace(/\/$/, '');
      if (!locationHeader) {
        return { status: `${url} responded with status ${status} but no location header` };
      }
      // `url` may be a plain string (no `.origin`), so derive the origin by parsing.
      const base = url.origin ?? new URL(url).origin;
      return { url, redirect: true, status, redirectUrl: addBaseTo(locationHeader, base) };
    }

    if (status === 200) {
      const html = await response.text();
      const extracted = extractors
        .flatMap((extractor) => extractor(html))
        .filter((candidate) => candidate != null);
      if (extracted.length !== 0) {
        const base = url.origin ?? new URL(url).origin;
        // The first extractor that finds something wins.
        return { url, redirect: true, status: '200 + extracted', redirectUrl: addBaseTo(extracted[0], base) };
      }
    }

    // Terminal response: a plain 200, or any non-redirect status such as 404 or
    // 500. Previously the latter fell through and returned undefined.
    return { url, redirect: false, status };
  } catch (error) {
    return { status: `${error.message}` };
  }
};
/**
 * Tells whether an HTTP status code is one of the redirect codes this module
 * follows (301, 302, 303, 307, 308).
 *
 * @param {number} status - HTTP status code.
 * @returns {boolean} True only for the listed codes.
 */
const isRedirect = status => {
  const followedCodes = [301, 302, 303, 307, 308];
  return followedCodes.includes(status);
};
/**
 * Pulls the target URL out of a zero-delay meta refresh in `html`.
 *
 * @param {string} html - Document body to search.
 * @returns {string|null} The target with quotes and spaces removed, or null
 *   when `metaRefreshPattern` does not match.
 */
const extractMetaRefreshUrl = html => {
  const found = html.match(metaRefreshPattern);
  // A full match has the whole match plus four capture groups.
  if (!found || found.length !== 5) {
    return null;
  }
  const target = found[3];
  return stripUnwantedCharsFrom(target);
};
/**
 * Finds the first anchor carrying `name="external_url_click"` whose text is an
 * http(s) URL surrounded by whitespace, and returns that URL.
 *
 * @param {string} html - Document body to search.
 * @returns {string|null} The trimmed URL, or null when no such anchor exists.
 */
const extractFromLinkedIn = (html) => {
  const anchorPattern = /<a.*name="external_url_click".*>\s+(http[s]?:\/\/.*\s+)<\/a>/g;
  // Only the first hit matters, so take it straight from the iterator.
  const firstHit = html.matchAll(anchorPattern).next();
  if (firstHit.done) {
    return null;
  }
  return firstHit.value[1].trim();
};
/**
 * Removes every single quote, double quote and space from `url`.
 *
 * @param {string} url - Raw text captured from markup.
 * @returns {string} The text without those characters.
 */
const stripUnwantedCharsFrom = (url) => {
  const unwanted = /['" ]/g;
  return url.replace(unwanted, '');
};
/**
 * Turns a redirect target into a URL object, resolving it against `base` when
 * it is not already an absolute http(s) URL.
 *
 * Absolute http(s) targets are parsed on their own, so a missing or unusable
 * `base` cannot make them fail. Everything else (paths, protocol-relative
 * URLs, and relative paths that merely begin with the letters "http") is
 * resolved against `base`.
 *
 * @param {string} maybeCompleteUrl - Absolute or relative target.
 * @param {string|URL} [base] - Base used for relative targets.
 * @returns {URL} The resolved URL.
 * @throws {TypeError} If the target cannot be parsed (or resolved against `base`).
 */
const addBaseTo = (maybeCompleteUrl, base) => {
  const isAbsoluteHttp = /^https?:\/\//i.test(maybeCompleteUrl);
  if (isAbsoluteHttp) {
    // The original built this URL but discarded it (missing `return`).
    return new URL(maybeCompleteUrl);
  }
  return new URL(maybeCompleteUrl, base);
};
// Extractor functions that `visit` applies, in this order, to the body of a 200
// response. Each returns a URL string or null; the first non-null result is
// used as the redirect target.
const extractors = [extractMetaRefreshUrl, extractFromLinkedIn]