forked from github/docs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
check-github-github-links.js
executable file
·226 lines (195 loc) · 7.38 KB
/
check-github-github-links.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/usr/bin/env node
// [start-readme]
//
// Run this script to get all broken docs.github.com links in github/github
//
// [end-readme]
import fs from 'fs/promises'
import got, { RequestError } from 'got'
import { getContents, getPathsWithMatchingStrings } from './helpers/git-utils.js'
if (!process.env.GITHUB_TOKEN) {
throw new Error('Error! You must have a GITHUB_TOKEN set in an .env file to run this script.')
}
const FORCE_DOWNLOAD = Boolean(JSON.parse(process.env.FORCE_DOWNLOAD || 'false'))
const BATCH_SIZE = JSON.parse(process.env.BATCH_SIZE || '10')
const BASE_URL = process.env.BASE_URL || 'http://localhost:4000'
main()
// The way `got` does retries:
//
// sleep = 1000 * Math.pow(2, retry - 1) + Math.random() * 100
//
// So, it means:
//
// 1. ~1000ms
// 2. ~2000ms
// 3. ~4000ms
//
// ...if the limit we set is 3.
// Our own timeout, in ./middleware/timeout.js defaults to 10 seconds.
// So there's no point in trying more attempts than 3 because it would
// just timeout on the 10s. (i.e. 1000 + 2000 + 4000 + 8000 > 10,000)
const retryConfiguration = {
limit: 3,
}
// According to our Datadog metrics, the *average* time for the
// the 'archive_enterprise_proxy' metric is ~70ms (excluding spikes)
// which much less than 500ms.
const timeoutConfiguration = {
request: 3000,
}
async function main() {
const searchStrings = ['https://docs.github.com', 'GitHub help_url', 'GitHub developer_help_url']
const foundFiles = []
try {
foundFiles.push(...JSON.parse(await fs.readFile('/tmp/foundFiles.json', 'utf-8')))
} catch (error) {
if (!(error.code && error.code === 'ENOENT')) {
throw error
}
}
if (!foundFiles.length || FORCE_DOWNLOAD) {
foundFiles.push(...(await getPathsWithMatchingStrings(searchStrings, 'github', 'github')))
await fs.writeFile('/tmp/foundFiles.json', JSON.stringify(foundFiles, undefined, 2), 'utf-8')
}
const searchFiles = [...new Set(foundFiles)] // filters out dupes
.filter((file) => endsWithAny(['.rb', '.yml', '.yaml', '.txt', '.pdf', '.erb', '.js'], file))
.filter(
(file) =>
!file.includes('test/') &&
!file.includes('app/views/') &&
!file.includes('config.') &&
!file.includes('app/api/description/')
)
const docsLinksFiles = []
const urlRegEx =
/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g
try {
docsLinksFiles.push(...JSON.parse(await fs.readFile('/tmp/docsLinksFiles.json', 'utf-8')))
} catch (error) {
if (!(error.code && error.code === 'ENOENT')) {
throw error
}
}
if (!docsLinksFiles.length || FORCE_DOWNLOAD) {
for (const file of searchFiles) {
const contents = await getContents('github', 'github', 'master', file)
if (
contents.includes('https://docs.github.com') ||
contents.includes('GitHub.help_url') ||
contents.includes('GitHub.developer_help_url')
) {
const docsIndices = getIndicesOf('https://docs.github.com', contents)
const helpIndices = getIndicesOf('GitHub.help_url', contents)
helpIndices.push(...getIndicesOf('GitHub.developer_help_url', contents))
if (docsIndices.length > 0) {
docsIndices.forEach((numIndex) => {
// Assuming we don't have links close to 500 characters long
const docsLink = contents.substring(numIndex, numIndex + 500).match(urlRegEx)
const linkURL = new URL(docsLink[0].toString().replace(/[^a-zA-Z0-9]*$|\\n$/g, ''))
const linkPath = linkURL.pathname + linkURL.hash
docsLinksFiles.push({ linkPath, file })
})
}
if (helpIndices.length > 0) {
helpIndices.forEach((numIndex) => {
// There are certain links like #{GitHub.help_url}#{learn_more_path} and #{GitHub.developer_help_url}#{learn_more_path} that we should skip
if (
(contents.substring(numIndex, numIndex + 11) === 'GitHub.help' &&
contents.charAt(numIndex + 16) === '#') ||
(contents.substring(numIndex, numIndex + 16) === 'GitHub.developer' &&
contents.charAt(numIndex + 26) === '#') ||
// See internal issue #2180
contents.slice(numIndex, numIndex + 'GitHub.help_url}/github/#{'.length) ===
'GitHub.help_url}/github/#{'
) {
return
}
const startSearchIndex = contents.indexOf('/', numIndex)
// Looking for the closest '/' after GitHub.developer_help_url or GitHub.help_url
// There are certain links that don't start with `/` so we want to skip those.
// If there's no `/` within 30 characters of GitHub.help_url/GitHub.developer_help_url, skip
if (startSearchIndex - numIndex < 30) {
const linkPath = contents
.substring(
startSearchIndex,
regexIndexOf(
contents,
/\n|"\)|{@email_tracking_params}|\^http|Ahttps|example|This|TODO"|[{}|"%><.,')* ]/,
startSearchIndex + 1
)
)
.trim()
// Certain specific links can be ignored as well
if (['/deprecation-1'].includes(linkPath)) {
return
}
docsLinksFiles.push({ linkPath, file })
}
})
}
}
}
await fs.writeFile(
'/tmp/docsLinksFiles.json',
JSON.stringify(docsLinksFiles, undefined, 2),
'utf-8'
)
}
const brokenLinks = []
// Break up the long list of URLs to test into batches
for (const batch of [...Array(Math.floor(docsLinksFiles.length / BATCH_SIZE)).keys()]) {
const slice = docsLinksFiles.slice(batch * BATCH_SIZE, batch * BATCH_SIZE + BATCH_SIZE)
await Promise.all(
slice.map(async ({ linkPath, file }) => {
// This isn't necessary but if it can't be constructed, it'll
// fail in quite a nice way and not "blame got".
const url = new URL(BASE_URL + linkPath)
try {
await got(url.href, {
retry: retryConfiguration,
timeout: timeoutConfiguration,
})
} catch (error) {
if (error instanceof RequestError) {
brokenLinks.push({ linkPath, file })
} else {
console.warn(`URL when it threw: ${url}`)
throw error
}
}
})
)
}
if (!brokenLinks.length) {
console.log('All links are good!')
process.exit(0)
}
console.log(`Found ${brokenLinks.length} total broken links in github/github`)
console.log('```')
console.log(`${JSON.stringify([...brokenLinks], null, 2)}`)
console.log('```')
// Exit unsuccessfully if broken links are found.
process.exit(1)
}
function endsWithAny(suffixes, string) {
for (const suffix of suffixes) {
if (string.endsWith(suffix)) return true
}
return false
}
function getIndicesOf(searchString, string) {
const searchStrLen = searchString.length
if (searchStrLen === 0) return []
let startIndex = 0
let index
const indices = []
while ((index = string.indexOf(searchString, startIndex)) > -1) {
indices.push(index)
startIndex = index + searchStrLen
}
return indices
}
function regexIndexOf(string, regex, startPos) {
const indexOf = string.substring(startPos || 0).search(regex)
return indexOf >= 0 ? indexOf + (startPos || 0) : indexOf
}