Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 70 additions & 42 deletions scripts/checkLinks.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
//@ts-nocheck
import {Browser, chromium, Page, Response} from "@playwright/test";
import { Browser, chromium, Page, Response } from "@playwright/test";
import * as fs from "node:fs";

const brokenLinks = new Set<string>();
const visitedLinks = new Set<string>();
const linkQueue: { url: string, path: string }[] = [];
const MAX_PAGES = 5;
let activeCount = 0;

async function main() {
const browserInstance = await chromium.launch();
const baseUrl = process.env.BASE_URL || 'https://www.docs.sei.io/';
await crawlPages(baseUrl, browserInstance, 'main');
linkQueue.push({ url: baseUrl, path: 'main' });

const workers = [];
for (let i = 0; i < MAX_PAGES; i++) {
workers.push(processLink(browserInstance, i));
}
await Promise.all(workers);

fs.writeFileSync('brokenLinks.json', JSON.stringify([...brokenLinks], null, 2));

if (brokenLinks.size > 0) {
Expand All @@ -18,54 +28,69 @@ async function main() {
await browserInstance.close();
}

async function crawlPages(url: string, browser: Browser, path: string) {
const ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81"
const page = await browser.newPage({userAgent: ua});
console.info('Visiting: ' + path);
if(isInternal(url)){
await checkInternalLinks(url, page, path, browser);
} else {
await checkExternalLinks(url, page, path, browser);
/**
 * Worker loop: repeatedly takes a task off the shared `linkQueue`, opens a
 * fresh browser page for it and hands it to `processPage`. The loop ends once
 * the queue is empty and no worker is in the middle of a task.
 *
 * @param browser  Browser used to open one page per task.
 * @param workerId Number used only to label this worker's log output.
 */
async function processLink(browser: Browser, workerId: number) {
  const ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.81";
  while (true) {
    const currentTask = linkQueue.shift();

    if (!currentTask) {
      // An empty queue is only final when nobody is busy: a worker that is
      // still processing a page may push more links onto the queue.
      if (linkQueue.length === 0 && activeCount === 0) break;
      await new Promise(res => setTimeout(res, 100));
      continue;
    }

    const { url, path } = currentTask;

    // There is no await between the check and the add, so two workers cannot
    // both claim the same URL.
    if (visitedLinks.has(url)) continue;
    visitedLinks.add(url);

    activeCount++;
    console.info(`Worker ${workerId} visiting: ${path}`);
    try {
      // Opening the page happens inside the try so that a failure here is
      // logged and still reaches the `activeCount--` below; otherwise the
      // remaining workers would wait forever for the counter to hit zero.
      const page = await browser.newPage({ userAgent: ua });
      try {
        await processPage(page, path, url);
      } finally {
        await page.close();
      }
    } catch (error) {
      console.error(`Worker ${workerId} error processing ${url}: ${error}`);
    } finally {
      activeCount--;
    }
  }
}

/**
 * Reports whether `url` points at the documentation site itself (a host
 * containing `docs.sei`) or at the local dev server (`localhost:3000`).
 *
 * Only the host part of the URL is inspected, so an external link that merely
 * mentions `docs.sei` in its path or query string is not mistaken for an
 * internal page.
 *
 * @param url Link to classify.
 * @returns `true` when the link's host matches one of the internal hosts.
 */
function isInternal(url: string) {
  let host: string;
  try {
    host = new URL(url).host;
  } catch {
    // Not an absolute URL: fall back to matching against the raw string.
    host = url;
  }
  return host.includes('docs.sei') || host.includes('localhost:3000');
}

async function checkInternalLinks(url: string, page: Page, path: string, browser: Browser) {
const isBroken = await isLinkBroken(page, url, path);
visitedLinks.add(url);
if (isBroken) {
await page.close();
return;
}
const linksToCheck = await getLinksFromPage(page, path);
await page.close();
for (const link of linksToCheck){
await crawlPages(link, browser, `${url} => ${link}`)
/**
 * Checks a single link and, for internal pages that load, queues the links
 * found on that page.
 *
 * External links are only checked for breakage, and that check is skipped when
 * `path` contains `learn/getting-tokens`.
 *
 * @param page Browser page used to open the link.
 * @param path Trail of links leading to this one; used in logs and as the
 *             prefix of the path recorded for newly queued links.
 * @param url  Link to check.
 */
async function processPage(page: Page, path: string, url:string) {
  if (!isInternal(url)) {
    if (path.includes('learn/getting-tokens')) return;
    await isLinkBroken(page, url, path);
    return;
  }

  const broken = await isLinkBroken(page, url, path);
  if (broken) return;

  const discovered = await getLinksFromPage(page, path);
  for (const link of discovered) {
    if (visitedLinks.has(link)) continue;
    linkQueue.push({ url: link, path: `${path} => ${link}` });
  }
}

/**
 * Checks one external link for breakage and marks it as visited.
 *
 * Nothing is checked when `path` contains `learn/getting-tokens`. The page is
 * closed on every exit path, including that early return and a failure inside
 * the check, so no browser page is left open.
 *
 * @param url     External link to check.
 * @param page    Browser page used to open the link; closed before returning.
 * @param path    Trail of links leading to this one.
 * @param browser Unused here; kept so the signature stays unchanged for callers.
 */
async function checkExternalLinks(url: string, page: Page, path: string, browser: Browser) {
  try {
    if (path.includes('learn/getting-tokens')) return;
    await isLinkBroken(page, url, path);
    visitedLinks.add(url);
  } finally {
    await page.close();
  }
}

async function isLinkBroken(page: Page, url: string, path: string) {
if (visitedLinks.has(url)) return false;
if ((url.includes('localhost') && !url.includes(':3000')) || url.includes('.tar.gz')) return false;

let pageResponse: Response;
try {
pageResponse = await page.goto(url, {waitUntil: 'load', timeout: 15000});
pageResponse = await page.goto(url, { waitUntil: 'load', timeout: 45000 });
} catch (error: any) {
if (error.message.includes('Timeout')) {
pageResponse = await retryPageLoadIfTimeout(page, url);
}
pageResponse = await retryPageLoadIfTimeout(page, url, path);
} finally {
if (!pageResponse || [404, 403].includes(pageResponse.status())) {
console.warn(`Broken link detected: ${path} (Status ${pageResponse ? pageResponse.status() : 'page not opened'})`);
Expand All @@ -78,8 +103,9 @@ async function isLinkBroken(page: Page, url: string, path: string) {

/**
 * Makes one more attempt to navigate `page` to `url` after a failed load.
 *
 * @param page Browser page to navigate.
 * @param url  Address to load.
 * @param path Trail of links leading to `url`; used only in the log message.
 * @returns Whatever `page.goto` resolves with, or `undefined` when the retry
 *          fails as well.
 */
async function retryPageLoadIfTimeout(page: Page, url: string, path: string) {
  try {
    console.warn(`Retrying page load for ${path}`);
    return await page.goto(url, { waitUntil: 'load', timeout: 45000 });
  } catch {
    // A second failure is reported to the caller as "no response".
    return undefined;
  }
}
Expand All @@ -89,13 +115,15 @@ async function getLinksFromPage(page: Page, path: string) {
const linksOnPage = await page.$$eval(node, links =>
links.map(link => link.href)
);
return linksOnPage.filter(href => !visitedLinks.has(href)).filter(href => {
if (isInternal(href)) return !href.includes('#');
return true;
});
return linksOnPage
.filter(href => !visitedLinks.has(href))
.filter(href => {
if (isInternal(href)) return !href.includes('#');
return true;
});
}

// Script entry point: run the crawl, then exit explicitly so the process does
// not linger. A rejection from main() is logged and turned into a non-zero
// exit code instead of surfacing as an unhandled promise rejection.
main()
  .then(() => {
    console.log('Finished checking links');
    process.exit(0);
  })
  .catch((error: unknown) => {
    console.error(`Link check failed: ${error}`);
    process.exit(1);
  });