Reduce crawler load on servers
The crawler has a hard time crawling all specs nowadays: more stringent
restrictions on servers lead to network timeouts and errors. See:
w3c/webref#1244

The goal of this update is to reduce the load that the crawler puts on
servers. Two changes:

1. The list of specs to crawl gets sorted to interleave origins (see the
sketch right after this list). This should avoid sending bursts of requests
to the same server at once. The notion of "origin" used in the code is loose:
it is meant to identify the server that serves the resource rather than the
actual origin.

2. Requests sent to a given origin are serialized, and sent at least 2
seconds after the previous request to that origin completed (a simplified
sketch of the pattern follows the log excerpt below). The crawler otherwise
still processes the list 4 specs at a time, provided the specs are to be
retrieved from different origins.
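
To make change 1 concrete, here is a minimal sketch of the idea. Illustration
only: `roundRobin` below is a simplified stand-in for the recursive
`interleave()` helper that the diff adds, the grouping mirrors what
`getOrigin()` computes, and the URLs are borrowed from the log excerpt
further down.

```
// Round-robin interleaving of per-origin spec lists (simplified sketch)
function roundRobin(...arrays) {
  const result = [];
  while (arrays.some(a => a.length > 0)) {
    for (const a of arrays) {
      if (a.length > 0) {
        result.push(a.shift());
      }
    }
  }
  return result;
}

// Specs grouped per loose origin
const specsByOrigin = {
  'https://datatracker.ietf.org': [
    'https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability',
    'https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis'
  ],
  'github.io': ['https://aomediacodec.github.io/afgs1-spec/'],
  'whatwg.org': ['https://compat.spec.whatwg.org/']
};

console.log(roundRobin(...Object.values(specsByOrigin)));
// The two datatracker.ietf.org drafts get separated by specs served from
// other origins:
// [ 'https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability',
//   'https://aomediacodec.github.io/afgs1-spec/',
//   'https://compat.spec.whatwg.org/',
//   'https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis' ]
```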

A consequence of change 1 is that specs are no longer processed in order, so
the logs make the crawler look a bit drunk, processing specs seemingly at
random, as in:

```
  1/610 - https://aomediacodec.github.io/afgs1-spec/ - crawling
  8/610 - https://compat.spec.whatwg.org/ - crawling
 12/610 - https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability - crawling
 13/610 - https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis - crawling
 12/610 - https://datatracker.ietf.org/doc/html/draft-davidben-http-client-hint-reliability - done
 16/610 - https://drafts.css-houdini.org/css-typed-om-2/ - crawling
 13/610 - https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis - done
 45/610 - https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-errata-20220621.html - crawling
https://compat.spec.whatwg.org/ [error] Multiple event handler named orientationchange, cannot associate reliably to an interface in Compatibility Standard
  8/610 - https://compat.spec.whatwg.org/ - done
 66/610 - https://registry.khronos.org/glTF/specs/2.0/glTF-2.0.html - crawling
https://aomediacodec.github.io/afgs1-spec/ [log] extract refs without rules
  1/610 - https://aomediacodec.github.io/afgs1-spec/ - done
```
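
For change 2, the serialization per origin boils down to the following
pattern. This is a simplified sketch of the logic that the diff adds to
`crawlSpec()`; the `throttledFetch` wrapper is hypothetical and stands in
for the call to `processSpecification`:

```
const originLocks = {};
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

async function throttledFetch(url) {
  // The real code uses a looser notion of origin, see getOrigin()
  const origin = new URL(url).origin;
  const lock = originLocks[origin] ??
    (originLocks[origin] = { locked: false, last: 0 });
  // Wait for our turn on this origin
  while (lock.locked) {
    await sleep(100);
  }
  lock.locked = true;
  // Leave at least 2 seconds between two requests to the same origin
  const elapsed = Date.now() - lock.last;
  if (elapsed < 2000) {
    await sleep(2000 - elapsed);
  }
  try {
    return await fetch(url);
  }
  finally {
    lock.last = Date.now();
    lock.locked = false;
  }
}
```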
tidoust committed Jun 5, 2024
1 parent 2be9f4c commit 8026699
Showing 2 changed files with 154 additions and 20 deletions.
6 changes: 6 additions & 0 deletions src/lib/mock-server.js
@@ -123,6 +123,12 @@ mockAgent
  .reply(200, '')
  .persist();

mockAgent
  .get("https://www.w3.org")
  .intercept({ method: "GET", path: "/StyleSheets/TR/2021/dark.css" })
  .reply(200, '')
  .persist();

mockAgent
  .get("https://www.w3.org")
  .intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
168 changes: 148 additions & 20 deletions src/lib/specs-crawler.js
@@ -31,6 +31,88 @@ const {

const {version: reffyVersion} = require('../../package.json');

/**
 * To be friendly with servers, requests get serialized per origin server,
 * and the code sleeps a bit between requests to a given origin server.
 * To achieve this, the code takes a lock on the origin it wants to send a
 * request to.
 */
const originLocks = {};


/**
 * Helper function to sleep for a specified number of milliseconds
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
}


/**
 * Helper function to interleave values of a list of arrays.
 *
 * For example:
 * interleave([0, 2, 4, 6, 8], [1, 3, 5]) returns [0, 1, 2, 3, 4, 5, 6, 8]
 * interleave([0, 3], [1, 4], [2, 5]) returns [0, 1, 2, 3, 4, 5]
 *
 * The function is used to sort the list of specs to crawl so as to distribute
 * origins throughout the list.
 *
 * Note the function happily modifies (and in practice empties) the arrays
 * it receives as arguments.
 */
function interleave(firstArray, ...furtherArrays) {
  if (firstArray?.length > 0) {
    // Return the concatenation of the first item in the first array,
    // and of the result of interleaving the remaining arrays, putting the
    // first array last in the list.
    const firstItem = firstArray.shift();
    return [firstItem, ...interleave(...furtherArrays, firstArray)];
  }
  else {
    // First array is empty, let's proceed with the remaining arrays
    // until there is nothing left to process.
    if (furtherArrays.length > 0) {
      return interleave(...furtherArrays);
    }
    else {
      return [];
    }
  }
}


/**
 * Helper function that returns the "origin" of a URL, defined in a loose way
 * as the part of the true origin that identifies the server that's going to
 * serve the resource.
 *
 * For example "github.io" for all specs under github.io, "whatwg.org" for
 * all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
 * and FXTF specs since they are served by the same server).
 */
function getOrigin(url) {
  if (!url) {
    return '';
  }
  const origin = (new URL(url)).origin;
  if (origin.endsWith('.whatwg.org')) {
    return 'whatwg.org';
  }
  else if (origin.endsWith('.github.io')) {
    return 'github.io';
  }
  else if (origin.endsWith('.csswg.org') ||
           origin.endsWith('.css-houdini.org') ||
           origin.endsWith('.fxtf.org')) {
    return 'csswg.org';
  }
  else {
    return origin;
  }
}
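// Illustrative calls (hypothetical examples, derived from the rules above):
//   getOrigin('https://drafts.css-houdini.org/css-typed-om-2/')
//     returns 'csswg.org'
//   getOrigin('https://aomediacodec.github.io/afgs1-spec/')
//     returns 'github.io'
//   getOrigin('https://datatracker.ietf.org/doc/html/draft-ietf-httpbis-rfc6265bis')
//     returns 'https://datatracker.ietf.org'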


/**
 * Return the spec if crawl succeeded or crawl result from given fallback list
 * if crawl yielded an error (and fallback does exist).
@@ -95,24 +177,51 @@ async function crawlSpec(spec, crawlOptions) {
    result = {};
  }
  else {
    result = await processSpecification(
      urlToCrawl,
      (spec, modules) => {
        const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
          window.reffy.mapIdsToHeadings() : null;
        const res = {
          crawled: window.location.toString()
        };
        modules.forEach(mod => {
          res[mod.property] = window.reffy[mod.name](spec, idToHeading);
        });
        return res;
      },
      [spec, crawlOptions.modules],
      { quiet: crawlOptions.quiet,
        forceLocalFetch: crawlOptions.forceLocalFetch,
        ...cacheInfo}
    );
    // To be friendly with servers, requests are serialized per origin
    // and spaced at least a couple of seconds apart.
    const origin = getOrigin(urlToCrawl.url);
    let originLock = originLocks[origin];
    if (!originLock) {
      originLock = {
        locked: false,
        last: 0
      };
      originLocks[origin] = originLock;
    }
    // Wait for the "lock" on the origin. Once we can take it, sleep as
    // needed to only send a request after enough time has elapsed.
    while (originLock.locked) {
      await sleep(100);
    }
    originLock.locked = true;
    const now = Date.now();
    if (now - originLock.last < 2000) {
      await sleep(2000 - (now - originLock.last));
    }
    try {
      result = await processSpecification(
        urlToCrawl,
        (spec, modules) => {
          const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
            window.reffy.mapIdsToHeadings() : null;
          const res = {
            crawled: window.location.toString()
          };
          modules.forEach(mod => {
            res[mod.property] = window.reffy[mod.name](spec, idToHeading);
          });
          return res;
        },
        [spec, crawlOptions.modules],
        { quiet: crawlOptions.quiet,
          forceLocalFetch: crawlOptions.forceLocalFetch,
          ...cacheInfo}
      );
    }
    finally {
      originLock.last = Date.now();
      originLock.locked = false;
    }
    if (result.status === "notmodified" && fallback) {
      crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
      const copy = Object.assign({}, fallback);
@@ -343,14 +452,33 @@ async function crawlList(speclist, crawlOptions) {
    return { spec, readyToCrawl, resolve, reject };
  });

  // While we want results to be returned following the initial order of the
  // specs, to avoid sending too many requests at once to the same origin,
  // we'll sort specs so that origins get interleaved.
  // Note: there may be specs without URL (ISO specs)
  const specsByOrigin = {};
  for (const spec of list) {
    const toCrawl = crawlOptions.publishedVersion ?
      (spec.release ?? spec.nightly) :
      spec.nightly;
    const origin = getOrigin(toCrawl?.url);
    if (!specsByOrigin[origin]) {
      specsByOrigin[origin] = [];
    }
    specsByOrigin[origin].push(spec);
  }
  const spreadList = interleave(...Object.values(specsByOrigin));

  // In debug mode, specs are processed one by one. In normal mode,
  // specs are processed in chunks.
  const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);

  let pos = 0;
  function flagNextSpecAsReadyToCrawl() {
    if (pos < listAndPromise.length) {
      listAndPromise[pos].resolve();
    if (pos < spreadList.length) {
      const spec = spreadList[pos];
      const specAndPromise = listAndPromise.find(sp => sp.spec === spec);
      specAndPromise.resolve();
      pos += 1;
    }
  }
