Skip to content

Commit

Permalink
Gentle on CSS server, nice on w3.org, faster on other origins
Browse files Browse the repository at this point in the history
This makes the sleep time depend on the origin being crawled:
- The CSS server breaks easily, so the code sleeps 2 seconds in between requests.
- The www.w3.org server has a limit of 180 requests per minute, so the code
sleeps 1 second in between requests.
- Other origins seem more amenable to faster crawls, so the code sleeps 100ms in
between requests.
  • Loading branch information
tidoust committed Jun 7, 2024
1 parent cf53cac commit 02a021e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 9 deletions.
11 changes: 10 additions & 1 deletion src/lib/specs-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,16 @@ async function crawlList(speclist, crawlOptions) {
return result;
}

const crawlQueue = new ThrottledQueue(4);
const crawlQueue = new ThrottledQueue({
maxParallel: 4,
sleepInterval: origin => {
switch (origin) {
case 'https://csswg.org': return 2000;
case 'https://www.w3.org': return 1000;
default: return 100;
}
}
});
const results = await Promise.all(list.map((spec, idx) => {
const versionToCrawl = crawlOptions.publishedVersion ?
(spec.release ? spec.release : spec.nightly) :
Expand Down
25 changes: 17 additions & 8 deletions src/lib/throttled-queue.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ function getOrigin(url) {
}
const origin = (new URL(url)).origin;
if (origin.endsWith('.whatwg.org')) {
return 'whatwg.org';
return 'https://whatwg.org';
}
else if (origin.endsWith('.github.io')) {
return 'github.io';
return 'https://github.io';
}
else if (origin.endsWith('.csswg.org') ||
origin.endsWith('.css-houdini.org') ||
origin.endsWith('.fxtf.org')) {
return 'csswg.org';
return 'https://csswg.org';
}
else {
return origin;
Expand All @@ -46,12 +46,16 @@ function getOrigin(url) {
module.exports = class ThrottledQueue {
originQueue = {};
maxParallel = 4;
sleepInterval = 2000;
ongoing = 0;
pending = [];

constructor(maxParallel) {
if (maxParallel >= 0) {
this.maxParallel = maxParallel;
constructor(options = { maxParallel: 4, sleepInterval: 2000 }) {
if (options.maxParallel >= 0) {
this.maxParallel = options.maxParallel;
}
if (options.sleepInterval) {
this.sleepInterval = options.sleepInterval;
}
}

Expand Down Expand Up @@ -99,7 +103,9 @@ module.exports = class ThrottledQueue {
* requested maximum.
*
* Additionally, the function forces a 2 second sleep after processing to
* keep a low network profile.
* keep a low network profile (the sleeping time can be adjusted per origin
* when the sleepInterval parameter that was passed to the constructor is a
* function).
*/
async runThrottledPerOrigin(url, processFunction, ...params) {
const origin = getOrigin(url);
Expand All @@ -110,7 +116,10 @@ module.exports = class ThrottledQueue {
this.originQueue[origin] = this.originQueue[origin]
.then(async _ => this.runThrottled(processFunction, ...params))
.then(async result => {
await sleep(2000);
const interval = (typeof this.sleepInterval === 'function') ?
this.sleepInterval(origin) :
this.sleepInterval;
await sleep(interval);
return result;
})
.then(resolve)
Expand Down

0 comments on commit 02a021e

Please sign in to comment.