Skip to content

Commit

Permalink
Page-reuse concurrency + Browser Repair + Screencaster Cleanup Improv…
Browse files Browse the repository at this point in the history
…ements (#157)

* new window: use cdp instead of window.open

* new window tweaks: add reuseCount, use browser.target() instead of opening a new blank page

* rename NewWindowPage -> ReuseWindowConcurrency, move to windowconcur.js
potential fix for #156

* browser repair:
- when using window-concurrency, attempt to repair / relaunch browser if cdp errors occur
- mark pages as failed and don't reuse if page error or cdp errors occur
- screencaster: clear previous targets if screencasting when repairing browser

* bump version to 0.7.0-beta.3
  • Loading branch information
ikreymer authored Aug 19, 2022
1 parent 827c153 commit 6cc38bf
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 88 deletions.
18 changes: 17 additions & 1 deletion crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ class Crawler {

} catch (e) {
console.warn(e);
await this.markPageFailed(page);
}
}

Expand Down Expand Up @@ -519,6 +520,10 @@ class Crawler {

this.screencaster = this.initScreenCaster();

if (this.cluster.browser.setScreencaster) {
this.cluster.browser.setScreencaster(this.screencaster);
}

for (let i = 0; i < this.params.scopedSeeds.length; i++) {
const seed = this.params.scopedSeeds[i];
if (!await this.queueUrl(i, seed.url, 0, 0)) {
Expand Down Expand Up @@ -700,6 +705,9 @@ class Crawler {
ignoreAbort = shouldIgnoreAbort(req);
});

// more serious page error, mark page session as invalid
page.on("error", () => this.markPageFailed(page));

const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";

try {
Expand All @@ -711,7 +719,7 @@ class Crawler {
} catch (e) {
let msg = e.message || "";
if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
this.statusLog(`ERROR: ${url}: ${msg}`);
this.statusLog(`Load Error: ${url}: ${msg}`);
this.errorCount++;
}
}
Expand Down Expand Up @@ -739,6 +747,14 @@ class Crawler {
}
}

async markPageFailed(page) {
page.__failed = true;
this.errorCount++;
if (this.screencaster) {
await this.screencaster.endTarget(page.target());
}
}

async netIdle(page) {
if (!this.params.netIdleWait) {
return;
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "0.7.0-beta.2",
"version": "0.7.0-beta.3",
"main": "browsertrix-crawler",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
"author": "Ilya Kreymer <[email protected]>, Webrecorder Software",
Expand All @@ -17,7 +17,7 @@
"minio": "7.0.26",
"node-fetch": "^2.6.1",
"puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
"puppeteer-core": "16.1.0",
"puppeteer-core": "^16.1.1",
"request": "^2.88.2",
"sitemapper": "^3.1.2",
"uuid": "8.3.2",
Expand Down
6 changes: 3 additions & 3 deletions util/argParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const { Cluster } = require("puppeteer-cluster");
const yargs = require("yargs/yargs");
const { hideBin } = require("yargs/helpers");

const { NewWindowPage} = require("./screencaster");
const { ReuseWindowConcurrency } = require("./windowconcur");
const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
const { ScopedSeed } = require("./seeds");
const { interpolateFilename } = require("./storage");
Expand Down Expand Up @@ -374,7 +374,7 @@ class ArgParser {
argv.newContext = Cluster.CONCURRENCY_PAGE;
if (argv.screencastPort && argv.workers > 1) {
console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
argv.newContext = NewWindowPage;
argv.newContext = ReuseWindowConcurrency;
}
break;

Expand All @@ -387,7 +387,7 @@ class ArgParser {
break;

case "window":
argv.newContext = NewWindowPage;
argv.newContext = ReuseWindowConcurrency;
break;

default:
Expand Down
100 changes: 22 additions & 78 deletions util/screencaster.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@ const path = require("path");

const { initRedis } = require("./redis");


const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;

const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});


Expand Down Expand Up @@ -231,27 +228,36 @@ class ScreenCaster
}
}

async endTarget(target) {
const id = target._targetId;
const cdp = this.targets.get(id);
if (!cdp) {
return;
async endAllTargets() {
const targetIds = this.targets.keys();

for (const key of targetIds) {
await this.endTargetById(key);
}
}

await this.stopCast(cdp);
async endTarget(target) {
await this.endTargetById(target._targetId);
}

async endTargetById(id) {
this.caches.delete(id);
this.urls.delete(id);

const cdp = this.targets.get(id);

if (cdp) {
try {
await this.stopCast(cdp);
await cdp.detach();
} catch (e) {
// already detached
}
}

await this.transport.sendAll({msg: "close", id});

this.targets.delete(id);

try {
await cdp.detach();
} catch (e) {
// already detached
}
}

async startCast(cdp) {
Expand Down Expand Up @@ -298,66 +304,4 @@ class ScreenCaster
}
}


// ===========================================================================
// Concurrency implementation built on puppeteer-cluster's SingleBrowserImplementation:
// each new page is opened as a window via window.open() from one long-lived
// "main" page, and finished pages are kept in a pool for reuse.
class NewWindowPage extends SingleBrowserImplementation {
// Runs the parent init, opens the main page on about:blank and starts listening
// for targets opened by it.
async init() {
await super.init();

// targets opened from the main page that have not been handed out yet
this.newTargets = [];

// arm the first promise / resolver pair before any target can be created
this.nextPromise();

this.mainPage = await this.browser.newPage();

// pool of pages released by freeResources() and available for reuse
this.pages = [];
this.reuse = true;

await this.mainPage.goto("about:blank");

this.mainTarget = this.mainPage.target();

this.browser.on("targetcreated", (target) => {
// only react to targets whose opener is the main page
if (this._nextTarget && target.opener() === this.mainTarget) {
this.newTargets.push(target);
// resolve the promise captured by a pending getNewPage(), then re-arm
this._nextTarget();
this.nextPromise();
}
});
}

// Creates a fresh promise and stores its resolver in this._nextTarget.
nextPromise() {
this._nextPromise = new Promise((resolve) => this._nextTarget = resolve);
}

// Opens a new window from the main page and returns {page} for the first
// queued target once the targetcreated listener has signalled.
async getNewPage() {
// capture the promise before window.open, since the listener replaces it
const p = this._nextPromise;

await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");

await p;

// NOTE(review): takes the oldest queued target; with concurrent callers this
// is not necessarily the window this call opened - confirm this is acceptable
const target = this.newTargets.shift();

return {page: await target.page() };
}

// Returns a pooled page when one is available, otherwise opens a new window.
async createResources() {
if (this.pages.length) {
return {page: this.pages.shift()};
}
return await this.getNewPage();
}

// Returns the page to the pool when reuse is enabled, otherwise closes it.
async freeResources(resources) {
if (this.reuse) {
this.pages.push(resources.page);
} else {
await resources.page.close();
}
}
}



module.exports = { ScreenCaster, NewWindowPage, WSTransport, RedisPubSubTransport };
module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };
115 changes: 115 additions & 0 deletions util/windowconcur.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;


// ===========================================================================
/**
 * Concurrency implementation for puppeteer-cluster in which every worker gets
 * its own window of one shared browser, and each window is reused for several
 * jobs before being closed.
 *
 * New windows are created through a browser-level CDP session
 * (`Target.createTarget` with `newWindow: true`) and matched to puppeteer
 * targets via the `targetcreated` event. If the CDP call fails, the browser is
 * repaired (closed and re-initialized) and the request is retried.
 *
 * NOTE(review): `openInstances`, `repairing`, `repairRequested` and
 * `waitingForRepairResolvers` are not defined here; presumably they are
 * provided by SingleBrowserImplementation - confirm against puppeteer-cluster.
 */
class ReuseWindowConcurrency extends SingleBrowserImplementation {
  /**
   * Initialize (or re-initialize, when called from repair()) the per-browser
   * state: pending target map, page pool, CDP session and target listener.
   */
  async init() {
    await super.init();

    // targets created for our start page, keyed by target id, not yet handed out
    this.pendingTargets = new Map();
    // random marker url so only windows opened by this instance are picked up
    this.startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);

    // pool of {page, count, id} resources available for reuse
    this.pages = [];
    // a window is closed once its use count exceeds this value
    this.reuseCount = 25;

    // init() is run again by repair(): keep a screencaster that was registered
    // earlier instead of dropping it, so later repairs can still clean it up
    this.screencaster = this.screencaster || null;

    const mainTarget = this.browser.target();

    this.cdp = await mainTarget.createCDPSession();
    // identifies the current browser instance; pooled pages carrying a
    // different id are treated as stale in createResources()
    this.sessionId = this.cdp.id();

    this.browser.on("targetcreated", (target) => {
      if (target.url() === this.startPage) {
        this.pendingTargets.set(target._targetId, target);
      }
    });
  }

  /**
   * Register the screencaster whose targets are cleared when the browser is
   * repaired.
   *
   * @param {object} screencaster - object exposing `endAllTargets()`.
   */
  setScreencaster(screencaster) {
    this.screencaster = screencaster;
  }

  /**
   * Close and re-initialize the browser. If a repair is already running, or
   * pages are still open, wait until a repair has finished instead.
   *
   * @returns {Promise<void>}
   */
  async repair() {
    if (this.openInstances !== 0 || this.repairing) {
      // already repairing or there are still pages open? wait for start/finish
      await new Promise(resolve => this.waitingForRepairResolvers.push(resolve));
      return;
    }

    this.repairing = true;
    console.debug("Starting repair");

    if (this.screencaster) {
      // intentionally not awaited so a stuck browser cannot block the repair,
      // but a rejection must not surface as an unhandled promise rejection
      this.screencaster.endAllTargets().catch((e) => console.warn(e));
    }

    try {
      // will probably fail, but just in case the repair was not necessary
      await this.browser.close();
    } catch (e) {
      console.debug("Unable to close browser.");
    }

    try {
      await this.init();
    } catch (err) {
      console.debug("Unable to restart chrome.");
    }
    this.repairRequested = false;
    this.repairing = false;
    this.waitingForRepairResolvers.forEach(resolve => resolve());
    this.waitingForRepairResolvers = [];
  }

  /**
   * Open a new browser window and return it as a reusable resource.
   *
   * @returns {Promise<{page: object, count: number, id: string}>} the new
   *   page, its use counter (0) and the id of the CDP session that created it.
   */
  async getNewPage() {
    while (true) {
      let targetId;
      try {
        const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
        targetId = res.targetId;
      } catch (e) {
        console.warn(e);
        await this.repair();
        // no target was created; retry against the repaired browser
        continue;
      }

      const target = this.pendingTargets.get(targetId);
      // this shouldn't really happen, but just in case somehow ended up w/o a target, try again
      if (!target) {
        continue;
      }

      this.pendingTargets.delete(targetId);

      return {page: await target.page(), count: 0, id: this.sessionId};
    }
  }

  /**
   * Hand out a pooled page if one from the current browser session is
   * available, otherwise open a new window.
   *
   * @returns {Promise<{page: object, count: number, id: string}>}
   */
  async createResources() {
    if (this.pages.length) {
      const res = this.pages.shift();
      if (res.id === this.sessionId) {
        return res;
      }
      // page is using stale session (eg. from crashed/previous browser instance), don't attempt to reuse
    }
    return await this.getNewPage();
  }

  /**
   * Release a page after a job: close it if it was marked as failed or has
   * been reused more than `reuseCount` times, otherwise return it to the pool.
   *
   * @param {{page: object, count: number, id: string}} resources
   * @returns {Promise<void>}
   */
  async freeResources(resources) {
    // a failed page must be closed exactly once and never re-pooled
    // (previously it fell through and was pushed back or closed twice)
    if (resources.page.__failed || ++resources.count > this.reuseCount) {
      await resources.page.close();
      return;
    }

    this.pages.push(resources);
  }
}

module.exports = { ReuseWindowConcurrency };


8 changes: 4 additions & 4 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4246,10 +4246,10 @@ punycode@^2.1.0, punycode@^2.1.1:
dependencies:
debug "^4.1.1"

puppeteer-core@^16.1.0:
version "16.1.0"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
puppeteer-core@^16.1.1:
version "16.1.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
dependencies:
cross-fetch "3.1.5"
debug "4.3.4"
Expand Down

0 comments on commit 6cc38bf

Please sign in to comment.