From 6cc38bf5119ace83f5cc32c2e9d3cfd7bbef4abf Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Fri, 19 Aug 2022 09:23:40 -0700
Subject: [PATCH] Page-reuse concurrency + Browser Repair + Screencaster Cleanup
 Improvements (#157)

* new window: use cdp instead of window.open

* new window tweaks: add reuseCount, use browser.target() instead of opening a new blank page

* rename NewWindowPage -> ReuseWindowConcurrency, move to windowconcur.js
potential fix for #156

* browser repair:
- when using window-concurrency, attempt to repair / relaunch browser if cdp errors occur
- mark pages as failed and don't reuse if page error or cdp errors occur
- screencaster: clear previous targets if screencasting when repairing browser

* bump version to 0.7.0-beta.3
---
 crawler.js           |  18 +++++-
 package.json         |   4 +-
 util/argParser.js    |   6 +-
 util/screencaster.js | 100 +++++++-------------------------
 util/windowconcur.js | 135 +++++++++++++++++++++++++++++++++++++++++++
 yarn.lock            |   8 +--
 6 files changed, 183 insertions(+), 88 deletions(-)
 create mode 100644 util/windowconcur.js

diff --git a/crawler.js b/crawler.js
index 8f98ffe60..5338f8416 100644
--- a/crawler.js
+++ b/crawler.js
@@ -389,6 +389,7 @@ class Crawler {
 
     } catch (e) {
       console.warn(e);
+      await this.markPageFailed(page);
     }
   }
 
@@ -519,6 +520,10 @@ class Crawler {
 
     this.screencaster = this.initScreenCaster();
 
+    if (this.cluster.browser.setScreencaster) {
+      this.cluster.browser.setScreencaster(this.screencaster);
+    }
+
     for (let i = 0; i < this.params.scopedSeeds.length; i++) {
       const seed = this.params.scopedSeeds[i];
       if (!await this.queueUrl(i, seed.url, 0, 0)) {
@@ -700,6 +705,9 @@ class Crawler {
       ignoreAbort = shouldIgnoreAbort(req);
     });
 
+    // more serious page error, mark page session as invalid
+    page.on("error", () => this.markPageFailed(page));
+
     const gotoOpts = isHTMLPage ? this.gotoOpts : "domcontentloaded";
 
     try {
@@ -711,7 +719,7 @@ class Crawler {
     } catch (e) {
       let msg = e.message || "";
       if (!msg.startsWith("net::ERR_ABORTED") || !ignoreAbort) {
-        this.statusLog(`ERROR: ${url}: ${msg}`);
+        this.statusLog(`Load Error: ${url}: ${msg}`);
         this.errorCount++;
       }
     }
@@ -739,6 +747,14 @@ class Crawler {
     }
   }
 
+  async markPageFailed(page) {
+    page.__failed = true;
+    this.errorCount++;
+    if (this.screencaster) {
+      await this.screencaster.endTarget(page.target());
+    }
+  }
+
   async netIdle(page) {
     if (!this.params.netIdleWait) {
       return;
diff --git a/package.json b/package.json
index 37eafd580..9f7ea499f 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-crawler",
-  "version": "0.7.0-beta.2",
+  "version": "0.7.0-beta.3",
   "main": "browsertrix-crawler",
   "repository": "https://github.com/webrecorder/browsertrix-crawler",
   "author": "Ilya Kreymer , Webrecorder Software",
@@ -17,7 +17,7 @@
     "minio": "7.0.26",
     "node-fetch": "^2.6.1",
     "puppeteer-cluster": "github:ikreymer/puppeteer-cluster#async-job-queue",
-    "puppeteer-core": "16.1.0",
+    "puppeteer-core": "^16.1.1",
     "request": "^2.88.2",
     "sitemapper": "^3.1.2",
     "uuid": "8.3.2",
diff --git a/util/argParser.js b/util/argParser.js
index 5af8fa6cf..cf4d8a22e 100644
--- a/util/argParser.js
+++ b/util/argParser.js
@@ -8,7 +8,7 @@ const { Cluster } = require("puppeteer-cluster");
 const yargs = require("yargs/yargs");
 const { hideBin } = require("yargs/helpers");
 
-const { NewWindowPage} = require("./screencaster");
+const { ReuseWindowConcurrency } = require("./windowconcur");
 const { BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS } = require("./constants");
 const { ScopedSeed } = require("./seeds");
 const { interpolateFilename } = require("./storage");
@@ -374,7 +374,7 @@ class ArgParser {
       argv.newContext = Cluster.CONCURRENCY_PAGE;
       if (argv.screencastPort && argv.workers > 1) {
         console.log("Note: to support screencasting with >1 workers, newContext set to 'window' instead of 'page'");
-        argv.newContext = NewWindowPage;
+        argv.newContext = ReuseWindowConcurrency;
       }
       break;
 
@@ -387,7 +387,7 @@ class ArgParser {
       break;
 
     case "window":
-      argv.newContext = NewWindowPage;
+      argv.newContext = ReuseWindowConcurrency;
       break;
 
     default:
diff --git a/util/screencaster.js b/util/screencaster.js
index 4dfe5768c..cca5b7efe 100644
--- a/util/screencaster.js
+++ b/util/screencaster.js
@@ -6,9 +6,6 @@ const path = require("path");
 
 const { initRedis } = require("./redis");
 
-
-const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
-
 const indexHTML = fs.readFileSync(path.join(__dirname, "..", "html", "screencast.html"), {encoding: "utf8"});
 
 
@@ -231,27 +228,36 @@ class ScreenCaster
     }
   }
 
-  async endTarget(target) {
-    const id = target._targetId;
-    const cdp = this.targets.get(id);
-    if (!cdp) {
-      return;
+  async endAllTargets() {
+    const targetIds = this.targets.keys();
+
+    for (const key of targetIds) {
+      await this.endTargetById(key);
     }
+  }
 
-    await this.stopCast(cdp);
+  async endTarget(target) {
+    await this.endTargetById(target._targetId);
+  }
 
+  async endTargetById(id) {
     this.caches.delete(id);
     this.urls.delete(id);
 
+    const cdp = this.targets.get(id);
+
+    if (cdp) {
+      try {
+        await this.stopCast(cdp);
+        await cdp.detach();
+      } catch (e) {
+        // already detached
+      }
+    }
+
     await this.transport.sendAll({msg: "close", id});
 
     this.targets.delete(id);
-
-    try {
-      await cdp.detach();
-    } catch (e) {
-      // already detached
-    }
   }
 
   async startCast(cdp) {
@@ -298,66 +304,4 @@ class ScreenCaster
   }
 }
 
-
-// ===========================================================================
-class NewWindowPage extends SingleBrowserImplementation {
-  async init() {
-    await super.init();
-
-    this.newTargets = [];
-
-    this.nextPromise();
-
-    this.mainPage = await this.browser.newPage();
-
-    this.pages = [];
-    this.reuse = true;
-
-    await this.mainPage.goto("about:blank");
-
-    this.mainTarget = this.mainPage.target();
-
-    this.browser.on("targetcreated", (target) => {
-      if (this._nextTarget && target.opener() === this.mainTarget) {
-        this.newTargets.push(target);
-        this._nextTarget();
-        this.nextPromise();
-      }
-    });
-  }
-
-  nextPromise() {
-    this._nextPromise = new Promise((resolve) => this._nextTarget = resolve);
-  }
-
-  async getNewPage() {
-    const p = this._nextPromise;
-
-    await this.mainPage.evaluate("window.open('about:blank', '', 'resizable');");
-
-    await p;
-
-    const target = this.newTargets.shift();
-
-    return {page: await target.page() };
-  }
-
-  async createResources() {
-    if (this.pages.length) {
-      return {page: this.pages.shift()};
-    }
-    return await this.getNewPage();
-  }
-
-  async freeResources(resources) {
-    if (this.reuse) {
-      this.pages.push(resources.page);
-    } else {
-      await resources.page.close();
-    }
-  }
-}
-
-
-
-module.exports = { ScreenCaster, NewWindowPage, WSTransport, RedisPubSubTransport };
+module.exports = { ScreenCaster, WSTransport, RedisPubSubTransport };
diff --git a/util/windowconcur.js b/util/windowconcur.js
new file mode 100644
index 000000000..9db0c8952
--- /dev/null
+++ b/util/windowconcur.js
@@ -0,0 +1,135 @@
+const SingleBrowserImplementation = require("puppeteer-cluster/dist/concurrency/SingleBrowserImplementation").default;
+
+
+// ===========================================================================
+// Concurrency implementation that opens each page in its own browser window
+// (CDP Target.createTarget with newWindow) and keeps pages around for reuse.
+class ReuseWindowConcurrency extends SingleBrowserImplementation {
+  // set up state for the current browser instance; run again by repair()
+  async init() {
+    await super.init();
+
+    // targets opened on startPage that have not been handed out yet, by target id
+    this.pendingTargets = new Map();
+    // random suffix so only targets created by this instance match in "targetcreated"
+    this.startPage = "about:blank?_browsertrix" + Math.random().toString(36).slice(2);
+
+    // idle resources ({page, count, id}) waiting to be reused
+    this.pages = [];
+    // a page is closed instead of pooled once it has been freed more than this many times
+    this.reuseCount = 25;
+
+    // init() runs again on repair: keep a screencaster that was already set
+    this.screencaster = this.screencaster || null;
+
+    const mainTarget = this.browser.target();
+
+    this.cdp = await mainTarget.createCDPSession();
+    // stored on every resource so pages from a previous browser can be detected
+    this.sessionId = this.cdp.id();
+
+    this.browser.on("targetcreated", (target) => {
+      if (target.url() === this.startPage) {
+        this.pendingTargets.set(target._targetId, target);
+      }
+    });
+  }
+
+  // screencaster whose targets are ended when the browser is repaired
+  setScreencaster(screencaster) {
+    this.screencaster = screencaster;
+  }
+
+  // Close and re-init the browser. If pages are still open or a repair is
+  // already running, only wait until a repair has completed.
+  async repair() {
+    if (this.openInstances !== 0 || this.repairing) {
+      // already repairing or there are still pages open? wait for start/finish
+      await new Promise(resolve => this.waitingForRepairResolvers.push(resolve));
+      return;
+    }
+
+    this.repairing = true;
+    console.debug("Starting repair");
+
+    if (this.screencaster) {
+      try {
+        // await, so a failure is handled here instead of as an unhandled rejection
+        await this.screencaster.endAllTargets();
+      } catch (e) {
+        console.debug("Unable to end screencast targets.");
+      }
+    }
+
+    try {
+      // will probably fail, but just in case the repair was not necessary
+      await this.browser.close();
+    } catch (e) {
+      console.debug("Unable to close browser.");
+    }
+
+    try {
+      await this.init();
+    } catch (err) {
+      console.debug("Unable to restart chrome.");
+    }
+    this.repairRequested = false;
+    this.repairing = false;
+    this.waitingForRepairResolvers.forEach(resolve => resolve());
+    this.waitingForRepairResolvers = [];
+  }
+
+  // Open startPage in a new window and return it as a fresh resource,
+  // repairing the browser and retrying when the CDP call fails.
+  async getNewPage() {
+    while (true) {
+      let targetId;
+      try {
+        const res = await this.cdp.send("Target.createTarget", {url: this.startPage, newWindow: true});
+        targetId = res.targetId;
+      } catch (e) {
+        console.warn(e);
+        await this.repair();
+        // nothing was created, try again (repair() re-creates this.cdp via init())
+        continue;
+      }
+
+      const target = this.pendingTargets.get(targetId);
+      // this shouldn't really happen, but just in case somehow ended up w/o a target, try again
+      if (!target) {
+        continue;
+      }
+
+      this.pendingTargets.delete(targetId);
+
+      return {page: await target.page(), count: 0, id: this.sessionId};
+    }
+  }
+
+  // Reuse an idle page of the current browser session, or open a new one.
+  async createResources() {
+    while (this.pages.length) {
+      const res = this.pages.shift();
+      if (res.id === this.sessionId) {
+        return res;
+      }
+      // page is using stale session (eg. from crashed/previous browser instance), don't attempt to reuse
+    }
+    return await this.getNewPage();
+  }
+
+  // Pool the page for reuse, or close it when it failed or was used too often.
+  async freeResources(resources) {
+    // if marked as failed, don't try to reuse (and never pool a closed page)
+    if (resources.page.__failed || ++resources.count > this.reuseCount) {
+      await resources.page.close();
+      return;
+    }
+
+    this.pages.push(resources);
+  }
+}
+
+module.exports = { ReuseWindowConcurrency };
+
+
diff --git a/yarn.lock b/yarn.lock
index 25bde17cc..591a25eb2 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -4246,10 +4246,10 @@ punycode@^2.1.0, punycode@^2.1.1:
   dependencies:
     debug "^4.1.1"
 
-puppeteer-core@^16.1.0:
-  version "16.1.0"
-  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.0.tgz#0485312363e6e1d65889d4b31de677bd36f872e4"
-  integrity sha512-Eu9FCqdWU2PU/RY53sa+JTsbFiQg5fJyaHX5DP0WZ4+lVLVdMfR9dwPimRkSl9NEcArm7lZMpiDlVCYelE90ZA==
+puppeteer-core@^16.1.1:
+  version "16.1.1"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-16.1.1.tgz#2c26c560934a1c524a767c9ec0818520b7adb22a"
+  integrity sha512-ls+A6t+cbeNtsNIEyWkGoVJRHseEvBhS3NlI2DBFaJNBUG6kUfmAVyColu1ubgy4VuWLKpGUcwrPTVIvNd1Dew==
   dependencies:
     cross-fetch "3.1.5"
     debug "4.3.4"