feat: add parseDOM option (#7)
zeke authored Jul 26, 2020
1 parent 2c9fb43 commit 18fc27a
Showing 3 changed files with 35 additions and 4 deletions.
5 changes: 3 additions & 2 deletions index.js
@@ -7,6 +7,7 @@ module.exports = function domwaiter (pages, opts = {}) {
   const emitter = new EventEmitter()
 
   const defaults = {
+    parseDOM: true,
     json: false,
     maxConcurrent: 5,
     minTime: 500
@@ -44,8 +45,8 @@ async function getPage (page, emitter, opts) {
   } else {
     try {
       const body = (await got(page.url)).body
-      const $ = cheerio.load(body)
-      const pageCopy = Object.assign({}, page, { body, $ })
+      const pageCopy = Object.assign({}, page, { body })
+      if (opts.parseDOM) pageCopy.$ = cheerio.load(body)
       emitter.emit('page', pageCopy)
     } catch (err) {
       emitter.emit('error', err)
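
The change above only skips the cheerio step; `page.body` is still emitted when `parseDOM` is `false`, so consumers can parse on demand. A minimal sketch of that pattern (assuming the package is installed as `domwaiter`; the URL and selector are illustrative):

const domwaiter = require('domwaiter')
const cheerio = require('cheerio')

const pages = [{ url: 'https://example.com/foo' }]

domwaiter(pages, { parseDOM: false })
  .on('page', (page) => {
    // With parseDOM disabled, page.$ is undefined and page.body is the raw HTML.
    // Parse only the pages that actually need it.
    const $ = cheerio.load(page.body)
    console.log($('title').text())
  })
  .on('error', (err) => console.error(err))
  .on('done', () => console.log('done'))
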
5 changes: 3 additions & 2 deletions readme.md
@@ -10,7 +10,7 @@ Do you have a large collection of URLs you want to scrape? Scraping one page at
 - Event-emitting API to keep a low memory footprint
 - Supports fetching JSON too (instead of HTML DOM)
 - Rate limiting powered by [bottleneck](https://ghub.io/bottleneck)
-- DOM parsing powered by [cheerio](https://ghub.io/cheerio)
+- DOM parsing powered by [cheerio](https://ghub.io/cheerio) (optional; can be disabled)
 - HTTP requests powered by [got](https://ghub.io/got)
 
 ## Installation
@@ -50,7 +50,8 @@ This module exports a single function `domwaiter`:
 
 - `pages` Array (required) - Each item in the array must have a `url` property with a fully-qualified HTTP(S) URL. These objects can optionally have other properties, which will be included in the emitted `page` events. See below.
 - `opts` Object (optional)
-  - `json` Boolean - Set to `true` if you're fetching JSON instead of HTML. If `true`, a `json` property will be present on each emitted `page` object (and the `$` and `body` properties will NOT be present).
+  - `parseDOM` Boolean - Defaults to `true`. Set to `false` if you don't need the parsed `page.$` DOM object. Disabling DOM parsing will boost performance.
+  - `json` Boolean - Defaults to `false`. Set to `true` if you're fetching JSON instead of HTML. If `true`, a `json` property will be present on each emitted `page` object (and the `$` and `body` properties will NOT be present).
   - `maxConcurrent` Number - How many jobs can be executing at the same time. Defaults to `5`. This option is passed to the underlying [bottleneck](https://ghub.io/bottleneck#docs) instance.
   - `minTime` Number - How long to wait after launching a job before launching another one. Defaults to `500` (milliseconds). This option is passed to the underlying [bottleneck](https://ghub.io/bottleneck#docs) instance.
 
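
Taken together, a hedged usage sketch of these options (the package name `domwaiter`, the URLs, and the extra `section` property are assumed for illustration):

const domwaiter = require('domwaiter')

const pages = [
  { url: 'https://example.com/a', section: 'news' },
  { url: 'https://example.com/b', section: 'blog' }
]

domwaiter(pages, { parseDOM: true, maxConcurrent: 2, minTime: 250 })
  .on('page', (page) => {
    // Extra properties such as `section` are passed through on each page event,
    // and page.$ is a cheerio instance because parseDOM defaults to true.
    console.log(page.section, page.$('title').text())
  })
  .on('error', (err) => console.error(err))
  .on('done', () => console.log('all pages fetched'))
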
29 changes: 29 additions & 0 deletions test.js
@@ -92,6 +92,35 @@ describe('domwaiter', () => {
     })
   })
 
+  test('allows `parseDOM` option to skip cheerio parsing', (done) => {
+    const mock = nock('https://example.com')
+      .get('/foo')
+      .reply(200, '<html><title>Hello, foo</title></html>')
+
+    const pages = [
+      { url: 'https://example.com/foo' }
+    ]
+
+    const waiter = domwaiter(pages, { minTime: 10, parseDOM: false })
+    const results = []
+
+    waiter
+      .on('page', (page) => {
+        results.push(page)
+      })
+      .on('done', () => {
+        expect(mock.isDone()).toBe(true)
+        expect(results.length).toBe(1)
+        expect(results[0].body).toContain('Hello, foo')
+        expect(results[0].$).toBe(undefined)
+        done()
+      })
+      .on('error', (err) => {
+        console.error('domwaiter error')
+        console.error(err)
+      })
+  })
+
   test('supports json responses', (done) => {
     const mock = nock('https://example.com')
       .get('/foo')
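
For completeness, a companion test sketch (not part of this commit) that would pin down the default behavior, assuming the same Jest and nock setup used above:

test('parses the DOM by default', (done) => {
  const mock = nock('https://example.com')
    .get('/bar')
    .reply(200, '<html><title>Hello, bar</title></html>')

  const pages = [{ url: 'https://example.com/bar' }]
  const waiter = domwaiter(pages, { minTime: 10 })
  const results = []

  waiter
    .on('page', (page) => results.push(page))
    .on('done', () => {
      expect(mock.isDone()).toBe(true)
      expect(results[0].$('title').text()).toBe('Hello, bar')
      done()
    })
    .on('error', done)
})
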
