From f13bb721f6a3cfb9eb30929fe798e3c0c5f6f374 Mon Sep 17 00:00:00 2001 From: Janet Date: Tue, 14 Mar 2017 18:34:40 -0400 Subject: [PATCH] feat: prospect magazine parser (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: prospect magazine parser Couldn’t find a way to parse the date but I think it’s good otherwise. * fix: pulls date * fix: add timezone * fix: generalize --- .../1488476298434.html | 938 ++++++++++++++++++ src/extractors/custom/index.js | 1 + .../www.prospectmagazine.co.uk/index.js | 54 + .../www.prospectmagazine.co.uk/index.test.js | 97 ++ 4 files changed, 1090 insertions(+) create mode 100644 fixtures/www.prospectmagazine.co.uk/1488476298434.html create mode 100644 src/extractors/custom/www.prospectmagazine.co.uk/index.js create mode 100644 src/extractors/custom/www.prospectmagazine.co.uk/index.test.js diff --git a/fixtures/www.prospectmagazine.co.uk/1488476298434.html b/fixtures/www.prospectmagazine.co.uk/1488476298434.html new file mode 100644 index 000000000..ed72698b1 --- /dev/null +++ b/fixtures/www.prospectmagazine.co.uk/1488476298434.html @@ -0,0 +1,938 @@ + + + + + + + + + + + +The Anglosphere: new enthusiasm for an old dream | Prospect Magazine + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+ + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ +
+ + + +
+ +
+ + + + + +
+ + + + + + + + + + + + + + + + + + + +
+ +
+ + + +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js index fd9a7b856..1395dad40 100644 --- a/src/extractors/custom/index.js +++ b/src/extractors/custom/index.js @@ -82,6 +82,7 @@ export * from './fortune.com'; export * from './www.linkedin.com'; export * from './obamawhitehouse.archives.gov'; export * from './www.opposingviews.com'; +export * from './www.prospectmagazine.co.uk'; export * from './forward.com'; export * from './www.qdaily.com'; export * from './gothamist.com'; diff --git a/src/extractors/custom/www.prospectmagazine.co.uk/index.js b/src/extractors/custom/www.prospectmagazine.co.uk/index.js new file mode 100644 index 000000000..a320e5742 --- /dev/null +++ b/src/extractors/custom/www.prospectmagazine.co.uk/index.js @@ -0,0 +1,54 @@ +export const WwwProspectmagazineCoUkExtractor = { + domain: 'www.prospectmagazine.co.uk', + + title: { + selectors: [ + '.page-title', + ], + }, + + author: { + selectors: [ + '.aside_author .title', + ], + }, + + date_published: { + selectors: [ + '.post-info', + ], + + timezone: 'Europe/London', + }, + + dek: { + selectors: [ + '.page-subtitle', + ], + }, + + lead_image_url: { + selectors: [ + ['meta[name="og:image"]', 'value'], + ], + }, + + content: { + selectors: [ + // ['article.type-post div.post_content p'], + 'article .post_content', + ], + + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: { + }, + + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: [ + + ], + }, +}; diff --git a/src/extractors/custom/www.prospectmagazine.co.uk/index.test.js b/src/extractors/custom/www.prospectmagazine.co.uk/index.test.js new file mode 100644 index 000000000..97244b099 --- /dev/null +++ b/src/extractors/custom/www.prospectmagazine.co.uk/index.test.js @@ -0,0 +1,97 @@ +import assert from 'assert'; +import fs from 'fs'; +import URL from 'url'; +import cheerio from 'cheerio'; + +import Mercury from 'mercury'; +import getExtractor from 'extractors/get-extractor'; +import { excerptContent } from 'utils/text'; + +describe('WwwProspectmagazineCoUkExtractor', () => { + describe('initial test case', () => { + let result; + let url; + beforeAll(() => { + url = + 'http://www.prospectmagazine.co.uk/magazine/anglosphere-old-dream-brexit-role-in-the-world'; + const html = + fs.readFileSync('./fixtures/www.prospectmagazine.co.uk/1488476298434.html'); + result = + Mercury.parse(url, html, { fallback: false }); + }); + + it('is selected properly', () => { + // This test should be passing by default. + // It sanity checks that the correct parser + // is being selected for URLs from this domain + const extractor = getExtractor(url); + assert.equal(extractor.domain, URL.parse(url).hostname); + }); + + it('returns the title', async () => { + // To pass this test, fill out the title selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + const { title } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(title, 'The Anglosphere: new enthusiasm for an old dream'); + }); + + it('returns the author', async () => { + // To pass this test, fill out the author selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + const { author } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(author, 'Duncan Bell'); + }); + + it('returns the date_published', async () => { + // To pass this test, fill out the date_published selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + const { date_published } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(date_published, '2017-01-19T17:00:00.000Z'); + }); + + it('returns the dek', async () => { + // To pass this test, fill out the dek selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + const { dek } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(dek, 'Having cut Britain adrift of Europe, Brexiters are indulging in an old fantasy about a new national role in the world—as the hub of a far-flung Anglosphere'); + }); + + it('returns the lead_image_url', async () => { + // To pass this test, fill out the lead_image_url selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + const { lead_image_url } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(lead_image_url, 'http://www.prospectmagazine.co.uk/wp-content/uploads/2017/01/64344_web.jpg'); + }); + + it('returns the content', async () => { + // To pass this test, fill out the content selector + // in ./src/extractors/custom/www.prospectmagazine.co.uk/index.js. + // You may also want to make use of the clean and transform + // options. + const { content } = await result; + + const $ = cheerio.load(content || ''); + + const first13 = excerptContent($('*').first().text(), 13); + + // Update these values with the expected values from + // the article. + assert.equal(first13, 'Leading Brexiteers proposed Britain should reinforce its relationship with “natural allies” Australia, Canada'); + }); + }); +});