From 2af0f6179a584c3ee334e0fb8f102ca0ef2b2598 Mon Sep 17 00:00:00 2001
From: Janet
Date: Tue, 7 Feb 2017 14:53:05 -0500
Subject: [PATCH] feat: rawstory parser (#109)

* feat: rawstory parser

Finished, with a little help from Frankie (thanks Frankie!)

* fix: date_published timezone
---
 fixtures/www.rawstory.com/1482439337481.html |  1 +
 src/extractors/custom/index.js               |  1 +
 .../custom/www.rawstory.com/index.js         | 47 ++++++++++
 .../custom/www.rawstory.com/index.test.js    | 87 +++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 fixtures/www.rawstory.com/1482439337481.html
 create mode 100644 src/extractors/custom/www.rawstory.com/index.js
 create mode 100644 src/extractors/custom/www.rawstory.com/index.test.js

diff --git a/fixtures/www.rawstory.com/1482439337481.html b/fixtures/www.rawstory.com/1482439337481.html
new file mode 100644
index 000000000..a8d81e664
--- /dev/null
+++ b/fixtures/www.rawstory.com/1482439337481.html
@@ -0,0 +1 @@
+ These 6 celebrities are refusing to play Trump’s inauguration as team scrambles to secure talent

Styles

Structure

{text}

{title}

{text}

{title}

{text}

{title}

{text}

{title}

Values

Scripts

Google+
\ No newline at end of file
diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js
index fc25fed21..e907fe40e 100644
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -55,6 +55,7 @@ export * from './uproxx.com';
 export * from './www.eonline.com';
 export * from './www.miamiherald.com';
 export * from './www.refinery29.com';
+export * from './www.rawstory.com';
 export * from './www.cnet.com';
 export * from './www.cinemablend.com';
 export * from './www.today.com';
diff --git a/src/extractors/custom/www.rawstory.com/index.js b/src/extractors/custom/www.rawstory.com/index.js
new file mode 100644
index 000000000..0b8ea3dfd
--- /dev/null
+++ b/src/extractors/custom/www.rawstory.com/index.js
@@ -0,0 +1,47 @@
+// Mercury custom extractor: CSS selectors for www.rawstory.com articles.
+export const WwwRawstoryComExtractor = {
+  domain: 'www.rawstory.com',
+
+  title: {
+    selectors: [
+      '.blog-title',
+    ],
+  },
+
+  author: {
+    selectors: [
+      '.blog-author a:first-of-type',
+    ],
+  },
+
+  date_published: {
+    selectors: [
+      '.blog-author a:last-of-type',
+    ],
+
+    // NOTE(review): assumes the byline time is US Eastern local time. The
+    // IANA zone follows EST/EDT, whereas the fixed-offset 'EST' zone would
+    // skew dates published during daylight-saving months by one hour.
+    timezone: 'America/New_York',
+  },
+
+  lead_image_url: {
+    selectors: [
+      ['meta[name="og:image"]', 'value'],
+    ],
+  },
+
+  content: {
+    selectors: [
+      '.blog-content',
+    ],
+
+    // No transforms are registered; the selected .blog-content markup is
+    // not rewritten by this extractor.
+    transforms: {},
+
+    // Selectors listed in `clean` are removed from the result; the list is
+    // empty, so this extractor strips nothing.
+    clean: [],
+  },
+};
diff --git a/src/extractors/custom/www.rawstory.com/index.test.js b/src/extractors/custom/www.rawstory.com/index.test.js
new file mode 100644
index 000000000..f67fa23dc
--- /dev/null
+++ b/src/extractors/custom/www.rawstory.com/index.test.js
@@ -0,0 +1,87 @@
+import assert from 'assert';
+import fs from 'fs';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+describe('WwwRawstoryComExtractor', () => {
+  describe('initial test case', () => {
+    let result;
+    let url;
+    beforeAll(() => {
+      url =
+        'http://www.rawstory.com/2016/12/these-6-celebrities-are-refusing-to-play-trumps-inauguration-as-team-scrambles-to-secure-talent/';
+      const html =
+        fs.readFileSync('./fixtures/www.rawstory.com/1482439337481.html');
+      result =
+        Mercury.parse(url, html, { fallback: false });
+    });
+
+    it('is selected properly', () => {
+      // This test should be passing by default.
+      // It sanity checks that the correct parser
+      // is being selected for URLs from this domain
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname);
+    });
+
+    it('returns the title', async () => {
+      // To pass this test, fill out the title selector
+      // in ./src/extractors/custom/www.rawstory.com/index.js.
+      const { title } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(title, 'These 6 celebrities are refusing to play Trump’s inauguration as team scrambles to secure talent');
+    });
+
+    it('returns the author', async () => {
+      // To pass this test, fill out the author selector
+      // in ./src/extractors/custom/www.rawstory.com/index.js.
+      const { author } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(author, 'Tana Ganeva');
+    });
+
+    it('returns the date_published', async () => {
+      // To pass this test, fill out the date_published selector
+      // in ./src/extractors/custom/www.rawstory.com/index.js.
+      const { date_published } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(date_published, '2016-12-22T19:27:00.000Z');
+    });
+
+    it('returns the lead_image_url', async () => {
+      // To pass this test, fill out the lead_image_url selector
+      // in ./src/extractors/custom/www.rawstory.com/index.js.
+      const { lead_image_url } = await result;
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(lead_image_url, 'http://www.rawstory.com/wp-content/uploads/2016/12/Donald-Trump11-800x430.jpg');
+    });
+
+    it('returns the content', async () => {
+      // To pass this test, fill out the content selector
+      // in ./src/extractors/custom/www.rawstory.com/index.js.
+      // You may also want to make use of the clean and transform
+      // options.
+      const { content } = await result;
+
+      const $ = cheerio.load(content || '');
+
+      const first13 = excerptContent($('*').first().text(), 13);
+
+      // Update these values with the expected values from
+      // the article.
+      assert.equal(first13, 'As Inauguration Day approaches, Donald Trump’s team is reportedly having such a hard');
+    });
+  });
+});