From a38c727a0aee058f640f03cd25f0930adda45a4c Mon Sep 17 00:00:00 2001 From: kik0220 <4346449+kik0220@users.noreply.github.com> Date: Wed, 24 Apr 2019 21:29:02 +0900 Subject: [PATCH] feat: add deadline.com custom parser (#383) * feat: add deadline.com custom parser * fix: timezone * fix: date_published selectors * fix: title and author selector * test: transform .embed-twitter * fix: regenerate the fixture and fix content selector --- fixtures/deadline.com/1556104756617.html | 2543 +++++++++++++++++ src/extractors/custom/deadline.com/index.js | 34 + .../custom/deadline.com/index.test.js | 115 + src/extractors/custom/index.js | 1 + 4 files changed, 2693 insertions(+) create mode 100644 fixtures/deadline.com/1556104756617.html create mode 100644 src/extractors/custom/deadline.com/index.js create mode 100644 src/extractors/custom/deadline.com/index.test.js diff --git a/fixtures/deadline.com/1556104756617.html b/fixtures/deadline.com/1556104756617.html new file mode 100644 index 000000000..99f9d6b2a --- /dev/null +++ b/fixtures/deadline.com/1556104756617.html @@ -0,0 +1,2543 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Donald Trump Advises Boeing, Tweeting “But What The Hell Do I Know?”; Twitter Answers – Deadline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Click to Skip Ad +
+
Closing in...
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+ + + +
+ +
+ +
+
+
+
+ +
+
+
+
+
+
+
+ + +
+ + +Skip to main content + +
+
+
+ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + + + + Got A Tip? + +
+ +
+
+
+ + + + + +
+
+ + + Follow Us: + + +
+
+
+
+ +
+ +
+
+ + + +
+
+
+ +
+ +
+
+ +
+ +
+ + + + +
+ + + + +
+ + + + +
+ +
+

+ Newswire

+ + +
+ + +
+ + + + + + + + +
+

Copyright © 2019 Penske Business Media, LLC. All Rights reserved.

+ + Powered by + WordPress.com VIP +
+ + + +
+ + + + + +
+
+
+

+ + + Must Read Stories +

+ +
+ +
+
+ + + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + diff --git a/src/extractors/custom/deadline.com/index.js b/src/extractors/custom/deadline.com/index.js new file mode 100644 index 000000000..3a1555f84 --- /dev/null +++ b/src/extractors/custom/deadline.com/index.js @@ -0,0 +1,34 @@ +export const DeadlineComExtractor = { + domain: 'deadline.com', + + title: { + selectors: ['h1'], + }, + + author: { + selectors: ['section.author h3'], + }, + + date_published: { + selectors: [['meta[name="article:published_time"]', 'value']], + }, + + dek: null, + + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']], + }, + + content: { + selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'], + + transforms: { + '.embed-twitter': $node => { + const innerHtml = $node.html(); + $node.replaceWith(innerHtml); + }, + }, + + clean: [], + }, +}; diff --git a/src/extractors/custom/deadline.com/index.test.js b/src/extractors/custom/deadline.com/index.test.js new file mode 100644 index 000000000..67220ce16 --- /dev/null +++ b/src/extractors/custom/deadline.com/index.test.js @@ -0,0 +1,115 @@ +import assert from 'assert'; +import URL from 'url'; +import cheerio from 'cheerio'; + +import Mercury from 'mercury'; +import getExtractor from 'extractors/get-extractor'; +import { excerptContent } from 'utils/text'; + +const fs = require('fs'); + +describe('DeadlineComExtractor', () => { + describe('initial test case', () => { + let result; + let url; + beforeAll(() => { + url = + 'https://deadline.com/2019/04/donald-trump-boeing-max-737-rebrand-advice-twitter-1202595880/'; + const html = fs.readFileSync( + './fixtures/deadline.com/1556104756617.html' + ); + result = Mercury.parse(url, { + html, + fallback: false, + }); + }); + + it('is selected properly', () => { + // This test should be passing by default. + // It sanity checks that the correct parser + // is being selected for URLs from this domain + const extractor = getExtractor(url); + assert.equal(extractor.domain, URL.parse(url).hostname); + }); + + it('returns the title', async () => { + // To pass this test, fill out the title selector + // in ./src/extractors/custom/deadline.com/index.js. + const { title } = await result; + + // Update these values with the expected values from + // the article. + assert.equal( + title.split('Twitter')[0], + `Donald Trump Advises Boeing To Rebrand Max 737, Tweeting “But What The Hell Do I Know?”; ` + ); + }); + + it('returns the author', async () => { + // To pass this test, fill out the author selector + // in ./src/extractors/custom/deadline.com/index.js. + const { author } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(author, 'Lisa de Moraes'); + }); + + it('returns the date_published', async () => { + // To pass this test, fill out the date_published selector + // in ./src/extractors/custom/deadline.com/index.js. + const { date_published } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(date_published, `2019-04-15T13:18:34.000Z`); + }); + + it('returns the dek', async () => { + // To pass this test, fill out the dek selector + // in ./src/extractors/custom/deadline.com/index.js. + const { dek } = await result; + + // Update these values with the expected values from + // the article. + assert.equal(dek, null); + }); + + it('returns the lead_image_url', async () => { + // To pass this test, fill out the lead_image_url selector + // in ./src/extractors/custom/deadline.com/index.js. + const { lead_image_url } = await result; + + // Update these values with the expected values from + // the article. + assert.equal( + lead_image_url, + `https://pmcdeadline2.files.wordpress.com/2019/01/donald-trump-2.jpg?w=1024` + ); + }); + + it('returns the content', async () => { + // To pass this test, fill out the content selector + // in ./src/extractors/custom/deadline.com/index.js. + // You may also want to make use of the clean and transform + // options. + const { content } = await result; + + const $ = cheerio.load(content || ''); + + const first13 = excerptContent( + $('*') + .first() + .text(), + 13 + ); + + // Update these values with the expected values from + // the article. + assert.equal( + first13, + 'Andrew Harnik/AP/Shutterstock Twitter erupted Monday morning when President Donald Trump shared his branding' + ); + }); + }); +}); diff --git a/src/extractors/custom/index.js b/src/extractors/custom/index.js index 7c7dad38d..eff05e806 100644 --- a/src/extractors/custom/index.js +++ b/src/extractors/custom/index.js @@ -114,3 +114,4 @@ export * from './takagi-hiromitsu.jp'; export * from './bookwalker.jp'; export * from './www.yomiuri.co.jp'; export * from './japan.cnet.com'; +export * from './deadline.com';