From 117376267e19cd8cb15686c67300172efe53613d Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Mon, 5 Dec 2016 17:12:16 -0500 Subject: [PATCH 1/2] feat: custom detectors for matching extractors for publishing platforms --- src/extractors/detect-by-html.js | 10 ++++++++++ src/extractors/detect-by-html.test.js | 24 ++++++++++++++++++++++++ src/extractors/get-extractor.js | 6 ++++-- src/extractors/get-extractor.test.js | 13 ++++++++++++- src/mercury.js | 6 +++--- 5 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 src/extractors/detect-by-html.js create mode 100644 src/extractors/detect-by-html.test.js diff --git a/src/extractors/detect-by-html.js b/src/extractors/detect-by-html.js new file mode 100644 index 000000000..1ea5bc0da --- /dev/null +++ b/src/extractors/detect-by-html.js @@ -0,0 +1,10 @@ +const Detectors = { + 'meta[name="al:ios:app_name"][value="Medium"]': 'medium.com', + 'meta[name="generator"][value="blogger"]': 'blogspot.com', +}; + +export default function detectByHtml($) { + const selector = Reflect.ownKeys(Detectors).find(s => $(s).length > 0); + + return Detectors[selector]; +} diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js new file mode 100644 index 000000000..93841c0f5 --- /dev/null +++ b/src/extractors/detect-by-html.test.js @@ -0,0 +1,24 @@ +import assert from 'assert'; +import cheerio from 'cheerio'; + +import detectByHtml from './detect-by-html'; + +describe('detectByHtml', () => { + it('detects a medium post from the html', () => { + const html = + '<head><meta name="al:ios:app_name" value="Medium" /></head>'; + + const $ = cheerio.load(html); + + assert.equal(detectByHtml($), 'medium.com'); + }); + + it('returns nothing if no match is found', () => { + const html = + '
'; + + const $ = cheerio.load(html); + + assert.equal(detectByHtml($), null); + }); +}); diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js index 4b01297f6..4d5c12ce5 100644 --- a/src/extractors/get-extractor.js +++ b/src/extractors/get-extractor.js @@ -2,11 +2,13 @@ import URL from 'url'; import Extractors from './all'; import GenericExtractor from './generic'; +import detectByHtml from './detect-by-html'; -export default function getExtractor(url, parsedUrl) { +export default function getExtractor(url, parsedUrl, $) { parsedUrl = parsedUrl || URL.parse(url); const { hostname } = parsedUrl; const baseDomain = hostname.split('.').slice(-2).join('.'); - return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor; + return Extractors[hostname] || Extractors[baseDomain] || + Extractors[detectByHtml($)] || GenericExtractor; } diff --git a/src/extractors/get-extractor.test.js b/src/extractors/get-extractor.test.js index 412e83094..3e31e0dd3 100644 --- a/src/extractors/get-extractor.test.js +++ b/src/extractors/get-extractor.test.js @@ -1,10 +1,11 @@ import assert from 'assert'; +import cheerio from 'cheerio'; import getExtractor from './get-extractor'; describe('getExtractor(url)', () => { it('returns GenericExtractor if no custom extractor is found', () => { - const extractor = getExtractor('http://example.com'); + const extractor = getExtractor('http://example.com', null, cheerio.load('
')); assert.equal(extractor.domain, '*'); }); @@ -26,4 +27,14 @@ describe('getExtractor(url)', () => { assert.equal(extractor.domain, 'wikipedia.org'); }); + + it('returns a custom extractor based on detectors', () => { + const html = + '<head><meta name="al:ios:app_name" value="Medium" /></head>'; + + const $ = cheerio.load(html); + const extractor = getExtractor('http://foo.com', null, $); + + assert.equal(extractor.domain, 'medium.com'); + }); }); diff --git a/src/mercury.js b/src/mercury.js index c79fafc72..c9e3bda42 100644 --- a/src/mercury.js +++ b/src/mercury.js @@ -31,11 +31,11 @@ const Mercury = { return Errors.badUrl; } - const Extractor = getExtractor(url, parsedUrl); - // console.log(`Using extractor for ${Extractor.domain}`); - const $ = await Resource.create(url, html, parsedUrl); + const Extractor = getExtractor(url, parsedUrl, $); + // console.log(`Using extractor for ${Extractor.domain}`); + // If we found an error creating the resource, return that error if ($.failed) { return $; From 0de9b391c18376ed459ef860a887862723718538 Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Mon, 5 Dec 2016 17:36:31 -0500 Subject: [PATCH 2/2] test fix --- src/extractors/detect-by-html.js | 9 +++++++-- src/extractors/detect-by-html.test.js | 2 +- src/extractors/get-extractor.js | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/extractors/detect-by-html.js b/src/extractors/detect-by-html.js index 1ea5bc0da..21fae75d7 100644 --- a/src/extractors/detect-by-html.js +++ b/src/extractors/detect-by-html.js @@ -1,6 +1,11 @@ +import { + MediumExtractor, + BloggerExtractor, +} from './custom/'; + const Detectors = { - 'meta[name="al:ios:app_name"][value="Medium"]': 'medium.com', - 'meta[name="generator"][value="blogger"]': 'blogspot.com', + 'meta[name="al:ios:app_name"][value="Medium"]': MediumExtractor, + 'meta[name="generator"][value="blogger"]': BloggerExtractor, }; export default function detectByHtml($) { diff --git a/src/extractors/detect-by-html.test.js b/src/extractors/detect-by-html.test.js index 
93841c0f5..ae5f28eff 100644 --- a/src/extractors/detect-by-html.test.js +++ b/src/extractors/detect-by-html.test.js @@ -10,7 +10,7 @@ describe('detectByHtml', () => { const $ = cheerio.load(html); - assert.equal(detectByHtml($), 'medium.com'); + assert.equal(detectByHtml($).domain, 'medium.com'); }); it('returns nothing if no match is found', () => { diff --git a/src/extractors/get-extractor.js b/src/extractors/get-extractor.js index 4d5c12ce5..b92be9e68 100644 --- a/src/extractors/get-extractor.js +++ b/src/extractors/get-extractor.js @@ -10,5 +10,5 @@ export default function getExtractor(url, parsedUrl, $) { const baseDomain = hostname.split('.').slice(-2).join('.'); return Extractors[hostname] || Extractors[baseDomain] || - Extractors[detectByHtml($)] || GenericExtractor; + detectByHtml($) || GenericExtractor; }