Skip to content

Commit

Permalink
fix: incorrect parsing on theatlantic.com (#475)
Browse files Browse the repository at this point in the history
* fix: incorrect parsing on theatlantic.com

* chore: updating theatlantic.com tests & fixtures

* chore: removing script data from minified fixture
  • Loading branch information
mtashley authored Aug 20, 2019
1 parent 5e33263 commit 0686ee7
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 12 deletions.
2 changes: 1 addition & 1 deletion fixtures/www.theatlantic.com/1474321707642.html

Large diffs are not rendered by default.

30 changes: 21 additions & 9 deletions src/extractors/custom/www.theatlantic.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,15 @@
export const TheAtlanticExtractor = {
domain: 'www.theatlantic.com',
title: {
selectors: ['h1.hed'],
selectors: ['h1', '.c-article-header__hed'],
},

author: {
selectors: ['article#article .article-cover-extra .metadata .byline a'],
selectors: [['meta[name="author"]', 'value'], '.c-byline__author'],
},

content: {
selectors: [
['.article-cover figure.lead-img', '.article-body'],
'.article-body',
],
selectors: ['article', '.article-body'],

// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
Expand All @@ -23,14 +20,29 @@ export const TheAtlanticExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['.partner-box', '.callout'],
clean: [
'.partner-box',
'.callout',
'.c-article-writer__image',
'.c-article-writer__content',
'.c-letters-cta__text',
'.c-footer__logo',
'.c-recirculation-link',
'.twitter-tweet',
],
},

dek: {
selectors: [['meta[name="description"]', 'value']],
},

date_published: {
selectors: [['time[itemProp="datePublished"]', 'datetime']],
selectors: [['time[itemprop="datePublished"]', 'datetime']],
},

lead_image_url: null,
lead_image_url: {
selectors: [['img[itemprop="url"]', 'src']],
},

next_page_url: null,

Expand Down
13 changes: 11 additions & 2 deletions src/extractors/custom/www.theatlantic.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ describe('AtlanticExtractor', () => {
// selectors in ./src/extractors/custom/www.theatlantic.com/index.js. This test is just
// a stub; you can add more fields to test as much of
// your parser as possible.
const { content, title, author } = await result;
const { content, title, author, dek, lead_image_url } = await result;

const $ = cheerio.load(content);
const text = $('*')
.first()
Expand All @@ -48,7 +49,15 @@ describe('AtlanticExtractor', () => {
'Why New Yorkers Received a Push Alert About a Manhunt'
);
assert.equal(author, 'Kaveh Waddell');
assert.equal(text, 'New York police offi');
assert.equal(text, 'The city has never b');
assert.equal(
dek,
'The city has never before used the emergency system the way it did Monday morning.'
);
assert.equal(
lead_image_url,
'https://cdn.theatlantic.com/assets/media/img/mt/2016/09/RTSO9RP/lead_720_405.jpg?mod=1533691849'
);
});
});
});

0 comments on commit 0686ee7

Please sign in to comment.