Skip to content

Commit

Permalink
fix: incorrect parsing on medium.com (#477)
Browse files Browse the repository at this point in the history
* fix: medium extractor now pulls content

* fix: remove youtube caption if no preview available

* fix: remove youtube node if no image

* fix: removing dek from medium.com extractor
  • Loading branch information
mtashley authored Aug 28, 2019
1 parent 2bed238 commit 2422e47
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 24 deletions.
2 changes: 1 addition & 1 deletion fixtures/medium.com/1477523363921.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fixtures/medium.com/1485902752952.html

Large diffs are not rendered by default.

34 changes: 18 additions & 16 deletions src/extractors/custom/medium.com/index.js
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
export const MediumExtractor = {
domain: 'medium.com',

supportedDomains: ['trackchanges.postlight.com'],

title: {
selectors: ['h1'],
selectors: ['h1', ['meta[name="og:title"]', 'value']],
},

author: {
selectors: [['meta[name="author"]', 'value']],
},

content: {
selectors: [
['.section-content'],
'.section-content',
'article > div > section',
],
selectors: ['article'],

// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
Expand All @@ -25,14 +19,18 @@ export const MediumExtractor = {
// Transform for iframe nodes: turn embed.ly-proxied YouTube previews into
// direct YouTube embeds, and drop the enclosing figure when no YouTube
// thumbnail can be recognised.
iframe: $node => {
// Matches an embed.ly image URL whose `url=` parameter points at a
// YouTube thumbnail (i.ytimg.com/vi/<id>/); the capture group is the video id.
// NOTE(review): the dots in `i.embed.ly` are unescaped, so they match any
// character — confirm whether that looseness is intended.
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
// The thumbnail URL is read from `data-thumbnail` and URL-decoded before matching.
const thumb = decodeURIComponent($node.attr('data-thumbnail'));
// Enclosing <figure> ancestor(s) of the iframe, used by both branches below.
const $parent = $node.parents('figure');

if (ytRe.test(thumb)) {
// Only the captured video id is needed; the full match (`_`) is unused,
// hence the lint suppression.
const [_, youtubeId] = thumb.match(ytRe); // eslint-disable-line
// Point the iframe straight at the YouTube embed player.
$node.attr('src', `https://www.youtube.com/embed/${youtubeId}`);
// NOTE(review): this re-declares `$parent` in block scope with the same
// lookup as the outer declaration above — it looks like the pre-change side
// of this diff rendering; confirm which declaration is meant to remain.
const $parent = $node.parents('figure');
const $caption = $parent.find('figcaption');
// Strip the figure down to just the iframe and its caption.
$parent.empty().append([$node, $caption]);
return;
}

// If we can't draw the YouTube preview, remove the figure.
$parent.remove();
},

// rewrite figures to pull out image and caption, remove rest
Expand All @@ -42,29 +40,33 @@ export const MediumExtractor = {

const $img = $node.find('img').slice(-1)[0];
const $caption = $node.find('figcaption');

$node.empty().append([$img, $caption]);
},

// Remove any smaller images that did not get caught by the generic image
// cleaner (author photo 48px, leading sentence images 79px, etc.).
img: $node => {
const width = parseInt($node.attr('width'), 10);
if (width < 100) $node.remove();
},
},

// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [],
clean: ['span', 'svg'],
},

date_published: {
selectors: [['time[datetime]', 'datetime']],
selectors: [['meta[name="article:published_time"]', 'value']],
},

lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']],
},

dek: {
selectors: [
// enter selectors
],
},
dek: null,

next_page_url: {
selectors: [
Expand Down
9 changes: 3 additions & 6 deletions src/extractors/custom/medium.com/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ describe('MediumExtractor', () => {
it('returns the date_published', async () => {
const { date_published } = await result;

assert.equal(date_published, '2016-10-19T14:24:20.323Z');
assert.equal(date_published, '2016-10-19T14:30:56.529Z');
});

it('returns the dek', async () => {
Expand All @@ -62,7 +62,7 @@ describe('MediumExtractor', () => {
// the article.
assert.equal(
lead_image_url,
'https://cdn-images-1.medium.com/max/1200/1*3Gzaug9mRc8vvx1cuQWkog.png'
'https://miro.medium.com/max/540/1*3Gzaug9mRc8vvx1cuQWkog.png'
);
});

Expand All @@ -78,12 +78,9 @@ describe('MediumExtractor', () => {
13
);

// testing that youtube video transform is working
assert.equal(/IAoy3ia2ivI/.test(content), true);

assert.equal(
first13,
'Video of WTF? My talk at the White House Frontiers ConferenceLast Thursday, I'
'Last Thursday, I had the honor to be one of the warmup acts'
);
});
});
Expand Down

0 comments on commit 2422e47

Please sign in to comment.