diff --git a/README.md b/README.md index cc2521d..f2c2422 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,11 @@ You can use `unfluff` from node or right on the command line! This is what `unfluff` will try to grab from a web page: - `title` - The document's title (from the <title> tag) +- `softTitle` - A version of `title` with less truncation +- `date` - The document's publication date +- `copyright` - The document's copyright line, if present +- `author` - The document's author +- `publisher` - The document's publisher (website name) - `text` - The main text of the document with all the junk thrown away - `image` - The main image for the document (what's used by facebook, etc.) - `videos` - An array of videos that were embedded in the article. Each video has src, width and height. @@ -127,7 +132,14 @@ data = extractor(my_html_data, 'en'); ```json { - "title": "Shovel Knight review: rewrite history", + "title": "Shovel Knight review", + "softTitle": "Shovel Knight review: rewrite history", + "date": "2014-06-26T13:00:03Z", + "copyright": "2016 Vox Media Inc Designed in house", + "author": [ + "Griffin McElroy" + ], + "publisher": "Polygon", "text": "Shovel Knight is inspired by the past in all the right ways — but it's far from stuck in it. [.. snip ..]", "image": "http://cdn2.vox-cdn.com/uploads/chorus_image/image/34834129/jellyfish_hero.0_cinema_1280.0.png", "tags": [], @@ -159,6 +171,11 @@ data = extractor.lazy(my_html_data, 'en'); // Access whichever data elements you need directly. console.log(data.title()); +console.log(data.softTitle()); +console.log(data.date()); +console.log(data.copyright()); +console.log(data.author()); +console.log(data.publisher()); console.log(data.text()); console.log(data.image()); console.log(data.tags()); diff --git a/lib/extractor.js b/lib/extractor.js index 132b61b..f51049e 100644 --- a/lib/extractor.js +++ b/lib/extractor.js @@ -1,34 +1,68 @@ // Generated by CoffeeScript 2.0.0-beta7 void function () { - var _, addSiblings, biggestTitleChunk, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, stopwords, updateNodeCount, updateScore; + var _, addSiblings, biggestTitleChunk, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore; _ = require('lodash'); stopwords = require('./stopwords'); formatter = require('./formatter'); module.exports = { - title: function (doc) { - var titleElement, titleText, usedDelimeter; - titleElement = doc("meta[property='og:title']"); - if (titleElement) - titleText = titleElement.attr('content'); - if (!titleText) { - titleElement = doc('title').first(); - titleText = titleElement.text(); + date: function (doc) { + var cache$, cache$1, cache$2, cache$3, cache$4, dateCandidates; + dateCandidates = doc("meta[property='article:published_time'], meta[itemprop*='datePublished'], meta[name='dcterms.modified'], meta[name='dcterms.date'], meta[name='DC.date.issued'], meta[name='dc.date.issued'], meta[name='dc.date.modified'], meta[name='dc.date.created'], meta[name='DC.date'], meta[name='DC.Date'], meta[name='dc.date'], meta[name='date'], time[itemprop*='pubDate'], time[itemprop*='pubdate'], span[itemprop*='datePublished'], span[property*='datePublished'], p[itemprop*='datePublished'], p[property*='datePublished'], div[itemprop*='datePublished'], div[property*='datePublished'], li[itemprop*='datePublished'], li[property*='datePublished'], time, span[class*='date'], p[class*='date'], div[class*='date']"); + return (null != dateCandidates && null != (cache$ = dateCandidates.first()) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0) || (null != dateCandidates && null != (cache$2 = dateCandidates.first()) && null != (cache$3 = cache$2.attr('datetime')) ? cache$3.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null; + }, + copyright: function (doc) { + var cache$, copyright, copyrightCandidates, text; + copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']"); + text = null != copyrightCandidates && null != (cache$ = copyrightCandidates.first()) ? cache$.text() : void 0; + if (!text) { + text = doc('body').text().replace(/\s*[\r\n]+\s*/g, '. '); + if (!(text.indexOf('\xa9') > 0)) + return null; } - if (!titleElement) - return null; - usedDelimeter = false; - _.each([ + copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, '$2').trim(); + return cleanText(copyright); + }, + author: function (doc) { + var authorCandidates, authorList, cache$, cache$1, cache$2, cache$3, cache$4, cache$5, fallbackAuthor; + authorCandidates = doc("meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']"); + authorList = []; + authorCandidates.each(function () { + var author, cache$, cache$1; + author = null != (cache$ = doc(this)) && null != (cache$1 = cache$.attr('content')) ? cache$1.trim() : void 0; + if (author) + return authorList.push(author); + }); + if (authorList.length === 0) { + fallbackAuthor = (null != (cache$ = doc("span[class*='author']").first()) ? cache$.text() : void 0) || (null != (cache$1 = doc("p[class*='author']").first()) ? cache$1.text() : void 0) || (null != (cache$2 = doc("div[class*='author']").first()) ? cache$2.text() : void 0) || (null != (cache$3 = doc("span[class*='byline']").first()) ? cache$3.text() : void 0) || (null != (cache$4 = doc("p[class*='byline']").first()) ? cache$4.text() : void 0) || (null != (cache$5 = doc("div[class*='byline']").first()) ? cache$5.text() : void 0); + if (fallbackAuthor) + authorList.push(cleanText(fallbackAuthor)); + } + return authorList; + }, + publisher: function (doc) { + var cache$, cache$1, publisherCandidates; + publisherCandidates = doc("meta[property='og:site_name'], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']"); + if (null != publisherCandidates && null != (cache$ = publisherCandidates.first()) && null != (cache$1 = cache$.attr('content'))) + return cache$1.trim(); + }, + title: function (doc) { + var titleText; + titleText = rawTitle(doc); + return cleanTitle(titleText, [ '|', ' - ', '\xbb', ':' - ], function (c) { - if (titleText.indexOf(c) >= 0 && !usedDelimeter) { - titleText = biggestTitleChunk(titleText, c); - return usedDelimeter = true; - } - }); - return titleText.replace(/�/g, '').trim(); + ]); + }, + softTitle: function (doc) { + var titleText; + titleText = rawTitle(doc); + return cleanTitle(titleText, [ + '|', + ' - ', + '\xbb' + ]); }, text: function (doc, topNode, lang) { if (topNode) { @@ -414,4 +448,37 @@ void function () { }); return node; }; + cleanText = function (text) { + return text.replace(/[\r\n\t]/g, ' ').replace(/\s\s+/g, ' ').replace(//g, '').replace(/�/g, '').trim(); + }; + cleanTitle = function (title, delimiters) { + var titleText, usedDelimeter; + titleText = title || ''; + usedDelimeter = false; + _.each(delimiters, function (c) { + if (titleText.indexOf(c) >= 0 && !usedDelimeter) { + titleText = biggestTitleChunk(titleText, c); + return usedDelimeter = true; + } + }); + return cleanText(titleText); + }; + rawTitle = function (doc) { + var cache$, cache$1, cache$2, cache$3, cache$4, cache$5, cache$6, cache$7, cache$8, cache$9, gotTitle, titleText; + gotTitle = false; + titleText = ''; + _.each([ + null != (cache$ = doc("meta[property='og:title']")) && null != (cache$1 = cache$.first()) ? cache$1.attr('content') : void 0, + null != (cache$2 = doc("h1[class*='title']")) && null != (cache$3 = cache$2.first()) ? cache$3.text() : void 0, + null != (cache$4 = doc('title')) && null != (cache$5 = cache$4.first()) ? cache$5.text() : void 0, + null != (cache$6 = doc('h1')) && null != (cache$7 = cache$6.first()) ? cache$7.text() : void 0, + null != (cache$8 = doc('h2')) && null != (cache$9 = cache$8.first()) ? cache$9.text() : void 0 + ], function (candidate) { + if (candidate && candidate.trim() && !gotTitle) { + titleText = candidate.trim(); + return gotTitle = true; + } + }); + return titleText; + }; }.call(this); diff --git a/lib/unfluff.js b/lib/unfluff.js index a724ad5..81ad385 100644 --- a/lib/unfluff.js +++ b/lib/unfluff.js @@ -10,6 +10,11 @@ void function () { lng = language || extractor.lang(doc); pageData = { title: extractor.title(doc), + softTitle: extractor.softTitle(doc), + date: extractor.date(doc), + author: extractor.author(doc), + publisher: extractor.publisher(doc), + copyright: extractor.copyright(doc), favicon: extractor.favicon(doc), description: extractor.description(doc), keywords: extractor.keywords(doc), @@ -31,6 +36,31 @@ void function () { doc = getParsedDoc.call(this, html); return null != this.title_ ? this.title_ : this.title_ = extractor.title(doc); }, + softTitle: function () { + var doc; + doc = getParsedDoc.call(this, html); + return null != this.softTitle_ ? this.softTitle_ : this.softTitle_ = extractor.softTitle(doc); + }, + date: function () { + var doc; + doc = getParsedDoc.call(this, html); + return null != this.date_ ? this.date_ : this.date_ = extractor.date(doc); + }, + copyright: function () { + var doc; + doc = getParsedDoc.call(this, html); + return null != this.copyright_ ? this.copyright_ : this.copyright_ = extractor.copyright(doc); + }, + author: function () { + var doc; + doc = getParsedDoc.call(this, html); + return null != this.author_ ? this.author_ : this.author_ = extractor.author(doc); + }, + publisher: function () { + var doc; + doc = getParsedDoc.call(this, html); + return null != this.publisher_ ? this.publisher_ : this.publisher_ = extractor.publisher(doc); + }, favicon: function () { var doc; doc = getParsedDoc.call(this, html); diff --git a/src/extractor.coffee b/src/extractor.coffee index 6dac1a6..b166559 100644 --- a/src/extractor.coffee +++ b/src/extractor.coffee @@ -3,24 +3,91 @@ stopwords = require("./stopwords") formatter = require("./formatter") module.exports = - # Grab the title of an html doc (excluding junk) - title: (doc) -> - titleElement = doc("meta[property='og:title']") - titleText = titleElement.attr("content") if titleElement + # Grab the date of an html doc + date: (doc) -> + dateCandidates = doc("meta[property='article:published_time'], \ + meta[itemprop*='datePublished'], meta[name='dcterms.modified'], \ + meta[name='dcterms.date'], \ + meta[name='DC.date.issued'], meta[name='dc.date.issued'], \ + meta[name='dc.date.modified'], meta[name='dc.date.created'], \ + meta[name='DC.date'], \ + meta[name='DC.Date'], \ + meta[name='dc.date'], \ + meta[name='date'], \ + time[itemprop*='pubDate'], \ + time[itemprop*='pubdate'], \ + span[itemprop*='datePublished'], \ + span[property*='datePublished'], \ + p[itemprop*='datePublished'], \ + p[property*='datePublished'], \ + div[itemprop*='datePublished'], \ + div[property*='datePublished'], \ + li[itemprop*='datePublished'], \ + li[property*='datePublished'], \ + time, \ + span[class*='date'], \ + p[class*='date'], \ + div[class*='date']") + dateCandidates?.first()?.attr("content")?.trim() || dateCandidates?.first()?.attr("datetime")?.trim() || cleanText(dateCandidates?.first()?.text()) || null + + + # Grab the copyright line + copyright: (doc) -> + copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], \ + p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']") + text = copyrightCandidates?.first()?.text() + if !text + # try to find the copyright in the text + text = doc("body").text().replace(/\s*[\r\n]+\s*/g, ". ") + return null unless text.indexOf("©") > 0 + copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, "$2").trim() + cleanText(copyright) + + + # Grab the author of an html doc + author: (doc) -> + authorCandidates = doc("meta[property='article:author'], \ + meta[property='og:article:author'], meta[name='author'], \ + meta[name='dcterms.creator'], \ + meta[name='DC.creator'], \ + meta[name='DC.Creator'], \ + meta[name='dc.creator'], \ + meta[name='creator']") + authorList = [] + authorCandidates.each () -> + author = doc(this)?.attr("content")?.trim() + if author + authorList.push(author) + # fallback to a named author div + if authorList.length == 0 + fallbackAuthor = doc("span[class*='author']").first()?.text() || doc("p[class*='author']").first()?.text() || doc("div[class*='author']").first()?.text() || \ + doc("span[class*='byline']").first()?.text() || doc("p[class*='byline']").first()?.text() || doc("div[class*='byline']").first()?.text() + if fallbackAuthor + authorList.push(cleanText(fallbackAuthor)) + + authorList + + + # Grab the publisher of the page/site + publisher: (doc) -> + publisherCandidates = doc("meta[property='og:site_name'], \ + meta[name='dc.publisher'], \ + meta[name='DC.publisher'], \ + meta[name='DC.Publisher']") + publisherCandidates?.first()?.attr("content")?.trim() - if !titleText - titleElement = doc("title").first() - titleText = titleElement.text() - return null unless titleElement + # Grab the title of an html doc (excluding junk) + # Hard-truncates titles containing colon or spaced dash + title: (doc) -> + titleText = rawTitle(doc) + return cleanTitle(titleText, ["|", " - ", "»", ":"]) - usedDelimeter = false - _.each ["|", " - ", "»", ":"], (c) -> - if titleText.indexOf(c) >= 0 && !usedDelimeter - titleText = biggestTitleChunk(titleText, c) - usedDelimeter = true + # Grab the title with soft truncation + softTitle: (doc) -> + titleText = rawTitle(doc) + return cleanTitle(titleText, ["|", " - ", "»"]) - titleText.replace(/�/g, "").trim() # Grab the 'main' text chunk text: (doc, topNode, lang) -> @@ -434,3 +501,33 @@ postCleanup = (doc, targetNode, lang) -> doc(e).remove() return node + + +cleanText = (text) -> + return text.replace(/[\r\n\t]/g, " ").replace(/\s\s+/g, " ").replace(//g, "").replace(/�/g, "").trim() + + +cleanTitle = (title, delimiters) -> + titleText = title || "" + usedDelimeter = false + _.each delimiters, (c) -> + if titleText.indexOf(c) >= 0 && !usedDelimeter + titleText = biggestTitleChunk(titleText, c) + usedDelimeter = true + return cleanText(titleText) + + +rawTitle = (doc) -> + gotTitle = false + titleText = "" + # The first h1 or h2 is a useful fallback + _.each [doc("meta[property='og:title']")?.first()?.attr("content"), \ + doc("h1[class*='title']")?.first()?.text(), \ + doc("title")?.first()?.text(), \ + doc("h1")?.first()?.text(), \ + doc("h2")?.first()?.text()], (candidate) -> + if candidate && candidate.trim() && !gotTitle + titleText = candidate.trim() + gotTitle = true + + return titleText \ No newline at end of file diff --git a/src/unfluff.coffee b/src/unfluff.coffee index 9d5a20e..52e7da0 100644 --- a/src/unfluff.coffee +++ b/src/unfluff.coffee @@ -8,6 +8,11 @@ module.exports = unfluff = (html, language) -> pageData = title: extractor.title(doc) + softTitle: extractor.softTitle(doc) + date: extractor.date(doc) + author: extractor.author(doc) + publisher: extractor.publisher(doc) + copyright: extractor.copyright(doc) favicon: extractor.favicon(doc) description: extractor.description(doc) keywords: extractor.keywords(doc) @@ -34,6 +39,26 @@ unfluff.lazy = (html, language) -> doc = getParsedDoc.call(this, html) @title_ ?= extractor.title(doc) + softTitle: () -> + doc = getParsedDoc.call(this, html) + @softTitle_ ?= extractor.softTitle(doc) + + date: () -> + doc = getParsedDoc.call(this, html) + @date_ ?= extractor.date(doc) + + copyright: () -> + doc = getParsedDoc.call(this, html) + @copyright_ ?= extractor.copyright(doc) + + author: () -> + doc = getParsedDoc.call(this, html) + @author_ ?= extractor.author(doc) + + publisher: () -> + doc = getParsedDoc.call(this, html) + @publisher_ ?= extractor.publisher(doc) + favicon: () -> doc = getParsedDoc.call(this, html) @favicon_ ?= extractor.favicon(doc) diff --git a/test/extractor.coffee b/test/extractor.coffee index 51cc86b..dded7c0 100644 --- a/test/extractor.coffee +++ b/test/extractor.coffee @@ -20,6 +20,11 @@ suite 'Extractor', -> title = extractor.title(doc) eq title, "This is my page" + test 'returns a soft title chunk without truncation', -> + doc = cheerio.load("University Budgets: Where Your Fees Go | Top Universities") + title = extractor.softTitle(doc) + eq title, "University Budgets: Where Your Fees Go" + test 'prefers the meta tag title', -> doc = cheerio.load("This is my page - mysite") title = extractor.title(doc) @@ -49,3 +54,59 @@ suite 'Extractor', -> doc = cheerio.load("") favicon = extractor.favicon(doc) eq undefined, favicon + + test 'returns the article published meta date', -> + doc = cheerio.load("") + date = extractor.date(doc) + eq date, "2014-10-15T00:01:03+00:00" + + test 'returns the article dublin core meta date', -> + doc = cheerio.load("") + date = extractor.date(doc) + eq date, "2014-10-15T00:01:03+00:00" + + test 'returns the date in the