Merge pull request #49 from philgooch/add-date-author-copyright

Adds author, date, copyright extraction
ageitgey · Feb 20, 2016 · 52cb470 · 52cb470
2 parents 0ead6cc + 0f9bb51
commit 52cb470
Show file tree

Hide file tree

Showing 6 changed files with 332 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -54,6 +54,11 @@ You can use `unfluff` from node or right on the command line!
 
 This is what `unfluff` will try to grab from a web page:
 - `title` - The document's title (from the &lt;title&gt; tag)
+- `softTitle` - A version of `title` with less truncation
+- `date` - The document's publication date
+- `copyright` - The document's copyright line, if present
+- `author` - The document's author
+- `publisher` - The document's publisher (website name)
 - `text` - The main text of the document with all the junk thrown away
 - `image` - The main image for the document (what's used by facebook, etc.)
 - `videos` - An array of videos that were embedded in the article. Each video has src, width and height.
@@ -127,7 +132,14 @@ data = extractor(my_html_data, 'en');
 
 ```json
 {
-  "title": "Shovel Knight review: rewrite history",
+  "title": "Shovel Knight review",
+  "softTitle": "Shovel Knight review: rewrite history",
+  "date": "2014-06-26T13:00:03Z",
+  "copyright": "2016 Vox Media Inc Designed in house",
+  "author": [
+    "Griffin McElroy"
+  ],
+  "publisher": "Polygon",
   "text": "Shovel Knight is inspired by the past in all the right ways — but it's far from stuck in it. [.. snip ..]",
   "image": "http://cdn2.vox-cdn.com/uploads/chorus_image/image/34834129/jellyfish_hero.0_cinema_1280.0.png",  
   "tags": [],
@@ -159,6 +171,11 @@ data = extractor.lazy(my_html_data, 'en');
 
 // Access whichever data elements you need directly.
 console.log(data.title());
+console.log(data.softTitle());
+console.log(data.date());
+console.log(data.copyright());
+console.log(data.author());
+console.log(data.publisher());
 console.log(data.text());
 console.log(data.image());
 console.log(data.tags());

diff --git a/lib/extractor.js b/lib/extractor.js
diff --git a/lib/unfluff.js b/lib/unfluff.js
diff --git a/src/extractor.coffee b/src/extractor.coffee
@@ -3,24 +3,91 @@ stopwords = require("./stopwords")
 formatter = require("./formatter")
 
 module.exports =
-  # Grab the title of an html doc (excluding junk)
-  title: (doc) ->
-    titleElement = doc("meta[property='og:title']")
-    titleText = titleElement.attr("content") if titleElement
+  # Grab the date of an html doc.
+  # Only the first element of the matched set is consulted. Its `content`
+  # attribute is tried first, then its `datetime` attribute, then its cleaned
+  # text. Returns null when none of those yields a non-empty value.
+  date: (doc) ->
+    dateCandidates = doc("meta[property='article:published_time'], \
+    meta[itemprop*='datePublished'], meta[name='dcterms.modified'], \
+    meta[name='dcterms.date'], \
+    meta[name='DC.date.issued'],  meta[name='dc.date.issued'], \
+    meta[name='dc.date.modified'], meta[name='dc.date.created'], \
+    meta[name='DC.date'], \
+    meta[name='DC.Date'], \
+    meta[name='dc.date'], \
+    meta[name='date'], \
+    time[itemprop*='pubDate'], \
+    time[itemprop*='pubdate'], \
+    span[itemprop*='datePublished'], \
+    span[property*='datePublished'], \
+    p[itemprop*='datePublished'], \
+    p[property*='datePublished'], \
+    div[itemprop*='datePublished'], \
+    div[property*='datePublished'], \
+    li[itemprop*='datePublished'], \
+    li[property*='datePublished'], \
+    time, \
+    span[class*='date'], \
+    p[class*='date'], \
+    div[class*='date']")
+    firstDateCandidate = dateCandidates?.first()
+    # Machine-readable attributes win over visible text
+    contentValue = firstDateCandidate?.attr("content")?.trim()
+    return contentValue if contentValue
+    datetimeValue = firstDateCandidate?.attr("datetime")?.trim()
+    return datetimeValue if datetimeValue
+    cleanText(firstDateCandidate?.text()) || null
+
+
+  # Grab the copyright line.
+  # Looks first at the first element whose class or id contains 'copyright'.
+  # If that yields no text, falls back to the whole <body> text (line breaks
+  # turned into ". ") and returns null when it contains no © symbol.
+  # The text after © (and an optional "copyright" word), up to the first
+  # , ; : . | or line break, is extracted and cleaned. Text the pattern does
+  # not match is returned as-is after cleaning.
+  copyright: (doc) ->
+    copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], \
+    p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']")
+    text = copyrightCandidates?.first()?.text()
+    if !text
+      # try to find the copyright in the text
+      text = doc("body").text().replace(/\s*[\r\n]+\s*/g, ". ")
+      # indexOf is 0 when the text starts with ©, so compare against -1, not 0
+      return null unless text.indexOf("©") >= 0
+    copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, "$2").trim()
+    cleanText(copyright)
+
+
+  # Grab the author of an html doc.
+  # Collects the trimmed `content` of every matching author/creator meta tag.
+  # When none is found, falls back to the text of the first element whose
+  # class contains 'author' or 'byline'. Always returns an array (possibly empty).
+  author: (doc) ->
+    authorCandidates = doc("meta[property='article:author'], \
+    meta[property='og:article:author'], meta[name='author'], \
+    meta[name='dcterms.creator'], \
+    meta[name='DC.creator'], \
+    meta[name='DC.Creator'], \
+    meta[name='dc.creator'], \
+    meta[name='creator']")
+    authorList = []
+    authorCandidates.each () ->
+      metaAuthorName = doc(this)?.attr("content")?.trim()
+      authorList.push(metaAuthorName) if metaAuthorName
+    # fallback to a named author div
+    if authorList.length == 0
+      # Tried in this order; the first selector yielding text wins
+      fallbackAuthorSelectors = [
+        "span[class*='author']"
+        "p[class*='author']"
+        "div[class*='author']"
+        "span[class*='byline']"
+        "p[class*='byline']"
+        "div[class*='byline']"
+      ]
+      for fallbackAuthorSelector in fallbackAuthorSelectors
+        fallbackAuthor = doc(fallbackAuthorSelector).first()?.text()
+        break if fallbackAuthor
+      authorList.push(cleanText(fallbackAuthor)) if fallbackAuthor
+
+    authorList
+
+
+  # Grab the publisher of the page/site.
+  # Reads the trimmed `content` attribute of the first og:site_name or
+  # dc.publisher meta tag. The result is undefined (not null) when the
+  # attribute is missing, because the `?.` chain short-circuits.
+  publisher: (doc) ->
+    publisherCandidates = doc("meta[property='og:site_name'], \
+    meta[name='dc.publisher'], \
+    meta[name='DC.publisher'], \
+    meta[name='DC.Publisher']")
+    firstPublisherMeta = publisherCandidates?.first()
+    publisherContent = firstPublisherMeta?.attr("content")
+    publisherContent?.trim()
 
-    if !titleText
-      titleElement = doc("title").first()
-      titleText = titleElement.text()
 
-    return null unless titleElement
+  # Grab the title of an html doc (excluding junk)
+  # Hard-truncates titles containing colon or spaced dash: the raw title is
+  # split on the first of "|", " - ", "»" or ":" that it contains.
+  title: (doc) ->
+    cleanTitle(rawTitle(doc), ["|", " - ", "»", ":"])
 
-    usedDelimeter = false
-    _.each ["|", " - ", "»", ":"], (c) ->
-      if titleText.indexOf(c) >= 0 && !usedDelimeter
-        titleText = biggestTitleChunk(titleText, c)
-        usedDelimeter = true
+  # Grab the title with soft truncation: same as `title`, except that ":" is
+  # not in the delimiter list, so a colon never splits the title.
+  softTitle: (doc) ->
+    cleanTitle(rawTitle(doc), ["|", " - ", "»"])
 
-    titleText.replace(/�/g, "").trim()
 
   # Grab the 'main' text chunk
   text: (doc, topNode, lang) ->
@@ -434,3 +501,33 @@ postCleanup = (doc, targetNode, lang) ->
         doc(e).remove()
 
   return node
+
+
+# Normalise a text fragment: line breaks and tabs become spaces, HTML comments
+# and U+FFFD replacement characters are removed, runs of whitespace collapse
+# to a single space, and the result is trimmed.
+# Returns "" for null/undefined input instead of throwing.
+cleanText = (text) ->
+  # Callers pass the result of `?.` chains, which may be undefined
+  return "" unless text?
+  # Strip comments before collapsing whitespace so a removed comment cannot
+  # leave a double space behind
+  return text.replace(/[\r\n\t]/g, " ").replace(/<!--.+?-->/g, "").replace(/�/g, "").replace(/\s\s+/g, " ").trim()
+
+
+# Reduce a raw title to its main chunk.
+# Delimiters are tried in the given order; only the first one present in the
+# title is used, and biggestTitleChunk picks the chunk to keep for it.
+# A missing title is treated as "". The result is passed through cleanText.
+cleanTitle = (title, delimiters) ->
+  titleText = title || ""
+  for titleDelimiter in delimiters
+    if titleText.indexOf(titleDelimiter) >= 0
+      titleText = biggestTitleChunk(titleText, titleDelimiter)
+      # Only one delimiter is ever applied
+      break
+  cleanText(titleText)
+
+
+# Return the first non-blank title candidate, trimmed, or "" when none exists.
+# Priority: og:title meta, h1 with a 'title' class, <title>, then the first
+# h1 or h2 as a fallback.
+rawTitle = (doc) ->
+  # All candidates are looked up before any is inspected
+  titleCandidates = [
+    doc("meta[property='og:title']")?.first()?.attr("content")
+    doc("h1[class*='title']")?.first()?.text()
+    doc("title")?.first()?.text()
+    doc("h1")?.first()?.text()
+    doc("h2")?.first()?.text()
+  ]
+  for titleCandidate in titleCandidates
+    return titleCandidate.trim() if titleCandidate && titleCandidate.trim()
+
+  return ""
diff --git a/src/unfluff.coffee b/src/unfluff.coffee
@@ -8,6 +8,11 @@ module.exports = unfluff = (html, language) ->
 
   pageData =
     title: extractor.title(doc)
+    softTitle: extractor.softTitle(doc)
+    date: extractor.date(doc)
+    author: extractor.author(doc)
+    publisher: extractor.publisher(doc)
+    copyright: extractor.copyright(doc)
     favicon: extractor.favicon(doc)
     description: extractor.description(doc)
     keywords: extractor.keywords(doc)
@@ -34,6 +39,26 @@ unfluff.lazy = (html, language) ->
     doc = getParsedDoc.call(this, html)
     @title_ ?= extractor.title(doc)
 
+  # Lazily extract the soft-truncated title and keep it on @softTitle_.
+  # `?=` assigns only while @softTitle_ is null or undefined; the parsed doc is
+  # requested on every call regardless.
+  softTitle: () ->
+    doc = getParsedDoc.call(this, html)
+    @softTitle_ ?= extractor.softTitle(doc)
+
+  # Lazily extract the publication date and keep it on @date_.
+  # `?=` assigns only while @date_ is null or undefined, so a null result is
+  # not cached and the extractor runs again on the next call.
+  date: () ->
+    doc = getParsedDoc.call(this, html)
+    @date_ ?= extractor.date(doc)
+
+  # Lazily extract the copyright line and keep it on @copyright_.
+  # `?=` assigns only while @copyright_ is null or undefined, so a null result
+  # is not cached and the extractor runs again on the next call.
+  copyright: () ->
+    doc = getParsedDoc.call(this, html)
+    @copyright_ ?= extractor.copyright(doc)
+
+  # Lazily extract the author list and keep it on @author_.
+  # `?=` assigns only while @author_ is null or undefined; the parsed doc is
+  # requested on every call regardless.
+  author: () ->
+    doc = getParsedDoc.call(this, html)
+    @author_ ?= extractor.author(doc)
+
+  # Lazily extract the publisher and keep it on @publisher_.
+  # `?=` assigns only while @publisher_ is null or undefined, so a missing
+  # publisher is not cached and the extractor runs again on the next call.
+  publisher: () ->
+    doc = getParsedDoc.call(this, html)
+    @publisher_ ?= extractor.publisher(doc)
+
   favicon: () ->
     doc = getParsedDoc.call(this, html)
     @favicon_ ?= extractor.favicon(doc)