Skip to content

Commit

Permalink
Merge pull request #49 from philgooch/add-date-author-copyright
Browse files Browse the repository at this point in the history
Adds author, date, copyright extraction
  • Loading branch information
ageitgey committed Feb 20, 2016
2 parents 0ead6cc + 0f9bb51 commit 52cb470
Show file tree
Hide file tree
Showing 6 changed files with 332 additions and 35 deletions.
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ You can use `unfluff` from node or right on the command line!

This is what `unfluff` will try to grab from a web page:
- `title` - The document's title (from the `og:title` meta tag when present, otherwise a titled `h1`, the <title> tag, or the first `h1`/`h2`)
- `softTitle` - A version of `title` with less truncation
- `date` - The document's publication date
- `copyright` - The document's copyright line, if present
- `author` - The document's author
- `publisher` - The document's publisher (website name)
- `text` - The main text of the document with all the junk thrown away
- `image` - The main image for the document (what's used by facebook, etc.)
- `videos` - An array of videos that were embedded in the article. Each video has src, width and height.
Expand Down Expand Up @@ -127,7 +132,14 @@ data = extractor(my_html_data, 'en');

```json
{
"title": "Shovel Knight review: rewrite history",
"title": "Shovel Knight review",
"softTitle": "Shovel Knight review: rewrite history",
"date": "2014-06-26T13:00:03Z",
"copyright": "2016 Vox Media Inc Designed in house",
"author": [
"Griffin McElroy"
],
"publisher": "Polygon",
"text": "Shovel Knight is inspired by the past in all the right ways — but it's far from stuck in it. [.. snip ..]",
"image": "http://cdn2.vox-cdn.com/uploads/chorus_image/image/34834129/jellyfish_hero.0_cinema_1280.0.png",
"tags": [],
Expand Down Expand Up @@ -159,6 +171,11 @@ data = extractor.lazy(my_html_data, 'en');

// Access whichever data elements you need directly.
console.log(data.title());
console.log(data.softTitle());
console.log(data.date());
console.log(data.copyright());
console.log(data.author());
console.log(data.publisher());
console.log(data.text());
console.log(data.image());
console.log(data.tags());
Expand Down
107 changes: 87 additions & 20 deletions lib/extractor.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 30 additions & 0 deletions lib/unfluff.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

125 changes: 111 additions & 14 deletions src/extractor.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,91 @@ stopwords = require("./stopwords")
formatter = require("./formatter")

module.exports =
# Grab the title of an html doc (excluding junk)
title: (doc) ->
titleElement = doc("meta[property='og:title']")
titleText = titleElement.attr("content") if titleElement
# Find the publication date of an html doc.
# All selectors are queried together; only the first matching element is
# inspected. Its `content` attribute wins, then its `datetime` attribute,
# then its cleaned text. Returns null when none of those is non-empty.
date: (doc) ->
  dateSelectors = [
    "meta[property='article:published_time']"
    "meta[itemprop*='datePublished']"
    "meta[name='dcterms.modified']"
    "meta[name='dcterms.date']"
    "meta[name='DC.date.issued']"
    "meta[name='dc.date.issued']"
    "meta[name='dc.date.modified']"
    "meta[name='dc.date.created']"
    "meta[name='DC.date']"
    "meta[name='DC.Date']"
    "meta[name='dc.date']"
    "meta[name='date']"
    "time[itemprop*='pubDate']"
    "time[itemprop*='pubdate']"
    "span[itemprop*='datePublished']"
    "span[property*='datePublished']"
    "p[itemprop*='datePublished']"
    "p[property*='datePublished']"
    "div[itemprop*='datePublished']"
    "div[property*='datePublished']"
    "li[itemprop*='datePublished']"
    "li[property*='datePublished']"
    "time"
    "span[class*='date']"
    "p[class*='date']"
    "div[class*='date']"
  ]
  firstMatch = doc(dateSelectors.join(", "))?.first()

  fromContent = firstMatch?.attr("content")?.trim()
  return fromContent if fromContent

  fromDatetime = firstMatch?.attr("datetime")?.trim()
  return fromDatetime if fromDatetime

  cleanText(firstMatch?.text()) or null


# Grab the copyright line.
# First looks for an element whose class or id contains "copyright" and uses
# the text of the first match. If that yields nothing, falls back to scanning
# the whole body text for a "©" sign.
# Returns the cleaned text following "©" (and an optional "copyright" word)
# up to the next , ; : . | or line break, or null when the body fallback
# finds no "©" at all.
copyright: (doc) ->
  copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], \
  p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']")
  text = copyrightCandidates?.first()?.text()
  if !text
    # try to find the copyright in the text; line breaks become sentence
    # breaks so the capture below stops at the end of the copyright line
    text = (doc("body").text() or "").replace(/\s*[\r\n]+\s*/g, ". ")
    # indexOf returns 0 when the text starts with "©", so test for >= 0
    # (the previous "> 0" check wrongly rejected that case)
    return null unless text.indexOf("©") >= 0
  # If no "©" is present the pattern does not match and the candidate
  # element's text is returned as-is (after cleaning).
  copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, "$2").trim()
  cleanText(copyright)


# Collect the author(s) of an html doc.
# Every author-style meta tag with a non-empty `content` contributes one
# entry. When no meta tag yields a name, the text of the first element whose
# class mentions "author" (then "byline") is used as a single cleaned entry.
# Always returns an array, possibly empty.
author: (doc) ->
  metaSelectors = [
    "meta[property='article:author']"
    "meta[property='og:article:author']"
    "meta[name='author']"
    "meta[name='dcterms.creator']"
    "meta[name='DC.creator']"
    "meta[name='DC.Creator']"
    "meta[name='dc.creator']"
    "meta[name='creator']"
  ]
  names = []
  doc(metaSelectors.join(", ")).each ->
    name = doc(this)?.attr("content")?.trim()
    names.push(name) if name

  return names unless names.length == 0

  # fallback to a named author element; selectors are tried in this order
  # and the first non-empty text wins
  fallbackSelectors = [
    "span[class*='author']"
    "p[class*='author']"
    "div[class*='author']"
    "span[class*='byline']"
    "p[class*='byline']"
    "div[class*='byline']"
  ]
  bylineText = undefined
  for selector in fallbackSelectors
    bylineText = doc(selector).first()?.text()
    break if bylineText

  names.push(cleanText(bylineText)) if bylineText
  names


# Find the publisher (site name) of the page.
# Returns the trimmed `content` attribute of the first matching meta tag,
# or undefined when no tag matches or it has no `content`.
publisher: (doc) ->
  publisherSelectors = [
    "meta[property='og:site_name']"
    "meta[name='dc.publisher']"
    "meta[name='DC.publisher']"
    "meta[name='DC.Publisher']"
  ]
  firstMatch = doc(publisherSelectors.join(", "))?.first()
  firstMatch?.attr("content")?.trim()

if !titleText
titleElement = doc("title").first()
titleText = titleElement.text()

return null unless titleElement
# Extract the document title with junk removed.
# Truncation is "hard": a colon counts as a delimiter in addition to
# "|", " - " and "»".
title: (doc) ->
  hardDelimiters = ["|", " - ", "»", ":"]
  cleanTitle(rawTitle(doc), hardDelimiters)

usedDelimeter = false
_.each ["|", " - ", "»", ":"], (c) ->
if titleText.indexOf(c) >= 0 && !usedDelimeter
titleText = biggestTitleChunk(titleText, c)
usedDelimeter = true
# Extract the document title with "soft" truncation:
# only "|", " - " and "»" count as delimiters, so colons are kept.
softTitle: (doc) ->
  softDelimiters = ["|", " - ", "»"]
  cleanTitle(rawTitle(doc), softDelimiters)

titleText.replace(//g, "").trim()

# Grab the 'main' text chunk
text: (doc, topNode, lang) ->
Expand Down Expand Up @@ -434,3 +501,33 @@ postCleanup = (doc, targetNode, lang) ->
doc(e).remove()

return node


# Normalize a fragment of extracted text.
#
# text - the raw string; null/undefined is treated as empty.
#
# Removes HTML comments and stray U+FFFD replacement characters, turns
# CR/LF/TAB into spaces, collapses runs of whitespace to a single space and
# trims the ends. Returns the cleaned string ("" for empty or missing input).
cleanText = (text) ->
  return "" unless text?
  String(text)
    # Strip comments first (including multi-line ones) so that removing them
    # cannot leave a double space behind after whitespace has been collapsed.
    .replace(/<!--[\s\S]*?-->/g, "")
    # NOTE(review): the character in this pattern was lost in the copy under
    # review (it read `//g`); presumably it was the U+FFFD replacement
    # character. It is written as an escape so it survives re-encoding. Confirm.
    .replace(/\uFFFD/g, "")
    .replace(/[\r\n\t]/g, " ")
    .replace(/\s\s+/g, " ")
    .trim()


# Reduce a raw title to its most significant chunk.
#
# title      - the raw title string (a falsy value is treated as "").
# delimiters - array of delimiter strings, in priority order.
#
# Only the first delimiter that occurs in the title is applied: the title is
# handed to biggestTitleChunk with that delimiter and the remaining
# delimiters are ignored. The result is passed through cleanText.
cleanTitle = (title, delimiters) ->
  chunk = title || ""
  for delimiter in delimiters when chunk.indexOf(delimiter) >= 0
    chunk = biggestTitleChunk(chunk, delimiter)
    break
  cleanText(chunk)


# Pick the best untruncated title for the document.
# Candidates, in priority order: the og:title meta content, the first h1
# whose class mentions "title", the <title> element, then the first h1 and
# the first h2 as last-resort fallbacks.
# Returns the first candidate that is non-blank, trimmed; "" if none is.
rawTitle = (doc) ->
  # All lookups are performed up front, exactly once each.
  candidates = [
    doc("meta[property='og:title']")?.first()?.attr("content")
    doc("h1[class*='title']")?.first()?.text()
    doc("title")?.first()?.text()
    doc("h1")?.first()?.text()
    doc("h2")?.first()?.text()
  ]
  for candidate in candidates
    trimmed = candidate && candidate.trim()
    return trimmed if trimmed
  ""
25 changes: 25 additions & 0 deletions src/unfluff.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ module.exports = unfluff = (html, language) ->

pageData =
title: extractor.title(doc)
softTitle: extractor.softTitle(doc)
date: extractor.date(doc)
author: extractor.author(doc)
publisher: extractor.publisher(doc)
copyright: extractor.copyright(doc)
favicon: extractor.favicon(doc)
description: extractor.description(doc)
keywords: extractor.keywords(doc)
Expand All @@ -34,6 +39,26 @@ unfluff.lazy = (html, language) ->
doc = getParsedDoc.call(this, html)
@title_ ?= extractor.title(doc)

# Lazy accessor for the soft-truncated title.
# getParsedDoc is invoked on every call; extractor.softTitle(doc) is only
# evaluated and stored in @softTitle_ when that field is not yet set (`?=`).
# Returns the value held in @softTitle_.
softTitle: () ->
doc = getParsedDoc.call(this, html)
@softTitle_ ?= extractor.softTitle(doc)

# Lazy accessor for the publication date.
# getParsedDoc is invoked on every call; extractor.date(doc) is only
# evaluated and stored in @date_ when that field is not yet set (`?=`).
# Returns the value held in @date_.
date: () ->
doc = getParsedDoc.call(this, html)
@date_ ?= extractor.date(doc)

# Lazy accessor for the copyright line.
# getParsedDoc is invoked on every call; extractor.copyright(doc) is only
# evaluated and stored in @copyright_ when that field is not yet set (`?=`).
# Returns the value held in @copyright_.
copyright: () ->
doc = getParsedDoc.call(this, html)
@copyright_ ?= extractor.copyright(doc)

# Lazy accessor for the author list.
# getParsedDoc is invoked on every call; extractor.author(doc) is only
# evaluated and stored in @author_ when that field is not yet set (`?=`).
# Returns the value held in @author_.
author: () ->
doc = getParsedDoc.call(this, html)
@author_ ?= extractor.author(doc)

# Lazy accessor for the publisher (site name).
# getParsedDoc is invoked on every call; extractor.publisher(doc) is only
# evaluated and stored in @publisher_ when that field is not yet set (`?=`).
# NOTE(review): `?=` re-runs the extractor on each call while the stored
# value is null or undefined, which looks possible when no publisher is
# found. Confirm this is acceptable.
publisher: () ->
doc = getParsedDoc.call(this, html)
@publisher_ ?= extractor.publisher(doc)

favicon: () ->
doc = getParsedDoc.call(this, html)
@favicon_ ?= extractor.favicon(doc)
Expand Down
Loading

0 comments on commit 52cb470

Please sign in to comment.