Skip to content

Commit

Permalink
feat: extract custom types with extend option (#313)
Browse files Browse the repository at this point in the history
* feat: extract custom types with extend option

Adds an `extend` option that lets you add custom types to be extracted
and returned alongside the defaults, either in a call to `parse()` or in
a custom extractor.

```
Mercury.parse(url, {
  extend: {
    last_edited: { selectors: ['#last-edited'], defaultCleaner: false },
  },
});
```

* chore: use Reflect.ownKeys

* feat: add CLI options

* doc: add extend param to cli help

* refactor: extract selectExtendedTypes

* feat: only overwrite null extended results

* feat: add allowMultiple extraction option

* feat: accept extendList CLI args

* feat: allow attribute selectors in extends on CLI

* test: update extend tests

* fix: don't invoke cleaner for custom types

* feat: always return array if allowMultiple

* test: add test for array of single result

* refactor: extract extractHtml

* refactor: destructure allowMultiple

* fix: wrap multiple matches in $ for cheerio shim

* fix: find extended types before any other munging

* feat: absolutize all links

* fix: clean content more directly

* doc: Update CLI docs in README

* chore: update dist

* doc: Document extend in custom extractor README
  • Loading branch information
droob authored and adampash committed Mar 25, 2019
1 parent 136d6df commit b3e2a0f
Show file tree
Hide file tree
Showing 9 changed files with 549 additions and 101 deletions.
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ If Mercury is unable to find a field, that field will return `null`.
By default, Mercury Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example:

```javascript
Mercury.parse(url, { contentType: 'markdown' }).then(result => console.log(result));
Mercury.parse(url, { contentType: 'markdown' }).then(result =>
console.log(result)
);
```

This returns the page's `content` as GitHub-flavored Markdown:
Expand All @@ -94,6 +96,15 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source

# Pass optional --format argument to set content type (html|markdown|text)
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown

# Pass optional --extend argument to add a custom type to the response
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em"

# Pass optional --extend-list argument to add a custom type with multiple matches
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a"

# Get the value of attributes by adding a pipe to --extend or --extend-list
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
```

## License
Expand Down
27 changes: 24 additions & 3 deletions cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,20 @@ const {
_: [url],
format,
f,
extend,
e,
extendList,
l,
} = argv;
(async (urlToParse, contentType) => {
(async (urlToParse, contentType, extendedTypes, extendedListTypes) => {
if (!urlToParse) {
console.log(
'\n\
mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown]\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--extend type=selector]... [--extend-list type=selector]... \n\
\n\
'
);
Expand All @@ -31,8 +35,25 @@ Usage:\n\
text: 'text',
txt: 'text',
};
const extensions = {};
[].concat(extendedTypes || []).forEach(t => {
const [name, selector] = t.split('=');
const fullSelector =
selector.indexOf('|') > 0 ? selector.split('|') : selector;
extensions[name] = { selectors: [fullSelector] };
});
[].concat(extendedListTypes || []).forEach(t => {
const [name, selector] = t.split('=');
const fullSelector =
selector.indexOf('|') > 0 ? selector.split('|') : selector;
extensions[name] = {
selectors: [fullSelector],
allowMultiple: true,
};
});
const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
Expand All @@ -51,4 +72,4 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f);
})(url, format || f, extend || e, extendList || l);
Loading

0 comments on commit b3e2a0f

Please sign in to comment.