feat: extract custom types with extend option (#313)

* feat: extract custom types with extend option Adds an `extend` option that lets you add custom types to be extracted and returned alongside the defaults, either in a call to `parse()` or in a custom extractor. ``` Mercury.parse(url, { extend: { last_edited: { selectors: ['#last-edited'], defaultCleaner: false } } }) ``` * chore: use Reflect.ownKeys * feat: add CLI options * doc: add extend param to cli help * refactor: extract selectExtendedTypes * feat: only overwrite null extended results * feat: add allowMultiple extraction option * feat: accept extendList CLI args * feat: allow attribute selectors in extends on CLI * test: update extend tests * fix: don't invoke cleaner for custom types * feat: always return array if allowMultiple * test: add test for array of single result * refactor: extract extractHtml * refactor: destructure allowMultiple * fix: wrap multiple matches in $ for cheerio shim * fix: find extended types before any other munging * feat: absolutize all links * fix: clean content more directly * doc: Update CLI docs in README * chore: update dist * doc: Document extend in custom extractor README
postlight · Mar 25, 2019 · b3e2a0f · b3e2a0f
1 parent 136d6df
commit b3e2a0f
Show file tree

Hide file tree

Showing 9 changed files with 549 additions and 101 deletions.
diff --git a/README.md b/README.md
@@ -67,7 +67,9 @@ If Mercury is unable to find a field, that field will return `null`.
 By default, Mercury Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example:
 
 ```javascript
-Mercury.parse(url, { contentType: 'markdown' }).then(result => console.log(result));
+Mercury.parse(url, { contentType: 'markdown' }).then(result =>
+  console.log(result)
+);
 ```
 
 This returns the page's `content` as GitHub-flavored Markdown:
@@ -94,6 +96,15 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source
 
 # Pass optional --format argument to set content type (html|markdown|text)
 mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown
+
+# Pass optional --extend argument to add a custom type to the response
+mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em"
+
+# Pass optional --extend-list argument to add a custom type with multiple matches
+mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a"
+
+# Get the value of attributes by adding a pipe to --extend or --extend-list
+mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
 ```
 
 ## License

diff --git a/cli.js b/cli.js
@@ -8,16 +8,20 @@ const {
   _: [url],
   format,
   f,
+  extend,
+  e,
+  extendList,
+  l,
 } = argv;
-(async (urlToParse, contentType) => {
+(async (urlToParse, contentType, extendedTypes, extendedListTypes) => {
   if (!urlToParse) {
     console.log(
       '\n\
 mercury-parser\n\n\
     The Mercury Parser extracts semantic content from any url\n\n\
 Usage:\n\
 \n\
-    $ mercury-parser url-to-parse [--format=html|text|markdown]\n\
+    $ mercury-parser url-to-parse [--format=html|text|markdown] [--extend type=selector]... [--extend-list type=selector]... \n\
 \n\
 '
     );
@@ -31,8 +35,25 @@ Usage:\n\
       text: 'text',
       txt: 'text',
     };
+    const extensions = {};
+    [].concat(extendedTypes || []).forEach(t => {
+      const [name, selector] = t.split('=');
+      const fullSelector =
+        selector.indexOf('|') > 0 ? selector.split('|') : selector;
+      extensions[name] = { selectors: [fullSelector] };
+    });
+    [].concat(extendedListTypes || []).forEach(t => {
+      const [name, selector] = t.split('=');
+      const fullSelector =
+        selector.indexOf('|') > 0 ? selector.split('|') : selector;
+      extensions[name] = {
+        selectors: [fullSelector],
+        allowMultiple: true,
+      };
+    });
     const result = await Mercury.parse(urlToParse, {
       contentType: contentTypeMap[contentType],
+      extend: extensions,
     });
     console.log(JSON.stringify(result, null, 2));
   } catch (e) {
@@ -51,4 +72,4 @@ Usage:\n\
     console.error(`\n${reportBug}\n`);
     process.exit(1);
   }
-})(url, format || f);
+})(url, format || f, extend || e, extendList || l);