diff --git a/dist/mercury.js b/dist/mercury.js index ec1594392..6a01adcb6 100644 --- a/dist/mercury.js +++ b/dist/mercury.js @@ -1264,7 +1264,6 @@ function absolutizeSet($, rootUrl, $content) { // descriptors can only contain positive numbers followed immediately by either 'w' or 'x' // space characters inside the URL should be encoded (%20 or +) var candidates = urlSet.match(/(?:\s*)(\S+(?:\s*[\d.]+[wx])?)(?:\s*,\s*)?/g); - if (!candidates) return; var absoluteCandidates = candidates.map(function (candidate) { // a candidate URL cannot start or end with a comma // descriptors are separated from the URLs by unescaped whitespace @@ -1935,31 +1934,30 @@ var TheAtlanticExtractor = { var NewYorkerExtractor = { domain: 'www.newyorker.com', title: { - selectors: ['h1[class^="ArticleHeader__hed"]', ['meta[name="og:title"]', 'value']] + selectors: ['h1.title'] }, author: { - selectors: ['div[class^="ArticleContributors"] a[rel="author"]', 'article header div[class*="Byline__multipleContributors"]'] + selectors: ['.contributors'] }, content: { - selectors: ['main[class^="Layout__content"]'], + selectors: ['div#articleBody', 'div.articleBody'], // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: [], // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result - clean: ['footer[class^="ArticleFooter__footer"]'] + clean: [] }, date_published: { - selectors: [['meta[name="pubdate"]', 'value']], - format: 'YYYYMMDD', + selectors: [['meta[name="article:published_time"]', 'value'], ['time[itemProp="datePublished"]', 'content']], timezone: 'America/New_York' }, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] }, dek: { - selectors: ['h2[class^="ArticleHeader__dek"]'] + selectors: ['.dek', 'h2.dek'] }, next_page_url: null, excerpt: null @@ -4762,30 +4760,6 @@ var NewsMynaviJpExtractor = { } }; -var ClinicaltrialsGovExtractor = { - domain: 'clinicaltrials.gov', - title: { - selectors: ['h1.solo_record'] - }, - author: { - selectors: ['div#sponsor.info-text'] - }, - date_published: { - // selectors: ['span.term[data-term="Last Update Posted"]'], - selectors: ['div:has(> span.term[data-term="Last Update Posted"])'] - }, - content: { - selectors: ['div#tab-body'], - // Is there anything in the content you selected that needs transformed - // before it's consumable content? E.g., unusual lazy loaded images - transforms: {}, - // Is there anything that is in the result that shouldn't be? - // The clean selectors will remove anything that matches from - // the result - clean: ['.usa-alert> img'] - } -}; - var GithubComExtractor = { domain: 'github.com', title: { @@ -4891,11 +4865,7 @@ var WwwOssnewsJpExtractor = { selectors: ['#alpha-block h1.hxnewstitle'] }, author: null, - date_published: { - selectors: ['p.fs12'], - format: 'YYYY年MM月DD日 HH:mm', - timezone: 'Asia/Tokyo' - }, + date_published: null, dek: null, lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] @@ -4961,11 +4931,7 @@ var WwwSanwaCoJpExtractor = { selectors: ['#newsContent h1'] }, author: null, - date_published: { - selectors: ['p.date'], - format: 'YYYY.MM.DD', - timezone: 'Asia/Tokyo' - }, + date_published: null, dek: { selectors: [['meta[name="og:description"]', 'value']] }, @@ -4986,11 +4952,7 @@ var WwwElecomCoJpExtractor = { selectors: ['title'] }, author: null, - date_published: { - selectors: ['p.section-last'], - format: 'YYYY.MM.DD', - timezone: 'Asia/Tokyo' - }, + date_published: null, dek: null, lead_image_url: null, content: { @@ -5034,11 +4996,7 @@ var JvndbJvnJpExtractor = { selectors: ['title'] }, author: null, - date_published: { - selectors: ['div.modifytxt:nth-child(2)'], - format: 'YYYY/MM/DD', - timezone: 'Asia/Tokyo' - }, + date_published: null, dek: null, lead_image_url: null, content: { @@ -5106,535 +5064,6 @@ var WwwJnsaOrgExtractor = { } }; -var PhpspotOrgExtractor = { - domain: 'phpspot.org', - title: { - selectors: ['h3.hl'] - }, - author: null, - date_published: { - selectors: ['h4.hl'], - format: 'YYYY年MM月DD日', - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: null, - content: { - selectors: ['div.entrybody'], - defaultCleaner: false, - transforms: {}, - clean: [] - } -}; - -var WwwInfoqComExtractor = { - domain: 'www.infoq.com', - title: { - selectors: ['h1.heading'] - }, - author: { - selectors: ['div.widget.article__authors'] - }, - date_published: { - selectors: ['.article__readTime.date'], - format: 'YYYY年MM月DD日', - timezone: 'Asia/Tokyo' - }, - dek: { - selectors: [['meta[name="og:description"]', 'value']] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.article__data'], - defaultCleaner: false, - transforms: {}, - clean: [] - } -}; - -var WwwMoongiftJpExtractor = { - domain: 'www.moongift.jp', - title: { - selectors: ['h1.title a'] - }, - author: null, - date_published: { - selectors: ['ul.meta li:not(.social):first-of-type'], - timezone: 'Asia/Tokyo' - }, - dek: { - selectors: [['meta[name="og:description"]', 'value']] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['#main'], - transforms: {}, - clean: ['ul.mg_service.cf'] - } -}; - -var WwwItmediaCoJpExtractor = { - domain: 'www.itmedia.co.jp', - supportedDomains: ['www.atmarkit.co.jp', 'techtarget.itmedia.co.jp', 'nlab.itmedia.co.jp'], - title: { - selectors: ['#cmsTitle h1'] - }, - author: { - selectors: ['#byline'] - }, - date_published: { - selectors: [['meta[name="article:modified_time"]', 'value']] - }, - dek: { - selectors: ['#cmsAbstract h2'] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['#cmsBody'], - defaultCleaner: false, - transforms: {}, - clean: ['#snsSharebox'] - } -}; - -var WwwPublickey1JpExtractor = { - domain: 'www.publickey1.jp', - title: { - selectors: ['h1'] - }, - author: { - selectors: ['#subcol p:has(img)'] - }, - date_published: { - selectors: ['div.pubdate'], - format: 'YYYY年MM月DD日', - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['#maincol'], - defaultCleaner: false, - transforms: {}, - clean: ['#breadcrumbs', 'div.sbm', 'div.ad_footer'] - } -}; - -var TakagihiromitsuJpExtractor = { - domain: 'takagi-hiromitsu.jp', - title: { - selectors: ['h3'] - }, - author: { - selectors: [['meta[name="author"]', 'value']] - }, - date_published: { - selectors: [['meta[http-equiv="Last-Modified"]', 'value']] - }, - dek: null, - lead_image_url: null, - content: { - selectors: ['div.body'], - defaultCleaner: false, - transforms: {}, - clean: [] - } -}; - -var BookwalkerJpExtractor = { - domain: 'bookwalker.jp', - title: { - selectors: ['h1.main-heading'] - }, - author: { - selectors: ['div.authors'] - }, - date_published: { - selectors: ['.work-info .work-detail:first-of-type .work-detail-contents:last-of-type'], - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: [['div.main-info', 'div.main-cover-inner']], - defaultCleaner: false, - transforms: {}, - clean: ['span.label.label--trial', 'dt.info-head.info-head--coin', 'dd.info-contents.info-contents--coin', 'div.info-notice.fn-toggleClass'] - } -}; - -var WwwYomiuriCoJpExtractor = { - domain: 'www.yomiuri.co.jp', - title: { - selectors: ['h1.title-article.c-article-title'] - }, - author: null, - date_published: { - selectors: [['meta[name="article:published_time"]', 'value']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.p-main-contents'], - transforms: {}, - clean: [] - } -}; - -var JapanCnetComExtractor = { - domain: 'japan.cnet.com', - title: { - selectors: ['.leaf-headline-ttl'] - }, - author: { - selectors: ['.writer'] - }, - date_published: { - selectors: ['.date'], - format: 'YYYY年MM月DD日 HH時mm分', - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.article_body'], - transforms: {}, - clean: [] - } -}; - -var DeadlineComExtractor = { - domain: 'deadline.com', - title: { - selectors: ['h1'] - }, - author: { - selectors: ['section.author h3'] - }, - date_published: { - selectors: [['meta[name="article:published_time"]', 'value']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item'], - transforms: { - '.embed-twitter': function embedTwitter($node) { - var innerHtml = $node.html(); - $node.replaceWith(innerHtml); - } - }, - clean: [] - } -}; - -var WwwGizmodoJpExtractor = { - domain: 'www.gizmodo.jp', - title: { - selectors: ['h1.p-post-title'] - }, - author: { - selectors: ['li.p-post-AssistAuthor'] - }, - date_published: { - selectors: [['li.p-post-AssistTime time', 'datetime']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['article.p-post'], - transforms: { - 'img.p-post-thumbnailImage': function imgPPostThumbnailImage($node) { - var src = $node.attr('src'); - $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); - } - }, - clean: ['h1.p-post-title', 'ul.p-post-Assist'] - } -}; - -var GetnewsJpExtractor = { - domain: 'getnews.jp', - title: { - selectors: ['article h1'] - }, - author: { - selectors: ['span.prof'] - }, - date_published: { - selectors: [['ul.cattag-top time', 'datetime']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.post-bodycopy'], - transforms: {}, - clean: [] - } -}; - -var WwwLifehackerJpExtractor = { - domain: 'www.lifehacker.jp', - title: { - selectors: ['h1.lh-summary-title'] - }, - author: { - selectors: ['p.lh-entryDetailInner--credit'] - }, - date_published: { - selectors: [['div.lh-entryDetail-header time', 'datetime']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.lh-entryDetail-body'], - transforms: { - 'img.lazyload': function imgLazyload($node) { - var src = $node.attr('src'); - $node.attr('src', src.replace(/^.*=%27/, '').replace(/%27;$/, '')); - } - }, - clean: ['p.lh-entryDetailInner--credit'] - } -}; - -var SectIijAdJpExtractor = { - domain: 'sect.iij.ad.jp', - title: { - selectors: ['h3'] - }, - author: { - selectors: ['dl.entrydate dd'] - }, - date_published: { - selectors: ['dl.entrydate dd'], - format: 'YYYY年MM月DD日', - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['#article'], - transforms: {}, - clean: ['dl.entrydate'] - } -}; - -var WwwOreillyCoJpExtractor = { - domain: 'www.oreilly.co.jp', - title: { - selectors: ['h3'] - }, - author: { - selectors: ['li[itemprop="author"]'] - }, - date_published: { - selectors: [['meta[itemprop="datePublished"]', 'value']], - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['#content'], - defaultCleaner: false, - transforms: {}, - clean: ['.social-tools'] - } -}; - -var WwwIpaGoJpExtractor = { - domain: 'www.ipa.go.jp', - title: { - selectors: ['h1'] - }, - author: null, - date_published: { - selectors: ['p.ipar_text_right'], - format: 'YYYY年M月D日', - timezone: 'Asia/Tokyo' - }, - dek: null, - lead_image_url: null, - content: { - selectors: ['#ipar_main'], - defaultCleaner: false, - transforms: {}, - clean: ['p.ipar_text_right'] - } -}; - -var WeeklyAsciiJpExtractor = { - domain: 'weekly.ascii.jp', - title: { - selectors: ['h1[itemprop="headline"]'] - }, - author: { - selectors: ['p.author'] - }, - date_published: { - selectors: [['meta[name="odate"]', 'value']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.article'], - transforms: {}, - clean: [] - } -}; - -var TechlogIijAdJpExtractor = { - domain: 'techlog.iij.ad.jp', - title: { - selectors: ['h1.entry-title'] - }, - author: { - selectors: ['a[rel="author"]'] - }, - date_published: { - selectors: [['time.entry-date', 'datetime']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.entry-content'], - defaultCleaner: false, - transforms: {}, - clean: [] - } -}; - -var WiredJpExtractor = { - domain: 'wired.jp', - title: { - selectors: ['h1.post-title'] - }, - author: { - selectors: ['p[itemprop="author"]'] - }, - date_published: { - selectors: [['time', 'datetime']] - }, - dek: { - selectors: ['.post-intro'] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['article.article-detail'], - transforms: { - 'img[data-original]': function imgDataOriginal($node) { - var dataOriginal = $node.attr('data-original'); - var src = $node.attr('src'); - var url = URL.resolve(src, dataOriginal); - $node.attr('src', url); - } - }, - clean: ['.post-category', 'time', 'h1.post-title', '.social-area-syncer'] - } -}; - -var JapanZdnetComExtractor = { - domain: 'japan.zdnet.com', - title: { - selectors: ['h1'] - }, - author: { - selectors: [['meta[name="cXenseParse:author"]', 'value']] - }, - date_published: { - selectors: [['meta[name="article:published_time"]', 'value']] - }, - dek: null, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['div.article_body'], - transforms: {}, - clean: [] - } -}; - -var WwwRbbtodayComExtractor = { - domain: 'www.rbbtoday.com', - title: { - selectors: ['h1'] - }, - author: { - selectors: ['.writer.writer-name'] - }, - date_published: { - selectors: [['header time', 'datetime']] - }, - dek: { - selectors: ['.arti-summary'] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['.arti-content'], - transforms: {}, - clean: ['.arti-giga'] - } -}; - -var WwwLemondeFrExtractor = { - domain: 'www.lemonde.fr', - title: { - selectors: ['h1.article__title'] - }, - author: { - selectors: ['.author__name'] - }, - date_published: { - selectors: [['meta[name="og:article:published_time"]', 'value']] - }, - dek: { - selectors: ['.article__desc'] - }, - lead_image_url: { - selectors: [['meta[name="og:image"]', 'value']] - }, - content: { - selectors: ['.article__content'], - transforms: {}, - clean: [] - } -}; - var CustomExtractors = /*#__PURE__*/Object.freeze({ @@ -5733,7 +5162,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({ WwwFastcompanyComExtractor: WwwFastcompanyComExtractor, BlisterreviewComExtractor: BlisterreviewComExtractor, NewsMynaviJpExtractor: NewsMynaviJpExtractor, - ClinicaltrialsGovExtractor: ClinicaltrialsGovExtractor, GithubComExtractor: GithubComExtractor, WwwRedditComExtractor: WwwRedditComExtractor, OtrsComExtractor: OtrsComExtractor, @@ -5745,29 +5173,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({ ScanNetsecurityNeJpExtractor: ScanNetsecurityNeJpExtractor, JvndbJvnJpExtractor: JvndbJvnJpExtractor, GeniusComExtractor: GeniusComExtractor, - WwwJnsaOrgExtractor: WwwJnsaOrgExtractor, - PhpspotOrgExtractor: PhpspotOrgExtractor, - WwwInfoqComExtractor: WwwInfoqComExtractor, - WwwMoongiftJpExtractor: WwwMoongiftJpExtractor, - WwwItmediaCoJpExtractor: WwwItmediaCoJpExtractor, - WwwPublickey1JpExtractor: WwwPublickey1JpExtractor, - TakagihiromitsuJpExtractor: TakagihiromitsuJpExtractor, - BookwalkerJpExtractor: BookwalkerJpExtractor, - WwwYomiuriCoJpExtractor: WwwYomiuriCoJpExtractor, - JapanCnetComExtractor: JapanCnetComExtractor, - DeadlineComExtractor: DeadlineComExtractor, - WwwGizmodoJpExtractor: WwwGizmodoJpExtractor, - GetnewsJpExtractor: GetnewsJpExtractor, - WwwLifehackerJpExtractor: WwwLifehackerJpExtractor, - SectIijAdJpExtractor: SectIijAdJpExtractor, - WwwOreillyCoJpExtractor: WwwOreillyCoJpExtractor, - WwwIpaGoJpExtractor: WwwIpaGoJpExtractor, - WeeklyAsciiJpExtractor: WeeklyAsciiJpExtractor, - TechlogIijAdJpExtractor: TechlogIijAdJpExtractor, - WiredJpExtractor: WiredJpExtractor, - JapanZdnetComExtractor: JapanZdnetComExtractor, - WwwRbbtodayComExtractor: WwwRbbtodayComExtractor, - WwwLemondeFrExtractor: WwwLemondeFrExtractor + WwwJnsaOrgExtractor: WwwJnsaOrgExtractor }); var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) { diff --git a/fixtures/clinicaltrials.gov/1551708504719.html b/fixtures/clinicaltrials.gov/1551708504719.html index 9ea64f242..72a9e5f1f 100644 --- a/fixtures/clinicaltrials.gov/1551708504719.html +++ b/fixtures/clinicaltrials.gov/1551708504719.html @@ -12,6 +12,7 @@ + @@ -67,77 +68,97 @@ - + - - - - - + + + + + Changes in Circulating Tumor-Specific DNA in Patients With Non-Metastatic Non-Small Cell Lung Cancer - Full Text View - ClinicalTrials.gov - + - - - - - - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - + + + + + + + + + + - - + -