Skip to content

Commit

Permalink
improve use of @mozilla/readability
Browse files Browse the repository at this point in the history
  • Loading branch information
josStorer committed Mar 22, 2024
1 parent 0308566 commit 777f405
Showing 1 changed file with 73 additions and 12 deletions.
85 changes: 73 additions & 12 deletions src/utils/get-core-content-text.mjs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
import { Readability } from "@mozilla/readability"
import { Readability, isProbablyReaderable } from '@mozilla/readability'

const adapters = {
'scholar.google': ['#gs_res_ccl_mid'],
Expand All @@ -13,28 +13,89 @@ const adapters = {
'new.qq.com': ['.content-article'],
}

export function getCoreContentText() {
function getTextFrom(e) {
return e.innerText || e.textContent
/**
 * Computes the area of an element's bounding client rectangle.
 *
 * @param {Element} e - Any object exposing `getBoundingClientRect()`.
 * @returns {number} `width * height` of the rectangle returned by
 *   `getBoundingClientRect()`.
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}

/**
 * Searches `e` and its descendants for the element with the largest
 * bounding-box area that is still strictly below 80% of the area of `e`.
 *
 * Nodes are visited in document (pre-)order and only a strictly larger area
 * replaces the current candidate, so the earliest node wins a tie.
 *
 * @param {Element|null|undefined} e - Root of the subtree to inspect.
 * @returns {Element|null} The best candidate, or `null` when `e` is falsy or
 *   no node satisfies the area constraints.
 */
function findLargestElement(e) {
  if (!e) return null

  const areaCeiling = 0.8 * getArea(e)
  let best = null
  let bestArea = 0

  // Explicit stack instead of recursion. Children are pushed in reverse so
  // that they are popped in their original order (pre-order traversal).
  const pending = [e]
  while (pending.length > 0) {
    const current = pending.pop()
    if (current.nodeType !== Node.ELEMENT_NODE) continue

    const currentArea = getArea(current)
    if (currentArea > bestArea && currentArea < areaCeiling) {
      bestArea = currentArea
      best = current
    }

    const kids = Array.from(current.children)
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      pending.push(kids[i])
    }
  }

  return best
}

/**
 * Reads the text of an element, preferring `innerText`.
 *
 * @param {Element} e - Object exposing `innerText` and/or `textContent`.
 * @returns {*} `e.innerText` when it is truthy, otherwise `e.textContent`.
 */
function getTextFrom(e) {
  const rendered = e.innerText
  return rendered ? rendered : e.textContent
}

/**
 * Compacts extracted page text without destroying word boundaries.
 *
 * Whitespace and repeated separators are collapsed rather than deleted, so
 * adjacent words, lines and comma-separated items stay distinguishable:
 *   - leading/trailing whitespace is trimmed;
 *   - every run of non-newline whitespace (spaces, tabs, `\r`, ...) becomes
 *     a single space;
 *   - spaces around line breaks are dropped and runs of line breaks become
 *     a single `\n`;
 *   - runs of commas become a single comma.
 *
 * @param {string|null|undefined} text - Raw text taken from the page.
 * @returns {string} The compacted text; `''` when `text` is null/undefined.
 */
function postProcessText(text) {
  if (text == null) return ''

  return String(text)
    .trim()
    .replace(/[^\S\n]+/g, ' ') // collapse horizontal whitespace, keep newlines
    .replace(/ ?\n ?/g, '\n') // strip the padding left around line breaks
    .replace(/\n{2,}/g, '\n') // blank lines -> single line break
    .replace(/,{2,}/g, ',') // ",,," -> ","
}

/**
 * Extracts the main textual content of the current page.
 *
 * Strategies are tried in order and the first one that yields text wins:
 *   1. a site-specific selector list from `adapters`, chosen by matching the
 *      adapter key against `location.hostname`;
 *   2. the first `<article>` element of the document;
 *   3. Mozilla Readability, when `isProbablyReaderable(document)` is true
 *      and parsing produces an article with text;
 *   4. a size heuristic: the largest element inside `document.body` (as
 *      chosen by `findLargestElement`), or the largest element inside that
 *      one when it covers more than half of its area.
 *
 * Every result is passed through `postProcessText`.
 *
 * @returns {string} The post-processed text of the selected content.
 */
export function getCoreContentText() {
  for (const [siteName, selectors] of Object.entries(adapters)) {
    if (location.hostname.includes(siteName)) {
      // NOTE(review): presumably resolves the first matching selector to an
      // element; defined in another module - confirm there.
      const element = getPossibleElementByQuerySelector(selectors)
      if (element) return postProcessText(getTextFrom(element))
      // Only the first matching adapter is considered.
      break
    }
  }

  const element = document.querySelector('article')
  if (element) {
    return postProcessText(getTextFrom(element))
  }

  if (isProbablyReaderable(document)) {
    // Readability mutates the document it is given, so it works on a clone.
    const article = new Readability(document.cloneNode(true), {
      keepClasses: true,
    }).parse()
    // parse() returns null when no article can be extracted; in that case
    // fall through to the size heuristic instead of throwing.
    if (article?.textContent) {
      console.log('readerable')
      return postProcessText(article.textContent)
    }
  }

  const largestElement = findLargestElement(document.body)
  const secondLargestElement = findLargestElement(largestElement)
  console.log(largestElement)
  console.log(secondLargestElement)

  let ret
  if (!largestElement) {
    // document.body may be missing on unusual documents; use empty text then.
    ret = document.body ? getTextFrom(document.body) : ''
    console.log('use document.body')
  } else if (
    secondLargestElement &&
    getArea(secondLargestElement) > 0.5 * getArea(largestElement)
  ) {
    // The nested candidate still dominates its parent, so it is the tighter
    // wrapper around the same content.
    ret = getTextFrom(secondLargestElement)
    console.log('use second')
  } else {
    ret = getTextFrom(largestElement)
    console.log('use first')
  }
  return postProcessText(ret)
}

0 comments on commit 777f405

Please sign in to comment.