Skip to content

Commit

Permalink
Merge pull request #741 from microlinkhq/readability
Browse files Browse the repository at this point in the history
chore(readability): use worker threads
  • Loading branch information
Kikobeats authored Jan 19, 2025
2 parents 90ece33 + 9b29660 commit 5478e0e
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 39 deletions.
7 changes: 0 additions & 7 deletions packages/metascraper-readability/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,6 @@ $ npm install metascraper-readability --save

#### options

##### getDocument

Type: `function`<br>
Default: [source code](https://github.com/microlinkhq/metascraper/blob/master/packages/metascraper-readability/src/index.js#L14-L20)

The function called to parse the HTML into a DOM document.

##### readabilityOpts

Type: `object`
Expand Down
8 changes: 6 additions & 2 deletions packages/metascraper-readability/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"dependencies": {
"@metascraper/helpers": "workspace:*",
"@mozilla/readability": "~0.5.0",
"async-memoize-one": "~1.1.8",
"happy-dom": "~16.5.3"
},
"devDependencies": {
Expand All @@ -37,7 +38,10 @@
"src"
],
"scripts": {
"test": "NODE_PATH=.. TZ=UTC ava --timeout 15s"
"test": "NODE_PATH=.. TZ=UTC ava"
},
"license": "MIT"
"license": "MIT",
"ava": {
"timeout": "15s"
}
}
1 change: 0 additions & 1 deletion packages/metascraper-readability/src/index.d.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
/** Options accepted by the factory exported from `src/index.js`. */
type Options = {
  /**
   * Builds the DOM document from the page's URL and HTML.
   * The parameter is named explicitly: `({ url: string }) => …` would be read
   * as a destructuring pattern renaming `url` to `string`, not as a type.
   */
  getDocument?: (input: { url: string, html: string }) => Document,
  /** Options handed to the `Readability` constructor. */
  readabilityOpts?: import('readability').ReadabilityOptions,
}

Expand Down
47 changes: 18 additions & 29 deletions packages/metascraper-readability/src/index.js
Original file line number Diff line number Diff line change
@@ -1,36 +1,25 @@
'use strict'

const { memoizeOne, composeRule } = require('@metascraper/helpers')
const { Readability } = require('@mozilla/readability')

/**
 * Run `reader.parse()` without letting a failure escape.
 *
 * @param {{ parse: Function }} reader - Object exposing a `parse()` method.
 * @returns {object} Whatever `parse()` produced, or an empty object when it
 *   throws or yields `null`/`undefined`.
 */
const parseReader = reader => {
  try {
    // Normalize a nullish result so the "nothing extracted" case has the
    // same shape as the failure path below.
    return reader.parse() ?? {}
  } catch (_) {
    // Deliberate best effort: a parsing failure means "no data", not a crash.
    return {}
  }
}

/**
 * Default document builder: loads `html` into a fresh happy-dom window.
 *
 * @param {object} input
 * @param {string} input.url - Passed to the happy-dom `Window` as its `url` option.
 * @param {string} input.html - Markup assigned to the root element's `innerHTML`.
 * @returns {Document} The window's document after the markup has been assigned.
 */
const defaultGetDocument = ({ url, html }) => {
  // Required on first call rather than at module load.
  const happyDom = require('happy-dom')
  const { document: doc } = new happyDom.Window({ url })
  doc.documentElement.innerHTML = html
  return doc
}

module.exports = ({
getDocument = defaultGetDocument,
readabilityOpts
} = {}) => {
const readability = memoizeOne((url, html, getDocument) => {
const document = getDocument({ url, html })
const reader = new Readability(document, readabilityOpts)
return parseReader(reader)
}, memoizeOne.EqualityFirstArgument)

const asyncMemoizeOne = require('async-memoize-one')
const { Worker } = require('worker_threads')
const path = require('path')

const SCRIPT_PATH = path.resolve(__dirname, 'worker.js')

/**
 * Run the readability extraction for a page inside a dedicated worker thread.
 *
 * The function is wrapped with `asyncMemoizeOne`, using
 * `memoizeOne.EqualityFirstArgument` as comparator (presumably so that a
 * repeated `url` reuses the previous result — confirm against
 * `@metascraper/helpers`).
 *
 * @param {string} url - Page URL, forwarded to the worker.
 * @param {string} html - Page markup, forwarded to the worker.
 * @param {object} [readabilityOpts] - Forwarded to the worker through
 *   `workerData`, so it must be structured-cloneable.
 * @returns {Promise<object>} Resolves with the JSON-decoded message posted by
 *   the worker. Rejects when the worker cannot be started, emits `error`,
 *   posts something that is not valid JSON, or exits without posting anything.
 */
const readability = asyncMemoizeOne(
  (url, html, readabilityOpts) =>
    new Promise((resolve, reject) => {
      // Creating the worker inside the executor turns a synchronous failure
      // (e.g. `workerData` that cannot be cloned) into a rejection.
      const worker = new Worker(SCRIPT_PATH, {
        workerData: { url, html, readabilityOpts }
      })
      worker.once('message', message => {
        try {
          resolve(JSON.parse(message))
        } catch (error) {
          // A throw inside an event listener would otherwise surface as an
          // uncaught exception instead of rejecting the returned promise.
          reject(error)
        }
      })
      worker.once('error', reject)
      // Guarantees the promise settles even if the worker stops before
      // replying; a no-op when it has already been resolved or rejected.
      worker.once('exit', code =>
        reject(
          new Error(
            `readability worker exited with code ${code} before sending a result`
          )
        )
      )
    }),
  memoizeOne.EqualityFirstArgument
)

module.exports = ({ readabilityOpts } = {}) => {
const getReadbility = composeRule(($, url) =>
readability(url, $.html(), getDocument)
readability(url, $.html(), readabilityOpts)
)

const rules = {
Expand Down
28 changes: 28 additions & 0 deletions packages/metascraper-readability/src/worker.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
'use strict'

const { workerData, parentPort } = require('node:worker_threads')
const { Readability } = require('@mozilla/readability')

/**
 * Call `reader.parse()` and always hand back an object.
 *
 * @param {{ parse: Function }} reader - Object exposing a `parse()` method.
 * @returns {object} The value returned by `parse()`, or an empty object when
 *   `parse()` throws or returns `null`/`undefined`.
 */
const parseReader = reader => {
  try {
    // A nullish result is mapped to `{}` so it matches the failure path and
    // the value posted to the parent thread is always an object.
    return reader.parse() ?? {}
  } catch (_) {
    // Deliberate best effort: treat a parsing failure as "nothing extracted".
    return {}
  }
}

/**
 * Build a DOM document by loading `html` into a new happy-dom window.
 *
 * @param {object} input
 * @param {*} input.url - Passed to the happy-dom `Window` as its `url` option.
 * @param {*} input.html - Assigned to the root element's `innerHTML`
 *   (presumably an HTML string — confirm against the caller).
 * @returns {Document} The window's document after the markup has been assigned.
 */
const getDocument = ({ url, html }) => {
  // Required on first call rather than at module load.
  const happyDom = require('happy-dom')
  const { document: doc } = new happyDom.Window({ url })
  doc.documentElement.innerHTML = html
  return doc
}

/**
 * Build a document from the payload and run Readability over it.
 *
 * Being `async`, a synchronous throw while building the document or the
 * reader surfaces as a rejected promise rather than an exception.
 *
 * @param {object} [payload={}]
 * @param {*} [payload.url] - Forwarded to `getDocument`.
 * @param {*} [payload.html] - Forwarded to `getDocument`.
 * @param {object} [payload.readabilityOpts] - Second argument of the
 *   `Readability` constructor.
 * @returns {Promise<*>} Whatever `parseReader` returns for the reader.
 */
const main = async ({ url, html, readabilityOpts } = {}) =>
  parseReader(new Readability(getDocument({ url, html }), readabilityOpts))

// Process the data this worker was started with and reply to the parent
// thread with the JSON-encoded result.
main(workerData)
  .then(result => JSON.stringify(result))
  .then(serialized => parentPort.postMessage(serialized))

0 comments on commit 5478e0e

Please sign in to comment.