From 88a2411e61edf972b4da53a7dd1782640d9398f8 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Thu, 30 Jan 2025 14:56:15 +0000
Subject: [PATCH] Add connection timeout to S3 and Axios calls, and
 streamline/generalize axios options

---
 src/Downloader.ts            | 63 ++++++++++++++----------------------
 src/MediaWiki.ts             |  1 +
 src/S3.ts                    |  1 +
 src/mwoffliner.lib.ts        |  6 ++--
 src/sanitize-argument.ts     |  2 +-
 src/util/dump.ts             | 10 ++----
 src/util/misc.ts             | 10 +++---
 test/unit/downloader.test.ts | 15 ++++++---
 test/unit/s3.test.ts         | 30 +++++++++++------
 test/unit/util.test.ts       | 26 ++++++++++-----
 10 files changed, 86 insertions(+), 78 deletions(-)

diff --git a/src/Downloader.ts b/src/Downloader.ts
index 94000188..f4d669b1 100644
--- a/src/Downloader.ts
+++ b/src/Downloader.ts
@@ -73,18 +73,6 @@ interface CompressionData {
   data: any
 }
 
-export const defaultStreamRequestOptions: AxiosRequestConfig = {
-  headers: {
-    accept: 'application/octet-stream',
-    'cache-control': 'public, max-stale=86400',
-    'accept-encoding': 'gzip, deflate',
-    'user-agent': config.userAgent,
-  },
-  responseType: 'stream',
-  timeout: config.defaults.requestTimeout,
-  method: 'GET',
-}
-
 type URLDirector = WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector | RestApiURLDirector
 /**
  * Downloader is a class providing content retrieval functionalities for both Mediawiki and S3 remote instances.
@@ -95,9 +83,11 @@ class Downloader {
   public cssDependenceUrls: KVS = {}
   public readonly webp: boolean = false
   public readonly requestTimeout: number
-  public arrayBufferRequestOptions: AxiosRequestConfig
-  public jsonRequestOptions: AxiosRequestConfig
-  public streamRequestOptions: AxiosRequestConfig
+  public readonly abortSignal: AbortSignal
+  public readonly basicRequestOptions: AxiosRequestConfig
+  public readonly arrayBufferRequestOptions: AxiosRequestConfig
+  public readonly jsonRequestOptions: AxiosRequestConfig
+  public readonly streamRequestOptions: AxiosRequestConfig
   public wikimediaMobileJsDependenciesList: string[] = []
   public wikimediaMobileStyleDependenciesList: string[] = []
 
@@ -125,6 +115,8 @@ class Downloader {
     this.apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href)
     this.insecure = insecure
 
+    this.abortSignal = AbortSignal.timeout(this.requestTimeout)
+
     this.backoffOptions = {
       strategy: new backoff.ExponentialStrategy(),
       failAfter: 7,
@@ -135,53 +127,48 @@ class Downloader {
       ...backoffOptions,
     }
 
-    this.arrayBufferRequestOptions = {
+    this.basicRequestOptions = {
       // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
       httpAgent: new http.Agent({ keepAlive: true }),
       httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }), // rejectUnauthorized: false disables TLS
-
+      timeout: this.requestTimeout,
+      signal: this.abortSignal,
       headers: {
         'cache-control': 'public, max-stale=86400',
         'user-agent': this.uaString,
         cookie: this.loginCookie,
       },
-      responseType: 'arraybuffer',
-      timeout: this.requestTimeout,
-      method: 'GET',
       validateStatus(status) {
         return (status >= 200 && status < 300) || status === 304
       },
     }
 
-    this.jsonRequestOptions = {
-      // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
-      httpAgent: new http.Agent({ keepAlive: true }),
-      httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }),
+    this.arrayBufferRequestOptions = {
+      ...this.basicRequestOptions,
+      responseType: 'arraybuffer',
+      method: 'GET',
+    }
 
+    this.jsonRequestOptions = {
+      ...this.basicRequestOptions,
       headers: {
+        ...this.basicRequestOptions.headers,
         accept: 'application/json',
-        'cache-control': 'public, max-stale=86400',
         'accept-encoding': 'gzip, deflate',
-        'user-agent': this.uaString,
-        cookie: this.loginCookie,
       },
       responseType: 'json',
-      timeout: this.requestTimeout,
       method: 'GET',
     }
 
     this.streamRequestOptions = {
-      // HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
-      ...defaultStreamRequestOptions,
-      httpAgent: new http.Agent({ keepAlive: true }),
-      httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }),
-
+      ...this.basicRequestOptions,
       headers: {
-        ...defaultStreamRequestOptions.headers,
-        'user-agent': this.uaString,
-        cookie: this.loginCookie,
+        ...this.basicRequestOptions.headers,
+        accept: 'application/octet-stream',
+        'accept-encoding': 'gzip, deflate',
       },
-      timeout: this.requestTimeout,
+      responseType: 'stream',
+      method: 'GET',
     }
   }
@@ -405,7 +392,7 @@ class Downloader {
   public async canGetUrl(url: string): Promise<boolean> {
     try {
-      await axios.get(url)
+      await axios.get(url, this.arrayBufferRequestOptions)
       return true
     } catch (err) {
       return false
     }
diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts
index e4045203..c9bedd51 100644
--- a/src/MediaWiki.ts
+++ b/src/MediaWiki.ts
@@ -274,6 +274,7 @@ class MediaWiki {
 
     // Logging in
     await axios(this.actionApiUrl.href, {
+      ...downloader.arrayBufferRequestOptions,
       data: qs.stringify({
         action: 'login',
         format: 'json',
diff --git a/src/S3.ts b/src/S3.ts
index 3e8d1044..ab1b36d0 100644
--- a/src/S3.ts
+++ b/src/S3.ts
@@ -49,6 +49,7 @@ class S3 {
       forcePathStyle: s3UrlBase.protocol === 'http:',
       region: this.region,
       requestHandler: new NodeHttpHandler({
+        connectionTimeout: this.reqTimeout,
         requestTimeout: this.reqTimeout,
         httpAgent: new Agent({ keepAlive: true }),
         httpsAgent: new Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }), // rejectUnauthorized: false disables TLS
diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts
index a0ed0188..c2a69e2c 100644
--- a/src/mwoffliner.lib.ts
+++ b/src/mwoffliner.lib.ts
@@ -265,7 +265,7 @@ async function execute(argv: any) {
   let articleListToIgnoreLines: string[]
   if (articleListToIgnore) {
     try {
-      articleListToIgnoreLines = await extractArticleList(articleListToIgnore)
+      articleListToIgnoreLines = await extractArticleList(articleListToIgnore, downloader)
       logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`)
     } catch (err) {
       logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err)
@@ -276,7 +276,7 @@ async function execute(argv: any) {
   let articleListLines: string[]
   if (articleList) {
     try {
-      articleListLines = await extractArticleList(articleList)
+      articleListLines = await extractArticleList(articleList, downloader)
       if (articleListToIgnore) {
         articleListLines = articleListLines.filter((title: string) => !articleListToIgnoreLines.includes(title))
       }
@@ -434,7 +434,7 @@ async function execute(argv: any) {
 
     if (downloader.webp) {
       logger.log('Downloading polyfill module')
-      await importPolyfillModules(zimCreator)
+      await importPolyfillModules(downloader, zimCreator)
     }
 
     logger.log('Downloading module dependencies')
diff --git a/src/sanitize-argument.ts b/src/sanitize-argument.ts
index 1fcb793e..00f6721b 100644
--- a/src/sanitize-argument.ts
+++ b/src/sanitize-argument.ts
@@ -68,7 +68,7 @@ export async function sanitize_all(argv: any) {
     const s3UrlObj = urlParser.parse(optimisationCacheUrl)
     const queryReader = QueryStringParser.parse(s3UrlObj.query)
     const s3Url = (s3UrlObj.protocol || 'https:') + '//' + (s3UrlObj.host || '') + (s3UrlObj.pathname || '')
-    const s3Obj = new S3(s3Url, queryReader)
+    const s3Obj = new S3(s3Url, queryReader, 1000 * 60, argv.insecure)
     await s3Obj.initialise().then(() => {
       logger.log('Successfully logged in S3')
     })
diff --git a/src/util/dump.ts b/src/util/dump.ts
index 999dcef0..e18d6048 100644
--- a/src/util/dump.ts
+++ b/src/util/dump.ts
@@ -167,7 +167,7 @@ export async function downloadAndSaveModule(zimCreator: ZimCreator, downloader:
 }
 
 // URLs should be kept the same as Kiwix JS relies on it.
-export async function importPolyfillModules(zimCreator: ZimCreator) {
+export async function importPolyfillModules(downloader: Downloader, zimCreator: ZimCreator) {
   ;[
     { name: 'webpHeroPolyfill', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/polyfills.js') },
     { name: 'webpHeroBundle', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/webp-hero.bundle.js') },
@@ -181,13 +181,7 @@ export async function importPolyfillModules(zimCreator: ZimCreator) {
   })
 
   const content = await axios
-    .get(WEBP_HANDLER_URL, {
-      responseType: 'arraybuffer',
-      timeout: 60000,
-      validateStatus(status) {
-        return [200, 302, 304].indexOf(status) > -1
-      },
-    })
+    .get(WEBP_HANDLER_URL, downloader.arrayBufferRequestOptions)
     .then((a) => a.data)
     .catch((err) => {
       throw new Error(`Failed to download webpHandler from [${WEBP_HANDLER_URL}]: ${err}`)
diff --git a/src/util/misc.ts b/src/util/misc.ts
index 7f2ccd80..f20efcd5 100644
--- a/src/util/misc.ts
+++ b/src/util/misc.ts
@@ -1,6 +1,6 @@
 import crypto from 'crypto'
 import domino from 'domino'
-import { defaultStreamRequestOptions } from '../Downloader.js'
+import Downloader from '../Downloader.js'
 import countryLanguage from '@ladjs/country-language'
 import fs from 'fs'
 import path from 'path'
@@ -384,9 +384,9 @@ export function cleanupAxiosError(err: AxiosError) {
   return { name: err.name, message: err.message, url: err.config?.url, status: err.response?.status, responseType: err.config?.responseType, data: err.response?.data }
 }
 
-async function downloadListByUrl(url: string): Promise<string> {
+async function downloadListByUrl(url: string, downloader: Downloader): Promise<string> {
   const fileName = url.split('/').slice(-1)[0]
-  const { data: contentStream } = await axios.get(url, defaultStreamRequestOptions)
+  const { data: contentStream } = await axios.get(url, downloader.streamRequestOptions)
   const filePath = path.join(await getTmpDirectory(), fileName)
   const writeStream = fs.createWriteStream(filePath)
   await new Promise((resolve, reject) => {
@@ -398,7 +398,7 @@ async function downloadListByUrl(url: string): Promise<string> {
   return filePath
 }
 
-export async function extractArticleList(articleList: string): Promise<string[]> {
+export async function extractArticleList(articleList: string, downloader: Downloader): Promise<string[]> {
   const list = await Promise.all(
     articleList
       .split(',')
@@ -414,7 +414,7 @@ export async function extractArticleList(articleList: string): Promise<string[]> {
         }
         if (url && url.href) {
           try {
-            item = await downloadListByUrl(url.href)
+            item = await downloadListByUrl(url.href, downloader)
           } catch (e) {
             throw new Error(`Failed to read articleList from URL: ${url.href}`)
           }
diff --git a/test/unit/downloader.test.ts b/test/unit/downloader.test.ts
index fa657aa1..e59d143d 100644
--- a/test/unit/downloader.test.ts
+++ b/test/unit/downloader.test.ts
@@ -289,11 +289,16 @@ describe('Downloader class', () => {
     MediaWiki.base = 'https://en.wikipedia.org'
     MediaWiki.getCategories = true
 
-    s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
-      bucketName: s3UrlObj.query.bucketName,
-      keyId: s3UrlObj.query.keyId,
-      secretAccessKey: s3UrlObj.query.secretAccessKey,
-    })
+    s3 = new S3(
+      `${s3UrlObj.protocol}//${s3UrlObj.host}/`,
+      {
+        bucketName: s3UrlObj.query.bucketName,
+        keyId: s3UrlObj.query.keyId,
+        secretAccessKey: s3UrlObj.query.secretAccessKey,
+      },
+      1000 * 60,
+      false,
+    )
     downloader = new Downloader({
       uaString: `${config.userAgent} (contact@kiwix.org)`,
       speed: 1,
diff --git a/test/unit/s3.test.ts b/test/unit/s3.test.ts
index 593065b9..2e5b89f7 100644
--- a/test/unit/s3.test.ts
+++ b/test/unit/s3.test.ts
@@ -10,11 +10,16 @@ describeIf('S3', () => {
   test('S3 checks', async () => {
     const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true)
 
-    const s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
-      bucketName: s3UrlObj.query.bucketName,
-      keyId: s3UrlObj.query.keyId,
-      secretAccessKey: s3UrlObj.query.secretAccessKey,
-    })
+    const s3 = new S3(
+      `${s3UrlObj.protocol}//${s3UrlObj.host}/`,
+      {
+        bucketName: s3UrlObj.query.bucketName,
+        keyId: s3UrlObj.query.keyId,
+        secretAccessKey: s3UrlObj.query.secretAccessKey,
+      },
+      1000 * 60,
+      false,
+    )
 
     const credentialExists = await s3.initialise()
     // Credentials on S3 exists
@@ -48,11 +53,16 @@ describeIf('S3', () => {
 
     expect(
       () =>
-        new S3(`${wrongS3UrlObj.protocol}//${wrongS3UrlObj.host}/`, {
-          bucketName: wrongS3UrlObj.query.bucketName,
-          keyId: wrongS3UrlObj.query.keyId,
-          secretAccessKey: wrongS3UrlObj.query.secretAccessKey,
-        }),
+        new S3(
+          `${wrongS3UrlObj.protocol}//${wrongS3UrlObj.host}/`,
+          {
+            bucketName: wrongS3UrlObj.query.bucketName,
+            keyId: wrongS3UrlObj.query.keyId,
+            secretAccessKey: wrongS3UrlObj.query.secretAccessKey,
+          },
+          1000 * 60,
+          false,
+        ),
     ).toThrow('Unknown S3 region set')
   })
 })
diff --git a/test/unit/util.test.ts b/test/unit/util.test.ts
index 705df9b5..1ecee5b9 100644
--- a/test/unit/util.test.ts
+++ b/test/unit/util.test.ts
@@ -22,6 +22,9 @@ import { fileURLToPath } from 'url'
 import { jest } from '@jest/globals'
 import fs from 'fs'
 import rimraf from 'rimraf'
+import Downloader from '../../src/Downloader.js'
+import MediaWiki from '../../src/MediaWiki.js'
+import { config } from '../../src/config.js'
 
 jest.setTimeout(10000)
 
@@ -28,7 +31,14 @@
 const __filename = fileURLToPath(import.meta.url)
 const __dirname = path.dirname(__filename)
 
 describe('Utils', () => {
+  let downloader: Downloader
+
+  MediaWiki.base = 'https://en.wikipedia.org' // Mandatory setting for proper downloader initialization
+  beforeAll(async () => {
+    downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' })
+  })
+
   test('util -> interpolateTranslationString', async () => {
     expect(interpolateTranslationString('Hello world', {})).toEqual('Hello world')
     expect(interpolateTranslationString('Hello ${name}', { name: 'John' })).toEqual('Hello John')
@@ -283,22 +293,22 @@
   })
 
   test('One string as parameter', async () => {
-    const result: string[] = await extractArticleList('testString')
+    const result: string[] = await extractArticleList('testString', downloader)
     expect(result).toEqual(['testString'])
   })
 
   test('Comma separated strings as parameter', async () => {
-    const result: string[] = await extractArticleList(argumentsList.join(','))
+    const result: string[] = await extractArticleList(argumentsList.join(','), downloader)
    expect(result).toEqual(argumentsList)
   })
 
   test('Filename string as parameter', async () => {
-    const result: string[] = await extractArticleList(filePath)
+    const result: string[] = await extractArticleList(filePath, downloader)
     expect(result).toEqual(argumentsList)
   })
 
   test('Comma separated filenames string as parameter', async () => {
-    const result: string[] = await extractArticleList(`${filePath},${anotherFilePath}`)
+    const result: string[] = await extractArticleList(`${filePath},${anotherFilePath}`, downloader)
     expect(result.sort()).toEqual(argumentsList.concat(anotherArgumentsList))
   })
 
@@ -306,7 +316,7 @@
     jest.spyOn(axios, 'get').mockResolvedValue({
       data: fs.createReadStream(filePath),
     })
-    const result: string[] = await extractArticleList('http://test.com/strings')
+    const result: string[] = await extractArticleList('http://test.com/strings', downloader)
     expect(result).toEqual(argumentsList)
   })
 
@@ -317,18 +327,18 @@
     jest.spyOn(axios, 'get').mockResolvedValueOnce({
       data: fs.createReadStream(filePath),
     })
     jest.spyOn(axios, 'get').mockResolvedValueOnce({
       data: fs.createReadStream(anotherFilePath),
     })
-    const result: string[] = await extractArticleList('http://test.com/strings,http://test.com/another-strings')
+    const result: string[] = await extractArticleList('http://test.com/strings,http://test.com/another-strings', downloader)
     expect(result.sort()).toEqual(argumentsList.concat(anotherArgumentsList))
   })
 
   test('The parameter starts from HTTP but it is not the URL', async () => {
-    const result: string[] = await extractArticleList('http-test')
+    const result: string[] = await extractArticleList('http-test', downloader)
     expect(result).toEqual(['http-test'])
   })
 
   test('Error if trying to get articleList from wrong URL ', async () => {
     jest.spyOn(axios, 'get').mockRejectedValue({})
-    await expect(extractArticleList('http://valid-wrong-url.com/')).rejects.toThrow('Failed to read articleList from URL: http://valid-wrong-url.com/')
+    await expect(extractArticleList('http://valid-wrong-url.com/', downloader)).rejects.toThrow('Failed to read articleList from URL: http://valid-wrong-url.com/')
   })
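
Note on the options-layering pattern introduced in Downloader.ts (a minimal standalone sketch, not part of the patch): object spread in TypeScript is shallow, which is why each derived config re-spreads the nested headers object. Without `...this.basicRequestOptions.headers`, a derived `headers: { accept: ... }` would replace the shared user-agent, cookie, and cache-control headers instead of extending them. The sketch assumes axios >= 0.22 (for `signal` support) and Node.js >= 17.3 (for `AbortSignal.timeout`); the constant names, timeout value, and URL below are illustrative only:

    import axios from 'axios'
    import type { AxiosRequestConfig } from 'axios'

    const requestTimeout = 60000 // illustrative; stands in for the reqTimeout wired through above

    // Shared base config: timeout, abort signal, and headers common to every request type.
    const basic: AxiosRequestConfig = {
      timeout: requestTimeout, // axios' own inactivity timeout
      signal: AbortSignal.timeout(requestTimeout), // hard abort after the deadline
      headers: { 'user-agent': 'example-ua', 'cache-control': 'public, max-stale=86400' },
    }

    // Derived config: spread the base, then re-spread its headers before adding
    // type-specific ones, so the shared headers survive the shallow merge.
    const json: AxiosRequestConfig = {
      ...basic,
      headers: { ...basic.headers, accept: 'application/json' },
      responseType: 'json',
      method: 'GET',
    }

    axios.get('https://example.org/api', json).then((res) => console.log(res.status))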