Add connection timeout to S3 and Axios calls, and streamline/generalize axios options
benoit74 committed Jan 30, 2025
1 parent 8ba3c86 commit 88a2411
Showing 10 changed files with 86 additions and 78 deletions.
63 changes: 25 additions & 38 deletions src/Downloader.ts
@@ -73,18 +73,6 @@ interface CompressionData {
data: any
}

export const defaultStreamRequestOptions: AxiosRequestConfig = {
headers: {
accept: 'application/octet-stream',
'cache-control': 'public, max-stale=86400',
'accept-encoding': 'gzip, deflate',
'user-agent': config.userAgent,
},
responseType: 'stream',
timeout: config.defaults.requestTimeout,
method: 'GET',
}

type URLDirector = WikimediaDesktopURLDirector | WikimediaMobileURLDirector | VisualEditorURLDirector | RestApiURLDirector
/**
* Downloader is a class providing content retrieval functionalities for both Mediawiki and S3 remote instances.
@@ -95,9 +83,11 @@ class Downloader {
public cssDependenceUrls: KVS<boolean> = {}
public readonly webp: boolean = false
public readonly requestTimeout: number
public arrayBufferRequestOptions: AxiosRequestConfig
public jsonRequestOptions: AxiosRequestConfig
public streamRequestOptions: AxiosRequestConfig
public readonly abortSignal: AbortSignal
public readonly basicRequestOptions: AxiosRequestConfig
public readonly arrayBufferRequestOptions: AxiosRequestConfig
public readonly jsonRequestOptions: AxiosRequestConfig
public readonly streamRequestOptions: AxiosRequestConfig
public wikimediaMobileJsDependenciesList: string[] = []
public wikimediaMobileStyleDependenciesList: string[] = []

@@ -125,6 +115,8 @@ class Downloader {
this.apiUrlDirector = new ApiURLDirector(MediaWiki.actionApiUrl.href)
this.insecure = insecure

this.abortSignal = AbortSignal.timeout(this.requestTimeout)

this.backoffOptions = {
strategy: new backoff.ExponentialStrategy(),
failAfter: 7,
@@ -135,53 +127,48 @@
...backoffOptions,
}

this.arrayBufferRequestOptions = {
this.basicRequestOptions = {
// HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
httpAgent: new http.Agent({ keepAlive: true }),
httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }), // rejectUnauthorized: false disables TLS

timeout: this.requestTimeout,
signal: this.abortSignal,
headers: {
'cache-control': 'public, max-stale=86400',
'user-agent': this.uaString,
cookie: this.loginCookie,
},
responseType: 'arraybuffer',
timeout: this.requestTimeout,
method: 'GET',
validateStatus(status) {
return (status >= 200 && status < 300) || status === 304
},
}

this.jsonRequestOptions = {
// HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
httpAgent: new http.Agent({ keepAlive: true }),
httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }),
this.arrayBufferRequestOptions = {
...this.basicRequestOptions,
responseType: 'arraybuffer',
method: 'GET',
}

this.jsonRequestOptions = {
...this.basicRequestOptions,
headers: {
...this.basicRequestOptions.headers,
accept: 'application/json',
'cache-control': 'public, max-stale=86400',
'accept-encoding': 'gzip, deflate',
'user-agent': this.uaString,
cookie: this.loginCookie,
},
responseType: 'json',
timeout: this.requestTimeout,
method: 'GET',
}

this.streamRequestOptions = {
// HTTP agent pools with 'keepAlive' to reuse TCP connections, so it's faster
...defaultStreamRequestOptions,
httpAgent: new http.Agent({ keepAlive: true }),
httpsAgent: new https.Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }),

...this.basicRequestOptions,
headers: {
...defaultStreamRequestOptions.headers,
'user-agent': this.uaString,
cookie: this.loginCookie,
...this.basicRequestOptions.headers,
accept: 'application/octet-stream',
'accept-encoding': 'gzip, deflate',
},
timeout: this.requestTimeout,
responseType: 'stream',
method: 'GET',
}
}

@@ -405,7 +392,7 @@ class Downloader {

public async canGetUrl(url: string): Promise<boolean> {
try {
await axios.get(url)
await axios.get(url, this.arrayBufferRequestOptions)
return true
} catch (err) {
return false
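The shape of this refactor is easier to read as plain code than as a diff: one base object now carries the agents, timeouts, abort signal, and common headers, and every specialized config spreads it and overrides only what differs. A minimal self-contained sketch of the pattern, with illustrative stand-ins for the class fields (`requestTimeout`, `uaString`):

```ts
import http from 'http'
import https from 'https'
import type { AxiosRequestConfig } from 'axios'

const requestTimeout = 1000 * 60 // stand-in for this.requestTimeout
const uaString = 'example-agent' // stand-in for this.uaString

const basicRequestOptions: AxiosRequestConfig = {
  // Keep-alive agents reuse TCP connections across requests
  httpAgent: new http.Agent({ keepAlive: true }),
  httpsAgent: new https.Agent({ keepAlive: true }),
  timeout: requestTimeout, // axios response timeout
  signal: AbortSignal.timeout(requestTimeout), // hard abort; created once and shared, as in the diff
  headers: { 'cache-control': 'public, max-stale=86400', 'user-agent': uaString },
  validateStatus: (status: number) => (status >= 200 && status < 300) || status === 304,
}

// Specialized configs only state their differences; later keys win on conflict.
const jsonRequestOptions: AxiosRequestConfig = {
  ...basicRequestOptions,
  headers: { ...basicRequestOptions.headers, accept: 'application/json', 'accept-encoding': 'gzip, deflate' },
  responseType: 'json',
  method: 'GET',
}
```

Note that a plain spread is shallow: `headers` has to be re-spread explicitly (as the diff does with `...this.basicRequestOptions.headers`), otherwise a per-config headers object would replace the shared one rather than extend it.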
1 change: 1 addition & 0 deletions src/MediaWiki.ts
@@ -274,6 +274,7 @@ class MediaWiki {

// Logging in
await axios(this.actionApiUrl.href, {
...downloader.arrayBufferRequestOptions,
data: qs.stringify({
action: 'login',
format: 'json',
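The login request now inherits the shared options and overrides only its per-call fields. A small sketch of that spread-and-override, using `URLSearchParams` in place of the repo's `qs.stringify` and assuming (the call is truncated above) that the login call overrides the inherited GET method with POST:

```ts
import axios, { AxiosRequestConfig } from 'axios'

// Stand-in for downloader.arrayBufferRequestOptions
const shared: AxiosRequestConfig = { method: 'GET', responseType: 'arraybuffer', timeout: 1000 * 60 }

async function login(actionApiUrl: string, lgname: string, lgpassword: string, lgtoken: string) {
  return axios(actionApiUrl, {
    ...shared,
    data: new URLSearchParams({ action: 'login', format: 'json', lgname, lgpassword, lgtoken }).toString(),
    method: 'POST', // assumption: overrides the GET inherited from the spread
  })
}
```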
1 change: 1 addition & 0 deletions src/S3.ts
@@ -49,6 +49,7 @@ class S3 {
forcePathStyle: s3UrlBase.protocol === 'http:',
region: this.region,
requestHandler: new NodeHttpHandler({
connectionTimeout: this.reqTimeout,
requestTimeout: this.reqTimeout,
httpAgent: new Agent({ keepAlive: true }),
httpsAgent: new Agent({ keepAlive: true, rejectUnauthorized: !this.insecure }), // rejectUnauthorized: false disables TLS
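`connectionTimeout` and `requestTimeout` guard two different phases: the former bounds establishing the TCP/TLS connection, the latter the exchange once connected, so a hung connect is no longer bounded only by OS defaults. A self-contained sketch with placeholder endpoint and region (the handler package is `@aws-sdk/node-http-handler` on older SDK v3 releases):

```ts
import { S3Client } from '@aws-sdk/client-s3'
import { NodeHttpHandler } from '@smithy/node-http-handler'
import { Agent } from 'https'

const reqTimeout = 1000 * 60 // illustrative; the S3 class passes its own reqTimeout

const client = new S3Client({
  endpoint: 'https://s3.example.org', // placeholder
  region: 'us-east-1', // placeholder
  requestHandler: new NodeHttpHandler({
    connectionTimeout: reqTimeout, // new: bounds the TCP/TLS handshake
    requestTimeout: reqTimeout, // as before: bounds the request/response exchange
    httpsAgent: new Agent({ keepAlive: true }),
  }),
})
```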
6 changes: 3 additions & 3 deletions src/mwoffliner.lib.ts
@@ -265,7 +265,7 @@ async function execute(argv: any) {
let articleListToIgnoreLines: string[]
if (articleListToIgnore) {
try {
articleListToIgnoreLines = await extractArticleList(articleListToIgnore)
articleListToIgnoreLines = await extractArticleList(articleListToIgnore, downloader)
logger.info(`ArticleListToIgnore has [${articleListToIgnoreLines.length}] items`)
} catch (err) {
logger.error(`Failed to read articleListToIgnore from [${articleListToIgnore}]`, err)
@@ -276,7 +276,7 @@ async function execute(argv: any) {
let articleListLines: string[]
if (articleList) {
try {
articleListLines = await extractArticleList(articleList)
articleListLines = await extractArticleList(articleList, downloader)
if (articleListToIgnore) {
articleListLines = articleListLines.filter((title: string) => !articleListToIgnoreLines.includes(title))
}
@@ -434,7 +434,7 @@ async function execute(argv: any) {

if (downloader.webp) {
logger.log('Downloading polyfill module')
await importPolyfillModules(zimCreator)
await importPolyfillModules(downloader, zimCreator)
}

logger.log('Downloading module dependencies')
2 changes: 1 addition & 1 deletion src/sanitize-argument.ts
@@ -68,7 +68,7 @@ export async function sanitize_all(argv: any) {
const s3UrlObj = urlParser.parse(optimisationCacheUrl)
const queryReader = QueryStringParser.parse(s3UrlObj.query)
const s3Url = (s3UrlObj.protocol || 'https:') + '//' + (s3UrlObj.host || '') + (s3UrlObj.pathname || '')
const s3Obj = new S3(s3Url, queryReader)
const s3Obj = new S3(s3Url, queryReader, 1000 * 60, argv.insecure)
await s3Obj.initialise().then(() => {
logger.log('Successfully logged in S3')
})
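As the updated call sites show, the `S3` constructor now takes the request timeout and the TLS-verification flag positionally. A usage sketch with placeholder values (parameter names inferred from the call sites in this commit):

```ts
import S3 from './S3.js' // path as within src/

// new S3(url, params, reqTimeout, insecure)
const s3Obj = new S3(
  'https://s3.example.org/', // placeholder endpoint
  { bucketName: 'cache', keyId: 'key', secretAccessKey: 'secret' }, // placeholder credentials
  1000 * 60, // request/connection timeout in ms
  false, // insecure: true would disable TLS certificate checks
)
```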
10 changes: 2 additions & 8 deletions src/util/dump.ts
@@ -167,7 +167,7 @@ export async function downloadAndSaveModule(zimCreator: ZimCreator, downloader:
}

// URLs should be kept the same as Kiwix JS relies on it.
export async function importPolyfillModules(zimCreator: ZimCreator) {
export async function importPolyfillModules(downloader: Downloader, zimCreator: ZimCreator) {
;[
{ name: 'webpHeroPolyfill', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/polyfills.js') },
{ name: 'webpHeroBundle', path: path.join(__dirname, '../../node_modules/webp-hero/dist-cjs/webp-hero.bundle.js') },
@@ -181,13 +181,7 @@ export async function importPolyfillModules(zimCreator: ZimCreator) {
})

const content = await axios
.get(WEBP_HANDLER_URL, {
responseType: 'arraybuffer',
timeout: 60000,
validateStatus(status) {
return [200, 302, 304].indexOf(status) > -1
},
})
.get(WEBP_HANDLER_URL, downloader.arrayBufferRequestOptions)
.then((a) => a.data)
.catch((err) => {
throw new Error(`Failed to download webpHandler from [${WEBP_HANDLER_URL}]: ${err}`)
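A side effect worth noting: the webP-handler download now inherits the shared `validateStatus` from `basicRequestOptions`, which accepts any 2xx or 304 but no longer whitelists 302 the way the deleted inline options did (axios follows redirects itself by default, so a surfaced 302 was already unusual). The fixed 60 s timeout is likewise replaced by the downloader's configured request timeout. The two predicates compare like this:

```ts
// Old, inline in importPolyfillModules: explicit status whitelist.
const oldValidate = (status: number) => [200, 302, 304].indexOf(status) > -1

// New, inherited from basicRequestOptions: any 2xx, plus 304.
const newValidate = (status: number) => (status >= 200 && status < 300) || status === 304

console.log(oldValidate(302), newValidate(302)) // true false
console.log(oldValidate(204), newValidate(204)) // false true
```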
10 changes: 5 additions & 5 deletions src/util/misc.ts
@@ -1,6 +1,6 @@
import crypto from 'crypto'
import domino from 'domino'
import { defaultStreamRequestOptions } from '../Downloader.js'
import Downloader from '../Downloader.js'
import countryLanguage from '@ladjs/country-language'
import fs from 'fs'
import path from 'path'
@@ -384,9 +384,9 @@ export function cleanupAxiosError(err: AxiosError) {
return { name: err.name, message: err.message, url: err.config?.url, status: err.response?.status, responseType: err.config?.responseType, data: err.response?.data }
}

async function downloadListByUrl(url: string): Promise<string> {
async function downloadListByUrl(url: string, downloader: Downloader): Promise<string> {
const fileName = url.split('/').slice(-1)[0]
const { data: contentStream } = await axios.get(url, defaultStreamRequestOptions)
const { data: contentStream } = await axios.get(url, downloader.streamRequestOptions)
const filePath = path.join(await getTmpDirectory(), fileName)
const writeStream = fs.createWriteStream(filePath)
await new Promise((resolve, reject) => {
@@ -398,7 +398,7 @@ async function downloadListByUrl(url: string): Promise<string> {
return filePath
}

export async function extractArticleList(articleList: string): Promise<string[]> {
export async function extractArticleList(articleList: string, downloader: Downloader): Promise<string[]> {
const list = await Promise.all(
articleList
.split(',')
@@ -414,7 +414,7 @@ export async function extractArticleList(articleList: string): Promise<string[]>
}
if (url && url.href) {
try {
item = await downloadListByUrl(url.href)
item = await downloadListByUrl(url.href, downloader)
} catch (e) {
throw new Error(`Failed to read articleList from URL: ${url.href}`)
}
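`downloadListByUrl` keeps its streaming shape; only the options object now comes from the downloader. A self-contained sketch of the pattern it relies on (illustrative URL handling, with `os.tmpdir()` standing in for `getTmpDirectory()`):

```ts
import axios from 'axios'
import fs from 'fs'
import os from 'os'
import path from 'path'

async function downloadToFile(url: string): Promise<string> {
  // responseType: 'stream' yields a readable stream instead of buffering the body
  const { data: contentStream } = await axios.get(url, { responseType: 'stream', timeout: 1000 * 60 })
  const filePath = path.join(os.tmpdir(), url.split('/').slice(-1)[0])
  const writeStream = fs.createWriteStream(filePath)
  await new Promise<void>((resolve, reject) => {
    // 'finish' fires on the write side once all piped data is flushed to disk
    contentStream.pipe(writeStream).on('finish', () => resolve()).on('error', reject)
  })
  return filePath
}
```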
15 changes: 10 additions & 5 deletions test/unit/downloader.test.ts
@@ -289,11 +289,16 @@ describe('Downloader class', () => {
MediaWiki.base = 'https://en.wikipedia.org'
MediaWiki.getCategories = true

s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
bucketName: s3UrlObj.query.bucketName,
keyId: s3UrlObj.query.keyId,
secretAccessKey: s3UrlObj.query.secretAccessKey,
})
s3 = new S3(
`${s3UrlObj.protocol}//${s3UrlObj.host}/`,
{
bucketName: s3UrlObj.query.bucketName,
keyId: s3UrlObj.query.keyId,
secretAccessKey: s3UrlObj.query.secretAccessKey,
},
1000 * 60,
false,
)
downloader = new Downloader({
uaString: `${config.userAgent} ([email protected])`,
speed: 1,
30 changes: 20 additions & 10 deletions test/unit/s3.test.ts
@@ -10,11 +10,16 @@ describeIf('S3', () => {
test('S3 checks', async () => {
const s3UrlObj = urlParser.parse(`${process.env.S3_URL}`, true)

const s3 = new S3(`${s3UrlObj.protocol}//${s3UrlObj.host}/`, {
bucketName: s3UrlObj.query.bucketName,
keyId: s3UrlObj.query.keyId,
secretAccessKey: s3UrlObj.query.secretAccessKey,
})
const s3 = new S3(
`${s3UrlObj.protocol}//${s3UrlObj.host}/`,
{
bucketName: s3UrlObj.query.bucketName,
keyId: s3UrlObj.query.keyId,
secretAccessKey: s3UrlObj.query.secretAccessKey,
},
1000 * 60,
false,
)

const credentialExists = await s3.initialise()
// Credentials on S3 exists
@@ -48,11 +53,16 @@

expect(
() =>
new S3(`${wrongS3UrlObj.protocol}//${wrongS3UrlObj.host}/`, {
bucketName: wrongS3UrlObj.query.bucketName,
keyId: wrongS3UrlObj.query.keyId,
secretAccessKey: wrongS3UrlObj.query.secretAccessKey,
}),
new S3(
`${wrongS3UrlObj.protocol}//${wrongS3UrlObj.host}/`,
{
bucketName: wrongS3UrlObj.query.bucketName,
keyId: wrongS3UrlObj.query.keyId,
secretAccessKey: wrongS3UrlObj.query.secretAccessKey,
},
1000 * 60,
false,
),
).toThrow('Unknown S3 region set')
})
})
26 changes: 18 additions & 8 deletions test/unit/util.test.ts
@@ -22,13 +22,23 @@ import { fileURLToPath } from 'url'
import { jest } from '@jest/globals'
import fs from 'fs'
import rimraf from 'rimraf'
import Downloader from '../../src/Downloader.js'
import MediaWiki from '../../src/MediaWiki.js'
import { config } from '../../src/config.js'

jest.setTimeout(10000)

const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

describe('Utils', () => {
let downloader: Downloader

MediaWiki.base = 'https://en.wikipedia.org' // Mandatory setting for proper downloader initialization
beforeAll(async () => {
downloader = new Downloader({ uaString: `${config.userAgent} ([email protected])`, speed: 1, reqTimeout: 1000 * 60, webp: true, optimisationCacheUrl: '' })
})

test('util -> interpolateTranslationString', async () => {
expect(interpolateTranslationString('Hello world', {})).toEqual('Hello world')
expect(interpolateTranslationString('Hello ${name}', { name: 'John' })).toEqual('Hello John')
@@ -283,30 +293,30 @@ describe('Utils', () => {
})

test('One string as parameter', async () => {
const result: string[] = await extractArticleList('testString')
const result: string[] = await extractArticleList('testString', downloader)
expect(result).toEqual(['testString'])
})

test('Comma separated strings as parameter', async () => {
const result: string[] = await extractArticleList(argumentsList.join(','))
const result: string[] = await extractArticleList(argumentsList.join(','), downloader)
expect(result).toEqual(argumentsList)
})

test('Filename string as parameter', async () => {
const result: string[] = await extractArticleList(filePath)
const result: string[] = await extractArticleList(filePath, downloader)
expect(result).toEqual(argumentsList)
})

test('Comma separated filenames string as parameter', async () => {
const result: string[] = await extractArticleList(`${filePath},${anotherFilePath}`)
const result: string[] = await extractArticleList(`${filePath},${anotherFilePath}`, downloader)
expect(result.sort()).toEqual(argumentsList.concat(anotherArgumentsList))
})

test('URL as parameter', async () => {
jest.spyOn(axios, 'get').mockResolvedValue({
data: fs.createReadStream(filePath),
})
const result: string[] = await extractArticleList('http://test.com/strings')
const result: string[] = await extractArticleList('http://test.com/strings', downloader)
expect(result).toEqual(argumentsList)
})

@@ -317,18 +327,18 @@ describe('Utils', () => {
jest.spyOn(axios, 'get').mockResolvedValueOnce({
data: fs.createReadStream(anotherFilePath),
})
const result: string[] = await extractArticleList('http://test.com/strings,http://test.com/another-strings')
const result: string[] = await extractArticleList('http://test.com/strings,http://test.com/another-strings', downloader)
expect(result.sort()).toEqual(argumentsList.concat(anotherArgumentsList))
})

test('The parameter starts from HTTP but it is not the URL', async () => {
const result: string[] = await extractArticleList('http-test')
const result: string[] = await extractArticleList('http-test', downloader)
expect(result).toEqual(['http-test'])
})

test('Error if trying to get articleList from wrong URL ', async () => {
jest.spyOn(axios, 'get').mockRejectedValue({})
await expect(extractArticleList('http://valid-wrong-url.com/')).rejects.toThrow('Failed to read articleList from URL: http://valid-wrong-url.com/')
await expect(extractArticleList('http://valid-wrong-url.com/', downloader)).rejects.toThrow('Failed to read articleList from URL: http://valid-wrong-url.com/')
})
})

