From e350e43800cf5814f6b8121d3c25994e8ee9cd99 Mon Sep 17 00:00:00 2001
From: Pavlo Karatsiuba
Date: Tue, 14 Feb 2023 18:10:36 +0100
Subject: [PATCH 1/2] Treat hidden DOM elements before treating media files.

---
 src/util/saveArticles.ts | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts
index 158cd7ba5..1396b4270 100644
--- a/src/util/saveArticles.ts
+++ b/src/util/saveArticles.ts
@@ -433,6 +433,19 @@ async function processArticleHtml(
   let mediaDependencies: Array<{ url: string; path: string }> = []
   let subtitles: Array<{ url: string; path: string }> = []
   let doc = domino.createDocument(html)
+
+  const ruRet = await rewriteUrlsOfDoc(doc, articleId, redisStore, mw, dump)
+  doc = ruRet.doc
+  mediaDependencies = mediaDependencies.concat(
+    ruRet.mediaDependencies
+      .filter((a) => a)
+      .map((url) => {
+        const path = getMediaBase(url, false)
+        return { url, path }
+      }),
+  )
+  doc = applyOtherTreatments(doc, dump)
+
   const tmRet = await treatMedias(doc, mw, dump, articleId, webp, redisStore)
   doc = tmRet.doc
 
@@ -454,17 +467,6 @@ async function processArticleHtml(
         return { url, path }
       }),
   )
-  const ruRet = await rewriteUrlsOfDoc(doc, articleId, redisStore, mw, dump)
-  doc = ruRet.doc
-  mediaDependencies = mediaDependencies.concat(
-    ruRet.mediaDependencies
-      .filter((a) => a)
-      .map((url) => {
-        const path = getMediaBase(url, false)
-        return { url, path }
-      }),
-  )
-  doc = applyOtherTreatments(doc, dump)
 
   if (!dump.isMainPage(articleId) && dump.customProcessor?.preProcessArticle) {
     doc = await dump.customProcessor.preProcessArticle(articleId, doc)

From 08d56d8e39ae8779b44a813e621dc362d3cc9a6f Mon Sep 17 00:00:00 2001
From: Pavlo Karatsiuba
Date: Wed, 15 Feb 2023 14:24:43 +0100
Subject: [PATCH 2/2] Added test to check if media files are not downloading from hidden elements

---
 test/e2e/treatMedia.e2e.test.ts | 38 +++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 test/e2e/treatMedia.e2e.test.ts

diff --git a/test/e2e/treatMedia.e2e.test.ts b/test/e2e/treatMedia.e2e.test.ts
new file mode 100644
index 000000000..22e045ec8
--- /dev/null
+++ b/test/e2e/treatMedia.e2e.test.ts
@@ -0,0 +1,38 @@
+import * as mwoffliner from '../../src/mwoffliner.lib.js'
+import { execa } from 'execa'
+import rimraf from 'rimraf'
+import { zimdumpAvailable, zimdump } from '../util.js'
+import 'dotenv/config'
+import { jest } from '@jest/globals'
+
+jest.setTimeout(20000)
+
+describe('treatment test', () => {
+  const now = new Date()
+  const testId = `mwo-test-${+now}`
+
+  const articleList = 'Read_my_lips:_no_new_taxes'
+  const parameters = {
+    mwUrl: 'https://en.wikipedia.org',
+    adminEmail: 'test@kiwix.org',
+    articleList,
+    outputDirectory: testId,
+    redis: process.env.REDIS,
+  }
+
+  test('media file from hidden element should not be downloaded', async () => {
+    await execa('redis-cli flushall', { shell: true })
+
+    const outFiles = await mwoffliner.execute(parameters)
+    // Created 1 output
+    expect(outFiles).toHaveLength(1)
+
+    if (await zimdumpAvailable()) {
+      await expect(zimdump(`list --url "I/George_Bush_1988_No_New_Taxes.ogg" ${outFiles[0].outFile}`)).rejects.toThrow('Entry not found')
+    } else {
+      console.log('Zimdump not installed, skipping test')
+    }
+
+    rimraf.sync(`./${testId}`)
+  })
+})