From b143e8372c3b43573b6905893f962e72a8d94ca0 Mon Sep 17 00:00:00 2001 From: Jaifroid Date: Mon, 31 Jan 2022 21:16:19 +0000 Subject: [PATCH] Fix incorrect processing of titles with question marks and anchor parameters #806 (#807) --- service-worker.js | 74 ++++++++++++++++++++------------------------ tests/tests.js | 19 +++++++----- www/js/app.js | 61 +++++++++++++++++++++++------------- www/js/lib/uiUtil.js | 38 +++++------------------ 4 files changed, 91 insertions(+), 101 deletions(-) diff --git a/service-worker.js b/service-worker.js index ccdb049ff..eba777112 100644 --- a/service-worker.js +++ b/service-worker.js @@ -187,14 +187,19 @@ let fetchCaptureEnabled = false; /** * Intercept selected Fetch requests from the browser window */ - self.addEventListener('fetch', function (event) { +self.addEventListener('fetch', function (event) { // Only cache GET requests if (event.request.method !== "GET") return; - // Remove any querystring before requesting from the cache - var rqUrl = event.request.url.replace(/\?[^?]+$/i, ''); + var rqUrl = event.request.url; + var urlObject = new URL(rqUrl); + // Test the URL with parameters removed + var strippedUrl = urlObject.pathname; // Select cache depending on request format - var cache = /\.zim\//i.test(rqUrl) ? ASSETS_CACHE : APP_CACHE; + var cache = /\.zim\//i.test(strippedUrl) ? ASSETS_CACHE : APP_CACHE; if (cache === ASSETS_CACHE && !fetchCaptureEnabled) return; + // For APP_CACHE assets, we should ignore any querystring (whereas it should be conserved for ZIM assets, + // especially .js assets, where it may be significant). Anchor targets are irreleveant in this context. + if (cache === APP_CACHE) rqUrl = strippedUrl; event.respondWith( // First see if the content is in the cache fromCache(cache, rqUrl).then(function (response) { @@ -203,24 +208,24 @@ let fetchCaptureEnabled = false; }, function () { // The response was not found in the cache so we look for it in the ZIM // and add it to the cache if it is an asset type (css or js) - if (cache === ASSETS_CACHE && regexpZIMUrlWithNamespace.test(rqUrl)) { - return fetchRequestFromZIM(event).then(function (response) { + if (cache === ASSETS_CACHE && regexpZIMUrlWithNamespace.test(strippedUrl)) { + return fetchUrlFromZIM(urlObject).then(function (response) { // Add css or js assets to ASSETS_CACHE (or update their cache entries) unless the URL schema is not supported if (regexpCachedContentTypes.test(response.headers.get('Content-Type')) && !regexpExcludedURLSchema.test(event.request.url)) { - event.waitUntil(updateCache(ASSETS_CACHE, event.request, response.clone())); + event.waitUntil(updateCache(ASSETS_CACHE, rqUrl, response.clone())); } return response; - }).catch(function (msgPortData, title) { - console.error('Invalid message received from app.js for ' + title, msgPortData); + }).catch(function (msgPortData) { + console.error('Invalid message received from app.js for ' + strippedUrl, msgPortData); return msgPortData; }); } else { // It's not an asset, or it doesn't match a ZIM URL pattern, so we should fetch it with Fetch API return fetch(event.request).then(function (response) { // If request was successful, add or update it in the cache, but be careful not to cache the ZIM archive itself! - if (!regexpExcludedURLSchema.test(rqUrl) && !/\.zim\w{0,2}$/i.test(rqUrl)) { - event.waitUntil(updateCache(APP_CACHE, event.request, response.clone())); + if (!regexpExcludedURLSchema.test(event.request.url) && !/\.zim\w{0,2}$/i.test(strippedUrl)) { + event.waitUntil(updateCache(APP_CACHE, rqUrl, response.clone())); } return response; }).catch(function (error) { @@ -271,25 +276,22 @@ let fetchCaptureEnabled = false; }); /** - * Handles fetch events that need to be extracted from the ZIM + * Handles URLs that need to be extracted from the ZIM archive * - * @param {Event} fetchEvent The fetch event to be processed + * @param {URL} urlObject The URL object to be processed for extraction from the ZIM * @returns {Promise} A Promise for the Response, or rejects with the invalid message port data */ -function fetchRequestFromZIM(fetchEvent) { +function fetchUrlFromZIM(urlObject) { return new Promise(function (resolve, reject) { - var nameSpace; - var title; - var titleWithNameSpace; - var regexpResult = regexpZIMUrlWithNamespace.exec(fetchEvent.request.url); - var prefix = regexpResult[1]; - nameSpace = regexpResult[2]; - title = regexpResult[3]; + // Note that titles may contain bare question marks or hashes, so we must use only the pathname without any URL parameters. + // Be sure that you haven't encoded any querystring along with the URL. + var barePathname = decodeURIComponent(urlObject.pathname); + var partsOfZIMUrl = regexpZIMUrlWithNamespace.exec(barePathname); + var prefix = partsOfZIMUrl[1]; + var nameSpace = partsOfZIMUrl[2]; + var title = partsOfZIMUrl[3]; - // We need to remove the potential parameters in the URL - title = removeUrlParameters(decodeURIComponent(title)); - - titleWithNameSpace = nameSpace + '/' + title; + var titleWithNameSpace = nameSpace + '/' + title; // Let's instantiate a new messageChannel, to allow app.js to give us the content var messageChannel = new MessageChannel(); @@ -333,15 +335,6 @@ function fetchRequestFromZIM(fetchEvent) { }); } -/** - * Removes parameters and anchors from a URL - * @param {type} url The URL to be processed - * @returns {String} The same URL without its parameters and anchors - */ -function removeUrlParameters(url) { - return url.replace(/([^?#]+)[?#].*$/, '$1'); -} - /** * Looks up a Request in a cache and returns a Promise for the matched Response * @param {String} cache The name of the cache to look in @@ -350,12 +343,10 @@ function removeUrlParameters(url) { */ function fromCache(cache, requestUrl) { // Prevents use of Cache API if user has disabled it - if (!useAppCache && cache === APP_CACHE || !useAssetsCache && cache === ASSETS_CACHE) return Promise.reject('disabled'); + if (!(useAppCache && cache === APP_CACHE || useAssetsCache && cache === ASSETS_CACHE)) return Promise.reject('disabled'); return caches.open(cache).then(function (cacheObj) { return cacheObj.match(requestUrl).then(function (matching) { - if (!matching || matching.status === 404) { - return Promise.reject('no-match'); - } + if (!matching || matching.status === 404) return Promise.reject('no-match'); console.debug('[SW] Supplying ' + requestUrl + ' from ' + cache + '...'); return matching; }); @@ -365,15 +356,16 @@ function fromCache(cache, requestUrl) { /** * Stores or updates in a cache the given Request/Response pair * @param {String} cache The name of the cache to open - * @param {Request} request The original Request object + * @param {Request|String} request The original Request object or the URL string requested * @param {Response} response The Response received from the server/ZIM * @returns {Promise} A Promise for the update action */ function updateCache(cache, request, response) { // Prevents use of Cache API if user has disabled it - if (!useAppCache && cache === APP_CACHE || !useAssetsCache && cache === ASSETS_CACHE) return Promise.resolve(); + if (!response.ok || !(useAppCache && cache === APP_CACHE || useAssetsCache && cache === ASSETS_CACHE)) + return Promise.resolve(); return caches.open(cache).then(function (cacheObj) { - console.debug('[SW] Adding ' + request.url + ' to ' + cache + '...'); + console.debug('[SW] Adding ' + (request.url || request) + ' to ' + cache + '...'); return cacheObj.put(request, response); }); } diff --git a/tests/tests.js b/tests/tests.js index 110af528e..fca69ab90 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -118,14 +118,17 @@ define(['jquery', 'zimArchive', 'zimDirEntry', 'util', 'uiUtil', 'utf8'], assert.equal(util.allCaseFirstLetters(testString6, "full").indexOf("ΚΑΛΆ ΝΕΡΆ ΜΑΓΝΗΣΊΑ ŽIŽEK") >= 0, true, "All Unicode letters should be uppercase"); }); QUnit.test("check removal of parameters in URL", function(assert) { - var testUrl1 = "A/question.html"; - var testUrl2 = "A/question.html?param1=toto¶m2=titi"; - var testUrl3 = "A/question.html?param1=toto¶m2=titi#anchor"; - var testUrl4 = "A/question.html#anchor"; - assert.equal(uiUtil.removeUrlParameters(testUrl1), testUrl1); - assert.equal(uiUtil.removeUrlParameters(testUrl2), testUrl1); - assert.equal(uiUtil.removeUrlParameters(testUrl3), testUrl1); - assert.equal(uiUtil.removeUrlParameters(testUrl4), testUrl1); + var baseUrl = "A/Che cosa è l'amore?.html"; + var testUrls = [ + "A/Che%20cosa%20%C3%A8%20l'amore%3F.html?param1=toto¶m2=titi", + "A/Che%20cosa%20%C3%A8%20l'amore%3F.html?param1=toto¶m2=titi#anchor", + "A/Che%20cosa%20%C3%A8%20l'amore%3F.html#anchor" + ]; + testUrls.forEach(function (testUrl) { + assert.equal(decodeURIComponent( + uiUtil.removeUrlParameters(testUrl) + ), baseUrl); + }); }); QUnit.module("ZIM initialisation"); diff --git a/www/js/app.js b/www/js/app.js index 1272cebe8..ee2b4233e 100644 --- a/www/js/app.js +++ b/www/js/app.js @@ -248,7 +248,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys if (/Enter/.test(e.key)) { if (activeElement.classList.contains('hover')) { var dirEntryId = activeElement.getAttribute('dirEntryId'); - findDirEntryFromDirEntryIdAndLaunchArticleRead(dirEntryId); + findDirEntryFromDirEntryIdAndLaunchArticleRead(decodeURIComponent(dirEntryId)); return; } } @@ -710,9 +710,9 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys } return; } - // Because the "outer" Service Worker still runs in a PWA app, we don't actually disable the SW in this context, but it will no longer - // be intercepting requests - if ('serviceWorker' in navigator) { + // Because the Service Worker must still run in a PWA app so that it can work offline, we don't actually disable the SW in this context, + // but it will no longer be intercepting requests for ZIM assets (only requests for the app's own code) + if (isServiceWorkerAvailable()) { serviceWorkerRegistration = null; } refreshAPIStatus(); @@ -1285,7 +1285,11 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys var listLength = dirEntryArray.length < params.maxSearchResultsSize ? dirEntryArray.length : params.maxSearchResultsSize; for (var i = 0; i < listLength; i++) { var dirEntry = dirEntryArray[i]; - var dirEntryStringId = uiUtil.htmlEscapeChars(dirEntry.toStringId()); + // NB We use encodeURIComponent rather than encodeURI here because we know that any question marks in the title are not querystrings, + // and should be encoded [kiwix-js #806]. DEV: be very careful if you edit the dirEntryId attribute below, because the contents must be + // inside double quotes (in the final HTML string), given that dirEntryStringId may contain bare apostrophes + // Info: encodeURIComponent encodes all characters except A-Z a-z 0-9 - _ . ! ~ * ' ( ) + var dirEntryStringId = encodeURIComponent(dirEntry.toStringId()); articleListDivHtml += '' + dirEntry.getTitleOrUrl() + ''; } @@ -1304,20 +1308,19 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys /** * Handles the click on the title of an article in search results - * @param {Event} event - * @returns {Boolean} + * @param {Event} event The click event to handle + * @returns {Boolean} Always returns false for JQuery event handling */ function handleTitleClick(event) { - var dirEntryId = event.target.getAttribute("dirEntryId"); + var dirEntryId = decodeURIComponent(event.target.getAttribute('dirEntryId')); findDirEntryFromDirEntryIdAndLaunchArticleRead(dirEntryId); return false; } - /** * Creates an instance of DirEntry from given dirEntryId (including resolving redirects), * and call the function to read the corresponding article - * @param {String} dirEntryId + * @param {String} dirEntryId The stringified Directory Entry to parse and launch */ function findDirEntryFromDirEntryIdAndLaunchArticleRead(dirEntryId) { if (selectedArchive.isReady()) { @@ -1501,7 +1504,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys // the link. This is currently the case for epub and pdf files in Project Gutenberg ZIMs -- add any further types you need // to support to this regex. The "zip" has been added here as an example of how to support further filetypes var regexpDownloadLinks = /^.*?\.epub($|\?)|^.*?\.pdf($|\?)|^.*?\.zip($|\?)/i; - + + // A string to hold any anchor parameter in clicked ZIM URLs (as we must strip these to find the article in the ZIM) + var anchorParameter; + /** * Display the the given HTML article in the web page, * and convert links to javascript calls @@ -1591,7 +1597,12 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys //loadJavaScriptJQuery(); loadCSSJQuery(); insertMediaBlobsJQuery(); - + // Jump to any anchor parameter + if (anchorParameter) { + var target = iframeContentDocument.getElementById(anchorParameter); + if (target) target.scrollIntoView(); + anchorParameter = ''; + } if (iframeArticleContent.contentWindow) { // Configure home key press to focus #prefix only if the feature is in active state if (params.useHomeKeyToFocusSearchBar) @@ -1613,7 +1624,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys // NB dirEntry.url can also contain path separator / in some ZIMs (Stackexchange). } and ] do not need to be escaped as they have no meaning on their own. var escapedUrl = encodeURIComponent(dirEntry.url).replace(/([\\$^.|?*+/()[{])/g, '\\$1'); // Pattern to match a local anchor in an href even if prefixed by escaped url; will also match # on its own - var regexpLocalAnchorHref = new RegExp('^(?:#|' + escapedUrl + '#)([^#]*$)'); + // Note that we exclude any # with a semicolon between it and the end of the string, to avoid accidentally matching e.g. ' + var regexpLocalAnchorHref = new RegExp('^(?:#|' + escapedUrl + '#)([^#;]*$)'); var iframe = iframeArticleContent.contentDocument; Array.prototype.slice.call(iframe.querySelectorAll('a, area')).forEach(function (anchor) { // Attempts to access any properties of 'this' with malformed URLs causes app crash in Edge/UWP [kiwix-js #430] @@ -1624,12 +1636,13 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys return; } var href = anchor.getAttribute('href'); - if (href === null || href === undefined) return; + if (href === null || href === undefined || /^javascript:/i.test(anchor.protocol)) return; + var anchorTarget = href.match(regexpLocalAnchorHref); if (href.length === 0) { // It's a link with an empty href, pointing to the current page: do nothing. - } else if (regexpLocalAnchorHref.test(href)) { + } else if (anchorTarget) { // It's a local anchor link : remove escapedUrl if any (see above) - anchor.setAttribute('href', href.replace(/^[^#]*/, '')); + anchor.setAttribute('href', '#' + anchorTarget[1]); } else if (anchor.protocol !== currentProtocol || anchor.host !== currentHost) { // It's an external URL : we should open it in a new tab @@ -1654,6 +1667,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys // Add an onclick event to extract this article or file from the ZIM // instead of following the link anchor.addEventListener('click', function (e) { + anchorParameter = href.match(/#([^#;]+)$/); + anchorParameter = anchorParameter ? anchorParameter[1] : ''; var zimUrl = uiUtil.deriveZimUrlFromRelativeUrl(uriComponent, baseUrl); goToArticle(zimUrl, downloadAttrValue, contentType); e.preventDefault(); @@ -1709,20 +1724,18 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys // These sections can be opened by clicking on them, but this is done with some javascript. // The code below is a workaround we still need for compatibility with ZIM files generated by mwoffliner in 2018. // A better fix has been made for more recent ZIM files, with the use of noscript tags : see https://github.com/openzim/mwoffliner/issues/324 - var iframe = document.getElementById('articleContent').contentDocument; + var iframe = iframeArticleContent.contentDocument; var collapsedBlocks = iframe.querySelectorAll('.collapsible-block:not(.open-block), .collapsible-heading:not(.open-block)'); // Using decrementing loop to optimize performance : see https://stackoverflow.com/questions/3520688 for (var i = collapsedBlocks.length; i--;) { collapsedBlocks[i].classList.add('open-block'); } - var cssCount = 0; var cssFulfilled = 0; - var iframe = iframeArticleContent.contentDocument; Array.prototype.slice.call(iframe.querySelectorAll('link[data-kiwixurl]')).forEach(function (link) { cssCount++; var linkUrl = link.getAttribute('data-kiwixurl'); - var url = uiUtil.removeUrlParameters(decodeURIComponent(linkUrl)); + var url = decodeURIComponent(uiUtil.removeUrlParameters(linkUrl)); if (cssCache.has(url)) { var nodeContent = cssCache.get(url); if (/stylesheet/i.test(link.rel)) uiUtil.replaceCSSLinkWithInlineCSS(link, nodeContent); @@ -1731,6 +1744,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys } else { if (params.assetsCache) $('#cachingAssets').show(); selectedArchive.getDirEntryByPath(url).then(function (dirEntry) { + if (!dirEntry) { + cssCache.set(url, ''); // Prevent repeated lookups of this unfindable asset + throw 'DirEntry ' + typeof dirEntry; + } var mimetype = dirEntry.getMimetype(); var readFile = /^text\//i.test(mimetype) ? selectedArchive.readUtf8File : selectedArchive.readBinaryFile; return readFile(dirEntry, function (fileDirEntry, content) { @@ -1742,7 +1759,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys renderIfCSSFulfilled(fileDirEntry.url); }); }).catch(function (e) { - console.error("could not find DirEntry for CSS : " + url, e); + console.error("Could not find DirEntry for link element: " + url, e); cssCount--; renderIfCSSFulfilled(); }); @@ -1777,7 +1794,7 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys // var script = $(this); // var scriptUrl = script.attr("data-kiwixurl"); // // TODO check that the type of the script is text/javascript or application/javascript - // var title = uiUtil.removeUrlParameters(decodeURIComponent(scriptUrl)); + // var title = uiUtil.removeUrlParameters(scriptUrl); // selectedArchive.getDirEntryByPath(title).then(function(dirEntry) { // if (dirEntry === null) { // console.log("Error: js file not found: " + title); diff --git a/www/js/lib/uiUtil.js b/www/js/lib/uiUtil.js index 5b6a9c8fa..fef45ef63 100644 --- a/www/js/lib/uiUtil.js +++ b/www/js/lib/uiUtil.js @@ -103,15 +103,17 @@ define(rqDef, function(settingsStore) { link.parentNode.replaceChild(cssElement, link); } - var regexpRemoveUrlParameters = new RegExp(/([^?#]+)[?#].*$/); - /** * Removes parameters and anchors from a URL - * @param {type} url - * @returns {String} same URL without its parameters and anchors + * @param {type} url The URL to be processed + * @returns {String} The same URL without its parameters and anchors */ function removeUrlParameters(url) { - return url.replace(regexpRemoveUrlParameters, "$1"); + // Remove any querystring + var strippedUrl = url.replace(/\?[^?]*$/, ''); + // Remove any anchor parameters - note that we are deliberately excluding entity references, e.g. '''. + strippedUrl = strippedUrl.replace(/#[^#;]*$/, ''); + return strippedUrl; } /** @@ -125,7 +127,7 @@ define(rqDef, function(settingsStore) { function deriveZimUrlFromRelativeUrl(url, base) { // We use a dummy domain because URL API requires a valid URI var dummy = 'http://d/'; - var deriveZimUrl = function(url, base) { + var deriveZimUrl = function (url, base) { if (typeof URL === 'function') return new URL(url, base); // IE11 lacks URL API: workaround adapted from https://stackoverflow.com/a/28183162/9727685 var d = document.implementation.createHTMLDocument('t'); @@ -306,29 +308,6 @@ define(rqDef, function(settingsStore) { return rect.top < window.innerHeight && rect.bottom > 0 && rect.left < window.innerWidth && rect.right > 0; } - /** - * Encodes the html escape characters in the string before using it as html class name,id etc. - * - * @param {String} string The string in which html characters are to be escaped - * - */ - function htmlEscapeChars(string) { - var escapechars = { - '&': '&', - '<': '<', - '>': '>', - '"': '"', - "'": ''', - '/': '/', - '`': '`', - '=': '=' - }; - string = String(string).replace(/[&<>"'`=/]/g, function (s) { - return escapechars[s]; - }); - return string; - } - /** * Removes the animation effect between various sections */ @@ -521,7 +500,6 @@ define(rqDef, function(settingsStore) { checkServerIsAccessible: checkServerIsAccessible, spinnerDisplay: spinnerDisplay, isElementInView: isElementInView, - htmlEscapeChars: htmlEscapeChars, removeAnimationClasses: removeAnimationClasses, applyAnimationToSection: applyAnimationToSection, applyAppTheme: applyAppTheme,