Skip to content

Commit

Permalink
Preliminary support for ZIM archives with no namespace (#698)
Browse files Browse the repository at this point in the history
Partially implements #684.
  • Loading branch information
Jaifroid authored Feb 6, 2021
1 parent dfc9bbf commit 18c51f1
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 25 deletions.
2 changes: 1 addition & 1 deletion service-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ var regexpExcludedURLSchema = /^(?:chrome-extension|example-extension):/i;
* In our case, there is also the ZIM file name used as a prefix in the URL
* @type {RegExp}
*/
var regexpZIMUrlWithNamespace = /(?:^|\/)([^\/]+\/)([-ABIJMUVWX])\/(.+)/;
var regexpZIMUrlWithNamespace = /(?:^|\/)([^/]+\/)([-ABCIJMUVWX])\/(.+)/;

self.addEventListener('install', function (event) {
event.waitUntil(self.skipWaiting());
Expand Down
45 changes: 28 additions & 17 deletions www/js/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -1217,14 +1217,14 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys

// Compile some regular expressions needed to modify links
// Pattern to find a ZIM URL (with its namespace) - see https://wiki.openzim.org/wiki/ZIM_file_format#Namespaces
var regexpZIMUrlWithNamespace = /^[./]*([-ABIJMUVWX]\/.+)$/;
// Regex below finds images, scripts, stylesheets and tracks with ZIM-type metadata and image namespaces [kiwix-js #378]
// It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["']
// or href=["'] (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing lookahead that
// matches ZIM URLs with namespaces [-IJ] ('-' = metadata or 'I'/'J' = image). When the regex is used below, it will also
// remove any relative or absolute path from ZIM-style URLs.
// DEV: If you want to support more namespaces, add them to the END of the character set [-IJ] (not to the beginning)
var regexpTagsWithZimUrl = /(<(?:img|script|link|track)\b[^>]*?\s)(?:src|href)(\s*=\s*["'])(?:\.\.\/|\/)+(?=[-IJ]\/)/ig;
var regexpZIMUrlWithNamespace = /^[./]*([-ABCIJMUVWX]\/.+)$/;
// Regex below finds images, scripts, stylesheets and tracks with ZIM-type metadata and image namespaces [kiwix-js #378].
// It first searches for <img, <script, <link, etc., then scans forward to find, on a word boundary, either src=["'] or href=["']
// (ignoring any extra whitespace), and it then tests the path of the URL with a non-capturing negative lookahead that excludes
// URLs that begin 'http' (i.e. non-relative URLs). It then captures the whole of the URL up until either the opening delimiter
// (" or ', which is capture group \3) or a querystring or hash character (? or #). When the regex is used below, it will be further
// processed to calculate the ZIM URL from the relative path. This regex can cope with legitimate single quote marks (') in the URL.
var regexpTagsWithZimUrl = /(<(?:img|script|link|track)\b[^>]*?\s)(?:src|href)(\s*=\s*(["']))(?!http)(.+?)(?=\3|\?|#)/ig;
// Regex below tests the html of an article for active content [kiwix-js #466]
// It inspects every <script> block in the html and matches in the following cases: 1) the script loads a UI application called app.js;
// 2) the script block has inline content that does not contain "importScript()", "toggleOpenSection" or an "articleId" assignment
Expand Down Expand Up @@ -1253,9 +1253,19 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys
if (regexpActiveContent.test(htmlArticle)) uiUtil.displayActiveContentWarning();
}

// Calculate the current article's ZIM baseUrl to use when processing relative links
var baseUrl = dirEntry.namespace + '/' + dirEntry.url.replace(/[^/]+$/, '');

// Replaces ZIM-style URLs of img, script, link and media tags with a data-kiwixurl to prevent 404 errors [kiwix-js #272 #376]
// This replacement also processes the URL to remove the path so that the URL is ready for subsequent jQuery functions
htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, '$1data-kiwixurl$2');
// This replacement also processes the URL relative to the page's ZIM URL so that we can find the ZIM URL of the asset
// with the correct namespace (this works for old-style -,I,J namespaces and for new-style C namespace)
htmlArticle = htmlArticle.replace(regexpTagsWithZimUrl, function(match, blockStart, equals, quote, relAssetUrl) {
var assetZIMUrl = uiUtil.deriveZimUrlFromRelativeUrl(relAssetUrl, baseUrl);
// DEV: Note that deriveZimUrlFromRelativeUrl produces a *decoded* URL (and incidentally would remove any querystring
// or hash if we had captured them). We therefore re-encode the URL with encodeURI (which does not encode forward
// slashes) instead of encodeURIComponent.
return blockStart + 'data-kiwixurl' + equals + encodeURI(assetZIMUrl);
});

// Extract any css classes from the html tag (they will be stripped when injected in iframe with .innerHTML)
var htmlCSS = htmlArticle.match(/<html[^>]*class\s*=\s*["']\s*([^"']+)/i);
Expand Down Expand Up @@ -1317,9 +1327,6 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys
// Load the blank article to clear the iframe (NB iframe onload event runs *after* this)
iframeArticleContent.src = "article.html";

// Calculate the current article's ZIM baseUrl to use when processing relative links
var baseUrl = dirEntry.namespace + '/' + dirEntry.url.replace(/[^/]+$/, '');

function parseAnchorsJQuery() {
var currentProtocol = location.protocol;
var currentHost = location.host;
Expand Down Expand Up @@ -1513,13 +1520,13 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys
var source = mediaSource.getAttribute('src');
source = source ? uiUtil.deriveZimUrlFromRelativeUrl(source, baseUrl) : null;
// We have to exempt text tracks from using deriveZimUrlFromRelativeUrl due to a bug in Firefox [kiwix-js #496]
source = source ? source : mediaSource.dataset.kiwixurl;
source = source ? source : decodeURIComponent(mediaSource.dataset.kiwixurl);
if (!source || !regexpZIMUrlWithNamespace.test(source)) {
if (source) console.error('No usable media source was found for: ' + source);
return;
}
var mediaElement = /audio|video/i.test(mediaSource.tagName) ? mediaSource : mediaSource.parentElement;
selectedArchive.getDirEntryByTitle(decodeURIComponent(source)).then(function(dirEntry) {
selectedArchive.getDirEntryByTitle(source).then(function(dirEntry) {
return selectedArchive.readBinaryFile(dirEntry, function (fileDirEntry, mediaArray) {
var mimeType = mediaSource.type ? mediaSource.type : dirEntry.getMimetype();
var blob = new Blob([mediaArray], { type: mimeType });
Expand Down Expand Up @@ -1610,7 +1617,10 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys
$("#searchingArticles").hide();
alert("Error finding random article.");
} else {
if (dirEntry.namespace === 'A') {
// We fall back to the old A namespace to support old ZIM files without a text/html MIME type for articles
// DEV: This will need to be changed if we search titlePtrList version 1
// in a future PR, as that list contains only articles
if (dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = false;
$('#activeContent').hide();
$('#searchingArticles').show();
Expand All @@ -1632,7 +1642,8 @@ define(['jquery', 'zimArchiveLoader', 'uiUtil', 'settingsStore','abstractFilesys
$("#searchingArticles").hide();
$("#welcomeText").show();
} else {
if (dirEntry.namespace === 'A') {
// DEV: see comment above under goToRandomArticle()
if (dirEntry.redirect || dirEntry.getMimetype() === 'text/html' || dirEntry.namespace === 'A') {
params.isLandingPage = true;
readArticle(dirEntry);
} else {
Expand Down
6 changes: 3 additions & 3 deletions www/js/lib/uiUtil.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ define(rqDef, function() {
* Derives the URL.pathname from a relative or semi-relative URL using the given base ZIM URL
*
* @param {String} url The (URI-encoded) URL to convert (e.g. "Einstein", "../Einstein",
* "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html")
* @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/" or "A/subdir1/subdir2/")
* @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png")
* "../../I/im%C3%A1gen.png", "-/s/style.css", "/A/Einstein.html", "../static/bootstrap/css/bootstrap.min.css")
* @param {String} base The base ZIM URL of the currently loaded article (e.g. "A/", "A/subdir1/subdir2/", "C/Singapore/")
* @returns {String} The derived ZIM URL in decoded form (e.g. "A/Einstein", "I/imágen.png", "C/")
*/
function deriveZimUrlFromRelativeUrl(url, base) {
// We use a dummy domain because URL API requires a valid URI
Expand Down
34 changes: 30 additions & 4 deletions www/js/lib/zimArchive.js
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,30 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
}
searchNextVariant();
};

/**
 * Reports which namespace of the loaded ZIM archive holds the primary user content, i.e. the namespace in which
 * titles should be searched. Old-format archives (minor version 0) have several content namespaces, of which 'A'
 * is the primary one; new-format archives (minor version 1) have the single content namespace 'C'.
 * See https://openzim.org/wiki/ZIM_file_format.
 * @returns {String} 'A' when the archive's minor version is 0, otherwise 'C'
 * @throws {Error} If the archive is not ready, or if its minor version is greater than 1
 */
ZIMArchive.prototype.getContentNamespace = function () {
    // Guard: the header values cannot be trusted until the archive reports that it is ready
    if (!this.isReady()) {
        throw new Error('We could not determine the content namespace because the ZIM file is not ready!');
    }
    var minorVersion = this._file.minorVersion;
    // DEV: The OpenZIM specification currently defines only minor versions 0 and 1.
    // If further values are defined, extend this check and the mapping below.
    if (minorVersion > 1) {
        throw new Error('Unknown ZIM minor version!');
    }
    return minorVersion === 0 ? 'A' : 'C';
};

/**
* Look for dirEntries with title starting with the given prefix (case-sensitive)
Expand All @@ -220,12 +244,14 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
*/
ZIMArchive.prototype.findDirEntriesWithPrefixCaseSensitive = function(prefix, resultSize, search, callback) {
var that = this;
var cns = this.getContentNamespace();
util.binarySearch(0, this._file.articleCount, function(i) {
return that._file.dirEntryByTitleIndex(i).then(function(dirEntry) {
if (search.status === 'cancelled') return 0;
if (dirEntry.namespace < 'A') return 1;
if (dirEntry.namespace > 'A') return -1;
// We should now be in namespace A
var ns = dirEntry.namespace;
if (ns < cns) return 1;
if (ns > cns) return -1;
// We should now be in namespace A (old format ZIM) or C (new format ZIM)
return prefix <= dirEntry.getTitleOrUrl() ? -1 : 1;
});
}, true).then(function(firstIndex) {
Expand All @@ -237,7 +263,7 @@ define(['zimfile', 'zimDirEntry', 'util', 'utf8'],
return that._file.dirEntryByTitleIndex(index).then(function(dirEntry) {
var title = dirEntry.getTitleOrUrl();
// Only return dirEntries with titles that actually begin with prefix
if (dirEntry.namespace === 'A' && title.indexOf(prefix) === 0) {
if (dirEntry.namespace === cns && title.indexOf(prefix) === 0) {
dirEntries.push(dirEntry);
// Report interim result
callback([dirEntry], true);
Expand Down
3 changes: 3 additions & 0 deletions www/js/lib/zimfile.js
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,9 @@ define(['xzdec_wrapper', 'zstddec_wrapper', 'util', 'utf8', 'q', 'zimDirEntry',
zf.id = tempFileId++;
fileIDs.set(zf.name, zf.id);
}
// For a description of these values, see https://wiki.openzim.org/wiki/ZIM_file_format
zf.majorVersion = readInt(header, 4, 2); // Not currently used by this implementation
zf.minorVersion = readInt(header, 6, 2); // Used to determine the User Content namespace
zf.articleCount = readInt(header, 24, 4);
zf.clusterCount = readInt(header, 28, 4);
zf.urlPtrPos = urlPtrPos;
Expand Down

0 comments on commit 18c51f1

Please sign in to comment.