Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Step 1 of refactoring: More coming later #116

Merged
merged 3 commits into from
Aug 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions lib/config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"use strict";

/*
 * Static configuration of mwoffliner: HTTP user-agent string, fallback
 * values, DOM filter lists and the layout/templates of the generated output.
 */
var config = {
  /* HTTP user-agent string */
  userAgent: 'MWOffliner/HEAD',

  /* Fallback values */
  defaults: {
    publisher: 'Kiwix',
    redisConfig: '/dev/shm/redis.sock',
    requestTimeout: 60,
  },

  filters: {
    /* All DOM nodes with one of these styles will be removed */
    /* On Wikivoyage 'noprint' remove also top banners like on 'South America'. */
    cssClassBlackList: [
      'noprint', 'metadata', 'ambox', 'stub',
      'topicon', 'magnify', 'navbar',
      'mwe-math-mathml-inline', 'mw-kartographer-container'
    ],

    /* Additional black list if only intro is dumped */
    nodetCssClassBlackList: [ 'mw-ref' ],

    /* All DOM nodes with these styles will be deleted
     * if no <a> tag is included in the sub-tree */
    cssClassBlackListIfNoLink: [
      'mainarticle', 'seealso', 'dablink', 'rellink', 'hatnote'
    ],

    /* All DOM nodes which we should force to display */
    cssClassDisplayList: ['thumb'],

    /* List of styles to be removed */
    cssClassCallsBlackList: ['plainlinks'],

    /* All nodes with one of these ids will be removed */
    idBlackList: ['purgelink'],
  },

  output: {
    /* Stylesheet file names (extension included) */
    cssResources: [ 'mobile.css', 'content.parsoid.css', 'inserted_style_mobile.css' ],

    /* Names of the output directories */
    dirs : {
      style: 's',
      media: 'm',
      javascript: 'j',
      styleModules: 'css_modules',
      jsModules: 'js_modules',
    },

    templates: {
      /* Template code for any redirect to be written on the FS */
      redirects : [
        '<html>', '<head>',
        '<meta charset="UTF-8" />',
        '<title>{{ title }}</title>',
        '<meta http-equiv="refresh" content="0; URL={{ target }}">',
        '</head>', '<body></body>', '</html>'
      ].join(''),
    },
  },
};

/* Public interface of this module: the configuration object, under the `config` key */
module.exports = { config: config };
147 changes: 70 additions & 77 deletions lib/mwoffliner.lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ var htmlMinifier = require('html-minifier');
const parsoid = require('parsoid');
const fetch = require('node-fetch');

/* config.js exports an object of the form { config: ... }: unwrap the `config` property */
var config = require('./config.js').config;

module.exports = {
getParametersList: () => [
{ name: 'mwUrl', description: 'Mediawiki base URL. Dont forget the trailing /', required: true },
Expand Down Expand Up @@ -75,7 +77,7 @@ module.exports = {
/************************************/

/* Layout */
const mobileLayout = argv.mobileLayout || false
const mobileLayout = argv.mobileLayout || false;

/* Formats */
var dumps = [''];
Expand All @@ -95,37 +97,14 @@ module.exports = {
}
}

/* Template code for any redirect to be written on the FS */
var redirectTemplateCode = '<html><head><meta charset="UTF-8" /><title>{{ title }}</title><meta http-equiv="refresh" content="0; URL={{ target }}"></head><body></body></html>';

/* All DOM nodes with on of these styles will be removed */
/* On Wikivoyage 'noprint' remove also top banners like on 'South America'. */
var cssClassBlackList = [ 'noprint', 'metadata', 'ambox', 'stub', 'topicon', 'magnify', 'navbar', 'mwe-math-mathml-inline', 'mw-kartographer-container' ];

/* Additional black list if only intro is dumped */
var nodetCssClassBlackList = [ 'mw-ref'];

/* All DOM node with these styles will be deleted if no A node is included in the sub-tree */
var cssClassBlackListIfNoLink = [ 'mainarticle', 'seealso', 'dablink', 'rellink', 'hatnote' ];

/* All DOM nodes which we should for to display */
var cssClassDisplayList = ['thumb'];

/* List of style to be removed */
var cssClassCallsBlackList = ['plainlinks'];

/* All nodes with one of these ids will be remove */
var idBlackList = ['purgelink'];

/* HTTP user-agent string */
var adminEmail = argv.adminEmail;
var userAgentString = 'MWOffliner/HEAD';
if (validateEmail(adminEmail)) {
userAgentString += ' (' + adminEmail + ')';
} else {
if (!validateEmail(adminEmail)) {
console.error('Admin email ' + adminEmail + ' is not valid');
process.exit(1);
}

/* HTTP user-agent: base string from the configuration followed by the admin email */
var userAgentString = config.userAgent + ' (' + adminEmail + ')';
var loginCookie = '';

/* Directory where everything is saved at the end of the process */
Expand Down Expand Up @@ -188,10 +167,10 @@ module.exports = {
var resume = argv.resume;

/* Redis configuration */
var redisConf = argv.redis ? argv.redis : '/dev/shm/redis.sock';
var redisConf = argv.redis ? argv.redis : config.defaults.redisConfig;

/* Default request timeout */
var requestTimeout = argv.requestTimeout ? argv.requestTimeout : 60;
var requestTimeout = argv.requestTimeout || config.defaults.requestTimeout;

/* Keep empty paragraphs */
var keepEmptyParagraphs = argv.keepEmptyParagraphs;
Expand All @@ -200,7 +179,7 @@ module.exports = {
var withZimFullTextIndex = argv.withZimFullTextIndex;

/* ZIM publisher */
var publisher = argv.publisher || 'Kiwix';
var publisher = argv.publisher || config.defaults.publisher;

/* Wikipedia/... URL */
var mwUrl = argv.mwUrl;
Expand Down Expand Up @@ -316,7 +295,11 @@ module.exports = {
/************************************/

/* Check if opt. binaries are available */
var optBinaries = ['jpegoptim --version', 'pngquant --version', 'gifsicle --version', 'advdef --version', 'file --help', 'stat --version', 'convert --version'];
var optBinaries = [
'jpegoptim --version', 'pngquant --version',
'gifsicle --version', 'advdef --version',
'file --help', 'stat --version', 'convert --version'
];
try {
dumps.forEach(function (dump) {
if (dump.toLowerCase().indexOf('nozim') < 0) {
Expand Down Expand Up @@ -345,27 +328,39 @@ module.exports = {
var redisCachedMediaToCheckDatabase = redisNamePrefix + 'c';

/* Compile templates */
var redirectTemplate = swig.compile(redirectTemplateCode);
var redirectTemplate = swig.compile(config.output.templates.redirects);
var footerTemplate = swig.compile(footerTemplateCode);

/************************************/
/* CONSTANT VARIABLE SECTION ********/
/************************************/

var styleDirectory = 's';
var styleModulesDirectory = 'css_modules'
var mediaDirectory = 'm';
var javascriptDirectory = 'j';
var jsModulesDirectory = 'js_modules'

const genericJsModules = ['startup', 'jquery', 'mediawiki', 'site']
var dirs = config.output.dirs;
/* Path of a stylesheet: <style dir>/<style modules dir>/<css>, joined with '/' */
var cssPath = function (css) {
  var segments = [dirs.style, dirs.styleModules, css];
  return segments.join('/');
};
/* Path of a script: <javascript dir>/<js modules dir>/<js>, joined with '/' */
var jsPath = function (js) {
  var segments = [dirs.javascript, dirs.jsModules, js];
  return segments.join('/');
};
/* Build the <link> tag referencing the stylesheet located at cssPath(css) */
var genHeaderCSSLink = function (css) {
  var href = cssPath(css);
  return `<link href="${href}" rel="stylesheet" type="text/css" />`;
};
/*
 * Build the <script> tag referencing the script located at jsPath(js).
 * `js` is handed to jsPath() unchanged, so it must already carry its extension.
 */
var genHeaderScript = function(js) {
  return '<script src="' + jsPath(js) + '"></script>';
};

/* Concatenation of the <link> tags of every stylesheet listed in config.output.cssResources */
var cssLinks = config.output.cssResources.map(function (css) {
  return genHeaderCSSLink(css);
}).join('');

const genericJsModules = ['startup', 'jquery', 'mediawiki', 'site'];
const genericCssModules = mobileLayout
? ['skins.minerva.base.reset|skins.minerva.content.styles|ext.cite.style|mediawiki.page.gallery.styles|mobile.app.pagestyles.android|mediawiki.skinning.content.parsoid']
: []
: [];

// this module has no CSS; blacklisting it avoids creating an empty file that generates an error in Firefox
// (the error is "style sheet could not be loaded")
const blackListCssModules = ['mediawiki.page.gallery']
const blackListCssModules = ['mediawiki.page.gallery'];

var mediaRegex = /^(.*\/)([^\/]+)(\/)(\d+px-|)(.+?)(\.[A-Za-z0-9]{2,6}|)(\.[A-Za-z0-9]{2,6}|)$/;
var htmlMobileTemplateCode = `
Expand All @@ -379,9 +374,7 @@ module.exports = {
function importScript(){return 1} // this is to avoid the error from site.js
</script>
__ARTICLE_CSS_LIST__
<link href="${styleDirectory}/${styleModulesDirectory}/mobile.css" rel="stylesheet" type="text/css" />
<link href="${styleDirectory}/${styleModulesDirectory}/content.parsoid.css" rel="stylesheet" type="text/css" />
<link href="${styleDirectory}/${styleModulesDirectory}/inserted_style_mobile.css" rel="stylesheet" type="text/css" />
${cssLinks}
</head>
<body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Lyon rootpage-Lyon stable skin-minerva action-view animations">
<div id="mw-mf-viewport" class="feature-header-v2">
Expand Down Expand Up @@ -721,19 +714,16 @@ module.exports = {
}

function saveStaticFiles(finished) {
var saveStaticFile = function(resource) {
var dirs = config.output.dirs;
config.output.cssResources.forEach(function(css) {
try {
fs.readFile(pathParser.resolve(__dirname, "../" + resource), (err, data) =>
fs.writeFile(pathParser.resolve(htmlRootPath, `${styleDirectory}/${styleModulesDirectory}/${resource}`), data, () => {})
fs.readFile(pathParser.resolve(__dirname, "../" + css), (err, data) =>
fs.writeFile(pathParser.resolve(htmlRootPath, cssPath(css)), data, () => {})
)
} catch (error) {
console.error('Could not create ' + resource + ' file : ', error)
console.error('Could not create ' + css + ' file : ', error)
}
};

saveStaticFile('content.parsoid.css');
saveStaticFile('mobile.css');
saveStaticFile('inserted_style_mobile.css');
});
finished();
}

Expand Down Expand Up @@ -1096,7 +1086,7 @@ module.exports = {
jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${jsConfigVars}});`
jsConfigVars = jsConfigVars.replace('nosuchaction', 'view') // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
try {
fs.writeFileSync(pathParser.resolve(htmlRootPath, `${javascriptDirectory}/${jsModulesDirectory}/jsConfigVars.js`), jsConfigVars )
fs.writeFileSync(pathParser.resolve(htmlRootPath, jsPath('jsConfigVars.js')), jsConfigVars )
printLog(`created dep jsConfigVars.js for article ${articleId}`)
} catch (e) {
console.error(`Error writing file ${moduleUri}`, e)
Expand Down Expand Up @@ -1139,14 +1129,15 @@ module.exports = {
document.body.dispatchEvent(startUpEvent)
})()`

let moduleUri
let apiParameterOnly
let moduleUri;
let apiParameterOnly;
let dirs = config.output.dirs;
if (type === 'js') {
moduleUri = pathParser.resolve(htmlRootPath, `${javascriptDirectory}/${jsModulesDirectory}/${module}.js`)
apiParameterOnly = 'scripts'
moduleUri = pathParser.resolve(htmlRootPath, jsPath(module + '.js'));
apiParameterOnly = 'scripts';
} else if (type === 'css') {
moduleUri = pathParser.resolve(htmlRootPath, `${styleDirectory}/${styleModulesDirectory}/${module}.css`)
apiParameterOnly = 'styles'
moduleUri = pathParser.resolve(htmlRootPath, cssPath(module + '.css'));
apiParameterOnly = 'styles';
}

const moduleApiUrl = encodeURI(`${mwUrl}w/load.php?debug=false&lang=en&modules=${module}&only=${apiParameterOnly}&skin=vector&version=&*`)
Expand Down Expand Up @@ -1590,6 +1581,8 @@ module.exports = {
}

function applyOtherTreatments(parsoidDoc, articleId, finished) {
var filtersConfig = config.filters;

/* Don't need <link> and <input> tags */
var nodesToDelete = [
{ tag: 'link' },
Expand All @@ -1615,18 +1608,18 @@ module.exports = {
});

/* Remove element with black listed CSS classes */
cssClassBlackList.map(function (classname) {
/* Add one { class: ... } entry to nodesToDelete per black listed CSS class */
filtersConfig.cssClassBlackList.forEach(function (classname) {
  nodesToDelete.push({ class: classname });
});

if (nodet) {
nodetCssClassBlackList.map(function (classname) {
/* Add one { class: ... } entry to nodesToDelete per class of the additional black list */
filtersConfig.nodetCssClassBlackList.forEach(function (classname) {
  nodesToDelete.push({ class: classname });
});
}

/* Remove element with black listed CSS classes and no link */
cssClassBlackListIfNoLink.map(function (classname) {
filtersConfig.cssClassBlackListIfNoLink.map(function (classname) {
nodesToDelete.push({
class: classname,
filter: function(n) {
Expand Down Expand Up @@ -1672,15 +1665,15 @@ module.exports = {
}

/* Remove element with id in the blacklist */
idBlackList.map(function (id) {
filtersConfig.idBlackList.map(function (id) {
var node = parsoidDoc.getElementById(id);
if (node) {
deleteNode(node);
}
});

/* Force display of element with that CSS class */
cssClassDisplayList.map(function (classname) {
filtersConfig.cssClassDisplayList.map(function (classname) {
var nodes = parsoidDoc.getElementsByClassName(classname);
for (var i = 0; i < nodes.length; i++) {
nodes[i].style.removeProperty('display');
Expand Down Expand Up @@ -1725,7 +1718,7 @@ module.exports = {
}

/* Remove a few css calls */
cssClassCallsBlackList.map(function (classname) {
filtersConfig.cssClassCallsBlackList.map(function (classname) {
if (node.getAttribute('class')) {
node.setAttribute('class', node.getAttribute('class').replace(classname, ''));
}
Expand All @@ -1739,15 +1732,15 @@ module.exports = {
const htmlTemplateDoc = domino.createDocument(
(mobileLayout ? htmlMobileTemplateCode : htmlDesktopTemplateCode)
.replace('__ARTICLE_CONFIGVARS_LIST__', jsConfigVars !== ''
? `<script src="${javascriptDirectory}/${jsModulesDirectory}/jsConfigVars.js"></script>`
? genHeaderScript('jsConfigVars')
: ''
)
.replace('__ARTICLE_JS_LIST__', jsDependenciesList.length !== 0
? jsDependenciesList.map(oneJsDep => `<script src="${javascriptDirectory}/${jsModulesDirectory}/${oneJsDep}.js"></script>`).join('\n')
? jsDependenciesList.map(oneJsDep => genHeaderScript(oneJsDep)).join('\n')
: ''
)
.replace('__ARTICLE_CSS_LIST__', styleDependenciesList.length !== 0
? styleDependenciesList.map(oneCssDep => `<link href="${styleDirectory}/${styleModulesDirectory}/${oneCssDep}.css" rel="stylesheet" type="text/css" />`).join('\n')
? styleDependenciesList.map(oneCssDep => genHeaderCSSLink(oneCssDep)).join('\n')
: ''
)
)
Expand Down Expand Up @@ -2003,7 +1996,7 @@ module.exports = {
function saveStylesheet(finished) {
printLog('Dumping stylesheets...');
var urlCache = new Object();
var stylePath = htmlRootPath + styleDirectory + '/style.css';
var stylePath = htmlRootPath + dirs.style + '/style.css';

/* Remove if exists */
fs.unlink(stylePath, function () { });
Expand Down Expand Up @@ -2057,7 +2050,7 @@ module.exports = {
/* Download CSS dependency, but avoid duplicate calls */
if (!urlCache.hasOwnProperty(url) && filename) {
urlCache[url] = true;
downloadCSSFileQueue.push({ url: url, path: htmlRootPath + styleDirectory + '/' + filename });
downloadCSSFileQueue.push({ url: url, path: htmlRootPath + dirs.style + '/' + filename });
}
}
}
Expand Down Expand Up @@ -2363,11 +2356,11 @@ module.exports = {
[
function (finished) { exec('rm -rf \"' + htmlRootPath + '\"', finished) },
function (finished) { fs.mkdir(htmlRootPath, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + styleDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + styleDirectory + '/' + styleModulesDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + mediaDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + javascriptDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + javascriptDirectory + '/' + jsModulesDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.style, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.style + '/' + dirs.styleModules, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.media, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.javascript, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.javascript + '/' + dirs.jsModules, undefined, finished) },
],
function (error) {
if (error) {
Expand Down Expand Up @@ -2762,7 +2755,7 @@ module.exports = {
filename = unicodeCutter.truncateToBinarySize(basename, 239 - ext.length) + crypto.createHash('md5').update(basename).digest('hex').substring(0, 2) + '.' + ext;
}

return mediaDirectory + '/' + e(filename);
return dirs.media + '/' + e(filename);
}

function getArticleUrl(articleId) {
Expand Down