Skip to content

Commit

Permalink
Merge pull request #116 from subbuss/master
Browse files Browse the repository at this point in the history
Step 1 of refactoring: More coming later
  • Loading branch information
kelson42 authored Aug 15, 2017
2 parents ac1f782 + 945b160 commit d343a5a
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 77 deletions.
66 changes: 66 additions & 0 deletions lib/config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"use strict";

/**
 * Static configuration for MWOffliner.
 *
 * Groups the values that used to be hard-coded in the library:
 *  - userAgent: base HTTP user-agent string
 *  - defaults:  fallback values for optional settings
 *  - filters:   CSS class / id lists driving DOM clean-up
 *  - output:    static CSS resources, output directory names and templates
 */
var config = {
  /* Base HTTP user-agent string */
  userAgent: 'MWOffliner/HEAD',

  /* Fallback values for optional settings */
  defaults: {
    publisher: 'Kiwix',
    redisConfig: '/dev/shm/redis.sock',
    requestTimeout: 60,
  },

  filters: {
    /* All DOM nodes with one of these styles will be removed */
    /* On Wikivoyage 'noprint' also removes top banners like on 'South America'. */
    cssClassBlackList: [
      'noprint', 'metadata', 'ambox', 'stub',
      'topicon', 'magnify', 'navbar',
      'mwe-math-mathml-inline', 'mw-kartographer-container'
    ],

    /* Additional black list if only intro is dumped */
    nodetCssClassBlackList: [ 'mw-ref' ],

    /* All DOM nodes with these styles will be deleted
     * if no <a> tag is included in the sub-tree */
    cssClassBlackListIfNoLink: [
      'mainarticle', 'seealso', 'dablink', 'rellink', 'hatnote'
    ],

    /* All DOM nodes which we should force to display */
    cssClassDisplayList: ['thumb'],

    /* List of styles to be removed */
    cssClassCallsBlackList: ['plainlinks'],

    /* All nodes with one of these ids will be removed */
    idBlackList: ['purgelink'],
  },

  output: {
    /* Static stylesheets */
    cssResources: [ 'mobile.css', 'content.parsoid.css', 'inserted_style_mobile.css' ],

    /* Names of the output directories */
    dirs: {
      style: 's',
      media: 'm',
      javascript: 'j',
      styleModules: 'css_modules',
      jsModules: 'js_modules',
    },

    templates: {
      /* Template code for any redirect to be written on the FS */
      redirects: [
        '<html>', '<head>',
        '<meta charset="UTF-8" />',
        '<title>{{ title }}</title>',
        '<meta http-equiv="refresh" content="0; URL={{ target }}">',
        '</head>', '<body></body>', '</html>'
      ].join(''),
    },
  },
};

/* Expose the configuration object under the `config` key */
module.exports = { config };
147 changes: 70 additions & 77 deletions lib/mwoffliner.lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ var htmlMinifier = require('html-minifier');
const parsoid = require('parsoid');
const fetch = require('node-fetch');

/* config.js exports an object of the form { config: ... }; pick the `config` property */
var config = require('./config.js').config;

module.exports = {
getParametersList: () => [
{ name: 'mwUrl', description: 'Mediawiki base URL. Dont forget the trailing /', required: true },
Expand Down Expand Up @@ -75,7 +77,7 @@ module.exports = {
/************************************/

/* Layout */
const mobileLayout = argv.mobileLayout || false
const mobileLayout = argv.mobileLayout || false;

/* Formats */
var dumps = [''];
Expand All @@ -95,37 +97,14 @@ module.exports = {
}
}

/* Template code for any redirect to be written on the FS */
var redirectTemplateCode = '<html><head><meta charset="UTF-8" /><title>{{ title }}</title><meta http-equiv="refresh" content="0; URL={{ target }}"></head><body></body></html>';

/* All DOM nodes with on of these styles will be removed */
/* On Wikivoyage 'noprint' remove also top banners like on 'South America'. */
var cssClassBlackList = [ 'noprint', 'metadata', 'ambox', 'stub', 'topicon', 'magnify', 'navbar', 'mwe-math-mathml-inline', 'mw-kartographer-container' ];

/* Additional black list if only intro is dumped */
var nodetCssClassBlackList = [ 'mw-ref'];

/* All DOM node with these styles will be deleted if no A node is included in the sub-tree */
var cssClassBlackListIfNoLink = [ 'mainarticle', 'seealso', 'dablink', 'rellink', 'hatnote' ];

/* All DOM nodes which we should for to display */
var cssClassDisplayList = ['thumb'];

/* List of style to be removed */
var cssClassCallsBlackList = ['plainlinks'];

/* All nodes with one of these ids will be remove */
var idBlackList = ['purgelink'];

/* HTTP user-agent string */
var adminEmail = argv.adminEmail;
var userAgentString = 'MWOffliner/HEAD';
if (validateEmail(adminEmail)) {
userAgentString += ' (' + adminEmail + ')';
} else {
if (!validateEmail(adminEmail)) {
console.error('Admin email ' + adminEmail + ' is not valid');
process.exit(1);
}

/* The admin email (validated above) is appended to the base user-agent string */
var userAgentString = config.userAgent + ' (' + adminEmail + ')';
var loginCookie = '';

/* Directory where everything is saved at the end of the process */
Expand Down Expand Up @@ -188,10 +167,10 @@ module.exports = {
var resume = argv.resume;

/* Redis configuration */
var redisConf = argv.redis ? argv.redis : '/dev/shm/redis.sock';
var redisConf = argv.redis ? argv.redis : config.defaults.redisConfig;

/* Default request timeout */
var requestTimeout = argv.requestTimeout ? argv.requestTimeout : 60;
var requestTimeout = argv.requestTimeout || config.defaults.requestTimeout;

/* Keep empty paragraphs */
var keepEmptyParagraphs = argv.keepEmptyParagraphs;
Expand All @@ -200,7 +179,7 @@ module.exports = {
var withZimFullTextIndex = argv.withZimFullTextIndex;

/* ZIM publisher */
var publisher = argv.publisher || 'Kiwix';
var publisher = argv.publisher || config.defaults.publisher;

/* Wikipedia/... URL */
var mwUrl = argv.mwUrl;
Expand Down Expand Up @@ -316,7 +295,11 @@ module.exports = {
/************************************/

/* Check if opt. binaries are available */
var optBinaries = ['jpegoptim --version', 'pngquant --version', 'gifsicle --version', 'advdef --version', 'file --help', 'stat --version', 'convert --version'];
var optBinaries = [
'jpegoptim --version', 'pngquant --version',
'gifsicle --version', 'advdef --version',
'file --help', 'stat --version', 'convert --version'
];
try {
dumps.forEach(function (dump) {
if (dump.toLowerCase().indexOf('nozim') < 0) {
Expand Down Expand Up @@ -345,27 +328,39 @@ module.exports = {
var redisCachedMediaToCheckDatabase = redisNamePrefix + 'c';

/* Compile templates */
var redirectTemplate = swig.compile(redirectTemplateCode);
var redirectTemplate = swig.compile(config.output.templates.redirects);
var footerTemplate = swig.compile(footerTemplateCode);

/************************************/
/* CONSTANT VARIABLE SECTION ********/
/************************************/

var styleDirectory = 's';
var styleModulesDirectory = 'css_modules'
var mediaDirectory = 'm';
var javascriptDirectory = 'j';
var jsModulesDirectory = 'js_modules'

const genericJsModules = ['startup', 'jquery', 'mediawiki', 'site']
var dirs = config.output.dirs;
/* Relative path of a stylesheet: <style dir>/<style modules dir>/<css> */
var cssPath = function(css) {
  var segments = [dirs.style, dirs.styleModules, css];
  return segments.join('/');
};
/* Relative path of a script: <javascript dir>/<js modules dir>/<js> */
var jsPath = function(js) {
  var segments = [dirs.javascript, dirs.jsModules, js];
  return segments.join('/');
};
/* Build the <link> tag referencing the stylesheet `css` (resolved through cssPath) */
var genHeaderCSSLink = function(css) {
  var href = cssPath(css);
  return `<link href="${href}" rel="stylesheet" type="text/css" />`;
};
/* Build the <script> tag referencing the script `js` (resolved through jsPath).
 * `js` is used verbatim as the file name: no extension is appended here. */
var genHeaderScript = function(js) {
  return '<script src="' + jsPath(js) + '"></script>';
};

/* Concatenated <link> tags for all static CSS resources */
var cssLinks = config.output.cssResources
  .map(function(css) { return genHeaderCSSLink(css); })
  .join('');

const genericJsModules = ['startup', 'jquery', 'mediawiki', 'site'];
const genericCssModules = mobileLayout
? ['skins.minerva.base.reset|skins.minerva.content.styles|ext.cite.style|mediawiki.page.gallery.styles|mobile.app.pagestyles.android|mediawiki.skinning.content.parsoid']
: []
: [];

// This module has no CSS; blacklisting it avoids creating an empty file that generates an error in Firefox.
// The error is "style sheet could not be loaded".
const blackListCssModules = ['mediawiki.page.gallery']
const blackListCssModules = ['mediawiki.page.gallery'];

var mediaRegex = /^(.*\/)([^\/]+)(\/)(\d+px-|)(.+?)(\.[A-Za-z0-9]{2,6}|)(\.[A-Za-z0-9]{2,6}|)$/;
var htmlMobileTemplateCode = `
Expand All @@ -379,9 +374,7 @@ module.exports = {
function importScript(){return 1} // this is to avoid the error from site.js
</script>
__ARTICLE_CSS_LIST__
<link href="${styleDirectory}/${styleModulesDirectory}/mobile.css" rel="stylesheet" type="text/css" />
<link href="${styleDirectory}/${styleModulesDirectory}/content.parsoid.css" rel="stylesheet" type="text/css" />
<link href="${styleDirectory}/${styleModulesDirectory}/inserted_style_mobile.css" rel="stylesheet" type="text/css" />
${cssLinks}
</head>
<body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject page-Lyon rootpage-Lyon stable skin-minerva action-view animations">
<div id="mw-mf-viewport" class="feature-header-v2">
Expand Down Expand Up @@ -721,19 +714,16 @@ module.exports = {
}

function saveStaticFiles(finished) {
var saveStaticFile = function(resource) {
var dirs = config.output.dirs;
config.output.cssResources.forEach(function(css) {
try {
fs.readFile(pathParser.resolve(__dirname, "../" + resource), (err, data) =>
fs.writeFile(pathParser.resolve(htmlRootPath, `${styleDirectory}/${styleModulesDirectory}/${resource}`), data, () => {})
fs.readFile(pathParser.resolve(__dirname, "../" + css), (err, data) =>
fs.writeFile(pathParser.resolve(htmlRootPath, cssPath(css)), data, () => {})
)
} catch (error) {
console.error('Could not create ' + resource + ' file : ', error)
console.error('Could not create ' + css + ' file : ', error)
}
};

saveStaticFile('content.parsoid.css');
saveStaticFile('mobile.css');
saveStaticFile('inserted_style_mobile.css');
});
finished();
}

Expand Down Expand Up @@ -1096,7 +1086,7 @@ module.exports = {
jsConfigVars = `(window.RLQ=window.RLQ||[]).push(function() {${jsConfigVars}});`
jsConfigVars = jsConfigVars.replace('nosuchaction', 'view') // to replace the wgAction config that is set to 'nosuchaction' from api but should be 'view'
try {
fs.writeFileSync(pathParser.resolve(htmlRootPath, `${javascriptDirectory}/${jsModulesDirectory}/jsConfigVars.js`), jsConfigVars )
fs.writeFileSync(pathParser.resolve(htmlRootPath, jsPath('jsConfigVars.js')), jsConfigVars )
printLog(`created dep jsConfigVars.js for article ${articleId}`)
} catch (e) {
console.error(`Error writing file ${moduleUri}`, e)
Expand Down Expand Up @@ -1139,14 +1129,15 @@ module.exports = {
document.body.dispatchEvent(startUpEvent)
})()`

let moduleUri
let apiParameterOnly
let moduleUri;
let apiParameterOnly;
let dirs = config.output.dirs;
if (type === 'js') {
moduleUri = pathParser.resolve(htmlRootPath, `${javascriptDirectory}/${jsModulesDirectory}/${module}.js`)
apiParameterOnly = 'scripts'
moduleUri = pathParser.resolve(htmlRootPath, jsPath(module + '.js'));
apiParameterOnly = 'scripts';
} else if (type === 'css') {
moduleUri = pathParser.resolve(htmlRootPath, `${styleDirectory}/${styleModulesDirectory}/${module}.css`)
apiParameterOnly = 'styles'
moduleUri = pathParser.resolve(htmlRootPath, cssPath(module + '.css'));
apiParameterOnly = 'styles';
}

const moduleApiUrl = encodeURI(`${mwUrl}w/load.php?debug=false&lang=en&modules=${module}&only=${apiParameterOnly}&skin=vector&version=&*`)
Expand Down Expand Up @@ -1590,6 +1581,8 @@ module.exports = {
}

function applyOtherTreatments(parsoidDoc, articleId, finished) {
var filtersConfig = config.filters;

/* Don't need <link> and <input> tags */
var nodesToDelete = [
{ tag: 'link' },
Expand All @@ -1615,18 +1608,18 @@ module.exports = {
});

/* Remove element with black listed CSS classes */
cssClassBlackList.map(function (classname) {
/* Queue a deletion rule for every black-listed CSS class */
filtersConfig.cssClassBlackList.forEach(function (classname) {
  nodesToDelete.push({ class: classname });
});

if (nodet) {
nodetCssClassBlackList.map(function (classname) {
/* Queue a deletion rule for every class of the additional black list */
filtersConfig.nodetCssClassBlackList.forEach(function (classname) {
  nodesToDelete.push({ class: classname });
});
}

/* Remove element with black listed CSS classes and no link */
cssClassBlackListIfNoLink.map(function (classname) {
filtersConfig.cssClassBlackListIfNoLink.map(function (classname) {
nodesToDelete.push({
class: classname,
filter: function(n) {
Expand Down Expand Up @@ -1672,15 +1665,15 @@ module.exports = {
}

/* Remove element with id in the blacklist */
idBlackList.map(function (id) {
filtersConfig.idBlackList.map(function (id) {
var node = parsoidDoc.getElementById(id);
if (node) {
deleteNode(node);
}
});

/* Force display of element with that CSS class */
cssClassDisplayList.map(function (classname) {
filtersConfig.cssClassDisplayList.map(function (classname) {
var nodes = parsoidDoc.getElementsByClassName(classname);
for (var i = 0; i < nodes.length; i++) {
nodes[i].style.removeProperty('display');
Expand Down Expand Up @@ -1725,7 +1718,7 @@ module.exports = {
}

/* Remove a few css calls */
cssClassCallsBlackList.map(function (classname) {
filtersConfig.cssClassCallsBlackList.map(function (classname) {
if (node.getAttribute('class')) {
node.setAttribute('class', node.getAttribute('class').replace(classname, ''));
}
Expand All @@ -1739,15 +1732,15 @@ module.exports = {
const htmlTemplateDoc = domino.createDocument(
(mobileLayout ? htmlMobileTemplateCode : htmlDesktopTemplateCode)
.replace('__ARTICLE_CONFIGVARS_LIST__', jsConfigVars !== ''
? `<script src="${javascriptDirectory}/${jsModulesDirectory}/jsConfigVars.js"></script>`
? genHeaderScript('jsConfigVars')
: ''
)
.replace('__ARTICLE_JS_LIST__', jsDependenciesList.length !== 0
? jsDependenciesList.map(oneJsDep => `<script src="${javascriptDirectory}/${jsModulesDirectory}/${oneJsDep}.js"></script>`).join('\n')
? jsDependenciesList.map(oneJsDep => genHeaderScript(oneJsDep)).join('\n')
: ''
)
.replace('__ARTICLE_CSS_LIST__', styleDependenciesList.length !== 0
? styleDependenciesList.map(oneCssDep => `<link href="${styleDirectory}/${styleModulesDirectory}/${oneCssDep}.css" rel="stylesheet" type="text/css" />`).join('\n')
? styleDependenciesList.map(oneCssDep => genHeaderCSSLink(oneCssDep)).join('\n')
: ''
)
)
Expand Down Expand Up @@ -2003,7 +1996,7 @@ module.exports = {
function saveStylesheet(finished) {
printLog('Dumping stylesheets...');
var urlCache = new Object();
var stylePath = htmlRootPath + styleDirectory + '/style.css';
var stylePath = htmlRootPath + dirs.style + '/style.css';

/* Remove if exists */
fs.unlink(stylePath, function () { });
Expand Down Expand Up @@ -2057,7 +2050,7 @@ module.exports = {
/* Download CSS dependency, but avoid duplicate calls */
if (!urlCache.hasOwnProperty(url) && filename) {
urlCache[url] = true;
downloadCSSFileQueue.push({ url: url, path: htmlRootPath + styleDirectory + '/' + filename });
downloadCSSFileQueue.push({ url: url, path: htmlRootPath + dirs.style + '/' + filename });
}
}
}
Expand Down Expand Up @@ -2363,11 +2356,11 @@ module.exports = {
[
function (finished) { exec('rm -rf \"' + htmlRootPath + '\"', finished) },
function (finished) { fs.mkdir(htmlRootPath, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + styleDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + styleDirectory + '/' + styleModulesDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + mediaDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + javascriptDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + javascriptDirectory + '/' + jsModulesDirectory, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.style, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.style + '/' + dirs.styleModules, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.media, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.javascript, undefined, finished) },
function (finished) { fs.mkdir(htmlRootPath + dirs.javascript + '/' + dirs.jsModules, undefined, finished) },
],
function (error) {
if (error) {
Expand Down Expand Up @@ -2762,7 +2755,7 @@ module.exports = {
filename = unicodeCutter.truncateToBinarySize(basename, 239 - ext.length) + crypto.createHash('md5').update(basename).digest('hex').substring(0, 2) + '.' + ext;
}

return mediaDirectory + '/' + e(filename);
return dirs.media + '/' + e(filename);
}

function getArticleUrl(articleId) {
Expand Down

0 comments on commit d343a5a

Please sign in to comment.