forked from gildas-lormeau/single-file-cli
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathargs.js
361 lines (359 loc) · 19 KB
/
args.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
/*
* Copyright 2010-2020 Gildas Lormeau
* contact : gildas.lormeau <at> gmail.com
*
* This file is part of SingleFile.
*
* The code in this file is free software: you can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License
* (GNU AGPL) as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* The code in this file is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
* General Public License for more details.
*
* As additional permission under GNU AGPL version 3 section 7, you may
* distribute UNMODIFIED VERSIONS OF THIS file without the copy of the GNU
* AGPL normally required by section 4, provided you include this license
* notice and a URL through which recipients can access the Corresponding
* Source.
*/
/* global require, module */
// Command-line definition of the SingleFile CLI.
// The chain below declares the positional arguments, the default value of the
// options, then the description and type of every option. Reading `.argv` at
// the end of the chain parses the command line and yields the `args` object.
const args = require("yargs")
	// `null` is passed to disable the column limit of the help output
	// (see the yargs `.wrap()` documentation).
	.wrap(null)
	// Default command: both positional arguments are optional.
	.command("$0 [url] [output]", "Save a page into a single HTML file.", yargs => {
		yargs.positional("url", { description: "URL or path on the filesystem of the page to save", type: "string" });
		yargs.positional("output", { description: "Output filename", type: "string" });
	})
	// Default values of the options.
	.default({
		"accept-headers": {
			"font": "application/font-woff2;q=1.0,application/font-woff;q=0.9,*/*;q=0.8",
			"image": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
			"stylesheet": "text/css,*/*;q=0.1",
			"script": "*/*",
			"document": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
		},
		"back-end": "puppeteer",
		"block-mixed-content": false,
		"browser-server": "",
		"browser-headless": true,
		"browser-executable-path": "",
		"browser-width": 1280,
		"browser-height": 720,
		"browser-load-max-time": 60000,
		"browser-wait-delay": 0,
		"browser-wait-until": "networkidle0",
		"browser-wait-until-fallback": true,
		"browser-debug": false,
		"browser-script": [],
		"browser-stylesheet": [],
		"browser-args": "",
		"browser-start-minimized": false,
		"browser-cookie": [],
		"browser-cookies-file": "",
		"browser-ignore-insecure-certs": false,
		"browser-freeze-prototypes": false,
		"compress-content": false,
		"compress-CSS": false,
		"compress-HTML": true,
		"dump-content": false,
		// NOTE(review): the only default keyed in camelCase; the matching option
		// is declared below as "emulate-media-feature".
		"emulateMediaFeature": [],
		"filename-template": "%if-empty<{page-title}|No title> ({date-locale} {time-locale}).{filename-extension}",
		"filename-conflict-action": "uniquify",
		"filename-replacement-character": "_",
		"filename-max-length": 192,
		"filename-max-length-unit": "bytes",
		"replace-emojis-in-filename": false,
		"group-duplicate-images": true,
		"max-size-duplicate-images": 512 * 1024,
		"http-header": [],
		"http-proxy-server": "",
		"http-proxy-username": "",
		"http-proxy-password": "",
		"include-infobar": false,
		"insert-meta-csp": true,
		"load-deferred-images": true,
		"load-deferred-images-dispatch-scroll-event": false,
		"load-deferred-images-max-idle-time": 1500,
		"load-deferred-images-keep-zoom-level": false,
		"load-deferred-images-before-frames": false,
		"max-parallel-workers": 8,
		"max-resource-size-enabled": false,
		"max-resource-size": 10,
		"move-styles-in-head": false,
		"output-directory": "",
		"password": "",
		"remove-hidden-elements": true,
		"remove-unused-styles": true,
		"remove-unused-fonts": true,
		"remove-saved-date": false,
		"remove-frames": false,
		"block-scripts": true,
		"block-audios": true,
		"block-videos": true,
		"remove-alternative-fonts": true,
		"remove-alternative-medias": true,
		"remove-alternative-images": true,
		"save-original-urls": false,
		"save-raw-page": false,
		"web-driver-executable-path": "",
		"user-script-enabled": true,
		"include-BOM": false,
		"crawl-links": false,
		"crawl-inner-links-only": true,
		"crawl-remove-url-fragment": true,
		"crawl-max-depth": 1,
		"crawl-external-links-max-depth": 1,
		"crawl-replace-urls": false,
		"crawl-rewrite-rule": [],
		"insert-text-body": false,
		"create-root-directory": false,
		"self-extracting-archive": true,
		"extract-data-from-page": true,
		"prevent-appended-data": false
	})
	// Option declarations: each option gets a description followed by its type
	// (or its list of accepted values).
	.options("back-end", { description: "Back-end to use" })
	.choices("back-end", ["jsdom", "puppeteer", "webdriver-chromium", "webdriver-gecko", "puppeteer-firefox", "playwright-firefox", "playwright-chromium", "playwright-webkit"])
	.options("block-audios", { description: "Block audios" })
	.boolean("block-audios")
	.options("block-fonts", { description: "Block fonts" })
	.boolean("block-fonts")
	.options("block-images", { description: "Block images" })
	.boolean("block-images")
	.options("block-scripts", { description: "Block scripts" })
	.boolean("block-scripts")
	.options("block-videos", { description: "Block videos" })
	.boolean("block-videos")
	.options("block-mixed-content", { description: "Block mixed contents" })
	.boolean("block-mixed-content")
	.options("browser-server", { description: "Server to connect to (puppeteer only for now)" })
	.string("browser-server")
	.options("browser-headless", { description: "Run the browser in headless mode (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.boolean("browser-headless")
	.options("browser-executable-path", { description: "Path to chrome/chromium executable (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.string("browser-executable-path")
	.options("browser-width", { description: "Width of the browser viewport in pixels" })
	.number("browser-width")
	.options("browser-height", { description: "Height of the browser viewport in pixels" })
	.number("browser-height")
	.options("browser-load-max-time", { description: "Maximum delay of time to wait for page loading in ms (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.number("browser-load-max-time")
	.options("browser-wait-delay", { description: "Time to wait before capturing the page in ms" })
	.number("browser-wait-delay")
	.options("browser-wait-until", { description: "When to consider the page is loaded (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.choices("browser-wait-until", ["networkidle0", "networkidle2", "load", "domcontentloaded"])
	.options("browser-wait-until-fallback", { description: "Retry with the next value of --browser-wait-until when a timeout error is thrown" })
	.boolean("browser-wait-until-fallback")
	.options("browser-debug", { description: "Enable debug mode (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.boolean("browser-debug")
	.options("browser-script", { description: "Path of a script executed in the page (and all the frames) before it is loaded" })
	.array("browser-script")
	.options("browser-stylesheet", { description: "Path of a stylesheet file inserted into the page (and all the frames) after it is loaded" })
	.array("browser-stylesheet")
	.options("browser-args", { description: "Arguments provided as a JSON array and passed to the browser (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.string("browser-args")
	.options("browser-start-minimized", { description: "Minimize the browser (puppeteer)" })
	.boolean("browser-start-minimized")
	.options("browser-cookie", { description: "Ordered list of cookie parameters separated by a comma: name,value,domain,path,expires,httpOnly,secure,sameSite,url (puppeteer, webdriver-gecko, webdriver-chromium, jsdom)" })
	.array("browser-cookie")
	.options("browser-cookies-file", { description: "Path of the cookies file formatted as a JSON file or a Netscape text file (puppeteer, webdriver-gecko, webdriver-chromium, jsdom)" })
	.string("browser-cookies-file")
	.options("browser-ignore-insecure-certs", { description: "Ignore HTTPs errors" })
	.boolean("browser-ignore-insecure-certs")
	.options("browser-freeze-prototypes", { description: "Freeze prototypes of built-in objects in the page" })
	.boolean("browser-freeze-prototypes")
	.options("compress-content", { description: "Compress the output file into a ZIP file" })
	.boolean("compress-content")
	.options("compress-CSS", { description: "Compress CSS stylesheets" })
	.boolean("compress-CSS")
	.options("compress-HTML", { description: "Compress HTML content" })
	.boolean("compress-HTML")
	.options("crawl-links", { description: "Crawl and save pages found via inner links" })
	.boolean("crawl-links")
	.options("crawl-inner-links-only", { description: "Crawl pages found via inner links only if they are hosted on the same domain" })
	.boolean("crawl-inner-links-only")
	.options("crawl-no-parent", { description: "Crawl pages found via inner links only if their URLs are not parent of the URL to crawl" })
	.boolean("crawl-no-parent")
	.options("crawl-load-session", { description: "Name of the file of the session to load (previously saved with --crawl-save-session or --crawl-sync-session)" })
	.string("crawl-load-session")
	.options("crawl-remove-url-fragment", { description: "Remove URL fragments found in links" })
	.boolean("crawl-remove-url-fragment")
	.options("crawl-save-session", { description: "Name of the file where to save the state of the session" })
	.string("crawl-save-session")
	.options("crawl-sync-session", { description: "Name of the file where to load and save the state of the session" })
	.string("crawl-sync-session")
	.options("crawl-max-depth", { description: "Max depth when crawling pages found in internal and external links (0: infinite)" })
	.number("crawl-max-depth")
	.options("crawl-external-links-max-depth", { description: "Max depth when crawling pages found in external links (0: infinite)" })
	.number("crawl-external-links-max-depth")
	.options("crawl-replace-urls", { description: "Replace URLs of saved pages with relative paths of saved pages on the filesystem" })
	.boolean("crawl-replace-urls")
	.options("crawl-rewrite-rule", { description: "Rewrite rule used to rewrite URLs of crawled pages" })
	.array("crawl-rewrite-rule")
	.options("dump-content", { description: "Dump the content of the processed page in the console ('true' when running in Docker)" })
	.boolean("dump-content")
	.options("emulate-media-feature", { description: "Emulate a media feature. The syntax is <name>:<value>, e.g. \"prefers-color-scheme:dark\" (puppeteer)" })
	.array("emulate-media-feature")
	// Declared without a description.
	.options("error-file")
	.string("error-file")
	.options("filename-template", { description: "Template used to generate the output filename (see help page of the extension for more info)" })
	.string("filename-template")
	.options("filename-conflict-action", { description: "Action when the filename is conflicting with existing one on the filesystem. The possible values are \"uniquify\" (default), \"overwrite\" and \"skip\"" })
	.string("filename-conflict-action")
	.options("filename-replacement-character", { description: "The character used for replacing invalid characters in filenames" })
	.string("filename-replacement-character")
	.options("filename-max-length", { description: "Specify the maximum length of the filename" })
	.number("filename-max-length")
	.options("filename-max-length-unit", { description: "Specify the unit of the maximum length of the filename ('bytes' or 'chars')" })
	.string("filename-max-length-unit")
	.options("replace-emojis-in-filename", { description: "Replace emojis in the filename with their unicode text representation" })
	.boolean("replace-emojis-in-filename")
	.options("group-duplicate-images", { description: "Group duplicate images into CSS custom properties" })
	.boolean("group-duplicate-images")
	.options("max-size-duplicate-images", { description: "Maximum size in bytes of duplicate images stored as CSS custom properties" })
	.number("max-size-duplicate-images")
	.options("http-header", { description: "Extra HTTP header (puppeteer, jsdom)" })
	.array("http-header")
	.options("http-proxy-server", { description: "Proxy address (puppeteer)" })
	.string("http-proxy-server")
	.options("http-proxy-username", { description: "HTTP username (puppeteer)" })
	.string("http-proxy-username")
	.options("http-proxy-password", { description: "HTTP password (puppeteer)" })
	.string("http-proxy-password")
	.options("include-BOM", { description: "Include the UTF-8 BOM into the HTML page" })
	.boolean("include-BOM")
	.options("include-infobar", { description: "Include the infobar" })
	.boolean("include-infobar")
	.options("insert-meta-csp", { description: "Include a <meta> tag with a CSP to avoid potential requests to internet when viewing a page" })
	.boolean("insert-meta-csp")
	.options("load-deferred-images", { description: "Load deferred (a.k.a. lazy-loaded) images (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.boolean("load-deferred-images")
	.options("load-deferred-images-dispatch-scroll-event", { description: "Dispatch 'scroll' event when loading deferred images" })
	.boolean("load-deferred-images-dispatch-scroll-event")
	.options("load-deferred-images-max-idle-time", { description: "Maximum delay of time to wait for deferred images in ms (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.number("load-deferred-images-max-idle-time")
	.options("load-deferred-images-keep-zoom-level", { description: "Load deferred images by keeping zoomed out the page" })
	.boolean("load-deferred-images-keep-zoom-level")
	.options("load-deferred-images-before-frames", { description: "Load deferred frames before saving frame contents" })
	.boolean("load-deferred-images-before-frames")
	.options("max-parallel-workers", { description: "Maximum number of browsers launched in parallel when processing a list of URLs (cf --urls-file)" })
	.number("max-parallel-workers")
	.options("max-resource-size-enabled", { description: "Enable removal of embedded resources exceeding a given size" })
	.boolean("max-resource-size-enabled")
	.options("max-resource-size", { description: "Maximum size of embedded resources in MB (i.e. images, stylesheets, scripts and iframes)" })
	.number("max-resource-size")
	.options("move-styles-in-head", { description: "Move style elements outside the head element into the head element" })
	.boolean("move-styles-in-head")
	.options("password", { description: "Password of the zip file" })
	.string("password")
	.options("remove-frames", { description: "Remove frames (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.boolean("remove-frames")
	.options("remove-hidden-elements", { description: "Remove HTML elements which are not displayed" })
	.boolean("remove-hidden-elements")
	.options("remove-unused-styles", { description: "Remove unused CSS rules and unneeded declarations" })
	.boolean("remove-unused-styles")
	.options("remove-unused-fonts", { description: "Remove unused CSS font rules" })
	.boolean("remove-unused-fonts")
	.options("remove-saved-date", { description: "Remove saved date metadata in HTML header" })
	.boolean("remove-saved-date")
	// NOTE(review): "block-scripts", "block-audios" and "block-videos" are
	// declared a second time here with different descriptions; presumably the
	// later description wins in the help output. Confirm before removing
	// either set of declarations.
	.options("block-scripts", { description: "Block scripts" })
	.boolean("block-scripts")
	.options("block-audios", { description: "Block audio elements" })
	.boolean("block-audios")
	.options("block-videos", { description: "Block video elements" })
	.boolean("block-videos")
	.options("remove-alternative-fonts", { description: "Remove alternative fonts to the ones displayed" })
	.boolean("remove-alternative-fonts")
	.options("remove-alternative-medias", { description: "Remove alternative CSS stylesheets" })
	.boolean("remove-alternative-medias")
	.options("remove-alternative-images", { description: "Remove images for alternative sizes of screen" })
	.boolean("remove-alternative-images")
	.options("save-original-urls", { description: "Save the original URLS in the embedded contents" })
	.boolean("save-original-urls")
	.options("save-raw-page", { description: "Save the original page without interpreting it into the browser (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.boolean("save-raw-page")
	.options("urls-file", { description: "Path to a text file containing a list of URLs (separated by a newline) to save" })
	.string("urls-file")
	.options("user-agent", { description: "User-agent of the browser (puppeteer, webdriver-gecko, webdriver-chromium)" })
	.string("user-agent")
	.options("user-script-enabled", { description: "Enable the event API allowing to execute scripts before the page is saved" })
	.boolean("user-script-enabled")
	.options("web-driver-executable-path", { description: "Path to Selenium WebDriver executable (webdriver-gecko, webdriver-chromium)" })
	.string("web-driver-executable-path")
	.options("self-extracting-archive", { description: "Create a self extracting HTML file" })
	.boolean("self-extracting-archive")
	.options("insert-text-body", { description: "Insert the text of the page into the self-extracting HTML file" })
	.boolean("insert-text-body")
	.options("create-root-directory", { description: "Create a root directory based on the timestamp" })
	.boolean("create-root-directory")
	.options("extract-data-from-page", { description: "Extract compressed data from the page instead of fetching the page in order to create universal self-extracting HTML files" })
	.boolean("extract-data-from-page")
	.options("prevent-appended-data", { description: "Prevent appending data after the compressed data when creating self-extracting HTML files" })
	.boolean("prevent-appended-data")
	.options("output-directory", { description: "Path to where to save files, this path must exist." })
	.string("output-directory")
	.argv;
// Force the `backgroundSave` flag on.
args.backgroundSave = true;

// Expose options whose names contain an acronym under the acronym-cased
// property name, copied from the plain camel-cased key read on `args`.
const acronymAliases = [
	["compressCSS", "compressCss"],
	["compressHTML", "compressHtml"],
	["includeBOM", "includeBom"],
	["crawlReplaceURLs", "crawlReplaceUrls"],
	["crawlRemoveURLFragment", "crawlRemoveUrlFragment"],
	["insertMetaCSP", "insertMetaCsp"],
	["saveOriginalURLs", "saveOriginalUrls"]
];
for (const [aliasName, sourceName] of acronymAliases) {
	args[aliasName] = args[sourceName];
}

// A truthy `remove*` flag switches the corresponding `block*` option on.
const removeToBlockFlags = [
	["removeScripts", "blockScripts"],
	["removeAudioSrc", "blockAudios"],
	["removeVideoSrc", "blockVideos"]
];
for (const [removeFlag, blockFlag] of removeToBlockFlags) {
	if (args[removeFlag]) {
		args[blockFlag] = true;
	}
}
// Turn the list of "--http-header" values ("<name>:<value>" strings) into the
// `args.httpHeaders` object and drop the raw `args.httpHeader` array.
const headers = args.httpHeader;
delete args.httpHeader;
args.httpHeaders = {};
headers.forEach(header => {
	// String() guards against entries that are not strings (presumably possible
	// when the argument parser coerces a numeric-looking value).
	const matchedHeader = String(header).match(/^(.*?):(.*)$/);
	// `match` returns null when the entry contains no ":" separator; such
	// entries are skipped instead of raising a TypeError.
	if (matchedHeader) {
		args.httpHeaders[matchedHeader[1].trim()] = matchedHeader[2].trimStart();
	}
});
// Turn the list of "--browser-cookie" values into the `args.browserCookies`
// array of cookie objects and drop the raw `args.browserCookie` array.
// Each value is an ordered, comma-separated list of fields:
// name,value,domain,path,expires,httpOnly,secure,sameSite,url
const cookies = args.browserCookie;
delete args.browserCookie;
args.browserCookies = cookies.map(cookieValue => {
	// Split on commas that are not preceded by a backslash. String() guards
	// against entries that are not strings (presumably possible when the
	// argument parser coerces a numeric-looking value).
	const [name, value, domain, path, expires, httpOnly, secure, sameSite, url] = String(cookieValue).split(/(?<!\\),/);
	return {
		name,
		value,
		// Missing or empty optional fields are reported as `undefined`.
		domain: domain || undefined,
		path: path || undefined,
		expires: expires && Number(expires) || undefined,
		// Boolean fields are `true` only for the literal "true", else `undefined`.
		httpOnly: httpOnly === "true" || undefined,
		// Reads the 7th field; the previous code tested the httpOnly field here.
		secure: secure === "true" || undefined,
		sameSite: sameSite || undefined,
		url: url || undefined
	};
});
// Rename the repeatable options from their singular key to the plural key,
// removing the singular key once its value has been copied.
const pluralizedOptions = [
	["browserScripts", "browserScript"],
	["browserStylesheets", "browserStylesheet"],
	["crawlRewriteRules", "crawlRewriteRule"]
];
for (const [pluralName, singularName] of pluralizedOptions) {
	args[pluralName] = args[singularName];
	delete args[singularName];
}
// Turn the list of "--emulate-media-feature" values ("<name>:<value>" strings)
// into the `args.emulateMediaFeatures` array of `{ name, value }` objects and
// drop the raw `args.emulateMediaFeature` array.
args.emulateMediaFeatures = args.emulateMediaFeature
	// String() guards against entries that are not strings (presumably possible
	// when the argument parser coerces a numeric-looking value).
	.map(value => String(value).match(/^([^:]+):(.*)$/))
	// `match` returns null when an entry has no "<name>:" prefix; such entries
	// are dropped instead of raising a TypeError.
	.filter(Boolean)
	.map(([, featureName, featureValue]) => ({ name: featureName.trim(), value: featureValue.trim() }));
delete args.emulateMediaFeature;
// Remove every key containing a hyphen, then the "$0" and "_" entries,
// before exporting the resulting options object.
for (const optionName of Object.keys(args)) {
	if (optionName.includes("-")) {
		delete args[optionName];
	}
}
delete args.$0;
delete args._;
module.exports = args;