diff --git a/README.md b/README.md index 45252fa..df17ff6 100644 --- a/README.md +++ b/README.md @@ -576,6 +576,47 @@ But _function expressions_ are of course not statements. It’s difficult to tel Luckily, none of these edge cases are likely to occur in real code. +### Known failures + +js-tokens advertises that it “never fails”. However, it _can_ fail on extreme inputs. The regex engine of the runtime can eventually give up. js-tokens has worked around it to some extent by changing its regexes to be easier on the regex engine. To solve this completely, js-tokens would have to stop using regex, but then it wouldn’t be _tiny_ anymore, which is the whole point. Luckily, only extreme inputs can fail, hopefully ones you’ll never encounter. + +For example, if you try to tokenize the string literal `"\n\n\n"` but with 10 million `\n` instead of just 3, the regex engine gives up with `RangeError: Maximum call stack size exceeded` (or similar). Try it out: + +```js +Array.from(require("js-tokens")(`"${"\\n".repeat(1e7)}"`)); +``` + +(Yes, that is the _regex engine_ of the runtime giving up. js-tokens has no recursive functions.) + +However, if you repeat `a` instead of `\n` 10 million times (`"aaaaaa…"`), it works: + +```js +Array.from(require("js-tokens")(`"${"a".repeat(1e7)}"`)); +``` + +That’s good, because it’s much more common to have lots of non-escapes in a row in a big string literal than to have mostly escapes. (Obfuscated code might have _only_ escapes though.) + +#### Safari warning + +I’ve seen Safari _give up_ instead of throwing an error. + +In Safari, Chrome, Firefox and Node.js the following code successfully results in a match: + +```js +/(#)(?:a|b)+/.exec("#" + "a".repeat(1e5)); +``` + +But for the following code (with `1e7` instead of `1e5`), the runtimes differ: + +```js +/(#)(?:a|b)+/.exec("#" + "a".repeat(1e7)); +``` + +- Chrome, Firefox and Node.js all throw `RangeError: Maximum call stack size exceeded` (or similar). 
+- Safari returns `null` (at the time of writing), silently giving up on matching the regex. In effect, it falsely reports that the regex did not match, while in reality it would have matched given enough computing resources. + +This means that in Safari, js-tokens might not fail but instead give you unexpected tokens. + ## Performance With [@babel/parser] for comparison. Node.js 18.13.0 on a MacBook Pro M1 (Ventura). diff --git a/index.coffee b/index.coffee index e549334..865ab20 100644 --- a/index.coffee +++ b/index.coffee @@ -12,13 +12,13 @@ RegularExpressionLiteral = /// (?: \[ (?: - (?![ \] \\ ]). + [^ \] \\ \n \r \u2028 \u2029 ]+ | \\. )* \] | - (?![ / \\ ]). + [^ / \\ \n \r \u2028 \u2029 ]+ | \\. )* @@ -61,7 +61,7 @@ Identifier = /// (\x23?) (?=[ $ _ \p{ID_Start} \\ ]) (?: - [ $ _ \u200C \u200D \p{ID_Continue} ] + [ $ _ \u200C \u200D \p{ID_Continue} ]+ | \\u[ \d a-f A-F ]{4} | @@ -72,7 +72,9 @@ Identifier = /// StringLiteral = /// ([ ' " ]) (?: - (?! \1 )[^ \\ \n \r ] + [^ ' " \\ \n \r ]+ + | + (?! \1 )[ ' " ] | \\(?: \r\n | [^] ) )* @@ -112,7 +114,7 @@ NumericLiteral = /// Template = /// [ ` } ] (?: - [^ ` \\ $ ] + [^ ` \\ $ ]+ | \\[^] | @@ -134,7 +136,7 @@ LineTerminatorSequence = /// MultiLineComment = /// /\* (?: - [^*] + [^*]+ | \*(?!/) )* @@ -159,7 +161,9 @@ JSXIdentifier = /// JSXString = /// ([ ' " ]) (?: - (?! \1 )[^] + [^ ' "]+ + | + (?! \1 )[ ' " ] )* (\1)? ///y diff --git a/test/very-long-tokens.test.js b/test/very-long-tokens.test.js new file mode 100644 index 0000000..f28889e --- /dev/null +++ b/test/very-long-tokens.test.js @@ -0,0 +1,163 @@ +"use strict"; + +const jsTokens = require("../build/index"); + +function run(input) { + const types = Array.from(jsTokens(input), (token) => token.type); + expect(types).toHaveLength(1); + return types[0]; +} + +const LARGE = 1e7; + +// See https://github.com/lydell/js-tokens/issues/42 +// The regex engine can throw `Maximum call stack size exceeded` when +// the input is too long for certain regex features. 
At the time of writing, +// `(?:a|b)+` threw an error, while `[ab]+` did not. js-tokens uses alternation +// a lot to match things like “ordinary content OR escape”. The workaround is to +// add an unnecessary-looking `+` _inside_ the alternation (for “ordinary content”) +// to optimize the common case. + +describe("Very long tokens", () => { + describe("RegularExpressionLiteral", () => { + test("basic", () => { + expect(run(`/${"a".repeat(LARGE)}/`)).toBe("RegularExpressionLiteral"); + }); + + test("character class", () => { + expect(run(`/[${"a".repeat(LARGE)}]/`)).toBe("RegularExpressionLiteral"); + }); + + test("flags", () => { + expect(run(`/a/${"g".repeat(LARGE)}`)).toBe("RegularExpressionLiteral"); + }); + }); + + test("IdentifierName", () => { + expect(run("a".repeat(LARGE))).toBe("IdentifierName"); + }); + + test("PrivateIdentifier", () => { + expect(run(`#${"a".repeat(LARGE)}`)).toBe("PrivateIdentifier"); + }); + + describe("StringLiteral", () => { + test("single quote", () => { + expect(run(`'${"a".repeat(LARGE)}'`)).toBe("StringLiteral"); + }); + + test("double quote", () => { + expect(run(`"${"a".repeat(LARGE)}"`)).toBe("StringLiteral"); + }); + + test("string with both large repetitions and escapes", () => { + const content = `\\"${"a".repeat(LARGE)}\\\\'\\"\\n`.repeat(10); + expect(run(`"${content}"`)).toBe("StringLiteral"); + }); + + test("a string with a very large number of lines with \\n escapes", () => { + // Using `LARGE` results in `RangeError: Invalid string length` here. + const content = `${"a".repeat(100)}\\n`.repeat(1e6); + expect(run(`"${content}"`)).toBe("StringLiteral"); + }); + + test("a string with a very large number of lines with actual escaped newlines", () => { + // Using `LARGE` results in `RangeError: Invalid string length` here. 
+ const content = `${"a".repeat(100)}\\\n`.repeat(1e6); + expect(run(`"${content}"`)).toBe("StringLiteral"); + }); + }); + + test("NumericLiteral", () => { + // We don’t support extremely long literals for `NumericLiteral`, because + // that regex is already complicated enough and no real (even generated) + // code should end up with such long literals, since JavaScript does not + // have that amount of number precision anyway. + // `eval(`2${"0".repeat(308)}`)` gives `Infinity`, and that’s not even close + // to getting a `Maximum call stack size exceeded`. And you can’t have that + // many decimals either. + // eslint-disable-next-line no-loss-of-precision + expect(2e308).toBe(Infinity); + expect(run(`2${"0".repeat(308)}`)).toBe("NumericLiteral"); + expect(() => + run(`${"1".repeat(LARGE)}`) + ).toThrowErrorMatchingInlineSnapshot(`"Maximum call stack size exceeded"`); + }); + + describe("Template", () => { + test("NoSubstitutionTemplate", () => { + expect(run(`\`${"a".repeat(LARGE)}\``)).toBe("NoSubstitutionTemplate"); + }); + + test("TemplateHead + TemplateMiddle + TemplateTail", () => { + expect( + Array.from( + jsTokens( + `\`${"a".repeat(LARGE)}\${0}${"a".repeat(LARGE)}\${0}${"a".repeat( + LARGE + )}\`` + ), + + (token) => token.type + ) + ).toMatchInlineSnapshot(` + [ + "TemplateHead", + "NumericLiteral", + "TemplateMiddle", + "NumericLiteral", + "TemplateTail", + ] + `); + }); + }); + + test("WhiteSpace", () => { + expect(run(" ".repeat(LARGE))).toBe("WhiteSpace"); + }); + + test("MultiLineComment", () => { + expect(run(`/*${"a".repeat(LARGE)}*/`)).toBe("MultiLineComment"); + }); + + test("SingleLineComment", () => { + expect(run(`//${"a".repeat(LARGE)}`)).toBe("SingleLineComment"); + }); + + test("JSX", () => { + expect( + Array.from( + jsTokens( + `<${"a".repeat(LARGE)} ${"a".repeat(LARGE)}="${"a".repeat( + LARGE + )}">${"a".repeat(LARGE)}`, + { jsx: true } + ), + (token) => token.type + ) + ).toMatchInlineSnapshot(` + [ + "JSXPunctuator", + 
"JSXIdentifier", + "WhiteSpace", + "JSXIdentifier", + "JSXPunctuator", + "JSXString", + "JSXPunctuator", + "JSXText", + ] + `); + }); +}); + +describe("README.md examples", () => { + test("success", () => { + expect(run(`"${"a".repeat(LARGE)}"`)).toBe("StringLiteral"); + }); + + test("failure", () => { + expect(() => + run(`"${"\\n".repeat(LARGE)}"`) + ).toThrowErrorMatchingInlineSnapshot(`"Maximum call stack size exceeded"`); + }); +});