lydell · lydell · Feb 3, 2024 · Jan 28, 2024 · Jan 28, 2024 · Jan 28, 2024
diff --git a/README.md b/README.md
@@ -576,6 +576,47 @@ But _function expressions_ are of course not statements. It’s difficult to tel
 
 Luckily, none of these edge cases are likely to occur in real code.
 
+### Known failures
+
+js-tokens advertises that it “never fails”. Tell you what, it _can_ fail on extreme inputs. The regex engine of the runtime can eventually give up. js-tokens has worked around it to some extent by changing its regexes to be easier on the regex engine. To solve this completely, js-tokens would have to stop using regex, but then it wouldn’t be _tiny_ anymore, which is the whole point. Luckily, only extreme inputs can fail, hopefully ones you’ll never encounter.
+
+For example, if you try to parse the string literal `"\n\n\n"` but with 10 million `\n` instead of just 3, the regex engine gives up with `RangeError: Maximum call stack size exceeded` (or similar). Try it out:
+
+```js
+Array.from(require("js-tokens")(`"${"\\n".repeat(1e7)}"`));
+```
+
+(Yes, that is the _regex engine_ of the runtime giving up. js-tokens has no recursive functions.)
+
+However, if you repeat `a` instead of `\n` 10 million times (`"aaaaaa…"`), it works:
+
+```js
+Array.from(require("js-tokens")(`"${"a".repeat(1e7)}"`));
+```
+
+That’s good, because it’s much more common to have lots of non-escapes in a row in a big string literal than to have mostly escapes. (Obfuscated code might have _only_ escapes though.)
+
+#### Safari warning
+
+I’ve seen Safari _give up_ instead of throwing an error.
+
+In Safari, Chrome, Firefox and Node.js the following code successfully results in a match:
+
+```js
+/(#)(?:a|b)+/.exec("#" + "a".repeat(1e5));
+```
+
+But for the following code (with `1e7` instead of `1e5`), the runtimes differ:
+
+```js
+/(#)(?:a|b)+/.exec("#" + "a".repeat(1e7));
+```
+
+- Chrome, Firefox and Node.js all throw `RangeError: Maximum call stack size exceeded` (or similar).
+- Safari returns `null` (at the time of writing), silently giving up on matching the regex. It’s kind of lying by claiming that the regex did not match, while in reality it would have matched given enough computing resources.
+
+This means that in Safari, js-tokens might not fail but instead give you unexpected tokens.
+
 ## Performance
 
 With [@babel/parser] for comparison. Node.js 18.13.0 on a MacBook Pro M1 (Ventura).

diff --git a/index.coffee b/index.coffee
@@ -12,13 +12,13 @@ RegularExpressionLiteral = ///
  (?:
  \[
  (?:
- (?![ \] \\ ]).
+ [^ \] \\ \n \r \u2028 \u2029 ]+
  |
  \\.
  )*
  \]
  |
- (?![ / \\ ]).
+ [^ / \\ \n \r \u2028 \u2029 ]+
  |
  \\.
  )*
@@ -61,7 +61,7 @@ Identifier = ///
  (\x23?)
  (?=[ $ _ \p{ID_Start} \\ ])
  (?:
- [ $ _ \u200C \u200D \p{ID_Continue} ]
+ [ $ _ \u200C \u200D \p{ID_Continue} ]+
  |
  \\u[ \d a-f A-F ]{4}
  |
@@ -72,7 +72,9 @@ Identifier = ///
 StringLiteral = ///
  ([ ' " ])
  (?:
- (?! \1 )[^ \\ \n \r ]
+ [^ ' " \\ \n \r ]+
+ |
+ (?! \1 )[ ' " ]
  |
  \\(?: \r\n | [^] )
  )*
@@ -112,7 +114,7 @@ NumericLiteral = ///
 Template = ///
  [ ` } ]
  (?:
- [^ ` \\ $ ]
+ [^ ` \\ $ ]+
  |
  \\[^]
  |
@@ -134,7 +136,7 @@ LineTerminatorSequence = ///
 MultiLineComment = ///
  /\*
  (?:
- [^*]
+ [^*]+
  |
  \*(?!/)
  )*
@@ -159,7 +161,9 @@ JSXIdentifier = ///
 JSXString = ///
  ([ ' " ])
  (?:
- (?! \1 )[^]
+ [^ ' "]+ # a whole run of non-quote characters, consumed by a single quantified class
+ |
+ (?! \1 )[ ' " ] # a quote character that is not the one that opened the string
  )*
  (\1)?
 ///y

diff --git a/test/very-long-tokens.test.js b/test/very-long-tokens.test.js
@@ -0,0 +1,163 @@
+"use strict";
+
+const jsTokens = require("../build/index");
+
+function run(input) { // Tokenizes `input` and returns the type of its single token.
+ const types = Array.from(jsTokens(input), (token) => token.type);
+ expect(types).toHaveLength(1); // Asserts that the whole input was consumed as exactly one token.
+ return types[0];
+}
+
+const LARGE = 1e7;
+
+// See https://github.com/lydell/js-tokens/issues/42
+// The regex engine can throw `Maximum call stack size exceeded` when
+// the input is too long for certain regex features. At the time of writing,
+// `(?:a|b)+` threw an error, while `[ab]+` did not. js-tokens uses alternation
+// a lot to match things like “ordinary content OR escape”. The workaround is to
+// add an unnecessary-looking `+` _inside_ the alternation (for “ordinary content”)
+// to optimize the common case.
+
+describe("Very long tokens", () => { // Most cases build one token out of `LARGE` repeated characters.
+ describe("RegularExpressionLiteral", () => {
+ test("basic", () => {
+ expect(run(`/${"a".repeat(LARGE)}/`)).toBe("RegularExpressionLiteral");
+ });
+
+ test("character class", () => {
+ expect(run(`/[${"a".repeat(LARGE)}]/`)).toBe("RegularExpressionLiteral");
+ });
+
+ test("flags", () => {
+ expect(run(`/a/${"g".repeat(LARGE)}`)).toBe("RegularExpressionLiteral");
+ });
+ });
+
+ test("IdentifierName", () => {
+ expect(run("a".repeat(LARGE))).toBe("IdentifierName");
+ });
+
+ test("PrivateIdentifier", () => {
+ expect(run(`#${"a".repeat(LARGE)}`)).toBe("PrivateIdentifier");
+ });
+
+ describe("StringLiteral", () => {
+ test("single quote", () => {
+ expect(run(`'${"a".repeat(LARGE)}'`)).toBe("StringLiteral");
+ });
+
+ test("double quote", () => {
+ expect(run(`"${"a".repeat(LARGE)}"`)).toBe("StringLiteral");
+ });
+
+ test("string with both large repetitions and escapes", () => {
+ const content = `\\"${"a".repeat(LARGE)}\\\\'\\"\\n`.repeat(10); // Ten runs of `LARGE` ordinary characters, each surrounded by escapes and an unescaped single quote.
+ expect(run(`"${content}"`)).toBe("StringLiteral");
+ });
+
+ test("a string with a very large number of lines with \\n escapes", () => {
+ // Using `LARGE` results in `RangeError: Invalid string length` here.
+ const content = `${"a".repeat(100)}\\n`.repeat(1e6); // One million chunks of 100 characters, each followed by a backslash-n escape.
+ expect(run(`"${content}"`)).toBe("StringLiteral");
+ });
+
+ test("a string with a very large number of lines with actual escaped newlines", () => {
+ // Using `LARGE` results in `RangeError: Invalid string length` here.
+ const content = `${"a".repeat(100)}\\\n`.repeat(1e6); // Same shape, but each chunk ends in a backslash followed by a real newline.
+ expect(run(`"${content}"`)).toBe("StringLiteral");
+ });
+ });
+
+ test("NumericLiteral", () => {
+ // We don’t support extremely long literals for `NumericLiteral`, because
+ // that regex is already complicated enough and no real (even generated)
+ // code should end up with such long literals, since JavaScript does not
+ // have that amount of number precision anyway.
+ // `eval(`2${"0".repeat(308)}`)` gives `Infinity`, and that’s not even close
+ // to getting a `Maximum call stack size exceeded`. And you can’t have that
+ // many decimals either.
+ // eslint-disable-next-line no-loss-of-precision
+ expect(2e308).toBe(Infinity);
+ expect(run(`2${"0".repeat(308)}`)).toBe("NumericLiteral");
+ expect(() =>
+ run(`${"1".repeat(LARGE)}`)
+ ).toThrowErrorMatchingInlineSnapshot(`"Maximum call stack size exceeded"`);
+ });
+
+ describe("Template", () => {
+ test("NoSubstitutionTemplate", () => {
+ expect(run(`\`${"a".repeat(LARGE)}\``)).toBe("NoSubstitutionTemplate");
+ });
+
+ test("TemplateHead + TemplateMiddle + TemplateTail", () => {
+ expect(
+ Array.from(
+ jsTokens(
+ `\`${"a".repeat(LARGE)}\${0}${"a".repeat(LARGE)}\${0}${"a".repeat(
+ LARGE
+ )}\``
+ ),
+
+ (token) => token.type
+ )
+ ).toMatchInlineSnapshot(`
+ [
+ "TemplateHead",
+ "NumericLiteral",
+ "TemplateMiddle",
+ "NumericLiteral",
+ "TemplateTail",
+ ]
+ `);
+ });
+ });
+
+ test("WhiteSpace", () => {
+ expect(run(" ".repeat(LARGE))).toBe("WhiteSpace");
+ });
+
+ test("MultiLineComment", () => {
+ expect(run(`/*${"a".repeat(LARGE)}*/`)).toBe("MultiLineComment");
+ });
+
+ test("SingleLineComment", () => {
+ expect(run(`//${"a".repeat(LARGE)}`)).toBe("SingleLineComment");
+ });
+
+ test("JSX", () => {
+ expect(
+ Array.from(
+ jsTokens(
+ `<${"a".repeat(LARGE)} ${"a".repeat(LARGE)}="${"a".repeat(
+ LARGE
+ )}">${"a".repeat(LARGE)}`,
+ { jsx: true }
+ ),
+ (token) => token.type
+ )
+ ).toMatchInlineSnapshot(`
+ [
+ "JSXPunctuator",
+ "JSXIdentifier",
+ "WhiteSpace",
+ "JSXIdentifier",
+ "JSXPunctuator",
+ "JSXString",
+ "JSXPunctuator",
+ "JSXText",
+ ]
+ `);
+ });
+});
+
+describe("README.md examples", () => { // Exercises the same two inputs as the snippets in the README’s “Known failures” section.
+ test("success", () => {
+ expect(run(`"${"a".repeat(LARGE)}"`)).toBe("StringLiteral");
+ });
+
+ test("failure", () => {
+ expect(() =>
+ run(`"${"\\n".repeat(LARGE)}"`)
+ ).toThrowErrorMatchingInlineSnapshot(`"Maximum call stack size exceeded"`);
+ });
+});