Removed Lookbehinds

The beginning and end delimiters for EM and Strong must now be searched for with separate regexes in order to work without lookbehinds. This invalidates the regex that skipped over blocks (code, HTML, etc.) that take precedence over EM or Strong blocks. To get around this, we must now mask not only reflinks but also all enclosed blocks that were previously just skipped over inside the regex. This commit also adds one check for an overlapping Strong block when testing EM; with that check in place, the parser now passes CommonMark examples 390 and 471.
markedjs · Jul 8, 2020 · 211b9f9 · 211b9f9
1 parent bd4f8c4
commit 211b9f9
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 20 deletions.
diff --git a/src/Lexer.js b/src/Lexer.js
@@ -324,17 +324,23 @@ module.exports = class Lexer {
 
     // String with links masked to avoid interference with em and strong
     let maskedSrc = src;
+    let match;
+
+    // Mask out reflinks
     if (this.tokens.links) {
       const links = Object.keys(this.tokens.links);
       if (links.length > 0) {
-        let match;
         while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
           if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
             maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
           }
         }
       }
     }
+    // Mask out other blocks
+    while ((match = this.tokenizer.rules.inline.emSkip.exec(maskedSrc)) != null) {
+      maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.emSkip.lastIndex);
+    }
 
     while (src) {
       // escape

diff --git a/src/Tokenizer.js b/src/Tokenizer.js
@@ -490,11 +490,25 @@ module.exports = class Tokenizer {
   }
 
   strong(src, maskedSrc, prevChar = '') {
-    let cap = this.rules.inline.preStrong.exec(src);
+    let match = this.rules.inline.strStart.exec(src);
 
-    if (cap) {
+    if (match) {
       maskedSrc = maskedSrc.slice(-1 * src.length);
-      cap = this.rules.inline.strong.exec(maskedSrc);
+      let strEnd;
+
+      if(match[0] == "**")
+        strEnd = this.rules.inline.strEndAst;
+      else
+        strEnd = this.rules.inline.strEndUnd;
+
+      strEnd.lastIndex = 0;
+
+      let cap;
+      while ((match = strEnd.exec(maskedSrc)) != null) {
+        cap = this.rules.inline.strong.exec(maskedSrc.slice(0,match.index+3));
+        if (cap)
+          break;
+      }
 
       if (cap) {
         if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {
@@ -509,11 +523,25 @@ module.exports = class Tokenizer {
   }
 
   em(src, maskedSrc, prevChar = '') {
-    let cap = this.rules.inline.preEm.exec(src);
+    let match = this.rules.inline.emStart.exec(src);
 
-    if (cap) {
+    if (match) {
       maskedSrc = maskedSrc.slice(-1 * src.length);
-      cap = this.rules.inline.em.exec(maskedSrc);
+      let emEnd;
+
+      if(match[0] == "*")
+        emEnd = this.rules.inline.emEndAst;
+      else
+        emEnd = this.rules.inline.emEndUnd;
+
+      emEnd.lastIndex = 0;
+
+      let cap;
+      while ((match = emEnd.exec(maskedSrc)) != null) {
+        cap = this.rules.inline.em.exec(maskedSrc.slice(0,match.index+2));
+        if (cap)
+          break;
+      }
 
       if (cap) {
         if (!cap[1] || (cap[1] && (prevChar === '' || this.rules.inline.punctuation.exec(prevChar)))) {

diff --git a/src/rules.js b/src/rules.js
@@ -169,11 +169,15 @@ const inline = {
   reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
   nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
   reflinkSearch: 'reflink|nolink(?!\\()',
-  preStrong: /^(?:\*\*|__)/,
-  strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)*?(?<!\\)\*){2})+?)(?:(?<![punctuation\s])\*\*(?!\*)|(?<=[punctuation])\*\*(?!\*)(?:(?=[punctuation\s]|$)))|^__(?![\s])((?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)|(?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)*?(?<!\\)_){2})+?)(?:(?<![\s])__(?!_)(?:(?=[punctuation\s])|$))/,
-  preEm: /^[*_]/,
+  strStart: /^\*\*|__/,
+  strEndAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation\s]|$))/,
+  strEndUnd: /[^\s]__(?!_)(?:(?=[punctuation\s])|$)/,
+  strong: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])((?:(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)|(?:(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)*?(?<!\\)\*){2})+?)\*\*$|^__(?![\s])((?:(?:(?!evSkip)(?:[^_]|\\_)|evSkip)|(?:(?:(?!evSkip)(?:[^_]|\\_)|evSkip)*?(?<!\\)_){2})+?)__$/,
+  emStart: /^[*_]/,
+  emEndAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation\s]|$))/,
+  emEndUnd: /[^\s]_(?!_)(?:(?=[punctuation\s])|$)/,
   // (1) returns if starts w/ punctuation  | (2)   ⬐Check groups to skip over ⬐ skip if needed ⬐repeat logic for inner *'s (must be in pairs)⬎     ⬐last char can't be punct OR final * must also be followed by punct (or endline)  | (3) Underscores ⬐Check groups to skip over ⬐skip if needed ⬐repeat logic for inner _'s (must be in pairs)⬎  ⬐last char can't be a space, and final _ must preceed punct or \s (or endline)
-  em: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)|(?:(?:(?!emSkip)(?:[^*]|[\\\s]\*)|emSkip)*?(?<!\\)\*){2})*?(?:(?<![punctuation\s])\*(?!\*)|(?<=[punctuation])\*(?!\*)(?:(?=[punctuation\s]|$)))|^_(?![_\s])((?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)|(?:(?:(?!emSkip)(?:[^_]|[\\\s]_)|emSkip)*?(?<!\\)_){2})*?)(?:(?<![\s])_(?!_)(?:(?=[punctuation\s])|$))/,
+  em: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])(?:(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)|\*(?:(?!evSkip)(?:[^*]|\\\*)|evSkip)*?\*)*?\*$|^_(?![_\s])(?:(?:(?!evSkip)(?:[^_]|\\_)|evSkip)|(?:(?:(?!evSkip)(?:[^_]|\\_)|evSkip)*?_){2})*?_$/,
   code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
   br: /^( {2,}|\\)\n(?!\s*$)/,
   del: noopTest,
@@ -188,17 +192,41 @@ inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._pu
 
 // sequences em should skip over [title](link), `code`, <html>
 inline._emSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
+inline._strSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
+inline._evSkip = '__[^_]*?__';
 
 inline.em = edit(inline.em)
   .replace(/punctuation/g, inline._punctuation)
-  .replace(/emSkip/g, inline._emSkip)
+  .replace(/evSkip/g, inline._evSkip)
+  .getRegex();
+
+inline.emEndAst = edit(inline.emEndAst, 'g')
+  .replace(/punctuation/g, inline._punctuation)
+  .getRegex();
+
+inline.emEndUnd = edit(inline.emEndUnd, 'g')
+  .replace(/punctuation/g, inline._punctuation)
+  .getRegex();
+
+inline.emSkip = edit(inline._emSkip, 'g')
+  .getRegex();
+
+inline.evSkip = edit(inline._evSkip, 'g')
   .getRegex();
 
 inline.strong = edit(inline.strong)
   .replace(/punctuation/g, inline._punctuation)
   .replace(/emSkip/g, inline._emSkip)
   .getRegex();
 
+inline.strEndAst = edit(inline.strEndAst, 'g')
+  .replace(/punctuation/g, inline._punctuation)
+  .getRegex();
+
+inline.strEndUnd = edit(inline.strEndUnd, 'g')
+  .replace(/punctuation/g, inline._punctuation)
+  .getRegex();
+
 inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
 
 inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/;

diff --git a/test/specs/commonmark/commonmark.0.29.json b/test/specs/commonmark/commonmark.0.29.json
@@ -3160,8 +3160,7 @@
     "example": 390,
     "start_line": 6672,
     "end_line": 6676,
-    "section": "Emphasis and strong emphasis",
-    "shouldFail": true
+    "section": "Emphasis and strong emphasis"
   },
   {
     "markdown": "**(**foo)\n",
@@ -3828,8 +3827,7 @@
     "example": 471,
     "start_line": 7355,
     "end_line": 7359,
-    "section": "Emphasis and strong emphasis",
-    "shouldFail": true
+    "section": "Emphasis and strong emphasis"
   },
   {
     "markdown": "*[bar*](/url)\n",

diff --git a/test/specs/gfm/commonmark.0.29.json b/test/specs/gfm/commonmark.0.29.json
@@ -3160,8 +3160,7 @@
     "example": 390,
     "start_line": 6672,
     "end_line": 6676,
-    "section": "Emphasis and strong emphasis",
-    "shouldFail": true
+    "section": "Emphasis and strong emphasis"
   },
   {
     "markdown": "**(**foo)\n",
@@ -3828,8 +3827,7 @@
     "example": 471,
     "start_line": 7355,
     "end_line": 7359,
-    "section": "Emphasis and strong emphasis",
-    "shouldFail": true
+    "section": "Emphasis and strong emphasis"
   },
   {
     "markdown": "*[bar*](/url)\n",