From cee6b8475570c82cd0a485a7f30fba6aa4c69d3c Mon Sep 17 00:00:00 2001 From: iamahuman Date: Mon, 29 Apr 2019 16:37:21 +0900 Subject: [PATCH] Synchronize code with upstream commonmark.js 0.29 * Updated code to match commonmark/commonmark.js@5eebfd3868b812abb2ba5830ca0cc6c9b638add7 * Improved block dispatch performance by using a pre-cached dict instead of importlib * Add missing Unicode case folding for reference normalization * Fixed a bug where empty link labels were not properly recognized --- commonmark/blocks.py | 87 ++--- commonmark/common.py | 19 +- commonmark/inlines.py | 68 ++-- commonmark/node.py | 1 + commonmark/normalize_reference.py | 165 +++++++++ commonmark/render/html.py | 16 +- commonmark/tests/unit_tests.py | 28 +- commonmark/utils.py | 3 - spec.txt | 548 +++++++++++++++++++++++------- 9 files changed, 700 insertions(+), 235 deletions(-) create mode 100644 commonmark/normalize_reference.py delete mode 100644 commonmark/utils.py diff --git a/commonmark/blocks.py b/commonmark/blocks.py index fa236a8..aeec21d 100644 --- a/commonmark/blocks.py +++ b/commonmark/blocks.py @@ -1,12 +1,10 @@ from __future__ import absolute_import, unicode_literals import re -from importlib import import_module from commonmark import common from commonmark.common import unescape_string from commonmark.inlines import InlineParser from commonmark.node import Node -from commonmark.utils import to_camel_case CODE_INDENT = 4 @@ -21,7 +19,7 @@ r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|' r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|' r'fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|' - r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|' + r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|' r'nav|noframes|ol|optgroup|option|p|param|section|source|title|' r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)' r'(?:\s|[/]?[>]|$)', @@ -45,7 +43,7 @@ reBulletListMarker = re.compile(r'^[*+-]') 
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])') reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)') -reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}(?!.*~)') +reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}') reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)') reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$') reLineEnding = re.compile(r'\r\n|\n|\r') @@ -57,7 +55,7 @@ def is_blank(s): def is_space_or_tab(s): - return s == ' ' or s == '\t' + return s in (' ', '\t') def peek(ln, pos): @@ -73,9 +71,12 @@ def ends_with_blank_line(block): while block: if block.last_line_blank: return True - if (block.t == 'list' or block.t == 'item'): + if not block.last_line_checked and \ + block.t in ('list', 'item'): + block.last_line_checked = True block = block.last_child else: + block.last_line_checked = True break return False @@ -94,6 +95,8 @@ def parse_list_marker(parser, container): 'padding': None, 'marker_offset': parser.indent, } + if parser.indent >= 4: + return None m = re.search(reBulletListMarker, rest) m2 = re.search(reOrderedListMarker, rest) if m: @@ -515,15 +518,25 @@ def setext_heading(parser, container=None): parser.current_line[parser.next_nonspace:]) if m: parser.close_unmatched_blocks() - heading = Node('heading', container.sourcepos) - heading.level = 1 if m.group()[0] == '=' else 2 - heading.string_content = container.string_content - container.insert_after(heading) - container.unlink() - parser.tip = heading - parser.advance_offset( - len(parser.current_line) - parser.offset, False) - return 2 + # resolve reference link definitions + while peek(container.string_content, 0) == '[': + pos = parser.inline_parser.parseReference( + container.string_content, parser.refmap) + if not pos: + break + container.string_content = container.string_content[pos:] + if container.string_content: + heading = Node('heading', container.sourcepos) + heading.level = 1 if m.group()[0] == '=' else 2 + heading.string_content = container.string_content + 
container.insert_after(heading) + container.unlink() + parser.tip = heading + parser.advance_offset( + len(parser.current_line) - parser.offset, False) + return 2 + else: + return 0 return 0 @@ -610,13 +623,8 @@ def add_child(self, tag, offset): """ Add block of type tag as a child of the tip. If the tip can't accept children, close and finalize it and try its parent, and so on til we find a block that can accept children.""" - block_class = getattr(import_module('commonmark.blocks'), - to_camel_case(self.tip.t)) - while not block_class.can_contain(tag): + while not self.blocks[self.tip.t].can_contain(tag): self.finalize(self.tip, self.line_number - 1) - block_class = getattr( - import_module('commonmark.blocks'), - to_camel_case(self.tip.t)) column_number = offset + 1 new_block = Node(tag, [[self.line_number, column_number], [0, 0]]) @@ -725,15 +733,15 @@ def incorporate_line(self, ln): # For each containing block, try to parse the associated line start. # Bail out on failure: container will point to the last matching block. # Set all_matched to false if not all containers match. 
- last_child = container.last_child - while last_child and last_child.is_open: + while True: + last_child = container.last_child + if not (last_child and last_child.is_open): + break container = last_child self.find_next_nonspace() - block_class = getattr( - import_module('commonmark.blocks'), - to_camel_case(container.t)) - rv = block_class.continue_(self, container) + + rv = self.blocks[container.t].continue_(self, container) if rv == 0: # we've matched, keep going pass @@ -745,21 +753,19 @@ def incorporate_line(self, ln): self.last_line_length = len(ln) return else: - raise ValueError('returned illegal value, must be 0, 1, or 2') + raise ValueError( + 'continue_ returned illegal value, must be 0, 1, or 2') if not all_matched: # back up to last matching block container = container.parent break - last_child = container.last_child - self.all_closed = (container == self.oldtip) self.last_matched_container = container - block_class = getattr(import_module('commonmark.blocks'), - to_camel_case(container.t)) - matched_leaf = container.t != 'paragraph' and block_class.accepts_lines + matched_leaf = container.t != 'paragraph' and \ + self.blocks[container.t].accepts_lines starts = self.block_starts starts_len = len(starts.METHODS) # Unless last matched container is a code block, try new container @@ -824,9 +830,7 @@ def incorporate_line(self, ln): cont.last_line_blank = last_line_blank cont = cont.parent - block_class = getattr(import_module('commonmark.blocks'), - to_camel_case(t)) - if block_class.accepts_lines: + if self.blocks[t].accepts_lines: self.add_line() # if HtmlBlock, check for end condition if t == 'html_block' and \ @@ -853,9 +857,8 @@ def finalize(self, block, line_number): above = block.parent block.is_open = False block.sourcepos[1] = [line_number, self.last_line_length] - block_class = getattr(import_module('commonmark.blocks'), - to_camel_case(block.t)) - block_class.finalize(self, block) + + self.blocks[block.t].finalize(self, block) self.tip = above 
@@ -897,3 +900,9 @@ def parse(self, my_input): self.finalize(self.tip, length) self.process_inlines(self.doc) return self.doc + + +CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))") +Parser.blocks = dict( + (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls) + for cls in Block.__subclasses__()) diff --git a/commonmark/common.py b/commonmark/common.py index 1327869..b15a8b6 100644 --- a/commonmark/common.py +++ b/commonmark/common.py @@ -19,7 +19,7 @@ from commonmark import entitytrans HTMLunescape = entitytrans._unescape -ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});' +ENTITY = '&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});' TAGNAME = '[A-Za-z][A-Za-z0-9-]*' ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*' @@ -45,7 +45,6 @@ '\\\\' + ESCAPABLE + '|' + ENTITY, re.IGNORECASE) XMLSPECIAL = '[&<>"]' reXmlSpecial = re.compile(XMLSPECIAL) -reXmlSpecialOrEntity = re.compile(ENTITY + '|' + XMLSPECIAL, re.IGNORECASE) def unescape_char(s): @@ -102,19 +101,13 @@ def replace_unsafe_char(s): return UNSAFE_MAP.get(s, s) -def escape_xml(s, preserve_entities): +def escape_xml(s): if s is None: return '' if re.search(reXmlSpecial, s): - if preserve_entities: - return re.sub( - reXmlSpecialOrEntity, - lambda m: replace_unsafe_char(m.group()), - s) - else: - return re.sub( - reXmlSpecial, - lambda m: replace_unsafe_char(m.group()), - s) + return re.sub( + reXmlSpecial, + lambda m: replace_unsafe_char(m.group()), + s) else: return s diff --git a/commonmark/inlines.py b/commonmark/inlines.py index f2f66c7..88a84cf 100644 --- a/commonmark/inlines.py +++ b/commonmark/inlines.py @@ -5,6 +5,7 @@ from commonmark import common from commonmark.common import normalize_uri, unescape_string from commonmark.node import Node +from commonmark.normalize_reference import normalize_reference if sys.version_info >= (3, 0): if sys.version_info >= (3, 4): @@ -22,7 +23,7 @@ ESCAPED_CHAR = '\\\\' + common.ESCAPABLE rePunctuation = re.compile( - 
r'[!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB' + r'[!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB' r'\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3' r'\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F' r'\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E' @@ -54,10 +55,8 @@ '|' + '\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' + '|' + - '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\))') -reLinkDestinationBraces = re.compile( - '^(?:[<](?:[^ <>\\t\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' + - '\\\\)*[>])') + '\\((' + ESCAPED_CHAR + '|[^()\\x00])*\\))') +reLinkDestinationBraces = re.compile(r'^(?:<(?:[^<>\n\\\x00]|\\.)*>)') reEscapable = re.compile('^' + common.ESCAPABLE) reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE) @@ -79,21 +78,11 @@ reFinalSpace = re.compile(r' *$') reInitialSpace = re.compile(r'^ *') reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)') -reLinkLabel = re.compile('^\\[(?:[^\\\\\\[\\]]|' + ESCAPED_CHAR + - '|\\\\){0,1000}\\]') +reLinkLabel = re.compile(r'^\[(?:[^\\\[\]]|\\.){0,1000}\]') # Matches a string of non-special characters. reMain = re.compile(r'^[^\n`\[\]\\!<&*_\'"]+', re.MULTILINE) -def normalizeReference(s): - """Normalize reference label. - - Collapse internal whitespace to single space, remove - leading/trailing whitespace, case fold. 
- """ - return re.sub(r'\s+', ' ', s.strip()).upper() - - def text(s): node = Node('text', None) node.literal = s @@ -175,12 +164,14 @@ def parseBackticks(self, block): after_open_ticks = self.pos matched = self.match(reTicks) while matched is not None: - if (matched == ticks): + if matched == ticks: node = Node('code', None) - c = self.subject[after_open_ticks:self.pos - len(ticks)] - c = c.strip() - c = re.sub(reWhitespace, ' ', c) - node.literal = c + contents = self.subject[after_open_ticks:self.pos-len(ticks)] \ + .replace('\n', ' ') + if contents.lstrip(' ') and contents[0] == contents[-1] == ' ': + node.literal = contents[1:-1] + else: + node.literal = contents block.append_child(node) return True matched = self.match(reTicks) @@ -394,8 +385,9 @@ def processEmphasis(self, stack_bottom): opener != openers_bottom[closercc]): odd_match = (closer.get('can_open') or opener.get('can_close')) and \ - (opener.get('origdelims') + - closer.get('origdelims')) % 3 == 0 + closer['origdelims'] % 3 != 0 and \ + (opener['origdelims'] + + closer['origdelims']) % 3 == 0 if opener.get('cc') == closercc and \ opener.get('can_open') and \ not odd_match: @@ -502,12 +494,17 @@ def parseLinkDestination(self): """ res = self.match(reLinkDestinationBraces) if res is None: + if self.peek() == '<': + return None # TODO handrolled parser; res should be None or the string savepos = self.pos openparens = 0 - c = self.peek() - while c is not None: - if c == '\\': + while True: + c = self.peek() + if c is None: + break + if c == '\\' and re.search( + reEscapable, self.subject[self.pos+1:self.pos+2]): self.pos += 1 if self.peek() is not None: self.pos += 1 @@ -524,7 +521,8 @@ def parseLinkDestination(self): break else: self.pos += 1 - c = self.peek() + if self.pos == savepos and c != ')': + return None res = self.subject[savepos:self.pos] return normalize_uri(unescape_string(res)) else: @@ -539,7 +537,7 @@ def parseLinkLabel(self): # Note: our regex will allow something of form [..\]; # we 
disallow it here rather than using lookahead in the regex: m = self.match(reLinkLabel) - if m is None or len(m) > 1001 or re.search(r'([^\\]\\\]$|\[\n\]$)', m): + if m is None or len(m) > 1001: return 0 else: return len(m) @@ -647,7 +645,7 @@ def parseCloseBracket(self, block): if reflabel: # lookup rawlabel in refmap - link = self.refmap.get(normalizeReference(reflabel)) + link = self.refmap.get(normalize_reference(reflabel)) if link: dest = link['destination'] title = link['title'] @@ -779,13 +777,15 @@ def parseReference(self, s, refmap): self.spnl() dest = self.parseLinkDestination() - if (dest is None or len(dest) == 0): + if dest is None: self.pos = startpos return 0 beforetitle = self.pos self.spnl() - title = self.parseLinkTitle() + title = None + if self.pos != beforetitle: + title = self.parseLinkTitle() if title is None: title = '' # rewind before spaces @@ -810,13 +810,13 @@ def parseReference(self, s, refmap): self.pos = startpos return 0 - normlabel = normalizeReference(rawlabel) - if refmap.get(normlabel) == '': + normlabel = normalize_reference(rawlabel) + if normlabel == '': # label must contain non-whitespace characters self.pos = startpos return 0 - if refmap.get(normlabel) is None: + if not refmap.get(normlabel): refmap[normlabel] = { 'destination': dest, 'title': title diff --git a/commonmark/node.py b/commonmark/node.py index 39e26b5..4c0ed40 100644 --- a/commonmark/node.py +++ b/commonmark/node.py @@ -78,6 +78,7 @@ def __init__(self, node_type, sourcepos): self.nxt = None self.sourcepos = sourcepos self.last_line_blank = False + self.last_line_checked = False self.is_open = True self.string_content = '' self.literal = None diff --git a/commonmark/normalize_reference.py b/commonmark/normalize_reference.py new file mode 100644 index 0000000..d68a3b1 --- /dev/null +++ b/commonmark/normalize_reference.py @@ -0,0 +1,165 @@ +"""Case-folding and whitespace normalization""" +# Unicode Case Folding table has been derived from the following work: +# +# 
CaseFolding-12.0.0.txt +# Date: 2019-01-22, 08:18:22 GMT +# (c) 2019 Unicode(R) Inc. +# Unicode and the Unicode Logo are registered trademarks +# of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ + +import re +import sys +from builtins import str, chr + +__all__ = ["normalize_reference"] + +if sys.version_info < (3,) and sys.maxunicode <= 0xffff: + # shim for Python 2.x UCS2 build + _unichr = chr + + def chr(cdp): + if 0x10000 <= cdp < 0x110000: + cdp -= 0x10000 + return (_unichr(0xd800 | (cdp >> 10)) + + _unichr(0xdc00 | (cdp & 0x3ff))) + return _unichr(cdp) + + +def _parse_table(tbl): + xlat = {} + cur_i, cur_j = -1, 0 + for entry in tbl.split(';'): + arr = entry.split(',') + info = [int(x, 36) if x else 0 for x in arr[0].split(':')] + arr = [int(x, 36) for x in arr[1:]] + assert not any(x in xlat for x in arr) + sfx = ''.join(map(chr, arr)) + streak, stride = 0, 1 + if len(info) == 2: + fdt, delta = info + elif len(info) == 3: + fdt, streak, delta = info + else: + fdt, streak, delta, stride = info + assert streak >= 0 and stride >= 1 + cur_i += fdt + 1 + cur_j -= delta + assert cur_j != 0 + i = cur_i + last = cur_i + streak + while i <= last: + # uniqueness and idempotency + assert i not in xlat and i + cur_j not in xlat + assert i not in arr + xlat[i] = chr(i + cur_j) + sfx + i += stride + return xlat + + +XLAT = _parse_table( + # ===== Start of Unicode Case Folding table ===== + '1t:p:-w;37:-kn;a:m:kn;n:6:;6:3w,37;w:1a:-31:2;1b:5k,lj;1:4:-5k:2;6:e::' + '2;f:-aa,32;:18:aa:2;19:3e;:4:-3e:2;5:7h;1:-da;:2:5t:2;3:-5p;:5p;1:1:-5' + 'o;1:5o;2:-26;:-3f;:-1;:5m;1:-5o;:-2;1:-4;:2;:5s;3:-5u;:-2;1:-1;:4:5x:2' + ';5:-61;:61;1:-61;2:61;1:-61;:61;1:1:-60;1:2:60:2;3:-62;:4:62:4;b:-1;:1' + ';1:-1;:1;1:-1;:g:1:2;i:g::2;h:av,lo;:-aw;:2:1:2;3:2q;:-15;:12:-1l:2;13' + 
':3n;1:g:-3n:2;n:-8bu;:8bu;1:4k;:-8gb;2:8br;1:5g;:-7c;:-2;:8:1y:2;72:-3' + '7;16:2:37:2;5:;8:-37;6:26;1:2:1;3:-r;1:1:1;1:m,lk,ld;:g:9;h:8:;c:b,lk,' + 'ld;h:k;c:-7;:12;:-5;3:-a;:7;1:m:-n:2;n:1j;:-6;2:c;:4;1:-1t;1:8;:-8;2:2' + ':3n;2:f:-5u;f:v:1c;27:w:v:2;15:1g::2;1h:-e;:c:e:2;e:2m::2;2o:11:-1b;2d' + ':2a,136;26w:11:-5mq;12:6::6;mo:5:5m0;1on:4sm;:-1;:-9;:1:-2;1:1;:-7;:-o' + ';:-vzb;7:16:tj7;18:2:;8y:44:-2bl:2;45:5yn,mp;:-b,lk;:-2,lm;:-1,lm;:p,j' + 'i;:-5xb;2:5wx,37;1:2m:-5yk:2;2v:7:9;f:5:;f:7:;f:7:;f:5:;7:5fn,lv;1:2,l' + 'v,lc;1:2,lv,ld;1:2,lv,n6;2:6:-5ft:2;e:7:;n:7:3c,qh;7:7:8,qh;7:7:-o,qh;' + '7:7:8,qh;7:7:-1k,qh;7:7:8,qh;9:-6,qh;:5hc,qh;:6,qh;1:-3,n6;:1,n6,qh;:1' + ':-5j2;1:1:1u;1:5hd,qh;1:-6;3:-5h3,qh;:5ha,qh;:a,qh;1:-7,n6;:1,n6,qh;:3' + ':-5h6;3:5hb,qh;5:4,lk,lc;:1,lk,ld;2:3,n6;:1,lk,n6;:1:-5jq;1:1:2k;7:5h5' + ',lk,lc;:1,lk,ld;:5,lv;1:-2,n6;:1,lk,n6;:1:-5ju;1:1:2w;1:-2x;5:33,qh;:5' + 'h0,qh;:-4,qh;1:7,n6;:1,n6,qh;:1:-5gu;1:1:-2;1:5h1,qh;89:8a;3:o2;:-3d;6' + ':-6ea;19:f:c;y:f;mq:p:-p;1ft:1a:-m;2n:1b;1:8ag;:-5ch;:5c1;2:4:-8a0:2;5' + ':8bh;:-v;:y;:-1;1:3:-8bj:3;b:1:8cg;1:2q:-8cg:2;2y:2::2;6:nym::nym;nyn:' + '16::2;1p:q::2;4h:c::2;f:1o::2;1y:2::2;3:r9h;:8:-r9h:2;c:;1:wmh;2:2:-wm' + 'h:2;5:i::2;j:wn9;:b;:-4;:-a;:3;1:-1e;:o;:-l;:-xbp;:a:pr:2;d:;1:1d;:wlv' + ';:-5cb;q1:27:2oo;fpr:jii,2u;:1,2x;:1,30;:1,2u,2x;:1,2u,30;:-c,38;:1,38' + ';c:-z8,12u;:1,12d;:1,12j;:-9,12u;:b,12l;sp:p:-1cjn;ym:13:-8;4v:z:;1jj:' + '1e:-o;2e7:v:w;gwv:v:;o8v:x:-2' + # ===== End of Unicode Case Folding table ===== +) + + +def _check_native(tbl): + """ + Determine if Python's own native implementation + subsumes the supplied case folding table + """ + try: + for i in tbl: + stv = chr(i) + if stv.casefold() == stv: + return False + except AttributeError: + return False + return True + + +# Hoist version check out of function for performance +SPACE_RE = re.compile(r'[ \t\r\n]+') +if _check_native(XLAT): + def normalize_reference(string): + """ + Normalize reference label: collapse internal whitespace 
+ to single space, remove leading/trailing whitespace, case fold. + """ + return SPACE_RE.sub(' ', string[1:-1].strip()).casefold() +elif sys.version_info >= (3,) or sys.maxunicode > 0xffff: + def normalize_reference(string): + """ + Normalize reference label: collapse internal whitespace + to single space, remove leading/trailing whitespace, case fold. + """ + return SPACE_RE.sub(' ', string[1:-1].strip()).translate(XLAT) +else: + def _get_smp_regex(): + xls = sorted(x - 0x10000 for x in XLAT if x >= 0x10000) + xls.append(-1) + fmt, (dsh, opn, pip, cse) = str('\\u%04x'), str('-[|]') + rga, srk, erk = [str(r'[ \t\r\n]+')], 0, -2 + for k in xls: + new_hir = (erk ^ k) >> 10 != 0 + if new_hir or erk + 1 != k: + if erk >= 0 and srk != erk: + if srk + 1 != erk: + rga.append(dsh) + rga.append(fmt % (0xdc00 + (erk & 0x3ff))) + if new_hir: + if erk >= 0: + rga.append(cse) + if k < 0: + break + rga.append(pip) + rga.append(fmt % (0xd800 + (k >> 10))) + rga.append(opn) + srk = k + rga.append(fmt % (0xdc00 + (srk & 0x3ff))) + erk = k + return re.compile(str().join(rga)) + + def _subst_handler(matchobj): + src = matchobj.group(0) + hiv = ord(src[0]) + if hiv < 0xd800: + return ' ' + return XLAT[0x10000 + ((hiv & 0x3ff) << 10) | (ord(src[1]) & 0x3ff)] + + SMP_RE = _get_smp_regex() + + def normalize_reference(string): + """ + Normalize reference label: collapse internal whitespace + to single space, remove leading/trailing whitespace, case fold. 
+ """ + return SMP_RE.sub(_subst_handler, string[1:-1].strip()).translate(XLAT) diff --git a/commonmark/render/html.py b/commonmark/render/html.py index 66612f7..b4ea345 100644 --- a/commonmark/render/html.py +++ b/commonmark/render/html.py @@ -29,8 +29,8 @@ def __init__(self, options={}): self.last_out = '\n' self.options = options - def escape(self, text, preserve_entities): - return escape_xml(text, preserve_entities) + def escape(self, text): + return escape_xml(text) def tag(self, name, attrs=None, selfclosing=None): """Helper function to produce an HTML tag.""" @@ -65,10 +65,10 @@ def link(self, node, entering): if entering: if not (self.options.get('safe') and potentially_unsafe(node.destination)): - attrs.append(['href', self.escape(node.destination, True)]) + attrs.append(['href', self.escape(node.destination)]) if node.title: - attrs.append(['title', self.escape(node.title, True)]) + attrs.append(['title', self.escape(node.title)]) self.tag('a', attrs) else: @@ -82,14 +82,14 @@ def image(self, node, entering): self.lit('')
                 else:
                     self.lit('<img src=') def emph(self, node, entering): @@ -132,7 +132,7 @@ def code_block(self, node, entering): attrs = self.attrs(node) if len(info_words) > 0 and len(info_words[0]) > 0: attrs.append(['class', 'language-' + - self.escape(info_words[0], True)]) + self.escape(info_words[0])]) self.cr() self.tag('pre') @@ -214,7 +214,7 @@ def custom_block(self, node, entering): # Helper methods # def out(self, s): - self.lit(self.escape(s, False)) + self.lit(self.escape(s)) def attrs(self, node): att = [] diff --git a/commonmark/tests/unit_tests.py b/commonmark/tests/unit_tests.py index aebcfbb..e9efef9 100644 --- a/commonmark/tests/unit_tests.py +++ b/commonmark/tests/unit_tests.py @@ -25,7 +25,6 @@ def text(): from commonmark.render.html import HtmlRenderer from commonmark.inlines import InlineParser from commonmark.node import NodeWalker, Node -from commonmark.utils import to_camel_case class TestCommonmark(unittest.TestCase): @@ -100,6 +99,22 @@ def test_smart_dashes(self): html = renderer.render(ast) self.assertEqual(html, expected_html) + def test_regex_vulnerability_link_label(self): + i = 200 + while i <= 2000: + s = commonmark.commonmark('[' + ('\\' * i) + '\n') + self.assertEqual(s, '

' + '[' + ('\\' * (i // 2)) + '

\n', + '[\\\\... %d deep' % (i,)) + i *= 10 + + def test_regex_vulnerability_link_destination(self): + i = 200 + while i <= 2000: + s = commonmark.commonmark(('[](' * i) + '\n') + self.assertEqual(s, '

' + ('[](' * i) + '

\n', + '[]( %d deep' % (i,)) + i *= 10 + class TestHtmlRenderer(unittest.TestCase): def test_init(self): @@ -138,16 +153,5 @@ def test_text(self, s): self.parser.parse(s) -class TestUtils(unittest.TestCase): - def test_to_camel_case(self): - self.assertEqual(to_camel_case('snake_case'), 'SnakeCase') - self.assertEqual(to_camel_case(''), '') - self.assertEqual(to_camel_case('word'), 'Word') - - @given(text()) - def test_random_text(self, s): - to_camel_case(s) - - if __name__ == '__main__': unittest.main() diff --git a/commonmark/utils.py b/commonmark/utils.py deleted file mode 100644 index 7ea2584..0000000 --- a/commonmark/utils.py +++ /dev/null @@ -1,3 +0,0 @@ -def to_camel_case(snake_str): - components = snake_str.split('_') - return ''.join(x.title() for x in components) diff --git a/spec.txt b/spec.txt index 9fd5841..3913de4 100644 --- a/spec.txt +++ b/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.28 -date: '2017-08-01' +version: 0.29 +date: '2019-04-06' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -248,7 +248,7 @@ satisfactory replacement for a spec. Because there is no unambiguous spec, implementations have diverged considerably. As a result, users are often surprised to find that -a document that renders one way on one system (say, a github wiki) +a document that renders one way on one system (say, a GitHub wiki) renders differently on another (say, converting to docbook using pandoc). To make matters worse, because nothing in Markdown counts as a "syntax error," the divergence often isn't discovered right away. @@ -328,8 +328,10 @@ that is not a [whitespace character]. An [ASCII punctuation character](@) is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, -`*`, `+`, `,`, `-`, `.`, `/`, `:`, `;`, `<`, `=`, `>`, `?`, `@`, -`[`, `\`, `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, or `~`. 
+`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). A [punctuation character](@) is an [ASCII punctuation character] or anything in @@ -514,8 +516,8 @@ one block element does not affect the inline parsing of any other. ## Container blocks and leaf blocks We can divide blocks into two types: -[container block](@)s, -which can contain other blocks, and [leaf block](@)s, +[container blocks](@), +which can contain other blocks, and [leaf blocks](@), which cannot. # Leaf blocks @@ -527,7 +529,7 @@ Markdown document. A line consisting of 0-3 spaces of indentation, followed by a sequence of three or more matching `-`, `_`, or `*` characters, each followed -optionally by any number of spaces, forms a +optionally by any number of spaces or tabs, forms a [thematic break](@). ```````````````````````````````` example @@ -825,7 +827,7 @@ Contents are parsed as inlines: ```````````````````````````````` -Leading and trailing blanks are ignored in parsing inline content: +Leading and trailing [whitespace] is ignored in parsing inline content: ```````````````````````````````` example # foo @@ -1024,6 +1026,20 @@ baz* baz ```````````````````````````````` +The contents are the result of parsing the headings's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +[whitespace]. + +```````````````````````````````` example + Foo *bar +baz*→ +==== +. +

Foo bar +baz

+```````````````````````````````` + The underlining can be any length: @@ -1584,8 +1600,8 @@ begins with a code fence, indented no more than three spaces. The line with the opening code fence may optionally contain some text following the code fence; this is trimmed of leading and trailing -spaces and called the [info string](@). -The [info string] may not contain any backtick +whitespace and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick characters. (The reason for this restriction is that otherwise some inline code would be incorrectly interpreted as the beginning of a fenced code block.) @@ -1870,7 +1886,7 @@ Code fences (opening and closing) cannot contain internal spaces: ``` ``` aaa . -

+

aaa

```````````````````````````````` @@ -1922,9 +1938,11 @@ bar An [info string] can be provided after the opening code fence. -Opening and closing spaces will be stripped, and the first word, prefixed -with `language-`, is used as the value for the `class` attribute of the -`code` element within the enclosing `pre` element. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. ```````````````````````````````` example ```ruby @@ -1973,6 +1991,18 @@ foo

```````````````````````````````` +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +
foo
+
+```````````````````````````````` + + Closing code fences cannot have [info strings]: ```````````````````````````````` example @@ -1991,14 +2021,15 @@ Closing code fences cannot have [info strings]: An [HTML block](@) is a group of lines that is treated as raw HTML (and will not be escaped in HTML output). -There are seven kinds of [HTML block], which can be defined -by their start and end conditions. The block begins with a line that -meets a [start condition](@) (after up to three spaces -optional indentation). It ends with the first subsequent line that -meets a matching [end condition](@), or the last line of -the document or other [container block]), if no line is encountered that meets the -[end condition]. If the first line meets both the [start condition] -and the [end condition], the block will contain just that line. +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three spaces optional indentation). +It ends with the first subsequent line that meets a matching [end +condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. 1. **Start condition:** line begins with the string ``, or @@ -2037,16 +2068,17 @@ the string `/>`.\ **End condition:** line is followed by a [blank line]. 7. **Start condition:** line begins with a complete [open tag] -or [closing tag] (with any [tag name] other than `script`, -`style`, or `pre`) followed only by [whitespace] -or the end of the line.\ +(with any [tag name] other than `script`, +`style`, or `pre`) or a complete [closing tag], +followed only by [whitespace] or the end of the line.\ **End condition:** line is followed by a [blank line]. 
HTML blocks continue until they are closed by their appropriate -[end condition], or the last line of the document or other [container block]. -This means any HTML **within an HTML block** that might otherwise be recognised -as a start condition will be ignored by the parser and passed through as-is, -without changing the parser's state. +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. For instance, `
` within a HTML block started by `` will not affect
 the parser state; as the HTML block was started in by start condition 6, it
@@ -2069,7 +2101,7 @@ _world_.
 
```````````````````````````````` -In this case, the HTML block is terminated by the newline — the `**hello**` +In this case, the HTML block is terminated by the newline — the `**Hello**` text remains verbatim — and regular parsing resumes, with a paragraph, emphasised `world` and inline and block HTML following. @@ -2612,7 +2644,8 @@ bar However, a following blank line is needed, except at the end of -a document, and except for blocks of types 1--5, above: +a document, and except for blocks of types 1--5, [above][HTML +block]: ```````````````````````````````` example
@@ -2758,8 +2791,8 @@ an indented code block: Fortunately, blank lines are usually not necessary and can be deleted. The exception is inside `
` tags, but as described
-above, raw HTML blocks starting with `
` *can* contain blank
-lines.
+[above][HTML blocks], raw HTML blocks starting with `
`
+*can* contain blank lines.
 
 ## Link reference definitions
 
@@ -2811,7 +2844,7 @@ them.
 
 ```````````````````````````````` example
 [Foo bar]:
-
+
 'title'
 
 [Foo bar]
@@ -2877,6 +2910,29 @@ The link destination may not be omitted:
 

[foo]

```````````````````````````````` + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +

foo

+```````````````````````````````` + +The title must be separated from the link destination by +whitespace: + +```````````````````````````````` example +[foo]: (baz) + +[foo] +. +

[foo]: (baz)

+

[foo]

+```````````````````````````````` + Both title and destination can contain backslash escapes and literal backslashes: @@ -3034,6 +3090,25 @@ and thematic breaks, and it need not be followed by a blank line. ```````````````````````````````` +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +

bar

+

foo

+```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] +. +

=== +foo

+```````````````````````````````` + Several [link reference definitions] can occur one after another, without intervening blank lines. @@ -3070,6 +3145,17 @@ are defined: ```````````````````````````````` +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + ## Paragraphs @@ -3207,7 +3293,7 @@ aaa # Container blocks -A [container block] is a block that has other +A [container block](#container-blocks) is a block that has other blocks as its contents. There are two basic kinds of container blocks: [block quotes] and [list items]. [Lists] are meta-containers for [list items]. @@ -3669,9 +3755,8 @@ in some browsers.) The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of - blocks *Bs* starting with a [non-whitespace character] and not separated - from each other by more than one blank line, and *M* is a list - marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result + blocks *Bs* starting with a [non-whitespace character], and *M* is a + list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result of prepending *M* and the following spaces to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. The type of the list item @@ -3981,8 +4066,7 @@ A start number may not be negative: 2. 
**Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code - block and not separated from each other by more than one blank line, - and *M* is a list marker of width *W* followed by + block, and *M* is a list marker of width *W* followed by one space, then the result of prepending *M* and the following space to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -4458,9 +4542,10 @@ continued here.

6. **That's all.** Nothing that is not counted as a list item by rules #1--5 counts as a [list item](#list-items). -The rules for sublists follow from the general rules above. A sublist -must be indented the same number of spaces a paragraph would need to be -in order to be included in the list item. +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces a paragraph would need to be in order to be included +in the list item. So, in this case we need two spaces indent: @@ -5049,11 +5134,9 @@ item: - b - c - d - - e - - f - - g - - h -- i + - e + - f +- g .
  • a
  • @@ -5063,8 +5146,6 @@ item:
  • e
  • f
  • g
  • -
  • h
  • -
  • i
```````````````````````````````` @@ -5074,7 +5155,7 @@ item: 2. b - 3. c + 3. c .
  1. @@ -5089,6 +5170,49 @@ item:
```````````````````````````````` +Note, however, that list items may not be indented more than +three spaces. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +
    +
  • a
  • +
  • b
  • +
  • c
  • +
  • d +- e
  • +
+```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +
    +
  1. +

    a

    +
  2. +
  3. +

    b

    +
  4. +
+
3. c
+
+```````````````````````````````` + This is a loose list, because there is a blank line between two of the list items: @@ -5378,10 +5502,10 @@ Thus, for example, in

hilo`

```````````````````````````````` - `hi` is parsed as code, leaving the backtick at the end as a literal backtick. + ## Backslash escapes Any ASCII punctuation character may be backslash-escaped: @@ -5415,6 +5539,7 @@ not have their usual Markdown meanings: \* not a list \# not a heading \[foo]: /url "not a reference" +\ö not a character entity .

*not emphasized* <br/> not a tag @@ -5423,7 +5548,8 @@ not have their usual Markdown meanings: 1. not a list * not a list # not a heading -[foo]: /url "not a reference"

+[foo]: /url "not a reference" +&ouml; not a character entity

```````````````````````````````` @@ -5521,13 +5647,23 @@ foo ## Entity and numeric character references -All valid HTML entity references and numeric character -references, except those occuring in code blocks and code spans, -are recognized as such and treated as equivalent to the -corresponding Unicode characters. Conforming CommonMark parsers -need not store information about whether a particular character -was represented in the source using a Unicode character or -an entity reference. +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: + +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `&#42;` can be used + in place of a literal `*` character, `&#42;` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. [Entity references](@) consist of `&` + any of the valid HTML5 entity names + `;`. The @@ -5548,22 +5684,22 @@ references and their corresponding code points. [Decimal numeric character references](@) -consist of `&#` + a string of 1--8 arabic digits + `;`. A +consist of `&#` + a string of 1--7 arabic digits + `;`. A numeric character reference is parsed as the corresponding Unicode character. Invalid Unicode code points will be replaced by the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, the code point `U+0000` will also be replaced by `U+FFFD`. ```````````````````````````````` example -&#35; &#1234; &#992; &#98765432; &#0; +&#35; &#1234; &#992; &#0; . -

# Ӓ Ϡ � �

+

# Ӓ Ϡ �

```````````````````````````````` [Hexadecimal numeric character references](@) consist of `&#` + -either `X` or `x` + a string of 1-8 hexadecimal digits + `;`. +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. They too are parsed as the corresponding Unicode character (this time specified with a hexadecimal numeral instead of decimal). @@ -5578,9 +5714,13 @@ Here are some nonentities: ```````````````````````````````` example   &x; &#; &#x; +� +&#abcdef0; &ThisIsNotDefined; &hi?; .

&nbsp &x; &#; &#x; +&#987654321; +&#abcdef0; &ThisIsNotDefined; &hi?;

```````````````````````````````` @@ -5661,6 +5801,51 @@ text in code spans and code blocks: ```````````````````````````````` +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +*foo* +*foo* +. +

*foo* +foo

+```````````````````````````````` + +```````````````````````````````` example +* foo + +* foo +. +

* foo

+
    +
  • foo
  • +
+```````````````````````````````` + +```````````````````````````````` example +foo bar +. +

foo + +bar

+```````````````````````````````` + +```````````````````````````````` example + foo +. +

→foo

+```````````````````````````````` + + +```````````````````````````````` example +[a](url "tit") +. +

[a](url "tit")

+```````````````````````````````` + + ## Code spans A [backtick string](@) @@ -5669,9 +5854,16 @@ preceded nor followed by a backtick. A [code span](@) begins with a backtick string and ends with a backtick string of equal length. The contents of the code span are -the characters between the two backtick strings, with leading and -trailing spaces and [line endings] removed, and -[whitespace] collapsed to single spaces. +the characters between the two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. This is a simple code span: @@ -5683,10 +5875,11 @@ This is a simple code span: Here two backticks are used, because the code contains a backtick. -This example also illustrates stripping of leading and trailing spaces: +This example also illustrates stripping of a single leading and +trailing space: ```````````````````````````````` example -`` foo ` bar `` +`` foo ` bar `` .

foo ` bar

```````````````````````````````` @@ -5701,58 +5894,79 @@ spaces:

``

```````````````````````````````` +Note that only *one* space is stripped: -[Line endings] are treated like spaces: +```````````````````````````````` example +` `` ` +. +

``

+```````````````````````````````` + +The stripping only happens if the space is on both +sides of the string: ```````````````````````````````` example -`` -foo -`` +` a` . -

foo

+

a

```````````````````````````````` +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +

 b 

+```````````````````````````````` -Interior spaces and [line endings] are collapsed into -single spaces, just as they would be by a browser: +No stripping occurs if the code span contains only spaces: ```````````````````````````````` example -`foo bar - baz` +` ` +` ` . -

foo bar baz

+

  +

```````````````````````````````` -Not all [Unicode whitespace] (for instance, non-breaking space) is -collapsed, however: +[Line endings] are treated like spaces: ```````````````````````````````` example -`a  b` +`` +foo +bar +baz +`` . -

a  b

+

foo bar baz

```````````````````````````````` +```````````````````````````````` example +`` +foo +`` +. +

foo

+```````````````````````````````` -Q: Why not just leave the spaces, since browsers will collapse them -anyway? A: Because we might be targeting a non-HTML format, and we -shouldn't rely on HTML-specific rendering assumptions. -(Existing implementations differ in their treatment of internal -spaces and [line endings]. Some, including `Markdown.pl` and -`showdown`, convert an internal [line ending] into a -`
` tag. But this makes things difficult for those who like to -hard-wrap their paragraphs, since a line break in the midst of a code -span will cause an unintended line break in the output. Others just -leave internal spaces as they are, which is fine if only HTML is being -targeted.) +Interior spaces are not collapsed: ```````````````````````````````` example -`foo `` bar` +`foo bar +baz` . -

foo `` bar

+

foo bar baz

```````````````````````````````` +Note that browsers will typically collapse consecutive spaces +when rendering `<code>` elements, so it is recommended that +the following CSS be used: + + code{white-space: pre-wrap;} + Note that backslash escapes do not work in code spans. All backslashes are treated literally: @@ -5768,6 +5982,19 @@ Backslash escapes are never needed, because one can always choose a string of *n* backtick characters as delimiters, where the code does not contain any strings of exactly *n* backtick characters. +```````````````````````````````` example +``foo`bar`` +. +

foo`bar

+```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +

foo `` bar

+```````````````````````````````` + + Code span backticks have higher precedence than any other inline constructs except HTML tags and autolinks. Thus, for example, this is not parsed as emphasized text, since the second `*` is part of a code @@ -5905,15 +6132,17 @@ of one or more `_` characters that is not preceded or followed by a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is -a [delimiter run] that is (a) not followed by [Unicode whitespace], -and (b) not followed by a [punctuation character], or +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [punctuation character], or +(2b) followed by a [punctuation character] and preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is -a [delimiter run] that is (a) not preceded by [Unicode whitespace], -and (b) not preceded by a [punctuation character], or +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [punctuation character], or +(2b) preceded by a [punctuation character] and followed by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. @@ -6005,7 +6234,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. If one of the delimiters can both open and close emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing delimiters - must not be a multiple of 3. + must not be a multiple of 3 unless both lengths are + multiples of 3. 10. Strong emphasis begins with a delimiter that [can open strong emphasis] and ends with a delimiter that @@ -6015,7 +6245,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. 
If one of the delimiters can both open and close strong emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3. + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. 11. A literal `*` character cannot occur at the beginning or end of `*`-delimited emphasis or `**`-delimited strong emphasis, unless it @@ -6634,7 +6865,19 @@ is precluded by the condition that a delimiter that can both open and close (like the `*` after `foo`) cannot form emphasis if the sum of the lengths of the delimiter runs containing the opening and -closing delimiters is a multiple of 3. +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. + + +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* +. +

foo**bar

+```````````````````````````````` + The same condition ensures that the following cases are all strong emphasis nested inside @@ -6663,6 +6906,23 @@ omitted: ```````````````````````````````` +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +

foobarbaz

+```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz +. +

foobar***baz

+```````````````````````````````` + + Indefinite levels of nesting are possible: ```````````````````````````````` example @@ -7198,15 +7458,16 @@ following rules apply: A [link destination](@) consists of either - a sequence of zero or more characters between an opening `<` and a - closing `>` that contains no spaces, line breaks, or unescaped + closing `>` that contains no line breaks or unescaped `<` or `>` characters, or -- a nonempty sequence of characters that does not include - ASCII space or control characters, and includes parentheses - only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses. (Implementations - may impose limits on parentheses nesting to avoid performance - issues, but at least three levels of nesting should be supported.) +- a nonempty sequence of characters that does not start with + `<`, does not include ASCII space or control characters, and + includes parentheses only if (a) they are backslash-escaped or + (b) they are part of a balanced pair of unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) A [link title](@) consists of either @@ -7219,7 +7480,8 @@ A [link title](@) consists of either backslash-escaped, or - a sequence of zero or more characters between matching parentheses - (`(...)`), including a `)` character only if it is backslash-escaped. + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. Although [link titles] may span multiple lines, they may not contain a [blank line]. @@ -7269,9 +7531,8 @@ Both the title and the destination may be omitted:

link

```````````````````````````````` - -The destination cannot contain spaces or line breaks, -even if enclosed in pointy brackets: +The destination can only contain spaces if it is +enclosed in pointy brackets: ```````````````````````````````` example [link](/my uri) @@ -7279,13 +7540,14 @@ even if enclosed in pointy brackets:

[link](/my uri)

```````````````````````````````` - ```````````````````````````````` example [link](
) . -

[link](</my uri>)

+

link

```````````````````````````````` +The destination cannot contain line breaks, +even if enclosed in pointy brackets: ```````````````````````````````` example [link](foo @@ -7295,7 +7557,6 @@ bar) bar)

```````````````````````````````` - ```````````````````````````````` example [link]() @@ -7304,6 +7565,36 @@ bar>) bar>)

```````````````````````````````` +The destination can contain `)` if it is enclosed +in pointy brackets: + +```````````````````````````````` example +[a]() +. +

a

+```````````````````````````````` + +Pointy brackets that enclose links must be unescaped: + +```````````````````````````````` example +[link]() +. +

[link](<foo>)

+```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a]( +[a](c) +. +

[a](<b)c +[a](<b)c> +[a](c)

+```````````````````````````````` + Parentheses inside the link destination may be escaped: ```````````````````````````````` example @@ -8411,7 +8702,7 @@ If you want a link after a literal `!`, backslash-escape the as the link label. A [URI autolink](@) consists of `<`, followed by an -[absolute URI] not containing `<`, followed by `>`. It is parsed as +[absolute URI] followed by `>`. It is parsed as a link to the URI, with the URI as the link's label. An [absolute URI](@), @@ -8624,7 +8915,7 @@ a [single-quoted attribute value], or a [double-quoted attribute value]. An [unquoted attribute value](@) is a nonempty string of characters not -including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. +including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. A [single-quoted attribute value](@) consists of `'`, zero or more @@ -8745,9 +9036,13 @@ Illegal [whitespace]: ```````````````````````````````` example < a>< foo> + .

< a>< -foo><bar/ >

+foo><bar/ > +<foo bar=baz +bim!bop />

```````````````````````````````` @@ -8944,10 +9239,10 @@ bar

Line breaks do not occur inside code spans ```````````````````````````````` example -`code +`code span` . -

code span

+

code span

```````````````````````````````` @@ -9365,7 +9660,8 @@ just above `stack_bottom` (or the first element if `stack_bottom` is NULL). We keep track of the `openers_bottom` for each delimiter -type (`*`, `_`). Initialize this to `stack_bottom`. +type (`*`, `_`) and each length of the closing delimiter run +(modulo 3). Initialize this to `stack_bottom`. Then we repeat the following until we run out of potential closers: @@ -9397,7 +9693,7 @@ closers: of the delimiter stack. If the closing node is removed, reset `current_position` to the next element in the stack. -- If none in found: +- If none is found: + Set `openers_bottom` to the element before `current_position`. (We know that there are no openers for this kind of closer up to and