diff --git a/commonmark/blocks.py b/commonmark/blocks.py
index fa236a8..aeec21d 100644
--- a/commonmark/blocks.py
+++ b/commonmark/blocks.py
@@ -1,12 +1,10 @@
from __future__ import absolute_import, unicode_literals
import re
-from importlib import import_module
from commonmark import common
from commonmark.common import unescape_string
from commonmark.inlines import InlineParser
from commonmark.node import Node
-from commonmark.utils import to_camel_case
CODE_INDENT = 4
@@ -21,7 +19,7 @@
r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|'
r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|'
r'fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|'
- r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|meta|'
+ r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|'
r'nav|noframes|ol|optgroup|option|p|param|section|source|title|'
r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'
r'(?:\s|[/]?[>]|$)',
@@ -45,7 +43,7 @@
reBulletListMarker = re.compile(r'^[*+-]')
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])')
reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)')
-reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}(?!.*~)')
+reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}')
reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)')
reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$')
reLineEnding = re.compile(r'\r\n|\n|\r')
@@ -57,7 +55,7 @@ def is_blank(s):
def is_space_or_tab(s):
- return s == ' ' or s == '\t'
+ return s in (' ', '\t')
def peek(ln, pos):
@@ -73,9 +71,12 @@ def ends_with_blank_line(block):
while block:
if block.last_line_blank:
return True
- if (block.t == 'list' or block.t == 'item'):
+ if not block.last_line_checked and \
+ block.t in ('list', 'item'):
+ block.last_line_checked = True
block = block.last_child
else:
+ block.last_line_checked = True
break
return False
@@ -94,6 +95,8 @@ def parse_list_marker(parser, container):
'padding': None,
'marker_offset': parser.indent,
}
+ if parser.indent >= 4:
+ return None
m = re.search(reBulletListMarker, rest)
m2 = re.search(reOrderedListMarker, rest)
if m:
@@ -515,15 +518,25 @@ def setext_heading(parser, container=None):
parser.current_line[parser.next_nonspace:])
if m:
parser.close_unmatched_blocks()
- heading = Node('heading', container.sourcepos)
- heading.level = 1 if m.group()[0] == '=' else 2
- heading.string_content = container.string_content
- container.insert_after(heading)
- container.unlink()
- parser.tip = heading
- parser.advance_offset(
- len(parser.current_line) - parser.offset, False)
- return 2
+ # resolve reference link definitions
+ while peek(container.string_content, 0) == '[':
+ pos = parser.inline_parser.parseReference(
+ container.string_content, parser.refmap)
+ if not pos:
+ break
+ container.string_content = container.string_content[pos:]
+ if container.string_content:
+ heading = Node('heading', container.sourcepos)
+ heading.level = 1 if m.group()[0] == '=' else 2
+ heading.string_content = container.string_content
+ container.insert_after(heading)
+ container.unlink()
+ parser.tip = heading
+ parser.advance_offset(
+ len(parser.current_line) - parser.offset, False)
+ return 2
+ else:
+ return 0
return 0
@@ -610,13 +623,8 @@ def add_child(self, tag, offset):
""" Add block of type tag as a child of the tip. If the tip can't
accept children, close and finalize it and try its parent,
and so on til we find a block that can accept children."""
- block_class = getattr(import_module('commonmark.blocks'),
- to_camel_case(self.tip.t))
- while not block_class.can_contain(tag):
+ while not self.blocks[self.tip.t].can_contain(tag):
self.finalize(self.tip, self.line_number - 1)
- block_class = getattr(
- import_module('commonmark.blocks'),
- to_camel_case(self.tip.t))
column_number = offset + 1
new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
@@ -725,15 +733,15 @@ def incorporate_line(self, ln):
# For each containing block, try to parse the associated line start.
# Bail out on failure: container will point to the last matching block.
# Set all_matched to false if not all containers match.
- last_child = container.last_child
- while last_child and last_child.is_open:
+ while True:
+ last_child = container.last_child
+ if not (last_child and last_child.is_open):
+ break
container = last_child
self.find_next_nonspace()
- block_class = getattr(
- import_module('commonmark.blocks'),
- to_camel_case(container.t))
- rv = block_class.continue_(self, container)
+
+ rv = self.blocks[container.t].continue_(self, container)
if rv == 0:
# we've matched, keep going
pass
@@ -745,21 +753,19 @@ def incorporate_line(self, ln):
self.last_line_length = len(ln)
return
else:
- raise ValueError('returned illegal value, must be 0, 1, or 2')
+ raise ValueError(
+ 'continue_ returned illegal value, must be 0, 1, or 2')
if not all_matched:
# back up to last matching block
container = container.parent
break
- last_child = container.last_child
-
self.all_closed = (container == self.oldtip)
self.last_matched_container = container
- block_class = getattr(import_module('commonmark.blocks'),
- to_camel_case(container.t))
- matched_leaf = container.t != 'paragraph' and block_class.accepts_lines
+ matched_leaf = container.t != 'paragraph' and \
+ self.blocks[container.t].accepts_lines
starts = self.block_starts
starts_len = len(starts.METHODS)
# Unless last matched container is a code block, try new container
@@ -824,9 +830,7 @@ def incorporate_line(self, ln):
cont.last_line_blank = last_line_blank
cont = cont.parent
- block_class = getattr(import_module('commonmark.blocks'),
- to_camel_case(t))
- if block_class.accepts_lines:
+ if self.blocks[t].accepts_lines:
self.add_line()
# if HtmlBlock, check for end condition
if t == 'html_block' and \
@@ -853,9 +857,8 @@ def finalize(self, block, line_number):
above = block.parent
block.is_open = False
block.sourcepos[1] = [line_number, self.last_line_length]
- block_class = getattr(import_module('commonmark.blocks'),
- to_camel_case(block.t))
- block_class.finalize(self, block)
+
+ self.blocks[block.t].finalize(self, block)
self.tip = above
@@ -897,3 +900,9 @@ def parse(self, my_input):
self.finalize(self.tip, length)
self.process_inlines(self.doc)
return self.doc
+
+
+CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))")
+Parser.blocks = dict(
+ (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls)
+ for cls in Block.__subclasses__())
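
For context, the per-line `import_module()` + `to_camel_case()` lookup is replaced by a registry built once at import time: `Parser.blocks` maps the snake_case form of each `Block` subclass name to the class itself, so dispatch becomes a plain dict access. A standalone sketch of the naming scheme (the classes below are stand-ins, not the real ones in `blocks.py`):

```python
# Standalone sketch (not part of the patch): how the Parser.blocks registry is keyed.
import re

CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))")


def to_snake_case(name):
    return CAMEL_RE.sub(r'\1_\2', name).lower()


class Block(object):
    pass


class BlockQuote(Block):
    pass


class HtmlBlock(Block):
    pass


blocks = dict(
    (to_snake_case(cls.__name__), cls) for cls in Block.__subclasses__())
print(sorted(blocks))  # ['block_quote', 'html_block']
```
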
diff --git a/commonmark/common.py b/commonmark/common.py
index 1327869..b15a8b6 100644
--- a/commonmark/common.py
+++ b/commonmark/common.py
@@ -19,7 +19,7 @@
from commonmark import entitytrans
HTMLunescape = entitytrans._unescape
-ENTITY = '&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});'
+ENTITY = '&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});'
TAGNAME = '[A-Za-z][A-Za-z0-9-]*'
ATTRIBUTENAME = '[a-zA-Z_:][a-zA-Z0-9:._-]*'
@@ -45,7 +45,6 @@
'\\\\' + ESCAPABLE + '|' + ENTITY, re.IGNORECASE)
XMLSPECIAL = '[&<>"]'
reXmlSpecial = re.compile(XMLSPECIAL)
-reXmlSpecialOrEntity = re.compile(ENTITY + '|' + XMLSPECIAL, re.IGNORECASE)
def unescape_char(s):
@@ -102,19 +101,13 @@ def replace_unsafe_char(s):
return UNSAFE_MAP.get(s, s)
-def escape_xml(s, preserve_entities):
+def escape_xml(s):
if s is None:
return ''
if re.search(reXmlSpecial, s):
- if preserve_entities:
- return re.sub(
- reXmlSpecialOrEntity,
- lambda m: replace_unsafe_char(m.group()),
- s)
- else:
- return re.sub(
- reXmlSpecial,
- lambda m: replace_unsafe_char(m.group()),
- s)
+ return re.sub(
+ reXmlSpecial,
+ lambda m: replace_unsafe_char(m.group()),
+ s)
else:
return s
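
The tightened `ENTITY` pattern now caps numeric character references at 6 hexadecimal or 7 decimal digits, in line with spec 0.29. A quick standalone check of those bounds (the pattern is re-declared locally here, purely for illustration):

```python
import re

ENTITY = '&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});'
entity_re = re.compile('^' + ENTITY, re.IGNORECASE)

print(bool(entity_re.match('&#x10FFFF;')))    # True:  6 hex digits still match
print(bool(entity_re.match('&#x01234567;')))  # False: 8 hex digits are rejected now
print(bool(entity_re.match('&#1234567;')))    # True:  7 decimal digits
print(bool(entity_re.match('&#12345678;')))   # False: 8 decimal digits
```
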
diff --git a/commonmark/inlines.py b/commonmark/inlines.py
index f2f66c7..88a84cf 100644
--- a/commonmark/inlines.py
+++ b/commonmark/inlines.py
@@ -5,6 +5,7 @@
from commonmark import common
from commonmark.common import normalize_uri, unescape_string
from commonmark.node import Node
+from commonmark.normalize_reference import normalize_reference
if sys.version_info >= (3, 0):
if sys.version_info >= (3, 4):
@@ -22,7 +23,7 @@
ESCAPED_CHAR = '\\\\' + common.ESCAPABLE
rePunctuation = re.compile(
- r'[!"#$%&\'()*+,\-./:;<=>?@\[\]^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB'
+ r'[!"#$%&\'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB'
r'\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3'
r'\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F'
r'\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E'
@@ -54,10 +55,8 @@
'|' +
'\'(' + ESCAPED_CHAR + '|[^\'\\x00])*\'' +
'|' +
- '\\((' + ESCAPED_CHAR + '|[^)\\x00])*\\))')
-reLinkDestinationBraces = re.compile(
- '^(?:[<](?:[^ <>\\t\\n\\\\\\x00]' + '|' + ESCAPED_CHAR + '|' +
- '\\\\)*[>])')
+ '\\((' + ESCAPED_CHAR + '|[^()\\x00])*\\))')
+reLinkDestinationBraces = re.compile(r'^(?:<(?:[^<>\n\\\x00]|\\.)*>)')
reEscapable = re.compile('^' + common.ESCAPABLE)
reEntityHere = re.compile('^' + common.ENTITY, re.IGNORECASE)
@@ -79,21 +78,11 @@
reFinalSpace = re.compile(r' *$')
reInitialSpace = re.compile(r'^ *')
reSpaceAtEndOfLine = re.compile(r'^ *(?:\n|$)')
-reLinkLabel = re.compile('^\\[(?:[^\\\\\\[\\]]|' + ESCAPED_CHAR +
- '|\\\\){0,1000}\\]')
+reLinkLabel = re.compile(r'^\[(?:[^\\\[\]]|\\.){0,1000}\]')
# Matches a string of non-special characters.
reMain = re.compile(r'^[^\n`\[\]\\!<&*_\'"]+', re.MULTILINE)
-def normalizeReference(s):
- """Normalize reference label.
-
- Collapse internal whitespace to single space, remove
- leading/trailing whitespace, case fold.
- """
- return re.sub(r'\s+', ' ', s.strip()).upper()
-
-
def text(s):
node = Node('text', None)
node.literal = s
@@ -175,12 +164,14 @@ def parseBackticks(self, block):
after_open_ticks = self.pos
matched = self.match(reTicks)
while matched is not None:
- if (matched == ticks):
+ if matched == ticks:
node = Node('code', None)
- c = self.subject[after_open_ticks:self.pos - len(ticks)]
- c = c.strip()
- c = re.sub(reWhitespace, ' ', c)
- node.literal = c
+ contents = self.subject[after_open_ticks:self.pos-len(ticks)] \
+ .replace('\n', ' ')
+ if contents.lstrip(' ') and contents[0] == contents[-1] == ' ':
+ node.literal = contents[1:-1]
+ else:
+ node.literal = contents
block.append_child(node)
return True
matched = self.match(reTicks)
@@ -394,8 +385,9 @@ def processEmphasis(self, stack_bottom):
opener != openers_bottom[closercc]):
odd_match = (closer.get('can_open') or
opener.get('can_close')) and \
- (opener.get('origdelims') +
- closer.get('origdelims')) % 3 == 0
+ closer['origdelims'] % 3 != 0 and \
+ (opener['origdelims'] +
+ closer['origdelims']) % 3 == 0
if opener.get('cc') == closercc and \
opener.get('can_open') and \
not odd_match:
@@ -502,12 +494,17 @@ def parseLinkDestination(self):
"""
res = self.match(reLinkDestinationBraces)
if res is None:
+ if self.peek() == '<':
+ return None
# TODO handrolled parser; res should be None or the string
savepos = self.pos
openparens = 0
- c = self.peek()
- while c is not None:
- if c == '\\':
+ while True:
+ c = self.peek()
+ if c is None:
+ break
+ if c == '\\' and re.search(
+ reEscapable, self.subject[self.pos+1:self.pos+2]):
self.pos += 1
if self.peek() is not None:
self.pos += 1
@@ -524,7 +521,8 @@ def parseLinkDestination(self):
break
else:
self.pos += 1
- c = self.peek()
+ if self.pos == savepos and c != ')':
+ return None
res = self.subject[savepos:self.pos]
return normalize_uri(unescape_string(res))
else:
@@ -539,7 +537,7 @@ def parseLinkLabel(self):
# Note: our regex will allow something of form [..\];
# we disallow it here rather than using lookahead in the regex:
m = self.match(reLinkLabel)
- if m is None or len(m) > 1001 or re.search(r'([^\\]\\\]$|\[\n\]$)', m):
+ if m is None or len(m) > 1001:
return 0
else:
return len(m)
@@ -647,7 +645,7 @@ def parseCloseBracket(self, block):
if reflabel:
# lookup rawlabel in refmap
- link = self.refmap.get(normalizeReference(reflabel))
+ link = self.refmap.get(normalize_reference(reflabel))
if link:
dest = link['destination']
title = link['title']
@@ -779,13 +777,15 @@ def parseReference(self, s, refmap):
self.spnl()
dest = self.parseLinkDestination()
- if (dest is None or len(dest) == 0):
+ if dest is None:
self.pos = startpos
return 0
beforetitle = self.pos
self.spnl()
- title = self.parseLinkTitle()
+ title = None
+ if self.pos != beforetitle:
+ title = self.parseLinkTitle()
if title is None:
title = ''
# rewind before spaces
@@ -810,13 +810,13 @@ def parseReference(self, s, refmap):
self.pos = startpos
return 0
- normlabel = normalizeReference(rawlabel)
- if refmap.get(normlabel) == '':
+ normlabel = normalize_reference(rawlabel)
+ if normlabel == '':
# label must contain non-whitespace characters
self.pos = startpos
return 0
- if refmap.get(normlabel) is None:
+ if not refmap.get(normlabel):
refmap[normlabel] = {
'destination': dest,
'title': title
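
One observable effect of the `parseReference` changes: labels are folded with `normalize_reference`, and a label that already exists in the refmap is not overwritten, so the first definition in the document wins. A small end-to-end check (the expected output is the CommonMark 0.29 behaviour, assuming the package is importable as `commonmark`):

```python
import commonmark

md = (
    '[foo]: /url1\n'
    '[FOO]: /url2\n'
    '\n'
    '[foo]\n'
)
print(commonmark.commonmark(md), end='')
# Expected: <p><a href="/url1">foo</a></p>
```
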
diff --git a/commonmark/node.py b/commonmark/node.py
index 39e26b5..4c0ed40 100644
--- a/commonmark/node.py
+++ b/commonmark/node.py
@@ -78,6 +78,7 @@ def __init__(self, node_type, sourcepos):
self.nxt = None
self.sourcepos = sourcepos
self.last_line_blank = False
+ self.last_line_checked = False
self.is_open = True
self.string_content = ''
self.literal = None
diff --git a/commonmark/normalize_reference.py b/commonmark/normalize_reference.py
new file mode 100644
index 0000000..d68a3b1
--- /dev/null
+++ b/commonmark/normalize_reference.py
@@ -0,0 +1,165 @@
+"""Case-folding and whitespace normalization"""
+# Unicode Case Folding table has been derived from the following work:
+#
+# CaseFolding-12.0.0.txt
+# Date: 2019-01-22, 08:18:22 GMT
+# (c) 2019 Unicode(R) Inc.
+# Unicode and the Unicode Logo are registered trademarks
+# of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see http://www.unicode.org/reports/tr44/
+
+import re
+import sys
+from builtins import str, chr
+
+__all__ = ["normalize_reference"]
+
+if sys.version_info < (3,) and sys.maxunicode <= 0xffff:
+ # shim for Python 2.x UCS2 build
+ _unichr = chr
+
+ def chr(cdp):
+ if 0x10000 <= cdp < 0x110000:
+ cdp -= 0x10000
+ return (_unichr(0xd800 | (cdp >> 10)) +
+ _unichr(0xdc00 | (cdp & 0x3ff)))
+ return _unichr(cdp)
+
+
+def _parse_table(tbl):
+ xlat = {}
+ cur_i, cur_j = -1, 0
+ for entry in tbl.split(';'):
+ arr = entry.split(',')
+ info = [int(x, 36) if x else 0 for x in arr[0].split(':')]
+ arr = [int(x, 36) for x in arr[1:]]
+ assert not any(x in xlat for x in arr)
+ sfx = ''.join(map(chr, arr))
+ streak, stride = 0, 1
+ if len(info) == 2:
+ fdt, delta = info
+ elif len(info) == 3:
+ fdt, streak, delta = info
+ else:
+ fdt, streak, delta, stride = info
+ assert streak >= 0 and stride >= 1
+ cur_i += fdt + 1
+ cur_j -= delta
+ assert cur_j != 0
+ i = cur_i
+ last = cur_i + streak
+ while i <= last:
+ # uniqueness and idempotency
+ assert i not in xlat and i + cur_j not in xlat
+ assert i not in arr
+ xlat[i] = chr(i + cur_j) + sfx
+ i += stride
+ return xlat
+
+
+XLAT = _parse_table(
+ # ===== Start of Unicode Case Folding table =====
+ '1t:p:-w;37:-kn;a:m:kn;n:6:;6:3w,37;w:1a:-31:2;1b:5k,lj;1:4:-5k:2;6:e::'
+ '2;f:-aa,32;:18:aa:2;19:3e;:4:-3e:2;5:7h;1:-da;:2:5t:2;3:-5p;:5p;1:1:-5'
+ 'o;1:5o;2:-26;:-3f;:-1;:5m;1:-5o;:-2;1:-4;:2;:5s;3:-5u;:-2;1:-1;:4:5x:2'
+ ';5:-61;:61;1:-61;2:61;1:-61;:61;1:1:-60;1:2:60:2;3:-62;:4:62:4;b:-1;:1'
+ ';1:-1;:1;1:-1;:g:1:2;i:g::2;h:av,lo;:-aw;:2:1:2;3:2q;:-15;:12:-1l:2;13'
+ ':3n;1:g:-3n:2;n:-8bu;:8bu;1:4k;:-8gb;2:8br;1:5g;:-7c;:-2;:8:1y:2;72:-3'
+ '7;16:2:37:2;5:;8:-37;6:26;1:2:1;3:-r;1:1:1;1:m,lk,ld;:g:9;h:8:;c:b,lk,'
+ 'ld;h:k;c:-7;:12;:-5;3:-a;:7;1:m:-n:2;n:1j;:-6;2:c;:4;1:-1t;1:8;:-8;2:2'
+ ':3n;2:f:-5u;f:v:1c;27:w:v:2;15:1g::2;1h:-e;:c:e:2;e:2m::2;2o:11:-1b;2d'
+ ':2a,136;26w:11:-5mq;12:6::6;mo:5:5m0;1on:4sm;:-1;:-9;:1:-2;1:1;:-7;:-o'
+ ';:-vzb;7:16:tj7;18:2:;8y:44:-2bl:2;45:5yn,mp;:-b,lk;:-2,lm;:-1,lm;:p,j'
+ 'i;:-5xb;2:5wx,37;1:2m:-5yk:2;2v:7:9;f:5:;f:7:;f:7:;f:5:;7:5fn,lv;1:2,l'
+ 'v,lc;1:2,lv,ld;1:2,lv,n6;2:6:-5ft:2;e:7:;n:7:3c,qh;7:7:8,qh;7:7:-o,qh;'
+ '7:7:8,qh;7:7:-1k,qh;7:7:8,qh;9:-6,qh;:5hc,qh;:6,qh;1:-3,n6;:1,n6,qh;:1'
+ ':-5j2;1:1:1u;1:5hd,qh;1:-6;3:-5h3,qh;:5ha,qh;:a,qh;1:-7,n6;:1,n6,qh;:3'
+ ':-5h6;3:5hb,qh;5:4,lk,lc;:1,lk,ld;2:3,n6;:1,lk,n6;:1:-5jq;1:1:2k;7:5h5'
+ ',lk,lc;:1,lk,ld;:5,lv;1:-2,n6;:1,lk,n6;:1:-5ju;1:1:2w;1:-2x;5:33,qh;:5'
+ 'h0,qh;:-4,qh;1:7,n6;:1,n6,qh;:1:-5gu;1:1:-2;1:5h1,qh;89:8a;3:o2;:-3d;6'
+ ':-6ea;19:f:c;y:f;mq:p:-p;1ft:1a:-m;2n:1b;1:8ag;:-5ch;:5c1;2:4:-8a0:2;5'
+ ':8bh;:-v;:y;:-1;1:3:-8bj:3;b:1:8cg;1:2q:-8cg:2;2y:2::2;6:nym::nym;nyn:'
+ '16::2;1p:q::2;4h:c::2;f:1o::2;1y:2::2;3:r9h;:8:-r9h:2;c:;1:wmh;2:2:-wm'
+ 'h:2;5:i::2;j:wn9;:b;:-4;:-a;:3;1:-1e;:o;:-l;:-xbp;:a:pr:2;d:;1:1d;:wlv'
+ ';:-5cb;q1:27:2oo;fpr:jii,2u;:1,2x;:1,30;:1,2u,2x;:1,2u,30;:-c,38;:1,38'
+ ';c:-z8,12u;:1,12d;:1,12j;:-9,12u;:b,12l;sp:p:-1cjn;ym:13:-8;4v:z:;1jj:'
+ '1e:-o;2e7:v:w;gwv:v:;o8v:x:-2'
+ # ===== End of Unicode Case Folding table =====
+)
+
+
+def _check_native(tbl):
+ """
+ Determine if Python's own native implementation
+ subsumes the supplied case folding table
+ """
+ try:
+ for i in tbl:
+ stv = chr(i)
+ if stv.casefold() == stv:
+ return False
+ except AttributeError:
+ return False
+ return True
+
+
+# Hoist version check out of function for performance
+SPACE_RE = re.compile(r'[ \t\r\n]+')
+if _check_native(XLAT):
+ def normalize_reference(string):
+ """
+ Normalize reference label: collapse internal whitespace
+ to single space, remove leading/trailing whitespace, case fold.
+ """
+ return SPACE_RE.sub(' ', string[1:-1].strip()).casefold()
+elif sys.version_info >= (3,) or sys.maxunicode > 0xffff:
+ def normalize_reference(string):
+ """
+ Normalize reference label: collapse internal whitespace
+ to single space, remove leading/trailing whitespace, case fold.
+ """
+ return SPACE_RE.sub(' ', string[1:-1].strip()).translate(XLAT)
+else:
+ def _get_smp_regex():
+ xls = sorted(x - 0x10000 for x in XLAT if x >= 0x10000)
+ xls.append(-1)
+ fmt, (dsh, opn, pip, cse) = str('\\u%04x'), str('-[|]')
+ rga, srk, erk = [str(r'[ \t\r\n]+')], 0, -2
+ for k in xls:
+ new_hir = (erk ^ k) >> 10 != 0
+ if new_hir or erk + 1 != k:
+ if erk >= 0 and srk != erk:
+ if srk + 1 != erk:
+ rga.append(dsh)
+ rga.append(fmt % (0xdc00 + (erk & 0x3ff)))
+ if new_hir:
+ if erk >= 0:
+ rga.append(cse)
+ if k < 0:
+ break
+ rga.append(pip)
+ rga.append(fmt % (0xd800 + (k >> 10)))
+ rga.append(opn)
+ srk = k
+ rga.append(fmt % (0xdc00 + (srk & 0x3ff)))
+ erk = k
+ return re.compile(str().join(rga))
+
+ def _subst_handler(matchobj):
+ src = matchobj.group(0)
+ hiv = ord(src[0])
+ if hiv < 0xd800:
+ return ' '
+ return XLAT[0x10000 + ((hiv & 0x3ff) << 10) | (ord(src[1]) & 0x3ff)]
+
+ SMP_RE = _get_smp_regex()
+
+ def normalize_reference(string):
+ """
+ Normalize reference label: collapse internal whitespace
+ to single space, remove leading/trailing whitespace, case fold.
+ """
+ return SMP_RE.sub(_subst_handler, string[1:-1].strip()).translate(XLAT)
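
Rough usage sketch: callers pass the label with its surrounding brackets (hence the `string[1:-1]`), internal whitespace collapses to a single space, and the label is case folded rather than upper-cased as before:

```python
from commonmark.normalize_reference import normalize_reference

print(normalize_reference('[  Foo\t Bar ]'))  # 'foo bar'
print(normalize_reference('[ẞ]'))             # 'ss' (full case folding, not upper())
print(normalize_reference('[Толпой]'))        # 'толпой'
```
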
diff --git a/commonmark/render/html.py b/commonmark/render/html.py
index 66612f7..b4ea345 100644
--- a/commonmark/render/html.py
+++ b/commonmark/render/html.py
@@ -29,8 +29,8 @@ def __init__(self, options={}):
self.last_out = '\n'
self.options = options
- def escape(self, text, preserve_entities):
- return escape_xml(text, preserve_entities)
+ def escape(self, text):
+ return escape_xml(text)
def tag(self, name, attrs=None, selfclosing=None):
"""Helper function to produce an HTML tag."""
@@ -65,10 +65,10 @@ def link(self, node, entering):
if entering:
if not (self.options.get('safe') and
potentially_unsafe(node.destination)):
- attrs.append(['href', self.escape(node.destination, True)])
+ attrs.append(['href', self.escape(node.destination)])
if node.title:
- attrs.append(['title', self.escape(node.title, True)])
+ attrs.append(['title', self.escape(node.title)])
self.tag('a', attrs)
else:
@@ -82,14 +82,14 @@ def image(self, node, entering):
                     self.lit('<img src="" alt="')
                 else:
                     self.lit('<img src="' +
-                             self.escape(node.destination, True) +
+                             self.escape(node.destination) +
                              '" alt="')
             self.disable_tags += 1
         else:
             self.disable_tags -= 1
             if self.disable_tags == 0:
                 if node.title:
-                    self.lit('" title="' + self.escape(node.title, True))
+                    self.lit('" title="' + self.escape(node.title))
                 self.lit('" />')
 
def emph(self, node, entering):
@@ -132,7 +132,7 @@ def code_block(self, node, entering):
attrs = self.attrs(node)
if len(info_words) > 0 and len(info_words[0]) > 0:
attrs.append(['class', 'language-' +
- self.escape(info_words[0], True)])
+ self.escape(info_words[0])])
self.cr()
self.tag('pre')
@@ -214,7 +214,7 @@ def custom_block(self, node, entering):
# Helper methods #
def out(self, s):
- self.lit(self.escape(s, False))
+ self.lit(self.escape(s))
def attrs(self, node):
att = []
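
Net effect on the renderer: `escape()` lost its `preserve_entities` flag, so attribute values and text content go through the same `escape_xml` path and every literal `&`, `<`, `>` and `"` is replaced. A minimal check (output shown is what the simplified escaping should produce):

```python
from commonmark.render.html import HtmlRenderer

renderer = HtmlRenderer()
print(renderer.escape('a < b & "c"'))
# a &lt; b &amp; &quot;c&quot;
```
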
diff --git a/commonmark/tests/unit_tests.py b/commonmark/tests/unit_tests.py
index aebcfbb..e9efef9 100644
--- a/commonmark/tests/unit_tests.py
+++ b/commonmark/tests/unit_tests.py
@@ -25,7 +25,6 @@ def text():
from commonmark.render.html import HtmlRenderer
from commonmark.inlines import InlineParser
from commonmark.node import NodeWalker, Node
-from commonmark.utils import to_camel_case
class TestCommonmark(unittest.TestCase):
@@ -100,6 +99,22 @@ def test_smart_dashes(self):
html = renderer.render(ast)
self.assertEqual(html, expected_html)
+    def test_regex_vulnerability_link_label(self):
+        i = 200
+        while i <= 2000:
+            s = commonmark.commonmark('[' + ('\\' * i) + '\n')
+            self.assertEqual(s, '<p>' + '[' + ('\\' * (i // 2)) + '</p>\n',
+                             '[\\\\... %d deep' % (i,))
+            i *= 10
+
+    def test_regex_vulnerability_link_destination(self):
+        i = 200
+        while i <= 2000:
+            s = commonmark.commonmark(('[](' * i) + '\n')
+            self.assertEqual(s, '<p>' + ('[](' * i) + '</p>\n',
+                             '[]( %d deep' % (i,))
+            i *= 10
+
 
 class TestHtmlRenderer(unittest.TestCase):
     def test_init(self):
@@ -138,16 +153,5 @@ def test_text(self, s):
         self.parser.parse(s)
 
 
-class TestUtils(unittest.TestCase):
-    def test_to_camel_case(self):
-        self.assertEqual(to_camel_case('snake_case'), 'SnakeCase')
-        self.assertEqual(to_camel_case(''), '')
-        self.assertEqual(to_camel_case('word'), 'Word')
-
-    @given(text())
-    def test_random_text(self, s):
-        to_camel_case(s)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/commonmark/utils.py b/commonmark/utils.py
deleted file mode 100644
index 7ea2584..0000000
--- a/commonmark/utils.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def to_camel_case(snake_str):
-    components = snake_str.split('_')
-    return ''.join(x.title() for x in components)
diff --git a/spec.txt b/spec.txt
index 9fd5841..3913de4 100644
--- a/spec.txt
+++ b/spec.txt
@@ -1,8 +1,8 @@
 ---
 title: CommonMark Spec
 author: John MacFarlane
-version: 0.28
-date: '2017-08-01'
+version: 0.29
+date: '2019-04-06'
 license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)'
 ...
 
@@ -248,7 +248,7 @@ satisfactory replacement for a spec.
 
 Because there is no unambiguous spec, implementations have diverged
 considerably.  As a result, users are often surprised to find that
-a document that renders one way on one system (say, a github wiki)
+a document that renders one way on one system (say, a GitHub wiki)
 renders differently on another (say, converting to docbook using
 pandoc).  To make matters worse, because nothing in Markdown counts
 as a "syntax error," the divergence often isn't discovered right away.
@@ -328,8 +328,10 @@ that is not a [whitespace character].
 
 An [ASCII punctuation character](@)
 is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,
-`*`, `+`, `,`, `-`, `.`, `/`, `:`, `;`, `<`, `=`, `>`, `?`, `@`,
-`[`, `\`, `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, or `~`.
+`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F),
+`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040),
+`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060),
+`{`, `|`, `}`, or `~` (U+007B–007E).
 
 A [punctuation character](@) is an [ASCII
 punctuation character] or anything in
@@ -514,8 +516,8 @@ one block element does not affect the inline parsing of any other.
 ## Container blocks and leaf blocks
 
 We can divide blocks into two types:
-[container block](@)s,
-which can contain other blocks, and [leaf block](@)s,
+[container blocks](@),
+which can contain other blocks, and [leaf blocks](@),
 which cannot.
 
 # Leaf blocks
@@ -527,7 +529,7 @@ Markdown document.
 
 A line consisting of 0-3 spaces of indentation, followed by a sequence
 of three or more matching `-`, `_`, or `*` characters, each followed
-optionally by any number of spaces, forms a
+optionally by any number of spaces or tabs, forms a
 [thematic break](@).
 
 ```````````````````````````````` example
@@ -825,7 +827,7 @@ Contents are parsed as inlines:
 ````````````````````````````````
 
 
-Leading and trailing blanks are ignored in parsing inline content:
+Leading and trailing [whitespace] is ignored in parsing inline content:
 
 ```````````````````````````````` example
 # foo
@@ -1024,6 +1026,20 @@ baz*
 baz</em></h1>
 ````````````````````````````````
 
+The contents are the result of parsing the headings's raw
+content as inlines. The heading's raw content is formed by
+concatenating the lines and removing initial and final
+[whitespace].
+
+```````````````````````````````` example
+  Foo *bar
+baz*→
+====
+.
+<h1>Foo <em>bar
+baz</em></h1>
+````````````````````````````````
+
+[Info strings] for tilde code blocks can contain backticks and tildes:
+
+```````````````````````````````` example
+~~~ aaa ```
+foo
+~~~
+.
+<pre><code class="language-aaa">foo
+</code></pre>
+````````````````````````````````
+
+
Closing code fences cannot have [info strings]:
```````````````````````````````` example
@@ -1991,14 +2021,15 @@ Closing code fences cannot have [info strings]:
An [HTML block](@) is a group of lines that is treated
as raw HTML (and will not be escaped in HTML output).
-There are seven kinds of [HTML block], which can be defined
-by their start and end conditions. The block begins with a line that
-meets a [start condition](@) (after up to three spaces
-optional indentation). It ends with the first subsequent line that
-meets a matching [end condition](@), or the last line of
-the document or other [container block]), if no line is encountered that meets the
-[end condition]. If the first line meets both the [start condition]
-and the [end condition], the block will contain just that line.
+There are seven kinds of [HTML block], which can be defined by their
+start and end conditions. The block begins with a line that meets a
+[start condition](@) (after up to three spaces optional indentation).
+It ends with the first subsequent line that meets a matching [end
+condition](@), or the last line of the document, or the last line of
+the [container block](#container-blocks) containing the current HTML
+block, if no line is encountered that meets the [end condition]. If
+the first line meets both the [start condition] and the [end
+condition], the block will contain just that line.
1. **Start condition:** line begins with the string `