markdown.py

# -*- coding: utf-8 -*-

try:
    import _jsre as re
except:
    import re

import random

letters = 'abcdefghijklmnopqrstuvwxyz'
letters += letters.upper()+'0123456789'

class URL:

    def __init__(self,src):
        elts = src.split(maxsplit=1)
        self.href = elts[0]
        self.alt = ''
        if len(elts) == 2:
            alt = elts[1]
            if alt[0] == '"' and alt[-1] == '"':
                self.alt = alt[1:-1]
            elif alt[0] == "'" and alt[-1] == "'":
                self.alt=alt[1:-1]
            elif alt[0] == "(" and alt[-1] == ")":
                self.alt=alt[1:-1]


class CodeBlock:

    def __init__(self, line):
        self.lines = [line]
        if line.startswith("```"):
            if len(line) > 3:
                self.info = line[3:]
            else:
                self.info = "block"
        elif line.startswith("`") and len(line) > 1:
            self.info = line[1:]
        elif line.startswith(">>>"):
            self.info = "python-console"
        else:
            self.info = None

    def to_html(self):
        if self.lines[0].startswith("`"):
            self.lines.pop(0)
        res = escape('\n'.join(self.lines))
        res = unmark(res)
        _class = self.info or "marked"
        res = '<pre class="%s">%s</pre>\n' %(_class, res)
        return res, []


class HtmlBlock:

    def __init__(self, src):
        self.src = src

    def to_html(self):
        return self.src

class Marked:

    def __init__(self, line=''):
        self.line = line
        self.children = []

    def to_html(self):
        return apply_markdown(self.line)

# get references
refs = {}
ref_pattern = r"^\[(.*)\]:\s+(.*)"

def mark(src):

    global refs
    refs = {}
    # split source in sections
    # sections can be :
    # - a block-level HTML element (markdown syntax will not be processed)
    # - a script
    # - a span-level HTML tag (markdown syntax will be processed)
    # - a code block

    # normalise line feeds
    src = src.replace('\r\n','\n')

    # hr line
    src = re.sub(r'\n\n(-|\*| )+\n', '\n<hr>\n', src)

    # lines followed by dashes
    src = re.sub(r'(.*?)\n=+\n', '\n# \\1\n', src)
    src = re.sub(r'(.*?)\n-+\n', '\n## \\1\n', src)

    lines = src.split('\n') + ['']

    i = bq = 0
    ul = ol = 0

    while i<len(lines):

        # enclose lines starting by > in a blockquote
        if lines[i].startswith('>'):
            nb = 1
            while nb < len(lines[i]) and lines[i][nb] == '>':
                nb += 1
            lines[i] = lines[i][nb:]
            if nb>bq:
                lines.insert(i, '<blockquote>' * (nb - bq))
                i += 1
                bq = nb
            elif nb<bq:
                lines.insert(i, '</blockquote>' * (bq - nb))
                i += 1
                bq = nb
        elif bq>0:
            lines.insert(i, '</blockquote>' * bq)
            i += 1
            bq = 0

        # unordered lists
        if (lines[i].strip() and lines[i].lstrip()[0] in '-+*'
                and len(lines[i].lstrip()) > 1
                and lines[i].lstrip()[1] == ' '
                and (i == 0 or ul or not lines[i - 1].strip())):
            # line indentation indicates nesting level
            nb = 1 + len(lines[i]) - len(lines[i].lstrip())
            lines[i] = '<li>' + lines[i][nb:]
            if nb > ul:
                lines.insert(i, '<ul>' * (nb - ul))
                i += 1
            elif nb < ul:
                lines.insert(i, '</ul>' * (ul - nb))
                i += 1
            ul = nb
        elif ul and not lines[i].strip():
            if (i < len(lines) - 1 and lines[i+1].strip()
                    and not lines[i + 1].startswith(' ')):
                nline = lines[i + 1].lstrip()
                if nline[0] in '-+*' and len(nline) > 1 and nline[1] == ' ':
                    pass
                else:
                    lines.insert(i, '</ul>' * ul)
                    i += 1
                    ul = 0

        # ordered lists
        mo = re.search(r'^(\d+\.)', lines[i])
        if mo:
            if not ol:
                lines.insert(i, '<ol>')
                i += 1
            lines[i] = '<li>' + lines[i][len(mo.groups()[0]):]
            ol = 1
        elif (ol and not lines[i].strip() and i < len(lines) - 1
                and not lines[i + 1].startswith(' ')
                and not re.search(r'^(\d+\.)', lines[i + 1])):
            lines.insert(i, '</ol>')
            i += 1
            ol = 0

        i += 1

    if ul:
        lines.append('</ul>' * ul)
    if ol:
        lines.append('</ol>' * ol)
    if bq:
        lines.append('</blockquote>' * bq)

    sections = []
    scripts = []
    section = Marked()

    i = 0
    while i < len(lines):
        line = lines[i]
        if line.strip() and line.startswith('    '):
            if isinstance(section, Marked) and section.line:
                sections.append(section)
            section = CodeBlock(line[4:])
            j = i + 1
            while j < len(lines) and lines[j].startswith('    '):
                section.lines.append(lines[j][4:])
                j += 1
            sections.append(section)
            section = Marked()
            i = j
            continue

        elif line.strip() and line.startswith("```"):
            # fenced code blocks à la Github Flavoured Markdown
            if isinstance(section, Marked) and section.line:
                sections.append(section)
            section = CodeBlock(line)
            j = i + 1
            while j < len(lines) and not lines[j].startswith("```"):
                section.lines.append(lines[j])
                j += 1
            sections.append(section)
            section = Marked()
            i = j+1
            continue

        elif line.lower().startswith('<script'):
            if isinstance(section, Marked) and section.line:
                sections.append(section)
                section = Marked()
            j = i + 1
            while j < len(lines):
                if lines[j].lower().startswith('</script>'):
                    scripts.append('\n'.join(lines[i + 1:j]))
                    for k in range(i, j + 1):
                        lines[k] = ''
                    break
                j += 1
            i = j
            continue

        # atext header
        elif line.startswith('#'):
            level = 1
            line = lines[i]
            while level < len(line) and line[level] == '#' and level <= 6:
                level += 1
            if not line[level + 1:].strip():
                if level == 1:
                    i += 1
                    continue
                else:
                    lines[i] = '<H%s>%s</H%s>\n' %(level - 1, '#', level - 1)
            else:
                lines[i] = '<H%s>%s</H%s>\n' %(level, line[level + 1:], level)

        else:
            mo = re.search(ref_pattern, line)
            if mo is not None:
                if isinstance(section, Marked) and section.line:
                    sections.append(section)
                    section = Marked()
                key = mo.groups()[0]
                value = URL(mo.groups()[1])
                refs[key.lower()] = value
            else:
                if not line.strip():
                    line = '<p></p>'
                if section.line:
                    section.line += '\n'
                section.line += line

            i += 1

    if isinstance(section, Marked) and section.line:
        sections.append(section)

    res = ''
    for section in sections:
        mk, _scripts = section.to_html()
        res += mk
        scripts += _scripts

    return res, scripts

def escape(czone):
    czone = czone.replace('&', '&amp;')
    czone = czone.replace('<', '&lt;')
    czone = czone.replace('>', '&gt;')
    czone = czone.replace('_', '&#95;')
    czone = czone.replace('*', '&#42;')
    return czone

def s_escape(mo):
    # used in re.sub
    czone = mo.string[mo.start():mo.end()]
    return escape(czone)

def unmark(code_zone):
    # convert _ to &#95; inside inline code
    code_zone = code_zone.replace('_', '&#95;')
    return code_zone

def s_unmark(mo):
    # convert _ to &#95; inside inline code
    code_zone = mo.string[mo.start():mo.end()]
    code_zone = code_zone.replace('_', '&#95;')
    return code_zone

def apply_markdown(src):

    scripts = []
    key = None

    i = 0
    while i < len(src):
        if src[i] == '[':
            start_a = i+1
            while True:
                end_a = src.find(']', i)
                if end_a == -1:
                    break
                if src[end_a - 1]=='\\':
                    i = end_a + 1
                else:
                    break
            if end_a > -1 and src[start_a:end_a].find('\n') == -1:
                link = src[start_a:end_a]
                rest = src[end_a + 1:].lstrip()
                if rest and rest[0] == '(':
                    j = 0
                    while True:
                        end_href = rest.find(')', j)
                        if end_href == -1:
                            break
                        if rest[end_href - 1] == '\\':
                            j = end_href + 1
                        else:
                            break
                    if end_href > -1 and rest[:end_href].find('\n') == -1:
                        tag = ('<a href="' + rest[1:end_href] + '">' + link
                            + '</a>')
                        src = src[:start_a - 1] + tag + rest[end_href + 1:]
                        i = start_a + len(tag)
                elif rest and rest[0] == '[':
                    j = 0
                    while True:
                        end_key = rest.find(']', j)
                        if end_key == -1:
                            break
                        if rest[end_key-1] == '\\':
                            j = end_key + 1
                        else:
                            break
                    if end_key > -1 and rest[:end_key].find('\n') == -1:
                        if not key:
                            key = link
                        if key.lower() not in refs:
                            raise KeyError('unknown reference %s' %key)
                        url = refs[key.lower()]
                        tag = '<a href="' + url + '">' + link + '</a>'
                        src = src[:start_a - 1] + tag + rest[end_key + 1:]
                        i = start_a + len(tag)

        i += 1

    # before applying the markup with _ and *, isolate HTML tags because
    # they can contain these characters

    # We replace them temporarily by a random string. The string starts
    # and ends with a whitespace so that in the new source, the text before
    # the random string has a word end. E.g. <h2>_italic_</h2> is replaced
    # by something like " agFyf_italic_ agFyf" so that the markdown for
    # _italic_ can be applied.
    rstr = ' '+''.join(random.choice(letters) for i in range(16)) + ' '

    i = 0
    state = None
    start = -1
    data = ''
    tags = []
    while i < len(src):
        if src[i] == '<':
            j = i + 1
            while j < len(src):
                if src[j] == '"' or src[j] == "'":
                    if state == src[j] and src[j - 1] != '\\':
                        state = None
                        j = start + len(data) + 1
                        data = ''
                    elif state is None:
                        state = src[j]
                        start = j
                    else:
                        data += src[j]
                elif src[j] == '>' and state is None:
                    tags.append(src[i:j + 1])
                    src = src[:i] + rstr + src[j + 1:]
                    i += len(rstr)
                    break
                elif state == '"' or state == "'":
                    data += src[j]
                elif src[j] == '\n':
                    # if a sign < is not followed by > in the same ligne, it
                    # is the sign "lesser than"
                    src = src[:i] + '&lt;' + src[i + 1:]
                    j = i + 4
                    break
                j += 1
        elif src[i] == '`' and i > 0 and src[i - 1] != '\\':
            # ignore the content of inline code
            j = i + 1
            while j < len(src):
                if src[j] == '`' and src[j - 1] != '\\':
                    break
                j += 1
            i = j
        i += 1

    # escape "<", ">", "&" and "_" in inline code
    code_pattern = r'\`(.*?)\`'
    src = re.sub(code_pattern, s_escape, src)

    # replace escaped ` _ * by HTML characters
    src = src.replace(r'\\`', '&#96;')
    src = src.replace(r'\_', '&#95;')
    src = src.replace(r'\*', '&#42;')

    # emphasis
    strong_patterns = [('STRONG', r'\*\*(.+?)\*\*'), ('B', r'__(.+?)__')]
    for tag,strong_pattern in strong_patterns:
        src = re.sub(strong_pattern, r'<%s>\1</%s>' %(tag, tag), src)

    # EM for *xxx*
    src = re.sub(r'\*(.+?)\*', r'<%s>\1</%s>' %('EM', 'EM'), src)

    # I for _xxx_ where the _ are at the beginning or end of a word
    # An underscore inside a word is ignored.
    src = re.sub(r'\b_(.*?)_\b', r'<I>\1</I>', src,
        flags=re.M)

    # inline code
    code_pattern = r'\`(.*?)\`'
    src = re.sub(code_pattern, r'<code>\1</code>', src)

    # restore tags
    while True:
        pos = src.rfind(rstr)
        if pos==-1:
            break
        repl = tags.pop()
        src = src[:pos] + repl + src[pos + len(rstr):]

    src = '<p>' + src + '</p>'

    return src, scripts


if __name__ == '__main__':
    import sys
    data = ''.join(sys.stdin)

    print(mark(data)[0], end='')