From 2bd07726854e184a0f2ed0a79871901b16206fcb Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 4 Apr 2024 18:55:54 +0000 Subject: [PATCH 01/11] Avoid inline styles inside `<code>` / `<pre>`
 conversion (#117)

* Avoid inline styles inside `<code>` / `<pre>` conversion

The check used for this is analogous to that used to avoid escaping
potential markup characters inside such tags.

Fixes #103

---------

Co-authored-by: AlexVonB 
---
 markdownify/__init__.py   |  2 ++
 tests/test_conversions.py | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 86226d2..0945916 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -48,6 +48,8 @@ def abstract_inline_conversion(markup_fn):
     """
     def implementation(self, el, text, convert_as_inline):
         markup = markup_fn(self)
+        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+            return text
         prefix, suffix, text = chomp(text)
         if not text:
             return ''
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 1e685f3..9652143 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -87,6 +87,16 @@ def test_code():
     assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
     assert md('this  should\t\tnormalize') == '`this should normalize`'
     assert md('this  should\t\tnormalize') == '`this should normalize`'
+    assert md('foobarbaz') == '`foobarbaz`'
+    assert md('foobarbaz') == '`foobarbaz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foo bar baz') == '`foo bar baz`'
+    assert md('foobarbaz', sup_symbol='^') == '`foobarbaz`'
+    assert md('foobarbaz', sub_symbol='^') == '`foobarbaz`'
 
 
 def test_del():
@@ -215,6 +225,17 @@ def test_pre():
     assert md('
*this_should_not_escape*
') == '\n```\n*this_should_not_escape*\n```\n' assert md('
\t\tthis  should\t\tnot  normalize
') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' assert md('
\t\tthis  should\t\tnot  normalize
') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbaz
') == '\n```\nfoo\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
foo\nbar\nbaz
', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n' def test_script(): From 46af45bb3c392180c254a3f97f6bcb8bfecb8116 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 4 Apr 2024 19:42:58 +0000 Subject: [PATCH 02/11] Escape all characters with Markdown significance (#118) * Escape all characters with Markdown significance There are many punctuation characters that sometimes have significance in Markdown; more systematically escape them all (based on a new escape_misc configuration option). A limited attempt is made to limit the escaping of '.' and ')' to the context where they might have Markdown significance (after a number, where they can indicate an ordered list item); no such attempt is made for the other characters (and even that limiting of '.' and ')' may not be entirely safe in all cases, as it's possible the HTML could have the number outside the block being escaped in one go, e.g. `1.`. --------- Co-authored-by: AlexVonB --- README.rst | 5 +++++ markdownify/__init__.py | 4 ++++ tests/test_escaping.py | 23 +++++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 51888ea..a0cd678 100644 --- a/README.rst +++ b/README.rst @@ -123,6 +123,11 @@ escape_underscores If set to ``False``, do not escape ``_`` to ``\_`` in text. Defaults to ``True``. +escape_misc + If set to ``False``, do not escape miscellaneous punctuation characters + that sometimes have Markdown significance in text. + Defaults to ``True``. + keep_inline_images_in Images are converted to their alt-text when the images are located inside headlines or table cells. 
If some inline images should be converted to diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 0945916..eaa6ded 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -71,6 +71,7 @@ class DefaultOptions: default_title = False escape_asterisks = True escape_underscores = True + escape_misc = True heading_style = UNDERLINED keep_inline_images_in = [] newline_style = SPACES @@ -201,6 +202,9 @@ def should_convert_tag(self, tag): def escape(self, text): if not text: return '' + if self.options['escape_misc']: + text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text) + text = re.sub(r'([0-9])([.)])', r'\1\\\2', text) if self.options['escape_asterisks']: text = text.replace('*', r'\*') if self.options['escape_underscores']: diff --git a/tests/test_escaping.py b/tests/test_escaping.py index 2f3a83e..eaef77d 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -12,7 +12,7 @@ def test_underscore(): def test_xml_entities(): - assert md('&') == '&' + assert md('&') == r'\&' def test_named_entities(): @@ -25,4 +25,23 @@ def test_hexadecimal_entities(): def test_single_escaping_entities(): - assert md('&amp;') == '&' + assert md('&amp;') == r'\&' + + +def text_misc(): + assert md('\\*') == r'\\\*' + assert md('') == r'\' + assert md('# foo') == r'\# foo' + assert md('> foo') == r'\> foo' + assert md('~~foo~~') == r'\~\~foo\~\~' + assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n' + assert md('---\n') == '\\-\\-\\-\n' + assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n' + assert md('`x`') == r'\`x\`' + assert md('[text](link)') == r'\[text](link)' + assert md('1. x') == r'1\. x' + assert md('not a number. x') == r'not a number. 
x' + assert md('1) x') == r'1\) x' + assert md('not a number) x') == r'not a number) x' + assert md('|not table|') == r'\|not table\|' + assert md(r'\ &amp; | ` `', escape_misc=False) == r'\ & | ` `' From 43dbe20aaf9d11c38c9dab7d0e8f30dfdedf19e7 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Thu, 4 Apr 2024 21:49:45 +0200 Subject: [PATCH 03/11] fixed github action badges see https://github.com/badges/shields/issues/8671 --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a0cd678..35d58fd 100644 --- a/README.rst +++ b/README.rst @@ -1,8 +1,8 @@ |build| |version| |license| |downloads| -.. |build| image:: https://img.shields.io/github/workflow/status/matthewwithanm/python-markdownify/Python%20application/develop +.. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop :alt: GitHub Workflow Status - :target: https://github.com/matthewwithanm/python-markdownify/actions?query=workflow%3A%22Python+application%22 + :target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22 .. 
|version| image:: https://img.shields.io/pypi/v/markdownify :alt: Pypi version From c1672aee444d4fa8c76a7be37b0746ce769d2631 Mon Sep 17 00:00:00 2001 From: samypr100 <3933065+samypr100@users.noreply.github.com> Date: Sun, 23 Jun 2024 06:59:14 -0400 Subject: [PATCH 04/11] Update MANIFEST.in to exclude tests during packaging (#125) --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 9561fb1..70656c8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,2 @@ include README.rst +prune tests From 2ec33384de85d0906b4b40a59f1a3650846150cb Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 23 Jun 2024 13:17:20 +0200 Subject: [PATCH 05/11] handle un-parsable colspan values fixes #126 --- markdownify/__init__.py | 6 +++--- tests/test_tables.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index eaa6ded..6a983d9 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -383,13 +383,13 @@ def convert_figcaption(self, el, text, convert_as_inline): def convert_td(self, el, text, convert_as_inline): colspan = 1 - if 'colspan' in el.attrs: + if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = int(el['colspan']) return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_th(self, el, text, convert_as_inline): colspan = 1 - if 'colspan' in el.attrs: + if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = int(el['colspan']) return ' ' + text.strip().replace("\n", " ") + ' |' * colspan @@ -406,7 +406,7 @@ def convert_tr(self, el, text, convert_as_inline): # first row and is headline: print headline underline full_colspan = 0 for cell in cells: - if "colspan" in cell.attrs: + if 'colspan' in cell.attrs and cell['colspan'].isdigit(): full_colspan += int(cell["colspan"]) else: full_colspan += 1 diff --git a/tests/test_tables.py b/tests/test_tables.py index 9120c29..594e5bf 100644 --- a/tests/test_tables.py +++ 
b/tests/test_tables.py @@ -215,7 +215,7 @@ Age - Jill + Jill Smith 50 @@ -226,6 +226,17 @@ """ +table_with_undefined_colspan = """ + + + + + + + + +
NameAge
JillSmith
""" + def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' @@ -240,3 +251,4 @@ def test_table(): assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' From 7861b330cd05c0c19fc496530f02922d5493c568 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Sun, 23 Jun 2024 11:28:05 +0000 Subject: [PATCH 06/11] Special-case use of HTML tags for converting `` / `` (#119) Allow different strings before / after `` / `` content In particular, this allows setting `sub_symbol=''`, `sup_symbol=''`, to use raw HTML in the output when converting subscripts and superscripts. --- README.rst | 6 +++++- markdownify/__init__.py | 9 +++++++-- tests/test_conversions.py | 2 ++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 35d58fd..55ea7cf 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,11 @@ strong_em_symbol sub_symbol, sup_symbol Define the chars that surround ```` and ```` text. Defaults to an empty string, because this is non-standard behavior. Could be something like - ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. + ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts + with ``<`` and ends with ``>``, it is treated as an HTML tag and a ``/`` is + inserted after the ``<`` in the string used after the text; this allows + specifying ```` to use raw HTML in the output for subscripts, for + example. newline_style Defines the style of marking linebreaks (``
``) in markdown. The default diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 6a983d9..d7bd780 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -43,17 +43,22 @@ def abstract_inline_conversion(markup_fn): """ This abstracts all simple inline tags like b, em, del, ... Returns a function that wraps the chomped text in a pair of the string - that is returned by markup_fn. markup_fn is necessary to allow for + that is returned by markup_fn, with '/' inserted in the string used after + the text if it looks like an HTML tag. markup_fn is necessary to allow for references to self.strong_em_symbol etc. """ def implementation(self, el, text, convert_as_inline): markup = markup_fn(self) + if markup.startswith('<') and markup.endswith('>'): + markup_after = 'foo
') == 'foo' assert md('foo', sub_symbol='~') == '~foo~' + assert md('foo', sub_symbol='') == 'foo' def test_sup(): assert md('foo') == 'foo' assert md('foo', sup_symbol='^') == '^foo^' + assert md('foo', sup_symbol='') == 'foo' def test_lang(): From 50b4640db2d7f88b44c20f947e705ba59f1b9fe0 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 23 Jun 2024 13:30:08 +0200 Subject: [PATCH 07/11] better naming for markup variables --- markdownify/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index d7bd780..2f71cad 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -48,17 +48,17 @@ def abstract_inline_conversion(markup_fn): references to self.strong_em_symbol etc. """ def implementation(self, el, text, convert_as_inline): - markup = markup_fn(self) - if markup.startswith('<') and markup.endswith('>'): - markup_after = ''): + markup_suffix = ' Date: Sun, 23 Jun 2024 20:28:53 +0800 Subject: [PATCH 08/11] handle ol start value is not number (#127) Co-authored-by: Mico --- markdownify/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 2f71cad..cd66a39 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -326,7 +326,7 @@ def convert_list(self, el, text, convert_as_inline): def convert_li(self, el, text, convert_as_inline): parent = el.parent if parent is not None and parent.name == 'ol': - if parent.get("start"): + if parent.get("start") and str(parent.get("start")).isnumeric(): start = int(parent.get("start")) else: start = 1 From 0a5c89aa493ae0cdc090305ba14ef7fa1c6f13c4 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 23 Jun 2024 14:30:07 +0200 Subject: [PATCH 09/11] added test for ol start check --- tests/test_lists.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_lists.py b/tests/test_lists.py index 5a04430..35eee13 100644 --- 
a/tests/test_lists.py +++ b/tests/test_lists.py @@ -43,6 +43,9 @@ def test_ol(): assert md('
  1. a
  2. b
') == '1. a\n2. b\n' assert md('
  1. a
  2. b
') == '3. a\n4. b\n' + assert md('
  1. a
  2. b
') == '1. a\n2. b\n' + assert md('
  1. a
  2. b
') == '1. a\n2. b\n' + assert md('
  1. a
  2. b
') == '1. a\n2. b\n' def test_nested_ols(): From 75a678dab9d7cec2c18b58489ea4a66b6f794908 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 14 Jul 2024 21:02:49 +0200 Subject: [PATCH 10/11] fix pytest version to 8 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 9eb8750..54ba143 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ envlist = py38 [testenv] passenv = PYTHONPATH deps = - pytest + pytest==8 flake8 restructuredtext_lint Pygments From f6c8daf8a58948c88256a09a60085e28e628564e Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 14 Jul 2024 21:19:23 +0200 Subject: [PATCH 11/11] bump to v0.13.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9a26468..9a703d0 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.12.1', + '__version__': '0.13.0', } read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()