Restore previous version of markdown_mistune.py.

jupyter · Feb 3, 2025 · 6080520 · 6080520
1 parent 3b45be0
commit 6080520
Show file tree

Hide file tree

Showing 4 changed files with 7,757 additions and 35 deletions.
diff --git a/Untitled.ipynb b/Untitled.ipynb
@@ -0,0 +1,57 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e1ecc100-fae0-422f-bcf6-73fbb2bdd215",
+   "metadata": {},
+   "source": [
+    "#test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23c99cdf-7405-4c4f-889e-2ce72db47d23",
+   "metadata": {},
+   "source": [
+    "##test1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "422906ca-be89-4a2a-823f-e498ea193767",
+   "metadata": {},
+   "source": [
+    "##test2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ac26a764-016b-4a8d-82b0-b34f2961c36a",
+   "metadata": {},
+   "source": [
+    "##test3"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbconvert/filters/markdown_mistune.py b/nbconvert/filters/markdown_mistune.py
@@ -13,7 +13,7 @@
 
 import bs4
 import mistune
-from nbformat import NotebookNode
+from mistune.renderers.markdown import MarkdownRenderer
 from pygments import highlight
 from pygments.formatters import HtmlFormatter
 from pygments.lexer import Lexer
@@ -36,7 +36,7 @@ def __call__(self, markdown: "Markdown") -> None:
 
 
 try:  # for Mistune >= 3.0
-    from mistune import (  # type:ignore[attr-defined]
+    from mistune import (# type:ignore[attr-defined]
         BlockParser,
         BlockState,
         HTMLRenderer,
@@ -52,7 +52,7 @@ def __call__(self, markdown: "Markdown") -> None:
 except ImportError:  # for Mistune >= 2.0
     import re
 
-    from mistune import (  # type: ignore[attr-defined]
+    from mistune import (# type: ignore[attr-defined]
         PLUGINS,
         BlockParser,
         HTMLRenderer,
@@ -63,8 +63,8 @@ def __call__(self, markdown: "Markdown") -> None:
     MISTUNE_V3 = False
     MISTUNE_V3_ATX = False
 
-    def import_plugin(name: str) -> "MarkdownPlugin":  # type: ignore[misc]
-        """Simple implementation of Mistune V3"s import_plugin for V2."""
+    def import_plugin(name: str) -> "Plugin":  # type: ignore[misc]
+        """Simple implementation of Mistune V3's import_plugin for V2."""
         return PLUGINS[name]  # type: ignore[no-any-return]
 
 
@@ -73,7 +73,7 @@ class InvalidNotebook(Exception):
 
 
 def _dotall(pattern: str) -> str:
-    """Makes the "." special character match any character inside the pattern, including a newline.
+    """Makes the '.' special character match any character inside the pattern, including a newline.
 
     This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.
     It is useful for LaTeX environments, where line breaks may be present.
@@ -88,7 +88,7 @@ class MathBlockParser(BlockParser):
         order to avoid other block level rules splitting math sections apart.
 
         It works by matching each multiline math environment as a single paragraph,
-        so that other rules don"t think each section is its own paragraph. Inline
+        so that other rules don't think each section is its own paragraph. Inline
         is ignored here.
         """
 
@@ -216,7 +216,7 @@ class MathBlockParser(BlockParser):  # type: ignore[no-redef]
             re.DOTALL,
         )
 
-        # Regex for header that doesn"t require space after "#"
+        # Regex for header that doesn't require space after '#'
         AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")
 
         # Multiline math must be searched before other rules
@@ -257,7 +257,7 @@ class MathInlineParser(InlineParser):  # type: ignore[no-redef]
 
         def parse_block_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
             """Parse block text math."""
-            # sometimes the Scanner keeps the final "$$", so we use the
+            # sometimes the Scanner keeps the final '$$', so we use the
             # full matched string and remove the math markers
             text = m.group(0)[2:-2]
             return "block_math", text
@@ -295,6 +295,7 @@ def __init__(
         anchor_link_text: str = "¶",
         path: str = "",
         attachments: Optional[Dict[str, Dict[str, str]]] = None,
+        **lexer_options,
     ):
         """Initialize the renderer."""
         super().__init__(escape, allow_harmful_protocols)
@@ -308,7 +309,7 @@ def __init__(
         else:
             self.attachments = {}
 
-    def block_code(self, code: str, info: Optional[str] = None) -> str:
+    def block_code(self, code: str, info: Optional[str]=None) -> str:
         """Handle block code."""
         lang: Optional[str] = ""
         lexer: Optional[Lexer] = None
@@ -381,7 +382,7 @@ def inline_math(self, body: str) -> str:
         """Handle inline math."""
         return f"${self.escape_html(body)}$"
 
-    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
+    def image(self, text: str, url: str, title: Optional[str]=None) -> str:
         """Rendering a image with title and text.
 
         :param text: alt text of the image.
@@ -405,7 +406,7 @@ def _embed_image_or_attachment(self, src: str) -> str:
 
         attachment_prefix = "attachment:"
         if src.startswith(attachment_prefix):
-            name = src[len(attachment_prefix) :]
+            name = src[len(attachment_prefix):]
 
             if name not in self.attachments:
                 msg = f"missing attachment: {name}"
@@ -451,7 +452,7 @@ def _html_embed_images(self, html: str) -> str:
         parsed_html = bs4.BeautifulSoup(html, features="html.parser")
         imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")
 
-        # Replace img tags"s sources by base64 dataurls
+        # Replace img tags' sources by base64 dataurls
         for img in imgs:
             src = img.attrs.get("src")
             if src is None:
@@ -482,7 +483,7 @@ def __init__(
         renderer: HTMLRenderer,
         block: Optional[BlockParser] = None,
         inline: Optional[InlineParser] = None,
-        plugins: Optional[Iterable[MarkdownPlugin]] = None,
+        plugins: Optional[Iterable["Plugin"]] = None,
     ):
         """Initialize the parser."""
         if block is None:
@@ -507,28 +508,37 @@ def markdown2html_mistune(source: str) -> str:
     return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)
 
 
-def extract_titles_from_notebook_node(nb: NotebookNode):
-    """Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook
-    The input argument is the notebooknode from which a single string with all the markdown content concatenated
-    The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s"""
+class HeadingExtractor(MarkdownRenderer):
+    """A renderer to capture headings"""
 
-    cells_html_collection = ""
-    for cell in nb.cells:
-        if cell.cell_type == "markdown":
-            markdown_source = cell.source
-            html_source = mistune.html(markdown_source)  # convert all the markdown sources to html
-            cells_html_collection = cells_html_collection + html_source + "\n"
+    def __init__(self):
+        """Initialize the class."""
+        super().__init__()
+        self.headings = []
 
+    def heading(self, text, level):
+        """Return an empty string for the headings to avoid outputting them."""
+        self.headings.append((level, text))
+        return ""
+
+
+def extract_titles_from_markdown_input(markdown_input):
+    """  Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook"""
+    """ The input argument is markdown_input that is a single string with all the markdown content concatenated """
+    """ The output is an array containing information about the headings such as their level, their text content, an identifier and an href that can be used in the case of HTML converters."""
     titles_array = []
-    html_collection = bs4.BeautifulSoup(cells_html_collection, "html.parser")
-    headings = html_collection.select("h1, h2, h3, h4, h5, h6")
-
-    # Iterate on all headings to get the necessary information on the various titles
-    for heading in headings:
-        text = heading.get_text().lstrip().rstrip()
-        level = int(heading.name[1])
-        header_id = text.replace(" ", "-")
-        heading["id"] = header_id
-        href = "#" + header_id
-        titles_array.append([str(heading), level, href])
+    renderer = HeadingExtractor()
+    extract_titles = mistune.create_markdown(renderer=renderer)
+    extract_titles(markdown_input)
+    headings = renderer.headings
+
+    """ Iterate on all headings to get the necessary information on the various titles """
+    for __, title in headings:
+        children = title["children"]
+        attrs = title["attrs"]
+        raw_text = children[0]["raw"]
+        header_level = attrs["level"]
+        id = raw_text.replace(' ', '-')
+        href = "#" + id
+        titles_array.append([header_level, raw_text, id, href])
     return titles_array