Skip to content

Commit

Permalink
Restore previous version of markdown_mistune.py.
Browse files Browse the repository at this point in the history
  • Loading branch information
HaudinFlorence committed Feb 3, 2025
1 parent 3b45be0 commit 6080520
Show file tree
Hide file tree
Showing 4 changed files with 7,757 additions and 35 deletions.
57 changes: 57 additions & 0 deletions Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e1ecc100-fae0-422f-bcf6-73fbb2bdd215",
"metadata": {},
"source": [
"#test"
]
},
{
"cell_type": "markdown",
"id": "23c99cdf-7405-4c4f-889e-2ce72db47d23",
"metadata": {},
"source": [
"##test1"
]
},
{
"cell_type": "markdown",
"id": "422906ca-be89-4a2a-823f-e498ea193767",
"metadata": {},
"source": [
"##test2"
]
},
{
"cell_type": "markdown",
"id": "ac26a764-016b-4a8d-82b0-b34f2961c36a",
"metadata": {},
"source": [
"##test3"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
80 changes: 45 additions & 35 deletions nbconvert/filters/markdown_mistune.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import bs4
import mistune
from nbformat import NotebookNode
from mistune.renderers.markdown import MarkdownRenderer
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexer import Lexer
Expand All @@ -36,7 +36,7 @@ def __call__(self, markdown: "Markdown") -> None:


try: # for Mistune >= 3.0
from mistune import ( # type:ignore[attr-defined]
from mistune import (# type:ignore[attr-defined]
BlockParser,
BlockState,
HTMLRenderer,
Expand All @@ -52,7 +52,7 @@ def __call__(self, markdown: "Markdown") -> None:
except ImportError: # for Mistune >= 2.0
import re

from mistune import ( # type: ignore[attr-defined]
from mistune import (# type: ignore[attr-defined]
PLUGINS,
BlockParser,
HTMLRenderer,
Expand All @@ -63,8 +63,8 @@ def __call__(self, markdown: "Markdown") -> None:
MISTUNE_V3 = False
MISTUNE_V3_ATX = False

def import_plugin(name: str) -> "MarkdownPlugin": # type: ignore[misc]
"""Simple implementation of Mistune V3"s import_plugin for V2."""
def import_plugin(name: str) -> "Plugin": # type: ignore[misc]
    """Simple implementation of Mistune V3's import_plugin for V2.

    The plugin is looked up by name in the ``PLUGINS`` mapping imported
    from Mistune.

    :param name: key of the plugin in ``PLUGINS``.
    :returns: the entry stored under ``name`` in ``PLUGINS``.
    """
    return PLUGINS[name] # type: ignore[no-any-return]


Expand All @@ -73,7 +73,7 @@ class InvalidNotebook(Exception):


def _dotall(pattern: str) -> str:
"""Makes the "." special character match any character inside the pattern, including a newline.
"""Makes the '.' special character match any character inside the pattern, including a newline.
This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.
It is useful for LaTeX environments, where line breaks may be present.
Expand All @@ -88,7 +88,7 @@ class MathBlockParser(BlockParser):
order to avoid other block level rules splitting math sections apart.
It works by matching each multiline math environment as a single paragraph,
so that other rules don"t think each section is its own paragraph. Inline
so that other rules don't think each section is its own paragraph. Inline
is ignored here.
"""

Expand Down Expand Up @@ -216,7 +216,7 @@ class MathBlockParser(BlockParser): # type: ignore[no-redef]
re.DOTALL,
)

# Regex for header that doesn"t require space after "#"
# Regex for header that doesn't require space after '#'
AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

# Multiline math must be searched before other rules
Expand Down Expand Up @@ -257,7 +257,7 @@ class MathInlineParser(InlineParser): # type: ignore[no-redef]

def parse_block_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
"""Parse block text math."""
# sometimes the Scanner keeps the final "$$", so we use the
# sometimes the Scanner keeps the final '$$', so we use the
# full matched string and remove the math markers
text = m.group(0)[2:-2]
return "block_math", text
Expand Down Expand Up @@ -295,6 +295,7 @@ def __init__(
anchor_link_text: str = "¶",
path: str = "",
attachments: Optional[Dict[str, Dict[str, str]]] = None,
**lexer_options,
):
"""Initialize the renderer."""
super().__init__(escape, allow_harmful_protocols)
Expand All @@ -308,7 +309,7 @@ def __init__(
else:
self.attachments = {}

def block_code(self, code: str, info: Optional[str] = None) -> str:
def block_code(self, code: str, info: Optional[str]=None) -> str:
"""Handle block code."""
lang: Optional[str] = ""
lexer: Optional[Lexer] = None
Expand Down Expand Up @@ -381,7 +382,7 @@ def inline_math(self, body: str) -> str:
"""Handle inline math."""
return f"${self.escape_html(body)}$"

def image(self, text: str, url: str, title: Optional[str] = None) -> str:
def image(self, text: str, url: str, title: Optional[str]=None) -> str:
"""Rendering a image with title and text.
:param text: alt text of the image.
Expand All @@ -405,7 +406,7 @@ def _embed_image_or_attachment(self, src: str) -> str:

attachment_prefix = "attachment:"
if src.startswith(attachment_prefix):
name = src[len(attachment_prefix) :]
name = src[len(attachment_prefix):]

if name not in self.attachments:
msg = f"missing attachment: {name}"
Expand Down Expand Up @@ -451,7 +452,7 @@ def _html_embed_images(self, html: str) -> str:
parsed_html = bs4.BeautifulSoup(html, features="html.parser")
imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

# Replace img tags"s sources by base64 dataurls
# Replace img tags' sources by base64 data URLs
for img in imgs:
src = img.attrs.get("src")
if src is None:
Expand Down Expand Up @@ -482,7 +483,7 @@ def __init__(
renderer: HTMLRenderer,
block: Optional[BlockParser] = None,
inline: Optional[InlineParser] = None,
plugins: Optional[Iterable[MarkdownPlugin]] = None,
plugins: Optional[Iterable["Plugin"]] = None,
):
"""Initialize the parser."""
if block is None:
Expand All @@ -507,28 +508,37 @@ def markdown2html_mistune(source: str) -> str:
return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)


def extract_titles_from_notebook_node(nb: NotebookNode):
    """Collect the headings of all the Markdown cells of a notebook.

    Every Markdown cell is converted to HTML with ``mistune.html``, the
    results are concatenated, and the ``h1`` to ``h6`` elements are read
    back with BeautifulSoup.

    :param nb: the notebook node whose cells are scanned.
    :returns: a list with one ``[html, level, href]`` entry per heading,
        where ``html`` is the heading tag serialized with its ``id``
        attribute set, ``level`` is the heading level as an int, and
        ``href`` is ``"#"`` followed by that id.
    """
    # Convert all the Markdown sources to HTML, one line break after each cell.
    cells_html_collection = "".join(
        mistune.html(cell.source) + "\n"
        for cell in nb.cells
        if cell.cell_type == "markdown"
    )
    html_collection = bs4.BeautifulSoup(cells_html_collection, "html.parser")

    titles_array = []
    # Iterate on all headings to get the necessary information on the titles.
    for heading in html_collection.select("h1, h2, h3, h4, h5, h6"):
        text = heading.get_text().strip()
        level = int(heading.name[1])  # tag name is "h1".."h6"
        header_id = text.replace(" ", "-")
        heading["id"] = header_id
        titles_array.append([str(heading), level, "#" + header_id])
    return titles_array


class HeadingExtractor(MarkdownRenderer):
    """A renderer to capture headings.

    Each call to :meth:`heading` is stored in ``self.headings`` as a
    ``(level, text)`` tuple, in call order.
    """

    def __init__(self):
        """Initialize the class with an empty list of captured headings."""
        super().__init__()
        self.headings = []

    def heading(self, text, level):
        """Return an empty string for the headings to avoid outputting them.

        NOTE(review): ``extract_titles_from_markdown_input`` reads the second
        item of each stored tuple (``text``) as a token dict with
        ``"children"`` and ``"attrs"`` keys, so Mistune presumably calls this
        method with ``(token, state)``. Confirm against the Mistune version
        in use before renaming the parameters.
        """
        self.headings.append((level, text))
        return ""


def _heading_raw_text(token):
    """Return the raw text of a token, concatenating nested children."""
    raw = token.get("raw")
    if isinstance(raw, str):
        return raw
    # Tokens without "raw" (e.g. emphasis, links) keep their text in children;
    # a token with neither contributes an empty string.
    return "".join(_heading_raw_text(child) for child in token.get("children") or [])


def extract_titles_from_markdown_input(markdown_input):
    """Collect all the headings found in a Markdown string.

    A Markdown parser is created with a ``HeadingExtractor`` renderer, the
    input is rendered once, and every captured heading is turned into an
    entry of the result.

    :param markdown_input: a single string with all the Markdown content
        concatenated.
    :returns: a list with one ``[level, text, id, href]`` entry per heading,
        where ``text`` is the raw heading text, ``id`` is that text with
        spaces replaced by dashes, and ``href`` is ``"#"`` followed by the
        id (usable as an anchor by HTML converters).
    """
    renderer = HeadingExtractor()
    extract_titles = mistune.create_markdown(renderer=renderer)
    extract_titles(markdown_input)

    titles_array = []
    # Only the second item of each captured tuple is used: it carries the
    # "children" and "attrs" of the heading.
    for __, title in renderer.headings:
        # Use the text of every inline child, not just the first one, so
        # empty headings and headings starting with inline markup do not
        # raise and are not truncated.
        raw_text = _heading_raw_text(title)
        header_level = title["attrs"]["level"]
        header_id = raw_text.replace(" ", "-")
        titles_array.append([header_level, raw_text, header_id, "#" + header_id])
    return titles_array
Loading

0 comments on commit 6080520

Please sign in to comment.