Skip to content

Commit

Permalink
docs: Add more HTML to Markdown Lua filters (#5054)
Browse files Browse the repository at this point in the history
This uses sh instead of shell and removes div which improves linting.

Some of the code makes no difference to the output, but it does no harm either, and this script and its usage have a limited life span.
  • Loading branch information
cwhite911 authored Feb 7, 2025
1 parent 4ff9373 commit bd2ebf3
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 13 deletions.
29 changes: 19 additions & 10 deletions utils/grass_html2md.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ set -eu
# wget
#
# Author(s):
# Martin Landa, Markus Neteler
# Martin Landa, Markus Neteler, Corey White
#
# Usage:
# If you have "pandoc" in PATH, execute for HTML file conversion in
# current directory and subdirectories:
# ./utils/grass_html2md.sh
#
# COPYRIGHT: (C) 2024 by the GRASS Development Team
# COPYRIGHT: (C) 2024-2025 by the GRASS Development Team
#
# This program is free software under the GNU General Public
# License (>=v2). Read the file COPYING that comes with GRASS
Expand All @@ -43,6 +43,22 @@ trap "exitprocedure" 2 3 15
# path to LUA file (./utils/pandoc_codeblock.lua)
UTILSPATH="utils"

process_file() {
    # Convert one pre-processed HTML file to Markdown.
    #   $1  temporary HTML file to read (removed once the conversion has run)
    #   $2  original HTML file; the output is written next to it with the
    #       ".html" suffix replaced by ".md"
    local file="$1"
    local f="$2"
    local md_out="${f%%.html}.md"

    # 1. rewrite <div class="code"><pre> wrappers into <pre><code> so that
    #    pandoc recognises them as code blocks
    # 2. run pandoc (HTML -> GFM) with the Lua filter
    # 3. unescape " \$" and turn "%20" into "-" in the generated Markdown
    sed -e 's#<div class="code"><pre>#<pre><code>#g' \
        -e 's#</pre></div>#</code></pre>#g' "$file" | \
        pandoc --from=html-native_divs \
            --to=gfm+pipe_tables+gfm_auto_identifiers --wrap=auto \
            --lua-filter "${UTILSPATH}/pandoc_codeblock.lua" | \
        sed -e 's+ \\\$+ \$+g' -e 's+%20+-+g' > "$md_out"

    rm -f "$file"
}

# run recursively: HTML to MD
for f in $(find . -name *.html); do
echo "${f}"
Expand All @@ -57,13 +73,6 @@ for f in $(find . -name *.html); do
s|_KEEPHTML||g;
' "${f%%.html}.html" > "${f%%.html}_tmp.html"

cat "${f%%.html}_tmp.html" | \
sed 's#<div class="code"><pre>#<pre><code>#g' | \
sed 's#</pre></div>#</code></pre>#g' | \
pandoc --from=html --to=markdown -t gfm \
--lua-filter "${UTILSPATH}/pandoc_codeblock.lua" | \
sed 's+ \\\$+ \$+g' | sed 's+%20+-+g' > "${f%%.html}.md"

rm -f "${f%%.html}_tmp.html"
process_file "${f%%.html}_tmp.html" ${f%%.html}.html

done
70 changes: 67 additions & 3 deletions utils/pandoc_codeblock.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,71 @@
-- Test cases
-- raster/r.sun/r.sun.html

-- Function to convert code blocks to markdown
function CodeBlock (cb)
return pandoc.RawBlock('markdown', '```shell\n' .. cb.text .. '\n```\n')
-- Enforces markdownlint rules during Pandoc conversion
-- Line-length limit associated with markdownlint rule MD013.
-- NOTE(review): not referenced anywhere in the visible part of this file --
-- confirm whether it is still needed.
local MAX_LINE_LENGTH = 120 -- Adjust as needed for MD013

-- NOTE(review): LIST_INDENT is an empty string and is not referenced in the
-- visible part of this file -- confirm whether it is still needed.
local LIST_INDENT = ""

--- Rebuild an image as a plain Markdown image: ![alt text](src).
-- Only the alt text and the source are carried over, so the result can be
-- written with Markdown image syntax.
-- pandoc stores the alt text of an image in `el.caption` (a list of
-- inlines); an Image element has no `alt` field. The existing caption is
-- therefore kept, and the "image-alt" placeholder is used only when the
-- image has no alt text at all.
-- @param el pandoc Image element
-- @return a new pandoc Image element
function Image(el)
    local caption = el.caption
    if caption == nil or #caption == 0 then
        -- No alt text available: fall back to a placeholder.
        caption = {pandoc.Str("image-alt")}
    end
    return pandoc.Image(caption, el.src)
end

-- Ordered rules for raw HTML inlines: the first pattern found in the
-- element's text decides the Markdown replacement.
local RAW_HTML_RULES = {
    {"<em>", "*"},
    {"</em>", "*"},
    {"<i>", "*"},
    {"</i>", "*"},
    {"&nbsp;", " "},
    {"&lt;", "<"},
    {"&gt;", ">"},
}

--- Replace leftover raw HTML inlines with their Markdown counterparts.
-- Emphasis tags become "*", and the entities &nbsp;, &lt; and &gt; become
-- a space, "<" and ">". Anything else, including raw inlines of other
-- formats, is returned unchanged.
-- @param el pandoc RawInline element
-- @return a raw Markdown inline, or `el` itself when no rule applies
function RawInline(el)
    if el.format ~= "html" then
        return el
    end
    for _, rule in ipairs(RAW_HTML_RULES) do
        if el.text:match(rule[1]) then
            return pandoc.RawInline("markdown", rule[2])
        end
    end
    return el
end

--- Emit a code block as a backtick-fenced raw Markdown block.
-- The first class of the block is used as the fence language; blocks
-- without any class are labelled "sh".
-- @param el pandoc CodeBlock element
-- @return a raw Markdown block containing the fenced code
function CodeBlock(el)
    local language = el.classes[1]
    if not language then
        language = "sh"
    end
    local fenced = table.concat({"```", language, "\n", el.text, "\n```"})
    return pandoc.RawBlock("markdown", fenced)
end

--- Rebuild a header from its level and content only.
-- The new header is constructed without the attributes of the original
-- element.
-- @param el pandoc Header element
-- @return a new pandoc Header element
function Header(el)
    local level, content = el.level, el.content
    return pandoc.Header(level, content)
end

--- Strip trailing whitespace from a string element.
-- @param el pandoc Str element
-- @return a new pandoc Str element holding the trimmed text
function Str(el)
    -- The parentheses keep only the first result of gsub (the new string)
    -- and drop the substitution count.
    return pandoc.Str((string.gsub(el.text, "%s+$", "")))
end

--- Collapse runs of consecutive empty paragraphs into a single one.
-- Every other block is kept as is and in its original order. The document
-- metadata is handed over to the new document unchanged.
-- @param doc pandoc Pandoc document
-- @return a new pandoc Pandoc document with the filtered blocks
function Pandoc(doc)
    local kept = {}
    local previous_blank = false

    for _, block in ipairs(doc.blocks) do
        local is_blank = block.t == "Para" and #block.content == 0
        -- Only the first empty paragraph of a run is kept.
        if not (is_blank and previous_blank) then
            kept[#kept + 1] = block
        end
        previous_blank = is_blank
    end

    -- Pass the metadata along; constructing the document from the blocks
    -- alone would discard it.
    return pandoc.Pandoc(kept, doc.meta)
end

0 comments on commit bd2ebf3

Please sign in to comment.