Skip to content

Commit

Permalink
refactor(python): Minor non-breaking space (`&nbsp;`) tweak for HTML …
Browse files Browse the repository at this point in the history
…rendering (#19864)
  • Loading branch information
alexander-beedie authored Nov 22, 2024
1 parent 5b3a8f9 commit 4af1c43
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 29 deletions.
12 changes: 9 additions & 3 deletions py-polars/polars/dataframe/_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import os
import re
from textwrap import dedent
from typing import TYPE_CHECKING

Expand All @@ -15,6 +16,11 @@
from polars import DataFrame


def replace_consecutive_spaces(s: str) -> str:
    """
    Replace runs of two or more spaces with HTML non-breaking space entities.

    Each space in such a run becomes one ``&nbsp;`` entity, so the width of
    the run is preserved. Isolated single spaces are left untouched.

    Parameters
    ----------
    s
        Text to process; the caller in this file passes HTML-escaped text.

    Returns
    -------
    str
        The text with every run of 2+ spaces rewritten as ``&nbsp;`` entities.
    """
    # The literal entity text (not a raw space / U+00A0 character) is emitted
    # so that the output contains "&nbsp;" once per original space.
    return re.sub(r" {2,}", lambda match: "&nbsp;" * len(match.group(0)), s)


class Tag:
"""Class for representing an HTML tag."""

Expand Down Expand Up @@ -119,9 +125,9 @@ def write_body(self) -> None:
else:
series = self.df[:, c]
self.elements.append(
html.escape(
series._s.get_fmt(r, str_len_limit)
).replace(" ", " ")
replace_consecutive_spaces(
html.escape(series._s.get_fmt(r, str_len_limit))
)
)

def write(self, inner: str) -> None:
Expand Down
42 changes: 16 additions & 26 deletions py-polars/tests/unit/dataframe/test_repr_html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import re
import pytest

import polars as pl

Expand Down Expand Up @@ -81,28 +81,18 @@ def test_series_repr_html_max_rows_default() -> None:
assert html.count("<td>") - 2 == expected_rows


def test_html_representation_multiple_spaces() -> None:
    """Check space handling in the cell text captured from the HTML repr."""
    frame = pl.DataFrame(
        {"string_col": ["multiple spaces", " trailing and leading "]}
    )
    rendered = frame._repr_html_()

    # The pattern runs from <tbody> to </tbody> and captures the text of the
    # first <td>...</td> it meets on the way (all quantifiers are lazy).
    # re.S lets "." cross line breaks so the match can span multiple lines.
    tbody_cell = re.compile(r"<tbody>.*?<td>(.*?)</td>.*?</tbody>", re.S)

    for cell_content in tbody_cell.findall(rendered):
        # No plain space may remain in the captured cell text ...
        assert " " not in cell_content, f"Unexpected space in cell: {cell_content}"
        # ... and the &nbsp; entity must be present in its place.
        assert (
            "&nbsp;" in cell_content
        ), f"Expected &nbsp; in cell but found: {cell_content}"
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # A lone space must be left as-is.
        ("single space", "single space"),
        # A run of three spaces becomes three entities.
        ("multiple   spaces", "multiple&nbsp;&nbsp;&nbsp;spaces"),
        # Two-space runs at both ends are converted; "&" is HTML-escaped and
        # the single interior spaces are preserved.
        (
            "  trailing & leading spaces  ",
            "&nbsp;&nbsp;trailing &amp; leading spaces&nbsp;&nbsp;",
        ),
    ],
)
def test_html_representation_multiple_spaces(text: str, expected: str) -> None:
    """
    Verify that only runs of 2+ spaces are rendered as ``&nbsp;`` in HTML cells.

    The number of spaces in each input run matches the number of ``&nbsp;``
    entities in the corresponding expected string.
    """
    # Raise the string display limit so the test values are not truncated
    # before they reach the HTML output.
    with pl.Config(fmt_str_lengths=100):
        html_repr = pl.DataFrame({"s": [text]})._repr_html_()

    # The expected cell wraps the value in escaped double quotes.
    assert f"<td>&quot;{expected}&quot;</td>" in html_repr

0 comments on commit 4af1c43

Please sign in to comment.