Skip to content

Commit

Permalink
Switch html_utils.clean_html's cleaner
Browse files Browse the repository at this point in the history
We switch from `lxml.html.clean.Cleaner` to `nh3.clean`. Doing so will
allow us to remove the former (in a follow-up commit).

Whilst `nh3.clean` says that it conforms to the HTML5 specification, it
adds unnecessary `tbody` elements to `table` elements, which doesn't
conform.[^1] However, we accept that it adds unnecessary `tbody`
elements, modify the tests, and move on.

Closes #4254

[^1]: https://html.spec.whatwg.org/multipage/tables.html#the-tbody-element
  • Loading branch information
iaindillingham committed Jun 24, 2024
1 parent 1f84123 commit aaf227b
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 17 deletions.
10 changes: 6 additions & 4 deletions jobserver/html_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from lxml.html.clean import Cleaner
import nh3


def clean_html(html):
cleaned = Cleaner(page_structure=False, style=True, kill_tags=["head"]).clean_html(
html
)
"""
Cleans the given HTML document/fragment with a whitelist-based cleaner, returning an
HTML fragment that conforms to the HTML5 specification.
"""
cleaned = nh3.clean(html)
return cleaned
10 changes: 4 additions & 6 deletions jobserver/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@


def process_html(html):
# We want to handle complete HTML documents and also fragments. We're going to extract the contents of the body
# at the end of this function, but it's easiest to normalize to complete documents because that's what the
# HTML-wrangling libraries we're using are most comfortable handling.
if "<html>" not in html:
html = f"<html><body>{html}</body></head>"

cleaned = html_utils.clean_html(html)

# It's easier for BeautifulSoup to work with an HTML document, rather than with an
# HTML fragment.
cleaned = f"<html><body>{cleaned}</body></head>"

soup = BeautifulSoup(cleaned, "html.parser")

# For small screens we want to allow side-scrolling for just a small number of elements. To enable this each one
Expand Down
14 changes: 7 additions & 7 deletions tests/unit/jobserver/test_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,13 @@ def test_html_processing_extracts_body(html):
(
"""
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
""",
"""
<div class="overflow-wrapper">
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
</div>
""",
Expand All @@ -114,23 +114,23 @@ def test_html_processing_extracts_body(html):
<html>
<body>
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
</body>
</html>
""",
"""
<div class="overflow-wrapper">
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
</div>
""",
),
(
"""
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
<table>
<tr><td>something else</td></tr>
Expand All @@ -139,12 +139,12 @@ def test_html_processing_extracts_body(html):
"""
<div class="overflow-wrapper">
<table>
<tr><td>something</td></tr>
<tbody><tr><td>something</td></tr></tbody>
</table>
</div>
<div class="overflow-wrapper">
<table>
<tr><td>something else</td></tr>
<tbody><tr><td>something else</td></tr></tbody>
</table>
</div>
""",
Expand Down

0 comments on commit aaf227b

Please sign in to comment.