diff --git a/jobserver/html_utils.py b/jobserver/html_utils.py
index 2e808ba9e..3ea1f4965 100644
--- a/jobserver/html_utils.py
+++ b/jobserver/html_utils.py
@@ -1,8 +1,10 @@
-from lxml.html.clean import Cleaner
+import nh3
def clean_html(html):
- cleaned = Cleaner(page_structure=False, style=True, kill_tags=["head"]).clean_html(
- html
- )
+ """
+ Cleans the given HTML document/fragment with a whitelist-based cleaner, returning an
+ HTML fragment that conforms to the HTML5 specification.
+ """
+ cleaned = nh3.clean(html)
return cleaned
diff --git a/jobserver/reports.py b/jobserver/reports.py
index 003c61456..e608cb1a1 100644
--- a/jobserver/reports.py
+++ b/jobserver/reports.py
@@ -5,14 +5,12 @@
def process_html(html):
- # We want to handle complete HTML documents and also fragments. We're going to extract the contents of the body
- # at the end of this function, but it's easiest to normalize to complete documents because that's what the
- # HTML-wrangling libraries we're using are most comfortable handling.
-    if "<html>" not in html:
-        html = f"<html><body>{html}</body></html>"
-
cleaned = html_utils.clean_html(html)
+ # It's easier for BeautifulSoup to work with an HTML document, rather than with an
+ # HTML fragment.
+    cleaned = f"<html><body>{cleaned}</body></html>"
+
soup = BeautifulSoup(cleaned, "html.parser")
# For small screens we want to allow side-scrolling for just a small number of elements. To enable this each one
diff --git a/tests/unit/jobserver/test_reports.py b/tests/unit/jobserver/test_reports.py
index d0a8b6963..2357eda19 100644
--- a/tests/unit/jobserver/test_reports.py
+++ b/tests/unit/jobserver/test_reports.py
@@ -98,13 +98,13 @@ def test_html_processing_extracts_body(html):
(
"""
""",
"""
""",
@@ -114,7 +114,7 @@ def test_html_processing_extracts_body(html):
@@ -122,7 +122,7 @@ def test_html_processing_extracts_body(html):
"""
""",
@@ -130,7 +130,7 @@ def test_html_processing_extracts_body(html):
(
"""
something else |
@@ -139,12 +139,12 @@ def test_html_processing_extracts_body(html):
"""
- something else |
+ something else |
""",