diff --git a/jobserver/html_utils.py b/jobserver/html_utils.py index 2e808ba9e..3ea1f4965 100644 --- a/jobserver/html_utils.py +++ b/jobserver/html_utils.py @@ -1,8 +1,10 @@ -from lxml.html.clean import Cleaner +import nh3 def clean_html(html): - cleaned = Cleaner(page_structure=False, style=True, kill_tags=["head"]).clean_html( - html - ) + """ + Cleans the given HTML document/fragment with a whitelist-based cleaner, returning an + HTML fragment that conforms to the HTML5 specification. + """ + cleaned = nh3.clean(html) return cleaned diff --git a/jobserver/reports.py b/jobserver/reports.py index 003c61456..e608cb1a1 100644 --- a/jobserver/reports.py +++ b/jobserver/reports.py @@ -5,14 +5,12 @@ def process_html(html): - # We want to handle complete HTML documents and also fragments. We're going to extract the contents of the body - # at the end of this function, but it's easiest to normalize to complete documents because that's what the - # HTML-wrangling libraries we're using are most comfortable handling. - if "" not in html: - html = f"{html}" - cleaned = html_utils.clean_html(html) + # It's easier for BeautifulSoup to work with an HTML document, rather than with an + # HTML fragment. + cleaned = f"{cleaned}" + soup = BeautifulSoup(cleaned, "html.parser") # For small screens we want to allow side-scrolling for just a small number of elements. To enable this each one diff --git a/tests/unit/jobserver/test_reports.py b/tests/unit/jobserver/test_reports.py index d0a8b6963..2357eda19 100644 --- a/tests/unit/jobserver/test_reports.py +++ b/tests/unit/jobserver/test_reports.py @@ -98,13 +98,13 @@ def test_html_processing_extracts_body(html): ( """ - +
something
something
""", """
- +
something
something
""", @@ -114,7 +114,7 @@ def test_html_processing_extracts_body(html): - +
something
something
@@ -122,7 +122,7 @@ def test_html_processing_extracts_body(html): """
- +
something
something
""", @@ -130,7 +130,7 @@ def test_html_processing_extracts_body(html): ( """ - +
something
something
@@ -139,12 +139,12 @@ def test_html_processing_extracts_body(html): """
something else
- +
something
something
- +
something else
something else
""",