Switch html_utils.clean_html's cleaner

We switch from `lxml.html.clean.Cleaner` to `nh3.clean`. Doing so will allow us to remove the former (in a follow-up commit). Whilst `nh3.clean` says that it conforms to the HTML5 specification, it adds unnecessary `tbody` elements to `table` elements, which doesn't conform.[^1] However, we accept that it adds unnecessary `tbody` elements, modify the tests, and move on. Closes #4254 [^1]: https://html.spec.whatwg.org/multipage/tables.html#the-tbody-element
opensafely-core · Jun 24, 2024 · aaf227b · aaf227b
1 parent 1f84123
commit aaf227b
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 17 deletions.
diff --git a/jobserver/html_utils.py b/jobserver/html_utils.py
@@ -1,8 +1,10 @@
-from lxml.html.clean import Cleaner
+import nh3
 
 
 def clean_html(html):
-    cleaned = Cleaner(page_structure=False, style=True, kill_tags=["head"]).clean_html(
-        html
-    )
+    """
+    Cleans the given HTML document/fragment with a whitelist-based cleaner, returning an
+    HTML fragment that conforms to the HTML5 specification.
+    """
+    cleaned = nh3.clean(html)
     return cleaned
diff --git a/jobserver/reports.py b/jobserver/reports.py
@@ -5,14 +5,12 @@
 
 
 def process_html(html):
-    # We want to handle complete HTML documents and also fragments. We're going to extract the contents of the body
-    # at the end of this function, but it's easiest to normalize to complete documents because that's what the
-    # HTML-wrangling libraries we're using are most comfortable handling.
-    if "<html>" not in html:
-        html = f"<html><body>{html}</body></head>"
-
     cleaned = html_utils.clean_html(html)
 
+    # It's easier for BeautifulSoup to work with an HTML document, rather than with an
+    # HTML fragment.
+    cleaned = f"<html><body>{cleaned}</body></head>"
+
     soup = BeautifulSoup(cleaned, "html.parser")
 
     # For small screens we want to allow side-scrolling for just a small number of elements. To enable this each one

diff --git a/tests/unit/jobserver/test_reports.py b/tests/unit/jobserver/test_reports.py
@@ -98,13 +98,13 @@ def test_html_processing_extracts_body(html):
         (
             """
             <table>
-                <tr><td>something</td></tr>
+                <tbody><tr><td>something</td></tr></tbody>
             </table>
             """,
             """
             <div class="overflow-wrapper">
                 <table>
-                    <tr><td>something</td></tr>
+                    <tbody><tr><td>something</td></tr></tbody>
                 </table>
             </div>
             """,
@@ -114,23 +114,23 @@ def test_html_processing_extracts_body(html):
             <html>
                 <body>
                     <table>
-                        <tr><td>something</td></tr>
+                        <tbody><tr><td>something</td></tr></tbody>
                     </table>
                 </body>
             </html>
             """,
             """
             <div class="overflow-wrapper">
                 <table>
-                    <tr><td>something</td></tr>
+                    <tbody><tr><td>something</td></tr></tbody>
                 </table>
             </div>
             """,
         ),
         (
             """
             <table>
-                <tr><td>something</td></tr>
+                <tbody><tr><td>something</td></tr></tbody>
             </table>
             <table>
                 <tr><td>something else</td></tr>
@@ -139,12 +139,12 @@ def test_html_processing_extracts_body(html):
             """
             <div class="overflow-wrapper">
                 <table>
-                    <tr><td>something</td></tr>
+                    <tbody><tr><td>something</td></tr></tbody>
                 </table>
             </div>
             <div class="overflow-wrapper">
                 <table>
-                    <tr><td>something else</td></tr>
+                    <tbody><tr><td>something else</td></tr></tbody>
                 </table>
             </div>
             """,