text-splitters[patch]: Fix HTMLSectionSplitter (#22812)

Updates the former pull request #22654. Modifies `langchain_text_splitters.HTMLSectionSplitter`: in the latest version, the function `split_html_by_headers` uses a `dict` to store the sections of an HTML document, with the header/section element names serving as the dict keys. This is a problem when duplicate header/section element names are present in a single HTML document: later sections replace earlier ones with the same name, so some content can be lost after HTML text splitting is performed. Storing the sections in a list instead should solve the problem. A unit test covering duplicate header names has been added. --------- Co-authored-by: Bagatur <[email protected]>
langchain-ai · Jun 20, 2024 · 9072d58 · 9072d58
1 parent ff7f03d
commit 9072d58
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 13 deletions.
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
@@ -233,9 +233,7 @@ def create_documents(
                 documents.append(new_doc)
         return documents
 
-    def split_html_by_headers(
-        self, html_doc: str
-    ) -> Dict[str, Dict[str, Optional[str]]]:
+    def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
         try:
             from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
         except ImportError as e:
@@ -247,7 +245,7 @@ def split_html_by_headers(
 
         soup = BeautifulSoup(html_doc, "html.parser")
         headers = list(self.headers_to_split_on.keys())
-        sections: Dict[str, Dict[str, Optional[str]]] = {}
+        sections: list[dict[str, str | None]] = []
 
         headers = soup.find_all(["body"] + headers)
 
@@ -269,10 +267,13 @@ def split_html_by_headers(
             content = " ".join(section_content).strip()
 
             if content != "":
-                sections[current_header] = {
-                    "content": content,
-                    "tag_name": current_header_tag,
-                }
+                sections.append(
+                    {
+                        "header": current_header,
+                        "content": content,
+                        "tag_name": current_header_tag,
+                    }
+                )
 
         return sections
 
@@ -307,12 +308,12 @@ def split_text_from_file(self, file: Any) -> List[Document]:
 
         return [
             Document(
-                cast(str, sections[section_key]["content"]),
+                cast(str, section["content"]),
                 metadata={
-                    self.headers_to_split_on[
-                        str(sections[section_key]["tag_name"])
-                    ]: section_key
+                    self.headers_to_split_on[str(section["tag_name"])]: section[
+                        "header"
+                    ]
                 },
             )
-            for section_key in sections.keys()
+            for section in sections
         ]
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -1650,6 +1650,59 @@ def test_section_splitter_accepts_an_absolute_path() -> None:
     sec_splitter.split_text(html_string)
 
 
+@pytest.mark.requires("lxml")
+@pytest.mark.requires("bs4")
+def test_happy_path_splitting_with_duplicate_header_tag() -> None:
+    # arrange
+    html_string = """<!DOCTYPE html>
+        <html>
+        <body>
+            <div>
+                <h1>Foo</h1>
+                <p>Some intro text about Foo.</p>
+                <div>
+                    <h2>Bar main section</h2>
+                    <p>Some intro text about Bar.</p>
+                    <h3>Bar subsection 1</h3>
+                    <p>Some text about the first subtopic of Bar.</p>
+                    <h3>Bar subsection 2</h3>
+                    <p>Some text about the second subtopic of Bar.</p>
+                </div>
+                <div>
+                    <h2>Foo</h2>
+                    <p>Some text about Baz</p>
+                </div>
+                <h1>Foo</h1>
+                <br>
+                <p>Some concluding text about Foo</p>
+            </div>
+        </body>
+        </html>"""
+
+    sec_splitter = HTMLSectionSplitter(
+        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
+    )
+
+    docs = sec_splitter.split_text(html_string)
+
+    assert len(docs) == 4
+    assert docs[0].page_content == "Foo \n Some intro text about Foo."
+    assert docs[0].metadata["Header 1"] == "Foo"
+
+    assert docs[1].page_content == (
+        "Bar main section \n Some intro text about Bar. \n "
+        "Bar subsection 1 \n Some text about the first subtopic of Bar. \n "
+        "Bar subsection 2 \n Some text about the second subtopic of Bar."
+    )
+    assert docs[1].metadata["Header 2"] == "Bar main section"
+
+    assert docs[2].page_content == "Foo \n Some text about Baz"
+    assert docs[2].metadata["Header 2"] == "Foo"
+
+    assert docs[3].page_content == "Foo \n \n Some concluding text about Foo"
+    assert docs[3].metadata["Header 1"] == "Foo"
+
+
 def test_split_json() -> None:
     """Test json text splitter"""
     max_chunk = 800