Skip to content

Commit

Permalink
text-splitters[patch]: Fix HTMLSectionSplitter (#22812)
Browse files Browse the repository at this point in the history
Update former pull request:
#22654.

Modified `langchain_text_splitters.HTMLSectionSplitter`. In the latest
version, the function `split_html_by_headers` uses a `dict` to store the
sections of an HTML document, with the header/section element names
serving as the dict keys. This is a problem when duplicate header/section
element names are present in a single HTML document: later ones replace
earlier ones with the same name, so some content can be lost after HTML
text splitting is conducted.

Storing the sections in a list instead should solve the problem. A unit
test covering duplicate header names has been added.

---------

Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
2 people authored and hinthornw committed Jun 20, 2024
1 parent ff7f03d commit 9072d58
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 13 deletions.
27 changes: 14 additions & 13 deletions libs/text-splitters/langchain_text_splitters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,7 @@ def create_documents(
documents.append(new_doc)
return documents

def split_html_by_headers(
self, html_doc: str
) -> Dict[str, Dict[str, Optional[str]]]:
def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
except ImportError as e:
Expand All @@ -247,7 +245,7 @@ def split_html_by_headers(

soup = BeautifulSoup(html_doc, "html.parser")
headers = list(self.headers_to_split_on.keys())
sections: Dict[str, Dict[str, Optional[str]]] = {}
sections: list[dict[str, str | None]] = []

headers = soup.find_all(["body"] + headers)

Expand All @@ -269,10 +267,13 @@ def split_html_by_headers(
content = " ".join(section_content).strip()

if content != "":
sections[current_header] = {
"content": content,
"tag_name": current_header_tag,
}
sections.append(
{
"header": current_header,
"content": content,
"tag_name": current_header_tag,
}
)

return sections

Expand Down Expand Up @@ -307,12 +308,12 @@ def split_text_from_file(self, file: Any) -> List[Document]:

return [
Document(
cast(str, sections[section_key]["content"]),
cast(str, section["content"]),
metadata={
self.headers_to_split_on[
str(sections[section_key]["tag_name"])
]: section_key
self.headers_to_split_on[str(section["tag_name"])]: section[
"header"
]
},
)
for section_key in sections.keys()
for section in sections
]
53 changes: 53 additions & 0 deletions libs/text-splitters/tests/unit_tests/test_text_splitters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1650,6 +1650,59 @@ def test_section_splitter_accepts_an_absolute_path() -> None:
sec_splitter.split_text(html_string)


@pytest.mark.requires("lxml")
@pytest.mark.requires("bs4")
def test_happy_path_splitting_with_duplicate_header_tag() -> None:
    """Sections that share a header title must each produce their own document."""
    # arrange: the title "Foo" is used by an <h1>, an <h2> and a second <h1>,
    # so a splitter that keys sections by header text would lose content.
    html_string = """<!DOCTYPE html>
<html>
<body>
<div>
<h1>Foo</h1>
<p>Some intro text about Foo.</p>
<div>
<h2>Bar main section</h2>
<p>Some intro text about Bar.</p>
<h3>Bar subsection 1</h3>
<p>Some text about the first subtopic of Bar.</p>
<h3>Bar subsection 2</h3>
<p>Some text about the second subtopic of Bar.</p>
</div>
<div>
<h2>Foo</h2>
<p>Some text about Baz</p>
</div>
<h1>Foo</h1>
<br>
<p>Some concluding text about Foo</p>
</div>
</body>
</html>"""

    # act
    splitter = HTMLSectionSplitter(
        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
    )
    docs = splitter.split_text(html_string)

    # assert: one (metadata key, header text, page content) triple per
    # expected document, in document order.
    expected_sections = [
        ("Header 1", "Foo", "Foo \n Some intro text about Foo."),
        (
            "Header 2",
            "Bar main section",
            "Bar main section \n Some intro text about Bar. \n "
            "Bar subsection 1 \n Some text about the first subtopic of Bar. \n "
            "Bar subsection 2 \n Some text about the second subtopic of Bar.",
        ),
        ("Header 2", "Foo", "Foo \n Some text about Baz"),
        ("Header 1", "Foo", "Foo \n \n Some concluding text about Foo"),
    ]

    assert len(docs) == 4
    for doc, (metadata_key, header_text, content) in zip(docs, expected_sections):
        assert doc.page_content == content
        assert doc.metadata[metadata_key] == header_text


def test_split_json() -> None:
"""Test json text splitter"""
max_chunk = 800
Expand Down

0 comments on commit 9072d58

Please sign in to comment.