From 4bfcc46f5abfad13fc5e54df5444b9b4b34c1229 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Wed, 29 Jan 2025 20:12:56 -0500
Subject: [PATCH] fix(annotate_citations): try to include HTML style tags if
not balanced
Some annotations, especially for ReferenceCitations, are discarded in HTML sources because some style tags (mostly i or em) are not balanced. This PR tries to include the style tags in the citation span.
- Adds tests for `utils.maybe_balance_style_tags`
- Adds a logger.error call when the unbalanced HTML could not be fixed
Solves #196
---
eyecite/annotate.py | 31 +++++++++---
eyecite/utils.py | 54 ++++++++++++++++++++
tests/test_AnnotateTest.py | 100 ++++++++++++++++++++++++++++++++++++-
3 files changed, 176 insertions(+), 9 deletions(-)
diff --git a/eyecite/annotate.py b/eyecite/annotate.py
index 7f5ba46..d3558b6 100644
--- a/eyecite/annotate.py
+++ b/eyecite/annotate.py
@@ -1,11 +1,18 @@
from bisect import bisect_left, bisect_right
from difflib import SequenceMatcher
from functools import partial
+from logging import getLogger
from typing import Any, Callable, Iterable, Optional, Tuple
import fast_diff_match_patch
-from eyecite.utils import is_balanced_html, wrap_html_tags
+from eyecite.utils import (
+ is_balanced_html,
+ maybe_balance_style_tags,
+ wrap_html_tags,
+)
+
+logger = getLogger("eyecite")
def annotate_citations(
@@ -59,6 +66,9 @@ def annotate_citations(
Returns:
The annotated text.
"""
+ if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
+ raise ValueError(f"Unknown option '{unbalanced_tags}")
+
# set up offset_updater if we have to move annotations to source_text
offset_updater = None
if source_text and source_text != plain_text:
@@ -88,13 +98,20 @@ def annotate_citations(
# handle HTML tags
if unbalanced_tags == "unchecked":
pass
- elif unbalanced_tags in ("skip", "wrap"):
- if not is_balanced_html(span_text):
- if unbalanced_tags == "skip":
- continue
+ elif not is_balanced_html(span_text):
+ if unbalanced_tags == "wrap":
span_text = wrap_html_tags(span_text, after, before)
- else:
- raise ValueError(f"Unknown option '{unbalanced_tags}")
+ else: # "skip" case
+ original_span_text = span_text
+ start, end, span_text = maybe_balance_style_tags(
+ start, end, plain_text
+ )
+ if not is_balanced_html(span_text):
+ logger.error(
+ "Citation was not annotated due to unbalanced tags %s",
+ original_span_text,
+ )
+ continue
if annotator is not None:
annotated_span = annotator(before, span_text, after)
diff --git a/eyecite/utils.py b/eyecite/utils.py
index b2bb66a..c606a32 100644
--- a/eyecite/utils.py
+++ b/eyecite/utils.py
@@ -130,3 +130,57 @@ def hash_sha256(dictionary: dict) -> int:
# Calculate the hash of the bytes, convert to an int, and return
return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
+
+
+def maybe_balance_style_tags(
+    start: int, end: int, plain_text: str
+) -> tuple[int, int, str]:
+    """Try to include style tags at the edge of the span marked as invalid
+
+    In some HTML sources the citations are styled with tags like <i> or <em>.
+    When the citation is found in a stripped-of-tags text, the span may
+    leave out the opening or closing tag. When this happens and we try to
+    annotate the HTML, it will render invalid HTML. This happens mostly with
+    IdCitation, ReferenceCitation, etc.
+
+    This function will try to find opening or closing tags immediately
+    preceding or following the citation span. If it finds them, it will
+    return the new start, end and span. If not, it will return the old ones
+
+    :param start: the original start of the span
+    :param end: the original end of the span
+    :param plain_text: the text to annotate
+    :return: a tuple (new start, new end, new span text)
+    """
+    span_text = plain_text[start:end]
+    style_tags = ["i", "em", "b"]
+    tolerance = 5  # tolerate at most this amount of whitespace
+
+    for tag in style_tags:
+        opening_tag = f"<{tag}>"
+        closing_tag = f"</{tag}>"
+        has_opening = opening_tag in span_text
+        has_closing = closing_tag in span_text
+        if has_opening and not has_closing:
+            # look for closing tag after the end, at most `tolerance`
+            # whitespace characters away (hence min, not max)
+            extended_end = min(
+                end + len(closing_tag) + tolerance, len(plain_text)
+            )
+            if end_match := re.match(
+                rf"\s*{re.escape(closing_tag)}",
+                plain_text[end:extended_end],
+            ):
+                end += end_match.end()
+
+        if not has_opening and has_closing:
+            # look for opening tag before the start; clamp at 0 so the
+            # slice index never goes negative and wraps around
+            extended_start = max(start - len(opening_tag) - tolerance, 0)
+            if start_match := re.search(
+                rf"{re.escape(opening_tag)}\s*\Z",
+                plain_text[extended_start:start],
+            ):
+                start = extended_start + start_match.start()
+
+    return start, end, plain_text[start:end]
diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
index e61c7af..7572b82 100644
--- a/tests/test_AnnotateTest.py
+++ b/tests/test_AnnotateTest.py
@@ -12,6 +12,7 @@ def straighten_quotes(text):
def lower_annotator(before, text, after):
return before + text.lower() + after
+ self.maxDiff = None
test_pairs = (
# single cite
("1 U.S. 1", "<0>1 U.S. 10>", []),
@@ -59,10 +60,10 @@ def lower_annotator(before, text, after):
"foo <0>1 U.S. 10> bar",
["html", "inline_whitespace"],
),
- # whitespace and html -- skip unbalanced tags
+ # whitespace and html -- unbalanced tags are repaired
(
"foo 1 U.S. 1; 2 U.S. 2",
- "foo 1 U.S. 1; <1>2 U.S. 21>",
+ "foo <0>1 U.S. 10>; <1>2 U.S. 21>",
["html", "inline_whitespace"],
{"unbalanced_tags": "skip"},
),
@@ -101,6 +102,94 @@ def lower_annotator(before, text, after):
[],
{"annotator": lower_annotator},
),
+ # solvable unbalanced tag. Need the FullCaseCitation first
+ # so the ReferenceCitation can be found
+ # from https://www.courtlistener.com/api/rest/v4/opinions/8496639/
+ # source: Opinion.xml_harvard
+ (
+ " partially secured by a debtor’s principal residence was not "
+ "con-firmable. Nobelman v. Am. Sav. Bank, "
+ "508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That "
+ "plan proposed to bifurcate the claim and... pay the unsecured"
+ "... only by a lien on the debtor’s principal residence.” "
+ "Nobelman at 332, 113 S.Ct. 2106. Section 1123(b)(5) "
+ "codifies the Nobelman decision in individual debtor "
+ "chapter 11 cases.",
+ " partially secured by a debtor’s principal residence was not"
+ " con-firmable. Nobelman v. Am. Sav. Bank, "
+ "508 U.S. 324, "
+ "113 S.Ct. 2106, 124 L.Ed.2d 228"
+ " (1993). That plan proposed to bifurcate the claim and..."
+ " pay the unsecured... only by a lien on the debtor’s"
+ " principal residence.” Nobelman "
+ "at 332, 113 S.Ct. 2106. Section"
+ " 1123(b)(5) codifies the Nobelman decision in"
+ " individual debtor chapter 11 cases.",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ # solvable unbalanced tag
+ # from https://www.courtlistener.com/api/rest/v4/opinions/2841253/
+ # source: Opinion.html
+ (
+ "he has not agreed so to submit.’” Howsam v. Dean"
+ " Witter Reynolds, Inc., 537 U.S. 79, 83, 123 S. Ct."
+ " 588, 591 (2002) (combined mandamus and"
+ " interlocutory appeal) (citing Howsam at 84, 123"
+ " S. Ct. at 592)",
+ "he has not agreed so to submit.’” Howsam v. Dean"
+ " Witter Reynolds, Inc., 537 U.S."
+ " 79, 83, 123 S. Ct. 588, 591"
+ " (2002) (combined mandamus and interlocutory appeal)"
+ " (citing Howsam at 84, 123 S. Ct. at 592)",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ # The next 2 examples could be resolved if we increased the
+ # character tolerance or admitted the full case name instead of
+ # just one of the parties
+ (
+ # https://www.courtlistener.com/api/rest/v4/opinions/1535649/
+ # source: xml_harvard
+ "See also Styler v. Tall Oaks, Inc. (In re Hatch),"
+ " 93 B.R. 263, 267 (Bankr.D. Utah 1988),"
+ " rev'd 114 B.R. 747 (D.Utah 1989)."
+ "
... The court makes no"
+ " determination as to whe Fifth Amendment to the"
+ " constitution of the United States.” Styler v."
+ " Tall Oaks, Inc. (In re Hatch), at 748."
+ "",
+ "See also Styler v. Tall Oaks, Inc. (In re Hatch),"
+ " 93 B.R. 263, 267"
+ " (Bankr.D. Utah 1988), rev'd 114 B.R. 747 (D.Utah 1989)."
+ "... The court makes no"
+ " determination as to whe Fifth Amendment to the"
+ " constitution of the United States.” Styler v."
+ " Tall Oaks, Inc. (In re Hatch), at 748."
+ "",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
+ (
+ # https://www.courtlistener.com/api/rest/v4/opinions/1985850/
+ # source: html_lawbox
+ "to act rationally. See, e.g., State v."
+ " Wingler, 25 N.J. 161, 175, 135 A.2d"
+ " 468 (1957); citing, ... have been applied.'"
+ " [State v. Wingler at 175, 135 A.2d"
+ " 468, citing, Minnesota ex rel.",
+ "to act rationally. See, e.g., State v."
+ " Wingler, 25 N.J."
+ " 161, 175, 135 A.2d"
+ " 468 (1957); citing, ... have been applied.'"
+ " [State v. Wingler at 175, 135 A.2d 468, citing,"
+ " Minnesota ex rel.",
+ ["html", "all_whitespace"],
+ {"annotate_anchors": True, "unbalanced_tags": "skip"},
+ ),
)
for source_text, expected, clean_steps, *annotate_kwargs in test_pairs:
annotate_kwargs = annotate_kwargs[0] if annotate_kwargs else {}
@@ -115,6 +204,13 @@ def lower_annotator(before, text, after):
(c.span(), f"<{i}>", f"{i}>")
for i, c in enumerate(cites)
]
+
+ if annotate_kwargs.pop("annotate_anchors", False):
+ annotations = [
+ (c.span(), "", "")
+ for c in cites
+ ]
+
annotated = annotate_citations(
plain_text,
annotations,