From 4bfcc46f5abfad13fc5e54df5444b9b4b34c1229 Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Wed, 29 Jan 2025 20:12:56 -0500 Subject: [PATCH] fix(annotate_citations): try to include HTML style tags if not balanced Some annotations, especially for ReferenceCitations, are discarded in HTML sources because some style tags (mostly i or em) are not balanced. This PR tries to include the style tags in the citation span. - Adds tests for `utils.maybe_balance_style_tags` - Adds a logger.error call when the unbalanced HTML could not be fixed Solves #196 --- eyecite/annotate.py | 31 +++++++++--- eyecite/utils.py | 54 ++++++++++++++++++++ tests/test_AnnotateTest.py | 100 ++++++++++++++++++++++++++++++++++++- 3 files changed, 176 insertions(+), 9 deletions(-) diff --git a/eyecite/annotate.py b/eyecite/annotate.py index 7f5ba46..d3558b6 100644 --- a/eyecite/annotate.py +++ b/eyecite/annotate.py @@ -1,11 +1,18 @@ from bisect import bisect_left, bisect_right from difflib import SequenceMatcher from functools import partial +from logging import getLogger from typing import Any, Callable, Iterable, Optional, Tuple import fast_diff_match_patch -from eyecite.utils import is_balanced_html, wrap_html_tags +from eyecite.utils import ( + is_balanced_html, + maybe_balance_style_tags, + wrap_html_tags, +) + +logger = getLogger("eyecite") def annotate_citations( @@ -59,6 +66,9 @@ def annotate_citations( Returns: The annotated text. 
""" + if unbalanced_tags not in ["unchecked", "skip", "wrap"]: + raise ValueError(f"Unknown option '{unbalanced_tags}") + # set up offset_updater if we have to move annotations to source_text offset_updater = None if source_text and source_text != plain_text: @@ -88,13 +98,20 @@ def annotate_citations( # handle HTML tags if unbalanced_tags == "unchecked": pass - elif unbalanced_tags in ("skip", "wrap"): - if not is_balanced_html(span_text): - if unbalanced_tags == "skip": - continue + elif not is_balanced_html(span_text): + if unbalanced_tags == "wrap": span_text = wrap_html_tags(span_text, after, before) - else: - raise ValueError(f"Unknown option '{unbalanced_tags}") + else: # "skip" case + original_span_text = span_text + start, end, span_text = maybe_balance_style_tags( + start, end, plain_text + ) + if not is_balanced_html(span_text): + logger.error( + "Citation was not annotated due to unbalanced tags %s", + original_span_text, + ) + continue if annotator is not None: annotated_span = annotator(before, span_text, after) diff --git a/eyecite/utils.py b/eyecite/utils.py index b2bb66a..c606a32 100644 --- a/eyecite/utils.py +++ b/eyecite/utils.py @@ -130,3 +130,57 @@ def hash_sha256(dictionary: dict) -> int: # Calculate the hash of the bytes, convert to an int, and return return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big") + + +def maybe_balance_style_tags( + start: int, end: int, plain_text: str +) -> tuple[int, int, str]: + """Try to include style tags at the edge of the span marked as invalid + + In some HTML sources the citations are styled with tags like or + When the citation is found in a stripped-of-tags text, the span may + leave out the opening or closing tag. When this happens and we try to + annotate the HTML, it will render invalid HTML. This happens mostly with + IdCitation, ReferenceCitation, etc. + + This function will try to find opening or closing tags inmediately + preceding or following the citation span. 
If it finds them, it will + return the new start, end and span. If not, it will return the old ones + + :param start: the original start of the span + :param end: the original end of the span + :param plain_text: the text to annotate + :return: a tuple (new start, new end, new span text) + """ + span_text = plain_text[start:end] + style_tags = ["i", "em", "b"] + tolerance = 5 # tolerate at most this amount of whitespace + + for tag in style_tags: + opening_tag = f"<{tag}>" + closing_tag = f"</{tag}>" + has_opening = opening_tag in span_text + has_closing = closing_tag in span_text + if has_opening and not has_closing: + # look for closing tag after the end + extended_end = min( + end + len(closing_tag) + tolerance, len(plain_text) + ) + if end_match := re.search( + rf"{re.escape(span_text)}\s*{closing_tag}", + plain_text[start:extended_end], + flags=re.MULTILINE, + ): + end = start + end_match.end() + + if not has_opening and has_closing: + # look for opening tag before the start + extended_start = max(start - len(opening_tag) - tolerance, 0) + if start_match := re.search( + rf"{opening_tag}\s*{re.escape(span_text)}", + plain_text[extended_start:end], + flags=re.MULTILINE, + ): + start = extended_start + start_match.start() + + return start, end, plain_text[start:end] diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index e61c7af..7572b82 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -12,6 +12,7 @@ def straighten_quotes(text): def lower_annotator(before, text, after): return before + text.lower() + after + self.maxDiff = None test_pairs = ( # single cite ("1 U.S. 1", "<0>1 U.S. 1", []), @@ -59,10 +60,10 @@ def lower_annotator(before, text, after): "foo <0>1 U.S. 1 bar", ["html", "inline_whitespace"], ), - # whitespace and html -- skip unbalanced tags + # whitespace and html -- unbalanced tags are repaired ( "foo 1 U.S. 1; 2 U.S. 2", - "foo 1 U.S. 1; <1>2 U.S. 2", + "foo <0>1 U.S. 1; <1>2 U.S. 
2", ["html", "inline_whitespace"], {"unbalanced_tags": "skip"}, ), @@ -101,6 +102,94 @@ def lower_annotator(before, text, after): [], {"annotator": lower_annotator}, ), + # solvable unbalanced tag. Need the FullCaseCitation first + # so the ReferenceCitation can be found + # from https://www.courtlistener.com/api/rest/v4/opinions/8496639/ + # source: Opinion.xml_harvard + ( + " partially secured by a debtor’s principal residence was not " + "con-firmable. Nobelman v. Am. Sav. Bank, " + "508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That " + "plan proposed to bifurcate the claim and... pay the unsecured" + "... only by a lien on the debtor’s principal residence.” " + "Nobelman at 332, 113 S.Ct. 2106. Section 1123(b)(5) " + "codifies the Nobelman decision in individual debtor " + "chapter 11 cases.", + " partially secured by a debtor’s principal residence was not" + " con-firmable. Nobelman v. Am. Sav. Bank, " + "508 U.S. 324, " + "113 S.Ct. 2106, 124 L.Ed.2d 228" + " (1993). That plan proposed to bifurcate the claim and..." + " pay the unsecured... only by a lien on the debtor’s" + " principal residence.” Nobelman " + "at 332, 113 S.Ct. 2106. Section" + " 1123(b)(5) codifies the Nobelman decision in" + " individual debtor chapter 11 cases.", + ["html", "all_whitespace"], + {"annotate_anchors": True, "unbalanced_tags": "skip"}, + ), + # solvable unbalanced tag + # from https://www.courtlistener.com/api/rest/v4/opinions/2841253/ + # source: Opinion.html + ( + "he has not agreed so to submit.’” Howsam v. Dean" + " Witter Reynolds, Inc., 537 U.S. 79, 83, 123 S. Ct." + " 588, 591 (2002) (combined mandamus and" + " interlocutory appeal) (citing Howsam at 84, 123" + " S. Ct. at 592)", + "he has not agreed so to submit.’” Howsam v. Dean" + " Witter Reynolds, Inc., 537 U.S." + " 79, 83, 123 S. Ct. 588, 591" + " (2002) (combined mandamus and interlocutory appeal)" + " (citing Howsam at 84, 123 S. Ct. 
at 592)", + ["html", "all_whitespace"], + {"annotate_anchors": True, "unbalanced_tags": "skip"}, + ), + # The next 2 examples could be resolved if we increased the + # character tolerance or admitted the full case name instead of + # just one of the parties + ( + # https://www.courtlistener.com/api/rest/v4/opinions/1535649/ + # source: xml_harvard + "See also Styler v. Tall Oaks, Inc. (In re Hatch)," + " 93 B.R. 263, 267 (Bankr.D. Utah 1988)," + " rev'd 114 B.R. 747 (D.Utah 1989)." + "

... The court makes no" + " determination as to whe Fifth Amendment to the" + " constitution of the United States.” Styler v." + " Tall Oaks, Inc. (In re Hatch), at 748." + "

", + "See also Styler v. Tall Oaks, Inc. (In re Hatch)," + " 93 B.R. 263, 267" + " (Bankr.D. Utah 1988), rev'd 114 B.R. 747 (D.Utah 1989)." + "

... The court makes no" + " determination as to whe Fifth Amendment to the" + " constitution of the United States.” Styler v." + " Tall Oaks, Inc. (In re Hatch), at 748." + "

", + ["html", "all_whitespace"], + {"annotate_anchors": True, "unbalanced_tags": "skip"}, + ), + ( + # https://www.courtlistener.com/api/rest/v4/opinions/1985850/ + # source: html_lawbox + "to act rationally. See, e.g., State v." + " Wingler, 25 N.J. 161, 175, 135 A.2d" + " 468 (1957); citing, ... have been applied.'" + " [State v. Wingler at 175, 135 A.2d" + " 468, citing, Minnesota ex rel.", + "to act rationally. See, e.g., State v." + " Wingler, 25 N.J." + " 161, 175, 135 A.2d" + " 468 (1957); citing, ... have been applied.'" + " [State v. Wingler at 175, 135 A.2d 468, citing," + " Minnesota ex rel.", + ["html", "all_whitespace"], + {"annotate_anchors": True, "unbalanced_tags": "skip"}, + ), ) for source_text, expected, clean_steps, *annotate_kwargs in test_pairs: annotate_kwargs = annotate_kwargs[0] if annotate_kwargs else {} @@ -115,6 +204,13 @@ def lower_annotator(before, text, after): (c.span(), f"<{i}>", f"") for i, c in enumerate(cites) ] + + if annotate_kwargs.pop("annotate_anchors", False): + annotations = [ + (c.span(), "", "") + for c in cites + ] + annotated = annotate_citations( plain_text, annotations,