Merge pull request #1276 from freelawproject/recap_email_bankruptcy_s…

…hort_description
freelawproject · Jan 10, 2025 · 5981261 · 5981261
2 parents bb6a4e0 + 93c4d4b
commit 5981261
Show file tree

Hide file tree

Showing 62 changed files with 4,234 additions and 115 deletions.
diff --git a/juriscraper/pacer/email.py b/juriscraper/pacer/email.py
@@ -48,6 +48,31 @@ class NotificationEmail(BaseDocketReport, BaseReport):
         "Modified Date Filed from",
         "Added Correct PDF to document",
     ]
+    bankruptcy_courts_with_tests = [
+        "arwb",
+        "cacb",
+        "casb",
+        "cob",
+        "ctb",
+        "dcb",
+        "deb",
+        "flsb",
+        "ianb",
+        "mdb",
+        "nceb",
+        "ndb",
+        "nhb",
+        "njb",
+        "nyeb",
+        "nysb",
+        "okeb",
+        "paeb",
+        "pamb",
+        "pawb",
+        "tnmb",
+        "txnb",
+        "vaeb",
+    ]
 
     def __init__(self, court_id):
         self.court_id = court_id
@@ -57,6 +82,7 @@ def __init__(self, court_id):
         self.docket_numbers = []
         self.subject = None
         self.case_names = []
+        self.raw_docket_numbers = set()
         if self.court_id.endswith("b"):
             self.is_bankruptcy = True
         else:
@@ -172,16 +198,16 @@ def _parse_docket_number(
             return case_number, self._return_default_dn_components()
 
         path = self._sibling_path("Case Number")
+        docket_numbers_str = current_node.xpath(f"{path}/a/text()")
+        self.raw_docket_numbers.update(set(docket_numbers_str))
         docket_number, docket_number_components = (
-            self._parse_docket_number_strs(
-                current_node.xpath(f"{path}/a/text()")
-            )
+            self._parse_docket_number_strs(docket_numbers_str)
         )
         if not docket_number:
+            docket_numbers_str = current_node.xpath(f"{path}/p/a/text()")
+            self.raw_docket_numbers.update(set(docket_numbers_str))
             docket_number, docket_number_components = (
-                self._parse_docket_number_strs(
-                    current_node.xpath(f"{path}/p/a/text()")
-                )
+                self._parse_docket_number_strs(docket_numbers_str)
             )
         return docket_number, docket_number_components
 
@@ -199,6 +225,7 @@ def _parse_docket_number_plain(
         email_body = self.tree.text_content()
         regex = r"Case Number:(.*)"
         docket_number = re.findall(regex, email_body)
+        self.raw_docket_numbers.update(set(docket_number))
         return self._parse_docket_number_strs(docket_number)
 
     def _get_date_filed(self) -> date:
@@ -396,7 +423,6 @@ def _get_dockets(self) -> DocketType:
             docket_number, docket_number_components = (
                 self._parse_docket_number_plain()
             )
-            docket_number = docket_number
             # Cache the docket number for its later use.
             self.docket_numbers.append(docket_number)
 
@@ -534,115 +560,69 @@ def _get_docket_entries(
         return []
 
     def _parse_bankruptcy_short_description(self, subject: str) -> str:
-        """Parse the short description of a bankruptcy case from the email
-        subject. Subjects for bankruptcy varies a lot from court to court.
-        This function supports parsing the short description for courts with
-        known examples.
+        """Parse the short description of a bankruptcy case from the email subject
+
+        The subject contains: the docket number, the case name, the short description
+        and special characters. The presence and order of these elements varies from
+        court to court. The short description that we parse out should match the
+        "Description" on the "History/Document" report on PACER
+
+        A special case are multi-docket NEFs, of which we have only seen
+        2-docket NEFs. These are labelled as "_multi_" on the example files, and we
+        have only seen "Adversary Cases", so far. The subject will use one
+        of the case names and one of the docket numbers, and then follow
+        the general rules
 
         :param subject: The email subject string.
         :return: The parsed short description.
         """
+        # Some courts have subjects like
+        # `Multiple Cases "{docket} {case name} Close [Aa]dversary [Cc]ase" - {initials}`
+        if close_adv_match := re.search(
+            r"close adversary case", subject, flags=re.IGNORECASE
+        ):
+            return close_adv_match.group(0)
+
+        # raw dockets from "plain/text" email may have a URL
+        raw_docket_numbers = [
+            i.strip().split(" ")[0] for i in self.raw_docket_numbers
+        ]
+        raw_docket_numbers.sort(key=len, reverse=True)  # use longest first
+        for part in raw_docket_numbers + self.docket_numbers + self.case_names:
+            subject = subject.replace(part, " ")
 
-        # See paeb_3.txt for a test of multi docket NEF
-        # So far, we have only seen 2-docket NEF
-        is_multidocket = len(self.docket_numbers) == 2
-        if len(self.docket_numbers) > 1 and self.court_id != "njb":
+        # Sometimes the full case name is not used in the `subject`
+        # Some courts use a 18 character limit
+        # See deb_2.txt, pamb_1 and pamb_3 for examples
+        for case_name in self.case_names:
+            subject = subject.replace(case_name[:18].strip(), " ")
+            subject = subject.replace(case_name.split(" and ")[0], " ")
+
+        # Deletes:
+        # - "NEF: " placeholder
+        # - "Ch \d{1,2}" abbreviations
+        cleanup_regex = r"(NEF:? )|(C[Hh][- ]?(7|9|11|13))|(C[hH][\s-]*$)"
+        subject = re.sub(cleanup_regex, " ", subject)
+
+        # Courts like `nhb` do not use the "Ch \d{1,2}" abbreviation
+        # and we must delete the "Chapter..." string; but only once
+        # See nhb_2 for an example with 2 "Chapter..." strings
+        chapter_regex = r"Chapter[- ]?(7|9|11|13)"
+        if self.court_id in ["nhb"]:
+            subject = re.sub(chapter_regex, " ", subject, 1)
+        elif len(re.findall(chapter_regex, subject)) > 1:
             logger.error(
-                "Not parsing description for Bankruptcy Multi Docket NEF for court '%s'",
-                self.court_id,
-                extra={
-                    "fingerprint": [
-                        f"{self.court_id}-not-parsing-multi-docket-short-description"
-                    ]
-                },
+                "Trying to parse a RECAP email with more than 1 'Chapter...' string"
             )
-            return ""
 
-        short_description = ""
-        docket_number = self.docket_numbers[0]
-        case_name = self.case_names[0]
-
-        if is_multidocket:
-            # Docket number / case name from one or both of the 2 cases
-            # may be used in the subject string
-            if self.docket_numbers[1] in subject:
-                docket_number = self.docket_numbers[1]
-            if self.case_names[1] in subject:
-                case_name = self.case_names[1]
-
-        if self.court_id in [
-            "cacb",
-            "ctb",
-            "cob",
-            "ianb",
-            "nyeb",
-            "txnb",
-            "okeb",
-        ]:
-            # In: 6:22-bk-13643-SY Request for courtesy Notice of Electronic Filing (NEF)
-            # Out: Request for courtesy Notice of Electronic Filing (NEF)
-            short_description = subject.split(docket_number)[-1]
-
-            # Remove docket number traces "-AAA"
-            regex = r"^-.*?\s"
-            short_description = re.sub(regex, "", short_description)
-        elif self.court_id in ["njb", "dcb", "vaeb", "paeb", "mdb", "arwb"]:
-            # In: Ch-11 19-27439-MBK Determination of Adjournment Request - Hollister Construc
-            # Out: Determination of Adjournment Request
-            # In Ch-13 1:24-bk-70534 Meeting of Creditors - Second Non-Appearance; Michael Clayton Lowry
-            # Out: Meeting of Creditors - Second Non-Appearance
-            short_description = subject.split(docket_number)[-1]
-            # Remove docket number traces "-AAA"
-            # Remove CH after docket and BK after short description for dcb
-            regex = r"^-.*?\s|C[Hh][\s\d]+|[ (]?B[Kk]( Other)?[) ]?"
-            short_description = re.sub(regex, "", short_description)
-            separator = ";" if self.court_id == "arwb" else "-"
-            short_description = short_description.rsplit(separator, 1)[0]
-        elif self.court_id == "nysb":
-            # In: 22-22507-cgm Ch13 Affidavit Re: Gerasimos Stefanitsis
-            # Out: Affidavit
-            short_description = subject.split(case_name)[0]
-            short_description = short_description.replace("Re:", "")
-            short_description = short_description.split(docket_number)[-1]
-
-            # Remove strings starting with "Ch" followed by a number
-            regex = r"\bCh\d+\b"
-            short_description = re.sub(regex, "", short_description)
-
-            # Remove docket number traces "-AAA"
-            regex = r"^-.*?\s"
-            short_description = re.sub(regex, "", short_description)
-        elif self.court_id in ["pawb", "ndb", "deb", "pamb", "nhb"]:
-            # In: Ch-7 22-20823-GLT U LOCK INC Reply
-            # Out: Reply
-            if case_name in subject:
-                short_description = subject.split(case_name)[-1]
-            elif case_name[:18] in subject:
-                # See deb_2.txt, pamb_1 and pamb_3 for examples
-                short_description = subject.split(case_name[:18])[-1]
-            elif (
-                " and " in case_name and case_name.split(" and ")[0] in subject
-            ):
-                # See pamb_2.txt
-                short_description = subject.split(case_name.split(" and ")[0])[
-                    -1
-                ]
-        elif self.court_id in [
-            "tnmb",
-        ]:
-            # In: Docket Order - Continue Hearing (Auto) Ch 13 Jeffery Wayne Lovell and Tiffany Nicole Lovell 1:24-bk-01377
-            # Out: Docket Order - Continue Hearing (Auto) Ch 13
-            if case_name in subject:
-                short_description = subject.split(case_name)[0]
-        else:
+        subject = subject.strip(" ;:,- ")
+        # some courts use "Re: {case name}"
+        short_description = re.sub("( Re$)|(^Re:? )", "", subject)
+
+        if self.court_id not in self.bankruptcy_courts_with_tests:
             logger.error(
-                "Short description has no parsing for bankruptcy court '%s'",
+                "Parsing a RECAP email from court %s without tests",
                 self.court_id,
-                extra={
-                    "fingerprint": [
-                        f"{self.court_id}-not-parsing-short-description"
-                    ]
-                },
             )
 
         return short_description

diff --git a/tests/examples/pacer/nef/s3/cacb_7.json b/tests/examples/pacer/nef/s3/cacb_7.json
@@ -0,0 +1,46 @@
+{
+  "appellate": false,
+  "contains_attachments": false,
+  "court_id": "cacb",
+  "dockets": [
+    {
+      "case_name": "Lee v. Newman",
+      "date_filed": null,
+      "docket_entries": [
+        {
+          "date_filed": "2023-02-15",
+          "description": "Order granting stipulation to continue pre-trial conference from 2/28/23 to 4/25/23 @ 10:00 a.m. (BNC-PDF) (Related Doc [75]) Signed on 2/15/2023 (JC6)",
+          "document_number": "76",
+          "document_url": "https://ecf.cacb.uscourts.gov/doc1/9730106502427?pdf_header=&magic_num=94356268&de_seq_num=252&caseid=1927388",
+          "pacer_case_id": "1927388",
+          "pacer_doc_id": "9730106502427",
+          "pacer_magic_num": "94356268",
+          "pacer_seq_no": "252",
+          "short_description": "ORDER to continue/reschedule hearing (BNC-PDF)"
+        }
+      ],
+      "docket_number": "6:21-ap-01071",
+      "federal_defendant_number": null,
+      "federal_dn_case_type": "ap",
+      "federal_dn_judge_initials_assigned": "SC",
+      "federal_dn_judge_initials_referred": null,
+      "federal_dn_office_code": "6"
+    }
+  ],
+  "email_recipients": [
+    {
+      "email_addresses": [
+        "[email protected]",
+        "[email protected]",
+        "[email protected]",
+        "[email protected]",
+        "[email protected]",
+        "[email protected]@y.net",
+        "[email protected]",
+        "[email protected]",
+        "[email protected]"
+      ],
+      "name": ""
+    }
+  ]
+}