Merge pull request #4857 from freelawproject/4826-replicate-pdf-uploads-to-subdockets

4826 Replicate RECAP PDF uploads to subdockets
mlissner authored Jan 8, 2025
2 parents 738f6f9 + dfee2d5 commit 437c5d7
Showing 3 changed files with 507 additions and 205 deletions.
34 changes: 12 additions & 22 deletions cl/recap/api_serializers.py
@@ -95,10 +95,10 @@ def validate(self, attrs):
             UPLOAD_TYPE.CASE_QUERY_RESULT_PAGE,
         ]:
             # These are district or bankruptcy court dockets. Is the court valid?
-            court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
-                "pk", flat=True
+            court_ids = (
+                Court.federal_courts.district_or_bankruptcy_pacer_courts()
             )
-            if attrs["court"].pk not in court_ids:
+            if not court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not a district or bankruptcy court ID. Did you "
                     "mean to use the upload_type for appellate dockets?"
@@ -108,11 +108,9 @@ def validate(self, attrs):
         if attrs["upload_type"] == UPLOAD_TYPE.CLAIMS_REGISTER:
             # Only allowed on bankruptcy courts
             bankruptcy_court_ids = (
-                Court.federal_courts.bankruptcy_pacer_courts().values_list(
-                    "pk", flat=True
-                )
+                Court.federal_courts.bankruptcy_pacer_courts()
             )
-            if attrs["court"].pk not in bankruptcy_court_ids:
+            if not bankruptcy_court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not a bankruptcy court ID. Only bankruptcy cases "
                     "should have claims registry pages." % attrs["court"]
@@ -127,12 +125,8 @@ def validate(self, attrs):
             UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE,
         ]:
             # Appellate court dockets. Is the court valid?
-            appellate_court_ids = (
-                Court.federal_courts.appellate_pacer_courts().values_list(
-                    "pk", flat=True
-                )
-            )
-            if attrs["court"].pk not in appellate_court_ids:
+            appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
+            if not appellate_court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not an appellate court ID. Did you mean to use the "
                     "upload_type for district dockets?" % attrs["court"]
@@ -203,11 +197,8 @@ def validate(self, attrs):
         mail = attrs["mail"]
         receipt = attrs["receipt"]
 
-        all_court_ids = Court.federal_courts.all_pacer_courts().values_list(
-            "pk", flat=True
-        )
-
-        if court_id not in all_court_ids:
+        all_court_ids = Court.federal_courts.all_pacer_courts()
+        if not all_court_ids.filter(pk=court_id).exists():
             raise ValidationError(
                 f"{attrs['court'].pk} is not a PACER court ID."
             )
@@ -274,10 +265,9 @@ class Meta:
 
     def validate(self, attrs):
         # Is it a good court value?
-        valid_court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
-            "pk", flat=True
+        valid_court_ids = (
+            Court.federal_courts.district_or_bankruptcy_pacer_courts()
         )
-
         if (
             attrs.get("court")
             or attrs.get("docket")
@@ -293,7 +283,7 @@ def validate(self, attrs):
             if attrs.get("court")
             else attrs["docket"].court_id
         )
-        if court_id not in valid_court_ids:
+        if not valid_court_ids.filter(pk=court_id).exists():
             raise ValidationError(f"Invalid court id: {court_id}")
 
         # Docket validations
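Throughout cl/recap/api_serializers.py the change replaces an in-memory membership test against a materialized values_list with a filtered .exists() query, so the database answers the question instead of every court PK being loaded into Python. A minimal sketch of the two approaches, assuming a configured Django project where Court.federal_courts exposes the manager methods used above; the helper name and the court_pk variable are illustrative:

    from cl.search.models import Court  # assumed import path for the Court model

    def is_district_or_bankruptcy_court(court_pk: str) -> bool:
        # Old approach: materialize every PK, then test membership in Python.
        # court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
        #     "pk", flat=True
        # )
        # return court_pk in court_ids

        # New approach: keep the queryset lazy and let the database check.
        court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts()
        return court_ids.filter(pk=court_pk).exists()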
146 changes: 113 additions & 33 deletions cl/recap/tasks.py
@@ -20,6 +20,7 @@
 from django.core.files.base import ContentFile, File
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.db import IntegrityError, transaction
+from django.db.models import QuerySet
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
 from juriscraper.lib.string_utils import CaseNameTweaker, harmonize
@@ -114,7 +115,9 @@ async def process_recap_upload(pq: ProcessingQueue) -> None:
         for pq_pk in sub_docket_att_page_pks:
             await process_recap_attachment(pq_pk)
     elif pq.upload_type == UPLOAD_TYPE.PDF:
-        await process_recap_pdf(pq.pk)
+        sub_docket_pdf_pks = await find_subdocket_pdf_rds(pq.pk)
+        for pq_pk in sub_docket_pdf_pks:
+            await process_recap_pdf(pq_pk)
     elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
         docket = await process_recap_docket_history_report(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.APPELLATE_DOCKET:
Expand Down Expand Up @@ -676,6 +679,30 @@ async def get_att_data_from_pq(
return pq, att_data, text


def get_main_rds(court_id: str, pacer_doc_id: str) -> QuerySet:
"""
Return the main RECAPDocument queryset for a given court and pacer_doc_id.
:param court_id: The court ID to query.
:param pacer_doc_id: The pacer document ID.
:return: The main RECAPDocument queryset.
"""
main_rds_qs = (
RECAPDocument.objects.select_related("docket_entry__docket")
.filter(
pacer_doc_id=pacer_doc_id,
docket_entry__docket__court_id=court_id,
)
.order_by("docket_entry__docket__pacer_case_id")
.distinct("docket_entry__docket__pacer_case_id")
.only(
"pacer_doc_id",
"docket_entry__docket__pacer_case_id",
"docket_entry__docket__court_id",
)
)
return main_rds_qs


async def find_subdocket_att_page_rds(
pk: int,
) -> list[int]:
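The .order_by()/.distinct() pair on docket_entry__docket__pacer_case_id is PostgreSQL's DISTINCT ON, so get_main_rds returns at most one main RECAPDocument per subdocket case that shares the same pacer_doc_id in a court. A rough usage sketch, assuming the helper is importable from cl.recap.tasks and runs in a configured Django environment; the court and document IDs are placeholders:

    from cl.recap.tasks import get_main_rds

    # One representative RECAPDocument per pacer_case_id sharing this
    # pacer_doc_id in the given court (placeholder values).
    for rd in get_main_rds("nysd", "123019137279"):
        print(rd.docket_entry.docket.pacer_case_id)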
@@ -687,43 +714,100 @@
     """
 
     pq = await ProcessingQueue.objects.aget(pk=pk)
-    court = await Court.objects.aget(id=pq.court_id)
     pq, att_data, text = await get_att_data_from_pq(pq)
     pacer_doc_id = att_data["pacer_doc_id"]
-    main_rds = (
-        RECAPDocument.objects.select_related("docket_entry__docket")
-        .filter(
-            pacer_doc_id=pacer_doc_id,
-            docket_entry__docket__court=court,
-        )
-        .order_by("docket_entry__docket__pacer_case_id")
-        .distinct("docket_entry__docket__pacer_case_id")
-        .only(
-            "pacer_doc_id",
-            "docket_entry__docket__pacer_case_id",
-            "docket_entry__docket__court_id",
-        )
-        .exclude(docket_entry__docket__pacer_case_id=pq.pacer_case_id)
+    main_rds = get_main_rds(pq.court_id, pacer_doc_id).exclude(
+        docket_entry__docket__pacer_case_id=pq.pacer_case_id
     )
     pqs_to_process_pks = [
         pq.pk
     ]  # Add the original pq to the list of pqs to process
     original_file_content = text.encode("utf-8")
     original_file_name = pq.filepath_local.name
-    async for main_rd in main_rds:
-        main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
-        # Create additional pqs for each subdocket case found.
-        pq_created = await ProcessingQueue.objects.acreate(
-            uploader_id=pq.uploader_id,
-            pacer_doc_id=pacer_doc_id,
-            pacer_case_id=main_pacer_case_id,
-            court_id=court.pk,
-            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
-            filepath_local=ContentFile(
-                original_file_content, name=original_file_name
-            ),
-        )
-        pqs_to_process_pks.append(pq_created.pk)
+
+    @sync_to_async
+    def save_pq_instances():
+        with transaction.atomic():
+            for main_rd in main_rds:
+                main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+                # Create additional pqs for each subdocket case found.
+                pq_created = ProcessingQueue.objects.create(
+                    uploader_id=pq.uploader_id,
+                    pacer_doc_id=pacer_doc_id,
+                    pacer_case_id=main_pacer_case_id,
+                    court_id=pq.court_id,
+                    upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+                    filepath_local=ContentFile(
+                        original_file_content, name=original_file_name
+                    ),
+                )
+                pqs_to_process_pks.append(pq_created.pk)
 
+    await save_pq_instances()
     return pqs_to_process_pks
+
+
+async def find_subdocket_pdf_rds(
+    pk: int,
+) -> list[int]:
+    """Look for RECAP Documents that belong to subdockets, and create a PQ
+    object for each additional PDF upload that requires processing.
+    :param pk: Primary key of the processing queue item.
+    :return: A list of ProcessingQueue pks to process.
+    """
+
+    pq = await ProcessingQueue.objects.aget(pk=pk)
+    main_rds = get_main_rds(pq.court_id, pq.pacer_doc_id)
+    pqs_to_process_pks = [
+        pq.pk
+    ]  # Add the original pq to the list of pqs to process
+
+    appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
+    if await appellate_court_ids.filter(pk=pq.court_id).aexists():
+        # Abort the process for appellate documents. Subdockets cannot be found
+        # in appellate cases.
+        return pqs_to_process_pks
+
+    if pq.pacer_case_id:
+        # If pq already has a pacer_case_id, exclude it from the queryset.
+        main_rds = main_rds.exclude(
+            docket_entry__docket__pacer_case_id=pq.pacer_case_id
+        )
+
+    pdf_binary_content = pq.filepath_local.read()
+
+    @sync_to_async
+    def save_pq_instances():
+        with transaction.atomic():
+            for i, main_rd in enumerate(main_rds):
+                if i == 0 and not pq.pacer_case_id:
+                    # If the original PQ does not have a pacer_case_id,
+                    # assign it a pacer_case_id from one of the matched RDs
+                    # to ensure the RD lookup in process_recap_pdf succeeds.
+                    pq.pacer_case_id = (
+                        main_rd.docket_entry.docket.pacer_case_id
+                    )
+                    pq.save()
+                    continue
+
+                main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+                # Create additional pqs for each subdocket case found.
+                pq_created = ProcessingQueue.objects.create(
+                    uploader_id=pq.uploader_id,
+                    pacer_doc_id=pq.pacer_doc_id,
+                    pacer_case_id=main_pacer_case_id,
+                    document_number=pq.document_number,
+                    attachment_number=pq.attachment_number,
+                    court_id=pq.court_id,
+                    upload_type=UPLOAD_TYPE.PDF,
+                    filepath_local=ContentFile(
+                        pdf_binary_content, name=pq.filepath_local.name
+                    ),
+                )
+                pqs_to_process_pks.append(pq_created.pk)
+
+    await save_pq_instances()
+    return pqs_to_process_pks
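Both replication helpers defer the ProcessingQueue creation loop to a synchronous inner function wrapped with asgiref's sync_to_async, because Django's async ORM does not yet support transactions; running the loop under transaction.atomic() keeps the replicated uploads all-or-nothing. A stripped-down sketch of that pattern, with Item standing in as a hypothetical model:

    from asgiref.sync import sync_to_async
    from django.db import transaction

    async def replicate(names: list[str]) -> list[int]:
        created_pks: list[int] = []

        @sync_to_async
        def save_instances():
            # One transaction for the whole loop: either every replicated
            # row is created or none of them are.
            with transaction.atomic():
                for name in names:
                    obj = Item.objects.create(name=name)  # Item is a hypothetical model
                    created_pks.append(obj.pk)

        await save_instances()
        return created_pks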


@@ -747,10 +831,6 @@ async def process_recap_attachment(
     await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
 
-    pq = await ProcessingQueue.objects.aget(pk=pk)
-    await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
-    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
-
     pq, att_data, text = await get_att_data_from_pq(pq)
 
     if document_number is None:
