Merge pull request #4857 from freelawproject/4826-replicate-pdf-uploads-to-subdockets

4826 Replicate RECAP PDF uploads to subdockets
mlissner authored Jan 8, 2025
2 parents 738f6f9 + dfee2d5 commit 437c5d7
Showing 3 changed files with 507 additions and 205 deletions.
34 changes: 12 additions & 22 deletions cl/recap/api_serializers.py
@@ -95,10 +95,10 @@ def validate(self, attrs):
             UPLOAD_TYPE.CASE_QUERY_RESULT_PAGE,
         ]:
             # These are district or bankruptcy court dockets. Is the court valid?
-            court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
-                "pk", flat=True
+            court_ids = (
+                Court.federal_courts.district_or_bankruptcy_pacer_courts()
             )
-            if attrs["court"].pk not in court_ids:
+            if not court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not a district or bankruptcy court ID. Did you "
                     "mean to use the upload_type for appellate dockets?"
@@ -108,11 +108,9 @@ def validate(self, attrs):
         if attrs["upload_type"] == UPLOAD_TYPE.CLAIMS_REGISTER:
             # Only allowed on bankruptcy courts
             bankruptcy_court_ids = (
-                Court.federal_courts.bankruptcy_pacer_courts().values_list(
-                    "pk", flat=True
-                )
+                Court.federal_courts.bankruptcy_pacer_courts()
             )
-            if attrs["court"].pk not in bankruptcy_court_ids:
+            if not bankruptcy_court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not a bankruptcy court ID. Only bankruptcy cases "
                     "should have claims registry pages." % attrs["court"]
@@ -127,12 +125,8 @@ def validate(self, attrs):
             UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE,
         ]:
             # Appellate court dockets. Is the court valid?
-            appellate_court_ids = (
-                Court.federal_courts.appellate_pacer_courts().values_list(
-                    "pk", flat=True
-                )
-            )
-            if attrs["court"].pk not in appellate_court_ids:
+            appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
+            if not appellate_court_ids.filter(pk=attrs["court"].pk).exists():
                 raise ValidationError(
                     "%s is not an appellate court ID. Did you mean to use the "
                     "upload_type for district dockets?" % attrs["court"]
@@ -203,11 +197,8 @@ def validate(self, attrs):
         mail = attrs["mail"]
         receipt = attrs["receipt"]
 
-        all_court_ids = Court.federal_courts.all_pacer_courts().values_list(
-            "pk", flat=True
-        )
-
-        if court_id not in all_court_ids:
+        all_court_ids = Court.federal_courts.all_pacer_courts()
+        if not all_court_ids.filter(pk=court_id).exists():
             raise ValidationError(
                 f"{attrs['court'].pk} is not a PACER court ID."
             )
@@ -274,10 +265,9 @@ class Meta:
 
     def validate(self, attrs):
         # Is it a good court value?
-        valid_court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
-            "pk", flat=True
+        valid_court_ids = (
+            Court.federal_courts.district_or_bankruptcy_pacer_courts()
         )
-
         if (
             attrs.get("court")
             or attrs.get("docket")
@@ -293,7 +283,7 @@ def validate(self, attrs):
             if attrs.get("court")
             else attrs["docket"].court_id
         )
-        if court_id not in valid_court_ids:
+        if not valid_court_ids.filter(pk=court_id).exists():
             raise ValidationError(f"Invalid court id: {court_id}")
 
         # Docket validations
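Throughout cl/recap/api_serializers.py the change replaces an in-memory membership test against a materialized values_list with a filtered .exists() query, so the database answers the question instead of every court PK being loaded into Python. A minimal sketch of the two approaches, assuming a configured Django project where Court.federal_courts exposes the manager methods used above; the helper name and the court_pk variable are illustrative:

    from cl.search.models import Court  # assumed import path for the Court model

    def is_district_or_bankruptcy_court(court_pk: str) -> bool:
        # Old approach: materialize every PK, then test membership in Python.
        # court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
        #     "pk", flat=True
        # )
        # return court_pk in court_ids

        # New approach: keep the queryset lazy and let the database check.
        court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts()
        return court_ids.filter(pk=court_pk).exists()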
146 changes: 113 additions & 33 deletions cl/recap/tasks.py
@@ -20,6 +20,7 @@
 from django.core.files.base import ContentFile, File
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.db import IntegrityError, transaction
+from django.db.models import QuerySet
 from django.utils.timezone import now
 from juriscraper.lib.exceptions import PacerLoginException, ParsingException
 from juriscraper.lib.string_utils import CaseNameTweaker, harmonize
@@ -114,7 +115,9 @@ async def process_recap_upload(pq: ProcessingQueue) -> None:
         for pq_pk in sub_docket_att_page_pks:
             await process_recap_attachment(pq_pk)
     elif pq.upload_type == UPLOAD_TYPE.PDF:
-        await process_recap_pdf(pq.pk)
+        sub_docket_pdf_pks = await find_subdocket_pdf_rds(pq.pk)
+        for pq_pk in sub_docket_pdf_pks:
+            await process_recap_pdf(pq_pk)
     elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
         docket = await process_recap_docket_history_report(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.APPELLATE_DOCKET:
Expand Down Expand Up @@ -676,6 +679,30 @@ async def get_att_data_from_pq(
return pq, att_data, text


def get_main_rds(court_id: str, pacer_doc_id: str) -> QuerySet:
"""
Return the main RECAPDocument queryset for a given court and pacer_doc_id.
:param court_id: The court ID to query.
:param pacer_doc_id: The pacer document ID.
:return: The main RECAPDocument queryset.
"""
main_rds_qs = (
RECAPDocument.objects.select_related("docket_entry__docket")
.filter(
pacer_doc_id=pacer_doc_id,
docket_entry__docket__court_id=court_id,
)
.order_by("docket_entry__docket__pacer_case_id")
.distinct("docket_entry__docket__pacer_case_id")
.only(
"pacer_doc_id",
"docket_entry__docket__pacer_case_id",
"docket_entry__docket__court_id",
)
)
return main_rds_qs


async def find_subdocket_att_page_rds(
pk: int,
) -> list[int]:
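The .order_by()/.distinct() pair on docket_entry__docket__pacer_case_id is PostgreSQL's DISTINCT ON, so get_main_rds returns at most one main RECAPDocument per subdocket case that shares the same pacer_doc_id in a court. A rough usage sketch, assuming the helper is importable from cl.recap.tasks and runs in a configured Django environment; the court and document IDs are placeholders:

    from cl.recap.tasks import get_main_rds

    # One representative RECAPDocument per pacer_case_id sharing this
    # pacer_doc_id in the given court (placeholder values).
    for rd in get_main_rds("nysd", "123019137279"):
        print(rd.docket_entry.docket.pacer_case_id)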
@@ -687,43 +714,100 @@
     """
 
     pq = await ProcessingQueue.objects.aget(pk=pk)
-    court = await Court.objects.aget(id=pq.court_id)
     pq, att_data, text = await get_att_data_from_pq(pq)
     pacer_doc_id = att_data["pacer_doc_id"]
-    main_rds = (
-        RECAPDocument.objects.select_related("docket_entry__docket")
-        .filter(
-            pacer_doc_id=pacer_doc_id,
-            docket_entry__docket__court=court,
-        )
-        .order_by("docket_entry__docket__pacer_case_id")
-        .distinct("docket_entry__docket__pacer_case_id")
-        .only(
-            "pacer_doc_id",
-            "docket_entry__docket__pacer_case_id",
-            "docket_entry__docket__court_id",
-        )
-        .exclude(docket_entry__docket__pacer_case_id=pq.pacer_case_id)
+    main_rds = get_main_rds(pq.court_id, pacer_doc_id).exclude(
+        docket_entry__docket__pacer_case_id=pq.pacer_case_id
     )
     pqs_to_process_pks = [
         pq.pk
     ]  # Add the original pq to the list of pqs to process
     original_file_content = text.encode("utf-8")
     original_file_name = pq.filepath_local.name
-    async for main_rd in main_rds:
-        main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
-        # Create additional pqs for each subdocket case found.
-        pq_created = await ProcessingQueue.objects.acreate(
-            uploader_id=pq.uploader_id,
-            pacer_doc_id=pacer_doc_id,
-            pacer_case_id=main_pacer_case_id,
-            court_id=court.pk,
-            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
-            filepath_local=ContentFile(
-                original_file_content, name=original_file_name
-            ),
-        )
-        pqs_to_process_pks.append(pq_created.pk)
+
+    @sync_to_async
+    def save_pq_instances():
+        with transaction.atomic():
+            for main_rd in main_rds:
+                main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+                # Create additional pqs for each subdocket case found.
+                pq_created = ProcessingQueue.objects.create(
+                    uploader_id=pq.uploader_id,
+                    pacer_doc_id=pacer_doc_id,
+                    pacer_case_id=main_pacer_case_id,
+                    court_id=pq.court_id,
+                    upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+                    filepath_local=ContentFile(
+                        original_file_content, name=original_file_name
+                    ),
+                )
+                pqs_to_process_pks.append(pq_created.pk)
 
+    await save_pq_instances()
     return pqs_to_process_pks
+
+
+async def find_subdocket_pdf_rds(
+    pk: int,
+) -> list[int]:
+    """Look for RECAP Documents that belong to subdockets, and create a PQ
+    object for each additional PDF upload that requires processing.
+    :param pk: Primary key of the processing queue item.
+    :return: A list of ProcessingQueue pks to process.
+    """
+
+    pq = await ProcessingQueue.objects.aget(pk=pk)
+    main_rds = get_main_rds(pq.court_id, pq.pacer_doc_id)
+    pqs_to_process_pks = [
+        pq.pk
+    ]  # Add the original pq to the list of pqs to process
+
+    appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
+    if await appellate_court_ids.filter(pk=pq.court_id).aexists():
+        # Abort the process for appellate documents. Subdockets cannot be found
+        # in appellate cases.
+        return pqs_to_process_pks
+
+    if pq.pacer_case_id:
+        # If pq already has a pacer_case_id, exclude it from the queryset.
+        main_rds = main_rds.exclude(
+            docket_entry__docket__pacer_case_id=pq.pacer_case_id
+        )
+
+    pdf_binary_content = pq.filepath_local.read()
+
+    @sync_to_async
+    def save_pq_instances():
+        with transaction.atomic():
+            for i, main_rd in enumerate(main_rds):
+                if i == 0 and not pq.pacer_case_id:
+                    # If the original PQ does not have a pacer_case_id,
+                    # assign it a pacer_case_id from one of the matched RDs
+                    # to ensure the RD lookup in process_recap_pdf succeeds.
+                    pq.pacer_case_id = (
+                        main_rd.docket_entry.docket.pacer_case_id
+                    )
+                    pq.save()
+                    continue
+
+                main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+                # Create additional pqs for each subdocket case found.
+                pq_created = ProcessingQueue.objects.create(
+                    uploader_id=pq.uploader_id,
+                    pacer_doc_id=pq.pacer_doc_id,
+                    pacer_case_id=main_pacer_case_id,
+                    document_number=pq.document_number,
+                    attachment_number=pq.attachment_number,
+                    court_id=pq.court_id,
+                    upload_type=UPLOAD_TYPE.PDF,
+                    filepath_local=ContentFile(
+                        pdf_binary_content, name=pq.filepath_local.name
+                    ),
+                )
+                pqs_to_process_pks.append(pq_created.pk)
+
+    await save_pq_instances()
+    return pqs_to_process_pks
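Both replication helpers defer the ProcessingQueue creation loop to a synchronous inner function wrapped with asgiref's sync_to_async, because Django's async ORM does not yet support transactions; running the loop under transaction.atomic() keeps the replicated uploads all-or-nothing. A stripped-down sketch of that pattern, with Item standing in as a hypothetical model:

    from asgiref.sync import sync_to_async
    from django.db import transaction

    async def replicate(names: list[str]) -> list[int]:
        created_pks: list[int] = []

        @sync_to_async
        def save_instances():
            # One transaction for the whole loop: either every replicated
            # row is created or none of them are.
            with transaction.atomic():
                for name in names:
                    obj = Item.objects.create(name=name)  # Item is a hypothetical model
                    created_pks.append(obj.pk)

        await save_instances()
        return created_pks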


@@ -747,10 +831,6 @@ async def process_recap_attachment(
     await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
 
-    pq = await ProcessingQueue.objects.aget(pk=pk)
-    await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
-    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
-
     pq, att_data, text = await get_att_data_from_pq(pq)
 
     if document_number is None:
