fix(ScraperExtractFromText): add fail case testing
Solves #1289

- Adds a fail test case to `test_extract_from_text`
- Updates the bia, bap1, nm and sd scrapers to return an empty dict
when no match is found (the shared pattern is sketched below)
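
The change follows the same guard pattern in all four scrapers; roughly (a minimal sketch, not the exact code of any single scraper; the regex and the metadata keys are placeholders):

    import re
    from typing import Any, Dict

    def extract_from_text(scraped_text: str) -> Dict[str, Any]:
        """Return scraper metadata, or an empty dict when nothing matches."""
        # re.findall returns [] on no match, so this guard replaces the old
        # findall(...)[0] indexing, which raised IndexError on unexpected text
        matches = re.findall(r"No\.\s(.*)", scraped_text)  # placeholder pattern
        if not matches:
            return {}
        return {"OpinionCluster": {"docket_number": matches[0]}}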
grossir committed Jan 9, 2025
1 parent bb6a4e0 commit ccd9a6c
Showing 5 changed files with 37 additions and 10 deletions.
11 changes: 9 additions & 2 deletions juriscraper/opinions/united_states/administrative_agency/bia.py
@@ -13,6 +13,7 @@
from datetime import datetime
from typing import Any, Dict

+from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -70,8 +71,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
date = re.findall(
r"Decided (by (Acting\s)?Attorney General )?(.*\d{4})",
scraped_text,
-)[0][-1]
-date_filed = datetime.strptime(date, "%B %d, %Y").strftime("%Y-%m-%d")
+)
+if not date:
+    logger.error("bia: unable to extract_from_text a date_filed")
+    return {}
+
+date_filed = datetime.strptime(date[0][-1], "%B %d, %Y").strftime(
+    "%Y-%m-%d"
+)
metadata = {
"OpinionCluster": {
"date_filed": date_filed,
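
For context on the `date[0][-1]` indexing kept in the new code: `re.findall` with multiple capture groups returns a list of tuples, one tuple per match and one element per group. A minimal illustration (the sample text is made up):

    import re

    text = "Decided by Acting Attorney General January 5, 2025"
    hits = re.findall(r"Decided (by (Acting\s)?Attorney General )?(.*\d{4})", text)
    # hits == [('by Acting Attorney General ', 'Acting ', 'January 5, 2025')]
    print(hits[0][-1])  # first match, last group: 'January 5, 2025'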
7 changes: 4 additions & 3 deletions juriscraper/opinions/united_states/federal_bankruptcy/bap1.py
@@ -167,9 +167,10 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""
months = "|".join(calendar.month_name[1:])
date_pattern = re.compile(rf"({months})\s+\d{{1,2}}\s?,?\s+\d{{4}}")
-match = re.search(date_pattern, scraped_text)
-date_extracted = match.group(0) if match else ""
-date_filed = re.sub(r"\s+", " ", date_extracted).strip()
+if match := re.search(date_pattern, scraped_text):
+    date_filed = re.sub(r"\s+", " ", match.group(0)).strip()
+else:
+    return {}

metadata = {
"OpinionCluster": {
8 changes: 6 additions & 2 deletions juriscraper/opinions/united_states/state/nm.py
@@ -126,10 +126,14 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
:param scraped_text: Text of scraped content
:return: metadata
"""
-docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)[0]
+docket_number = re.findall(r"N[oO]\.\s(.*)", scraped_text)
+if not docket_number:
+    logger.error("nm: unable to extract_from_text a docket_number")
+    return {}
+
metadata = {
"OpinionCluster": {
"docket_number": docket_number,
"docket_number": docket_number[0],
},
}
return metadata
9 changes: 6 additions & 3 deletions juriscraper/opinions/united_states/state/sd.py
@@ -142,12 +142,15 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"""

# The docket number appears to be the first text on the page.
-# So I crop the text to avoid any confusion that might occur in the
+# So we crop the text to avoid any confusion that might occur in the
# body of an opinion.
-docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])[0]
+docket = re.findall(r"#\d+.*-.-\w{3}", scraped_text[:100])
+if not docket:
+    return {}
+
metadata = {
"Docket": {
"docket_number": docket,
"docket_number": docket[0],
},
}
return metadata
12 changes: 12 additions & 0 deletions tests/local/test_ScraperExtractFromTextTest.py
@@ -1,4 +1,5 @@
import datetime
+import logging
import unittest

from juriscraper.lib.importer import build_module_list
@@ -752,16 +753,27 @@ class ScraperExtractFromText(unittest.TestCase):

def test_extract_from_text(self):
"""Test that extract_from_text returns the expected data."""
+# prevent logger.error calls from being triggered
+logging.disable(logging.CRITICAL)
for module_string, test_cases in self.test_data.items():
package, module = module_string.rsplit(".", 1)
mod = __import__(
f"{package}.{module}", globals(), locals(), [module]
)
site = mod.Site()

+# ensure that if no data is parsed, a dict is returned
+# also, this ensures that there are no uncontrolled exceptions
+self.assertTrue(
+    isinstance(
+        site.extract_from_text("Lorem ipsum dolorem..."), dict
+    )
+)
for test_case in test_cases:
self.assertEqual(
site.extract_from_text(test_case[0]), test_case[1]
)
+logging.disable(logging.NOTSET)

def test_extract_from_text_properly_implemented(self):
"""Ensure that extract_from_text is properly implemented."""
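
For a quick manual check of the new fail case outside the test suite, something along these lines should now hold for each updated scraper (a sketch; bia is used as the example module and the probe string is arbitrary non-matching text):

    from juriscraper.opinions.united_states.administrative_agency import bia

    site = bia.Site()
    # text that matches none of the scraper's patterns should yield {} instead
    # of raising IndexError from the old findall(...)[0] indexing
    print(site.extract_from_text("Lorem ipsum dolorem..."))  # expected: {}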
