hot-fix: original metadata checks #83

Merged
merged 5 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
src/aind_data_asset_indexer/aind_bucket_indexer.py — 28 changes: 24 additions & 4 deletions

@@ -233,7 +233,8 @@ def _resolve_schema_information(
             # with record info if they are different
             if is_in_record and is_in_root and is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, a file exists in the root folder, and
             # no file exists in copy_subdir, then copy root folder file to
@@ -242,21 +243,35 @@ def _resolve_schema_information(
             elif is_in_record and is_in_root and not is_in_copy_subdir:
                 self._copy_file_from_root_to_subdir(**common_kwargs)
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, no file exists in the root folder, and
             # a file exists in copy_subdir, then create a file in the root
             # folder with the record info
             elif is_in_record and not is_in_root and is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, no file exists in the root folder, and
             # no file exists in copy_subdir, then create a file in the root
             # folder with the record info and then copy it to the copy subdir
             elif is_in_record and not is_in_root and not is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
+                # Get file info for new file in root folder
+                object_key = create_object_key(
+                    prefix=prefix, filename=core_schema_file_name
+                )
+                common_kwargs["core_schema_info_in_root"] = (
+                    get_dict_of_file_info(
+                        s3_client=s3_client,
+                        bucket=self.job_settings.s3_bucket,
+                        keys=[object_key],
+                    ).get(object_key)
+                )
                 self._copy_file_from_root_to_subdir(**common_kwargs)
             # If field is null, a file exists in the root folder, and
@@ -298,6 +313,11 @@ def _resolve_schema_information(
                     f"Something went wrong downloading or parsing "
                     f"s3://{self.job_settings.s3_bucket}/{object_key}"
                 )
+                # Can delete corrupt root file since a copy has been made
+                response = s3_client.delete_object(
+                    Bucket=self.job_settings.s3_bucket, Key=object_key
+                )
+                logging.debug(f"{response}")

         # If field is null, no file exists in the root folder, and
         # a file exists in copy_subdir, then do nothing
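The new case-4 block matters because common_kwargs was assembled before the root file existed, so core_schema_info_in_root was still None by the time _copy_file_from_root_to_subdir ran. Refreshing it via get_dict_of_file_info hands the copy step real file info (e_tag, last_modified, version_id) for the file that was just written. A minimal sketch of the shape that helper plausibly has — inferred from the call site above and the test fixture below, not from the actual utils implementation:

```python
from botocore.exceptions import ClientError


def get_dict_of_file_info_sketch(s3_client, bucket: str, keys: list) -> dict:
    """Hypothetical stand-in for utils.get_dict_of_file_info: map each key
    to its head_object metadata, or to None when the object is absent."""
    results: dict = {}
    for key in keys:
        try:
            head = s3_client.head_object(Bucket=bucket, Key=key)
            results[key] = {
                "last_modified": head["LastModified"],
                "e_tag": head["ETag"],
                "version_id": head.get("VersionId"),
            }
        except ClientError:
            results[key] = None  # no file at this key in the root folder
    return results
```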
src/aind_data_asset_indexer/utils.py — 14 changes: 6 additions & 8 deletions

@@ -351,11 +351,10 @@ def does_s3_metadata_copy_exist(
         Bucket=bucket, Prefix=copy_prefix, Delimiter="/"
     )
     if "Contents" in response:
-        core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
-        pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
+        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
+        pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
-            file_name = obj["Key"].lstrip(copy_prefix)
-            m = re.match(pattern, file_name)
+            m = re.match(pattern, obj["Key"])
             if m is not None and m.group(1) in core_schemas:
                 return True
     return False
@@ -393,11 +392,10 @@ def list_metadata_copies(
     )
     files = []
     if "Contents" in response:
-        core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
-        pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
+        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
+        pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
-            file_name = obj["Key"].lstrip(copy_prefix)
-            m = re.match(pattern, file_name)
+            m = re.match(pattern, obj["Key"])
             if m is not None and m.group(1) in core_schemas:
                 files.append(f"{m.group(1)}.json")
     return files
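The bug being fixed here is subtle: str.rstrip and str.lstrip strip a *set of characters*, not a literal suffix or prefix, so both the schema-name list and the key-to-basename step could silently mangle names. A self-contained demonstration (the key below is illustrative, modeled on the prefixes used in the tests):

```python
import re

# rstrip(".json") removes trailing characters drawn from {".", "j", "s", "o", "n"},
# not the literal suffix, so some schema names lose real characters:
assert "procedures.json".rstrip(".json") == "procedure"
assert "session.json".rstrip(".json") == "sessi"
# replace removes the literal substring, which is what was intended:
assert "procedures.json".replace(".json", "") == "procedures"

# lstrip(copy_prefix) has the same pitfall: every leading character that
# appears anywhere in copy_prefix is eaten, well past the prefix itself.
copy_prefix = "ecephys_642478_2023-01-17_13-56-29/original_metadata/"
key = copy_prefix + "data_description.20231201.json"
assert key.lstrip(copy_prefix) == ".20231201.json"  # basename destroyed

# The fixed pattern escapes the prefix and matches against the full key:
pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
m = re.match(pattern, key)
assert m is not None and m.group(1) == "data_description"
```

This is presumably also why test_list_metadata_copies below now expects data_description.json in the results: under the old lstrip-based code, a mangled basename like ".20231201.json" could never match the pattern.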
tests/test_aind_bucket_indexer.py — 37 changes: 30 additions & 7 deletions

@@ -243,7 +243,7 @@ def test_resolve_schema_information_case_1(
         )
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -316,7 +316,7 @@ def test_resolve_schema_information_case_2(
             ),
         )
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -373,7 +373,7 @@ def test_resolve_schema_information_case_3(
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_copy_file_to_subdir.assert_not_called()
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -400,12 +400,14 @@ def test_resolve_schema_information_case_3(
         "aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob."
         "_write_root_file_with_record_info"
     )
+    @patch("aind_data_asset_indexer.aind_bucket_indexer.get_dict_of_file_info")
     @patch(
         "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names",
         ["subject.json"],
     )  # Mocking this to limit for loop to one iteration
     def test_resolve_schema_information_case_4(
         self,
+        mock_get_dict_of_file_info: MagicMock,
         mock_write_file_with_record_info: MagicMock,
         mock_copy_file_to_subdir: MagicMock,
         mock_download_json_file: MagicMock,
@@ -420,6 +422,18 @@ def test_resolve_schema_information_case_4(
         """

         core_schema_info_in_root = dict()
+        core_schema_info_in_root_after_copy = {
+            "ecephys_642478_2023-01-17_13-56-29/subject.json": {
+                "last_modified": datetime(
+                    2024, 5, 15, 17, 41, 28, tzinfo=timezone.utc
+                ),
+                "e_tag": '"7ce612b2f26be2efe806990cb4eb4266"',
+                "version_id": "version_id",
+            }
+        }
+        mock_get_dict_of_file_info.return_value = (
+            core_schema_info_in_root_after_copy
+        )
         docdb_fields_to_update = self.basic_job._resolve_schema_information(
             prefix="ecephys_642478_2023-01-17_13-56-29",
             s3_client=mock_s3_client,
@@ -429,20 +443,25 @@ def test_resolve_schema_information_case_4(
         )
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
             core_schema_info_in_root=core_schema_info_in_root.get(
                 "subject.json"
             ),
         )
+        mock_get_dict_of_file_info.assert_called_once_with(
+            s3_client=mock_s3_client,
+            bucket=self.basic_job.job_settings.s3_bucket,
+            keys=["ecephys_642478_2023-01-17_13-56-29/subject.json"],
+        )
         mock_copy_file_to_subdir.assert_called_once_with(
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
-            core_schema_info_in_root=core_schema_info_in_root.get(
-                "subject.json"
+            core_schema_info_in_root=core_schema_info_in_root_after_copy.get(
+                "ecephys_642478_2023-01-17_13-56-29/subject.json"
             ),
         )
         mock_download_json_file.assert_not_called()
@@ -643,13 +662,17 @@ def test_resolve_schema_information_case_6_corrupt_download(
                 "subject.json"
             ),
         )
-        mock_log_debug.assert_not_called()
         mock_log_info.assert_not_called()
         mock_log_warn.assert_called_once_with(
             "Something went wrong downloading or parsing "
             "s3://aind-ephys-data-dev-u5u0i5/"
             "ecephys_642478_2023-01-17_13-56-29/subject.json"
         )
+        mock_s3_client.delete_object.assert_called_once_with(
+            Bucket="aind-ephys-data-dev-u5u0i5",
+            Key="ecephys_642478_2023-01-17_13-56-29/subject.json",
+        )
+        mock_log_debug.assert_called_once()

     @patch("boto3.client")
     @patch("logging.debug")
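A note on the case-4 signature change: stacked @patch decorators are applied bottom-up, so the mock injected by the decorator closest to the function arrives first, and a patch given an explicit replacement (like the core_schema_file_names list) injects no argument at all. That is why mock_get_dict_of_file_info lands first in the parameter list even though its decorator sits above the list patch. A minimal illustration, independent of this repo:

```python
from unittest.mock import MagicMock, patch


@patch("os.remove")       # outermost decorator -> injected last
@patch("os.path.exists")  # innermost decorator -> injected first
def check_patch_order(mock_exists: MagicMock, mock_remove: MagicMock) -> None:
    import os

    os.path.exists("anything")  # hits the innermost patch's mock
    mock_exists.assert_called_once_with("anything")
    mock_remove.assert_not_called()


check_patch_order()
```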
tests/test_utils.py — 2 changes: 1 addition & 1 deletion

@@ -521,7 +521,7 @@ def test_list_metadata_copies(self, mock_s3_client: MagicMock):
             copy_subdir="original_metadata",
             s3_client=mock_s3_client,
         )
-        self.assertEqual(["subject.json"], contents)
+        self.assertEqual(["data_description.json", "subject.json"], contents)

     @patch("boto3.client")
     def test_does_s3_metadata_copy_exist_none(self, mock_s3_client: MagicMock):