diff --git a/src/aind_data_asset_indexer/aind_bucket_indexer.py b/src/aind_data_asset_indexer/aind_bucket_indexer.py
index cbd882c..468a936 100644
--- a/src/aind_data_asset_indexer/aind_bucket_indexer.py
+++ b/src/aind_data_asset_indexer/aind_bucket_indexer.py
@@ -233,7 +233,8 @@ def _resolve_schema_information(
             # with record info if they are different
             if is_in_record and is_in_root and is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, a file exists in the root folder, and
             # no file exists in copy_subdir, then copy root folder file to
@@ -242,21 +243,35 @@ def _resolve_schema_information(
             elif is_in_record and is_in_root and not is_in_copy_subdir:
                 self._copy_file_from_root_to_subdir(**common_kwargs)
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, no file exists in the root folder, and
             # a file exists in copy_subdir, then create a file in the root
             # folder with the record info
             elif is_in_record and not is_in_root and is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
                 )
             # If field is not null, no file exists in the root folder, and
             # no file exists in copy_subdir, then create a file in the root
             # folder with the record info and then copy it to the copy subdir
             elif is_in_record and not is_in_root and not is_in_copy_subdir:
                 self._write_root_file_with_record_info(
-                    docdb_record=docdb_record.get(field_name), **common_kwargs
+                    docdb_record_contents=docdb_record.get(field_name),
+                    **common_kwargs,
+                )
+                # Get file info for new file in root folder
+                object_key = create_object_key(
+                    prefix=prefix, filename=core_schema_file_name
+                )
+                common_kwargs["core_schema_info_in_root"] = (
+                    get_dict_of_file_info(
+                        s3_client=s3_client,
+                        bucket=self.job_settings.s3_bucket,
+                        keys=[object_key],
+                    ).get(object_key)
                 )
                 self._copy_file_from_root_to_subdir(**common_kwargs)
             # If field is null, a file exists in the root folder, and
@@ -298,6 +313,11 @@ def _resolve_schema_information(
                         f"Something went wrong downloading or parsing "
                         f"s3://{self.job_settings.s3_bucket}/{object_key}"
                     )
+                    # Can delete corrupt root file since a copy has been made
+                    response = s3_client.delete_object(
+                        Bucket=self.job_settings.s3_bucket, Key=object_key
+                    )
+                    logging.debug(f"{response}")
             # If field is null, no file exists in the root folder, and
             # a file exists in copy_subdir, then do nothing
diff --git a/src/aind_data_asset_indexer/utils.py b/src/aind_data_asset_indexer/utils.py
index b65df09..e1d7877 100644
--- a/src/aind_data_asset_indexer/utils.py
+++ b/src/aind_data_asset_indexer/utils.py
@@ -351,11 +351,10 @@ def does_s3_metadata_copy_exist(
         Bucket=bucket, Prefix=copy_prefix, Delimiter="/"
     )
     if "Contents" in response:
-        core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
-        pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
+        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
+        pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
-            file_name = obj["Key"].lstrip(copy_prefix)
-            m = re.match(pattern, file_name)
+            m = re.match(pattern, obj["Key"])
             if m is not None and m.group(1) in core_schemas:
                 return True
     return False
@@ -393,11 +392,10 @@ def list_metadata_copies(
     )
     files = []
     if "Contents" in response:
-        core_schemas = [s.rstrip(".json") for s in core_schema_file_names]
-        pattern = r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
+        core_schemas = [s.replace(".json", "") for s in core_schema_file_names]
+        pattern = re.escape(copy_prefix) + r"([a-zA-Z0-9_]+)\.\d{8}\.json$"
         for obj in response["Contents"]:
-            file_name = obj["Key"].lstrip(copy_prefix)
-            m = re.match(pattern, file_name)
+            m = re.match(pattern, obj["Key"])
             if m is not None and m.group(1) in core_schemas:
                 files.append(f"{m.group(1)}.json")
     return files
diff --git a/tests/test_aind_bucket_indexer.py b/tests/test_aind_bucket_indexer.py
index 795dafa..460aecc 100644
--- a/tests/test_aind_bucket_indexer.py
+++ b/tests/test_aind_bucket_indexer.py
@@ -243,7 +243,7 @@ def test_resolve_schema_information_case_1(
         )
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -316,7 +316,7 @@ def test_resolve_schema_information_case_2(
             ),
         )
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -373,7 +373,7 @@ def test_resolve_schema_information_case_3(
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_copy_file_to_subdir.assert_not_called()
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -400,12 +400,14 @@
         "aind_data_asset_indexer.aind_bucket_indexer.AindIndexBucketJob."
         "_write_root_file_with_record_info"
     )
+    @patch("aind_data_asset_indexer.aind_bucket_indexer.get_dict_of_file_info")
     @patch(
         "aind_data_asset_indexer.aind_bucket_indexer.core_schema_file_names",
         ["subject.json"],
     )  # Mocking this to limit for loop to one iteration
     def test_resolve_schema_information_case_4(
         self,
+        mock_get_dict_of_file_info: MagicMock,
         mock_write_file_with_record_info: MagicMock,
         mock_copy_file_to_subdir: MagicMock,
         mock_download_json_file: MagicMock,
@@ -420,6 +422,18 @@
         """
         core_schema_info_in_root = dict()
+        core_schema_info_in_root_after_copy = {
+            "ecephys_642478_2023-01-17_13-56-29/subject.json": {
+                "last_modified": datetime(
+                    2024, 5, 15, 17, 41, 28, tzinfo=timezone.utc
+                ),
+                "e_tag": '"7ce612b2f26be2efe806990cb4eb4266"',
+                "version_id": "version_id",
+            }
+        }
+        mock_get_dict_of_file_info.return_value = (
+            core_schema_info_in_root_after_copy
+        )
         docdb_fields_to_update = self.basic_job._resolve_schema_information(
             prefix="ecephys_642478_2023-01-17_13-56-29",
             s3_client=mock_s3_client,
@@ -429,7 +443,7 @@
         )
         self.assertEqual(dict(), docdb_fields_to_update)
         mock_write_file_with_record_info.assert_called_once_with(
-            docdb_record=self.example_md_record.get("subject"),
+            docdb_record_contents=self.example_md_record.get("subject"),
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
@@ -437,12 +451,17 @@
                 "subject.json"
             ),
         )
+        mock_get_dict_of_file_info.assert_called_once_with(
+            s3_client=mock_s3_client,
+            bucket=self.basic_job.job_settings.s3_bucket,
+            keys=["ecephys_642478_2023-01-17_13-56-29/subject.json"],
+        )
         mock_copy_file_to_subdir.assert_called_once_with(
             s3_client=mock_s3_client,
             prefix="ecephys_642478_2023-01-17_13-56-29",
             core_schema_file_name="subject.json",
-            core_schema_info_in_root=core_schema_info_in_root.get(
-                "subject.json"
+            core_schema_info_in_root=core_schema_info_in_root_after_copy.get(
+                "ecephys_642478_2023-01-17_13-56-29/subject.json"
             ),
         )
         mock_download_json_file.assert_not_called()
@@ -643,13 +662,17 @@ def test_resolve_schema_information_case_6_corrupt_download(
                 "subject.json"
             ),
         )
-        mock_log_debug.assert_not_called()
         mock_log_info.assert_not_called()
         mock_log_warn.assert_called_once_with(
             "Something went wrong downloading or parsing "
             "s3://aind-ephys-data-dev-u5u0i5/"
             "ecephys_642478_2023-01-17_13-56-29/subject.json"
         )
+        mock_s3_client.delete_object.assert_called_once_with(
+            Bucket="aind-ephys-data-dev-u5u0i5",
+            Key="ecephys_642478_2023-01-17_13-56-29/subject.json",
+        )
+        mock_log_debug.assert_called_once()
 
     @patch("boto3.client")
     @patch("logging.debug")
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7e1838c..5f6ad9d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -521,7 +521,7 @@ def test_list_metadata_copies(self, mock_s3_client: MagicMock):
             copy_subdir="original_metadata",
             s3_client=mock_s3_client,
         )
-        self.assertEqual(["subject.json"], contents)
+        self.assertEqual(["data_description.json", "subject.json"], contents)
 
     @patch("boto3.client")
     def test_does_s3_metadata_copy_exist_none(self, mock_s3_client: MagicMock):