diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 669131ae025b3f..55e25ebe88d125 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -881,12 +881,8 @@ def get_folder_info( id = path_spec.get_partition_from_path( self.create_s3_path(max_file.bucket_name, max_file.key) ) - if id is None: - logger.warning( - f"Unable to extract partition from path {max_file.key}. Skipping..." - ) - continue + # If id is None, it means the folder is not a partition partitions.append( Folder( partition_id=id, diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_basic.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_basic.json index 47a20310d36772..6ff3925b1afb34 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_basic.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_basic.json @@ -8,8 +8,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "1", - "partitions": "{'min_partition': {'id': [('partition[0]', 'year=2022'), ('partition[1]', 'month=jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}, 'max_partition': {'id': [('partition[0]', 'year=2022'), ('partition[1]', 'month=jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "1" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", @@ -583,12 +582,12 @@ "aspect": { "json": { "minPartition": { - "partition": "partition[0]=year=2022/partition[1]=month=jan", + "partition": "partition_0=year=2022/partition_1=month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 }, "maxPartition": { - "partition": "partition[0]=year=2022/partition[1]=month=jan", + "partition": "partition_0=year=2022/partition_1=month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_keyval.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_keyval.json index 35661f1b317e70..86fa7835008dd4 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_keyval.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_keyval.json @@ -8,8 +8,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "1", - "partitions": "{'min_partition': {'id': [('partition_key[0]', 'year'), ('partition[0]', '2022'), ('partition_key[1]', 'month'), ('partition[1]', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}, 'max_partition': {'id': [('partition_key[0]', 'year'), ('partition[0]', '2022'), ('partition_key[1]', 'month'), ('partition[1]', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "1" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", @@ -583,12 +582,12 @@ "aspect": { "json": { "minPartition": { - "partition": "partition_key[0]=year/partition[0]=2022/partition_key[1]=month/partition[1]=jan", + "partition": "year=2022/month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 }, "maxPartition": { - "partition": "partition_key[0]=year/partition[0]=2022/partition_key[1]=month/partition[1]=jan", + "partition": "year=2022/month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema.json index dce9a961a9152a..95ec5e83f7991c 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema.json @@ -8,8 +8,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "1", - "partitions": "{'min_partition': {'id': [('partition[0]', 'year=2022'), ('partition[1]', 'month=jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}, 'max_partition': {'id': [('partition[0]', 'year=2022'), ('partition[1]', 'month=jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "1" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", @@ -583,12 +582,12 @@ "aspect": { "json": { "minPartition": { - "partition": "partition[0]=year=2022/partition[1]=month=jan", + "partition": "partition_0=year=2022/partition_1=month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 }, "maxPartition": { - "partition": "partition[0]=year=2022/partition[1]=month=jan", + "partition": "partition_0=year=2022/partition_1=month=jan", "createdTime": 1586847980000, "lastModifiedTime": 1586847980000 } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema_with_partition_autodetect.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema_with_partition_autodetect.json index 95dc26d0227836..4fee8fb36b13f0 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema_with_partition_autodetect.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_update_schema_with_partition_autodetect.json @@ -774,8 +774,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "1", - "partitions": "{'min_partition': {'id': [('year', '2022'), ('month', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}, 'max_partition': {'id': [('year', '2022'), ('month', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "1" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_all.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_all.json index ec7bcff31e8412..13ba57f5be6717 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_all.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_all.json @@ -774,8 +774,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "7", - "partitions": "{'min_partition': {'id': [('year', '2019'), ('month', 'feb')], 'creation_time': '2020-04-14 07:04:20+00:00', 'modification_time': '2020-04-14 07:04:30+00:00', 'size': '29010', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=feb/part2.json'}, 'max_partition': {'id': [('year', '2022'), ('month', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "7" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_min_max.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_min_max.json index 301f7b1ca9dbf2..9f45583f1d3eca 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_min_max.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_folder_partition_with_partition_autodetect_traverse_min_max.json @@ -774,8 +774,7 @@ "json": { "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json", - "number_of_partitions": "2", - "partitions": "{'min_partition': {'id': [('year', '2019'), ('month', 'feb')], 'creation_time': '2020-04-14 07:04:20+00:00', 'modification_time': '2020-04-14 07:04:30+00:00', 'size': '29010', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=feb/part2.json'}, 'max_partition': {'id': [('year', '2022'), ('month', 'jan')], 'creation_time': '2020-04-14 07:06:20+00:00', 'modification_time': '2020-04-14 07:06:20+00:00', 'size': '14540', 'sample_file': 's3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json'}}" + "number_of_partitions": "2" }, "name": "folder_aaa.pokemon_abilities_json", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/sources/s3/folder_no_partition.json b/metadata-ingestion/tests/integration/s3/sources/s3/folder_no_partition.json index c06e411005399e..db3eaadf300407 100644 --- a/metadata-ingestion/tests/integration/s3/sources/s3/folder_no_partition.json +++ b/metadata-ingestion/tests/integration/s3/sources/s3/folder_no_partition.json @@ -2,6 +2,7 @@ "type": "s3", "config": { "env": "UAT", + "sort_schema_fields": true, "path_specs": [{ "include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/*.*" }],