From 92a41a9c3a5722aceac7de902392464bcb8516d6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 11 Oct 2022 12:31:07 -0700 Subject: [PATCH] feat(ingest): include raw s3 paths if s3 source --- .../src/datahub/ingestion/source/s3/source.py | 2 ++ .../golden_mces_file_without_extension.json | 3 ++- .../s3/golden_mces_multiple_files.json | 21 ++++++++++++------- .../golden_mces_multiple_spec_for_files.json | 6 ++++-- .../s3/golden_mces_single_file.json | 3 ++- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index bcc611ef77329..6070e5ddf8dde 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -543,6 +543,8 @@ def ingest_table( "number_of_files": str(table_data.number_of_files), "size_in_bytes": str(table_data.size_in_bytes), } + if table_data.is_s3: + customProperties["table_path"] = str(table_data.table_path) dataset_properties = DatasetPropertiesClass( description="", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json index 75b3a72aa4eb3..6f3261b96a277 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json @@ -8,7 +8,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "172" + "size_in_bytes": "172", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small" }, "name": "small", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index aba52da527be6..c1df865bb00c6 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -8,7 +8,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "3575" + "size_in_bytes": "3575", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv" }, "name": "NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv", "description": "", @@ -707,7 +708,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "1024", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro" }, "name": "chord_progressions_avro.avro", "description": "", @@ -828,7 +830,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "604" + "size_in_bytes": "604", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv" }, "name": "chord_progressions_csv.csv", "description": "", @@ -949,7 +952,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "4646" + "size_in_bytes": "4646", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json" }, "name": "countries_json.json", "description": "", @@ -1046,7 +1050,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "4206" + "size_in_bytes": "4206", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet" }, "name": "food_parquet.parquet", "description": "", @@ -1167,7 +1172,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "172" + "size_in_bytes": "172", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv" }, "name": "small.csv", "description": "", @@ -1288,7 +1294,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "34056" + "size_in_bytes": "34056", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv" }, "name": "wa_fn_usec_hr_employee_attrition_csv.csv", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json index 95e18b6e1f871..18c4596001d0c 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json @@ -8,7 +8,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "1024", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro" }, "name": "chord_progressions_avro.avro", "description": "", @@ -407,7 +408,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "604" + "size_in_bytes": "604", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv" }, "name": "chord_progressions_csv.csv", "description": "", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json index d8825802ebb18..255a1ae95eeb6 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json @@ -8,7 +8,8 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "1024", + "table_path": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro" }, "name": "chord_progressions_avro.avro", "description": "",