From 782d33db994e3638722a78286e50c7418cf43ef5 Mon Sep 17 00:00:00 2001 From: AvaniSiddhapuraAPT <156416042+AvaniSiddhapuraAPT@users.noreply.github.com> Date: Tue, 5 Mar 2024 21:00:09 +0530 Subject: [PATCH] fix(ingest/bigquery): escape special characters for table descriptions (#9932) --- .../ingestion/source/bigquery_v2/bigquery.py | 7 ++- .../source/bigquery_v2/bigquery_helper.py | 19 +++++++ .../bigquery_v2/bigquery_mcp_golden.json | 55 +++++++++++++------ .../unit/test_bigqueryv2_usage_source.py | 36 ++++++++++++ 4 files changed, 98 insertions(+), 19 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c36b150d3220f9..bcc0aa50ed22e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -40,6 +40,9 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, @@ -1073,7 +1076,9 @@ def gen_dataset_workunits( dataset_properties = DatasetProperties( name=datahub_dataset_name.get_table_display_name(), - description=table.comment, + description=unquote_and_decode_unicode_escape_seq(table.comment) + if table.comment + else "", qualifiedName=str(datahub_dataset_name), created=( TimeStamp(time=int(table.created.timestamp() * 1000)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py new file mode 100644 index 
import re
from typing import Optional

# Backslash escapes understood by Python's "unicode-escape" codec. Only
# well-formed sequences are matched, so stray backslashes (for example a
# Windows path such as ``C:\users``) are left untouched instead of raising.
_ESCAPE_SEQUENCE_RE = re.compile(
    r"\\(?:"
    r"U[0-9a-fA-F]{8}"  # \UXXXXXXXX
    r"|u[0-9a-fA-F]{4}"  # \uXXXX
    r"|x[0-9a-fA-F]{2}"  # \xXX
    r"|N\{[^}]+\}"  # \N{CHARACTER NAME}
    r"|[0-7]{1,3}"  # octal
    r"|[\\'\"abfnrtv\n]"  # single-character escapes and line continuation
    r")"
)


def _decode_escape_match(match: "re.Match[str]") -> str:
    """Decode one matched escape sequence, keeping it verbatim if it is invalid."""
    token = match.group(0)
    try:
        return token.encode("ascii").decode("unicode-escape")
    except UnicodeError:
        # e.g. an unknown \N{...} name or an out-of-range \U code point.
        return token


def unquote_and_decode_unicode_escape_seq(
    string: str,
    leading_quote: str = '"',
    trailing_quote: Optional[str] = None,
) -> str:
    """
    Strip one pair of surrounding quotes (if present) and decode escape sequences.

    Args:
        string: The text to clean.
        leading_quote: Quote expected at the start of ``string``.
        trailing_quote: Quote expected at the end of ``string``. Falls back to
            ``leading_quote`` when ``None`` or empty.

    Returns:
        ``string`` without its enclosing quotes, with well-formed backslash
        escape sequences (``\\uXXXX``, ``\\n``, ...) replaced by the characters
        they denote. Escape decoding happens whether or not quotes were removed.
        Malformed escape sequences and non-ASCII characters are passed through
        unchanged.
    """
    trailing_quote = trailing_quote or leading_quote

    # Slice by the real quote lengths so multi-character (or empty) quotes work,
    # and require room for both quotes so a lone '"' is not mistaken for a
    # quoted empty string.
    if (
        len(string) >= len(leading_quote) + len(trailing_quote)
        and string.startswith(leading_quote)
        and string.endswith(trailing_quote)
    ):
        string = string[len(leading_quote) : len(string) - len(trailing_quote)]

    # Decode escape sequences one at a time. Running the whole string through
    # ``encode().decode("unicode-escape")`` would reinterpret the UTF-8 bytes of
    # non-ASCII characters as Latin-1 and corrupt them.
    return _ESCAPE_SEQUENCE_RE.sub(_decode_escape_match, string)
"bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -100,7 +105,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +121,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +137,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -147,7 +155,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -162,7 +171,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -182,7 +192,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -197,7 +208,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -229,7 +241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -243,12 +256,14 @@ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", "name": "table-1", "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", "tags": [] } }, "systemMetadata": { 
"lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -263,7 +278,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -279,7 +295,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +313,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -320,7 +338,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 80f9ab927f887b..8a3fa5ca46ea4a 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -8,6 +8,9 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.sql_parsing.schema_resolver import SchemaResolver @@ -176,3 +179,36 @@ def test_bigquery_table_sanitasitation(): assert table_identifier.dataset == "dataset-4567" assert table_identifier.table == "foo_2016*" assert table_identifier.get_table_display_name() == "foo" + + +def 
test_unquote_and_decode_unicode_escape_seq(): + + # Test with a string that starts and ends with quotes and has Unicode escape sequences + input_string = '"Hello \\u003cWorld\\u003e"' + expected_output = "Hello " + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that does not start and end with quotes + input_string = "Hello \\u003cWorld\\u003e" + expected_output = "Hello " + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with an empty string + input_string = "" + expected_output = "" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that does not have Unicode escape sequences + input_string = "No escape sequences here" + expected_output = "No escape sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that starts and ends with quotes but does not have escape sequences + input_string = '"No escape sequences here"' + expected_output = "No escape sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output