From d7df6e902541a5fae8e90c65634ba1cad42ad827 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:48:41 +1000 Subject: [PATCH 01/88] Spelling errors in code --- metadata-ingestion-modules/airflow-plugin/setup.py | 2 +- .../src/datahub_airflow_plugin/datahub_plugin.py | 2 +- .../examples/bootstrap_data/business_glossary.yml | 2 +- .../examples/library/data_quality_mcpw_rest.py | 2 +- .../src/datahub/api/entities/datajob/dataflow.py | 2 +- .../src/datahub/api/entities/datajob/datajob.py | 4 ++-- .../api/entities/dataprocess/dataprocess_instance.py | 10 +++++----- metadata-ingestion/src/datahub/cli/cli_utils.py | 2 +- metadata-ingestion/src/datahub/cli/ingest_cli.py | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 8ca4cad470f67e..85b0a4553d7a52 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -21,7 +21,7 @@ def get_long_description(): base_requirements = { - # Compatability. + # Compatibility. "dataclasses>=0.6; python_version < '3.7'", "typing_extensions>=3.10.0.2", "mypy_extensions>=0.4.3", diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index c893ff61cb9ec3..6d8bb56791b3e2 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -215,7 +215,7 @@ def datahub_on_success_callback(context, *args, **kwargs): for inlet in inlets: datajob.inlets.append(inlet.urn) - # We have to use _oulets because outlets is empty + # We have to use _outlets because outlets is empty for outlet in task._outlets: datajob.outlets.append(outlet.urn) diff --git a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml index 6669d393b7211d..71fd59bbccc462 100644 --- a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml +++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml @@ -40,7 +40,7 @@ nodes: inherits: - Classification.Sensitive - name: ClientsAndAccounts - description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities + description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparts identities owners: groups: - finance diff --git a/metadata-ingestion/examples/library/data_quality_mcpw_rest.py b/metadata-ingestion/examples/library/data_quality_mcpw_rest.py index 7672d634f58468..077ca550e880eb 100644 --- a/metadata-ingestion/examples/library/data_quality_mcpw_rest.py +++ b/metadata-ingestion/examples/library/data_quality_mcpw_rest.py @@ -47,7 +47,7 @@ def emitAssertionResult(assertionResult: AssertionRunEvent) -> None: aspect=assertionResult, ) - # Emit BatchAssertion Result! (timseries aspect) + # Emit BatchAssertion Result! 
(timeseries aspect) emitter.emit_mcp(dataset_assertionRunEvent_mcp) diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index c0378d554d5d31..588e66f19a0ef1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -142,7 +142,7 @@ def emit( """ Emit the DataFlow entity to Datahub - :param emitter: Datahub Emitter to emit the proccess event + :param emitter: Datahub Emitter to emit the process event :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ for mcp in self.generate_mcp(): diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 329eca7d9cd44b..ec86ad80226312 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -52,7 +52,7 @@ class DataJob: properties Dict[str, str]: Custom properties to set for the DataProcessInstance url (Optional[str]): Url which points to the DataJob at the orchestrator inlets (List[str]): List of urns the DataProcessInstance consumes - outlest (List[str]): List of urns the DataProcessInstance produces + outlets (List[str]): List of urns the DataProcessInstance produces input_datajob_urns: List[DataJobUrn] = field(default_factory=list) """ @@ -179,7 +179,7 @@ def emit( """ Emit the DataJob entity to Datahub - :param emitter: Datahub Emitter to emit the proccess event + :param emitter: Datahub Emitter to emit the process event :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used :rtype: None """ diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index 859e5700a51c3b..9b107d701ab02d 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -55,7 +55,7 @@ class DataProcessInstance: template_urn (Optional[Union[DataJobUrn, DataFlowUrn]]): The parent DataJob or DataFlow which was instantiated if applicable parent_instance (Optional[DataProcessInstanceUrn]): The parent execution's urn if applicable properties Dict[str, str]: Custom properties to set for the DataProcessInstance - url (Optional[str]): Url which points to the exection at the orchestrator + url (Optional[str]): Url which points to the execution at the orchestrator inlets (List[str]): List of entities the DataProcessInstance consumes outlets (List[str]): List of entities the DataProcessInstance produces """ @@ -118,10 +118,10 @@ def emit_process_start( """ :rtype: None - :param emitter: Datahub Emitter to emit the proccess event + :param emitter: Datahub Emitter to emit the process event :param start_timestamp_millis: (int) the execution start time in milliseconds :param attempt: the number of attempt of the execution with the same execution id - :param emit_template: (bool) If it is set the template of the execution (datajob, datflow) will be emitted as well. + :param emit_template: (bool) If it is set the template of the execution (datajob, dataflow) will be emitted as well. 
:param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ if emit_template and self.template_urn is not None: @@ -312,8 +312,8 @@ def from_datajob( :param datajob: (DataJob) the datajob from generate the DataProcessInstance :param id: (str) the id for the DataProcessInstance - :param clone_inlets: (bool) wheather to clone datajob's inlets - :param clone_outlets: (bool) wheather to clone datajob's outlets + :param clone_inlets: (bool) whether to clone datajob's inlets + :param clone_outlets: (bool) whether to clone datajob's outlets :return: DataProcessInstance """ dpi: DataProcessInstance = DataProcessInstance( diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index ffd6da2b1c3399..81dbae79d7354f 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -130,7 +130,7 @@ def get_details_from_config(): gms_token = gms_config.token return gms_host, gms_token except yaml.YAMLError as exc: - click.secho(f"{DATAHUB_CONFIG_PATH} malformatted, error: {exc}", bold=True) + click.secho(f"{DATAHUB_CONFIG_PATH} malformed, error: {exc}", bold=True) return None, None diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 55d3a0cdc04760..264a7bf0fed723 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -79,7 +79,7 @@ def ingest() -> None: type=bool, is_flag=True, default=False, - help="Supress display of variable values in logs by supressing elaborae stacktrace (stackprinter) during ingestion failures", + help="Suppress display of variable values in logs by suppressing elaborate stacktrace (stackprinter) during ingestion failures", ) @click.pass_context @upgrade.check_upgrade From bb26c5c9a3c343c5b788f047b347c3837176ee60 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:48:50 +1000 Subject: [PATCH 02/88] Use pathlib --- metadata-ingestion-modules/airflow-plugin/setup.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 85b0a4553d7a52..6d26cf4498bdf9 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -1,4 +1,5 @@ import os +import pathlib import sys from typing import Dict, Set @@ -14,10 +15,7 @@ def get_long_description(): root = os.path.dirname(__file__) - with open(os.path.join(root, "README.md")) as f: - description = f.read() - - return description + return pathlib.Path(os.path.join(root, "README.md")).read_text() base_requirements = { From 0c72919d7670f54b90ec78a63ebaf7f306ab2c69 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:48:58 +1000 Subject: [PATCH 03/88] Spelling --- metadata-ingestion/src/datahub/ingestion/api/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index 9b3f35ae9d8116..eafbe14106fd23 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -56,7 +56,7 @@ class SupportStatus(Enum): """ INCUBATING = auto() """ - Incubating Sources are ready for DataHub Community adoption but have not been tested for a wide 
variety of edge-cases. We eagerly solicit feedback from the Community to streghten the connector; minor version changes may arise in future releases. + Incubating Sources are ready for DataHub Community adoption but have not been tested for a wide variety of edge-cases. We eagerly solicit feedback from the Community to strengthen the connector; minor version changes may arise in future releases. """ TESTING = auto() """ From 7eb7aa7e41a0062f4c422d638d10339c243d6e04 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:49:10 +1000 Subject: [PATCH 04/88] Found bug in code --- metadata-ingestion/src/datahub/emitter/mcp_builder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 7aed2e29137492..868916fda2c810 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -233,6 +233,7 @@ def gen_containers( def add_dataset_to_container( + # FIXME: Union requires two or more type arguments container_key: KeyType, dataset_urn: str ) -> Iterable[Union[MetadataWorkUnit]]: container_urn = make_container_urn( From fb069eee4ff55f82358693eee5c636e0e24a4962 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:49:28 +1000 Subject: [PATCH 05/88] Spelling and declare as TODO --- metadata-ingestion/src/datahub/configuration/kafka.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/configuration/kafka.py b/metadata-ingestion/src/datahub/configuration/kafka.py index 876db21086e88e..e752285cdde2af 100644 --- a/metadata-ingestion/src/datahub/configuration/kafka.py +++ b/metadata-ingestion/src/datahub/configuration/kafka.py @@ -27,7 +27,7 @@ def bootstrap_host_colon_port_comma(cls, val: str) -> str: else: host = entry assert re.match( - # This regex is quite loose. Many invalid hostnames or IPs will slip through, + # TODO: This regex is quite loose. Many invalid hostname's or IPs will slip through, # but it serves as a good first line of validation. We defer to Kafka for the # remaining validation. 
r"^[\w\-\.\:]+$", From d4c36a096c60006831ca8dfe5a9d0a100d10511d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:51:53 +1000 Subject: [PATCH 06/88] Test assertions to pass --- .../airflow-plugin/tests/integration/integration_test_dummy.py | 2 +- .../airflow-plugin/tests/unit/test_dummy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py index f4f53619168f89..10cf3ad0a608ae 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py @@ -1,2 +1,2 @@ def test_dummy(): - assert True + pass diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py index f4f53619168f89..10cf3ad0a608ae 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py @@ -1,2 +1,2 @@ def test_dummy(): - assert True + pass From 97617d3c44c982bcde6b1d12c7726445c63a20b4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:52:02 +1000 Subject: [PATCH 07/88] use contextlib --- .../src/datahub_airflow_plugin/datahub_plugin.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index 6d8bb56791b3e2..f7b384c1d32390 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -1,3 +1,4 @@ +import contextlib import traceback from typing import Any, Iterable @@ -389,13 +390,10 @@ def _patch_policy(settings): def _patch_datahub_policy(): - try: + with contextlib.suppress(ImportError): import airflow_local_settings _patch_policy(airflow_local_settings) - except ImportError: - pass - from airflow.models.dagbag import settings _patch_policy(settings) From 2774b73d4dcf54331eb9c9335dd7fbfe90e8b19f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:52:20 +1000 Subject: [PATCH 08/88] dataset add col if/else simplification --- .../library/dataset_add_column_tag.py | 19 +++++++++---------- .../library/dataset_add_column_term.py | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/metadata-ingestion/examples/library/dataset_add_column_tag.py b/metadata-ingestion/examples/library/dataset_add_column_tag.py index f5243ce28a5f01..a457d12f493ae0 100644 --- a/metadata-ingestion/examples/library/dataset_add_column_tag.py +++ b/metadata-ingestion/examples/library/dataset_add_column_tag.py @@ -23,18 +23,17 @@ def get_simple_field_path_from_v2_field_path(field_path: str) -> str: """A helper function to extract simple . 
path notation from the v2 field path""" - if field_path.startswith("[version=2.0]"): - # this is a v2 field path - tokens = [ - t - for t in field_path.split(".") - if not (t.startswith("[") or t.endswith("]")) - ] - path = ".".join(tokens) - return path - else: + if not field_path.startswith("[version=2.0]"): # not a v2, we assume this is a simple path return field_path + # this is a v2 field path + tokens = [ + t + for t in field_path.split(".") + if not (t.startswith("[") or t.endswith("]")) + ] + + return ".".join(tokens) # Inputs -> the column, dataset and the tag to set diff --git a/metadata-ingestion/examples/library/dataset_add_column_term.py b/metadata-ingestion/examples/library/dataset_add_column_term.py index ff1cad48a9f0c0..ea5cd1f632f743 100644 --- a/metadata-ingestion/examples/library/dataset_add_column_term.py +++ b/metadata-ingestion/examples/library/dataset_add_column_term.py @@ -23,18 +23,17 @@ def get_simple_field_path_from_v2_field_path(field_path: str) -> str: """A helper function to extract simple . path notation from the v2 field path""" - if field_path.startswith("[version=2.0]"): - # this is a v2 field path - tokens = [ - t - for t in field_path.split(".") - if not (t.startswith("[") or t.endswith("]")) - ] - path = ".".join(tokens) - return path - else: + if not field_path.startswith("[version=2.0]"): # not a v2, we assume this is a simple path return field_path + # this is a v2 field path + tokens = [ + t + for t in field_path.split(".") + if not (t.startswith("[") or t.endswith("]")) + ] + + return ".".join(tokens) # Inputs -> the column, dataset and the term to set From 387b9d3eda977bdd653bb7b36fbdb2547b716464 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:52:40 +1000 Subject: [PATCH 09/88] lineage emitter UTC timezone --- .../lineage_job_dataflow_new_api_simple.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py b/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py index 7212282156d8b9..d339d35110db1d 100644 --- a/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py +++ b/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py @@ -8,6 +8,7 @@ ) from datahub.emitter.rest_emitter import DatahubRestEmitter +from datetime import timezone emitter = DatahubRestEmitter("http://localhost:8080") jobFlow = DataFlow(cluster="prod", orchestrator="airflow", id="flow_api_simple") @@ -36,40 +37,39 @@ jobFlowRun = DataProcessInstance.from_dataflow( dataflow=jobFlow, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -jobFlowRun.emit_process_start(emitter, int(datetime.utcnow().timestamp() * 1000)) +jobFlowRun.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) + jobRun = DataProcessInstance.from_datajob( datajob=dataJob, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -jobRun.emit_process_start(emitter, int(datetime.utcnow().timestamp() * 1000)) -jobRun.emit_process_end( - emitter, int(datetime.utcnow().timestamp() * 1000), result=InstanceRunResult.SUCCESS -) +jobRun.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) + +jobRun.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) + job2Run = DataProcessInstance.from_datajob( datajob=dataJob2, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -job2Run.emit_process_start(emitter, int(datetime.utcnow().timestamp() * 1000)) -job2Run.emit_process_end( - emitter, 
int(datetime.utcnow().timestamp() * 1000), result=InstanceRunResult.SUCCESS -) +job2Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) + +job2Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) + job3Run = DataProcessInstance.from_datajob( datajob=dataJob3, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -job3Run.emit_process_start(emitter, int(datetime.utcnow().timestamp() * 1000)) -job3Run.emit_process_end( - emitter, int(datetime.utcnow().timestamp() * 1000), result=InstanceRunResult.SUCCESS -) +job3Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) + +job3Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) + job4Run = DataProcessInstance.from_datajob( datajob=dataJob4, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -job4Run.emit_process_start(emitter, int(datetime.utcnow().timestamp() * 1000)) -job4Run.emit_process_end( - emitter, int(datetime.utcnow().timestamp() * 1000), result=InstanceRunResult.SUCCESS -) +job4Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) -jobFlowRun.emit_process_end( - emitter, int(datetime.utcnow().timestamp() * 1000), result=InstanceRunResult.SUCCESS -) +job4Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) + + +jobFlowRun.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) From de786c332d547a3ef98ef1ddadae1f4dbf61cb3e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:52:43 +1000 Subject: [PATCH 10/88] Update lineage_emitter_mcpw_rest.py --- .../examples/library/lineage_emitter_mcpw_rest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py b/metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py index 11f73d36cb29d9..d1c934cba40409 100644 --- a/metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py +++ b/metadata-ingestion/examples/library/lineage_emitter_mcpw_rest.py @@ -10,13 +10,11 @@ ) from datahub.metadata.schema_classes import ChangeTypeClass -# Construct upstream tables. 
-upstream_tables: List[UpstreamClass] = [] upstream_table_1 = UpstreamClass( dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"), type=DatasetLineageTypeClass.TRANSFORMED, ) -upstream_tables.append(upstream_table_1) +upstream_tables: List[UpstreamClass] = [upstream_table_1] upstream_table_2 = UpstreamClass( dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"), type=DatasetLineageTypeClass.TRANSFORMED, From d95e8baac5525886a6ed870c4303a906595263e0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:52:52 +1000 Subject: [PATCH 11/88] Update custom_transform_example.py --- .../examples/transforms/custom_transform_example.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/examples/transforms/custom_transform_example.py b/metadata-ingestion/examples/transforms/custom_transform_example.py index 85663d971092b5..4a3d16d4a4dd94 100644 --- a/metadata-ingestion/examples/transforms/custom_transform_example.py +++ b/metadata-ingestion/examples/transforms/custom_transform_example.py @@ -61,13 +61,8 @@ def transform_aspect( # type: ignore assert aspect is None or isinstance(aspect, OwnershipClass) if owners_to_add: - ownership = ( - aspect - if aspect - else OwnershipClass( - owners=[], - ) - ) + ownership = aspect or OwnershipClass(owners=[],) + ownership.owners.extend(owners_to_add) return ownership From 20f770ebed5566fbdadbde4bc664b39eb280f958 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:53:22 +1000 Subject: [PATCH 12/88] path read over open --- metadata-ingestion/scripts/avro_codegen.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 05d8fb1c5804c6..e5758d159bdee5 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -10,11 +10,8 @@ def load_schema_file(schema_file: str) -> str: - with open(schema_file) as f: - raw_schema_text = f.read() - - redo_spaces = json.dumps(json.loads(raw_schema_text), indent=2) - return redo_spaces + raw_schema_text = Path(schema_file).read_text() + return json.dumps(json.loads(raw_schema_text), indent=2) def merge_schemas(schemas: List[str]) -> str: From 2b1b6b9d378a28bb7add2a6838e18d553a95839d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:53:52 +1000 Subject: [PATCH 13/88] spelling --- metadata-ingestion/src/datahub/cli/cli_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index 81dbae79d7354f..0e0c1c9c430adb 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -200,7 +200,7 @@ def test_connection(): def test_connectivity_complain_exit(operation_name: str) -> None: - """Test connectivty to metadata-service, log operation name and exit""" + """Test connectivity to metadata-service, log operation name and exit""" # First test connectivity try: test_connection() From 5562bfe680598ee012674ae862941d01d25db0b0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 08:54:05 +1000 Subject: [PATCH 14/88] escape strings --- metadata-ingestion/src/datahub/cli/cli_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index 0e0c1c9c430adb..817ea500e7c093 
100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -194,7 +194,7 @@ def get_session_and_host(): def test_connection(): (session, host) = get_session_and_host() - url = host + "/config" + url = f"{host}/config" response = session.get(url) response.raise_for_status() @@ -556,7 +556,7 @@ def get_entity( endpoint: str = f"/entitiesV2/{encoded_urn}" if aspect and len(aspect): - endpoint = endpoint + "?aspects=List(" + ",".join(aspect) + ")" + endpoint = f"{endpoint}?aspects=List(" + ",".join(aspect) + ")" response = session.get(gms_host + endpoint) response.raise_for_status() From 2952cefb369364b6f22131117c24422b4202982d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:39 +1000 Subject: [PATCH 15/88] Update cli_utils.py --- .../src/datahub/cli/cli_utils.py | 48 ++++--------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index 817ea500e7c093..84ee3aa8513a31 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -297,10 +297,7 @@ def post_delete_references_endpoint( path: str, cached_session_host: Optional[Tuple[Session, str]] = None, ) -> Tuple[int, List[Dict]]: - if not cached_session_host: - session, gms_host = get_session_and_host() - else: - session, gms_host = cached_session_host + session, gms_host = cached_session_host or get_session_and_host() url = gms_host + path payload = json.dumps(payload_obj) @@ -316,10 +313,7 @@ def post_delete_endpoint( path: str, cached_session_host: Optional[Tuple[Session, str]] = None, ) -> typing.Tuple[str, int]: - if not cached_session_host: - session, gms_host = get_session_and_host() - else: - session, gms_host = cached_session_host + session, gms_host = cached_session_host or get_session_and_host() url = gms_host + path return post_delete_endpoint_with_session_and_url(session, url, payload_obj) @@ -369,9 +363,7 @@ def get_urns_by_filter( "condition": "EQUAL", } ) - if platform is not None and ( - entity_type_lower == "chart" or entity_type_lower == "dashboard" - ): + if platform is not None and entity_type_lower in {"chart", "dashboard"}: filter_criteria.append( { "field": "tool", @@ -479,10 +471,7 @@ def batch_get_ids( session, gms_host = get_session_and_host() endpoint: str = "/entitiesV2" url = gms_host + endpoint - ids_to_get = [] - for id in ids: - ids_to_get.append(Urn.url_encode(id)) - + ids_to_get = [Urn.url_encode(id) for id in ids] response = session.get( f"{url}?ids=List({','.join(ids_to_get)})", ) @@ -539,11 +528,7 @@ def get_entity( aspect: Optional[List] = None, cached_session_host: Optional[Tuple[Session, str]] = None, ) -> Dict: - if not cached_session_host: - session, gms_host = get_session_and_host() - else: - session, gms_host = cached_session_host - + session, gms_host = cached_session_host or get_session_and_host() if urn.startswith("urn%3A"): # we assume the urn is already encoded encoded_urn: str = urn @@ -570,11 +555,7 @@ def post_entity( aspect_value: Dict, cached_session_host: Optional[Tuple[Session, str]] = None, ) -> Dict: - if not cached_session_host: - session, gms_host = get_session_and_host() - else: - session, gms_host = cached_session_host - + session, gms_host = cached_session_host or get_session_and_host() endpoint: str = "/aspects/?action=ingestProposal" proposal = { @@ -671,10 +652,7 @@ def get_latest_timeseries_aspect_values( timeseries_aspect_name: 
str, cached_session_host: Optional[Tuple[Session, str]], ) -> Dict: - if not cached_session_host: - session, gms_host = get_session_and_host() - else: - session, gms_host = cached_session_host + session, gms_host = cached_session_host or get_session_and_host() query_body = { "urn": entity_urn, "entity": guess_entity_type(entity_urn), @@ -725,14 +703,8 @@ def get_aspects_for_entity( aspect_value["aspect"]["value"] = json.loads( aspect_value["aspect"]["value"] ) - aspect_list.update( - # Follow the convention used for non-timeseries aspects. - { - aspect_cls.RECORD_SCHEMA.fullname.replace( - "pegasus2avro.", "" - ): aspect_value - } - ) + aspect_list[aspect_cls.RECORD_SCHEMA.fullname.replace("pegasus2avro.", "")] = aspect_value + aspect_map: Dict[str, Union[dict, DictWrapper]] = {} for a in aspect_list.values(): @@ -756,4 +728,4 @@ def get_aspects_for_entity( if aspects: return {k: v for (k, v) in aspect_map.items() if k in aspects} else: - return {k: v for (k, v) in aspect_map.items()} + return dict(aspect_map) From ef6b5bc5b18850af8bf807e624dea976115b1f00 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:41 +1000 Subject: [PATCH 16/88] Update delete_cli.py --- .../src/datahub/cli/delete_cli.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index 1ca0ac864693b1..ae8574b89c9695 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -234,16 +234,16 @@ def delete_with_filters( logger.info(f"datahub configured with {gms_host}") emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token) batch_deletion_result = DeletionResult() - urns = [ - u - for u in cli_utils.get_urns_by_filter( + urns = list( + cli_utils.get_urns_by_filter( env=env, platform=platform, search_query=search_query, entity_type=entity_type, include_removed=include_removed, ) - ] + ) + logger.info( f"Filter matched {len(urns)} entities. 
Sample: {choices(urns, k=min(5, len(urns)))}" ) @@ -284,12 +284,12 @@ def _delete_one_urn( if soft: # Add removed aspect - if not cached_emitter: + if cached_emitter: + emitter = cached_emitter + else: _, gms_host = cli_utils.get_session_and_host() token = cli_utils.get_token() emitter = rest_emitter.DatahubRestEmitter(gms_server=gms_host, token=token) - else: - emitter = cached_emitter if not dry_run: emitter.emit_mcp( MetadataChangeProposalWrapper( @@ -305,18 +305,17 @@ def _delete_one_urn( ) else: logger.info(f"[Dry-run] Would soft-delete {urn}") + elif not dry_run: + payload_obj = {"urn": urn} + urn, rows_affected = cli_utils.post_delete_endpoint( + payload_obj, + "/entities?action=delete", + cached_session_host=cached_session_host, + ) + deletion_result.num_records = rows_affected else: - if not dry_run: - payload_obj = {"urn": urn} - urn, rows_affected = cli_utils.post_delete_endpoint( - payload_obj, - "/entities?action=delete", - cached_session_host=cached_session_host, - ) - deletion_result.num_records = rows_affected - else: - logger.info(f"[Dry-run] Would hard-delete {urn}") - deletion_result.num_records = UNKNOWN_NUM_RECORDS # since we don't know how many rows will be affected + logger.info(f"[Dry-run] Would hard-delete {urn}") + deletion_result.num_records = UNKNOWN_NUM_RECORDS # since we don't know how many rows will be affected deletion_result.end() return deletion_result From 0c92e2b64fc2140b5f350c7671129898fee13a74 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:45 +1000 Subject: [PATCH 17/88] Update docker_check.py --- metadata-ingestion/src/datahub/cli/docker_check.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index e530f4d19616f3..005651d673df36 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -88,10 +88,9 @@ def check_local_docker_containers(preflight_only: bool = False) -> List[str]: if len(containers) == 0: issues.append("quickstart.sh or dev.sh is not running") else: - existing_containers = set(container.name for container in containers) + existing_containers = {container.name for container in containers} missing_containers = set(REQUIRED_CONTAINERS) - existing_containers - for missing in missing_containers: - issues.append(f"{missing} container is not present") + issues.extend(f"{missing} container is not present" for missing in missing_containers) # Check that the containers are running and healthy. 
for container in containers: From 62a6dd87c93b61f5b7bddfdd10cb3a4d26e930b1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:50 +1000 Subject: [PATCH 18/88] Escape --- metadata-ingestion/src/datahub/cli/ingest_cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 264a7bf0fed723..ecb7d80e30fcfa 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -313,14 +313,14 @@ def rollback( current_time = now.strftime("%Y-%m-%d %H:%M:%S") try: - folder_name = report_dir + "/" + current_time + folder_name = f"{report_dir}/{current_time}" - ingestion_config_file_name = folder_name + "/config.json" + ingestion_config_file_name = f"{folder_name}/config.json" os.makedirs(os.path.dirname(ingestion_config_file_name), exist_ok=True) with open(ingestion_config_file_name, "w") as file_handle: json.dump({"run_id": run_id}, file_handle) - csv_file_name = folder_name + "/unsafe_entities.csv" + csv_file_name = f"{folder_name}/unsafe_entities.csv" with open(csv_file_name, "w") as file_handle: writer = csv.writer(file_handle) writer.writerow(["urn"]) @@ -329,4 +329,4 @@ def rollback( except IOError as e: print(e) - sys.exit("Unable to write reports to " + report_dir) + sys.exit(f"Unable to write reports to {report_dir}") From 26f758f66a8459574380816c10359c05f163f353 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:54 +1000 Subject: [PATCH 19/88] Update migration_utils.py --- metadata-ingestion/src/datahub/cli/migration_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/migration_utils.py b/metadata-ingestion/src/datahub/cli/migration_utils.py index b383e2849b6b9b..79546e07ac056a 100644 --- a/metadata-ingestion/src/datahub/cli/migration_utils.py +++ b/metadata-ingestion/src/datahub/cli/migration_utils.py @@ -218,8 +218,8 @@ def modify_urn_list_for_aspect( new_urn: str, ) -> DictWrapper: - if hasattr(UrnListModifier, aspect_name + "_modifier"): - modifier = getattr(UrnListModifier, aspect_name + "_modifier") + if hasattr(UrnListModifier, f"{aspect_name}_modifier"): + modifier = getattr(UrnListModifier, f"{aspect_name}_modifier") return modifier( aspect=aspect, relationship_type=relationship_type, From 03c4bf1646f8971f24a4ad35e7c5ac316de757f3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:32:57 +1000 Subject: [PATCH 20/88] Update timeline_cli.py --- .../src/datahub/cli/timeline_cli.py | 58 +++++++------------ 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/timeline_cli.py b/metadata-ingestion/src/datahub/cli/timeline_cli.py index 40c5af4e1e78a0..eec753a4af2ba0 100644 --- a/metadata-ingestion/src/datahub/cli/timeline_cli.py +++ b/metadata-ingestion/src/datahub/cli/timeline_cli.py @@ -19,46 +19,34 @@ def pretty_field_path(field_path: str) -> str: - if field_path.startswith("[version=2.0]"): + if not field_path.startswith("[version=2.0]"): + return field_path # breakpoint() # parse schema field - tokens = [ - t - for t in field_path.split(".") - if not (t.startswith("[") or t.endswith("]")) - ] - path = ".".join(tokens) - return path - else: - return field_path + tokens = [t for t in field_path.split(".") if not t.startswith("[") and not t.endswith("]")] + + return ".".join(tokens) def pretty_id(id: Optional[str]) -> str: if not id: return "" - else: - # 
breakpoint() - assert id is not None - if id.startswith("urn:li:datasetField:") or id.startswith( + # breakpoint() + assert id is not None + if id.startswith("urn:li:datasetField:") or id.startswith( "urn:li:schemaField:" ): - # parse schema field - schema_field_key = schema_field_urn_to_key( - id.replace("urn:li:datasetField", "urn:li:schemaField") - ) - if schema_field_key: - assert schema_field_key is not None - field_path = schema_field_key.fieldPath - - return f"{colored('field','cyan')}:{colored(pretty_field_path(field_path),'white')}" - if id.startswith("[version=2.0]"): - return f"{colored('field','cyan')}:{colored(pretty_field_path(id),'white')}" - - if id.startswith("urn:li:dataset"): - # parse dataset urn - dataset_key = dataset_urn_to_key(id) - if dataset_key: - return f"{colored('dataset','cyan')}:{colored(dataset_key.platform,'white')}:{colored(dataset_key.name,'white')}" + if schema_field_key := schema_field_urn_to_key(id.replace("urn:li:datasetField", "urn:li:schemaField")): + assert schema_field_key is not None + field_path = schema_field_key.fieldPath + + return f"{colored('field','cyan')}:{colored(pretty_field_path(field_path),'white')}" + if id.startswith("[version=2.0]"): + return f"{colored('field','cyan')}:{colored(pretty_field_path(id),'white')}" + + if id.startswith("urn:li:dataset"): + if dataset_key := dataset_urn_to_key(id): + return f"{colored('dataset','cyan')}:{colored(dataset_key.platform,'white')}:{colored(dataset_key.name,'white')}" # failed to prettify, return original return id @@ -194,12 +182,8 @@ def timeline( change_instant = str( datetime.fromtimestamp(change_txn["timestamp"] // 1000) ) - change_color = ( - "green" - if change_txn.get("semVerChange") == "MINOR" - or change_txn.get("semVerChange") == "PATCH" - else "red" - ) + change_color = "green" if change_txn.get("semVerChange") in ["MINOR", "PATCH"] else "red" + print( f"{colored(change_instant,'cyan')} - {colored(change_txn['semVer'],change_color)}" ) From 5ee83d9f637d00b71ba4ffb225c7eacb4e03877b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:01 +1000 Subject: [PATCH 21/88] Update common.py --- .../src/datahub/configuration/common.py | 44 +++++-------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 716572babc1e41..b9fce56d5ff60b 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -39,10 +39,7 @@ class OperationalError(PipelineExecutionError): def __init__(self, message: str, info: dict = None): self.message = message - if info: - self.info = info - else: - self.info = {} + self.info = info or {} class ConfigurationError(MetaError): @@ -104,7 +101,7 @@ class OauthConfiguration(ConfigModel): class AllowDenyPattern(ConfigModel): - """A class to store allow deny regexes""" + """A class to store allow deny regex's""" allow: List[str] = Field( default=[".*"], @@ -128,10 +125,7 @@ def alphabet_pattern(self) -> Pattern: @property def regex_flags(self) -> int: - if self.ignoreCase: - return re.IGNORECASE - else: - return 0 + return re.IGNORECASE if self.ignoreCase else 0 @classmethod def allow_all(cls) -> "AllowDenyPattern": @@ -142,23 +136,16 @@ def allowed(self, string: str) -> bool: if re.match(deny_pattern, string, self.regex_flags): return False - for allow_pattern in self.allow: - if re.match(allow_pattern, string, self.regex_flags): - return True - - return 
False + return any(re.match(allow_pattern, string, self.regex_flags) for allow_pattern in self.allow) def is_fully_specified_allow_list(self) -> bool: """ - If the allow patterns are literals and not full regexes, then it is considered + If the allow patterns are literals and not full regex's, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. """ - for allow_pattern in self.allow: - if not self.alphabet_pattern.match(allow_pattern): - return False - return True + return all(self.alphabet_pattern.match(allow_pattern) for allow_pattern in self.allow) def get_allowed_list(self) -> List[str]: """Return the list of allowed strings as a list, after taking into account deny patterns, if possible""" @@ -167,7 +154,7 @@ def get_allowed_list(self) -> List[str]: class KeyValuePattern(ConfigModel): - """A class to store allow deny regexes""" + """A class to store allow deny regex's""" rules: Dict[str, List[str]] = {".*": []} alphabet: str = "[A-Za-z0-9 _.-]" @@ -181,28 +168,19 @@ def all(cls) -> "KeyValuePattern": return KeyValuePattern() def value(self, string: str) -> List[str]: - for key in self.rules.keys(): - if re.match(key, string): - return self.rules[key] - return [] + return next((self.rules[key] for key in self.rules.keys() if re.match(key, string)), []) def matched(self, string: str) -> bool: - for key in self.rules.keys(): - if re.match(key, string): - return True - return False + return any(re.match(key, string) for key in self.rules.keys()) def is_fully_specified_key(self) -> bool: """ - If the allow patterns are literals and not full regexes, then it is considered + If the allow patterns are literals and not full regex's, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. 
""" - for key in self.rules.keys(): - if not self.alphabet_pattern.match(key): - return True - return False + return any(not self.alphabet_pattern.match(key) for key in self.rules.keys()) def get(self) -> Dict[str, List[str]]: """Return the list of allowed strings as a list, after taking into account deny patterns, if possible""" From c78fdd87707c9df2482f3947615a1ecafd503c1c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:03 +1000 Subject: [PATCH 22/88] Update import_resolver.py --- .../src/datahub/configuration/import_resolver.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/import_resolver.py b/metadata-ingestion/src/datahub/configuration/import_resolver.py index 56e232d0403241..19627c7b8c9569 100644 --- a/metadata-ingestion/src/datahub/configuration/import_resolver.py +++ b/metadata-ingestion/src/datahub/configuration/import_resolver.py @@ -8,9 +8,7 @@ def _pydantic_resolver(v: Union[T, str]) -> T: - if isinstance(v, str): - return import_path(v) - return v + return import_path(v) if isinstance(v, str) else v def pydantic_resolve_key(field: str) -> classmethod: From 14209324d2c9658f3760206f9172a98dcf79e5f2 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:07 +1000 Subject: [PATCH 23/88] Update yaml.py --- metadata-ingestion/src/datahub/configuration/yaml.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/yaml.py b/metadata-ingestion/src/datahub/configuration/yaml.py index ee710b07bab3d2..1f1172836f7448 100644 --- a/metadata-ingestion/src/datahub/configuration/yaml.py +++ b/metadata-ingestion/src/datahub/configuration/yaml.py @@ -9,5 +9,4 @@ class YamlConfigurationMechanism(ConfigurationMechanism): """Ability to load configuration from yaml files""" def load_config(self, config_fp: IO) -> dict: - config = yaml.safe_load(config_fp) - return config + return yaml.safe_load(config_fp) From 16e2c1204e05f2c5fd5c01602c04c3eb2dea69da Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:09 +1000 Subject: [PATCH 24/88] Update kafka_emitter.py --- .../src/datahub/emitter/kafka_emitter.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py index f2dc663cf0677a..001097a2e42f5b 100644 --- a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py @@ -49,12 +49,11 @@ def validate_topic_routes(cls: "KafkaEmitterConfig", values: dict) -> dict: raise ConfigurationError( "Using both topic and topic_routes configuration for Kafka is not supported. Use only topic_routes" ) - else: - logger.warning( - "Looks like you're using the deprecated `topic` configuration. Please migrate to `topic_routes`." - ) - # upgrade topic provided to topic_routes mce entry - values["topic_routes"][MCE_KEY] = values["topic"] + logger.warning( + "Looks like you're using the deprecated `topic` configuration. Please migrate to `topic_routes`." 
+ ) + # upgrade topic provided to topic_routes mce entry + values["topic_routes"][MCE_KEY] = values["topic"] return values @@ -70,8 +69,7 @@ def __init__(self, config: KafkaEmitterConfig): def convert_mce_to_dict( mce: MetadataChangeEvent, ctx: SerializationContext ) -> dict: - tuple_encoding = mce.to_obj(tuples=True) - return tuple_encoding + return mce.to_obj(tuples=True) mce_avro_serializer = AvroSerializer( schema_str=getMetadataChangeEventSchema(), @@ -83,8 +81,7 @@ def convert_mcp_to_dict( mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper], ctx: SerializationContext, ) -> dict: - tuple_encoding = mcp.to_obj(tuples=True) - return tuple_encoding + return mcp.to_obj(tuples=True) mcp_avro_serializer = AvroSerializer( schema_str=getMetadataChangeProposalSchema(), From 2b0612f1ae88c79f72f0b856c2be4461106b6097 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:13 +1000 Subject: [PATCH 25/88] Update mce_builder.py --- .../src/datahub/emitter/mce_builder.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index e6203933705cb1..19f2487f043327 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -104,8 +104,8 @@ def schema_field_urn_to_key(schema_field_urn: str) -> Optional[SchemaFieldKeyCla pattern = r"urn:li:schemaField:\((.*),(.*)\)" results = re.search(pattern, schema_field_urn) if results is not None: - dataset_urn: str = results.group(1) - field_path: str = results.group(2) + dataset_urn: str = results[1] + field_path: str = results[2] return SchemaFieldKeyClass(parent=dataset_urn, fieldPath=field_path) return None @@ -114,9 +114,7 @@ def dataset_urn_to_key(dataset_urn: str) -> Optional[DatasetKeyClass]: pattern = r"urn:li:dataset:\(urn:li:dataPlatform:(.*),(.*),(.*)\)" results = re.search(pattern, dataset_urn) if results is not None: - return DatasetKeyClass( - platform=results.group(1), name=results.group(2), origin=results.group(3) - ) + return DatasetKeyClass(platform=results[1], name=results[2], origin=results[3]) return None @@ -128,9 +126,7 @@ def container_new_urn_to_key(dataset_urn: str) -> Optional[ContainerKeyClass]: pattern = r"urn:dh:container:0:\((.*)\)" results = re.search(pattern, dataset_urn) if results is not None: - return ContainerKeyClass( - guid=results.group(1), - ) + return ContainerKeyClass(guid=results[1]) return None @@ -146,9 +142,7 @@ def container_urn_to_key(guid: str) -> Optional[ContainerKeyClass]: pattern = r"urn:li:container:(.*)" results = re.search(pattern, guid) if results is not None: - return ContainerKeyClass( - guid=results.group(1), - ) + return ContainerKeyClass(guid=results[1]) return None @@ -156,8 +150,7 @@ def datahub_guid(obj: dict) -> str: obj_str = json.dumps( pre_json_transform(obj), separators=(",", ":"), sort_keys=True ).encode("utf-8") - datahub_guid = md5(obj_str).hexdigest() - return datahub_guid + return md5(obj_str).hexdigest() def make_assertion_urn(assertion_id: str) -> str: From 85368b5f8b150ceccd93f5c15a51743c826855cc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:15 +1000 Subject: [PATCH 26/88] Update serialization_helper.py --- .../src/datahub/emitter/serialization_helper.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/emitter/serialization_helper.py 
b/metadata-ingestion/src/datahub/emitter/serialization_helper.py index 5a348ce267b10f..958c913698e442 100644 --- a/metadata-ingestion/src/datahub/emitter/serialization_helper.py +++ b/metadata-ingestion/src/datahub/emitter/serialization_helper.py @@ -16,10 +16,11 @@ def _json_transform(obj: Any, from_pattern: str, to_pattern: str) -> Any: field = obj["fieldDiscriminator"] return {field: _json_transform(obj[field], from_pattern, to_pattern)} - new_obj: Any = {} - for key, value in obj.items(): - if value is not None: - new_obj[key] = _json_transform(value, from_pattern, to_pattern) + new_obj: Any = { + key: _json_transform(value, from_pattern, to_pattern) \ + for key, value in obj.items() if value is not None + } + return new_obj elif isinstance(obj, list): new_obj = [_json_transform(item, from_pattern, to_pattern) for item in obj] From af20f707102f0dcc8b84680381ba6f1feef92260 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:17 +1000 Subject: [PATCH 27/88] Update committable.py --- metadata-ingestion/src/datahub/ingestion/api/committable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/committable.py b/metadata-ingestion/src/datahub/ingestion/api/committable.py index f1aada4477f1ab..e41eb24abc2d96 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/committable.py +++ b/metadata-ingestion/src/datahub/ingestion/api/committable.py @@ -55,7 +55,7 @@ def __init__( super(_CommittableConcrete, self).__init__(state_to_commit=state_to_commit) def has_successfully_committed(self) -> bool: - return True if not self.state_to_commit or self.committed else False + return bool(not self.state_to_commit or self.committed) @abstractmethod def get_previous_states( From 9ab0d3dff3a84dcb943825c576cf45d4a5efaca6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:19 +1000 Subject: [PATCH 28/88] Update common.py --- .../src/datahub/ingestion/api/common.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/common.py b/metadata-ingestion/src/datahub/ingestion/api/common.py index 56c21a7f39c627..fd458f9b4fd980 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/common.py +++ b/metadata-ingestion/src/datahub/ingestion/api/common.py @@ -55,8 +55,8 @@ def __init__( self.pipeline_name = pipeline_name self.dry_run_mode = dry_run self.preview_mode = preview_mode - self.reporters: Dict[str, Committable] = dict() - self.checkpointers: Dict[str, Committable] = dict() + self.reporters: Dict[str, Committable] = {} + self.checkpointers: Dict[str, Committable] = {} self._set_dataset_urn_to_lower_if_needed() def _set_dataset_urn_to_lower_if_needed(self) -> None: @@ -81,11 +81,8 @@ def register_reporter(self, committable: Committable) -> None: self.reporters[committable.name] = committable def get_reporters(self) -> Iterable[Committable]: - for committable in self.reporters.values(): - yield committable + yield from self.reporters.values() def get_committables(self) -> Iterable[Tuple[str, Committable]]: - for reporting_item_commitable in self.reporters.items(): - yield reporting_item_commitable - for checkpointing_item_commitable in self.checkpointers.items(): - yield checkpointing_item_commitable + yield from self.reporters.items() + yield from self.checkpointers.items() From daff0e490ceea7573bd4e2b9b0aca4c2458bcd1c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 16:33:21 +1000 Subject: [PATCH 29/88] Update decorators.py --- 
metadata-ingestion/src/datahub/ingestion/api/decorators.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index eafbe14106fd23..7666a4f52a2271 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -34,11 +34,8 @@ def platform_name( def wrapper(cls: Type) -> Type: setattr(cls, "get_platform_name", lambda: platform_name) - setattr( - cls, - "get_platform_id", - lambda: id if id else platform_name.lower().replace(" ", "-"), - ) + setattr(cls, "get_platform_id", lambda: id or platform_name.lower().replace(" ", "-")) + return cls if id and " " in id: From 6a79b7f53beeca7fdd767a3c723cd55e370f61ae Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:32 +1000 Subject: [PATCH 30/88] Update urn.py --- .../src/datahub/utilities/urns/urn.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 7498cc1532c66e..7ae6d37472621a 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -21,7 +21,7 @@ class Urn: def __init__( self, entity_type: str, entity_id: List[str], urn_domain: str = LI_DOMAIN ): - if len(entity_id) == 0: + if not entity_id: raise InvalidUrnError("Empty entity id.") self._validate_entity_type(entity_type) self._validate_entity_id(entity_id) @@ -122,9 +122,9 @@ def _get_entity_id_from_str(entity_id: str) -> List[str]: part_start = i + 1 if start_paren_count != 0: - raise InvalidUrnError(f"{entity_id}, mismtached paren nesting") + raise InvalidUrnError(f"{entity_id}, mismatched parent nesting") - parts.append(entity_id[part_start : len(entity_id) - 1]) + parts.append(entity_id[part_start:-1]) return parts @@ -151,11 +151,4 @@ def __hash__(self) -> int: return hash((self._domain, self._entity_type) + tuple(self._entity_id)) def __eq__(self, other: object) -> bool: - if not isinstance(other, Urn): - return False - - return ( - self._entity_id == other._entity_id - and self._domain == other._domain - and self._entity_type == other._entity_type - ) + return (self._entity_id == other._entity_id and self._domain == other._domain and self._entity_type == other._entity_type) if isinstance(other, Urn) else False From 3f48494f63122d1f5b0f38911358de6551bc16d6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:39 +1000 Subject: [PATCH 31/88] Update registry.py --- .../src/datahub/ingestion/api/registry.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index f83921639c227e..a8529817e2500f 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -79,16 +79,15 @@ def register_disabled( def _ensure_not_lazy(self, key: str) -> Union[Type[T], Exception]: path = self._mapping[key] - if isinstance(path, str): - try: - plugin_class = import_path(path) - self.register(key, plugin_class, override=True) - return plugin_class - except (AssertionError, ModuleNotFoundError, ImportError) as e: - self.register_disabled(key, e, override=True) - return e - else: + if not isinstance(path, str): return path + try: + 
plugin_class = import_path(path) + self.register(key, plugin_class, override=True) + return plugin_class + except (AssertionError, ImportError) as e: + self.register_disabled(key, e, override=True) + return e def is_enabled(self, key: str) -> bool: tp = self._mapping[key] From d0b37ef5793122c5730b781879dfd72db172e4ef Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:42 +1000 Subject: [PATCH 32/88] Update protobuf_util.py --- .../src/datahub/ingestion/extractor/protobuf_util.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py index e5f976ff88dd56..51fdbd8fbdb680 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/protobuf_util.py @@ -365,11 +365,7 @@ def _schema_fields_from_dag( if generations and generations[0]: roots = generations[0] - leafs: List = [] - for node in graph: - if graph.out_degree(node) == 0: - leafs.append(node) - + leafs: List = [node for node in graph if graph.out_degree(node) == 0] type_of_nodes: Dict = nx.get_node_attributes(graph, "node_type") for root in roots: From a58dd7402e1ec086df954eab782c28472472136f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:45 +1000 Subject: [PATCH 33/88] Update datahub_ingestion_reporting_provider.py --- .../ingestion/reporting/datahub_ingestion_reporting_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_reporting_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_reporting_provider.py index 568c41aac9dbd9..1bb89236cc51ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_reporting_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_reporting_provider.py @@ -115,7 +115,7 @@ def get_previous_states( ) -> List[ReportingJobStatesMap]: if not last_only: raise NotImplementedError( - "Currently supports retrieving only the last commited state." + "Currently supports retrieving only the last committed state." 
) if filter_opt is not None: raise NotImplementedError( From 5342aaf31223085d077b663a80b9ea131f48ceaa Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:48 +1000 Subject: [PATCH 34/88] Update pipeline.py --- .../src/datahub/ingestion/run/pipeline.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index e26470f823a6c7..e429ad097b440d 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -53,11 +53,10 @@ def run_id_should_be_semantic( cls, v: Optional[str], values: Dict[str, Any], **kwargs: Any ) -> str: if v == "__DEFAULT_RUN_ID": - if "source" in values: - if hasattr(values["source"], "type"): - source_type = values["source"].type - current_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S") - return f"{source_type}-{current_time}" + if "source" in values and hasattr(values["source"], "type"): + source_type = values["source"].type + current_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S") + return f"{source_type}-{current_time}" return str(uuid.uuid1()) # default run_id if we cannot infer a source type else: @@ -86,12 +85,11 @@ def default_sink_is_datahub_rest(cls, values: Dict[str, Any]) -> Any: def datahub_api_should_use_rest_sink_as_default( cls, v: Optional[DatahubClientConfig], values: Dict[str, Any], **kwargs: Any ) -> Optional[DatahubClientConfig]: - if v is None: - if "sink" in values and hasattr(values["sink"], "type"): - sink_type = values["sink"].type - if sink_type == "datahub-rest": - sink_config = values["sink"].config - v = DatahubClientConfig.parse_obj(sink_config) + if v is None and "sink" in values and hasattr(values["sink"], "type"): + sink_type = values["sink"].type + if sink_type == "datahub-rest": + sink_config = values["sink"].config + v = DatahubClientConfig.parse_obj(sink_config) return v @@ -266,11 +264,8 @@ def process_commits(self) -> None: if self.source.get_report().failures or self.sink.get_report().failures else False ) - has_warnings: bool = ( - True - if self.source.get_report().warnings or self.sink.get_report().warnings - else False - ) + has_warnings: bool = bool(self.source.get_report().warnings or self.sink.get_report().warnings) + for name, committable in self.ctx.get_committables(): commit_policy: CommitPolicy = committable.commit_policy From 759bacd37cd58a895a9ba6da7a122afaafe1a584 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:52 +1000 Subject: [PATCH 35/88] Update datahub_kafka.py --- .../src/datahub/ingestion/sink/datahub_kafka.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py index f931b9039303a0..20929e85887a77 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py @@ -77,9 +77,7 @@ def write_record_async( self.report, record_envelope, write_callback ).kafka_callback, ) - elif isinstance(record, MetadataChangeProposalWrapper) or isinstance( - record, MetadataChangeProposalClass - ): + elif isinstance(record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass)): self.emitter.emit_mcp_async( record, callback=_KafkaCallback( From b4275e19f135e545425eb7b6e43a737b4a15379e Mon Sep 17 00:00:00 2001 From: 
Vincent Koc Date: Wed, 8 Jun 2022 22:30:55 +1000 Subject: [PATCH 36/88] Update datahub_rest.py --- .../src/datahub/ingestion/sink/datahub_rest.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py index 74e536350457b5..415a7a1c827da8 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py @@ -1,4 +1,5 @@ import concurrent.futures +import contextlib import functools import logging from dataclasses import dataclass @@ -111,13 +112,8 @@ def _write_done_callback( else: # trim exception stacktraces when reporting warnings if "stackTrace" in e.info: - try: - e.info["stackTrace"] = "\n".join( - e.info["stackTrace"].split("\n")[0:2] - ) - except Exception: - # ignore failures in trimming - pass + with contextlib.suppress(Exception): + e.info["stackTrace"] = "\n".join(e.info["stackTrace"].split("\n")[:2]) record = record_envelope.record if isinstance(record, MetadataChangeProposalWrapper): # include information about the entity that failed From f12961dad694c29afc1b86ea9cae17f9785ec6ea Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:30:59 +1000 Subject: [PATCH 37/88] Update pulsar.py --- .../src/datahub/ingestion/source_config/pulsar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py index e21c6fc3ea42ba..836960ac50633f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py @@ -30,7 +30,7 @@ def _is_valid_hostname(hostname: str) -> bool: """ if len(hostname) > 253: return False - # Hostnames ending on a dot are valid, if present strip exactly one + # Hostnames ending with a dot are valid; if present, strip exactly one if hostname[-1] == ".": hostname = hostname[:-1] allowed = re.compile(r"(?!-)[A-Z\d-]{1,63}(?
Date: Wed, 8 Jun 2022 22:31:04 +1000 Subject: [PATCH 38/88] Update snowflake.py --- .../ingestion/source_config/sql/snowflake.py | 100 +++++++++--------- 1 file changed, 49 insertions(+), 51 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index b00ac9cdfb41b8..644be7afbe749a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -90,7 +90,7 @@ class SnowflakeProvisionRoleConfig(ConfigModel): @pydantic.validator("admin_username", always=True) def username_not_empty(cls, v, values, **kwargs): v_str: str = str(v) - if v_str.strip() == "": + if not v_str.strip(): raise ValueError("username is empty") return v @@ -180,60 +180,58 @@ def authenticator_type_is_valid(cls, v, values, field): f"unsupported authenticator type '{v}' was provided," f" use one of {list(VALID_AUTH_TYPES.keys())}" ) - else: - if v == "KEY_PAIR_AUTHENTICATOR": - # If we are using key pair auth, we need the private key path and password to be set - if values.get("private_key_path") is None: - raise ValueError( - f"'private_key_path' was none " - f"but should be set when using {v} authentication" - ) - elif v == "OAUTH_AUTHENTICATOR": - if values.get("oauth_config") is None: - raise ValueError( - f"'oauth_config' is none but should be set when using {v} authentication" - ) - if values.get("oauth_config").provider is None: - raise ValueError( - f"'oauth_config.provider' is none " - f"but should be set when using {v} authentication" - ) - if values.get("oauth_config").client_id is None: - raise ValueError( - f"'oauth_config.client_id' is none " - f"but should be set when using {v} authentication" - ) - if values.get("oauth_config").scopes is None: + if v == "KEY_PAIR_AUTHENTICATOR": + # If we are using key pair auth, we need the private key path and password to be set + if values.get("private_key_path") is None: + raise ValueError( + f"'private_key_path' was none " + f"but should be set when using {v} authentication" + ) + elif v == "OAUTH_AUTHENTICATOR": + if values.get("oauth_config") is None: + raise ValueError( + f"'oauth_config' is none but should be set when using {v} authentication" + ) + if values.get("oauth_config").provider is None: + raise ValueError( + f"'oauth_config.provider' is none " + f"but should be set when using {v} authentication" + ) + if values.get("oauth_config").client_id is None: + raise ValueError( + f"'oauth_config.client_id' is none " + f"but should be set when using {v} authentication" + ) + if values.get("oauth_config").scopes is None: + raise ValueError( + f"'oauth_config.scopes' was none " + f"but should be set when using {v} authentication" + ) + if values.get("oauth_config").authority_url is None: + raise ValueError( + f"'oauth_config.authority_url' was none " + f"but should be set when using {v} authentication" + ) + if values.get("oauth_config").use_certificate is True: + if ( + values.get("oauth_config").base64_encoded_oauth_private_key + is None + ): raise ValueError( - f"'oauth_config.scopes' was none " - f"but should be set when using {v} authentication" + "'base64_encoded_oauth_private_key' was none " + "but should be set when using certificate for oauth_config" ) - if values.get("oauth_config").authority_url is None: + if values.get("oauth").base64_encoded_oauth_public_key is None: raise ValueError( - f"'oauth_config.authority_url' was none " - f"but 
should be set when using {v} authentication" + "'base64_encoded_oauth_public_key' was none" + "but should be set when using use_certificate true for oauth_config" ) - if values.get("oauth_config").use_certificate is True: - if ( - values.get("oauth_config").base64_encoded_oauth_private_key - is None - ): - raise ValueError( - "'base64_encoded_oauth_private_key' was none " - "but should be set when using certificate for oauth_config" - ) - if values.get("oauth").base64_encoded_oauth_public_key is None: - raise ValueError( - "'base64_encoded_oauth_public_key' was none" - "but should be set when using use_certificate true for oauth_config" - ) - else: - if values.get("oauth_config").client_secret is None: - raise ValueError( - "'oauth_config.client_secret' was none " - "but should be set when using use_certificate false for oauth_config" - ) - logger.info(f"using authenticator type '{v}'") + elif values.get("oauth_config").client_secret is None: + raise ValueError( + "'oauth_config.client_secret' was none " + "but should be set when using use_certificate false for oauth_config" + ) + logger.info(f"using authenticator type '{v}'") return v @pydantic.validator("include_view_lineage") From e7dd056f70ad4b0b000ae3b10fec80fec86f2b56 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:10 +1000 Subject: [PATCH 39/88] Update bigquery_usage.py --- .../src/datahub/ingestion/source_config/usage/bigquery_usage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py index 05dc636d312c2e..9abee691ca9bf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py @@ -114,7 +114,7 @@ class BigQueryUsageConfig(BigQueryBaseConfig, DatasetSourceConfigBase, BaseUsage credential: Optional[BigQueryCredential] = pydantic.Field( default=None, - description="Bigquery credential. Required if GOOGLE_APPLICATION_CREDENTIALS enviroment variable is not set. See this example recipe for details", + description="Bigquery credential. Required if GOOGLE_APPLICATION_CREDENTIALS environment variable is not set. 
See this example recipe for details", ) _credentials_path: Optional[str] = pydantic.PrivateAttr(None) temp_table_dataset_prefix: str = pydantic.Field( From 6bf6136eed0b1e49204e1a37a5687a3f119c3c49 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:13 +1000 Subject: [PATCH 40/88] Update base_transformer.py --- .../src/datahub/ingestion/transformer/base_transformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py index ecc1dcfc5fd31f..c6f641c8fcd6e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py @@ -132,9 +132,7 @@ def _should_process( return True # fall through, no entity type matched return False - elif isinstance(record, MetadataChangeProposalWrapper) or isinstance( - record, MetadataChangeProposalClass - ): + elif isinstance(record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass)): return record.entityType in entity_types # default to process everything that is not caught by above checks From 0a1db396b12d33f589ca5b031d6f906a263542d9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:21 +1000 Subject: [PATCH 41/88] Update mark_dataset_status.py --- .../src/datahub/ingestion/transformer/mark_dataset_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py b/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py index bae8d0e07a80ab..d833e9bcc75a64 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py @@ -40,6 +40,6 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect] ) -> Optional[builder.Aspect]: assert aspect is None or isinstance(aspect, StatusClass) - status_aspect: StatusClass = aspect if aspect else StatusClass(removed=None) + status_aspect: StatusClass = aspect or StatusClass(removed=None) status_aspect.removed = self.config.removed return status_aspect # type: ignore From df463c45f8a4b543718b94161900ef6a82f53b3f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:23 +1000 Subject: [PATCH 42/88] Update action.py --- .../integrations/great_expectations/action.py | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index 98b344c0a06cc7..572ecdf36302d5 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -190,12 +190,11 @@ def _run( result = "DataHub notification succeeded" except Exception as e: result = "DataHub notification failed" - if self.graceful_exceptions: - logger.error(e) - logger.info("Supressing error because graceful_exceptions is set") - else: + if not self.graceful_exceptions: raise + logger.error(e) + logger.info("Suppressing error because graceful_exceptions is set") return {"datahub_notification_result": result} def get_assertions_with_results( @@ -224,7 +223,7 @@ def get_assertions_with_results( for result in validation_result_suite.results: 
expectation_config = result["expectation_config"] expectation_type = expectation_config["expectation_type"] - success = True if result["success"] else False + success = bool(result["success"]) kwargs = { k: v for k, v in expectation_config["kwargs"].items() if k != "batch_id" } @@ -271,8 +270,6 @@ def get_assertions_with_results( # TODO: Understand why their run time is incorrect. run_time = run_id.run_time.astimezone(timezone.utc) - assertionResults = [] - evaluation_parameters = ( { k: convert_to_string(v) @@ -328,8 +325,7 @@ def get_assertions_with_results( ) if ds.get("partitionSpec") is not None: assertionResult.partitionSpec = ds.get("partitionSpec") - assertionResults.append(assertionResult) - + assertionResults = [assertionResult] assertions_with_results.append( { "assertionUrn": assertionUrn, @@ -629,10 +625,8 @@ def get_dataset_partitions(self, batch_identifier, data_asset): query = data_asset.batches[ batch_identifier ].batch_request.runtime_parameters["query"] - partitionSpec = PartitionSpecClass( - type=PartitionTypeClass.QUERY, - partition="Query_" + builder.datahub_guid(query), - ) + partitionSpec = PartitionSpecClass(type=PartitionTypeClass.QUERY, partition=f"Query_{builder.datahub_guid(query)}") + batchSpec = BatchSpec( nativeBatchId=batch_identifier, query=query, @@ -678,9 +672,9 @@ def get_dataset_partitions(self, batch_identifier, data_asset): return dataset_partitions def get_platform_instance(self, datasource_name): - if self.platform_instance_map and datasource_name in self.platform_instance_map: - return self.platform_instance_map[datasource_name] if self.platform_instance_map: + if datasource_name in self.platform_instance_map: + return self.platform_instance_map[datasource_name] warn( f"Datasource {datasource_name} is not present in platform_instance_map" ) @@ -698,21 +692,21 @@ def make_dataset_urn_from_sqlalchemy_uri( schema_name, table_name = table_name.split(".")[-2:] if data_platform in ["redshift", "postgres"]: - schema_name = schema_name if schema_name else "public" + schema_name = schema_name or "public" if url_instance.database is None: warn( f"DataHubValidationAction failed to locate database name for {data_platform}." ) return None - schema_name = "{}.{}".format(url_instance.database, schema_name) + schema_name = f"{url_instance.database}.{schema_name}" elif data_platform == "mssql": - schema_name = schema_name if schema_name else "dbo" + schema_name = schema_name or "dbo" if url_instance.database is None: warn( f"DataHubValidationAction failed to locate database name for {data_platform}." ) return None - schema_name = "{}.{}".format(url_instance.database, schema_name) + schema_name = f"{url_instance.database}.{schema_name}" elif data_platform in ["trino", "snowflake"]: if schema_name is None or url_instance.database is None: warn( @@ -738,16 +732,16 @@ def make_dataset_urn_from_sqlalchemy_uri( ) ) return None - schema_name = "{}.{}".format(url_instance.host, url_instance.database) + schema_name = f"{url_instance.host}.{url_instance.database}" - schema_name = schema_name if schema_name else url_instance.database + schema_name = schema_name or url_instance.database if schema_name is None: warn( f"DataHubValidationAction failed to locate schema name for {data_platform}." 
) return None - dataset_name = "{}.{}".format(schema_name, table_name) + dataset_name = f"{schema_name}.{table_name}" dataset_urn = builder.make_dataset_urn_with_platform_instance( platform=data_platform, From 2ab2fddba5b0da0c9d7060645570f342a003814b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:26 +1000 Subject: [PATCH 43/88] Update stats.py --- metadata-ingestion/src/datahub/telemetry/stats.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/telemetry/stats.py b/metadata-ingestion/src/datahub/telemetry/stats.py index ea48aab14c77db..e76580d677588c 100644 --- a/metadata-ingestion/src/datahub/telemetry/stats.py +++ b/metadata-ingestion/src/datahub/telemetry/stats.py @@ -27,9 +27,7 @@ def calculate_percentiles( min(i, size - 1) for i in percentile_indices ] # in case of rounding errors - values = {p: data_sorted[i] for p, i in zip(percentiles, percentile_indices)} - - return values + return {p: data_sorted[i] for p, i in zip(percentiles, percentile_indices)} def discretize(statistic: Union[float, int]) -> int: From d1881718ca0d1971397788e78288e3eaa8b760cc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:29 +1000 Subject: [PATCH 44/88] Update upgrade.py --- .../src/datahub/upgrade/upgrade.py | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index 046f32202d83bd..839e23f6536ae6 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -1,3 +1,4 @@ +import contextlib import logging from datetime import datetime, timedelta, timezone from functools import wraps @@ -221,7 +222,7 @@ def maybe_print_upgrade_message( # noqa: C901 encourage_cli_upgrade = False client_server_compat = 0 encourage_quickstart_upgrade = False - try: + with contextlib.suppress(Exception): version_stats = retrieve_versions(server) if not version_stats: return @@ -261,12 +262,9 @@ def maybe_print_upgrade_message( # noqa: C901 ): encourage_quickstart_upgrade = True - except Exception: - pass - # Compute recommendations and print one if client_server_compat < 0: - try: + with contextlib.suppress(Exception): assert version_stats print( colored("❗Client-Server Incompatible❗", "yellow"), @@ -279,10 +277,8 @@ def maybe_print_upgrade_message( # noqa: C901 "cyan", ), ) - except Exception: - pass elif client_server_compat > 0: - try: + with contextlib.suppress(Exception): assert version_stats print( colored("❗Client-Server Incompatible❗", "red"), @@ -295,12 +291,8 @@ def maybe_print_upgrade_message( # noqa: C901 "cyan", ), ) - except Exception: - pass - - # we only encourage upgrades if we think client_server is currently compatible elif client_server_compat == 0 and encourage_cli_upgrade: - try: + with contextlib.suppress(Exception): print( colored("💡 Upgrade cli!", "yellow"), colored( @@ -308,9 +300,6 @@ def maybe_print_upgrade_message( # noqa: C901 "cyan", ), ) - except Exception: - pass - elif encourage_quickstart_upgrade: try: assert version_stats From 5663dc9129b6bacf68f81e4793030e64c9eb43cb Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:32 +1000 Subject: [PATCH 45/88] Update telemetry.py --- metadata-ingestion/src/datahub/telemetry/telemetry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py 
b/metadata-ingestion/src/datahub/telemetry/telemetry.py index b95df169414320..0a346d09373850 100644 --- a/metadata-ingestion/src/datahub/telemetry/telemetry.py +++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py @@ -273,7 +273,7 @@ def get_full_class_name(obj): module = obj.__class__.__module__ if module is None or module == str.__class__.__module__: return obj.__class__.__name__ - return module + "." + obj.__class__.__name__ + return f"{module}.{obj.__class__.__name__}" def with_telemetry(func: Callable[..., T]) -> Callable[..., T]: From b57b58b24f8e36c3f026b35f5b12cea03a91fe7e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:34 +1000 Subject: [PATCH 46/88] Update hive_schema_to_avro.py --- .../datahub/utilities/hive_schema_to_avro.py | 31 +++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index fc9680ba642d42..c83ec153144f0f 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -52,10 +52,11 @@ def _parse_datatype_string( raise ValueError("'>' should be the last char, but got: %s" % s) parts = HiveColumnToAvroConverter._ignore_brackets_split(s[4:-1], ",") if len(parts) != 2: - raise ValueError( + raise ValueError(( "The map type string format is: 'map', " - + "but got: %s" % s - ) + + f"but got: {s}" + )) + kt = HiveColumnToAvroConverter._parse_datatype_string(parts[0]) vt = HiveColumnToAvroConverter._parse_datatype_string(parts[1]) # keys are assumed to be strings in avro map @@ -102,10 +103,8 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: for part in parts: name_and_type = HiveColumnToAvroConverter._ignore_brackets_split(part, ":") if len(name_and_type) != 2: - raise ValueError( - "The struct field string format is: 'field_name:field_type', " - + "but got: %s" % part - ) + raise ValueError(("The struct field string format is: 'field_name:field_type', " + f"but got: {part}")) + field_name = name_and_type[0].strip() if field_name.startswith("`"): if field_name[-1] != "`": @@ -117,17 +116,11 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: fields.append({"name": field_name, "type": field_type}) if kwargs.get("ustruct_seqn") is not None: - struct_name = "__structn_{}_{}".format( - kwargs["ustruct_seqn"], str(uuid.uuid4()).replace("-", "") - ) + struct_name = f'__structn_{kwargs["ustruct_seqn"]}_{str(uuid.uuid4()).replace("-", "")}' + else: - struct_name = "__struct_{}".format(str(uuid.uuid4()).replace("-", "")) - return { - "type": "record", - "name": struct_name, - "fields": fields, - "native_data_type": "struct<{}>".format(s), - } + struct_name = f'__struct_{str(uuid.uuid4()).replace("-", "")}' + return {"type": "record", "name": struct_name, "fields": fields, "native_data_type": f"struct<{s}>"} @staticmethod def _parse_basic_datatype_string(s: str) -> Dict[str, object]: @@ -193,7 +186,7 @@ def _ignore_brackets_split(s: str, separator: str) -> List[str]: buf += c elif c in HiveColumnToAvroConverter._BRACKETS.values(): if level == 0: - raise ValueError("Brackets are not correctly paired: %s" % s) + raise ValueError(f"Brackets are not correctly paired: {s}") level -= 1 buf += c elif c == separator and level > 0: @@ -205,7 +198,7 @@ def _ignore_brackets_split(s: str, separator: str) -> List[str]: buf += c if len(buf) == 0: - raise 
ValueError("The %s cannot be the last char: %s" % (separator, s)) + raise ValueError(f"The {separator} cannot be the last char: {s}") parts.append(buf) return parts From d1dda5e32ce5c2ce314a68f89e5273f46b2d64b9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:36 +1000 Subject: [PATCH 47/88] Update mapping.py --- metadata-ingestion/src/datahub/utilities/mapping.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 2b6c458db8d1ce..af4916a4055747 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -172,7 +172,7 @@ def get_operation_value( def sanitize_owner_ids(self, owner_id: str) -> str: if owner_id.__contains__("@"): - owner_id = owner_id[0 : owner_id.index("@")] + owner_id = owner_id[:owner_id.index("@")] return owner_id def is_match(self, match_clause: Any, raw_props_value: Any) -> bool: @@ -181,9 +181,8 @@ def is_match(self, match_clause: Any, raw_props_value: Any) -> bool: if type(raw_props_value) not in Constants.OPERAND_DATATYPE_SUPPORTED or type( raw_props_value ) != type(match_clause): - is_matching = False + return False elif type(raw_props_value) == str: - is_matching = True if re.match(match_clause, raw_props_value) else False + return bool(re.match(match_clause, raw_props_value)) else: - is_matching = match_clause == raw_props_value - return is_matching + return match_clause == raw_props_value From a3839e25574b589743ed710d61183ac808b89f9e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:38 +1000 Subject: [PATCH 48/88] Update memory_leak_detector.py --- .../src/datahub/utilities/memory_leak_detector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py b/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py index b5fa3c3a723ea4..ef0db205b72ac9 100644 --- a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py +++ b/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py @@ -12,7 +12,7 @@ def _trace_has_file(trace: tracemalloc.Traceback, file_pattern: str) -> bool: - for frame_index in range(0, len(trace)): + for frame_index in range(len(trace)): cur_frame = trace[frame_index] if fnmatch.fnmatch(cur_frame.filename, file_pattern): return True @@ -99,8 +99,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: _init_leak_detection() try: - res = func(*args, **kwargs) - return res + return func(*args, **kwargs) finally: if detect_leaks: _perform_leak_detection() From 1d180c15a48e25a80b184ae52db75b5d5f35ed2d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:40 +1000 Subject: [PATCH 49/88] Update server_config_util.py --- metadata-ingestion/src/datahub/utilities/server_config_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/server_config_util.py b/metadata-ingestion/src/datahub/utilities/server_config_util.py index c919a1356f2642..40841321ad2778 100644 --- a/metadata-ingestion/src/datahub/utilities/server_config_util.py +++ b/metadata-ingestion/src/datahub/utilities/server_config_util.py @@ -3,7 +3,7 @@ from datahub.telemetry.telemetry import set_telemetry_enable # Only to be written to for logging server related information -global_debug: Dict[str, Any] = dict() +global_debug: Dict[str, Any] = {} def set_gms_config(config: Dict) -> Any: 
From ab48916abae7766657d40dd1ce52bf3c38aed487 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:42 +1000 Subject: [PATCH 50/88] Update sql_lineage_parser_impl.py --- .../utilities/sql_lineage_parser_impl.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py index 80ea7cc31455cd..63b3edaf8c0556 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py +++ b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py @@ -1,3 +1,4 @@ +import contextlib import logging import re import unittest @@ -7,15 +8,12 @@ from sqllineage.core.holders import Column, SQLLineageHolder from sqllineage.exceptions import SQLLineageException -try: +with contextlib.suppress(ImportError): import sqlparse from networkx import DiGraph from sqllineage.core import LineageAnalyzer import datahub.utilities.sqllineage_patch -except ImportError: - pass - logger = logging.getLogger(__name__) @@ -97,7 +95,7 @@ def __init__(self, sql_query: str) -> None: logger.error(f"SQL lineage analyzer error '{e}' for query: '{self._sql}") def get_tables(self) -> List[str]: - result: List[str] = list() + result: List[str] = [] if self._sql_holder is None: logger.error("sql holder not present so cannot get tables") return result @@ -135,12 +133,8 @@ def get_columns(self) -> List[str]: result.add(str(column.raw_name)) # Reverting back all the previously renamed words which confuses the parser - result = set(["date" if c == self._DATE_SWAP_TOKEN else c for c in result]) - result = set( - [ - "timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c - for c in list(result) - ] - ) + result = {"date" if c == self._DATE_SWAP_TOKEN else c for c in result} + result = {"timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result)} + # swap back renamed date column return list(result) From bb37d4d8966df05e24e799c3c2c227333bfa7103 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:46 +1000 Subject: [PATCH 51/88] Update sql_parser.py --- metadata-ingestion/src/datahub/utilities/sql_parser.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py index eb0bc0ec8262f4..28b5082ccbb3b2 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_parser.py +++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py @@ -1,3 +1,4 @@ +import contextlib import logging import multiprocessing import re @@ -9,11 +10,8 @@ from datahub.utilities.sql_lineage_parser_impl import SqlLineageSQLParserImpl -try: +with contextlib.suppress(ImportError): from sql_metadata import Parser as MetadataSQLParser -except ImportError: - pass - logger = logging.getLogger(__name__) From 15740801cb87cba36d46f2e32d65dad68997ec7f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:31:48 +1000 Subject: [PATCH 52/88] Update sqlalchemy_query_combiner.py --- .../src/datahub/utilities/sqlalchemy_query_combiner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py index 0474f4ec7d3d68..947f5e30d62c89 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py +++ 
b/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py @@ -108,8 +108,7 @@ def get_query_columns(query: Any) -> List[Any]: try: # inner_columns will be more accurate if the column names are unnamed, # since .columns will remove the "duplicates". - inner_columns = list(query.inner_columns) - return inner_columns + return list(query.inner_columns) except AttributeError: return list(query.columns) From 705050e2e3a548dfd4ee211ca7aaf81ceada5212 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:42:16 +1000 Subject: [PATCH 53/88] Update powerbi.py --- .../src/datahub/ingestion/source/powerbi.py | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py index 5cfba5fa2ec14b..a192840cd5dd68 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py @@ -131,7 +131,7 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): authority = "https://login.microsoftonline.com/" def get_authority_url(self): - return "{}{}".format(self.authority, self.tenant_id) + return f"{self.authority}{self.tenant_id}" class PowerBiDashboardSourceConfig(PowerBiAPIConfig): @@ -216,7 +216,7 @@ class Table: tables: List[Any] def get_urn_part(self): - return "datasets.{}".format(self.id) + return f"datasets.{self.id}" def __members(self): return (self.id,) @@ -239,7 +239,7 @@ class Report: dataset: Any def get_urn_part(self): - return "reports.{}".format(self.id) + return f"reports.{self.id}" @dataclass class Tile: @@ -257,7 +257,7 @@ class CreatedFrom(Enum): createdFrom: CreatedFrom def get_urn_part(self): - return "charts.{}".format(self.id) + return f"charts.{self.id}" @dataclass class User: @@ -269,7 +269,7 @@ class User: principalType: str def get_urn_part(self): - return "users.{}".format(self.id) + return f"users.{self.id}" def __members(self): return (self.id,) @@ -296,7 +296,7 @@ class Dashboard: users: List[Any] def get_urn_part(self): - return "dashboards.{}".format(self.id) + return f"dashboards.{self.id}" def __members(self): return (self.id,) @@ -322,9 +322,9 @@ def __init__(self, config: PowerBiAPIConfig) -> None: ) # Test connection by generating a access token - LOGGER.info("Trying to connect to {}".format(self.__config.get_authority_url())) + LOGGER.info(f"Trying to connect to {self.__config.get_authority_url()}") self.get_access_token() - LOGGER.info("Able to connect to {}".format(self.__config.get_authority_url())) + LOGGER.info(f"Able to connect to {self.__config.get_authority_url()}") def __get_users(self, workspace_id: str, entity: str, id: str) -> List[User]: """ @@ -338,7 +338,7 @@ def __get_users(self, workspace_id: str, entity: str, id: str) -> List[User]: ENTITY_ID=id, ) # Hit PowerBi - LOGGER.info("Request to URL={}".format(user_list_endpoint)) + LOGGER.info(f"Request to URL={user_list_endpoint}") response = requests.get( url=user_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -346,14 +346,11 @@ def __get_users(self, workspace_id: str, entity: str, id: str) -> List[User]: # Check if we got response from PowerBi if response.status_code != 200: - LOGGER.warning( - "Failed to fetch user list from power-bi for, http_status={}, message={}".format( - response.status_code, response.text - ) - ) - LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.info("{}={}".format(Constant.ENTITY, entity)) - 
LOGGER.info("{}={}".format(Constant.ID, id)) + LOGGER.warning(f"Failed to fetch user list from power-bi for, http_status={response.status_code}, message={response.text}") + + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.ENTITY}={entity}") + LOGGER.info(f"{Constant.ID}={id}") raise ConnectionError("Failed to fetch the user list from the power-bi") users_dict: List[Any] = response.json()[Constant.VALUE] @@ -379,8 +376,8 @@ def __get_report(self, workspace_id: str, report_id: str) -> Any: """ if workspace_id is None or report_id is None: LOGGER.info("Input values are None") - LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.info("{}={}".format(Constant.ReportId, report_id)) + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.ReportId}={report_id}") return None report_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_GET] @@ -391,7 +388,7 @@ def __get_report(self, workspace_id: str, report_id: str) -> Any: REPORT_ID=report_id, ) # Hit PowerBi - LOGGER.info("Request to report URL={}".format(report_get_endpoint)) + LOGGER.info(f"Request to report URL={report_get_endpoint}") response = requests.get( url=report_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -401,8 +398,8 @@ def __get_report(self, workspace_id: str, report_id: str) -> Any: if response.status_code != 200: message: str = "Failed to fetch report from power-bi for" LOGGER.warning(message) - LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.warning("{}={}".format(Constant.ReportId, report_id)) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.warning(f"{Constant.ReportId}={report_id}") raise ConnectionError(message) response_dict = response.json() @@ -440,7 +437,7 @@ def get_access_token(self): self.__access_token = "Bearer {}".format(auth_response.get("access_token")) - LOGGER.debug("{}={}".format(Constant.PBIAccessToken, self.__access_token)) + LOGGER.debug(f"{Constant.PBIAccessToken}={self.__access_token}") return self.__access_token @@ -464,7 +461,7 @@ def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: POWERBI_BASE_URL=self.__config.base_url, WORKSPACE_ID=workspace.id ) # Hit PowerBi - LOGGER.info("Request to URL={}".format(dashboard_list_endpoint)) + LOGGER.info(f"Request to URL={dashboard_list_endpoint}") response = requests.get( url=dashboard_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -473,7 +470,7 @@ def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: # Check if we got response from PowerBi if response.status_code != 200: LOGGER.warning("Failed to fetch dashboard list from power-bi for") - LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace.id)) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") raise ConnectionError( "Failed to fetch the dashboard list from the power-bi" ) @@ -505,8 +502,8 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: """ if workspace_id is None or dataset_id is None: LOGGER.info("Input values are None") - LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.info("{}={}".format(Constant.DatasetId, dataset_id)) + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.DatasetId}={dataset_id}") return None dataset_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASET_GET] @@ -517,7 +514,7 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: DATASET_ID=dataset_id, ) # Hit 
PowerBi - LOGGER.info("Request to dataset URL={}".format(dataset_get_endpoint)) + LOGGER.info(f"Request to dataset URL={dataset_get_endpoint}") response = requests.get( url=dataset_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -527,8 +524,8 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: if response.status_code != 200: message: str = "Failed to fetch dataset from power-bi for" LOGGER.warning(message) - LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.warning("{}={}".format(Constant.DatasetId, dataset_id)) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.warning(f"{Constant.DatasetId}={dataset_id}") raise ConnectionError(message) response_dict = response.json() From b7df2f923ea72b8a40c6c4b7f19ab174ee3f650a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:42:18 +1000 Subject: [PATCH 54/88] Update redash.py --- .../src/datahub/ingestion/source/redash.py | 47 +++++++------------ 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 2abd61849ac260..7aceafd22bd5aa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -203,29 +203,23 @@ def get_full_qualified_name(self, database_name: str, table_name: str) -> str: def get_full_qualified_name(platform: str, database_name: str, table_name: str) -> str: - if platform == "postgres": - full_qualified_name = PostgresQualifiedNameParser().get_full_qualified_name( - database_name, table_name - ) - elif platform == "mysql": - full_qualified_name = MysqlQualifiedNameParser().get_full_qualified_name( - database_name, table_name - ) - elif platform == "mssql": - full_qualified_name = MssqlQualifiedNameParser().get_full_qualified_name( - database_name, table_name - ) - elif platform == "athena": - full_qualified_name = AthenaQualifiedNameParser().get_full_qualified_name( - database_name, table_name - ) + if platform == "athena": + return AthenaQualifiedNameParser().get_full_qualified_name(database_name, table_name) + elif platform == "bigquery": - full_qualified_name = BigqueryQualifiedNameParser().get_full_qualified_name( - database_name, table_name - ) + return BigqueryQualifiedNameParser().get_full_qualified_name(database_name, table_name) + + elif platform == "mssql": + return MssqlQualifiedNameParser().get_full_qualified_name(database_name, table_name) + + elif platform == "mysql": + return MysqlQualifiedNameParser().get_full_qualified_name(database_name, table_name) + + elif platform == "postgres": + return PostgresQualifiedNameParser().get_full_qualified_name(database_name, table_name) + else: - full_qualified_name = f"{database_name}.{table_name}" - return full_qualified_name + return f"{database_name}.{table_name}" class RedashConfig(ConfigModel): @@ -405,8 +399,7 @@ def _get_platform_based_on_datasource(self, data_source: Dict) -> str: map = REDASH_DATA_SOURCE_TO_DATAHUB_MAP.get( data_source_type, {"platform": DEFAULT_DATA_SOURCE_PLATFORM} ) - platform = map.get("platform", DEFAULT_DATA_SOURCE_PLATFORM) - return platform + return map.get("platform", DEFAULT_DATA_SOURCE_PLATFORM) return DEFAULT_DATA_SOURCE_PLATFORM def _get_database_name_based_on_datasource( @@ -596,9 +589,7 @@ def _process_dashboard_response( # the API is id based not slug based # Tested the same with a Redash instance dashboard_id = 
dashboard_response["id"] - dashboard_data = self.client._get( - "api/dashboards/{}".format(dashboard_id) - ).json() + dashboard_data = self.client._get(f"api/dashboards/{dashboard_id}").json() except Exception: # This does not work in our testing but keeping for now because # people in community are using Redash connector successfully @@ -686,9 +677,7 @@ def _get_chart_snapshot(self, query_data: Dict, viz_data: Dict) -> ChartSnapshot chart_type = self._get_chart_type_from_viz_data(viz_data) query_id = query_data.get("id") chart_url = f"{self.config.connect_uri}/queries/{query_id}#{viz_id}" - description = ( - viz_data.get("description", "") if viz_data.get("description", "") else "" - ) + description = viz_data.get("description", "") or "" data_source_id = query_data.get("data_source_id") data_source = self._get_chart_data_source(data_source_id) data_source_type = data_source.get("type") From 2c51bc386d6f793c2907aba9237360a38a1d6ec5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:42:21 +1000 Subject: [PATCH 55/88] Update tableau.py --- .../src/datahub/ingestion/source/tableau.py | 42 +++++-------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index fdb57ef4b543cc..2a6696f8823683 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -398,22 +398,17 @@ def _create_upstream_table_lineage( table_path = None if project and datasource.get("name"): - table_name = table.get("name") if table.get("name") else table["id"] + table_name = table.get("name") or table["id"] table_path = f"{project.replace('/', REPLACE_SLASH_CHAR)}/{datasource['name']}/{table_name}" - self.upstream_tables[table_urn] = ( - table.get("columns", []), - table_path, - table.get("isEmbedded") if table.get("isEmbedded") else False, - ) + self.upstream_tables[table_urn] = table.get("columns", []), table_path, table.get("isEmbedded") or False + return upstream_tables def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: count_on_query = len(self.custom_sql_ids_being_used) - custom_sql_filter = "idWithin: {}".format( - json.dumps(self.custom_sql_ids_being_used) - ) + custom_sql_filter = f"idWithin: {json.dumps(self.custom_sql_ids_being_used)}" custom_sql_connection, total_count, has_next_page = self.get_connection_object( custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter ) @@ -491,7 +486,7 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: dataset_snapshot.aspects.append(schema_metadata) # Browse path - csql_name = csql.get("name") if csql.get("name") else csql_id + csql_name = csql.get("name") or csql_id if project and datasource_name: browse_paths = BrowsePathsClass( @@ -605,7 +600,6 @@ def _get_schema_metadata_for_datasource( self, datasource_fields: List[dict] ) -> Optional[SchemaMetadata]: fields = [] - schema_metadata = None for field in datasource_fields: # check datasource - custom sql relations from a field being referenced self._track_custom_sql_ids(field) @@ -632,17 +626,7 @@ def _get_schema_metadata_for_datasource( ) fields.append(schema_field) - if fields: - schema_metadata = SchemaMetadata( - schemaName="test", - platform=f"urn:li:dataPlatform:{self.platform}", - version=0, - fields=fields, - hash="", - platformSchema=OtherSchema(rawSchema=""), - ) - - return schema_metadata + return SchemaMetadata(schemaName="test", 
platform=f"urn:li:dataPlatform:{self.platform}", version=0, fields=fields, hash="", platformSchema=OtherSchema(rawSchema=""),) if fields else None def get_metadata_change_event( self, snap_shot: Union["DatasetSnapshot", "DashboardSnapshot", "ChartSnapshot"] @@ -697,9 +681,7 @@ def emit_datasource( aspects=[], ) - datasource_name = ( - datasource.get("name") if datasource.get("name") else datasource_id - ) + datasource_name = datasource.get("name") or datasource_id if is_embedded_ds and workbook and workbook.get("name"): datasource_name = f"{workbook['name']}/{datasource_name}" # Browse path @@ -780,9 +762,7 @@ def emit_datasource( def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]: count_on_query = len(self.datasource_ids_being_used) - datasource_filter = "idWithin: {}".format( - json.dumps(self.datasource_ids_being_used) - ) + datasource_filter = f"idWithin: {json.dumps(self.datasource_ids_being_used)}" ( published_datasource_conn, total_count, @@ -933,7 +913,7 @@ def emit_sheets_as_charts(self, workbook: Dict) -> Iterable[MetadataWorkUnit]: chart_snapshot.aspects.append(chart_info) if workbook.get("projectName") and workbook.get("name"): - sheet_name = sheet.get("name") if sheet.get("name") else sheet["id"] + sheet_name = sheet.get("name") or sheet["id"] # Browse path browse_path = BrowsePathsClass( paths=[ @@ -1050,7 +1030,7 @@ def emit_dashboards(self, workbook: Dict) -> Iterable[MetadataWorkUnit]: dashboard_snapshot.aspects.append(dashboard_info_class) if workbook.get("projectName") and workbook.get("name"): - dashboard_name = title if title else dashboard["id"] + dashboard_name = title or dashboard["id"] # browse path browse_paths = BrowsePathsClass( paths=[ @@ -1104,7 +1084,7 @@ def _get_schema(self, schema_provided: str, database: str, fullName: str) -> str def _extract_schema_from_fullName(self, fullName: str) -> str: # fullName is observed to be in format [schemaName].[tableName] # OR simply tableName OR [tableName] - if fullName.startswith("[") and fullName.find("].[") >= 0: + if fullName.startswith("[") and "].[" in fullName: return fullName[1 : fullName.index("]")] return "" From 4c9705f62c29434d5406a403b1119ae53da8e8e6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:42:23 +1000 Subject: [PATCH 56/88] Update pulsar.py --- .../src/datahub/ingestion/source/pulsar.py | 47 +++++++------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index e4d9a505ea7210..e969c2d3aeb3ea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -98,7 +98,7 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): self.platform: str = "pulsar" self.config: PulsarSourceConfig = config self.report: PulsarSourceReport = PulsarSourceReport() - self.base_url: str = self.config.web_service_url + "/admin/v2" + self.base_url: str = f"{self.config.web_service_url}/admin/v2" self.tenants: List[str] = config.tenants if ( @@ -119,9 +119,7 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): if self._is_oauth_authentication_configured(): # Get OpenId configuration from issuer, e.g. 
token_endpoint - oid_config_url = ( - "%s/.well-known/openid-configuration" % self.config.issuer_url - ) + oid_config_url = f"{self.config.issuer_url}/.well-known/openid-configuration" oid_config_response = requests.get( oid_config_url, verify=False, allow_redirects=False ) @@ -129,10 +127,8 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): if oid_config_response: self.config.oid_config.update(oid_config_response.json()) else: - logger.error( - "Unexpected response while getting discovery document using %s : %s" - % (oid_config_url, oid_config_response) - ) + logger.error(f"Unexpected response while getting discovery document using {oid_config_url} : {oid_config_response}") + if "token_endpoint" not in self.config.oid_config: raise Exception( @@ -323,17 +319,13 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: ] # Report the Pulsar broker version we are communicating with - self.report.report_pulsar_version( - self.session.get( - "%s/brokers/version" % self.base_url, - timeout=self.config.timeout, - ).text - ) + self.report.report_pulsar_version(self.session.get(f"{self.base_url}/brokers/version", timeout=self.config.timeout).text) + # If no tenants are provided, request all tenants from cluster using /admin/v2/tenants endpoint. # Requesting cluster tenant information requires superuser privileges if not self.tenants: - self.tenants = self._get_pulsar_metadata(self.base_url + "/tenants") or [] + self.tenants = self._get_pulsar_metadata(f"{self.base_url}/tenants") or [] # Initialize counters self.report.tenants_scanned = 0 @@ -345,10 +337,9 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: if self.config.tenant_patterns.allowed(tenant): # Get namespaces belonging to a tenant, /admin/v2/%s/namespaces # A tenant admin role has sufficient privileges to perform this action - namespaces = ( - self._get_pulsar_metadata(self.base_url + "/namespaces/%s" % tenant) - or [] - ) + namespaces = self._get_pulsar_metadata(f"{self.base_url}/namespaces/{tenant}") or [] + + for namespace in namespaces: self.report.namespaces_scanned += 1 if self.config.namespace_patterns.allowed(namespace): @@ -406,14 +397,10 @@ def _add_topic_to_checkpoint(self, topic: str) -> None: ) def _is_token_authentication_configured(self) -> bool: - if self.config.token is not None: - return True - return False + return self.config.token is not None def _is_oauth_authentication_configured(self) -> bool: - if self.config.issuer_url is not None: - return True - return False + return self.config.issuer_url is not None def _get_schema_and_fields( self, pulsar_topic: PulsarTopic, is_key_schema: bool @@ -421,11 +408,8 @@ def _get_schema_and_fields( pulsar_schema: Optional[PulsarSchema] = None - schema_url = self.base_url + "/schemas/%s/%s/%s/schema" % ( - pulsar_topic.tenant, - pulsar_topic.namespace, - pulsar_topic.topic, - ) + schema_url = self.base_url + f"/schemas/{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}/schema" + schema_payload = self._get_pulsar_metadata(schema_url) @@ -449,7 +433,7 @@ def _get_schema_fields( ) -> List[SchemaField]: # Parse the schema and convert it to SchemaFields. 
fields: List[SchemaField] = [] - if schema.schema_type == "AVRO" or schema.schema_type == "JSON": + if schema.schema_type in ["AVRO", "JSON"]: # Extract fields from schema and get the FQN for the schema fields = schema_util.avro_schema_to_mce_fields( schema.schema_str, is_key_schema=is_key_schema @@ -465,6 +449,7 @@ def _get_schema_metadata( self, pulsar_topic: PulsarTopic, platform_urn: str ) -> Tuple[Optional[PulsarSchema], Optional[SchemaMetadata]]: + # FIXME: Type annotations are not working for this function. schema, fields = self._get_schema_and_fields( pulsar_topic=pulsar_topic, is_key_schema=False ) # type: Tuple[Optional[PulsarSchema], List[SchemaField]] From d4afcb76f14e32641cca260e31ac2734c686787b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:42:24 +1000 Subject: [PATCH 57/88] Update tableau_common.py --- .../src/datahub/ingestion/source/tableau_common.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index ddebc6a437ea4b..90a254e1c872bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -430,8 +430,7 @@ def make_table_urn( # if there are more than 3 tokens, just take the final 3 fully_qualified_table_name = ".".join(fully_qualified_table_name.split(".")[-3:]) - urn = builder.make_dataset_urn(platform, fully_qualified_table_name, env) - return urn + return builder.make_dataset_urn(platform, fully_qualified_table_name, env) def make_description_from_params(description, formula): @@ -448,10 +447,9 @@ def make_description_from_params(description, formula): def get_field_value_in_sheet(field, field_name): if field.get("__typename", "") == "DatasourceField": - field = field.get("remoteField") if field.get("remoteField") else {} + field = field.get("remoteField") or {} - field_value = field.get(field_name, "") - return field_value + return field.get(field_name, "") def get_unique_custom_sql(custom_sql_list: List[dict]) -> List[dict]: @@ -503,6 +501,4 @@ def query_metadata(server, main_query, connection_name, first, offset, qry_filte filter=qry_filter, main_query=main_query, ) - query_result = server.metadata.query(query) - - return query_result + return server.metadata.query(query) From c4b15e95eebd2a45abc95c947d8dba12cc8cce1d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:49:35 +1000 Subject: [PATCH 58/88] Update powerbi.py --- .../src/datahub/ingestion/source/powerbi.py | 131 ++++++------------ 1 file changed, 44 insertions(+), 87 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py index a192840cd5dd68..f9df9a802a3719 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py @@ -555,7 +555,7 @@ def get_data_source(self, dataset: Dataset) -> Any: DATASET_ID=dataset.id, ) # Hit PowerBi - LOGGER.info("Request to datasource URL={}".format(datasource_get_endpoint)) + LOGGER.info(f"Request to datasource URL={datasource_get_endpoint}") response = requests.get( url=datasource_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -565,18 +565,15 @@ def get_data_source(self, dataset: Dataset) -> Any: if response.status_code != 200: message: str = "Failed to fetch datasource from power-bi for" 
LOGGER.warning(message) - LOGGER.warning("{}={}".format(Constant.WorkspaceId, dataset.workspace_id)) - LOGGER.warning("{}={}".format(Constant.DatasetId, dataset.id)) + LOGGER.warning(f"{Constant.WorkspaceId}={dataset.workspace_id}") + LOGGER.warning(f"{Constant.DatasetId}={dataset.id}") raise ConnectionError(message) res = response.json() value = res["value"] if len(value) == 0: - LOGGER.info( - "datasource is not found for dataset {}({})".format( - dataset.name, dataset.id - ) - ) + LOGGER.info(f"datasource is not found for dataset {dataset.name}({dataset.id})") + return None # Consider only zero index datasource datasource_dict = value[0] @@ -642,13 +639,8 @@ def new_dataset_or_report(tile_instance: Any) -> dict: else: report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.VISUALIZATION - LOGGER.info( - "Tile {}({}) is created from {}".format( - tile_instance.get("title"), - tile_instance.get("id"), - report_fields["createdFrom"], - ) - ) + LOGGER.info(f'Tile {tile_instance.get("title")}({tile_instance.get("id")}) is created from {report_fields["createdFrom"]}') + return report_fields @@ -698,7 +690,7 @@ def get_workspace(self, workspace_id: str) -> Workspace: POWERBI_ADMIN_BASE_URL=self.__config.admin_base_url ) - def create_scan_job(): + def create_scan_job(): # sourcery skip: avoid-builtin-shadow """ Create scan job on PowerBi for the workspace """ @@ -718,9 +710,8 @@ def create_scan_job(): ) if res.status_code not in (200, 202): - message = "API({}) return error code {} for workpace id({})".format( - scan_create_endpoint, res.status_code, workspace_id - ) + message = f"API({scan_create_endpoint}) return error code {res.status_code} for workspace id({workspace_id})" + LOGGER.warning(message) @@ -736,47 +727,40 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Boolean: """ minimum_sleep = 3 if timeout < minimum_sleep: - LOGGER.info( - "Setting timeout to minimum_sleep time {} seconds".format( - minimum_sleep - ) - ) + LOGGER.info(f"Setting timeout to minimum_sleep time {minimum_sleep} seconds") timeout = minimum_sleep - max_trial = int(timeout / minimum_sleep) - LOGGER.info("Max trial {}".format(max_trial)) + max_trial = timeout // minimum_sleep + LOGGER.info(f"Max trial {max_trial}") scan_get_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_GET] scan_get_endpoint = scan_get_endpoint.format( POWERBI_ADMIN_BASE_URL=self.__config.admin_base_url, SCAN_ID=scan_id ) - LOGGER.info("Hitting URL={}".format(scan_get_endpoint)) + LOGGER.info(f"Hitting URL={scan_get_endpoint}") trail = 1 while True: - LOGGER.info("Trial = {}".format(trail)) + LOGGER.info(f"Trial = {trail}") res = requests.get( scan_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, ) if res.status_code != 200: - message = "API({}) return error code {} for scan id({})".format( - scan_get_endpoint, res.status_code, scan_id - ) + message = f"API({scan_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" + LOGGER.warning(message) raise ConnectionError(message) if res.json()["status"].upper() == "Succeeded".upper(): - LOGGER.info( - "Scan result is available for scan id({})".format(scan_id) - ) + LOGGER.info(f"Scan result is available for scan id({scan_id})") return True if trail == max_trial: break - LOGGER.info("Sleeping for {} seconds".format(minimum_sleep)) + LOGGER.info(f"Sleeping for {minimum_sleep} seconds") sleep(minimum_sleep) trail += 1 @@ -785,7 +769,7 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Boolean: def get_scan_result(scan_id: str) -> 
dict: LOGGER.info("Fetching scan result") - LOGGER.info("{}={}".format(Constant.SCAN_ID, scan_id)) + LOGGER.info(f"{Constant.SCAN_ID}={scan_id}") scan_result_get_endpoint = PowerBiAPI.API_ENDPOINTS[ Constant.SCAN_RESULT_GET ] @@ -793,15 +777,14 @@ def get_scan_result(scan_id: str) -> dict: POWERBI_ADMIN_BASE_URL=self.__config.admin_base_url, SCAN_ID=scan_id ) - LOGGER.info("Hittin URL={}".format(scan_result_get_endpoint)) + LOGGER.info(f"Hitting URL={scan_result_get_endpoint}") res = requests.get( scan_result_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, ) if res.status_code != 200: - message = "API({}) return error code {} for scan id({})".format( - scan_result_get_endpoint, res.status_code, scan_id - ) + message = f"API({scan_result_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" + LOGGER.warning(message) @@ -817,11 +800,8 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_map: dict = {} if datasets is None or len(datasets) == 0: - LOGGER.warning( - "Workspace {}({}) does not have datasets".format( - scan_result["name"], scan_result["id"] - ) - ) + LOGGER.warning(f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets') + LOGGER.info("Returning empty datasets") return dataset_map @@ -840,19 +820,13 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_instance.datasource and dataset_instance.datasource.metadata.is_relational is True ): - LOGGER.info( - "Processing tables attribute for dataset {}({})".format( - dataset_instance.name, dataset_instance.id - ) - ) + LOGGER.info(f"Processing tables attribute for dataset {dataset_instance.name}({dataset_instance.id})") + for table in dataset_dict["tables"]: if "Value.NativeQuery(" in table["source"][0]["expression"]: - LOGGER.warning( - "Table {} is created from Custom SQL. Ignoring in processing".format( - table["name"] - ) - ) + LOGGER.warning(f'Table {table["name"]} is created from Custom SQL. Ignoring in processing') + continue # PowerBi table name contains schema name and table name. 
Format is @@ -972,29 +946,18 @@ def __to_datahub_dataset( dataset.datasource is None or dataset.datasource.metadata.is_relational is False ): - LOGGER.warning( - "Dataset {}({}) is not created from relational datasource".format( - dataset.name, dataset.id - ) - ) + LOGGER.warning(f"Dataset {dataset.name}({dataset.id}) is not created from relational datasource") + return dataset_mcps - LOGGER.info( - "Converting dataset={}(id={}) to datahub dataset".format( - dataset.name, dataset.id - ) - ) + LOGGER.info(f"Converting dataset={dataset.name}(id={dataset.id}) to datahub dataset") + for table in dataset.tables: # Create an URN for dataset - ds_urn = builder.make_dataset_urn( - platform=self.__config.dataset_type_mapping[dataset.datasource.type], - name="{}.{}.{}".format( - dataset.datasource.database, table.schema_name, table.name - ), - env=self.__config.env, - ) - LOGGER.info("{}={}".format(Constant.Dataset_URN, ds_urn)) + ds_urn = builder.make_dataset_urn(platform=self.__config.dataset_type_mapping[dataset.datasource.type], name=f"{dataset.datasource.database}.{table.schema_name}.{table.name}", env=self.__config.env) + + LOGGER.info(f"{Constant.Dataset_URN}={ds_urn}") # Create datasetProperties mcp ds_properties = DatasetPropertiesClass(description=table.name) @@ -1202,11 +1165,8 @@ def to_datahub_user( Map PowerBi user to datahub user """ - LOGGER.info( - "Converting user {}(id={}) to datahub's user".format( - user.displayName, user.id - ) - ) + LOGGER.info(f"Converting user {user.displayName}(id={user.id}) to datahub's user") + # Create an URN for user user_urn = builder.make_user_urn(user.get_urn_part()) @@ -1263,10 +1223,10 @@ def to_datahub_chart( chart_mcps = [] # Return empty list if input list is empty - if len(tiles) == 0: + if not tiles: return [], [] - LOGGER.info("Converting tiles(count={}) to charts".format(len(tiles))) + LOGGER.info(f"Converting tiles(count={len(tiles)}) to charts") for tile in tiles: if tile is None: @@ -1288,9 +1248,8 @@ def to_datahub_work_units( ) -> Set[EquableMetadataWorkUnit]: mcps = [] - LOGGER.info( - "Converting dashboard={} to datahub dashboard".format(dashboard.displayName) - ) + LOGGER.info(f"Converting dashboard={dashboard.displayName} to datahub dashboard") + # Convert user to CorpUser user_mcps = self.to_datahub_users(dashboard.users) @@ -1388,12 +1347,10 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: self.reporter.report_dashboards_scanned() self.reporter.report_charts_scanned(count=len(dashboard.tiles)) except Exception as e: - message = "Error ({}) occurred while loading dashboard {}(id={}) tiles.".format( - e, dashboard.displayName, dashboard.id - ) + message = f"Error ({e}) occurred while loading dashboard {dashboard.displayName}(id={dashboard.id}) tiles." 
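# Illustrative sketch of the URN construction shown in __to_datahub_dataset
# above, assuming the acryl-datahub package is installed; the platform and the
# database/schema/table values are invented for the example.
import datahub.emitter.mce_builder as builder

database, schema_name, table_name = "SalesDB", "dbo", "orders"
ds_urn = builder.make_dataset_urn(
    platform="mssql",
    name=f"{database}.{schema_name}.{table_name}",
    env="PROD",
)
# ds_urn == "urn:li:dataset:(urn:li:dataPlatform:mssql,SalesDB.dbo.orders,PROD)"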
+ LOGGER.exception(message, e) self.reporter.report_warning(dashboard.id, message) - # Convert PowerBi Dashboard and child entities to Datahub work unit to ingest into Datahub workunits = self.mapper.to_datahub_work_units(dashboard) for workunit in workunits: From 3cfd95a510a61812a941ccb9fccf5b07e9fe3d4c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:49:37 +1000 Subject: [PATCH 59/88] Update openapi_parser.py --- .../ingestion/source/openapi_parser.py | 52 ++++++++----------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py index 830b6562755eb7..233d920f6877ef 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -20,9 +20,9 @@ def flatten(d: dict, prefix: str = "") -> Generator: for k, v in d.items(): if isinstance(v, dict): - yield from flatten(v, prefix + "." + k) + yield from flatten(v, f"{prefix}.{k}") else: - yield (prefix + "-" + k).strip(".") + yield f"{prefix}-{k}".strip(".") def flatten2list(d: dict) -> list: @@ -53,15 +53,13 @@ def request_call( headers = {"accept": "application/json"} if username is not None and password is not None: - response = requests.get( - url, headers=headers, auth=HTTPBasicAuth(username, password) - ) + return requests.get(url, headers=headers, auth=HTTPBasicAuth(username, password)) + elif token is not None: - headers["Authorization"] = "Bearer " + token - response = requests.get(url, headers=headers) + headers["Authorization"] = f"Bearer {token}" + return requests.get(url, headers=headers) else: - response = requests.get(url, headers=headers) - return response + return requests.get(url, headers=headers) def get_swag_json( @@ -77,14 +75,13 @@ def get_swag_json( else: response = request_call(url=tot_url, username=username, password=password) - if response.status_code == 200: - try: - dict_data = json.loads(response.content) - except json.JSONDecodeError: # it's not a JSON! - dict_data = yaml.safe_load(response.content) - return dict_data - else: + if response.status_code != 200: raise Exception(f"Unable to retrieve {tot_url}, error {response.status_code}") + try: + dict_data = json.loads(response.content) + except json.JSONDecodeError: # it's not a JSON! 
+ dict_data = yaml.safe_load(response.content) + return dict_data def get_url_basepath(sw_dict: dict) -> str: @@ -95,7 +92,7 @@ def get_url_basepath(sw_dict: dict) -> str: def check_sw_version(sw_dict: dict) -> None: - if "swagger" in sw_dict.keys(): + if "swagger" in sw_dict: v_split = sw_dict["swagger"].split(".") else: v_split = sw_dict["openapi"].split(".") @@ -108,7 +105,7 @@ def check_sw_version(sw_dict: dict) -> None: ) -def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 +def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 """ Get all the URLs accepting the "GET" method, together with their description and the tags """ @@ -176,8 +173,7 @@ def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 if "parameters" in p_o["get"].keys(): url_details[p_k]["parameters"] = p_o["get"]["parameters"] - ord_d = dict(sorted(url_details.items())) # sorting for convenience - return ord_d + return dict(sorted(url_details.items())) def guessing_url_name(url: str, examples: dict) -> str: @@ -187,10 +183,7 @@ def guessing_url_name(url: str, examples: dict) -> str: extr_data = {"advancedcomputersearches": {'id': 202, 'name': '_unmanaged'}} -->> guessed_url = /advancedcomputersearches/name/_unmanaged/id/202' """ - if url[0] == "/": - url2op = url[1:] # operational url does not need the very first / - else: - url2op = url + url2op = url[1:] if url[0] == "/" else url divisions = url2op.split("/") # the very first part of the url should stay the same. @@ -211,14 +204,14 @@ def guessing_url_name(url: str, examples: dict) -> str: if div_pos > 0: root = root[: div_pos - 1] # like "base/field" should become "base" - if root in examples.keys(): + if root in examples: # if our root is contained in our samples examples... ex2use = root - elif root[:-1] in examples.keys(): + elif root[:-1] in examples: ex2use = root[:-1] - elif root.replace("/", ".") in examples.keys(): + elif root.replace("/", ".") in examples: ex2use = root.replace("/", ".") - elif root[:-1].replace("/", ".") in examples.keys(): + elif root[:-1].replace("/", ".") in examples: ex2use = root[:-1].replace("/", ".") else: return url @@ -277,8 +270,7 @@ def try_guessing(url: str, examples: dict) -> str: Any non-guessed name will stay as it was (with parenthesis{}) """ url_guess = guessing_url_name(url, examples) # try to fill with known informations - url_guess_id = maybe_theres_simple_id(url_guess) # try to fill IDs with "1"s... - return url_guess_id + return maybe_theres_simple_id(url_guess) def clean_url(url: str) -> str: From 99823bbfba36599e235091c38f8a9cc6906ae30e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:49:40 +1000 Subject: [PATCH 60/88] Update openapi.py --- metadata-ingestion/src/datahub/ingestion/source/openapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index b71cb363b96e46..9548677e1cdc11 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -118,7 +118,7 @@ class ApiWorkUnit(MetadataWorkUnit): class APISource(Source, ABC): """ - This plugin is meant to gather dataset-like informations about OpenApi Endpoints. + This plugin is meant to gather dataset-like information about OpenApi Endpoints. 
As example, if by calling GET at the endpoint at `https://test_endpoint.com/api/users/` you obtain as result: ```JSON From 1a85c8fed56ff4351f616198dd01d860a05cd5f9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:49:42 +1000 Subject: [PATCH 61/88] Update nifi.py --- metadata-ingestion/src/datahub/ingestion/source/nifi.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index bb8ac443555252..20cd1daa6671e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -337,10 +337,7 @@ def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None: if self.config.site_url_to_site_name is None: self.config.site_url_to_site_name = {} - if ( - not urljoin(self.config.site_url, "/nifi/") - in self.config.site_url_to_site_name - ): + if urljoin(self.config.site_url, "/nifi/") not in self.config.site_url_to_site_name: self.config.site_url_to_site_name[ urljoin(self.config.site_url, "/nifi/") ] = self.config.site_name @@ -774,7 +771,7 @@ def construct_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 rootpg = self.nifi_flow.root_process_group flow_name = rootpg.name # self.config.site_name flow_urn = builder.make_data_flow_urn(NIFI, rootpg.id, self.config.env) - flow_properties = dict() + flow_properties = {} if self.nifi_flow.clustered is not None: flow_properties["clustered"] = str(self.nifi_flow.clustered) if self.nifi_flow.version is not None: From 748d1155e6ab68cca1e51accc0e59374ca3b302d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:53:02 +1000 Subject: [PATCH 62/88] Update mongodb.py --- metadata-ingestion/src/datahub/ingestion/source/mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 8d6201867dd8b3..9710fe023a2560 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -172,9 +172,9 @@ def construct_schema_pymongo( maximum size of the document that will be considered for generating the schema. """ - doc_size_field = "temporary_doc_size_field" aggregations: List[Dict] = [] if is_version_gte_4_4: + doc_size_field = "temporary_doc_size_field" # create a temporary field to store the size of the document. filter on it and then remove it. 
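# Illustrative sketch of the kind of pipeline the comment above describes,
# assuming pymongo and MongoDB >= 4.4 (required for $bsonSize). The collection
# coordinates, size cut-off and the $match/$unset stages are assumptions for
# the example, not the exact stages used by the connector.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
collection = client["test_db"]["test_collection"]

max_document_size = 16 * 1024 * 1024  # hypothetical byte limit
pipeline = [
    {"$addFields": {"temporary_doc_size_field": {"$bsonSize": "$$ROOT"}}},
    {"$match": {"temporary_doc_size_field": {"$lte": max_document_size}}},
    {"$unset": "temporary_doc_size_field"},
]
sample_docs = list(collection.aggregate(pipeline))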
aggregations = [ {"$addFields": {doc_size_field: {"$bsonSize": "$$ROOT"}}}, From 23e43294ae4612cb86d2f8307d8e4872ae6518c8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 22:53:04 +1000 Subject: [PATCH 63/88] Update metabase.py --- .../src/datahub/ingestion/source/metabase.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index 93308ff93b3226..a4873e1bd08633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -42,6 +42,7 @@ OwnershipTypeClass, ) from datahub.utilities import config_clean +from datetime import timezone class MetabaseConfig(DatasetLineageProviderConfigBase): @@ -199,7 +200,7 @@ def get_timestamp_millis_from_ts_string(ts_str: str) -> int: try: return int(dp.parse(ts_str).timestamp() * 1000) except (dp.ParserError, OverflowError): - return int(datetime.utcnow().timestamp() * 1000) + return int(datetime.now(timezone.utc).timestamp() * 1000) def construct_dashboard_from_api_data( self, dashboard_info: dict @@ -448,9 +449,7 @@ def get_datasource_urn(self, card_details): if source_table_id is not None: schema_name, table_name = self.get_source_table_from_id(source_table_id) if table_name: - source_paths.add( - f"{schema_name + '.' if schema_name else ''}{table_name}" - ) + source_paths.add(f"{f'{schema_name}.' if schema_name else ''}{table_name}") else: try: raw_query = ( @@ -478,7 +477,7 @@ def get_datasource_urn(self, card_details): # Create dataset URNs dataset_urn = [] - dbname = f"{database_name + '.' if database_name else ''}" + dbname = f"{f'{database_name}.' if database_name else ''}" source_tables = list(map(lambda tbl: f"{dbname}{tbl}", source_paths)) dataset_urn = [ builder.make_dataset_urn_with_platform_instance( From 82509c7504cd109a049ba02c0c288ea4e7230287 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 23:02:24 +1000 Subject: [PATCH 64/88] Update lookml.py --- .../src/datahub/ingestion/source/lookml.py | 57 +++++++------------ 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py index 2370a76aa28a00..2c8e23709d1ee1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py +++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py @@ -130,18 +130,17 @@ def from_looker_connection( ".*": _get_generic_definition, } - if looker_connection.dialect_name is not None: - for extractor_pattern, extracting_function in extractors.items(): - if re.match(extractor_pattern, looker_connection.dialect_name): - (platform, db, schema) = extracting_function(looker_connection) - return cls(platform=platform, default_db=db, default_schema=schema) - raise ConfigurationError( - f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}" - ) - else: + if looker_connection.dialect_name is None: raise ConfigurationError( f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions." 
) + for extractor_pattern, extracting_function in extractors.items(): + if re.match(extractor_pattern, looker_connection.dialect_name): + (platform, db, schema) = extracting_function(looker_connection) + return cls(platform=platform, default_db=db, default_schema=schema) + raise ConfigurationError( + f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}" + ) class LookMLSourceConfig(LookerCommonConfig): @@ -591,7 +590,7 @@ def from_looker_dict( if sql_table_name is not None else None ) - derived_table = looker_view.get("derived_table", None) + derived_table = looker_view.get("derived_table") dimensions = cls._get_fields( looker_view.get("dimensions", []), ViewFieldType.DIMENSION @@ -605,7 +604,7 @@ def from_looker_dict( fields: List[ViewField] = dimensions + dimension_groups + measures # also store the view logic and materialization - view_logic = looker_viewfile.raw_file_content[0:max_file_snippet_length] + view_logic = looker_viewfile.raw_file_content[:max_file_snippet_length] # Parse SQL from derived tables to extract dependencies if derived_table is not None: @@ -630,9 +629,7 @@ def from_looker_dict( if k in ["datagroup_trigger", "sql_trigger_value", "persist_for"]: materialized = True if "materialized_view" in derived_table: - materialized = ( - True if derived_table["materialized_view"] == "yes" else False - ) + materialized = derived_table["materialized_view"] == "yes" view_details = ViewProperties( materialized=materialized, viewLogic=view_logic, viewLanguage=view_lang @@ -654,14 +651,7 @@ def from_looker_dict( # If not a derived table, then this view essentially wraps an existing # object in the database. - if sql_table_name is not None: - # If sql_table_name is set, there is a single dependency in the view, on the sql_table_name. - sql_table_names = [sql_table_name] - else: - # Otherwise, default to the view name as per the docs: - # https://docs.looker.com/reference/view-params/sql_table_name-for-view - sql_table_names = [view_name] - + sql_table_names = [view_name] if sql_table_name is None else [sql_table_name] output_looker_view = LookerView( id=LookerViewId( project_name=project_name, model_name=model_name, view_name=view_name @@ -705,7 +695,7 @@ def _extract_metadata_from_sql_query( # Add those in if we detect that it is missing if not re.search(r"SELECT\s", sql_query, flags=re.I): # add a SELECT clause at the beginning - sql_query = "SELECT " + sql_query + sql_query = f"SELECT {sql_query}" if not re.search(r"FROM\s", sql_query, flags=re.I): # add a FROM clause at the end sql_query = f"{sql_query} FROM {sql_table_name if sql_table_name is not None else view_name}" @@ -714,7 +704,7 @@ def _extract_metadata_from_sql_query( sql_info = cls._get_sql_info(sql_query, sql_parser_path) sql_table_names = sql_info.table_names column_names = sql_info.column_names - if fields == []: + if not fields: # it seems like the view is defined purely as sql, let's try using the column names to populate the schema fields = [ # set types to unknown for now as our sql parser doesn't give us column types yet @@ -722,10 +712,7 @@ def _extract_metadata_from_sql_query( for c in column_names ] except Exception as e: - reporter.report_warning( - f"looker-view-{view_name}", - f"Failed to parse sql query, lineage will not be accurate. Exception: {e}", - ) + reporter.report_warning(f"looker-view-{view_name}", f"Failed to parse sql query, lineage will not be accurate. 
Exception: {e}") return fields, sql_table_names @@ -843,10 +830,7 @@ def _load_model(self, path: str) -> LookerModel: return looker_model def _platform_names_have_2_parts(self, platform: str) -> bool: - if platform in ["hive", "mysql", "athena"]: - return True - else: - return False + return platform in {"hive", "mysql", "athena"} def _generate_fully_qualified_name( self, sql_table_name: str, connection_def: LookerConnectionDefinition @@ -998,7 +982,6 @@ def _get_custom_properties(self, looker_view: LookerView) -> DatasetPropertiesCl def _build_dataset_mcps( self, looker_view: LookerView ) -> List[MetadataChangeProposalWrapper]: - events = [] subTypeEvent = MetadataChangeProposalWrapper( entityType="dataset", changeType=ChangeTypeClass.UPSERT, @@ -1006,7 +989,7 @@ def _build_dataset_mcps( aspectName="subTypes", aspect=SubTypesClass(typeNames=["view"]), ) - events.append(subTypeEvent) + events = [subTypeEvent] if looker_view.view_details is not None: viewEvent = MetadataChangeProposalWrapper( entityType="dataset", @@ -1047,9 +1030,7 @@ def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent: dataset_snapshot.aspects.append(schema_metadata) dataset_snapshot.aspects.append(self._get_custom_properties(looker_view)) - mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot) - - return mce + return MetadataChangeEvent(proposedSnapshot=dataset_snapshot) def get_project_name(self, model_name: str) -> str: if self.source_config.project_name is not None: @@ -1091,7 +1072,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 for file_path in model_files: self.reporter.report_models_scanned() - model_name = file_path.stem[0:-model_suffix_len] + model_name = file_path.stem[:-model_suffix_len] if not self.source_config.model_pattern.allowed(model_name): self.reporter.report_models_dropped(model_name) From 3245d7b91b5a1ecbf0e086640bd449c1f25c7bb0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 23:02:26 +1000 Subject: [PATCH 65/88] Update looker_common.py --- .../datahub/ingestion/source/looker_common.py | 40 ++++++------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py index 7f717d0efc82d0..58cf0674267d76 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py @@ -110,12 +110,8 @@ class LookerExploreNamingConfig(ConfigModel): def init_naming_pattern(cls, v): if isinstance(v, NamingPattern): return v - else: - assert isinstance(v, str), "pattern must be a string" - naming_pattern = NamingPattern( - allowed_vars=naming_pattern_variables, pattern=v - ) - return naming_pattern + assert isinstance(v, str), "pattern must be a string" + return NamingPattern(allowed_vars=naming_pattern_variables, pattern=v) @validator("explore_naming_pattern", "explore_browse_pattern", always=True) def validate_naming_pattern(cls, v): @@ -143,12 +139,8 @@ class LookerViewNamingConfig(ConfigModel): def init_naming_pattern(cls, v): if isinstance(v, NamingPattern): return v - else: - assert isinstance(v, str), "pattern must be a string" - naming_pattern = NamingPattern( - allowed_vars=naming_pattern_variables, pattern=v - ) - return naming_pattern + assert isinstance(v, str), "pattern must be a string" + return NamingPattern(allowed_vars=naming_pattern_variables, pattern=v) @validator("view_naming_pattern", "view_browse_pattern", 
always=True) def validate_naming_pattern(cls, v): @@ -314,8 +306,7 @@ def _extract_view_from_field(field: str) -> str: assert ( field.count(".") == 1 ), f"Error: A field must be prefixed by a view name, field is: {field}" - view_name = field.split(".")[0] - return view_name + return field.split(".")[0] @staticmethod def _get_field_type( @@ -336,8 +327,7 @@ def _get_field_type( ) type_class = NullTypeClass - data_type = SchemaFieldDataType(type=type_class()) - return data_type + return SchemaFieldDataType(type=type_class()) @staticmethod def _get_schema( @@ -346,7 +336,7 @@ def _get_schema( view_fields: List[ViewField], reporter: SourceReport, ) -> Optional[SchemaMetadataClass]: - if view_fields == []: + if not view_fields: return None fields, primary_keys = LookerUtil._get_fields_and_primary_keys( view_fields=view_fields, reporter=reporter @@ -618,16 +608,10 @@ def from_api( # noqa: C901 source_file=explore.source_file, ) except SDKError as e: - logger.warn( - "Failed to extract explore {} from model {}.".format( - explore_name, model - ) - ) - logger.debug( - "Failed to extract explore {} from model {} with {}".format( - explore_name, model, e - ) - ) + logger.warn(f"Failed to extract explore {explore_name} from model {model}.") + logger.debug(f"Failed to extract explore {explore_name} from model {model} with {e}") + + except AssertionError: reporter.report_warning( key="chart-", @@ -678,7 +662,7 @@ def _get_url(self, base_url): # If the base_url contains a port number (like https://company.looker.com:19999) remove the port number m = re.match("^(.*):([0-9]+)$", base_url) if m is not None: - base_url = m.group(1) + base_url = m[1] return f"{base_url}/explore/{self.model_name}/{self.name}" def _to_metadata_events( # noqa: C901 From 232dbdac550461027b97c204b21b9fd5754008af Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 23:02:29 +1000 Subject: [PATCH 66/88] Update looker.py --- .../src/datahub/ingestion/source/looker.py | 52 ++++++++----------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker.py b/metadata-ingestion/src/datahub/ingestion/source/looker.py index 252d9f553e0572..2afa4ca05cbb36 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker.py @@ -216,11 +216,11 @@ def url(self, base_url: str) -> str: # If the base_url contains a port number (like https://company.looker.com:19999) remove the port number m = re.match("^(.*):([0-9]+)$", base_url) if m is not None: - base_url = m.group(1) + base_url = m[1] if self.look_id is not None: - return base_url + "/looks/" + self.look_id + return f"{base_url}/looks/{self.look_id}" else: - return base_url + "/x/" + self.query_slug + return f"{base_url}/x/{self.query_slug}" def get_urn_element_id(self): # A dashboard element can use a look or just a raw query against an explore @@ -270,23 +270,22 @@ def __init__(self, client: Looker31SDK): def get_by_id( self, id: int, transport_options: Optional[TransportOptions] ) -> Optional[LookerUser]: - logger.debug("Will get user {}".format(id)) + logger.debug(f"Will get user {id}") if id in self.user_map: return self.user_map[id] - else: - try: - raw_user: User = self.client.user( - id, - fields=self.fields, - transport_options=transport_options, - ) - looker_user = LookerUser._from_user(raw_user) - self.user_map[id] = looker_user - return looker_user - except SDKError as e: - logger.warn("Could not find user with id {}".format(id)) - 
logger.warn("Failure was {}".format(e)) - return None + try: + raw_user: User = self.client.user( + id, + fields=self.fields, + transport_options=transport_options, + ) + looker_user = LookerUser._from_user(raw_user) + self.user_map[id] = looker_user + return looker_user + except SDKError as e: + logger.warn(f"Could not find user with id {id}") + logger.warn(f"Failure was {e}") + return None @dataclass @@ -306,8 +305,8 @@ def url(self, base_url): # If the base_url contains a port number (like https://company.looker.com:19999) remove the port number m = re.match("^(.*):([0-9]+)$", base_url) if m is not None: - base_url = m.group(1) - return base_url + "/dashboards/" + self.id + base_url = m[1] + return f"{base_url}/dashboards/{self.id}" def get_urn_dashboard_id(self): return f"dashboards.{self.id}" @@ -350,8 +349,7 @@ def _extract_view_from_field(field: str) -> str: assert ( field.count(".") == 1 ), f"Error: A field must be prefixed by a view name, field is: {field}" - view_name = field.split(".")[0] - return view_name + return field.split(".")[0] def _get_views_from_fields(self, fields: List[str]) -> List[str]: field_set = set(fields) @@ -449,12 +447,8 @@ def _get_looker_dashboard_element( # noqa: C901 raise ValueError("Element ID can't be None") if element.query is not None: - explores = [] fields = self._get_fields_from_query(element.query) - if element.query.view is not None: - # Get the explore from the view directly - explores = [element.query.view] - + explores = [element.query.view] if element.query.view is not None else [] logger.debug( "Element {}: Explores added: {}".format(element.title, explores) ) @@ -474,7 +468,6 @@ def _get_looker_dashboard_element( # noqa: C901 upstream_fields=fields, ) - # Dashboard elements can *alternatively* link to an existing look elif element.look is not None: # we pick from element title by default, falling back to look title. 
title: str = ( @@ -512,7 +505,6 @@ def _get_looker_dashboard_element( # noqa: C901 upstream_fields=fields, ) - # Failing the above two approaches, pick out details from result_maker elif element.result_maker is not None: model: str = "" fields = [] @@ -957,7 +949,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: else False, ) else: - raise Exception("Unexpected type of event {}".format(event)) + raise Exception(f"Unexpected type of event {event}") self.reporter.report_workunit(workunit) yield workunit From bc6e2b03bdc204693901846798ee1c77a8e6e03c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 23:02:31 +1000 Subject: [PATCH 67/88] Update feast.py --- .../src/datahub/ingestion/source/feast.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index c17ae5c14a85fd..0c67e2d036083f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -4,23 +4,22 @@ from pydantic import Field -if sys.version_info >= (3, 7): - from feast import ( - BigQuerySource, - Entity, - Feature, - FeatureStore, - FeatureView, - FileSource, - KafkaSource, - KinesisSource, - OnDemandFeatureView, - ValueType, - ) - from feast.data_source import DataSource, RequestDataSource -else: +if sys.version_info < (3, 7): raise ModuleNotFoundError("The feast plugin requires Python 3.7 or newer.") +from feast import ( + BigQuerySource, + Entity, + Feature, + FeatureStore, + FeatureView, + FileSource, + KafkaSource, + KinesisSource, + OnDemandFeatureView, + ValueType, +) +from feast.data_source import DataSource, RequestDataSource import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import DEFAULT_ENV @@ -52,6 +51,7 @@ assert sys.version_info >= (3, 7) # needed for mypy +# FIXME: ValueType module cannot be used as a type _field_type_mapping: Dict[ValueType, str] = { ValueType.UNKNOWN: MLFeatureDataType.UNKNOWN, ValueType.BYTES: MLFeatureDataType.BYTE, @@ -218,6 +218,7 @@ def _get_entity_workunit( def _get_feature_workunit( self, + # FIXME: FeatureView and OnDemandFeatureView cannot be used as a type feature_view: Union[FeatureView, OnDemandFeatureView], feature: Feature, ) -> MetadataWorkUnit: From 77609ee991e216d2490d86dd57386c9598b0272b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 8 Jun 2022 23:02:33 +1000 Subject: [PATCH 68/88] Update elastic_search.py --- .../src/datahub/ingestion/source/elastic_search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py index c5a6c13bf695df..74fca0d2654519 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py +++ b/metadata-ingestion/src/datahub/ingestion/source/elastic_search.py @@ -255,9 +255,7 @@ def host_colon_port_comma(cls, host_val: str) -> str: @property def http_auth(self) -> Optional[Tuple[str, str]]: - if self.username is None: - return None - return self.username, self.password or "" + return None if self.username is None else (self.username, self.password or "") @platform_name("Elastic Search") From 0dedbc881e5a4dc0ace715cb5e93d4b7deb682fa Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 9 Jun 2022 14:39:26 +1000 Subject: [PATCH 69/88] Running black --- 
.../src/datahub/cli/cli_utils.py | 5 +- .../src/datahub/cli/delete_cli.py | 4 +- .../src/datahub/cli/docker_check.py | 4 +- .../src/datahub/cli/timeline_cli.py | 18 ++++-- .../src/datahub/configuration/common.py | 13 ++++- .../src/datahub/emitter/mcp_builder.py | 3 +- .../datahub/emitter/serialization_helper.py | 5 +- .../src/datahub/ingestion/api/decorators.py | 6 +- .../src/datahub/ingestion/run/pipeline.py | 4 +- .../datahub/ingestion/sink/datahub_kafka.py | 4 +- .../datahub/ingestion/sink/datahub_rest.py | 4 +- .../datahub/ingestion/source/looker_common.py | 5 +- .../src/datahub/ingestion/source/lookml.py | 5 +- .../src/datahub/ingestion/source/metabase.py | 4 +- .../src/datahub/ingestion/source/nifi.py | 5 +- .../ingestion/source/openapi_parser.py | 6 +- .../src/datahub/ingestion/source/powerbi.py | 58 ++++++++++++------- .../src/datahub/ingestion/source/pulsar.py | 28 ++++++--- .../src/datahub/ingestion/source/redash.py | 24 ++++++-- .../src/datahub/ingestion/source/tableau.py | 20 ++++++- .../ingestion/source_config/sql/snowflake.py | 5 +- .../ingestion/transformer/base_transformer.py | 4 +- .../integrations/great_expectations/action.py | 5 +- .../datahub/utilities/hive_schema_to_avro.py | 24 ++++++-- .../src/datahub/utilities/mapping.py | 2 +- .../utilities/sql_lineage_parser_impl.py | 4 +- .../src/datahub/utilities/urns/urn.py | 10 +++- 27 files changed, 200 insertions(+), 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index 84ee3aa8513a31..d234beb2dafbfe 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -703,8 +703,9 @@ def get_aspects_for_entity( aspect_value["aspect"]["value"] = json.loads( aspect_value["aspect"]["value"] ) - aspect_list[aspect_cls.RECORD_SCHEMA.fullname.replace("pegasus2avro.", "")] = aspect_value - + aspect_list[ + aspect_cls.RECORD_SCHEMA.fullname.replace("pegasus2avro.", "") + ] = aspect_value aspect_map: Dict[str, Union[dict, DictWrapper]] = {} for a in aspect_list.values(): diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index ae8574b89c9695..c4ff2b5e6f9361 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -315,7 +315,9 @@ def _delete_one_urn( deletion_result.num_records = rows_affected else: logger.info(f"[Dry-run] Would hard-delete {urn}") - deletion_result.num_records = UNKNOWN_NUM_RECORDS # since we don't know how many rows will be affected + deletion_result.num_records = ( + UNKNOWN_NUM_RECORDS # since we don't know how many rows will be affected + ) deletion_result.end() return deletion_result diff --git a/metadata-ingestion/src/datahub/cli/docker_check.py b/metadata-ingestion/src/datahub/cli/docker_check.py index 005651d673df36..25719cef2334d9 100644 --- a/metadata-ingestion/src/datahub/cli/docker_check.py +++ b/metadata-ingestion/src/datahub/cli/docker_check.py @@ -90,7 +90,9 @@ def check_local_docker_containers(preflight_only: bool = False) -> List[str]: else: existing_containers = {container.name for container in containers} missing_containers = set(REQUIRED_CONTAINERS) - existing_containers - issues.extend(f"{missing} container is not present" for missing in missing_containers) + issues.extend( + f"{missing} container is not present" for missing in missing_containers + ) # Check that the containers are running and healthy. 
for container in containers: diff --git a/metadata-ingestion/src/datahub/cli/timeline_cli.py b/metadata-ingestion/src/datahub/cli/timeline_cli.py index eec753a4af2ba0..516ca7bd7fe7b3 100644 --- a/metadata-ingestion/src/datahub/cli/timeline_cli.py +++ b/metadata-ingestion/src/datahub/cli/timeline_cli.py @@ -23,7 +23,11 @@ def pretty_field_path(field_path: str) -> str: return field_path # breakpoint() # parse schema field - tokens = [t for t in field_path.split(".") if not t.startswith("[") and not t.endswith("]")] + tokens = [ + t + for t in field_path.split(".") + if not t.startswith("[") and not t.endswith("]") + ] return ".".join(tokens) @@ -33,10 +37,10 @@ def pretty_id(id: Optional[str]) -> str: return "" # breakpoint() assert id is not None - if id.startswith("urn:li:datasetField:") or id.startswith( - "urn:li:schemaField:" + if id.startswith("urn:li:datasetField:") or id.startswith("urn:li:schemaField:"): + if schema_field_key := schema_field_urn_to_key( + id.replace("urn:li:datasetField", "urn:li:schemaField") ): - if schema_field_key := schema_field_urn_to_key(id.replace("urn:li:datasetField", "urn:li:schemaField")): assert schema_field_key is not None field_path = schema_field_key.fieldPath @@ -182,7 +186,11 @@ def timeline( change_instant = str( datetime.fromtimestamp(change_txn["timestamp"] // 1000) ) - change_color = "green" if change_txn.get("semVerChange") in ["MINOR", "PATCH"] else "red" + change_color = ( + "green" + if change_txn.get("semVerChange") in ["MINOR", "PATCH"] + else "red" + ) print( f"{colored(change_instant,'cyan')} - {colored(change_txn['semVer'],change_color)}" diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index b9fce56d5ff60b..80f8d717daf7a5 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -136,7 +136,10 @@ def allowed(self, string: str) -> bool: if re.match(deny_pattern, string, self.regex_flags): return False - return any(re.match(allow_pattern, string, self.regex_flags) for allow_pattern in self.allow) + return any( + re.match(allow_pattern, string, self.regex_flags) + for allow_pattern in self.allow + ) def is_fully_specified_allow_list(self) -> bool: """ @@ -145,7 +148,9 @@ def is_fully_specified_allow_list(self) -> bool: pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. 
""" - return all(self.alphabet_pattern.match(allow_pattern) for allow_pattern in self.allow) + return all( + self.alphabet_pattern.match(allow_pattern) for allow_pattern in self.allow + ) def get_allowed_list(self) -> List[str]: """Return the list of allowed strings as a list, after taking into account deny patterns, if possible""" @@ -168,7 +173,9 @@ def all(cls) -> "KeyValuePattern": return KeyValuePattern() def value(self, string: str) -> List[str]: - return next((self.rules[key] for key in self.rules.keys() if re.match(key, string)), []) + return next( + (self.rules[key] for key in self.rules.keys() if re.match(key, string)), [] + ) def matched(self, string: str) -> bool: return any(re.match(key, string) for key in self.rules.keys()) diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 868916fda2c810..055db3c6a4ad61 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -234,7 +234,8 @@ def gen_containers( def add_dataset_to_container( # FIXME: Union requires two or more type arguments - container_key: KeyType, dataset_urn: str + container_key: KeyType, + dataset_urn: str, ) -> Iterable[Union[MetadataWorkUnit]]: container_urn = make_container_urn( guid=container_key.guid(), diff --git a/metadata-ingestion/src/datahub/emitter/serialization_helper.py b/metadata-ingestion/src/datahub/emitter/serialization_helper.py index 958c913698e442..cad4e9dd3270fc 100644 --- a/metadata-ingestion/src/datahub/emitter/serialization_helper.py +++ b/metadata-ingestion/src/datahub/emitter/serialization_helper.py @@ -17,8 +17,9 @@ def _json_transform(obj: Any, from_pattern: str, to_pattern: str) -> Any: return {field: _json_transform(obj[field], from_pattern, to_pattern)} new_obj: Any = { - key: _json_transform(value, from_pattern, to_pattern) \ - for key, value in obj.items() if value is not None + key: _json_transform(value, from_pattern, to_pattern) + for key, value in obj.items() + if value is not None } return new_obj diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index 7666a4f52a2271..20867a8571b24c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -34,7 +34,11 @@ def platform_name( def wrapper(cls: Type) -> Type: setattr(cls, "get_platform_name", lambda: platform_name) - setattr(cls, "get_platform_id", lambda: id or platform_name.lower().replace(" ", "-")) + setattr( + cls, + "get_platform_id", + lambda: id or platform_name.lower().replace(" ", "-"), + ) return cls diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 9c94108ec51309..4d2e02d8022e60 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -264,7 +264,9 @@ def process_commits(self) -> None: if self.source.get_report().failures or self.sink.get_report().failures else False ) - has_warnings: bool = bool(self.source.get_report().warnings or self.sink.get_report().warnings) + has_warnings: bool = bool( + self.source.get_report().warnings or self.sink.get_report().warnings + ) for name, committable in self.ctx.get_committables(): commit_policy: CommitPolicy = committable.commit_policy diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py 
b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py index 20929e85887a77..93d3aa5f6c85d3 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py @@ -77,7 +77,9 @@ def write_record_async( self.report, record_envelope, write_callback ).kafka_callback, ) - elif isinstance(record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass)): + elif isinstance( + record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass) + ): self.emitter.emit_mcp_async( record, callback=_KafkaCallback( diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py index 415a7a1c827da8..d95eb245ccfb50 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_rest.py @@ -113,7 +113,9 @@ def _write_done_callback( # trim exception stacktraces when reporting warnings if "stackTrace" in e.info: with contextlib.suppress(Exception): - e.info["stackTrace"] = "\n".join(e.info["stackTrace"].split("\n")[:2]) + e.info["stackTrace"] = "\n".join( + e.info["stackTrace"].split("\n")[:2] + ) record = record_envelope.record if isinstance(record, MetadataChangeProposalWrapper): # include information about the entity that failed diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py index 49637251375b6e..668d3d3e2d898f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py @@ -624,8 +624,9 @@ def from_api( # noqa: C901 ) except SDKError as e: logger.warn(f"Failed to extract explore {explore_name} from model {model}.") - logger.debug(f"Failed to extract explore {explore_name} from model {model} with {e}") - + logger.debug( + f"Failed to extract explore {explore_name} from model {model} with {e}" + ) except AssertionError: reporter.report_warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py index 571ba1e38359c2..203bf199c27ad5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py +++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py @@ -712,7 +712,10 @@ def _extract_metadata_from_sql_query( for c in column_names ] except Exception as e: - reporter.report_warning(f"looker-view-{view_name}", f"Failed to parse sql query, lineage will not be accurate. Exception: {e}") + reporter.report_warning( + f"looker-view-{view_name}", + f"Failed to parse sql query, lineage will not be accurate. Exception: {e}", + ) return fields, sql_table_names diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index a4873e1bd08633..bd930f90ff3aff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -449,7 +449,9 @@ def get_datasource_urn(self, card_details): if source_table_id is not None: schema_name, table_name = self.get_source_table_from_id(source_table_id) if table_name: - source_paths.add(f"{f'{schema_name}.' if schema_name else ''}{table_name}") + source_paths.add( + f"{f'{schema_name}.' 
if schema_name else ''}{table_name}" + ) else: try: raw_query = ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index 20cd1daa6671e2..e9bc5f0b5daba0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -337,7 +337,10 @@ def __init__(self, config: NifiSourceConfig, ctx: PipelineContext) -> None: if self.config.site_url_to_site_name is None: self.config.site_url_to_site_name = {} - if urljoin(self.config.site_url, "/nifi/") not in self.config.site_url_to_site_name: + if ( + urljoin(self.config.site_url, "/nifi/") + not in self.config.site_url_to_site_name + ): self.config.site_url_to_site_name[ urljoin(self.config.site_url, "/nifi/") ] = self.config.site_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py index 233d920f6877ef..f33654daa15595 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -53,7 +53,9 @@ def request_call( headers = {"accept": "application/json"} if username is not None and password is not None: - return requests.get(url, headers=headers, auth=HTTPBasicAuth(username, password)) + return requests.get( + url, headers=headers, auth=HTTPBasicAuth(username, password) + ) elif token is not None: headers["Authorization"] = f"Bearer {token}" @@ -105,7 +107,7 @@ def check_sw_version(sw_dict: dict) -> None: ) -def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 +def get_endpoints(sw_dict: dict) -> dict: # noqa: C901 """ Get all the URLs accepting the "GET" method, together with their description and the tags """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py index f9df9a802a3719..a4182ab6b6824f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py @@ -346,7 +346,9 @@ def __get_users(self, workspace_id: str, entity: str, id: str) -> List[User]: # Check if we got response from PowerBi if response.status_code != 200: - LOGGER.warning(f"Failed to fetch user list from power-bi for, http_status={response.status_code}, message={response.text}") + LOGGER.warning( + f"Failed to fetch user list from power-bi for, http_status={response.status_code}, message={response.text}" + ) LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") LOGGER.info(f"{Constant.ENTITY}={entity}") @@ -572,7 +574,9 @@ def get_data_source(self, dataset: Dataset) -> Any: res = response.json() value = res["value"] if len(value) == 0: - LOGGER.info(f"datasource is not found for dataset {dataset.name}({dataset.id})") + LOGGER.info( + f"datasource is not found for dataset {dataset.name}({dataset.id})" + ) return None # Consider only zero index datasource @@ -639,8 +643,9 @@ def new_dataset_or_report(tile_instance: Any) -> dict: else: report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.VISUALIZATION - LOGGER.info(f'Tile {tile_instance.get("title")}({tile_instance.get("id")}) is created from {report_fields["createdFrom"]}') - + LOGGER.info( + f'Tile {tile_instance.get("title")}({tile_instance.get("id")}) is created from {report_fields["createdFrom"]}' + ) return report_fields @@ -712,7 +717,6 @@ def create_scan_job(): # sourcery skip: avoid-builtin-shadow if res.status_code not in (200, 202): 
message = f"API({scan_create_endpoint}) return error code {res.status_code} for workspace id({workspace_id})" - LOGGER.warning(message) raise ConnectionError(message) @@ -727,7 +731,9 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Boolean: """ minimum_sleep = 3 if timeout < minimum_sleep: - LOGGER.info(f"Setting timeout to minimum_sleep time {minimum_sleep} seconds") + LOGGER.info( + f"Setting timeout to minimum_sleep time {minimum_sleep} seconds" + ) timeout = minimum_sleep max_trial = timeout // minimum_sleep @@ -749,7 +755,6 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Boolean: if res.status_code != 200: message = f"API({scan_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - LOGGER.warning(message) raise ConnectionError(message) @@ -785,7 +790,6 @@ def get_scan_result(scan_id: str) -> dict: if res.status_code != 200: message = f"API({scan_result_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - LOGGER.warning(message) raise ConnectionError(message) @@ -800,7 +804,9 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_map: dict = {} if datasets is None or len(datasets) == 0: - LOGGER.warning(f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets') + LOGGER.warning( + f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets' + ) LOGGER.info("Returning empty datasets") return dataset_map @@ -820,12 +826,15 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_instance.datasource and dataset_instance.datasource.metadata.is_relational is True ): - LOGGER.info(f"Processing tables attribute for dataset {dataset_instance.name}({dataset_instance.id})") - + LOGGER.info( + f"Processing tables attribute for dataset {dataset_instance.name}({dataset_instance.id})" + ) for table in dataset_dict["tables"]: if "Value.NativeQuery(" in table["source"][0]["expression"]: - LOGGER.warning(f'Table {table["name"]} is created from Custom SQL. Ignoring in processing') + LOGGER.warning( + f'Table {table["name"]} is created from Custom SQL. 
Ignoring in processing' + ) continue @@ -946,16 +955,23 @@ def __to_datahub_dataset( dataset.datasource is None or dataset.datasource.metadata.is_relational is False ): - LOGGER.warning(f"Dataset {dataset.name}({dataset.id}) is not created from relational datasource") + LOGGER.warning( + f"Dataset {dataset.name}({dataset.id}) is not created from relational datasource" + ) return dataset_mcps - LOGGER.info(f"Converting dataset={dataset.name}(id={dataset.id}) to datahub dataset") - + LOGGER.info( + f"Converting dataset={dataset.name}(id={dataset.id}) to datahub dataset" + ) for table in dataset.tables: # Create an URN for dataset - ds_urn = builder.make_dataset_urn(platform=self.__config.dataset_type_mapping[dataset.datasource.type], name=f"{dataset.datasource.database}.{table.schema_name}.{table.name}", env=self.__config.env) + ds_urn = builder.make_dataset_urn( + platform=self.__config.dataset_type_mapping[dataset.datasource.type], + name=f"{dataset.datasource.database}.{table.schema_name}.{table.name}", + env=self.__config.env, + ) LOGGER.info(f"{Constant.Dataset_URN}={ds_urn}") # Create datasetProperties mcp @@ -1165,8 +1181,9 @@ def to_datahub_user( Map PowerBi user to datahub user """ - LOGGER.info(f"Converting user {user.displayName}(id={user.id}) to datahub's user") - + LOGGER.info( + f"Converting user {user.displayName}(id={user.id}) to datahub's user" + ) # Create an URN for user user_urn = builder.make_user_urn(user.get_urn_part()) @@ -1248,8 +1265,9 @@ def to_datahub_work_units( ) -> Set[EquableMetadataWorkUnit]: mcps = [] - LOGGER.info(f"Converting dashboard={dashboard.displayName} to datahub dashboard") - + LOGGER.info( + f"Converting dashboard={dashboard.displayName} to datahub dashboard" + ) # Convert user to CorpUser user_mcps = self.to_datahub_users(dashboard.users) diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index e969c2d3aeb3ea..ffc0253a070b2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -119,7 +119,9 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): if self._is_oauth_authentication_configured(): # Get OpenId configuration from issuer, e.g. 
token_endpoint - oid_config_url = f"{self.config.issuer_url}/.well-known/openid-configuration" + oid_config_url = ( + f"{self.config.issuer_url}/.well-known/openid-configuration" + ) oid_config_response = requests.get( oid_config_url, verify=False, allow_redirects=False ) @@ -127,8 +129,9 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): if oid_config_response: self.config.oid_config.update(oid_config_response.json()) else: - logger.error(f"Unexpected response while getting discovery document using {oid_config_url} : {oid_config_response}") - + logger.error( + f"Unexpected response while getting discovery document using {oid_config_url} : {oid_config_response}" + ) if "token_endpoint" not in self.config.oid_config: raise Exception( @@ -319,8 +322,11 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: ] # Report the Pulsar broker version we are communicating with - self.report.report_pulsar_version(self.session.get(f"{self.base_url}/brokers/version", timeout=self.config.timeout).text) - + self.report.report_pulsar_version( + self.session.get( + f"{self.base_url}/brokers/version", timeout=self.config.timeout + ).text + ) # If no tenants are provided, request all tenants from cluster using /admin/v2/tenants endpoint. # Requesting cluster tenant information requires superuser privileges @@ -337,8 +343,10 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: if self.config.tenant_patterns.allowed(tenant): # Get namespaces belonging to a tenant, /admin/v2/%s/namespaces # A tenant admin role has sufficient privileges to perform this action - namespaces = self._get_pulsar_metadata(f"{self.base_url}/namespaces/{tenant}") or [] - + namespaces = ( + self._get_pulsar_metadata(f"{self.base_url}/namespaces/{tenant}") + or [] + ) for namespace in namespaces: self.report.namespaces_scanned += 1 @@ -408,8 +416,10 @@ def _get_schema_and_fields( pulsar_schema: Optional[PulsarSchema] = None - schema_url = self.base_url + f"/schemas/{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}/schema" - + schema_url = ( + self.base_url + + f"/schemas/{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}/schema" + ) schema_payload = self._get_pulsar_metadata(schema_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 7aceafd22bd5aa..aa1d093e02e85f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -204,19 +204,29 @@ def get_full_qualified_name(self, database_name: str, table_name: str) -> str: def get_full_qualified_name(platform: str, database_name: str, table_name: str) -> str: if platform == "athena": - return AthenaQualifiedNameParser().get_full_qualified_name(database_name, table_name) + return AthenaQualifiedNameParser().get_full_qualified_name( + database_name, table_name + ) elif platform == "bigquery": - return BigqueryQualifiedNameParser().get_full_qualified_name(database_name, table_name) + return BigqueryQualifiedNameParser().get_full_qualified_name( + database_name, table_name + ) elif platform == "mssql": - return MssqlQualifiedNameParser().get_full_qualified_name(database_name, table_name) + return MssqlQualifiedNameParser().get_full_qualified_name( + database_name, table_name + ) elif platform == "mysql": - return MysqlQualifiedNameParser().get_full_qualified_name(database_name, table_name) + return MysqlQualifiedNameParser().get_full_qualified_name( + database_name, 
table_name + ) elif platform == "postgres": - return PostgresQualifiedNameParser().get_full_qualified_name(database_name, table_name) + return PostgresQualifiedNameParser().get_full_qualified_name( + database_name, table_name + ) else: return f"{database_name}.{table_name}" @@ -589,7 +599,9 @@ def _process_dashboard_response( # the API is id based not slug based # Tested the same with a Redash instance dashboard_id = dashboard_response["id"] - dashboard_data = self.client._get(f"api/dashboards/{dashboard_id}").json() + dashboard_data = self.client._get( + f"api/dashboards/{dashboard_id}" + ).json() except Exception: # This does not work in our testing but keeping for now because # people in community are using Redash connector successfully diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 2a6696f8823683..5d33c593c85c67 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -401,8 +401,11 @@ def _create_upstream_table_lineage( table_name = table.get("name") or table["id"] table_path = f"{project.replace('/', REPLACE_SLASH_CHAR)}/{datasource['name']}/{table_name}" - self.upstream_tables[table_urn] = table.get("columns", []), table_path, table.get("isEmbedded") or False - + self.upstream_tables[table_urn] = ( + table.get("columns", []), + table_path, + table.get("isEmbedded") or False, + ) return upstream_tables @@ -626,7 +629,18 @@ def _get_schema_metadata_for_datasource( ) fields.append(schema_field) - return SchemaMetadata(schemaName="test", platform=f"urn:li:dataPlatform:{self.platform}", version=0, fields=fields, hash="", platformSchema=OtherSchema(rawSchema=""),) if fields else None + return ( + SchemaMetadata( + schemaName="test", + platform=f"urn:li:dataPlatform:{self.platform}", + version=0, + fields=fields, + hash="", + platformSchema=OtherSchema(rawSchema=""), + ) + if fields + else None + ) def get_metadata_change_event( self, snap_shot: Union["DatasetSnapshot", "DashboardSnapshot", "ChartSnapshot"] diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 644be7afbe749a..984b8fa917d647 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -213,10 +213,7 @@ def authenticator_type_is_valid(cls, v, values, field): f"but should be set when using {v} authentication" ) if values.get("oauth_config").use_certificate is True: - if ( - values.get("oauth_config").base64_encoded_oauth_private_key - is None - ): + if values.get("oauth_config").base64_encoded_oauth_private_key is None: raise ValueError( "'base64_encoded_oauth_private_key' was none " "but should be set when using certificate for oauth_config" diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py index c6f641c8fcd6e5..82cfecbddfed39 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/base_transformer.py @@ -132,7 +132,9 @@ def _should_process( return True # fall through, no entity type matched return False - elif isinstance(record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass)): + elif isinstance( + record, 
(MetadataChangeProposalWrapper, MetadataChangeProposalClass) + ): return record.entityType in entity_types # default to process everything that is not caught by above checks diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index 572ecdf36302d5..5dca3541493156 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -625,7 +625,10 @@ def get_dataset_partitions(self, batch_identifier, data_asset): query = data_asset.batches[ batch_identifier ].batch_request.runtime_parameters["query"] - partitionSpec = PartitionSpecClass(type=PartitionTypeClass.QUERY, partition=f"Query_{builder.datahub_guid(query)}") + partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, + partition=f"Query_{builder.datahub_guid(query)}", + ) batchSpec = BatchSpec( nativeBatchId=batch_identifier, diff --git a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py index c83ec153144f0f..6e8d8da5f3fb82 100644 --- a/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py +++ b/metadata-ingestion/src/datahub/utilities/hive_schema_to_avro.py @@ -52,10 +52,12 @@ def _parse_datatype_string( raise ValueError("'>' should be the last char, but got: %s" % s) parts = HiveColumnToAvroConverter._ignore_brackets_split(s[4:-1], ",") if len(parts) != 2: - raise ValueError(( - "The map type string format is: 'map', " - + f"but got: {s}" - )) + raise ValueError( + ( + "The map type string format is: 'map', " + + f"but got: {s}" + ) + ) kt = HiveColumnToAvroConverter._parse_datatype_string(parts[0]) vt = HiveColumnToAvroConverter._parse_datatype_string(parts[1]) @@ -103,7 +105,12 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: for part in parts: name_and_type = HiveColumnToAvroConverter._ignore_brackets_split(part, ":") if len(name_and_type) != 2: - raise ValueError(("The struct field string format is: 'field_name:field_type', " + f"but got: {part}")) + raise ValueError( + ( + "The struct field string format is: 'field_name:field_type', " + + f"but got: {part}" + ) + ) field_name = name_and_type[0].strip() if field_name.startswith("`"): @@ -120,7 +127,12 @@ def _parse_struct_fields_string(s: str, **kwargs: Any) -> Dict[str, object]: else: struct_name = f'__struct_{str(uuid.uuid4()).replace("-", "")}' - return {"type": "record", "name": struct_name, "fields": fields, "native_data_type": f"struct<{s}>"} + return { + "type": "record", + "name": struct_name, + "fields": fields, + "native_data_type": f"struct<{s}>", + } @staticmethod def _parse_basic_datatype_string(s: str) -> Dict[str, object]: diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index af4916a4055747..7b0d6d1dd92229 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -172,7 +172,7 @@ def get_operation_value( def sanitize_owner_ids(self, owner_id: str) -> str: if owner_id.__contains__("@"): - owner_id = owner_id[:owner_id.index("@")] + owner_id = owner_id[: owner_id.index("@")] return owner_id def is_match(self, match_clause: Any, raw_props_value: Any) -> bool: diff --git a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py 
b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py index 63b3edaf8c0556..6fe57b297d4528 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py +++ b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py @@ -134,7 +134,9 @@ def get_columns(self) -> List[str]: # Reverting back all the previously renamed words which confuses the parser result = {"date" if c == self._DATE_SWAP_TOKEN else c for c in result} - result = {"timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result)} + result = { + "timestamp" if c == self._TIMESTAMP_SWAP_TOKEN else c for c in list(result) + } # swap back renamed date column return list(result) diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 7ae6d37472621a..ca664a7848b850 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -151,4 +151,12 @@ def __hash__(self) -> int: return hash((self._domain, self._entity_type) + tuple(self._entity_id)) def __eq__(self, other: object) -> bool: - return (self._entity_id == other._entity_id and self._domain == other._domain and self._entity_type == other._entity_type) if isinstance(other, Urn) else False + return ( + ( + self._entity_id == other._entity_id + and self._domain == other._domain + and self._entity_type == other._entity_type + ) + if isinstance(other, Urn) + else False + ) From 95d47dbce737cb9bab753c10fcacf7105178909b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 9 Jun 2022 14:39:42 +1000 Subject: [PATCH 70/88] running isort --- metadata-ingestion/src/datahub/ingestion/source/feast.py | 1 + metadata-ingestion/src/datahub/ingestion/source/metabase.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index 0c67e2d036083f..7d4f5360ad4202 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -20,6 +20,7 @@ ValueType, ) from feast.data_source import DataSource, RequestDataSource + import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel from datahub.emitter.mce_builder import DEFAULT_ENV diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index bd930f90ff3aff..98bcbcba591ebd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from functools import lru_cache from typing import Dict, Iterable, Optional @@ -42,7 +42,6 @@ OwnershipTypeClass, ) from datahub.utilities import config_clean -from datetime import timezone class MetabaseConfig(DatasetLineageProviderConfigBase): From 4d708b585f686d8930589c2bd8aa4b2e04ffacdc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 9 Jun 2022 14:41:53 +1000 Subject: [PATCH 71/88] lint fix --- .../library/dataset_add_column_tag.py | 4 +- .../library/dataset_add_column_term.py | 6 +-- .../lineage_job_dataflow_new_api_simple.py | 37 +++++++++++++++---- .../transforms/custom_transform_example.py | 4 +- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/examples/library/dataset_add_column_tag.py 
b/metadata-ingestion/examples/library/dataset_add_column_tag.py index a457d12f493ae0..8a15d33ff78779 100644 --- a/metadata-ingestion/examples/library/dataset_add_column_tag.py +++ b/metadata-ingestion/examples/library/dataset_add_column_tag.py @@ -28,9 +28,7 @@ def get_simple_field_path_from_v2_field_path(field_path: str) -> str: return field_path # this is a v2 field path tokens = [ - t - for t in field_path.split(".") - if not (t.startswith("[") or t.endswith("]")) + t for t in field_path.split(".") if not (t.startswith("[") or t.endswith("]")) ] return ".".join(tokens) diff --git a/metadata-ingestion/examples/library/dataset_add_column_term.py b/metadata-ingestion/examples/library/dataset_add_column_term.py index ea5cd1f632f743..d656b5bd4502e7 100644 --- a/metadata-ingestion/examples/library/dataset_add_column_term.py +++ b/metadata-ingestion/examples/library/dataset_add_column_term.py @@ -28,11 +28,9 @@ def get_simple_field_path_from_v2_field_path(field_path: str) -> str: return field_path # this is a v2 field path tokens = [ - t - for t in field_path.split(".") - if not (t.startswith("[") or t.endswith("]")) + t for t in field_path.split(".") if not (t.startswith("[") or t.endswith("]")) ] - + return ".".join(tokens) diff --git a/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py b/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py index d339d35110db1d..1871a8af09e50c 100644 --- a/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py +++ b/metadata-ingestion/examples/library/lineage_job_dataflow_new_api_simple.py @@ -1,5 +1,5 @@ import uuid -from datetime import datetime +from datetime import datetime, timezone from datahub.api.entities.datajob import DataFlow, DataJob from datahub.api.entities.dataprocess.dataprocess_instance import ( @@ -8,7 +8,6 @@ ) from datahub.emitter.rest_emitter import DatahubRestEmitter -from datetime import timezone emitter = DatahubRestEmitter("http://localhost:8080") jobFlow = DataFlow(cluster="prod", orchestrator="airflow", id="flow_api_simple") @@ -37,7 +36,9 @@ jobFlowRun = DataProcessInstance.from_dataflow( dataflow=jobFlow, id=f"{jobFlow.id}-{uuid.uuid4()}" ) -jobFlowRun.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) +jobFlowRun.emit_process_start( + emitter, int(datetime.now(timezone.utc).timestamp() * 1000) +) jobRun = DataProcessInstance.from_datajob( @@ -45,7 +46,11 @@ ) jobRun.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) -jobRun.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) +jobRun.emit_process_end( + emitter, + int(datetime.now(timezone.utc).timestamp() * 1000), + result=InstanceRunResult.SUCCESS, +) job2Run = DataProcessInstance.from_datajob( @@ -53,7 +58,11 @@ ) job2Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) -job2Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) +job2Run.emit_process_end( + emitter, + int(datetime.now(timezone.utc).timestamp() * 1000), + result=InstanceRunResult.SUCCESS, +) job3Run = DataProcessInstance.from_datajob( @@ -61,7 +70,11 @@ ) job3Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) -job3Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) +job3Run.emit_process_end( + emitter, + 
int(datetime.now(timezone.utc).timestamp() * 1000), + result=InstanceRunResult.SUCCESS, +) job4Run = DataProcessInstance.from_datajob( @@ -69,7 +82,15 @@ ) job4Run.emit_process_start(emitter, int(datetime.now(timezone.utc).timestamp() * 1000)) -job4Run.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) +job4Run.emit_process_end( + emitter, + int(datetime.now(timezone.utc).timestamp() * 1000), + result=InstanceRunResult.SUCCESS, +) -jobFlowRun.emit_process_end(emitter, int(datetime.now(timezone.utc).timestamp() * 1000), result=InstanceRunResult.SUCCESS) +jobFlowRun.emit_process_end( + emitter, + int(datetime.now(timezone.utc).timestamp() * 1000), + result=InstanceRunResult.SUCCESS, +) diff --git a/metadata-ingestion/examples/transforms/custom_transform_example.py b/metadata-ingestion/examples/transforms/custom_transform_example.py index 4a3d16d4a4dd94..57560e75cf7e92 100644 --- a/metadata-ingestion/examples/transforms/custom_transform_example.py +++ b/metadata-ingestion/examples/transforms/custom_transform_example.py @@ -61,7 +61,9 @@ def transform_aspect( # type: ignore assert aspect is None or isinstance(aspect, OwnershipClass) if owners_to_add: - ownership = aspect or OwnershipClass(owners=[],) + ownership = aspect or OwnershipClass( + owners=[], + ) ownership.owners.extend(owners_to_add) From ee0bc05294d7d0e01e86e49126e83995a07e2b18 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 9 Jun 2022 15:32:58 +1000 Subject: [PATCH 72/88] Update timeline_cli.py --- metadata-ingestion/src/datahub/cli/timeline_cli.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/timeline_cli.py b/metadata-ingestion/src/datahub/cli/timeline_cli.py index 516ca7bd7fe7b3..579dff5425a112 100644 --- a/metadata-ingestion/src/datahub/cli/timeline_cli.py +++ b/metadata-ingestion/src/datahub/cli/timeline_cli.py @@ -38,9 +38,10 @@ def pretty_id(id: Optional[str]) -> str: # breakpoint() assert id is not None if id.startswith("urn:li:datasetField:") or id.startswith("urn:li:schemaField:"): - if schema_field_key := schema_field_urn_to_key( + schema_field_key = schema_field_urn_to_key( id.replace("urn:li:datasetField", "urn:li:schemaField") - ): + ) + if schema_field_key: assert schema_field_key is not None field_path = schema_field_key.fieldPath @@ -49,7 +50,8 @@ def pretty_id(id: Optional[str]) -> str: return f"{colored('field','cyan')}:{colored(pretty_field_path(id),'white')}" if id.startswith("urn:li:dataset"): - if dataset_key := dataset_urn_to_key(id): + dataset_key = dataset_urn_to_key(id) + if dataset_key: return f"{colored('dataset','cyan')}:{colored(dataset_key.platform,'white')}:{colored(dataset_key.name,'white')}" # failed to prettify, return original return id From 8010744cddbb9409dd3d0b3591e4f80d3c1d3329 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 9 Jun 2022 16:32:30 +1000 Subject: [PATCH 73/88] logger warn to logger warning --- metadata-ingestion/src/datahub/cli/delete_cli.py | 2 +- .../src/datahub/ingestion/source/aws/glue.py | 6 +++--- metadata-ingestion/src/datahub/ingestion/source/looker.py | 4 ++-- .../src/datahub/ingestion/source/looker_common.py | 2 +- metadata-ingestion/src/datahub/ingestion/source/nifi.py | 8 ++++---- .../src/datahub/ingestion/source/s3/source.py | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index 
c4ff2b5e6f9361..a11fa3e6703520 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -182,7 +182,7 @@ def delete( else: # log warn include_removed + hard is the only way to work if include_removed and soft: - logger.warn( + logger.warning( "A filtered delete including soft deleted entities is redundant, because it is a soft delete by default. Please use --include-removed in conjunction with --hard" ) # Filter based delete diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index a7171cfe42caf0..6594da6ddf2063 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -1059,7 +1059,7 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: ] ) except self.s3_client.exceptions.ClientError: - logger.warn(f"No tags found for bucket={bucket_name}") + logger.warning(f"No tags found for bucket={bucket_name}") if self.source_config.use_s3_object_tags: key_prefix = s3_util.get_key_prefix( table["StorageDescriptor"]["Location"] @@ -1078,7 +1078,7 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: else: # Unlike bucket tags, if an object does not have tags, it will just return an empty array # as opposed to an exception. - logger.warn( + logger.warning( f"No tags found for bucket={bucket_name} key={key_prefix}" ) if len(tags_to_add) == 0: @@ -1097,7 +1097,7 @@ def get_s3_tags() -> Optional[GlobalTagsClass]: [current_tag.tag for current_tag in current_tags.tags] ) else: - logger.warn( + logger.warning( "Could not connect to DatahubApi. No current tags to maintain" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker.py b/metadata-ingestion/src/datahub/ingestion/source/looker.py index 2afa4ca05cbb36..a469e16ab34c47 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker.py @@ -283,8 +283,8 @@ def get_by_id( self.user_map[id] = looker_user return looker_user except SDKError as e: - logger.warn(f"Could not find user with id {id}") - logger.warn(f"Failure was {e}") + logger.warning(f"Could not find user with id {id}") + logger.warning(f"Failure was {e}") return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py index 668d3d3e2d898f..b2348fee9ace40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py @@ -623,7 +623,7 @@ def from_api( # noqa: C901 source_file=explore.source_file, ) except SDKError as e: - logger.warn(f"Failed to extract explore {explore_name} from model {model}.") + logger.warning(f"Failed to extract explore {explore_name} from model {model}.") logger.debug( f"Failed to extract explore {explore_name} from model {model} with {e}" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index e9bc5f0b5daba0..9677d5cbd3b5cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -620,7 +620,7 @@ def create_nifi_flow(self): if about_response.ok: nifi_version = about_response.json().get("about", {}).get("version") else: - logger.warn("Failed to fetch version for nifi") + logger.warning("Failed to fetch version for nifi") cluster_response = 
self.session.get( url=urljoin(self.config.site_url, CLUSTER_ENDPOINT) ) @@ -630,7 +630,7 @@ def create_nifi_flow(self): cluster_response.json().get("clusterSummary", {}).get("clustered") ) else: - logger.warn("Failed to fetch cluster summary for flow") + logger.warning("Failed to fetch cluster summary for flow") pg_response = self.session.get( url=urljoin(self.config.site_url, PG_ENDPOINT) + "root" ) @@ -715,7 +715,7 @@ def fetch_provenance_events( attempts = 5 # wait for at most 5 attempts 5*1= 5 seconds while (not provenance.get("finished", False)) and attempts > 0: - logger.warn( + logger.warning( f"Provenance query not completed, attempts left : {attempts}" ) # wait until the uri returns percentcomplete 100 @@ -757,7 +757,7 @@ def fetch_provenance_events( f"provenance events could not be fetched for processor \ {processor.id} of type {processor.name}", ) - logger.warn(provenance_response.text) + logger.warning(provenance_response.text) return def report_warning(self, key: str, reason: str) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 29f54dc9449e3e..d3943745cbc81f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -689,7 +689,7 @@ def get_s3_tags( ] ) except s3.meta.client.exceptions.ClientError: - logger.warn(f"No tags found for bucket={bucket_name}") + logger.warning(f"No tags found for bucket={bucket_name}") if self.source_config.use_s3_object_tags and key_name is not None: s3_client = self.source_config.aws_config.get_s3_client() @@ -707,7 +707,7 @@ def get_s3_tags( else: # Unlike bucket tags, if an object does not have tags, it will just return an empty array # as opposed to an exception. - logger.warn(f"No tags found for bucket={bucket_name} key={key_name}") + logger.warning(f"No tags found for bucket={bucket_name} key={key_name}") if len(tags_to_add) == 0: return None if self.ctx.graph is not None: @@ -722,7 +722,7 @@ def get_s3_tags( [current_tag.tag for current_tag in current_tags.tags] ) else: - logger.warn("Could not connect to DatahubApi. No current tags to maintain") + logger.warning("Could not connect to DatahubApi. 
No current tags to maintain") # Remove duplicate tags tags_to_add = list(set(tags_to_add)) new_tags = GlobalTagsClass( From b168138f30a8e8755dd4e3e1200a17700e348fc8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 11 Jun 2022 17:00:40 +1000 Subject: [PATCH 74/88] Update business_glossary.yml --- .../examples/bootstrap_data/business_glossary.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml index 71fd59bbccc462..6669d393b7211d 100644 --- a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml +++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml @@ -40,7 +40,7 @@ nodes: inherits: - Classification.Sensitive - name: ClientsAndAccounts - description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparts identities + description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities owners: groups: - finance From 980826cd1c3dbff831794d527becb1027567c58d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 11 Jun 2022 20:37:33 +1000 Subject: [PATCH 75/88] Update ldap.py --- metadata-ingestion/src/datahub/ingestion/source/ldap.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 633651c7b171a6..f3b40610d0210a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -153,7 +153,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: _rtype, rdata, _rmsgid, serverctrls = self.ldap_client.result3(msgid) except ldap.LDAPError as e: self.report.report_failure( - "ldap-control", "LDAP search failed: {}".format(e) + "ldap-control", f"LDAP search failed: {e}" ) break @@ -211,9 +211,8 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn manager_ldap = guess_person_ldap(m_attrs) except ldap.LDAPError as e: self.report.report_warning( - dn, "manager LDAP search failed: {}".format(e) + dn, f"manager LDAP search failed: {e}" ) - mce = self.build_corp_user_mce(dn, attrs, manager_ldap) if mce: wu = MetadataWorkUnit(dn, mce) From 1e60f1b3b5f95c301587d4a8bd865b88d580553f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 14 Jun 2022 13:18:07 +1000 Subject: [PATCH 76/88] lint --- metadata-ingestion/src/datahub/ingestion/source/ldap.py | 8 ++------ .../src/datahub/ingestion/source/looker_common.py | 4 +++- .../src/datahub/ingestion/source/s3/source.py | 4 +++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index f3b40610d0210a..45d1c26e21495f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -152,9 +152,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: ) _rtype, rdata, _rmsgid, serverctrls = self.ldap_client.result3(msgid) except ldap.LDAPError as e: - self.report.report_failure( - "ldap-control", f"LDAP search failed: {e}" - ) + 
self.report.report_failure("ldap-control", f"LDAP search failed: {e}") break for dn, attrs in rdata: @@ -210,9 +208,7 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn _m_dn, m_attrs = self.ldap_client.result3(manager_msgid)[1][0] manager_ldap = guess_person_ldap(m_attrs) except ldap.LDAPError as e: - self.report.report_warning( - dn, f"manager LDAP search failed: {e}" - ) + self.report.report_warning(dn, f"manager LDAP search failed: {e}") mce = self.build_corp_user_mce(dn, attrs, manager_ldap) if mce: wu = MetadataWorkUnit(dn, mce) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py index b2348fee9ace40..28f2b23c1a258a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker_common.py @@ -623,7 +623,9 @@ def from_api( # noqa: C901 source_file=explore.source_file, ) except SDKError as e: - logger.warning(f"Failed to extract explore {explore_name} from model {model}.") + logger.warning( + f"Failed to extract explore {explore_name} from model {model}." + ) logger.debug( f"Failed to extract explore {explore_name} from model {model} with {e}" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index d3943745cbc81f..0856c58e69ed5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -722,7 +722,9 @@ def get_s3_tags( [current_tag.tag for current_tag in current_tags.tags] ) else: - logger.warning("Could not connect to DatahubApi. No current tags to maintain") + logger.warning( + "Could not connect to DatahubApi. 
No current tags to maintain" + ) # Remove duplicate tags tags_to_add = list(set(tags_to_add)) new_tags = GlobalTagsClass( From 7c5e98da8706aceb940be1dca996cebd75b5d6f0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 20 Jun 2022 17:56:38 +1000 Subject: [PATCH 77/88] Update cli_utils.py --- metadata-ingestion/src/datahub/cli/cli_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index d234beb2dafbfe..ee7478e075d848 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -579,7 +579,7 @@ def post_entity( ) response = session.post(url, payload) response.raise_for_status() - return response.status_code + return int(response.status_code) type_class_to_name_map = { From b59d2edc38b3999018be4d9e1ed7dbf9ef21157b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 20 Jun 2022 21:56:36 +1000 Subject: [PATCH 78/88] Update cli_utils.py --- metadata-ingestion/src/datahub/cli/cli_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index ee7478e075d848..b3c20dad16de22 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -554,7 +554,7 @@ def post_entity( aspect_name: str, aspect_value: Dict, cached_session_host: Optional[Tuple[Session, str]] = None, -) -> Dict: +) -> int: session, gms_host = cached_session_host or get_session_and_host() endpoint: str = "/aspects/?action=ingestProposal" @@ -579,7 +579,7 @@ def post_entity( ) response = session.post(url, payload) response.raise_for_status() - return int(response.status_code) + return (response.status_code) type_class_to_name_map = { From 4820f6239d53ef262b208bd4846ece0fd1c8b18d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 20 Jun 2022 21:56:45 +1000 Subject: [PATCH 79/88] Update cli_utils.py --- metadata-ingestion/src/datahub/cli/cli_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index b3c20dad16de22..c7fab41b4d8acc 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -579,7 +579,7 @@ def post_entity( ) response = session.post(url, payload) response.raise_for_status() - return (response.status_code) + return response.status_code type_class_to_name_map = { From 978111f172d6bafc8d9f4c552402b981c47ca92a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Jun 2022 21:47:29 +1000 Subject: [PATCH 80/88] updates as per request --- metadata-ingestion/src/datahub/configuration/common.py | 8 ++++---- metadata-ingestion/src/datahub/ingestion/source/looker.py | 3 +++ metadata-ingestion/src/datahub/ingestion/source/lookml.py | 5 ++++- .../src/datahub/ingestion/source_config/pulsar.py | 2 +- metadata-ingestion/src/datahub/utilities/urns/urn.py | 2 +- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 80f8d717daf7a5..86b366a008962b 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -101,7 +101,7 @@ class OauthConfiguration(ConfigModel): class AllowDenyPattern(ConfigModel): - """A class to 
store allow deny regex's""" + """A class to store allow deny regexs""" allow: List[str] = Field( default=[".*"], @@ -143,7 +143,7 @@ def allowed(self, string: str) -> bool: def is_fully_specified_allow_list(self) -> bool: """ - If the allow patterns are literals and not full regex's, then it is considered + If the allow patterns are literals and not full regexs, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. @@ -159,7 +159,7 @@ def get_allowed_list(self) -> List[str]: class KeyValuePattern(ConfigModel): - """A class to store allow deny regex's""" + """A class to store allow deny regexs""" rules: Dict[str, List[str]] = {".*": []} alphabet: str = "[A-Za-z0-9 _.-]" @@ -182,7 +182,7 @@ def matched(self, string: str) -> bool: def is_fully_specified_key(self) -> bool: """ - If the allow patterns are literals and not full regex's, then it is considered + If the allow patterns are literals and not full regexs, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker.py b/metadata-ingestion/src/datahub/ingestion/source/looker.py index a469e16ab34c47..7c1368f814e4e0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker.py @@ -448,6 +448,7 @@ def _get_looker_dashboard_element( # noqa: C901 if element.query is not None: fields = self._get_fields_from_query(element.query) + # Get the explore from the view directly explores = [element.query.view] if element.query.view is not None else [] logger.debug( "Element {}: Explores added: {}".format(element.title, explores) @@ -468,6 +469,7 @@ def _get_looker_dashboard_element( # noqa: C901 upstream_fields=fields, ) + # Dashboard elements can *alternatively* link to an existing look elif element.look is not None: # we pick from element title by default, falling back to look title. title: str = ( @@ -505,6 +507,7 @@ def _get_looker_dashboard_element( # noqa: C901 upstream_fields=fields, ) + # Failing the above two approaches, pick out details from result_maker elif element.result_maker is not None: model: str = "" fields = [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py index 203bf199c27ad5..45203f9b3891aa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py +++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py @@ -650,7 +650,10 @@ def from_looker_dict( ) # If not a derived table, then this view essentially wraps an existing - # object in the database. + # object in the database. If sql_table_name is set, there is a single + # dependency in the view, on the sql_table_name. 
+ # Otherwise, default to the view name as per the docs: + # https://docs.looker.com/reference/view-params/sql_table_name-for-view sql_table_names = [view_name] if sql_table_name is None else [sql_table_name] output_looker_view = LookerView( id=LookerViewId( diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py index 836960ac50633f..e1bdf072787cd5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py @@ -30,7 +30,7 @@ def _is_valid_hostname(hostname: str) -> bool: """ if len(hostname) > 253: return False - # Hostname's ending on a dot are valid, if present strip exactly one + # Hostnames ending on a dot are valid, if present strip exactly one if hostname[-1] == ".": hostname = hostname[:-1] allowed = re.compile(r"(?!-)[A-Z\d-]{1,63}(? List[str]: part_start = i + 1 if start_paren_count != 0: - raise InvalidUrnError(f"{entity_id}, mismatched parent nesting") + raise InvalidUrnError(f"{entity_id}, mismatched paren nesting") parts.append(entity_id[part_start:-1]) From 391fa2834a4d2367a2c38d8d2530566977db8c14 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Jun 2022 21:55:24 +1000 Subject: [PATCH 81/88] Update kafka.py --- metadata-ingestion/src/datahub/configuration/kafka.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/configuration/kafka.py b/metadata-ingestion/src/datahub/configuration/kafka.py index e752285cdde2af..197322a2e566e1 100644 --- a/metadata-ingestion/src/datahub/configuration/kafka.py +++ b/metadata-ingestion/src/datahub/configuration/kafka.py @@ -27,7 +27,7 @@ def bootstrap_host_colon_port_comma(cls, val: str) -> str: else: host = entry assert re.match( - # TODO: This regex is quite loose. Many invalid hostname's or IPs will slip through, + # This regex is quite loose. Many invalid hostname's or IPs will slip through, # but it serves as a good first line of validation. We defer to Kafka for the # remaining validation. 
r"^[\w\-\.\:]+$", From e29961885df610aaaa1272719f9aa52930bb7d14 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Jun 2022 22:01:43 +1000 Subject: [PATCH 82/88] Update powerbi.py --- metadata-ingestion/src/datahub/ingestion/source/powerbi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py index a4182ab6b6824f..ff0924349c0e91 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi.py @@ -695,7 +695,7 @@ def get_workspace(self, workspace_id: str) -> Workspace: POWERBI_ADMIN_BASE_URL=self.__config.admin_base_url ) - def create_scan_job(): # sourcery skip: avoid-builtin-shadow + def create_scan_job(): """ Create scan job on PowerBi for the workspace """ From 76150f1ca241736a1e8c816342bdc99d4040f141 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Jun 2022 08:45:17 +1000 Subject: [PATCH 83/88] lint --- metadata-ingestion/scripts/docgen.py | 23 ++++++++++++------- .../src/datahub/ingestion/source/lookml.py | 2 +- .../src/datahub/utilities/mapping.py | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py index 03434d291d3024..fa59cd20dda1c0 100644 --- a/metadata-ingestion/scripts/docgen.py +++ b/metadata-ingestion/scripts/docgen.py @@ -605,9 +605,10 @@ def generate( os.makedirs(config_dir, exist_ok=True) with open(f"{config_dir}/{plugin_name}_config.json", "w") as f: f.write(source_config_class.schema_json(indent=2)) - - create_or_update(source_documentation, - [platform_id, "plugins", plugin_name, "config_schema"], + + create_or_update( + source_documentation, + [platform_id, "plugins", plugin_name, "config_schema"], source_config_class.schema_json(indent=2) or "", ) @@ -649,7 +650,9 @@ def generate( with open(platform_doc_file, "w") as f: if "name" in platform_docs: - f.write(f"import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n") + f.write( + f"import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n" + ) f.write(f"# {platform_docs['name']}\n") if len(platform_docs["plugins"].keys()) > 1: # More than one plugin used to provide integration with this platform @@ -722,8 +725,10 @@ def generate( f.write("\n```\n") if "config" in plugin_docs: f.write("\n### Config Details\n") - f.write(""" - \n\n""") + f.write( + """ + \n\n""" + ) f.write( "Note that a `.` is used to denote nested fields in the YAML recipe.\n\n" ) @@ -733,7 +738,8 @@ def generate( for doc in plugin_docs["config"]: f.write(doc) f.write("\n\n\n") - f.write(f""" + f.write( + f""" The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n @@ -741,7 +747,8 @@ def generate( {plugin_docs['config_schema']} ```\n\n -\n\n""") +\n\n""" + ) # insert custom plugin docs after config details f.write(plugin_docs.get("custom_docs", "")) if "classname" in plugin_docs: diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py index 45203f9b3891aa..aee876da2f7bac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py +++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py @@ -650,7 +650,7 @@ def from_looker_dict( ) # If not a derived table, then this view essentially wraps an existing - # object in the database. 
If sql_table_name is set, there is a single + # object in the database. If sql_table_name is set, there is a single # dependency in the view, on the sql_table_name. # Otherwise, default to the view name as per the docs: # https://docs.looker.com/reference/view-params/sql_table_name-for-view diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 1af643c8491876..0debde162ecaf7 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -236,4 +236,4 @@ def get_match(self, match_clause: Any, raw_props_value: Any) -> Optional[Match]: elif type(raw_props_value) == str: return bool(re.match(match_clause, raw_props_value)) else: - return match_clause == raw_props_value \ No newline at end of file + return match_clause == raw_props_value From da217c6ebdb2c2b3b1aacb63d52fa7c79cd1918e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Jun 2022 08:46:48 +1000 Subject: [PATCH 84/88] Update common.py --- metadata-ingestion/src/datahub/configuration/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 86b366a008962b..be3c6a13d3599b 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -101,7 +101,7 @@ class OauthConfiguration(ConfigModel): class AllowDenyPattern(ConfigModel): - """A class to store allow deny regexs""" + """A class to store allow deny regexes""" allow: List[str] = Field( default=[".*"], @@ -143,7 +143,7 @@ def allowed(self, string: str) -> bool: def is_fully_specified_allow_list(self) -> bool: """ - If the allow patterns are literals and not full regexs, then it is considered + If the allow patterns are literals and not full regexes, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. @@ -159,7 +159,7 @@ def get_allowed_list(self) -> List[str]: class KeyValuePattern(ConfigModel): - """A class to store allow deny regexs""" + """A class to store allow deny regexes""" rules: Dict[str, List[str]] = {".*": []} alphabet: str = "[A-Za-z0-9 _.-]" @@ -182,7 +182,7 @@ def matched(self, string: str) -> bool: def is_fully_specified_key(self) -> bool: """ - If the allow patterns are literals and not full regexs, then it is considered + If the allow patterns are literals and not full regexes, then it is considered fully specified. This is useful if you want to convert a 'list + filter' pattern into a 'search for the ones that are allowed' pattern, which can be much more efficient in some cases. 
From 6ab23f24ed8fa9654805e1e663e08781d1fb6b59 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 28 Jun 2022 08:47:34 +1000 Subject: [PATCH 85/88] Update mapping.py --- metadata-ingestion/src/datahub/utilities/mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 0debde162ecaf7..4d0093f4b2ffcc 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -224,7 +224,7 @@ def _get_best_match(the_match: Match, group_name: str) -> str: def sanitize_owner_ids(self, owner_id: str) -> str: if owner_id.__contains__("@"): - owner_id = owner_id[: owner_id.index("@")] + owner_id = owner_id[0: owner_id.index("@")] return owner_id def get_match(self, match_clause: Any, raw_props_value: Any) -> Optional[Match]: From c7f8106f5e6635cd746b48f49d3b999100fc8b9e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 28 Jun 2022 08:51:57 +1000 Subject: [PATCH 86/88] Update mapping.py --- metadata-ingestion/src/datahub/utilities/mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 4d0093f4b2ffcc..30d044e25ba46e 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -224,7 +224,7 @@ def _get_best_match(the_match: Match, group_name: str) -> str: def sanitize_owner_ids(self, owner_id: str) -> str: if owner_id.__contains__("@"): - owner_id = owner_id[0: owner_id.index("@")] + owner_id = owner_id[0 : owner_id.index("@")] return owner_id def get_match(self, match_clause: Any, raw_props_value: Any) -> Optional[Match]: From 6fc07859e19527edf4fb0f726ba6635cc7f089e8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 30 Jun 2022 08:14:39 +1000 Subject: [PATCH 87/88] Update source.py --- .../src/datahub/ingestion/source/s3/source.py | 79 ------------------- 1 file changed, 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index c7feb6d19463fa..6e66dcc3d84167 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -593,85 +593,6 @@ def ingest_table( if self.source_config.profiling.enabled: yield from self.get_table_profile(table_data, dataset_urn) - def gen_bucket_key(self, name): - return S3BucketKey( - platform="s3", - instance=self.source_config.env - if self.source_config.platform_instance is None - else self.source_config.platform_instance, - bucket_name=name, - ) - - def get_s3_tags( - self, bucket_name: str, key_name: Optional[str], dataset_urn: str - ) -> Optional[GlobalTagsClass]: - if self.source_config.aws_config is None: - raise ValueError("aws_config not set. 
Cannot browse s3") - new_tags = GlobalTagsClass(tags=[]) - tags_to_add = [] - if self.source_config.use_s3_bucket_tags: - s3 = self.source_config.aws_config.get_s3_resource() - bucket = s3.Bucket(bucket_name) - try: - tags_to_add.extend( - [ - make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""") - for tag in bucket.Tagging().tag_set - ] - ) - except s3.meta.client.exceptions.ClientError: - logger.warning(f"No tags found for bucket={bucket_name}") - - if self.source_config.use_s3_object_tags and key_name is not None: - s3_client = self.source_config.aws_config.get_s3_client() - object_tagging = s3_client.get_object_tagging( - Bucket=bucket_name, Key=key_name - ) - tag_set = object_tagging["TagSet"] - if tag_set: - tags_to_add.extend( - [ - make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""") - for tag in tag_set - ] - ) - else: - # Unlike bucket tags, if an object does not have tags, it will just return an empty array - # as opposed to an exception. - logger.warning(f"No tags found for bucket={bucket_name} key={key_name}") - if len(tags_to_add) == 0: - return None - if self.ctx.graph is not None: - logger.debug("Connected to DatahubApi, grabbing current tags to maintain.") - current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect_v2( - entity_urn=dataset_urn, - aspect="globalTags", - aspect_type=GlobalTagsClass, - ) - if current_tags: - tags_to_add.extend( - [current_tag.tag for current_tag in current_tags.tags] - ) - else: - logger.warning( - "Could not connect to DatahubApi. No current tags to maintain" - ) - # Remove duplicate tags - tags_to_add = list(set(tags_to_add)) - new_tags = GlobalTagsClass( - tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add] - ) - return new_tags - - def gen_folder_key(self, abs_path): - return FolderKey( - platform=self.source_config.platform, - instance=self.source_config.env - if self.source_config.platform_instance is None - else self.source_config.platform_instance, - folder_abs_path=abs_path, - ) - def get_prefix(self, relative_path: str) -> str: index = re.search(r"[\*|\{]", relative_path) if index: From 2de8aee652194bc0845a48e51fae1aa9f9b7961a Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 6 Jul 2022 14:52:18 +0530 Subject: [PATCH 88/88] revert changes to functionality --- metadata-ingestion/src/datahub/utilities/mapping.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 30d044e25ba46e..6212f1b001e002 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -232,8 +232,8 @@ def get_match(self, match_clause: Any, raw_props_value: Any) -> Optional[Match]: if type(raw_props_value) not in Constants.OPERAND_DATATYPE_SUPPORTED or type( raw_props_value ) != type(match_clause): - return False + return None elif type(raw_props_value) == str: - return bool(re.match(match_clause, raw_props_value)) + return re.match(match_clause, raw_props_value) else: - return match_clause == raw_props_value + return re.match(str(match_clause), str(raw_props_value))