From f83d4b70f63065d7449dca513f8b7885ea1a7fbe Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 12 Jul 2024 15:08:51 -0700 Subject: [PATCH] feat(ingest): add snowflake-queries source (#10835) --- metadata-ingestion/setup.py | 2 + .../src/datahub/ingestion/api/source.py | 5 + .../source/bigquery_v2/bigquery_audit.py | 2 +- .../ingestion/source/fivetran/config.py | 6 +- .../ingestion/source/redshift/lineage_v2.py | 10 +- .../source/snowflake/snowflake_assertion.py | 35 +- .../source/snowflake/snowflake_config.py | 139 +++-- .../snowflake/snowflake_connection.py} | 120 ++-- .../source/snowflake/snowflake_data_reader.py | 4 +- .../source/snowflake/snowflake_lineage_v2.py | 96 ++-- .../source/snowflake/snowflake_profiler.py | 2 +- .../source/snowflake/snowflake_queries.py | 515 ++++++++++++++++++ .../source/snowflake/snowflake_query.py | 26 +- .../source/snowflake/snowflake_schema.py | 39 +- .../source/snowflake/snowflake_schema_gen.py | 108 ++-- .../source/snowflake/snowflake_summary.py | 99 +--- .../source/snowflake/snowflake_usage_v2.py | 50 +- .../source/snowflake/snowflake_utils.py | 293 +++++----- .../source/snowflake/snowflake_v2.py | 90 ++- .../ingestion/source/sql/sql_config.py | 52 +- .../ingestion/source_config/sql/__init__.py | 0 .../sql_parsing/sql_parsing_aggregator.py | 388 ++++++++++--- .../tests/integration/snowflake/common.py | 8 +- .../snowflake/snowflake_golden.json | 473 +++++++++++++++- .../snowflake_privatelink_golden.json | 124 ++++- .../snowflake/test_snowflake_failures.py | 14 +- .../test_add_known_query_lineage.json | 11 +- .../test_basic_lineage.json | 14 +- .../test_column_lineage_deduplication.json | 34 +- .../test_multistep_temp_table.json | 13 +- .../test_overlapping_inserts.json | 28 +- ..._overlapping_inserts_from_temp_tables.json | 31 +- .../aggregator_goldens/test_table_rename.json | 28 +- .../aggregator_goldens/test_temp_table.json | 22 +- .../aggregator_goldens/test_view_lineage.json | 14 +- 
.../state/test_redundant_run_skip_handler.py | 8 +- .../tests/unit/test_snowflake_source.py | 51 +- .../linkedin/query/QueryUsageStatistics.pdl | 43 ++ .../src/main/resources/entity-registry.yml | 1 + 39 files changed, 2284 insertions(+), 714 deletions(-) rename metadata-ingestion/src/datahub/ingestion/{source_config/sql/snowflake.py => source/snowflake/snowflake_connection.py} (81%) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py delete mode 100644 metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py create mode 100644 metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 5ff62dd02fbc3f..b8db746a63fdba 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -410,6 +410,7 @@ "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "snowflake-summary": snowflake_common | usage_common | sqlglot_lib, + "snowflake-queries": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, "sql-queries": usage_common | sqlglot_lib, "slack": slack, @@ -662,6 +663,7 @@ "slack = datahub.ingestion.source.slack.slack:SlackSource", "snowflake = datahub.ingestion.source.snowflake.snowflake_v2:SnowflakeV2Source", "snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource", + "snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource", "superset = datahub.ingestion.source.superset:SupersetSource", "tableau = datahub.ingestion.source.tableau:TableauSource", "openapi = datahub.ingestion.source.openapi:OpenApiSource", diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index d78500b4401e5f..ad1b312ef445c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ 
b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -45,6 +45,8 @@ logger = logging.getLogger(__name__) +_MAX_CONTEXT_STRING_LENGTH = 300 + class SourceCapability(Enum): PLATFORM_INSTANCE = "Platform Instance" @@ -112,6 +114,9 @@ def report_log( log_key = f"{title}-{message}" entries = self._entries[level] + if context and len(context) > _MAX_CONTEXT_STRING_LENGTH: + context = f"{context[:_MAX_CONTEXT_STRING_LENGTH]} ..." + log_content = f"{message} => {context}" if context else message if exc: log_content += f"{log_content}: {exc}" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 0e7e98b0e5e8f0..f8fcea7c57545c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -192,7 +192,7 @@ def from_string_name(cls, ref: str) -> "BigQueryTableRef": def from_urn(cls, urn: str) -> "BigQueryTableRef": """Raises: ValueError if urn is not a valid BigQuery table URN.""" dataset_urn = DatasetUrn.create_from_string(urn) - split = dataset_urn.get_dataset_name().rsplit(".", 3) + split = dataset_urn.name.rsplit(".", 3) if len(split) == 3: project, dataset, table = split else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index f55d9f89ad97f1..f8b1c6dd93d6d9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -13,6 +13,9 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryConnectionConfig, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, 
StatefulStaleMetadataRemovalConfig, @@ -20,7 +23,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -66,7 +68,7 @@ class Constant: } -class SnowflakeDestinationConfig(BaseSnowflakeConfig): +class SnowflakeDestinationConfig(SnowflakeConnectionConfig): database: str = Field(description="The fivetran connector log database.") log_schema: str = Field(description="The fivetran connector log schema.") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 062a99de6b7358..526e5e2cf12d02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -105,7 +105,15 @@ def build( for schema, tables in schemas.items() for table in tables } - self.aggregator.is_temp_table = lambda urn: urn not in self.known_urns + self.aggregator._is_temp_table = ( + lambda name: DatasetUrn.create_from_ids( + self.platform, + name, + env=self.config.env, + platform_instance=self.config.platform_instance, + ).urn() + not in self.known_urns + ) # Handle all the temp tables up front. 
if self.config.resolve_temp_table_in_lineage: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py index a28a81cc5b955d..2a1d18c83e6fa8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -1,6 +1,6 @@ import logging from datetime import datetime -from typing import Callable, Iterable, List, Optional +from typing import Iterable, List, Optional from pydantic import BaseModel @@ -11,14 +11,14 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeIdentifierConfig, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, -) +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeIdentifierMixin from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionResult, AssertionResultType, @@ -40,30 +40,27 @@ class DataQualityMonitoringResult(BaseModel): VALUE: int -class SnowflakeAssertionsHandler( - SnowflakeCommonMixin, SnowflakeQueryMixin, SnowflakeConnectionMixin -): +class SnowflakeAssertionsHandler(SnowflakeIdentifierMixin): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], + connection: SnowflakeConnection, ) -> None: self.config = config 
self.report = report self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection = None + self.connection = connection self._urns_processed: List[str] = [] + @property + def identifier_config(self) -> SnowflakeIdentifierConfig: + return self.config + def get_assertion_workunits( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - cur = self.query( + cur = self.connection.query( SnowflakeQuery.dmf_assertion_results( datetime_to_ts_millis(self.config.start_time), datetime_to_ts_millis(self.config.end_time), @@ -110,7 +107,7 @@ def _process_result_row( aspect=AssertionRunEvent( timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME), runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"), - asserteeUrn=self.dataset_urn_builder(assertee), + asserteeUrn=self.gen_dataset_urn(assertee), status=AssertionRunStatus.COMPLETE, assertionUrn=make_assertion_urn(assertion_guid), result=AssertionResult( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 588187e8e11c28..f6247eb949417b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -4,24 +4,31 @@ from enum import Enum from typing import Dict, List, Optional, Set, cast +import pydantic from pydantic import Field, SecretStr, root_validator, validator from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX +from datahub.configuration.source_common import ( + EnvConfigMixin, + LowerCaseDatasetUrnConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_removal 
import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.glossary.classification_mixin import ( ClassificationSourceConfigMixin, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, StatefulProfilingConfigMixin, StatefulUsageConfigMixin, ) -from datahub.ingestion.source_config.sql.snowflake import ( - BaseSnowflakeConfig, - SnowflakeConfig, -) from datahub.ingestion.source_config.usage.snowflake_usage import SnowflakeUsageConfig from datahub.utilities.global_warning_util import add_global_warning @@ -32,11 +39,12 @@ # # DBT incremental models create temporary tables ending with __dbt_tmp # Ref - https://discourse.getdbt.com/t/handling-bigquery-incremental-dbt-tmp-tables/7540 -DEFAULT_TABLES_DENY_LIST = [ +DEFAULT_TEMP_TABLES_PATTERNS = [ r".*\.FIVETRAN_.*_STAGING\..*", # fivetran r".*__DBT_TMP$", # dbt rf".*\.SEGMENT_{UUID_REGEX}", # segment rf".*\.STAGING_.*_{UUID_REGEX}", # stitch + r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations ] @@ -73,6 +81,93 @@ def source_database(self) -> DatabaseId: return DatabaseId(self.database, self.platform_instance) +class SnowflakeFilterConfig(SQLFilterConfig): + database_pattern: AllowDenyPattern = Field( + AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"], + ), + description="Regex patterns for databases to filter in ingestion.", + ) + + schema_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for schemas to filter in ingestion. 
Will match against the full `database.schema` name if `match_fully_qualified_names` is enabled.", + ) + # table_pattern and view_pattern are inherited from SQLFilterConfig + + match_fully_qualified_names: bool = Field( + default=False, + description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", + ) + + @root_validator(pre=False, skip_on_failure=True) + def validate_legacy_schema_pattern(cls, values: Dict) -> Dict: + schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern") + match_fully_qualified_names = values.get("match_fully_qualified_names") + + if ( + schema_pattern is not None + and schema_pattern != AllowDenyPattern.allow_all() + and match_fully_qualified_names is not None + and not match_fully_qualified_names + ): + logger.warning( + "Please update `schema_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." + "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " + "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + ) + + # Always exclude reporting metadata for INFORMATION_SCHEMA schema + if schema_pattern is not None and schema_pattern: + logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.") + cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$") + + return values + + +class SnowflakeIdentifierConfig( + PlatformInstanceConfigMixin, EnvConfigMixin, LowerCaseDatasetUrnConfigMixin +): + # Changing default value here. + convert_urns_to_lowercase: bool = Field( + default=True, + ) + + +# TODO: SnowflakeConfig is unused except for this inheritance. We should collapse the config inheritance hierarchy. 
+class SnowflakeConfig( + SnowflakeIdentifierConfig, + SnowflakeFilterConfig, + # SnowflakeFilterConfig must come before (higher precedence) the SQLCommon config, so that the documentation overrides are applied. + SnowflakeConnectionConfig, + BaseTimeWindowConfig, + SQLCommonConfig, +): + include_table_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", + ) + include_view_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", + ) + + ignore_start_time_lineage: bool = False + upstream_lineage_in_report: bool = False + + @pydantic.root_validator(skip_on_failure=True) + def validate_include_view_lineage(cls, values): + if ( + "include_table_lineage" in values + and not values.get("include_table_lineage") + and values.get("include_view_lineage") + ): + raise ValueError( + "include_table_lineage must be True for include_view_lineage to be set." + ) + return values + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, @@ -81,10 +176,6 @@ class SnowflakeV2Config( StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, ): - convert_urns_to_lowercase: bool = Field( - default=True, - ) - include_usage_stats: bool = Field( default=True, description="If enabled, populates the snowflake usage statistics. 
Requires appropriate grants given to the role.", @@ -133,11 +224,6 @@ class SnowflakeV2Config( description="Whether to populate Snowsight url for Snowflake Objects", ) - match_fully_qualified_names: bool = Field( - default=False, - description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", - ) - _use_legacy_lineage_method_removed = pydantic_removed_field( "use_legacy_lineage_method" ) @@ -154,7 +240,7 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( - default=DEFAULT_TABLES_DENY_LIST, + default=DEFAULT_TEMP_TABLES_PATTERNS, description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to " "match the entire table name in database.schema.table format. Defaults are to set in such a way " "to ignore the temporary staging tables created by known ETL tools.", @@ -210,27 +296,6 @@ def validate_unsupported_configs(cls, values: Dict) -> Dict: "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.", ) - match_fully_qualified_names = values.get("match_fully_qualified_names") - - schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern") - - if ( - schema_pattern is not None - and schema_pattern != AllowDenyPattern.allow_all() - and match_fully_qualified_names is not None - and not match_fully_qualified_names - ): - logger.warning( - "Please update `schema_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." - "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " - "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." 
- ) - - # Always exclude reporting metadata for INFORMATION_SCHEMA schema - if schema_pattern is not None and schema_pattern: - logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.") - cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$") - include_technical_schema = values.get("include_technical_schema") include_profiles = ( values.get("profiling") is not None and values["profiling"].enabled @@ -259,7 +324,7 @@ def get_sql_alchemy_url( password: Optional[SecretStr] = None, role: Optional[str] = None, ) -> str: - return BaseSnowflakeConfig.get_sql_alchemy_url( + return SnowflakeConnectionConfig.get_sql_alchemy_url( self, database=database, username=username, password=password, role=role ) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py similarity index 81% rename from metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py rename to metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index 521e755b6a00c5..e981ed3e2e6650 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -5,6 +5,8 @@ import snowflake.connector from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization +from snowflake.connector import SnowflakeConnection as NativeSnowflakeConnection +from snowflake.connector.cursor import DictCursor from snowflake.connector.network import ( DEFAULT_AUTHENTICATOR, EXTERNAL_BROWSER_AUTHENTICATOR, @@ -12,38 +14,41 @@ OAUTH_AUTHENTICATOR, ) -from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError from datahub.configuration.connection_resolver import auto_connection_resolver from 
datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider -from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, CLIENT_SESSION_KEEP_ALIVE, ) from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator -from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri +from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri from datahub.utilities.config_clean import ( remove_protocol, remove_suffix, remove_trailing_slashes, ) -logger: logging.Logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) -APPLICATION_NAME: str = "acryl_datahub" +_APPLICATION_NAME: str = "acryl_datahub" -VALID_AUTH_TYPES: Dict[str, str] = { +_VALID_AUTH_TYPES: Dict[str, str] = { "DEFAULT_AUTHENTICATOR": DEFAULT_AUTHENTICATOR, "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR, "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR, "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR, } -SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" +_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" -class BaseSnowflakeConfig(ConfigModel): +class SnowflakePermissionError(MetaError): + """A permission error has happened""" + + +class SnowflakeConnectionConfig(ConfigModel): # Note: this config model is also used by the snowflake-usage source. 
_connection = auto_connection_resolver() @@ -106,15 +111,15 @@ def get_account(self) -> str: def validate_account_id(cls, account_id: str) -> str: account_id = remove_protocol(account_id) account_id = remove_trailing_slashes(account_id) - account_id = remove_suffix(account_id, SNOWFLAKE_HOST_SUFFIX) + account_id = remove_suffix(account_id, _SNOWFLAKE_HOST_SUFFIX) return account_id @pydantic.validator("authentication_type", always=True) def authenticator_type_is_valid(cls, v, values): - if v not in VALID_AUTH_TYPES.keys(): + if v not in _VALID_AUTH_TYPES.keys(): raise ValueError( f"unsupported authenticator type '{v}' was provided," - f" use one of {list(VALID_AUTH_TYPES.keys())}" + f" use one of {list(_VALID_AUTH_TYPES.keys())}" ) if ( values.get("private_key") is not None @@ -189,10 +194,10 @@ def get_sql_alchemy_url( # Drop the options if value is None. key: value for (key, value) in { - "authenticator": VALID_AUTH_TYPES.get(self.authentication_type), + "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type), "warehouse": self.warehouse, "role": role, - "application": APPLICATION_NAME, + "application": _APPLICATION_NAME, }.items() if value }, @@ -255,7 +260,7 @@ def get_options(self) -> dict: self.options["connect_args"] = options_connect_args return self.options - def get_oauth_connection(self) -> snowflake.connector.SnowflakeConnection: + def get_oauth_connection(self) -> NativeSnowflakeConnection: assert ( self.oauth_config ), "oauth_config should be provided if using oauth based authentication" @@ -292,12 +297,12 @@ def get_oauth_connection(self) -> snowflake.connector.SnowflakeConnection: token=token, role=self.role, warehouse=self.warehouse, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, **connect_args, ) - def get_key_pair_connection(self) -> snowflake.connector.SnowflakeConnection: + def 
get_key_pair_connection(self) -> NativeSnowflakeConnection: connect_args = self.get_options()["connect_args"] return snowflake.connector.connect( @@ -305,12 +310,12 @@ def get_key_pair_connection(self) -> snowflake.connector.SnowflakeConnection: account=self.account_id, warehouse=self.warehouse, role=self.role, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, **connect_args, ) - def get_connection(self) -> snowflake.connector.SnowflakeConnection: + def get_native_connection(self) -> NativeSnowflakeConnection: connect_args = self.get_options()["connect_args"] if self.authentication_type == "DEFAULT_AUTHENTICATOR": return snowflake.connector.connect( @@ -319,7 +324,7 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: account=self.account_id, warehouse=self.warehouse, role=self.role, - application=APPLICATION_NAME, + application=_APPLICATION_NAME, **connect_args, ) elif self.authentication_type == "OAUTH_AUTHENTICATOR": @@ -333,40 +338,59 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: account=self.account_id, warehouse=self.warehouse, role=self.role, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, **connect_args, ) else: # not expected to be here raise Exception("Not expected to be here.") + def get_connection(self) -> "SnowflakeConnection": + try: + return SnowflakeConnection(self.get_native_connection()) + except Exception as e: + logger.debug(e, exc_info=e) -class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig): - include_table_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. 
Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", - ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) + if "not granted to this user" in str(e): + raise SnowflakePermissionError( + f"Permissions error when connecting to snowflake: {e}" + ) from e - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) + raise ConfigurationError( + f"Failed to connect to snowflake instance: {e}" + ) from e - ignore_start_time_lineage: bool = False - upstream_lineage_in_report: bool = False - @pydantic.root_validator(skip_on_failure=True) - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." 
- ) - return values +class SnowflakeConnection: + _connection: NativeSnowflakeConnection + + def __init__(self, connection: NativeSnowflakeConnection): + self._connection = connection + + def native_connection(self) -> NativeSnowflakeConnection: + return self._connection + + def query(self, query: str) -> Any: + try: + logger.info(f"Query: {query}", stacklevel=2) + resp = self._connection.cursor(DictCursor).execute(query) + return resp + + except Exception as e: + if _is_permission_error(e): + raise SnowflakePermissionError(e) from e + raise + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def close(self): + self._connection.close() + + +def _is_permission_error(e: Exception) -> bool: + msg = str(e) + # 002003 (02000): SQL compilation error: Database/SCHEMA 'XXXX' does not exist or not authorized. + # Insufficient privileges to operate on database 'XXXX' + return "Insufficient privileges" in msg or "not authorized" in msg diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py index 9fa81cb1bd20cb..c9615bb498fe48 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py @@ -2,9 +2,9 @@ from typing import Any, Callable, Dict, List import pandas as pd -from snowflake.connector import SnowflakeConnection from datahub.ingestion.source.common.data_reader import DataReader +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.utilities.perf_timer import PerfTimer logger = logging.Logger(__name__) @@ -39,7 +39,7 @@ def get_sample_data_for_table( logger.debug( f"Collecting sample values for table {db_name}.{schema_name}.{table_name}" ) - with PerfTimer() as timer, self.conn.cursor() as cursor: + with PerfTimer() as timer, 
self.conn.native_connection().cursor() as cursor: sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({sample_size} rows);' cursor.execute(sql) dat = cursor.fetchall() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index b12ef4d19c45c8..3e65f062004189 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -5,9 +5,9 @@ from typing import Any, Callable, Collection, Iterable, List, Optional, Set, Tuple, Type from pydantic import BaseModel, validator -from snowflake.connector import SnowflakeConnection from datahub.configuration.datetimes import parse_absolute_time +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( @@ -15,14 +15,13 @@ SnowflakeEdition, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, SnowflakePermissionError, - SnowflakeQueryMixin, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) 
@@ -30,6 +29,7 @@ from datahub.sql_parsing.sql_parsing_aggregator import ( ColumnLineageInfo, ColumnRef, + KnownLineageMapping, KnownQueryLineageInfo, SqlParsingAggregator, UrnStr, @@ -101,9 +101,7 @@ class SnowflakeColumnId: object_domain: Optional[str] = None -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): +class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable): """ Extracts Lineage from Snowflake. Following lineage edges are considered. @@ -120,6 +118,7 @@ def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, + connection: SnowflakeConnection, dataset_urn_builder: Callable[[str], str], redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler], sql_aggregator: SqlParsingAggregator, @@ -128,7 +127,7 @@ def __init__( self.report = report self.logger = logger self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None + self.connection = connection self.sql_aggregator = sql_aggregator self.redundant_run_skip_handler = redundant_run_skip_handler @@ -165,10 +164,6 @@ def get_workunits( if not self._should_ingest_lineage(): return - self.connection = self.create_connection() - if self.connection is None: - return - # s3 dataset -> snowflake table self._populate_external_upstreams(discovered_tables) @@ -264,13 +259,20 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 - self._populate_external_lineage_from_copy_history(discovered_tables) + for ( + known_lineage_mapping + ) in self._populate_external_lineage_from_copy_history(discovered_tables): + self.sql_aggregator.add(known_lineage_mapping) logger.info( "Done populating external lineage from copy history. " f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." 
) - self._populate_external_lineage_from_show_query(discovered_tables) + for ( + known_lineage_mapping + ) in self._populate_external_lineage_from_show_query(discovered_tables): + self.sql_aggregator.add(known_lineage_mapping) + logger.info( "Done populating external lineage from show external tables. " f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." @@ -282,10 +284,10 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: # NOTE: Snowflake does not log this information to the access_history table. def _populate_external_lineage_from_show_query( self, discovered_tables: List[str] - ) -> None: + ) -> Iterable[KnownLineageMapping]: external_tables_query: str = SnowflakeQuery.show_external_tables() try: - for db_row in self.query(external_tables_query): + for db_row in self.connection.query(external_tables_query): key = self.get_dataset_identifier( db_row["name"], db_row["schema_name"], db_row["database_name"] ) @@ -293,11 +295,11 @@ def _populate_external_lineage_from_show_query( if key not in discovered_tables: continue if db_row["location"].startswith("s3://"): - self.sql_aggregator.add_known_lineage_mapping( - downstream_urn=self.dataset_urn_builder(key), + yield KnownLineageMapping( upstream_urn=make_s3_urn_for_lineage( db_row["location"], self.config.env ), + downstream_urn=self.dataset_urn_builder(key), ) self.report.num_external_table_edges_scanned += 1 @@ -316,7 +318,7 @@ def _populate_external_lineage_from_show_query( # NOTE: Snowflake does not log this information to the access_history table. 
def _populate_external_lineage_from_copy_history( self, discovered_tables: List[str] - ) -> None: + ) -> Iterable[KnownLineageMapping]: query: str = SnowflakeQuery.copy_lineage_history( start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), @@ -324,8 +326,12 @@ def _populate_external_lineage_from_copy_history( ) try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row, discovered_tables) + for db_row in self.connection.query(query): + known_lineage_mapping = self._process_external_lineage_result_row( + db_row, discovered_tables + ) + if known_lineage_mapping: + yield known_lineage_mapping except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. " @@ -340,35 +346,40 @@ def _populate_external_lineage_from_copy_history( def _process_external_lineage_result_row( self, db_row: dict, discovered_tables: List[str] - ) -> None: + ) -> Optional[KnownLineageMapping]: # key is the down-stream table name key: str = self.get_dataset_identifier_from_qualified_name( db_row["DOWNSTREAM_TABLE_NAME"] ) if key not in discovered_tables: - return + return None if db_row["UPSTREAM_LOCATIONS"] is not None: external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) for loc in external_locations: if loc.startswith("s3://"): - self.sql_aggregator.add_known_lineage_mapping( - downstream_urn=self.dataset_urn_builder(key), + self.report.num_external_table_edges_scanned += 1 + return KnownLineageMapping( upstream_urn=make_s3_urn_for_lineage(loc, self.config.env), + downstream_urn=self.dataset_urn_builder(key), ) - self.report.num_external_table_edges_scanned += 1 + + return None def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: query: str = SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=int(self.start_time.timestamp() * 1000), 
end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, - include_view_lineage=self.config.include_view_lineage, + # The self.config.include_view_lineage setting is about fetching upstreams of views. + # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False. + # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code. + include_view_lineage=True, include_column_lineage=self.config.include_column_lineage, ) try: - for db_row in self.query(query): + for db_row in self.connection.query(query): edge = self._process_upstream_lineage_row(db_row) if edge: yield edge @@ -409,10 +420,12 @@ def map_query_result_upstreams( upstream_name = self.get_dataset_identifier_from_qualified_name( upstream_table.upstream_object_name ) - if upstream_name and self._is_dataset_pattern_allowed( - upstream_name, - upstream_table.upstream_object_domain, - is_upstream=True, + if upstream_name and ( + not self.config.validate_upstreams_against_patterns + or self.is_dataset_pattern_allowed( + upstream_name, + upstream_table.upstream_object_domain, + ) ): upstreams.append(self.dataset_urn_builder(upstream_name)) except Exception as e: @@ -493,10 +506,12 @@ def build_finegrained_lineage_upstreams( if ( upstream_col.object_name and upstream_col.column_name - and self._is_dataset_pattern_allowed( - upstream_col.object_name, - upstream_col.object_domain, - is_upstream=True, + and ( + not self.config.validate_upstreams_against_patterns + or self.is_dataset_pattern_allowed( + upstream_col.object_name, + upstream_col.object_domain, + ) ) ): upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( @@ -547,3 +562,6 @@ def _should_ingest_lineage(self) -> bool: def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: self.redundant_run_skip_handler.report_current_run_status(step, status) + + def close(self) -> None: + 
pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 5e6ade29344eb0..4deeb9f96f48eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -164,7 +164,7 @@ def callable_for_db_connection(self, db_name: str) -> Callable: schema_name = self.database_default_schema.get(db_name) def get_db_connection(): - conn = self.config.get_connection() + conn = self.config.get_native_connection() conn.cursor().execute(SnowflakeQuery.use_database(db_name)) # As mentioned here - https://docs.snowflake.com/en/sql-reference/sql/use-database#usage-notes diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py new file mode 100644 index 00000000000000..c647a624a54673 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -0,0 +1,515 @@ +import functools +import json +import logging +import pathlib +import re +import tempfile +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Any, Dict, Iterable, List, Optional, Union + +import pydantic +from typing_extensions import Self + +from datahub.configuration.time_window_config import ( + BaseTimeWindowConfig, + BucketDuration, +) +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.report import Report +from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain +from datahub.ingestion.source.snowflake.snowflake_config import ( + 
DEFAULT_TEMP_TABLES_PATTERNS, + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, +) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakeConnectionConfig, +) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeFilterMixin, + SnowflakeIdentifierMixin, +) +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.metadata.urns import CorpUserUrn +from datahub.sql_parsing.sql_parsing_aggregator import ( + KnownLineageMapping, + PreparsedQuery, + SqlAggregatorReport, + SqlParsingAggregator, +) +from datahub.sql_parsing.sql_parsing_common import QueryType +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + DownstreamColumnRef, +) +from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList + +logger = logging.getLogger(__name__) + + +class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilterConfig): + # TODO: Support stateful ingestion for the time windows. + window: BaseTimeWindowConfig = BaseTimeWindowConfig() + + # TODO: make this a proper allow/deny pattern + deny_usernames: List[str] = [] + + temporary_tables_pattern: List[str] = pydantic.Field( + default=DEFAULT_TEMP_TABLES_PATTERNS, + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to " + "match the entire table name in database.schema.table format. Defaults are to set in such a way " + "to ignore the temporary staging tables created by known ETL tools.", + ) + + local_temp_path: Optional[pathlib.Path] = pydantic.Field( + default=None, + description="Local path to store the audit log.", + # TODO: For now, this is simply an advanced config to make local testing easier. + # Eventually, we will want to store date-specific files in the directory and use it as a cache. 
+ hidden_from_docs=True, + ) + + convert_urns_to_lowercase: bool = pydantic.Field( + # Override the default. + default=True, + description="Whether to convert dataset urns to lowercase.", + ) + + include_lineage: bool = True + include_queries: bool = True + include_usage_statistics: bool = True + include_query_usage_statistics: bool = False + include_operations: bool = True + + +class SnowflakeQueriesSourceConfig(SnowflakeQueriesExtractorConfig): + connection: SnowflakeConnectionConfig + + +@dataclass +class SnowflakeQueriesExtractorReport(Report): + window: Optional[BaseTimeWindowConfig] = None + + sql_aggregator: Optional[SqlAggregatorReport] = None + + +@dataclass +class SnowflakeQueriesSourceReport(SourceReport): + queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None + + +class SnowflakeQueriesExtractor(SnowflakeFilterMixin, SnowflakeIdentifierMixin): + def __init__( + self, + connection: SnowflakeConnection, + config: SnowflakeQueriesExtractorConfig, + structured_report: SourceReport, + ): + self.connection = connection + + self.config = config + self.report = SnowflakeQueriesExtractorReport() + self._structured_report = structured_report + + self.aggregator = SqlParsingAggregator( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + # graph=self.ctx.graph, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + # TODO make the rest of the fields configurable + ), + generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, + format_queries=False, + ) + 
self.report.sql_aggregator = self.aggregator.report + + @property + def structured_reporter(self) -> SourceReport: + return self._structured_report + + @property + def filter_config(self) -> SnowflakeFilterConfig: + return self.config + + @property + def identifier_config(self) -> SnowflakeIdentifierConfig: + return self.config + + @functools.cached_property + def local_temp_path(self) -> pathlib.Path: + if self.config.local_temp_path: + assert self.config.local_temp_path.is_dir() + return self.config.local_temp_path + + path = pathlib.Path(tempfile.mkdtemp()) + path.mkdir(parents=True, exist_ok=True) + logger.info(f"Using local temp path: {path}") + return path + + def is_temp_table(self, name: str) -> bool: + return any( + re.match(pattern, name, flags=re.IGNORECASE) + for pattern in self.config.temporary_tables_pattern + ) + + def is_allowed_table(self, name: str) -> bool: + return self.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE) + + def get_workunits_internal( + self, + ) -> Iterable[MetadataWorkUnit]: + self.report.window = self.config.window + + # TODO: Add some logic to check if the cached audit log is stale or not. 
+ audit_log_file = self.local_temp_path / "audit_log.sqlite" + use_cached_audit_log = audit_log_file.exists() + + queries: FileBackedList[Union[KnownLineageMapping, PreparsedQuery]] + if use_cached_audit_log: + logger.info("Using cached audit log") + shared_connection = ConnectionWrapper(audit_log_file) + queries = FileBackedList(shared_connection) + else: + audit_log_file.unlink(missing_ok=True) + + shared_connection = ConnectionWrapper(audit_log_file) + queries = FileBackedList(shared_connection) + + logger.info("Fetching audit log") + for entry in self.fetch_audit_log(): + queries.append(entry) + + for query in queries: + self.aggregator.add(query) + + yield from auto_workunit(self.aggregator.gen_metadata()) + + def fetch_audit_log( + self, + ) -> Iterable[Union[KnownLineageMapping, PreparsedQuery]]: + """ + # TODO: we need to fetch this info from somewhere + discovered_tables = [] + + snowflake_lineage_v2 = SnowflakeLineageExtractor( + config=self.config, # type: ignore + report=self.report, # type: ignore + dataset_urn_builder=self.gen_dataset_urn, + redundant_run_skip_handler=None, + sql_aggregator=self.aggregator, # TODO this should be unused + ) + + for ( + known_lineage_mapping + ) in snowflake_lineage_v2._populate_external_lineage_from_copy_history( + discovered_tables=discovered_tables + ): + interim_results.append(known_lineage_mapping) + + for ( + known_lineage_mapping + ) in snowflake_lineage_v2._populate_external_lineage_from_show_query( + discovered_tables=discovered_tables + ): + interim_results.append(known_lineage_mapping) + """ + + audit_log_query = _build_enriched_audit_log_query( + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + bucket_duration=self.config.window.bucket_duration, + deny_usernames=self.config.deny_usernames, + ) + + resp = self.connection.query(audit_log_query) + + for i, row in enumerate(resp): + if i % 1000 == 0: + logger.info(f"Processed {i} audit log rows") + + assert isinstance(row, 
dict) + try: + entry = self._parse_audit_log_row(row) + except Exception as e: + self.structured_reporter.warning( + "Error parsing audit log row", + context=f"{row}", + exc=e, + ) + else: + yield entry + + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: + # Copied from SnowflakeCommonMixin. + return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) + + def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: + json_fields = { + "DIRECT_OBJECTS_ACCESSED", + "OBJECTS_MODIFIED", + } + + res = {} + for key, value in row.items(): + if key in json_fields and value: + value = json.loads(value) + key = key.lower() + res[key] = value + + direct_objects_accessed = res["direct_objects_accessed"] + objects_modified = res["objects_modified"] + + upstreams = [] + column_usage = {} + + for obj in direct_objects_accessed: + dataset = self.gen_dataset_urn( + self.get_dataset_identifier_from_qualified_name(obj["objectName"]) + ) + + columns = set() + for modified_column in obj["columns"]: + columns.add(self.snowflake_identifier(modified_column["columnName"])) + + upstreams.append(dataset) + column_usage[dataset] = columns + + downstream = None + column_lineage = None + for obj in objects_modified: + # We don't expect there to be more than one object modified. 
+ if downstream: + self.structured_reporter.report_warning( + message="Unexpectedly got multiple downstream entities from the Snowflake audit log.", + context=f"{row}", + ) + + downstream = self.gen_dataset_urn( + self.get_dataset_identifier_from_qualified_name(obj["objectName"]) + ) + column_lineage = [] + for modified_column in obj["columns"]: + column_lineage.append( + ColumnLineageInfo( + downstream=DownstreamColumnRef( + dataset=downstream, + column=self.snowflake_identifier( + modified_column["columnName"] + ), + ), + upstreams=[ + ColumnRef( + table=self.gen_dataset_urn( + self.get_dataset_identifier_from_qualified_name( + upstream["objectName"] + ) + ), + column=self.snowflake_identifier( + upstream["columnName"] + ), + ) + for upstream in modified_column["directSources"] + if upstream["objectDomain"] + in SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS + ], + ) + ) + + # TODO: Support filtering the table names. + # if objects_modified: + # breakpoint() + + # TODO implement email address mapping + user = CorpUserUrn(res["user_name"]) + + timestamp: datetime = res["query_start_time"] + timestamp = timestamp.astimezone(timezone.utc) + + # TODO need to map snowflake query types to ours + query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get( + res["query_type"], QueryType.UNKNOWN + ) + + entry = PreparsedQuery( + query_id=res["query_fingerprint"], + query_text=res["query_text"], + upstreams=upstreams, + downstream=downstream, + column_lineage=column_lineage, + column_usage=column_usage, + inferred_schema=None, + confidence_score=1, + query_count=res["query_count"], + user=user, + timestamp=timestamp, + session_id=res["session_id"], + query_type=query_type, + ) + return entry + + +class SnowflakeQueriesSource(Source): + def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): + self.ctx = ctx + self.config = config + self.report = SnowflakeQueriesSourceReport() + + self.platform = "snowflake" + + self.connection = 
self.config.connection.get_connection() + + self.queries_extractor = SnowflakeQueriesExtractor( + connection=self.connection, + config=self.config, + structured_report=self.report, + ) + self.report.queries_extractor = self.queries_extractor.report + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Self: + config = SnowflakeQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + # TODO: Disable auto status processor? + return self.queries_extractor.get_workunits_internal() + + def get_report(self) -> SnowflakeQueriesSourceReport: + return self.report + + +# Make sure we don't try to generate too much info for a single query. +_MAX_TABLES_PER_QUERY = 20 + + +def _build_enriched_audit_log_query( + start_time: datetime, + end_time: datetime, + bucket_duration: BucketDuration, + deny_usernames: Optional[List[str]], +) -> str: + start_time_millis = int(start_time.timestamp() * 1000) + end_time_millis = int(end_time.timestamp() * 1000) + + users_filter = "" + if deny_usernames: + user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames) + users_filter = f"user_name NOT IN ({user_not_in})" + + time_bucket_size = bucket_duration.value + assert time_bucket_size in ("HOUR", "DAY", "MONTH") + + return f"""\ +WITH +fingerprinted_queries as ( + SELECT *, + -- TODO: Generate better fingerprints for each query by pushing down regex logic. 
+ query_history.query_parameterized_hash as query_fingerprint + FROM + snowflake.account_usage.query_history + WHERE + query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) + AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) + AND execution_status = 'SUCCESS' + AND {users_filter or 'TRUE'} +) +, deduplicated_queries as ( + SELECT + *, + DATE_TRUNC( + {time_bucket_size}, + CONVERT_TIMEZONE('UTC', start_time) + ) AS bucket_start_time, + COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count, + FROM + fingerprinted_queries + QUALIFY + ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1 +) +, raw_access_history AS ( + SELECT + query_id, + query_start_time, + user_name, + direct_objects_accessed, + objects_modified, + FROM + snowflake.account_usage.access_history + WHERE + query_start_time >= to_timestamp_ltz({start_time_millis}, 3) + AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) + AND {users_filter or 'TRUE'} + AND query_id IN ( + SELECT query_id FROM deduplicated_queries + ) +) +, filtered_access_history AS ( + -- TODO: Add table filter clause. + SELECT + query_id, + query_start_time, + ARRAY_SLICE( + FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}), + 0, {_MAX_TABLES_PER_QUERY} + ) as direct_objects_accessed, + -- TODO: Drop the columns.baseSources subfield. 
+ FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified, + FROM raw_access_history + WHERE ( array_size(direct_objects_accessed) > 0 or array_size(objects_modified) > 0 ) +) +, query_access_history AS ( + SELECT + q.bucket_start_time, + q.query_id, + q.query_fingerprint, + q.query_count, + q.session_id AS "SESSION_ID", + q.start_time AS "QUERY_START_TIME", + q.total_elapsed_time AS "QUERY_DURATION", + q.query_text AS "QUERY_TEXT", + q.query_type AS "QUERY_TYPE", + q.database_name as "DEFAULT_DB", + q.schema_name as "DEFAULT_SCHEMA", + q.rows_inserted AS "ROWS_INSERTED", + q.rows_updated AS "ROWS_UPDATED", + q.rows_deleted AS "ROWS_DELETED", + q.user_name AS "USER_NAME", + q.role_name AS "ROLE_NAME", + a.direct_objects_accessed, + a.objects_modified, + FROM deduplicated_queries q + JOIN filtered_access_history a USING (query_id) +) +SELECT * FROM query_access_history +""" + + +SNOWFLAKE_QUERY_TYPE_MAPPING = { + "INSERT": QueryType.INSERT, + "UPDATE": QueryType.UPDATE, + "DELETE": QueryType.DELETE, + "CREATE": QueryType.CREATE_OTHER, + "CREATE_TABLE": QueryType.CREATE_DDL, + "CREATE_VIEW": QueryType.CREATE_VIEW, + "CREATE_TABLE_AS_SELECT": QueryType.CREATE_TABLE_AS_SELECT, + "MERGE": QueryType.MERGE, + "COPY": QueryType.UNKNOWN, + "TRUNCATE_TABLE": QueryType.UNKNOWN, +} diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 9f655b34177fc6..a2e18a64d9a809 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -3,7 +3,9 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain -from 
datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST +from datahub.ingestion.source.snowflake.snowflake_config import ( + DEFAULT_TEMP_TABLES_PATTERNS, +) from datahub.utilities.prefix_batch_builder import PrefixGroup SHOW_VIEWS_MAX_PAGE_SIZE = 10000 @@ -28,13 +30,15 @@ def create_deny_regex_sql_filter( class SnowflakeQuery: - ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = ( - "(" - f"'{SnowflakeObjectDomain.TABLE.capitalize()}'," - f"'{SnowflakeObjectDomain.EXTERNAL_TABLE.capitalize()}'," - f"'{SnowflakeObjectDomain.VIEW.capitalize()}'," - f"'{SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize()}'" - ")" + ACCESS_HISTORY_TABLE_VIEW_DOMAINS = { + SnowflakeObjectDomain.TABLE.capitalize(), + SnowflakeObjectDomain.EXTERNAL_TABLE.capitalize(), + SnowflakeObjectDomain.VIEW.capitalize(), + SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(), + } + + ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format( + ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS) ) ACCESS_HISTORY_TABLE_DOMAINS_FILTER = ( "(" @@ -356,7 +360,7 @@ def table_to_table_lineage_history_v2( end_time_millis: int, include_view_lineage: bool = True, include_column_lineage: bool = True, - upstreams_deny_pattern: List[str] = DEFAULT_TABLES_DENY_LIST, + upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: if include_column_lineage: return SnowflakeQuery.table_upstreams_with_column_lineage( @@ -407,7 +411,7 @@ def show_external_tables() -> str: def copy_lineage_history( start_time_millis: int, end_time_millis: int, - downstreams_deny_pattern: List[str] = DEFAULT_TABLES_DENY_LIST, + downstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: temp_table_filter = create_deny_regex_sql_filter( downstreams_deny_pattern, @@ -450,7 +454,7 @@ def usage_per_object_per_time_bucket_for_time_window( include_top_n_queries: bool, email_domain: Optional[str], email_filter: AllowDenyPattern, - table_deny_pattern: List[str] = 
DEFAULT_TABLES_DENY_LIST, + table_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: if not include_top_n_queries: top_n_queries = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 4bc684a22514c4..ce8f20d23aa6b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -5,15 +5,13 @@ from datetime import datetime from typing import Callable, Dict, Iterable, List, MutableMapping, Optional -from snowflake.connector import SnowflakeConnection - from datahub.ingestion.api.report import SupportsAsObj from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import ( SHOW_VIEWS_MAX_PAGE_SIZE, SnowflakeQuery, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeQueryMixin from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.prefix_batch_builder import build_prefix_batches @@ -185,19 +183,12 @@ def get_column_tags_for_table( ) -class SnowflakeDataDictionary(SnowflakeQueryMixin, SupportsAsObj): - def __init__(self) -> None: +class SnowflakeDataDictionary(SupportsAsObj): + def __init__(self, connection: SnowflakeConnection) -> None: self.logger = logger - self.connection: Optional[SnowflakeConnection] = None - def set_connection(self, connection: SnowflakeConnection) -> None: self.connection = connection - def get_connection(self) -> SnowflakeConnection: - # Connection is already present by the time this is called - assert self.connection is not None - return self.connection - def as_obj(self) -> Dict[str, 
Dict[str, int]]: # TODO: Move this into a proper report type that gets computed. @@ -221,7 +212,7 @@ def as_obj(self) -> Dict[str, Dict[str, int]]: def show_databases(self) -> List[SnowflakeDatabase]: databases: List[SnowflakeDatabase] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_databases(), ) @@ -238,7 +229,7 @@ def show_databases(self) -> List[SnowflakeDatabase]: def get_databases(self, db_name: str) -> List[SnowflakeDatabase]: databases: List[SnowflakeDatabase] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_databases(db_name), ) @@ -256,7 +247,7 @@ def get_databases(self, db_name: str) -> List[SnowflakeDatabase]: def get_schemas_for_database(self, db_name: str) -> List[SnowflakeSchema]: snowflake_schemas = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.schemas_for_database(db_name), ) @@ -276,7 +267,7 @@ def get_tables_for_database( ) -> Optional[Dict[str, List[SnowflakeTable]]]: tables: Dict[str, List[SnowflakeTable]] = {} try: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.tables_for_database(db_name), ) except Exception as e: @@ -309,7 +300,7 @@ def get_tables_for_schema( ) -> List[SnowflakeTable]: tables: List[SnowflakeTable] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.tables_for_schema(schema_name, db_name), ) @@ -337,7 +328,7 @@ def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]] first_iteration = True view_pagination_marker: Optional[str] = None while first_iteration or view_pagination_marker is not None: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_views_for_database( db_name, limit=page_limit, @@ -406,7 +397,7 @@ def get_columns_for_schema( schema_name, db_name, object_batch ) - cur = self.query(query) + cur = self.connection.query(query) for column in cur: if column["TABLE_NAME"] not in columns: @@ -430,7 +421,7 @@ def get_pk_constraints_for_schema( self, schema_name: str, 
db_name: str ) -> Dict[str, SnowflakePK]: constraints: Dict[str, SnowflakePK] = {} - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_primary_keys_for_schema(schema_name, db_name), ) @@ -449,7 +440,7 @@ def get_fk_constraints_for_schema( constraints: Dict[str, List[SnowflakeFK]] = {} fk_constraints_map: Dict[str, SnowflakeFK] = {} - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_foreign_keys_for_schema(schema_name, db_name), ) @@ -481,7 +472,7 @@ def get_tags_for_database_without_propagation( self, db_name: str, ) -> _SnowflakeTagCache: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_all_tags_in_database_without_propagation(db_name) ) @@ -536,7 +527,7 @@ def get_tags_for_object_with_propagation( ) -> List[SnowflakeTag]: tags: List[SnowflakeTag] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_all_tags_on_object_with_propagation( db_name, quoted_identifier, domain ), @@ -557,7 +548,7 @@ def get_tags_on_columns_for_table( self, quoted_table_name: str, db_name: str ) -> Dict[str, List[SnowflakeTag]]: tags: Dict[str, List[SnowflakeTag]] = defaultdict(list) - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_tags_on_columns_with_propagation( db_name, quoted_table_name ), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index ac2a3ced5a232c..e604ed96b8eb6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -2,9 +2,7 @@ import itertools import logging import queue -from typing import Dict, Iterable, List, Optional, Union - -from snowflake.connector import SnowflakeConnection +from typing import Callable, Dict, Iterable, List, Optional, Union from datahub.configuration.pattern_utils import is_schema_allowed 
from datahub.emitter.mce_builder import ( @@ -14,6 +12,7 @@ make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.glossary.classification_mixin import ( ClassificationHandler, @@ -29,9 +28,15 @@ SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, SnowflakeV2Config, TagOption, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakePermissionError, +) from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -49,11 +54,9 @@ ) from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeCommonProtocol, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, + SnowflakeFilterMixin, + SnowflakeIdentifierMixin, + SnowsightUrlBuilder, ) from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, @@ -140,29 +143,26 @@ } -class SnowflakeSchemaGenerator( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, - SnowflakeCommonMixin, - SnowflakeCommonProtocol, -): +class SnowflakeSchemaGenerator(SnowflakeFilterMixin, SnowflakeIdentifierMixin): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, + dataset_urn_builder: Callable[[str], str], domain_registry: Optional[DomainRegistry], profiler: Optional[SnowflakeProfiler], aggregator: Optional[SqlParsingAggregator], - snowsight_base_url: Optional[str], + snowsight_url_builder: Optional[SnowsightUrlBuilder], ) -> None: 
self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report self.connection: SnowflakeConnection = connection - self.logger = logger + self.dataset_urn_builder = dataset_urn_builder - self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary() - self.data_dictionary.set_connection(self.connection) + self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary( + connection=self.connection + ) self.report.data_dictionary_cache = self.data_dictionary self.domain_registry: Optional[DomainRegistry] = domain_registry @@ -171,7 +171,9 @@ def __init__( config, self.data_dictionary, self.report ) self.profiler: Optional[SnowflakeProfiler] = profiler - self.snowsight_base_url: Optional[str] = snowsight_base_url + self.snowsight_url_builder: Optional[ + SnowsightUrlBuilder + ] = snowsight_url_builder # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] @@ -180,11 +182,23 @@ def __init__( def get_connection(self) -> SnowflakeConnection: return self.connection + @property + def structured_reporter(self) -> SourceReport: + return self.report + + @property + def filter_config(self) -> SnowflakeFilterConfig: + return self.config + + @property + def identifier_config(self) -> SnowflakeIdentifierConfig: + return self.config + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.databases = [] for database in self.get_databases() or []: self.report.report_entity_scanned(database.name, "database") - if not self.config.database_pattern.allowed(database.name): + if not self.filter_config.database_pattern.allowed(database.name): self.report.report_dropped(f"{database.name}.*") else: self.databases.append(database) @@ -348,10 +362,10 @@ def fetch_schemas_for_database( for schema in self.data_dictionary.get_schemas_for_database(db_name): self.report.report_entity_scanned(schema.name, "schema") if not is_schema_allowed( - self.config.schema_pattern, + 
self.filter_config.schema_pattern, schema.name, db_name, - self.config.match_fully_qualified_names, + self.filter_config.match_fully_qualified_names, ): self.report.report_dropped(f"{db_name}.{schema.name}.*") else: @@ -432,7 +446,7 @@ def _process_schema( ) if view.view_definition: self.aggregator.add_view_definition( - view_urn=self.gen_dataset_urn(view_identifier), + view_urn=self.dataset_urn_builder(view_identifier), view_definition=view.view_definition, default_db=db_name, default_schema=schema_name, @@ -462,7 +476,7 @@ def fetch_views_for_schema( self.report.report_entity_scanned(view_name, "view") - if not self.config.view_pattern.allowed(view_name): + if not self.filter_config.view_pattern.allowed(view_name): self.report.report_dropped(view_name) else: views.append(view) @@ -495,7 +509,7 @@ def fetch_tables_for_schema( table.name, schema_name, db_name ) self.report.report_entity_scanned(table_identifier) - if not self.config.table_pattern.allowed(table_identifier): + if not self.filter_config.table_pattern.allowed(table_identifier): self.report.report_dropped(table_identifier) else: tables.append(table) @@ -664,7 +678,7 @@ def gen_dataset_workunits( yield from self._process_tag(tag) dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.gen_dataset_urn(dataset_name) + dataset_urn = self.dataset_urn_builder(dataset_name) status = Status(removed=False) yield MetadataChangeProposalWrapper( @@ -768,7 +782,7 @@ def get_dataset_properties( qualifiedName=f"{db_name}.{schema_name}.{table.name}", customProperties={}, externalUrl=( - self.get_external_url_for_table( + self.snowsight_url_builder.get_external_url_for_table( table.name, schema_name, db_name, @@ -778,7 +792,7 @@ def get_dataset_properties( else SnowflakeObjectDomain.VIEW ), ) - if self.config.include_external_url + if self.snowsight_url_builder else None ), ) @@ -802,7 +816,7 @@ def gen_schema_metadata( db_name: str, ) -> SchemaMetadata: dataset_name = 
self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.gen_dataset_urn(dataset_name) + dataset_urn = self.dataset_urn_builder(dataset_name) foreign_keys: Optional[List[ForeignKeyConstraint]] = None if isinstance(table, SnowflakeTable) and len(table.foreign_keys) > 0: @@ -907,8 +921,8 @@ def gen_database_containers( domain_registry=self.domain_registry, domain_config=self.config.domain, external_url=( - self.get_external_url_for_database(database.name) - if self.config.include_external_url + self.snowsight_url_builder.get_external_url_for_database(database.name) + if self.snowsight_url_builder else None ), description=database.comment, @@ -963,8 +977,10 @@ def gen_schema_containers( domain_registry=self.domain_registry, description=schema.comment, external_url=( - self.get_external_url_for_schema(schema.name, db_name) - if self.config.include_external_url + self.snowsight_url_builder.get_external_url_for_schema( + schema.name, db_name + ) + if self.snowsight_url_builder else None ), created=( @@ -975,11 +991,7 @@ def gen_schema_containers( last_modified=( int(schema.last_altered.timestamp() * 1000) if schema.last_altered is not None - else ( - int(schema.created.timestamp() * 1000) - if schema.created is not None - else None - ) + else None ), tags=( [self.snowflake_identifier(tag.identifier()) for tag in schema.tags] @@ -1044,23 +1056,3 @@ def get_fk_constraints_for_table( # Access to table but none of its constraints - is this possible ? 
return constraints.get(table_name, []) - - # domain is either "view" or "table" - def get_external_url_for_table( - self, table_name: str, schema_name: str, db_name: str, domain: str - ) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/" - return None - - def get_external_url_for_schema( - self, schema_name: str, db_name: str - ) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/" - return None - - def get_external_url_for_database(self, db_name: str) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/" - return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index cd6f17092e810a..f78ae70291f8a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -3,57 +3,31 @@ from collections import defaultdict from typing import Dict, Iterable, List, Optional -import pydantic -from snowflake.connector import SnowflakeConnection - -from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import LowerCaseDatasetUrnConfigMixin from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.snowflake.snowflake_schema import ( - SnowflakeDatabase, - SnowflakeDataDictionary, +from 
datahub.ingestion.source.snowflake.snowflake_config import SnowflakeFilterConfig +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, ) +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, -) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyList class SnowflakeSummaryConfig( - BaseSnowflakeConfig, BaseTimeWindowConfig, LowerCaseDatasetUrnConfigMixin + SnowflakeFilterConfig, + SnowflakeConnectionConfig, + BaseTimeWindowConfig, + LowerCaseDatasetUrnConfigMixin, ): - - # Copied from SnowflakeConfig. - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - schema_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for schemas to filter in ingestion. Specify regex to only match the schema name. e.g. to match all tables in schema analytics, use the regex 'analytics'", - ) - table_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in database.schema.table format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", - ) - view_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for views to filter in ingestion. Note: Defaults to table_pattern if not specified. 
Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", - ) - match_fully_qualified_names: bool = pydantic.Field( - default=True, - description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", - ) + pass @dataclasses.dataclass @@ -80,37 +54,31 @@ def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: @config_class(SnowflakeSummaryConfig) @support_status(SupportStatus.INCUBATING) -class SnowflakeSummarySource( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, - SnowflakeCommonMixin, - Source, -): +class SnowflakeSummarySource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig): super().__init__(ctx) self.config: SnowflakeSummaryConfig = config self.report: SnowflakeSummaryReport = SnowflakeSummaryReport() - - self.data_dictionary = SnowflakeDataDictionary() - self.connection: Optional[SnowflakeConnection] = None self.logger = logging.getLogger(__name__) - def create_connection(self) -> Optional[SnowflakeConnection]: - # TODO: Eventually we'll want to use the implementation from SnowflakeConnectionMixin, - # since it has better error reporting. - # return super().create_connection() - return self.config.get_connection() + self.connection = self.config.get_connection() def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self.data_dictionary.set_connection(self.connection) + schema_generator = SnowflakeSchemaGenerator( + # This is a hack, but we just hope that the config / report have all the fields we need. 
+ config=self.config, # type: ignore + report=self.report, # type: ignore + connection=self.connection, + dataset_urn_builder=lambda x: "", + domain_registry=None, + profiler=None, + aggregator=None, + snowsight_url_builder=None, + ) # Databases. databases: List[SnowflakeDatabase] = [] - for database in self.get_databases() or []: # type: ignore + for database in schema_generator.get_databases() or []: # TODO: Support database_patterns. if not self.config.database_pattern.allowed(database.name): self.report.report_dropped(f"{database.name}.*") @@ -119,16 +87,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Schemas. for database in databases: - self.fetch_schemas_for_database(database, database.name) # type: ignore + schema_generator.fetch_schemas_for_database(database, database.name) self.report.schema_counters[database.name] = len(database.schemas) for schema in database.schemas: # Tables/views. - tables = self.fetch_tables_for_schema( # type: ignore + tables = schema_generator.fetch_tables_for_schema( schema, database.name, schema.name ) - views = self.fetch_views_for_schema( # type: ignore + views = schema_generator.fetch_views_for_schema( schema, database.name, schema.name ) @@ -139,7 +107,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Queries for usage. start_time_millis = self.config.start_time.timestamp() * 1000 end_time_millis = self.config.end_time.timestamp() * 1000 - for row in self.query( + for row in self.connection.query( f"""\ SELECT COUNT(*) AS CNT FROM snowflake.account_usage.query_history @@ -150,7 +118,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.num_snowflake_queries = row["CNT"] # Queries for lineage/operations. - for row in self.query( + for row in self.connection.query( f"""\ SELECT COUNT(*) AS CNT FROM @@ -166,16 +134,5 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # This source doesn't produce any metadata itself. 
All important information goes into the report. yield from [] - # This is a bit of a hack, but lets us reuse the code from the main ingestion source. - # Mypy doesn't really know how to deal with it though, which is why we have all these - # type ignore comments. - get_databases = SnowflakeSchemaGenerator.get_databases - get_databases_from_ischema = SnowflakeSchemaGenerator.get_databases_from_ischema - fetch_schemas_for_database = SnowflakeSchemaGenerator.fetch_schemas_for_database - fetch_tables_for_schema = SnowflakeSchemaGenerator.fetch_tables_for_schema - fetch_views_for_schema = SnowflakeSchemaGenerator.fetch_views_for_schema - get_tables_for_schema = SnowflakeSchemaGenerator.get_tables_for_schema - get_views_for_schema = SnowflakeSchemaGenerator.get_views_for_schema - def get_report(self) -> SnowflakeSummaryReport: return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index e8b56a01944ad2..c5e0994059f2e4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -5,23 +5,22 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import pydantic -from snowflake.connector import SnowflakeConnection from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.emitter.mce_builder import make_user_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.constants import SnowflakeEdition from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from 
datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, SnowflakePermissionError, - SnowflakeQueryMixin, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, ) @@ -107,13 +106,12 @@ class SnowflakeJoinedAccessEvent(PermissiveModel): role_name: str -class SnowflakeUsageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): +class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, + connection: SnowflakeConnection, dataset_urn_builder: Callable[[str], str], redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler], ) -> None: @@ -121,7 +119,7 @@ def __init__( self.report: SnowflakeV2Report = report self.dataset_urn_builder = dataset_urn_builder self.logger = logger - self.connection: Optional[SnowflakeConnection] = None + self.connection = connection self.redundant_run_skip_handler = redundant_run_skip_handler self.start_time, self.end_time = ( @@ -144,11 +142,6 @@ def get_usage_workunits( return self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - - self.connection = self.create_connection() - if self.connection is None: - return - if self.report.edition == SnowflakeEdition.STANDARD.value: logger.info( "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." 
@@ -207,7 +200,7 @@ def _get_workunits_internal( with PerfTimer() as timer: logger.info("Getting aggregated usage statistics") try: - results = self.query( + results = self.connection.query( SnowflakeQuery.usage_per_object_per_time_bucket_for_time_window( start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), @@ -239,7 +232,7 @@ def _get_workunits_internal( logger.debug(f"Processing usage row number {results.rownumber}") logger.debug(self.report.usage_aggregation.as_string()) - if not self._is_dataset_pattern_allowed( + if not self.is_dataset_pattern_allowed( row["OBJECT_NAME"], row["OBJECT_DOMAIN"], ): @@ -293,7 +286,7 @@ def build_usage_statistics_for_dataset( f"Failed to parse usage statistics for dataset {dataset_identifier} due to error {e}.", exc_info=e, ) - self.report_warning( + self.report.warning( "Failed to parse usage statistics for dataset", dataset_identifier ) @@ -376,7 +369,8 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: with PerfTimer() as timer: query = self._make_operations_query() try: - results = self.query(query) + assert self.connection is not None + results = self.connection.query(query) except Exception as e: logger.debug(e, exc_info=e) self.warn_if_stateful_else_error( @@ -398,7 +392,10 @@ def _make_operations_query(self) -> str: def _check_usage_date_ranges(self) -> None: with PerfTimer() as timer: try: - results = self.query(SnowflakeQuery.get_access_history_date_range()) + assert self.connection is not None + results = self.connection.query( + SnowflakeQuery.get_access_history_date_range() + ) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = "Failed to get usage. Please grant imported privileges on SNOWFLAKE database. 
" @@ -407,7 +404,7 @@ def _check_usage_date_ranges(self) -> None: ) else: logger.debug(e, exc_info=e) - self.report_warning( + self.report.warning( "usage", f"Extracting the date range for usage data from Snowflake failed due to error {e}.", ) @@ -419,7 +416,7 @@ def _check_usage_date_ranges(self) -> None: or db_row["MIN_TIME"] is None or db_row["MAX_TIME"] is None ): - self.report_warning( + self.report.warning( "check-usage-data", f"Missing data for access_history {db_row}.", ) @@ -505,7 +502,7 @@ def _process_snowflake_history_row( yield event except Exception as e: self.report.rows_parsing_error += 1 - self.report_warning( + self.report.warning( "operation", f"Failed to parse operation history row {event_dict}, {e}", ) @@ -564,7 +561,7 @@ def _is_unsupported_object_accessed(self, obj: Dict[str, Any]) -> bool: def _is_object_valid(self, obj: Dict[str, Any]) -> bool: if self._is_unsupported_object_accessed( obj - ) or not self._is_dataset_pattern_allowed( + ) or not self.is_dataset_pattern_allowed( obj.get("objectName"), obj.get("objectDomain") ): return False @@ -590,3 +587,6 @@ def _should_ingest_usage(self) -> bool: def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: self.redundant_run_skip_handler.report_current_run_status(step, status) + + def close(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 5d4cc38469f7ce..c33fbb3d0bfc80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,54 +1,41 @@ -import logging -from typing import Any, Optional +import abc +from typing import ClassVar, Literal, Optional, Tuple -from snowflake.connector import SnowflakeConnection -from snowflake.connector.cursor import DictCursor from typing_extensions import Protocol 
-from datahub.configuration.common import MetaError from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.snowflake.constants import ( - GENERIC_PERMISSION_ERROR_KEY, SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, SnowflakeCloudProvider, SnowflakeObjectDomain, ) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, + SnowflakeV2Config, +) from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakePermissionError(MetaError): - """A permission error has happened""" - - -# Required only for mypy, since we are using mixin classes, and not inheritance. -# Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes -class SnowflakeLoggingProtocol(Protocol): - logger: logging.Logger - -class SnowflakeQueryProtocol(SnowflakeLoggingProtocol, Protocol): - def get_connection(self) -> SnowflakeConnection: +class SnowflakeStructuredReportMixin(abc.ABC): + @property + @abc.abstractmethod + def structured_reporter(self) -> SourceReport: ... + # TODO: Eventually I want to deprecate these methods and use the structured_reporter directly. 
+ def report_warning(self, key: str, reason: str) -> None: + self.structured_reporter.warning(key, reason) -class SnowflakeQueryMixin: - def query(self: SnowflakeQueryProtocol, query: str) -> Any: - try: - self.logger.info(f"Query : {query}", stacklevel=2) - resp = self.get_connection().cursor(DictCursor).execute(query) - return resp - - except Exception as e: - if is_permission_error(e): - raise SnowflakePermissionError(e) from e - raise + def report_error(self, key: str, reason: str) -> None: + self.structured_reporter.failure(key, reason) -class SnowflakeCommonProtocol(SnowflakeLoggingProtocol, Protocol): +# Required only for mypy, since we are using mixin classes, and not inheritance. +# Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes +class SnowflakeCommonProtocol(Protocol): platform: str = "snowflake" config: SnowflakeV2Config @@ -59,6 +46,9 @@ def get_dataset_identifier( ) -> str: ... + def cleanup_qualified_name(self, qualified_name: str) -> str: + ... + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: ... @@ -72,10 +62,8 @@ def report_error(self, key: str, reason: str) -> None: ... 
-class SnowflakeCommonMixin: - platform = "snowflake" - - CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX = [ +class SnowsightUrlBuilder: + CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX: ClassVar = [ "us-west-2", "us-east-1", "eu-west-1", @@ -84,13 +72,21 @@ class SnowflakeCommonMixin: "ap-southeast-2", ] + snowsight_base_url: str + + def __init__(self, account_locator: str, region: str, privatelink: bool = False): + cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region) + self.snowsight_base_url = self.create_snowsight_base_url( + account_locator, cloud_region_id, cloud, privatelink + ) + @staticmethod def create_snowsight_base_url( account_locator: str, cloud_region_id: str, cloud: str, privatelink: bool = False, - ) -> Optional[str]: + ) -> str: if cloud: url_cloud_provider_suffix = f".{cloud}" @@ -99,7 +95,7 @@ def create_snowsight_base_url( # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region if ( cloud_region_id - in SnowflakeCommonMixin.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX + in SnowsightUrlBuilder.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX ): url_cloud_provider_suffix = "" else: @@ -111,7 +107,10 @@ def create_snowsight_base_url( return url @staticmethod - def get_cloud_region_from_snowflake_region_id(region): + def get_cloud_region_from_snowflake_region_id( + region: str, + ) -> Tuple[str, str]: + cloud: str if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys(): cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region] elif region.startswith(("aws_", "gcp_", "azure_")): @@ -122,14 +121,42 @@ def get_cloud_region_from_snowflake_region_id(region): raise Exception(f"Unknown snowflake region {region}") return cloud, cloud_region_id - def _is_dataset_pattern_allowed( - self: SnowflakeCommonProtocol, + # domain is either "view" or "table" + def get_external_url_for_table( + self, + table_name: str, + schema_name: str, + db_name: str, + domain: 
Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW], + ) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/" + + def get_external_url_for_schema( + self, schema_name: str, db_name: str + ) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/" + + def get_external_url_for_database(self, db_name: str) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/" + + +class SnowflakeFilterMixin(SnowflakeStructuredReportMixin): + @property + @abc.abstractmethod + def filter_config(self) -> SnowflakeFilterConfig: + ... + + @staticmethod + def _combine_identifier_parts( + table_name: str, schema_name: str, db_name: str + ) -> str: + return f"{db_name}.{schema_name}.{table_name}" + + def is_dataset_pattern_allowed( + self, dataset_name: Optional[str], dataset_type: Optional[str], - is_upstream: bool = False, ) -> bool: - if is_upstream and not self.config.validate_upstreams_against_patterns: - return True if not dataset_type or not dataset_name: return True dataset_params = dataset_name.split(".") @@ -148,47 +175,101 @@ def _is_dataset_pattern_allowed( # NOTE: this case returned `True` earlier when extracting lineage return False - if not self.config.database_pattern.allowed( + if not self.filter_config.database_pattern.allowed( dataset_params[0].strip('"') ) or not is_schema_allowed( - self.config.schema_pattern, + self.filter_config.schema_pattern, dataset_params[1].strip('"'), dataset_params[0].strip('"'), - self.config.match_fully_qualified_names, + self.filter_config.match_fully_qualified_names, ): return False if dataset_type.lower() in { SnowflakeObjectDomain.TABLE - } and not self.config.table_pattern.allowed( - self.get_dataset_identifier_from_qualified_name(dataset_name) + } and not self.filter_config.table_pattern.allowed( + self.cleanup_qualified_name(dataset_name) ): return False if 
dataset_type.lower() in { - "view", - "materialized_view", - } and not self.config.view_pattern.allowed( - self.get_dataset_identifier_from_qualified_name(dataset_name) + SnowflakeObjectDomain.VIEW, + SnowflakeObjectDomain.MATERIALIZED_VIEW, + } and not self.filter_config.view_pattern.allowed( + self.cleanup_qualified_name(dataset_name) ): return False return True - def snowflake_identifier(self: SnowflakeCommonProtocol, identifier: str) -> str: + # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, + # For example "test-database"."test-schema".test_table + # whereas we generate urns without quotes even for quoted identifiers for backward compatibility + # and also unavailability of utility function to identify whether current table/schema/database + # name should be quoted in above method get_dataset_identifier + def cleanup_qualified_name(self, qualified_name: str) -> str: + name_parts = qualified_name.split(".") + if len(name_parts) != 3: + self.structured_reporter.report_warning( + title="Unexpected dataset pattern", + message="We failed to parse a Snowflake qualified name into its constituent parts. " + "DB/schema/table filtering may not work as expected on these entities.", + context=f"{qualified_name} has {len(name_parts)} parts", + ) + return qualified_name.replace('"', "") + return SnowflakeFilterMixin._combine_identifier_parts( + table_name=name_parts[2].strip('"'), + schema_name=name_parts[1].strip('"'), + db_name=name_parts[0].strip('"'), + ) + + +class SnowflakeIdentifierMixin(abc.ABC): + platform = "snowflake" + + @property + @abc.abstractmethod + def identifier_config(self) -> SnowflakeIdentifierConfig: + ... 
+ + def snowflake_identifier(self, identifier: str) -> str: # to be in in sync with older connector, convert name to lowercase - if self.config.convert_urns_to_lowercase: + if self.identifier_config.convert_urns_to_lowercase: return identifier.lower() return identifier - def gen_dataset_urn(self: SnowflakeCommonProtocol, dataset_identifier: str) -> str: + def get_dataset_identifier( + self, table_name: str, schema_name: str, db_name: str + ) -> str: + return self.snowflake_identifier( + SnowflakeCommonMixin._combine_identifier_parts( + table_name=table_name, schema_name=schema_name, db_name=db_name + ) + ) + + def gen_dataset_urn(self, dataset_identifier: str) -> str: return make_dataset_urn_with_platform_instance( platform=self.platform, name=dataset_identifier, - platform_instance=self.config.platform_instance, - env=self.config.env, + platform_instance=self.identifier_config.platform_instance, + env=self.identifier_config.env, ) + +# TODO: We're most of the way there on fully removing SnowflakeCommonProtocol. 
+class SnowflakeCommonMixin(SnowflakeFilterMixin, SnowflakeIdentifierMixin): + @property + def structured_reporter(self: SnowflakeCommonProtocol) -> SourceReport: + return self.report + + @property + def filter_config(self: SnowflakeCommonProtocol) -> SnowflakeFilterConfig: + return self.config + + @property + def identifier_config(self: SnowflakeCommonProtocol) -> SnowflakeIdentifierConfig: + return self.config + @staticmethod def get_quoted_identifier_for_database(db_name): return f'"{db_name}"' @@ -197,34 +278,13 @@ def get_quoted_identifier_for_database(db_name): def get_quoted_identifier_for_schema(db_name, schema_name): return f'"{db_name}"."{schema_name}"' + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: + return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) + @staticmethod def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' - def get_dataset_identifier( - self: SnowflakeCommonProtocol, table_name: str, schema_name: str, db_name: str - ) -> str: - return self.snowflake_identifier(f"{db_name}.{schema_name}.{table_name}") - - # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, - # For example "test-database"."test-schema".test_table - # whereas we generate urns without quotes even for quoted identifiers for backward compatibility - # and also unavailability of utility function to identify whether current table/schema/database - # name should be quoted in above method get_dataset_identifier - def get_dataset_identifier_from_qualified_name( - self: SnowflakeCommonProtocol, qualified_name: str - ) -> str: - name_parts = qualified_name.split(".") - if len(name_parts) != 3: - self.report.report_warning( - "invalid-dataset-pattern", - f"Found non-parseable {name_parts} for {qualified_name}", - ) - return self.snowflake_identifier(qualified_name.replace('"', "")) - return 
self.get_dataset_identifier( - name_parts[2].strip('"'), name_parts[1].strip('"'), name_parts[0].strip('"') - ) - # Note - decide how to construct user urns. # Historically urns were created using part before @ from user's email. # Users without email were skipped from both user entries as well as aggregates. @@ -255,70 +315,3 @@ def warn_if_stateful_else_error( self.report_warning(key, reason) else: self.report_error(key, reason) - - def report_warning(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.report_warning(key, reason) - self.logger.warning(f"{key} => {reason}") - - def report_error(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.report_failure(key, reason) - self.logger.error(f"{key} => {reason}") - - -class SnowflakeConnectionProtocol(SnowflakeLoggingProtocol, Protocol): - connection: Optional[SnowflakeConnection] - config: SnowflakeV2Config - report: SnowflakeV2Report - - def create_connection(self) -> Optional[SnowflakeConnection]: - ... - - def report_error(self, key: str, reason: str) -> None: - ... 
- - -class SnowflakeConnectionMixin: - def get_connection(self: SnowflakeConnectionProtocol) -> SnowflakeConnection: - if self.connection is None: - # Ideally this is never called here - self.logger.info("Did you forget to initialize connection for module?") - self.connection = self.create_connection() - - # Connection is already present by the time its used for query - # Every module initializes the connection or fails and returns - assert self.connection is not None - return self.connection - - # If connection succeeds, return connection, else return None and report failure - def create_connection( - self: SnowflakeConnectionProtocol, - ) -> Optional[SnowflakeConnection]: - try: - conn = self.config.get_connection() - except Exception as e: - logger.debug(e, exc_info=e) - if "not granted to this user" in str(e): - self.report_error( - GENERIC_PERMISSION_ERROR_KEY, - f"Failed to connect with snowflake due to error {e}", - ) - else: - logger.debug(e, exc_info=e) - self.report_error( - "snowflake-connection", - f"Failed to connect to snowflake instance due to error {e}.", - ) - return None - else: - return conn - - def close(self: SnowflakeConnectionProtocol) -> None: - if self.connection is not None and not self.connection.is_closed(): - self.connection.close() - - -def is_permission_error(e: Exception) -> bool: - msg = str(e) - # 002003 (02000): SQL compilation error: Database/SCHEMA 'XXXX' does not exist or not authorized. 
- # Insufficient privileges to operate on database 'XXXX' - return "Insufficient privileges" in msg or "not authorized" in msg diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index f39620b79cfd43..d8eda98da422b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -7,8 +7,6 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Union -from snowflake.connector import SnowflakeConnection - from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -36,6 +34,10 @@ SnowflakeAssertionsHandler, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakeConnectionConfig, +) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -54,8 +56,7 @@ ) from datahub.ingestion.source.snowflake.snowflake_utils import ( SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, + SnowsightUrlBuilder, ) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( @@ -68,7 +69,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig from datahub.ingestion.source_report.ingestion_stage import ( LINEAGE_EXTRACTION, METADATA_EXTRACTION, @@ -119,8 +119,6 @@ supported=True, ) class SnowflakeV2Source( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, SnowflakeCommonMixin, StatefulIngestionSourceBase, TestableSource, @@ -130,7 +128,8 @@ def __init__(self, ctx: 
PipelineContext, config: SnowflakeV2Config): self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = SnowflakeV2Report() self.logger = logger - self.connection: Optional[SnowflakeConnection] = None + + self.connection = self.config.get_connection() self.domain_registry: Optional[DomainRegistry] = None if self.config.domain: @@ -139,7 +138,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): ) # For database, schema, tables, views, etc - self.data_dictionary = SnowflakeDataDictionary() + self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None self.aggregator: Optional[SqlParsingAggregator] = None @@ -180,6 +179,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.lineage_extractor = SnowflakeLineageExtractor( config, self.report, + connection=self.connection, dataset_urn_builder=self.gen_dataset_urn, redundant_run_skip_handler=redundant_lineage_run_skip_handler, sql_aggregator=self.aggregator, @@ -200,6 +200,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.usage_extractor = SnowflakeUsageExtractor( config, self.report, + connection=self.connection, dataset_urn_builder=self.gen_dataset_urn, redundant_run_skip_handler=redundant_usage_run_skip_handler, ) @@ -232,7 +233,9 @@ def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() try: - connection_conf = BaseSnowflakeConfig.parse_obj_allow_extras(config_dict) + connection_conf = SnowflakeConnectionConfig.parse_obj_allow_extras( + config_dict + ) connection: SnowflakeConnection = connection_conf.get_connection() assert connection @@ -258,7 +261,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: @staticmethod def check_capabilities( - conn: SnowflakeConnection, connection_conf: BaseSnowflakeConfig + conn: SnowflakeConnection, connection_conf: SnowflakeConnectionConfig ) -> 
Dict[Union[SourceCapability, str], CapabilityReport]: # Currently only overall capabilities are reported. # Resource level variations in capabilities are not considered. @@ -269,19 +272,14 @@ class SnowflakePrivilege: object_name: str object_type: str - def query(query): - logger.info(f"Query : {query}") - resp = conn.cursor().execute(query) - return resp - _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict() privileges: List[SnowflakePrivilege] = [] capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore - cur = query("select current_role()") + cur = conn.query("select current_role()") current_role = [row[0] for row in cur][0] - cur = query("select current_secondary_roles()") + cur = conn.query("select current_secondary_roles()") secondary_roles_str = json.loads([row[0] for row in cur][0])["roles"] secondary_roles = ( [] if secondary_roles_str == "" else secondary_roles_str.split(",") @@ -298,7 +296,7 @@ def query(query): role = roles[i] i = i + 1 # for some roles, quoting is necessary. 
for example test-role - cur = query(f'show grants to role "{role}"') + cur = conn.query(f'show grants to role "{role}"') for row in cur: privilege = SnowflakePrivilege( privilege=row[1], object_type=row[2], object_name=row[3] @@ -363,7 +361,7 @@ def query(query): ): roles.append(privilege.object_name) - cur = query("select current_warehouse()") + cur = conn.query("select current_warehouse()") current_warehouse = [row[0] for row in cur][0] default_failure_messages = { @@ -425,15 +423,15 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.create_connection() + self.connection = self.config.get_connection() if self.connection is None: return - self.inspect_session_metadata() + self.inspect_session_metadata(self.connection) - snowsight_base_url = None + snowsight_url_builder = None if self.config.include_external_url: - snowsight_base_url = self.get_snowsight_base_url() + snowsight_url_builder = self.get_snowsight_url_builder() if self.report.default_warehouse is None: self.report_warehouse_failure() @@ -446,7 +444,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: domain_registry=self.domain_registry, profiler=self.profiler, aggregator=self.aggregator, - snowsight_base_url=snowsight_base_url, + snowsight_url_builder=snowsight_url_builder, + dataset_urn_builder=self.gen_dataset_urn, ) self.report.set_ingestion_stage("*", METADATA_EXTRACTION) @@ -499,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.include_assertion_results: yield from SnowflakeAssertionsHandler( - self.config, self.report, self.gen_dataset_urn + self.config, self.report, self.connection ).get_assertion_workunits(discovered_datasets) def report_warehouse_failure(self) -> None: @@ -536,22 +535,22 @@ def add_config_to_report(self) -> None: self.config.end_time, ) - def 
inspect_session_metadata(self) -> None: + def inspect_session_metadata(self, connection: SnowflakeConnection) -> None: try: logger.info("Checking current version") - for db_row in self.query(SnowflakeQuery.current_version()): + for db_row in connection.query(SnowflakeQuery.current_version()): self.report.saas_version = db_row["CURRENT_VERSION()"] except Exception as e: self.report_error("version", f"Error: {e}") try: logger.info("Checking current role") - for db_row in self.query(SnowflakeQuery.current_role()): + for db_row in connection.query(SnowflakeQuery.current_role()): self.report.role = db_row["CURRENT_ROLE()"] except Exception as e: self.report_error("version", f"Error: {e}") try: logger.info("Checking current warehouse") - for db_row in self.query(SnowflakeQuery.current_warehouse()): + for db_row in connection.query(SnowflakeQuery.current_warehouse()): self.report.default_warehouse = db_row["CURRENT_WAREHOUSE()"] except Exception as e: self.report_error("current_warehouse", f"Error: {e}") @@ -565,13 +564,13 @@ def inspect_session_metadata(self) -> None: except Exception: self.report.edition = None - def get_snowsight_base_url(self) -> Optional[str]: + def get_snowsight_url_builder(self) -> Optional[SnowsightUrlBuilder]: try: # See https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#finding-the-region-and-locator-for-an-account - for db_row in self.query(SnowflakeQuery.current_account()): + for db_row in self.connection.query(SnowflakeQuery.current_account()): account_locator = db_row["CURRENT_ACCOUNT()"] - for db_row in self.query(SnowflakeQuery.current_region()): + for db_row in self.connection.query(SnowflakeQuery.current_region()): region = db_row["CURRENT_REGION()"] self.report.account_locator = account_locator @@ -581,30 +580,25 @@ def get_snowsight_base_url(self) -> Optional[str]: region = region.split(".")[-1].lower() account_locator = account_locator.lower() - cloud, cloud_region_id = 
self.get_cloud_region_from_snowflake_region_id( - region - ) - - # For privatelink, account identifier ends with .privatelink - # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls - return self.create_snowsight_base_url( + return SnowsightUrlBuilder( account_locator, - cloud_region_id, - cloud, - self.config.account_id.endswith(".privatelink"), # type:ignore + region, + # For privatelink, account identifier ends with .privatelink + # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls + privatelink=self.config.account_id.endswith(".privatelink"), ) except Exception as e: - self.warn( - self.logger, - "snowsight url", - f"unable to get snowsight base url due to an error -> {e}", + self.report.warning( + title="External URL Generation Failed", + message="We were unable to infer the Snowsight base URL for your Snowflake account. External URLs will not be generated.", + exc=e, ) return None def is_standard_edition(self) -> bool: try: - self.query(SnowflakeQuery.show_tags()) + self.connection.query(SnowflakeQuery.show_tags()) return False except Exception as e: if "Unsupported feature 'TAG'" in str(e): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index dd7592f6a8a5e3..93c7025aeee4ea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -8,8 +8,9 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( - DatasetSourceConfigMixin, + EnvConfigMixin, LowerCaseDatasetUrnConfigMixin, + PlatformInstanceConfigMixin, ) from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.ingestion.api.incremental_lineage_helper import ( @@ -30,17 +31,7 @@ logger: logging.Logger = 
logging.getLogger(__name__) -class SQLCommonConfig( - StatefulIngestionConfigBase, - DatasetSourceConfigMixin, - LowerCaseDatasetUrnConfigMixin, - IncrementalLineageConfigMixin, - ClassificationSourceConfigMixin, -): - options: dict = pydantic.Field( - default_factory=dict, - description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", - ) +class SQLFilterConfig(ConfigModel): # Although the 'table_pattern' enables you to skip everything from certain schemas, # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter @@ -57,6 +48,32 @@ class SQLCommonConfig( default=AllowDenyPattern.allow_all(), description="Regex patterns for views to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. 
to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", ) + + @pydantic.root_validator(pre=True) + def view_pattern_is_table_pattern_unless_specified( + cls, values: Dict[str, Any] + ) -> Dict[str, Any]: + view_pattern = values.get("view_pattern") + table_pattern = values.get("table_pattern") + if table_pattern and not view_pattern: + logger.info(f"Applying table_pattern {table_pattern} to view_pattern.") + values["view_pattern"] = table_pattern + return values + + +class SQLCommonConfig( + StatefulIngestionConfigBase, + PlatformInstanceConfigMixin, + EnvConfigMixin, + LowerCaseDatasetUrnConfigMixin, + IncrementalLineageConfigMixin, + ClassificationSourceConfigMixin, + SQLFilterConfig, +): + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) profile_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), description="Regex patterns to filter tables (or specific columns) for profiling during ingestion. 
Note that only tables allowed by the `table_pattern` will be considered.", @@ -103,17 +120,6 @@ def is_profiling_enabled(self) -> bool: self.profiling.operation_config ) - @pydantic.root_validator(pre=True) - def view_pattern_is_table_pattern_unless_specified( - cls, values: Dict[str, Any] - ) -> Dict[str, Any]: - view_pattern = values.get("view_pattern") - table_pattern = values.get("table_pattern") - if table_pattern and not view_pattern: - logger.info(f"Applying table_pattern {table_pattern} to view_pattern.") - values["view_pattern"] = table_pattern - return values - @pydantic.root_validator(skip_on_failure=True) def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 27daae11e2295f..677b96269fe586 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1,6 +1,7 @@ import contextlib import dataclasses import enum +import functools import itertools import json import logging @@ -14,6 +15,7 @@ import datahub.emitter.mce_builder as builder import datahub.metadata.schema_classes as models +from datahub.configuration.time_window_config import get_time_bucket from datahub.emitter.mce_builder import get_sys_time, make_ts_millis from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.sql_parsing_builder import compute_upstream_fields @@ -30,7 +32,7 @@ SchemaFieldUrn, ) from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverInterface -from datahub.sql_parsing.sql_parsing_common import QueryType +from 
datahub.sql_parsing.sql_parsing_common import QueryType, QueryTypeProps from datahub.sql_parsing.sqlglot_lineage import ( ColumnLineageInfo, ColumnRef, @@ -103,6 +105,7 @@ class QueryMetadata: upstreams: List[UrnStr] # this is direct upstreams, which may be temp tables column_lineage: List[ColumnLineageInfo] + column_usage: Dict[UrnStr, Set[UrnStr]] confidence_score: float used_temp_tables: bool = True @@ -128,12 +131,44 @@ class KnownQueryLineageInfo: downstream: UrnStr upstreams: List[UrnStr] column_lineage: Optional[List[ColumnLineageInfo]] = None + column_usage: Optional[Dict[UrnStr, Set[UrnStr]]] = None timestamp: Optional[datetime] = None session_id: Optional[str] = None query_type: QueryType = QueryType.UNKNOWN +@dataclasses.dataclass +class KnownLineageMapping: + upstream_urn: UrnStr + downstream_urn: UrnStr + lineage_type: str = models.DatasetLineageTypeClass.COPY + + +@dataclasses.dataclass +class PreparsedQuery: + # If not provided, we will generate one using the fast fingerprint generator. + query_id: Optional[QueryId] + + query_text: str + + upstreams: List[UrnStr] + downstream: Optional[UrnStr] = None + column_lineage: Optional[List[ColumnLineageInfo]] = None + column_usage: Optional[Dict[UrnStr, Set[UrnStr]]] = None + inferred_schema: Optional[List[models.SchemaFieldClass]] = None + confidence_score: float = 1.0 + + query_count: int = 1 + user: Optional[CorpUserUrn] = None + timestamp: Optional[datetime] = None + session_id: str = _MISSING_SESSION_ID + query_type: QueryType = QueryType.UNKNOWN + query_type_props: QueryTypeProps = dataclasses.field( + default_factory=lambda: QueryTypeProps() + ) + + @dataclasses.dataclass class SqlAggregatorReport(Report): _aggregator: "SqlParsingAggregator" @@ -165,6 +200,7 @@ class SqlAggregatorReport(Report): # Other lineage loading metrics. 
num_known_query_lineage: int = 0 + num_preparsed_queries: int = 0 num_known_mapping_lineage: int = 0 num_table_renames: int = 0 @@ -183,10 +219,19 @@ class SqlAggregatorReport(Report): schema_resolver_count: Optional[int] = None num_unique_query_fingerprints: Optional[int] = None num_urns_with_lineage: Optional[int] = None + num_lineage_skipped_due_to_filters: int = 0 + + # Queries. num_queries_entities_generated: int = 0 + num_queries_skipped_due_to_filters: int = 0 # Usage-related. usage_skipped_missing_timestamp: int = 0 + num_query_usage_stats_generated: int = 0 + + # Operation-related. + num_operations_generated: int = 0 + num_operations_skipped_due_to_filters: int = 0 def compute_stats(self) -> None: self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() @@ -209,10 +254,13 @@ def __init__( graph: Optional[DataHubGraph] = None, generate_lineage: bool = True, generate_queries: bool = True, + generate_query_subject_fields: bool = True, generate_usage_statistics: bool = False, + generate_query_usage_statistics: bool = False, generate_operations: bool = False, usage_config: Optional[BaseUsageConfig] = None, - is_temp_table: Optional[Callable[[UrnStr], bool]] = None, + is_temp_table: Optional[Callable[[str], bool]] = None, + is_allowed_table: Optional[Callable[[str], bool]] = None, format_queries: bool = True, query_log: QueryLogSetting = _DEFAULT_QUERY_LOG_SETTING, ) -> None: @@ -222,19 +270,24 @@ def __init__( self.generate_lineage = generate_lineage self.generate_queries = generate_queries + self.generate_query_subject_fields = generate_query_subject_fields self.generate_usage_statistics = generate_usage_statistics + self.generate_query_usage_statistics = generate_query_usage_statistics self.generate_operations = generate_operations if self.generate_queries and not self.generate_lineage: raise ValueError("Queries will only be generated if lineage is enabled") self.usage_config = usage_config - if self.generate_usage_statistics and 
self.usage_config is None: + if ( + self.generate_usage_statistics or self.generate_query_usage_statistics + ) and self.usage_config is None: raise ValueError("Usage statistics generation requires a usage config") self.report = SqlAggregatorReport(_aggregator=self) # can be used by BQ where we have a "temp_table_dataset_prefix" - self.is_temp_table = is_temp_table + self._is_temp_table = is_temp_table + self._is_allowed_table = is_allowed_table self.format_queries = format_queries self.query_log = query_log @@ -325,6 +378,15 @@ def __init__( assert self.usage_config is not None self._usage_aggregator = UsageAggregator(config=self.usage_config) + # Query usage aggregator. + # Map of query ID -> { bucket -> count } + self._query_usage_counts: Optional[FileBackedDict[Dict[datetime, int]]] = None + if self.generate_query_usage_statistics: + self._query_usage_counts = FileBackedDict[Dict[datetime, int]]( + shared_connection=self._shared_connection, + tablename="query_usage_counts", + ) + def close(self) -> None: self._exit_stack.close() @@ -387,6 +449,40 @@ def _maybe_format_query(self, query: str) -> str: return try_format_query(query, self.platform.platform_name) return query + @functools.lru_cache(maxsize=128) + def _name_from_urn(self, urn: UrnStr) -> str: + name = DatasetUrn.from_string(urn).name + if ( + platform_instance := self._schema_resolver.platform_instance + ) and name.startswith(platform_instance): + # Remove the platform instance from the name. 
+ name = name[len(platform_instance) + 1 :] + return name + + def is_temp_table(self, urn: UrnStr) -> bool: + if self._is_temp_table is None: + return False + return self._is_temp_table(self._name_from_urn(urn)) + + def is_allowed_table(self, urn: UrnStr) -> bool: + if self.is_temp_table(urn): + return False + if self._is_allowed_table is None: + return True + return self._is_allowed_table(self._name_from_urn(urn)) + + def add( + self, item: Union[KnownQueryLineageInfo, KnownLineageMapping, PreparsedQuery] + ) -> None: + if isinstance(item, KnownQueryLineageInfo): + self.add_known_query_lineage(item) + elif isinstance(item, KnownLineageMapping): + self.add_known_lineage_mapping(item.upstream_urn, item.downstream_urn) + elif isinstance(item, PreparsedQuery): + self.add_preparsed_query(item) + else: + raise ValueError(f"Cannot add unknown item type: {type(item)}") + def add_known_query_lineage( self, known_query_lineage: KnownQueryLineageInfo, merge_lineage: bool = False ) -> None: @@ -428,6 +524,7 @@ def add_known_query_lineage( actor=None, upstreams=known_query_lineage.upstreams, column_lineage=known_query_lineage.column_lineage or [], + column_usage=known_query_lineage.column_usage or {}, confidence_score=1.0, ), merge_lineage=merge_lineage, @@ -480,6 +577,7 @@ def add_known_lineage_mapping( actor=None, upstreams=[upstream_urn], column_lineage=[], + column_usage={}, confidence_score=1.0, ) ) @@ -567,20 +665,69 @@ def add_observed_query( elif parsed.debug_info.column_error: self.report.num_observed_queries_column_failed += 1 + query_fingerprint = parsed.query_fingerprint + + self.add_preparsed_query( + PreparsedQuery( + query_id=query_fingerprint, + query_text=query, + query_count=usage_multiplier, + timestamp=query_timestamp, + user=user, + session_id=session_id, + query_type=parsed.query_type, + query_type_props=parsed.query_type_props, + upstreams=parsed.in_tables, + downstream=parsed.out_tables[0] if parsed.out_tables else None, + 
column_lineage=parsed.column_lineage, + # TODO: We need a full list of columns referenced, not just the out tables. + column_usage=compute_upstream_fields(parsed), + inferred_schema=infer_output_schema(parsed), + confidence_score=parsed.debug_info.confidence, + ), + is_known_temp_table=is_known_temp_table, + require_out_table_schema=require_out_table_schema, + session_has_temp_tables=session_has_temp_tables, + _is_internal=True, + ) + + def add_preparsed_query( + self, + parsed: PreparsedQuery, + is_known_temp_table: bool = False, + require_out_table_schema: bool = False, + session_has_temp_tables: bool = True, + _is_internal: bool = False, + ) -> None: + if not _is_internal: + self.report.num_preparsed_queries += 1 + + if parsed.timestamp: + # Sanity check - some of our usage subroutines require the timestamp to be in UTC. + # Ideally we'd actually reject missing tzinfo too, but we can tighten that later. + assert parsed.timestamp.tzinfo in {None, timezone.utc} + + query_fingerprint = parsed.query_id + if not query_fingerprint: + query_fingerprint = get_query_fingerprint( + parsed.query_text, + platform=self.platform.platform_name, + fast=True, + ) + # Format the query. - formatted_query = self._maybe_format_query(query) + formatted_query = self._maybe_format_query(parsed.query_text) # Register the query's usage. if not self._usage_aggregator: pass # usage is not enabled - elif query_timestamp is None: + elif parsed.timestamp is None: self.report.usage_skipped_missing_timestamp += 1 else: - # TODO: We need a full list of columns referenced, not just the out tables. - upstream_fields = compute_upstream_fields(parsed) - for upstream_urn in parsed.in_tables: - # If the upstream table is a temp table, don't log usage for it. 
- if (self.is_temp_table and self.is_temp_table(upstream_urn)) or ( + upstream_fields = parsed.column_usage or {} + for upstream_urn in parsed.upstreams: + # If the upstream table is a temp table or otherwise denied by filters, don't log usage for it. + if not self.is_allowed_table(upstream_urn) or ( require_out_table_schema and not self._schema_resolver.has_urn(upstream_urn) ): @@ -588,42 +735,49 @@ def add_observed_query( self._usage_aggregator.aggregate_event( resource=upstream_urn, - start_time=query_timestamp, + start_time=parsed.timestamp, query=formatted_query, - user=user.urn() if user else None, + user=parsed.user.urn() if parsed.user else None, fields=sorted(upstream_fields.get(upstream_urn, [])), - count=usage_multiplier, + count=parsed.query_count, ) - if not parsed.out_tables: - return - out_table = parsed.out_tables[0] - query_fingerprint = parsed.query_fingerprint - assert query_fingerprint is not None - - # Handle table renames. - is_renamed_table = False - if out_table in self._table_renames: - out_table = self._table_renames[out_table] - is_renamed_table = True + if self._query_usage_counts is not None and parsed.timestamp is not None: + assert self.usage_config is not None + bucket = get_time_bucket( + parsed.timestamp, self.usage_config.bucket_duration + ) + counts = self._query_usage_counts.for_mutation(query_fingerprint, {}) + counts[bucket] = counts.get(bucket, 0) + parsed.query_count # Register the query. 
self._add_to_query_map( QueryMetadata( query_id=query_fingerprint, formatted_query_string=formatted_query, - session_id=session_id, + session_id=parsed.session_id, query_type=parsed.query_type, lineage_type=models.DatasetLineageTypeClass.TRANSFORMED, - latest_timestamp=query_timestamp, - actor=user, - upstreams=parsed.in_tables, + latest_timestamp=parsed.timestamp, + actor=parsed.user, + upstreams=parsed.upstreams, column_lineage=parsed.column_lineage or [], - confidence_score=parsed.debug_info.confidence, + column_usage=parsed.column_usage or {}, + confidence_score=parsed.confidence_score, used_temp_tables=session_has_temp_tables, ) ) + if not parsed.downstream: + return + out_table = parsed.downstream + + # Handle table renames. + is_renamed_table = False + if out_table in self._table_renames: + out_table = self._table_renames[out_table] + is_renamed_table = True + # Register the query's lineage. if ( is_known_temp_table @@ -634,7 +788,7 @@ def add_observed_query( or ( not is_renamed_table and ( - (self.is_temp_table and self.is_temp_table(out_table)) + self.is_temp_table(out_table) or ( require_out_table_schema and not self._schema_resolver.has_urn(out_table) @@ -643,12 +797,11 @@ def add_observed_query( ) ): # Infer the schema of the output table and track it for later. - inferred_schema = infer_output_schema(parsed) - if inferred_schema is not None: - self._inferred_temp_schemas[query_fingerprint] = inferred_schema + if parsed.inferred_schema is not None: + self._inferred_temp_schemas[query_fingerprint] = parsed.inferred_schema # Also track the lineage for the temp table, for merging purposes later. 
- self._temp_lineage_map.for_mutation(session_id, {})[ + self._temp_lineage_map.for_mutation(parsed.session_id, {})[ out_table ] = query_fingerprint @@ -743,6 +896,7 @@ def _process_view_definition( actor=None, upstreams=parsed.in_tables, column_lineage=parsed.column_lineage or [], + column_usage=compute_upstream_fields(parsed), confidence_score=parsed.debug_info.confidence, ) ) @@ -824,6 +978,7 @@ def _add_to_query_map( # here just in case more schemas got registered in the interim. current.upstreams = new.upstreams current.column_lineage = new.column_lineage + current.column_usage = new.column_usage current.confidence_score = new.confidence_score else: # In the case of known query lineage, we might get things one at a time. @@ -838,13 +993,16 @@ def _add_to_query_map( self._query_map[query_fingerprint] = new def gen_metadata(self) -> Iterable[MetadataChangeProposalWrapper]: - # diff from v1 - we generate operations here, and it also - # generates MCPWs instead of workunits - yield from self._gen_lineage_mcps() + queries_generated: Set[QueryId] = set() + + yield from self._gen_lineage_mcps(queries_generated) + yield from self._gen_remaining_queries(queries_generated) yield from self._gen_usage_statistics_mcps() yield from self._gen_operation_mcps() - def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: + def _gen_lineage_mcps( + self, queries_generated: Set[QueryId] + ) -> Iterable[MetadataChangeProposalWrapper]: if not self.generate_lineage: return @@ -856,7 +1014,6 @@ def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: self._view_definitions.clear() # Generate lineage and queries. 
- queries_generated: Set[QueryId] = set() for downstream_urn in sorted(self._lineage_map): yield from self._gen_lineage_for_downstream( downstream_urn, queries_generated=queries_generated @@ -879,6 +1036,10 @@ def _query_type_precedence(cls, query_type: str) -> int: def _gen_lineage_for_downstream( self, downstream_urn: str, queries_generated: Set[QueryId] ) -> Iterable[MetadataChangeProposalWrapper]: + if not self.is_allowed_table(downstream_urn): + self.report.num_lineage_skipped_due_to_filters += 1 + return + query_ids = self._lineage_map[downstream_urn] queries: List[QueryMetadata] = [ self._resolve_query_with_temp_tables(self._query_map[query_id]) @@ -983,41 +1144,13 @@ def _gen_lineage_for_downstream( ) for query_id in required_queries: - if not self.can_generate_query(query_id): - continue - # Avoid generating the same query twice. if query_id in queries_generated: continue queries_generated.add(query_id) - self.report.num_queries_entities_generated += 1 query = queries_map[query_id] - yield from MetadataChangeProposalWrapper.construct_many( - entityUrn=self._query_urn(query_id), - aspects=[ - models.QueryPropertiesClass( - statement=models.QueryStatementClass( - value=query.formatted_query_string, - language=models.QueryLanguageClass.SQL, - ), - source=models.QuerySourceClass.SYSTEM, - created=query.make_created_audit_stamp(), - lastModified=query.make_last_modified_audit_stamp(), - ), - models.QuerySubjectsClass( - subjects=[ - models.QuerySubjectClass(entity=dataset_urn) - for dataset_urn in itertools.chain( - [downstream_urn], query.upstreams - ) - ] - ), - models.DataPlatformInstanceClass( - platform=self.platform.urn(), - ), - ], - ) + yield from self._gen_query(query, downstream_urn) @classmethod def _query_urn(cls, query_id: QueryId) -> str: @@ -1043,9 +1176,121 @@ def _is_known_lineage_query_id(cls, query_id: QueryId) -> bool: # never conflict with a real query fingerprint. 
return query_id.startswith("known_") + def _gen_remaining_queries( + self, queries_generated: Set[QueryId] + ) -> Iterable[MetadataChangeProposalWrapper]: + if not self.generate_queries or not self.generate_query_usage_statistics: + return + + assert self._query_usage_counts is not None + for query_id in self._query_usage_counts: + if query_id in queries_generated: + continue + queries_generated.add(query_id) + + yield from self._gen_query(self._query_map[query_id]) + def can_generate_query(self, query_id: QueryId) -> bool: return self.generate_queries and not self._is_known_lineage_query_id(query_id) + def _gen_query( + self, query: QueryMetadata, downstream_urn: Optional[str] = None + ) -> Iterable[MetadataChangeProposalWrapper]: + query_id = query.query_id + if not self.can_generate_query(query_id): + return + + # If a query doesn't involve any allowed tables, skip it. + if downstream_urn is None and not any( + self.is_allowed_table(urn) for urn in query.upstreams + ): + self.report.num_queries_skipped_due_to_filters += 1 + return + + query_subject_urns = OrderedSet[UrnStr]() + for upstream in query.upstreams: + query_subject_urns.add(upstream) + if self.generate_query_subject_fields: + for column in query.column_usage.get(upstream, []): + query_subject_urns.add( + builder.make_schema_field_urn(upstream, column) + ) + if downstream_urn: + query_subject_urns.add(downstream_urn) + if self.generate_query_subject_fields: + for column_lineage in query.column_lineage: + query_subject_urns.add( + builder.make_schema_field_urn( + downstream_urn, column_lineage.downstream.column + ) + ) + + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=self._query_urn(query_id), + aspects=[ + models.QueryPropertiesClass( + statement=models.QueryStatementClass( + value=query.formatted_query_string, + language=models.QueryLanguageClass.SQL, + ), + source=models.QuerySourceClass.SYSTEM, + created=query.make_created_audit_stamp(), + 
lastModified=query.make_last_modified_audit_stamp(), + ), + models.QuerySubjectsClass( + subjects=[ + models.QuerySubjectClass(entity=urn) + for urn in query_subject_urns + ] + ), + models.DataPlatformInstanceClass( + platform=self.platform.urn(), + ), + ], + ) + self.report.num_queries_entities_generated += 1 + + if self._query_usage_counts is not None: + assert self.usage_config is not None + + # This is slightly lossy, since we only store one unique + # user per query instead of tracking all of them. + # We also lose information because we don't keep track + # of users / lastExecutedAt timestamps per bucket. + user = query.actor + + query_counter = self._query_usage_counts.get(query_id) + if not query_counter: + return + for bucket in self.usage_config.buckets(): + count = query_counter.get(bucket) + if not count: + continue + + yield MetadataChangeProposalWrapper( + entityUrn=self._query_urn(query_id), + aspect=models.QueryUsageStatisticsClass( + timestampMillis=make_ts_millis(bucket), + eventGranularity=models.TimeWindowSizeClass( + unit=self.usage_config.bucket_duration, multiple=1 + ), + queryCount=count, + uniqueUserCount=1, + userCounts=( + [ + models.DatasetUserUsageCountsClass( + user=user.urn(), + count=count, + ) + ] + if user + else None + ), + ), + ) + + self.report.num_query_usage_stats_generated += 1 + def _resolve_query_with_temp_tables( self, base_query: QueryMetadata, @@ -1210,6 +1455,11 @@ def _gen_operation_for_downstream( # We don't generate operations for SELECTs. 
return + if not self.is_allowed_table(downstream_urn): + self.report.num_operations_skipped_due_to_filters += 1 + return + + self.report.num_operations_generated += 1 aspect = models.OperationClass( timestampMillis=make_ts_millis(datetime.now(tz=timezone.utc)), operationType=operation_type, diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 1d3e2c8b95af3a..5ef2eb420b8ed5 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -528,7 +528,7 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=False, + include_view_lineage=True, include_column_lineage=False, ), ): @@ -604,8 +604,10 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), snowflake_query.SnowflakeQuery.copy_lineage_history( - 1654473600000, - 1654586220000, + start_time_millis=1654473600000, end_time_millis=1654621200000 + ), + snowflake_query.SnowflakeQuery.copy_lineage_history( + start_time_millis=1654473600000, end_time_millis=1654586220000 ), ]: return [] diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index 82b29c051114a7..5cba4e8b338223 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -4513,9 +4513,6 @@ "aspect": { "json": { "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" - }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" }, @@ -4524,13 +4521,49 @@ }, { "entity": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,other_db.other_schema.table_1,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5102,13 +5135,43 @@ }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5136,18 +5199,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_1)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5659,18 +5752,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_2)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5862,18 +5985,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_3)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6265,18 +6418,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_4)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6674,18 +6857,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_5)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6827,18 +7040,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_6)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6850,18 +7093,78 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6962,18 +7265,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -7099,18 +7432,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -7122,18 +7485,78 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_5)" + }, + { + 
"entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 78d3b920767f72..71a74f883bf1bf 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -3892,11 +3892,71 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_6)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_10)" } ] } @@ -4111,11 +4171,71 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_10)" } ] } diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 32c2a63c3ac593..0b838b0bb59c3a 100644 --- 
a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -1,11 +1,12 @@ from datetime import datetime, timezone from unittest import mock +import pytest from freezegun import freeze_time from pytest import fixture from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig from datahub.ingestion.source.snowflake import snowflake_query from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config @@ -72,11 +73,10 @@ def test_snowflake_missing_role_access_causes_pipeline_failure( "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. 
PUBLIC" ) - pipeline = Pipeline(snowflake_pipeline_config) - pipeline.run() - assert "permission-error" in [ - failure.message for failure in pipeline.source.get_report().failures - ] + with pytest.raises(PipelineInitError, match="Permissions error"): + pipeline = Pipeline(snowflake_pipeline_config) + pipeline.run() + pipeline.raise_from_status() @freeze_time(FROZEN_TIME) @@ -227,7 +227,7 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=False, + include_view_lineage=True, include_column_lineage=True, ) ], diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json index 3893b649bd5c8a..94c8947dba9ff1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json @@ -113,11 +113,20 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json index 
2eb3753473d7d9..839a224a41b63c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json index dd110a5d928df0..d3ec3843168188 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json @@ -93,11 +93,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } @@ -145,11 +157,29 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),c)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json index 5f3e81b7b9eb9b..a9b5a3a7cbefac 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json @@ -106,14 +106,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD)" }, { - "entity": 
"urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),a)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json index 49458e06b0bb91..fcbe0ec5aeb839 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json @@ -118,11 +118,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD),c)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),c)" } ] } @@ -170,11 +182,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json index 9567aef095f9ac..48aecb90151804 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json @@ -168,9 +168,6 @@ "aspect": { "json": { "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" - }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" }, @@ -179,6 +176,21 @@ }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_survey,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_email)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" + }, + { + 
"entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_reason)" } ] } @@ -226,14 +238,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_email)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index fc5e5ef879fe14..9a4d405e50a7a4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD)" + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" } ] } @@ -184,11 +196,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json index 377e3e02c970ea..743e2738fc70c6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } @@ -185,11 +197,17 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json index bf2296c99356e8..973813dae6073c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } diff --git 
a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 0400bd6a72aa5f..be6efd3e121ff1 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone +from typing import Iterable from unittest import mock import pytest @@ -24,7 +25,7 @@ @pytest.fixture -def stateful_source(mock_datahub_graph: DataHubGraph) -> SnowflakeV2Source: +def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Source]: pipeline_name = "test_redundant_run_lineage" run_id = "test_redundant_run" ctx = PipelineContext( @@ -43,8 +44,9 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> SnowflakeV2Source: ), ), ) - source = SnowflakeV2Source(ctx=ctx, config=config) - return source + + with mock.patch("snowflake.connector.connect"): + yield SnowflakeV2Source(ctx=ctx, config=config) def test_redundant_run_job_ids(stateful_source: SnowflakeV2Source) -> None: diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 69a7510692df1d..3353e74449c957 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -14,7 +14,7 @@ SnowflakeCloudProvider, ) from datahub.ingestion.source.snowflake.snowflake_config import ( - DEFAULT_TABLES_DENY_LIST, + DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) from datahub.ingestion.source.snowflake.snowflake_query import ( @@ -24,7 +24,7 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeObjectAccessEntry, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils 
import SnowsightUrlBuilder from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source from tests.test_helpers import test_connection_helpers @@ -445,7 +445,9 @@ def test_aws_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("aws_ca_central_1") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "aws_ca_central_1" + ) assert cloud == SnowflakeCloudProvider.AWS assert cloud_region_id == "ca-central-1" @@ -453,7 +455,9 @@ def test_aws_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("aws_us_east_1_gov") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "aws_us_east_1_gov" + ) assert cloud == SnowflakeCloudProvider.AWS assert cloud_region_id == "us-east-1" @@ -463,7 +467,9 @@ def test_google_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("gcp_europe_west2") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "gcp_europe_west2" + ) assert cloud == SnowflakeCloudProvider.GCP assert cloud_region_id == "europe-west2" @@ -473,7 +479,7 @@ def test_azure_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "azure_switzerlandnorth" ) @@ -483,7 +489,7 @@ def test_azure_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "azure_centralindia" ) @@ -493,7 +499,7 @@ def test_azure_cloud_region_from_snowflake_region_id(): def test_unknown_cloud_region_from_snowflake_region_id(): with pytest.raises(Exception, match="Unknown snowflake region"): - 
SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "somecloud_someregion" ) @@ -529,8 +535,10 @@ def test_snowflake_query_create_deny_regex_sql(): ) assert ( - create_deny_regex_sql_filter(DEFAULT_TABLES_DENY_LIST, ["upstream_table_name"]) - == r"NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i') AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') AND NOT RLIKE(upstream_table_name,'.*\.SEGMENT_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.STAGING_.*_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i')" + create_deny_regex_sql_filter( + DEFAULT_TEMP_TABLES_PATTERNS, ["upstream_table_name"] + ) + == r"NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i') AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') AND NOT RLIKE(upstream_table_name,'.*\.SEGMENT_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.STAGING_.*_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}','i')" ) @@ -588,26 +596,15 @@ def test_email_filter_query_generation_with_case_insensitive_filter(): def test_create_snowsight_base_url_us_west(): - ( - cloud, - cloud_region_id, - ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id("aws_us_west_2") - - result = SnowflakeCommonMixin.create_snowsight_base_url( - "account_locator", cloud_region_id, cloud, False - ) + result = SnowsightUrlBuilder( + "account_locator", "aws_us_west_2", privatelink=False + ).snowsight_base_url assert result == "https://app.snowflake.com/us-west-2/account_locator/" def test_create_snowsight_base_url_ap_northeast_1(): - ( - cloud, - cloud_region_id, - ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id( - "aws_ap_northeast_1" - ) + 
+ result = SnowsightUrlBuilder( + "account_locator", "aws_ap_northeast_1", privatelink=False + ).snowsight_base_url - result = SnowflakeCommonMixin.create_snowsight_base_url( - "account_locator", cloud_region_id, cloud, False - ) assert result == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/" diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl new file mode 100644 index 00000000000000..275077e5e6a475 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl @@ -0,0 +1,43 @@ +namespace com.linkedin.query + +import com.linkedin.timeseries.TimeseriesAspectBase +import com.linkedin.dataset.DatasetUserUsageCounts + +/** + * Stats corresponding to query's usage. + */ +@Aspect = { + "name": "queryUsageStatistics", + "type": "timeseries", +} +record QueryUsageStatistics includes TimeseriesAspectBase { + /** + * Total query count in this bucket + */ + @TimeseriesField = {} + queryCount: optional int + + /** + * Query cost for this query and bucket + */ + @TimeseriesField = {} + queryCost: optional double + + /** + * Last executed timestamp + */ + @TimeseriesField = {} + lastExecutedAt: optional long + + /** + * Unique user count + */ + @TimeseriesField = {} + uniqueUserCount: optional int + + /** + * Users within this bucket, with frequency counts + */ + @TimeseriesFieldCollection = {"key":"user"} + userCounts: optional array[DatasetUserUsageCounts] +} diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 2e713ee4104bf3..c8344b7de1e127 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -504,6 +504,7 @@ entities: aspects: - queryProperties - querySubjects + - queryUsageStatistics - status - dataPlatformInstance - subTypes