From e55817713cad5ddcacc43c3ceddcad3dfc37e671 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 1 Jul 2024 17:36:03 -0700 Subject: [PATCH 01/32] add preparsed_query abstraction --- .../source/snowflake/snowflake_lineage_v2.py | 34 +++-- .../sql_parsing/sql_parsing_aggregator.py | 119 ++++++++++++++---- 2 files changed, 120 insertions(+), 33 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index b12ef4d19c45c8..e29e85e515fefe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -30,6 +30,7 @@ from datahub.sql_parsing.sql_parsing_aggregator import ( ColumnLineageInfo, ColumnRef, + KnownLineageMapping, KnownQueryLineageInfo, SqlParsingAggregator, UrnStr, @@ -264,13 +265,20 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 - self._populate_external_lineage_from_copy_history(discovered_tables) + for ( + known_lineage_mapping + ) in self._populate_external_lineage_from_copy_history(discovered_tables): + self.sql_aggregator.add(known_lineage_mapping) logger.info( "Done populating external lineage from copy history. " f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." ) - self._populate_external_lineage_from_show_query(discovered_tables) + for ( + known_lineage_mapping + ) in self._populate_external_lineage_from_show_query(discovered_tables): + self.sql_aggregator.add(known_lineage_mapping) + logger.info( "Done populating external lineage from show external tables. " f"Found {self.report.num_external_table_edges_scanned} external lineage edges so far." @@ -282,7 +290,7 @@ def _populate_external_upstreams(self, discovered_tables: List[str]) -> None: # NOTE: Snowflake does not log this information to the access_history table. def _populate_external_lineage_from_show_query( self, discovered_tables: List[str] - ) -> None: + ) -> Iterable[KnownLineageMapping]: external_tables_query: str = SnowflakeQuery.show_external_tables() try: for db_row in self.query(external_tables_query): @@ -293,11 +301,11 @@ def _populate_external_lineage_from_show_query( if key not in discovered_tables: continue if db_row["location"].startswith("s3://"): - self.sql_aggregator.add_known_lineage_mapping( - downstream_urn=self.dataset_urn_builder(key), + yield KnownLineageMapping( upstream_urn=make_s3_urn_for_lineage( db_row["location"], self.config.env ), + downstream_urn=self.dataset_urn_builder(key), ) self.report.num_external_table_edges_scanned += 1 @@ -316,7 +324,7 @@ def _populate_external_lineage_from_show_query( # NOTE: Snowflake does not log this information to the access_history table. 
def _populate_external_lineage_from_copy_history( self, discovered_tables: List[str] - ) -> None: + ) -> Iterable[KnownLineageMapping]: query: str = SnowflakeQuery.copy_lineage_history( start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), @@ -325,7 +333,11 @@ def _populate_external_lineage_from_copy_history( try: for db_row in self.query(query): - self._process_external_lineage_result_row(db_row, discovered_tables) + known_lineage_mapping = self._process_external_lineage_result_row( + db_row, discovered_tables + ) + if known_lineage_mapping: + yield known_lineage_mapping except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. " @@ -340,7 +352,7 @@ def _populate_external_lineage_from_copy_history( def _process_external_lineage_result_row( self, db_row: dict, discovered_tables: List[str] - ) -> None: + ) -> Optional[KnownLineageMapping]: # key is the down-stream table name key: str = self.get_dataset_identifier_from_qualified_name( db_row["DOWNSTREAM_TABLE_NAME"] @@ -353,11 +365,11 @@ def _process_external_lineage_result_row( for loc in external_locations: if loc.startswith("s3://"): - self.sql_aggregator.add_known_lineage_mapping( - downstream_urn=self.dataset_urn_builder(key), + self.report.num_external_table_edges_scanned += 1 + return KnownLineageMapping( upstream_urn=make_s3_urn_for_lineage(loc, self.config.env), + downstream_urn=self.dataset_urn_builder(key), ) - self.report.num_external_table_edges_scanned += 1 def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: query: str = SnowflakeQuery.table_to_table_lineage_history_v2( diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 27daae11e2295f..2d79ecde47dd92 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -30,7 +30,7 @@ SchemaFieldUrn, ) from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverInterface -from datahub.sql_parsing.sql_parsing_common import QueryType +from datahub.sql_parsing.sql_parsing_common import QueryType, QueryTypeProps from datahub.sql_parsing.sqlglot_lineage import ( ColumnLineageInfo, ColumnRef, @@ -134,6 +134,37 @@ class KnownQueryLineageInfo: query_type: QueryType = QueryType.UNKNOWN +@dataclasses.dataclass +class KnownLineageMapping: + upstream_urn: UrnStr + downstream_urn: UrnStr + lineage_type: str = models.DatasetLineageTypeClass.COPY + + +@dataclasses.dataclass +class PreparsedQuery: + # If not provided, we will generate one using the fast fingerprint generator. 
+ query_id: Optional[QueryId] + + query_text: str + + upstreams: List[UrnStr] + downstream: Optional[UrnStr] = None + column_lineage: Optional[List[ColumnLineageInfo]] = None + column_usage: Optional[Dict[UrnStr, Set[UrnStr]]] = None + inferred_schema: Optional[List[models.SchemaFieldClass]] = None + confidence_score: float = 1.0 + + query_count: int = 1 + user: Optional[CorpUserUrn] = None + timestamp: Optional[datetime] = None + session_id: str = _MISSING_SESSION_ID + query_type: QueryType = QueryType.UNKNOWN + query_type_props: QueryTypeProps = dataclasses.field( + default_factory=lambda: QueryTypeProps() + ) + + @dataclasses.dataclass class SqlAggregatorReport(Report): _aggregator: "SqlParsingAggregator" @@ -387,6 +418,14 @@ def _maybe_format_query(self, query: str) -> str: return try_format_query(query, self.platform.platform_name) return query + def add(self, item: Union[KnownQueryLineageInfo, KnownLineageMapping]) -> None: + if isinstance(item, KnownQueryLineageInfo): + self.add_known_query_lineage(item) + elif isinstance(item, KnownLineageMapping): + self.add_known_lineage_mapping(item.upstream_urn, item.downstream_urn) + else: + raise ValueError(f"Cannot add unknown item type: {type(item)}") + def add_known_query_lineage( self, known_query_lineage: KnownQueryLineageInfo, merge_lineage: bool = False ) -> None: @@ -567,18 +606,57 @@ def add_observed_query( elif parsed.debug_info.column_error: self.report.num_observed_queries_column_failed += 1 + query_fingerprint = parsed.query_fingerprint + + self.add_preparsed_query( + PreparsedQuery( + query_id=query_fingerprint, + query_text=query, + query_count=usage_multiplier, + timestamp=query_timestamp, + user=user, + session_id=session_id, + query_type=parsed.query_type, + query_type_props=parsed.query_type_props, + upstreams=parsed.in_tables, + downstream=parsed.out_tables[0] if parsed.out_tables else None, + column_lineage=parsed.column_lineage, + # TODO: We need a full list of columns referenced, not just the out tables. + column_usage=compute_upstream_fields(parsed), + inferred_schema=infer_output_schema(parsed), + confidence_score=parsed.debug_info.confidence, + ), + is_known_temp_table=is_known_temp_table, + require_out_table_schema=require_out_table_schema, + session_has_temp_tables=session_has_temp_tables, + ) + + def add_preparsed_query( + self, + parsed: PreparsedQuery, + is_known_temp_table: bool = False, + require_out_table_schema: bool = False, + session_has_temp_tables: bool = False, + ) -> None: + query_fingerprint = parsed.query_id + if not query_fingerprint: + query_fingerprint = get_query_fingerprint( + parsed.query_text, + platform=self.platform.platform_name, + fast=True, + ) + # Format the query. - formatted_query = self._maybe_format_query(query) + formatted_query = self._maybe_format_query(parsed.query_text) # Register the query's usage. if not self._usage_aggregator: pass # usage is not enabled - elif query_timestamp is None: + elif parsed.timestamp is None: self.report.usage_skipped_missing_timestamp += 1 else: - # TODO: We need a full list of columns referenced, not just the out tables. - upstream_fields = compute_upstream_fields(parsed) - for upstream_urn in parsed.in_tables: + upstream_fields = parsed.column_usage or {} + for upstream_urn in parsed.upstreams: # If the upstream table is a temp table, don't log usage for it. 
if (self.is_temp_table and self.is_temp_table(upstream_urn)) or ( require_out_table_schema @@ -588,18 +666,16 @@ def add_observed_query( self._usage_aggregator.aggregate_event( resource=upstream_urn, - start_time=query_timestamp, + start_time=parsed.timestamp, query=formatted_query, - user=user.urn() if user else None, + user=parsed.user.urn() if parsed.user else None, fields=sorted(upstream_fields.get(upstream_urn, [])), - count=usage_multiplier, + count=parsed.query_count, ) - if not parsed.out_tables: + if not parsed.downstream: return - out_table = parsed.out_tables[0] - query_fingerprint = parsed.query_fingerprint - assert query_fingerprint is not None + out_table = parsed.downstream # Handle table renames. is_renamed_table = False @@ -612,14 +688,14 @@ def add_observed_query( QueryMetadata( query_id=query_fingerprint, formatted_query_string=formatted_query, - session_id=session_id, + session_id=parsed.session_id, query_type=parsed.query_type, lineage_type=models.DatasetLineageTypeClass.TRANSFORMED, - latest_timestamp=query_timestamp, - actor=user, - upstreams=parsed.in_tables, + latest_timestamp=parsed.timestamp, + actor=parsed.user, + upstreams=parsed.upstreams, column_lineage=parsed.column_lineage or [], - confidence_score=parsed.debug_info.confidence, + confidence_score=parsed.confidence_score, used_temp_tables=session_has_temp_tables, ) ) @@ -643,12 +719,11 @@ def add_observed_query( ) ): # Infer the schema of the output table and track it for later. - inferred_schema = infer_output_schema(parsed) - if inferred_schema is not None: - self._inferred_temp_schemas[query_fingerprint] = inferred_schema + if parsed.inferred_schema is not None: + self._inferred_temp_schemas[query_fingerprint] = parsed.inferred_schema # Also track the lineage for the temp table, for merging purposes later. 
- self._temp_lineage_map.for_mutation(session_id, {})[ + self._temp_lineage_map.for_mutation(parsed.session_id, {})[ out_table ] = query_fingerprint From ff937f78d1c30aa7a9025f3f4592bdc8850f2a49 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 2 Jul 2024 16:22:59 -0700 Subject: [PATCH 02/32] add query stats + fixes --- .../source/snowflake/snowflake_queries.py | 413 ++++++++++++++++++ .../sql_parsing/sql_parsing_aggregator.py | 183 ++++++-- .../linkedin/query/QueryUsageStatistics.pdl | 43 ++ .../src/main/resources/entity-registry.yml | 1 + 4 files changed, 594 insertions(+), 46 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py create mode 100644 metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py new file mode 100644 index 00000000000000..b2e165218fff0c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -0,0 +1,413 @@ +import functools +import json +import logging +import pathlib +import tempfile +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, Iterable, List, Optional, Union + +import pydantic +from snowflake.connector.cursor import DictCursor +from typing_extensions import Self + +from datahub.configuration.source_common import ( + EnvConfigMixin, + LowerCaseDatasetUrnConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.configuration.time_window_config import ( + BaseTimeWindowConfig, + BucketDuration, +) +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig +from datahub.metadata._urns.urn_defs import CorpUserUrn +from datahub.sql_parsing.sql_parsing_aggregator import ( + KnownLineageMapping, + PreparsedQuery, + SqlAggregatorReport, + SqlParsingAggregator, +) +from datahub.sql_parsing.sql_parsing_common import QueryType +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + DownstreamColumnRef, +) +from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList + +logger = logging.getLogger(__name__) + + +class SnowflakeQueriesConfig( + PlatformInstanceConfigMixin, EnvConfigMixin, LowerCaseDatasetUrnConfigMixin +): + connection: BaseSnowflakeConfig + + # TODO: Support stateful ingestion for the time windows. + window: BaseTimeWindowConfig = BaseTimeWindowConfig() + + # TODO: make this a proper allow/deny pattern + deny_usernames: List[str] = [] + + # TODO: support temporary_tables_pattern + + local_temp_path: Optional[pathlib.Path] = None + # TODO: support copying files to s3 + + convert_urns_to_lowercase: bool = pydantic.Field( + # Override the default. 
+ default=True, + description="Whether to convert dataset urns to lowercase.", + ) + + +@dataclass +class SnowflakeQueriesReport(SourceReport): + window: Optional[BaseTimeWindowConfig] = None + + sql_aggregator: Optional[SqlAggregatorReport] = None + + +class SnowflakeQueriesSource(Source, SnowflakeCommonMixin): + def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesConfig): + self.ctx = ctx + self.config = config + self.report = SnowflakeQueriesReport() + + self.platform = "snowflake" + + self.aggregator = SqlParsingAggregator( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + # graph=self.ctx.graph, + # TODO: Make these configurable. + generate_lineage=True, + generate_queries=True, + generate_usage_statistics=True, + generate_query_usage_statistics=True, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + # TODO make the rest of the fields configurable + ), + generate_operations=True, + format_queries=False, + ) + self.report.sql_aggregator = self.aggregator.report + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Self: + config = SnowflakeQueriesConfig.parse_obj(config_dict) + return cls(ctx, config) + + @functools.cached_property + def local_temp_path(self) -> pathlib.Path: + if self.config.local_temp_path: + assert self.config.local_temp_path.is_dir() + return self.config.local_temp_path + + path = pathlib.Path(tempfile.mkdtemp()) + path.mkdir(parents=True, exist_ok=True) + logger.info(f"Using local temp path: {path}") + return path + + def get_workunits_internal( + self, + ) -> Iterable[MetadataWorkUnit]: + self.report.window = self.config.window + + # TODO: Add some logic to check if the cached audit log is stale or not. 
+ audit_log_file = self.local_temp_path / "audit_log.sqlite" + use_cached_audit_log = audit_log_file.exists() + + if use_cached_audit_log: + logger.info("Using cached audit log") + shared_connection = ConnectionWrapper(audit_log_file) + queries = FileBackedList(shared_connection) + else: + audit_log_file.unlink(missing_ok=True) + + shared_connection = ConnectionWrapper(audit_log_file) + queries = FileBackedList(shared_connection) + + logger.info("Fetching audit log") + for entry in self.fetch_audit_log(): + queries.append(entry) + + for query in queries: + self.aggregator.add(query) + + yield from auto_workunit(self.aggregator.gen_metadata()) + + def fetch_audit_log( + self, + ) -> Iterable[Union[KnownLineageMapping, PreparsedQuery]]: + """ + # TODO: we need to fetch this info from somewhere + discovered_tables = [] + + snowflake_lineage_v2 = SnowflakeLineageExtractor( + config=self.config, # type: ignore + report=self.report, # type: ignore + dataset_urn_builder=self.gen_dataset_urn, + redundant_run_skip_handler=None, + sql_aggregator=self.aggregator, # TODO this should be unused + ) + + for ( + known_lineage_mapping + ) in snowflake_lineage_v2._populate_external_lineage_from_copy_history( + discovered_tables=discovered_tables + ): + interim_results.append(known_lineage_mapping) + + for ( + known_lineage_mapping + ) in snowflake_lineage_v2._populate_external_lineage_from_show_query( + discovered_tables=discovered_tables + ): + interim_results.append(known_lineage_mapping) + """ + + audit_log_query = _build_enriched_audit_log_query( + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + bucket_duration=self.config.window.bucket_duration, + deny_usernames=self.config.deny_usernames, + ) + + conn = self.config.connection.get_connection() + resp = conn.cursor(DictCursor).execute(audit_log_query) + assert resp is not None + + for row in resp: + assert isinstance(row, dict) + entry = self._parse_audit_log_response(row) + yield entry + + # HACK: This makes mypy happy with our usage of the mixin methods. + gen_dataset_urn = SnowflakeCommonMixin.gen_dataset_urn + snowflake_identifier = SnowflakeCommonMixin.snowflake_identifier + + def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: + json_fields = { + "DIRECT_OBJECTS_ACCESSED", + "OBJECTS_MODIFIED", + } + + res = {} + for key, value in row.items(): + if key in json_fields and value: + value = json.loads(value) + key = key.lower() + res[key] = value + + direct_objects_accessed = res["direct_objects_accessed"] + objects_modified = res["objects_modified"] + + upstreams = [] + column_usage = {} + + for obj in direct_objects_accessed: + dataset = self.gen_dataset_urn(self.snowflake_identifier(obj["objectName"])) + + columns = set() + for column in obj["columns"]: + columns.add(self.snowflake_identifier(column["columnName"])) + + upstreams.append(dataset) + column_usage[dataset] = columns + + downstream = None + column_lineage = None + for obj in objects_modified: + # We don't expect there to be more than one object modified. + # TODO: Warn if that happens. 
+ + downstream = self.gen_dataset_urn( + self.snowflake_identifier(obj["objectName"]) + ) + column_lineage = [] + for column in obj["columns"]: + column_lineage.append( + ColumnLineageInfo( + downstream=DownstreamColumnRef( + dataset=downstream, + column=self.snowflake_identifier(column["columnName"]), + ), + upstreams=[ + ColumnRef( + table=self.gen_dataset_urn( + self.snowflake_identifier(upstream["objectName"]) + ), + column=self.snowflake_identifier( + upstream["columnName"] + ), + ) + for upstream in column["directSources"] + # TODO Check object domain. + ], + ) + ) + + # TODO: Support filtering the table names. + # if objects_modified: + # breakpoint() + + # TODO implement email address mapping + user = CorpUserUrn(res["user_name"]) + + # TODO need to map snowflake query types to ours + query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get( + res["query_type"], QueryType.UNKNOWN + ) + + entry = PreparsedQuery( + query_id=res["query_fingerprint"], + query_text=res["query_text"], + upstreams=upstreams, + downstream=downstream, + column_lineage=column_lineage, + column_usage=column_usage, + inferred_schema=None, + confidence_score=1, + query_count=res["query_count"], + user=user, + timestamp=res["query_start_time"], + session_id=res["session_id"], + query_type=query_type, + ) + return entry + + def get_report(self) -> SnowflakeQueriesReport: + return self.report + + +# Make sure we don't try to generate too much info for a single query. +_MAX_TABLES_PER_QUERY = 20 + + +def _build_enriched_audit_log_query( + start_time: datetime, + end_time: datetime, + bucket_duration: BucketDuration, + deny_usernames: Optional[List[str]], +) -> str: + start_time_millis = int(start_time.timestamp() * 1000) + end_time_millis = int(end_time.timestamp() * 1000) + + users_filter = "" + if deny_usernames: + user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames) + users_filter = f"user_name NOT IN ({user_not_in})" + + time_bucket_size = bucket_duration.value + assert time_bucket_size in ("HOUR", "DAY", "MONTH") + + return f"""\ +WITH +fingerprinted_queries as ( + SELECT *, + -- TODO: Generate better fingerprints for each query by pushing down regex logic. + query_history.query_parameterized_hash as query_fingerprint + FROM + snowflake.account_usage.query_history + WHERE + query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) + AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) + AND execution_status = 'SUCCESS' + AND {users_filter or 'TRUE'} +) +, deduplicated_queries as ( + SELECT + *, + DATE_TRUNC( + 'DAY', + CONVERT_TIMEZONE('UTC', start_time) + ) AS bucket_start_time, + COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count, + FROM + fingerprinted_queries + QUALIFY + ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint ORDER BY start_time DESC) = 1 +) +, raw_access_history AS ( + SELECT + query_id, + query_start_time, + user_name, + direct_objects_accessed, + objects_modified, + FROM + snowflake.account_usage.access_history + WHERE + query_start_time >= to_timestamp_ltz({start_time_millis}, 3) + AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) + AND {users_filter or 'TRUE'} + AND query_id IN ( + SELECT query_id FROM deduplicated_queries + ) +) +, filtered_access_history AS ( + -- TODO: Add table filter clause. 
+ SELECT + query_id, + query_start_time, + ARRAY_SLICE( + FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}), + 0, {_MAX_TABLES_PER_QUERY} + ) as direct_objects_accessed, + -- TODO: Drop the columns.baseSources subfield. + FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified, + FROM raw_access_history + WHERE ( array_size(direct_objects_accessed) > 0 or array_size(objects_modified) > 0 ) +) +, query_access_history AS ( + SELECT + q.bucket_start_time, + q.query_id, + q.query_fingerprint, + q.query_count, + q.session_id AS "SESSION_ID", + q.start_time AS "QUERY_START_TIME", + q.total_elapsed_time AS "QUERY_DURATION", + q.query_text AS "QUERY_TEXT", + q.query_type AS "QUERY_TYPE", + q.database_name as "DEFAULT_DB", + q.schema_name as "DEFAULT_SCHEMA", + q.rows_inserted AS "ROWS_INSERTED", + q.rows_updated AS "ROWS_UPDATED", + q.rows_deleted AS "ROWS_DELETED", + q.user_name AS "USER_NAME", + q.role_name AS "ROLE_NAME", + a.direct_objects_accessed, + a.objects_modified, + FROM deduplicated_queries q + JOIN filtered_access_history a USING (query_id) +) +SELECT * FROM query_access_history +""" + + +SNOWFLAKE_QUERY_TYPE_MAPPING = { + "INSERT": QueryType.INSERT, + "UPDATE": QueryType.UPDATE, + "DELETE": QueryType.DELETE, + "CREATE": QueryType.CREATE_OTHER, + "CREATE_TABLE": QueryType.CREATE_DDL, + "CREATE_VIEW": QueryType.CREATE_VIEW, + "CREATE_TABLE_AS_SELECT": QueryType.CREATE_TABLE_AS_SELECT, + "MERGE": QueryType.MERGE, + "COPY": QueryType.UNKNOWN, + "TRUNCATE_TABLE": QueryType.UNKNOWN, +} diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 2d79ecde47dd92..66ac26bb5eea55 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -14,6 +14,7 @@ import datahub.emitter.mce_builder as builder import datahub.metadata.schema_classes as models +from datahub.configuration.time_window_config import get_time_bucket from datahub.emitter.mce_builder import get_sys_time, make_ts_millis from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.sql_parsing_builder import compute_upstream_fields @@ -196,6 +197,7 @@ class SqlAggregatorReport(Report): # Other lineage loading metrics. num_known_query_lineage: int = 0 + num_preparsed_queries: int = 0 num_known_mapping_lineage: int = 0 num_table_renames: int = 0 @@ -218,6 +220,7 @@ class SqlAggregatorReport(Report): # Usage-related. 
usage_skipped_missing_timestamp: int = 0 + num_query_usage_stats_generated: int = 0 def compute_stats(self) -> None: self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() @@ -241,6 +244,7 @@ def __init__( generate_lineage: bool = True, generate_queries: bool = True, generate_usage_statistics: bool = False, + generate_query_usage_statistics: bool = False, generate_operations: bool = False, usage_config: Optional[BaseUsageConfig] = None, is_temp_table: Optional[Callable[[UrnStr], bool]] = None, @@ -254,12 +258,15 @@ def __init__( self.generate_lineage = generate_lineage self.generate_queries = generate_queries self.generate_usage_statistics = generate_usage_statistics + self.generate_query_usage_statistics = generate_query_usage_statistics self.generate_operations = generate_operations if self.generate_queries and not self.generate_lineage: raise ValueError("Queries will only be generated if lineage is enabled") self.usage_config = usage_config - if self.generate_usage_statistics and self.usage_config is None: + if ( + self.generate_usage_statistics or self.generate_query_usage_statistics + ) and self.usage_config is None: raise ValueError("Usage statistics generation requires a usage config") self.report = SqlAggregatorReport(_aggregator=self) @@ -356,6 +363,15 @@ def __init__( assert self.usage_config is not None self._usage_aggregator = UsageAggregator(config=self.usage_config) + # Query usage aggregator. + # Map of query ID -> { bucket -> count } + self._query_usage_counts: Optional[FileBackedDict[Dict[datetime, int]]] = None + if self.generate_query_usage_statistics: + self._query_usage_counts = FileBackedDict[Dict[datetime, int]]( + shared_connection=self._shared_connection, + tablename="query_usage_counts", + ) + def close(self) -> None: self._exit_stack.close() @@ -418,11 +434,15 @@ def _maybe_format_query(self, query: str) -> str: return try_format_query(query, self.platform.platform_name) return query - def add(self, item: Union[KnownQueryLineageInfo, KnownLineageMapping]) -> None: + def add( + self, item: Union[KnownQueryLineageInfo, KnownLineageMapping, PreparsedQuery] + ) -> None: if isinstance(item, KnownQueryLineageInfo): self.add_known_query_lineage(item) elif isinstance(item, KnownLineageMapping): self.add_known_lineage_mapping(item.upstream_urn, item.downstream_urn) + elif isinstance(item, PreparsedQuery): + self.add_preparsed_query(item) else: raise ValueError(f"Cannot add unknown item type: {type(item)}") @@ -629,6 +649,7 @@ def add_observed_query( is_known_temp_table=is_known_temp_table, require_out_table_schema=require_out_table_schema, session_has_temp_tables=session_has_temp_tables, + _is_internal=True, ) def add_preparsed_query( @@ -636,8 +657,12 @@ def add_preparsed_query( parsed: PreparsedQuery, is_known_temp_table: bool = False, require_out_table_schema: bool = False, - session_has_temp_tables: bool = False, + session_has_temp_tables: bool = True, + _is_internal: bool = False, ) -> None: + if not _is_internal: + self.report.num_preparsed_queries += 1 + query_fingerprint = parsed.query_id if not query_fingerprint: query_fingerprint = get_query_fingerprint( @@ -673,15 +698,13 @@ def add_preparsed_query( count=parsed.query_count, ) - if not parsed.downstream: - return - out_table = parsed.downstream - - # Handle table renames. 
- is_renamed_table = False - if out_table in self._table_renames: - out_table = self._table_renames[out_table] - is_renamed_table = True + if self._query_usage_counts is not None and parsed.timestamp is not None: + assert self.usage_config is not None + bucket = get_time_bucket( + parsed.timestamp, self.usage_config.bucket_duration + ) + counts = self._query_usage_counts.for_mutation(query_fingerprint, {}) + counts[bucket] = counts.get(bucket, 0) + parsed.query_count # Register the query. self._add_to_query_map( @@ -700,6 +723,16 @@ def add_preparsed_query( ) ) + if not parsed.downstream: + return + out_table = parsed.downstream + + # Handle table renames. + is_renamed_table = False + if out_table in self._table_renames: + out_table = self._table_renames[out_table] + is_renamed_table = True + # Register the query's lineage. if ( is_known_temp_table @@ -913,13 +946,16 @@ def _add_to_query_map( self._query_map[query_fingerprint] = new def gen_metadata(self) -> Iterable[MetadataChangeProposalWrapper]: - # diff from v1 - we generate operations here, and it also - # generates MCPWs instead of workunits - yield from self._gen_lineage_mcps() + queries_generated: Set[QueryId] = set() + + yield from self._gen_lineage_mcps(queries_generated) + yield from self._gen_remaining_queries(queries_generated) yield from self._gen_usage_statistics_mcps() yield from self._gen_operation_mcps() - def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: + def _gen_lineage_mcps( + self, queries_generated: Set[QueryId] + ) -> Iterable[MetadataChangeProposalWrapper]: if not self.generate_lineage: return @@ -931,7 +967,6 @@ def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: self._view_definitions.clear() # Generate lineage and queries. - queries_generated: Set[QueryId] = set() for downstream_urn in sorted(self._lineage_map): yield from self._gen_lineage_for_downstream( downstream_urn, queries_generated=queries_generated @@ -1058,41 +1093,13 @@ def _gen_lineage_for_downstream( ) for query_id in required_queries: - if not self.can_generate_query(query_id): - continue - # Avoid generating the same query twice. if query_id in queries_generated: continue queries_generated.add(query_id) - self.report.num_queries_entities_generated += 1 query = queries_map[query_id] - yield from MetadataChangeProposalWrapper.construct_many( - entityUrn=self._query_urn(query_id), - aspects=[ - models.QueryPropertiesClass( - statement=models.QueryStatementClass( - value=query.formatted_query_string, - language=models.QueryLanguageClass.SQL, - ), - source=models.QuerySourceClass.SYSTEM, - created=query.make_created_audit_stamp(), - lastModified=query.make_last_modified_audit_stamp(), - ), - models.QuerySubjectsClass( - subjects=[ - models.QuerySubjectClass(entity=dataset_urn) - for dataset_urn in itertools.chain( - [downstream_urn], query.upstreams - ) - ] - ), - models.DataPlatformInstanceClass( - platform=self.platform.urn(), - ), - ], - ) + yield from self._gen_query(query, downstream_urn) @classmethod def _query_urn(cls, query_id: QueryId) -> str: @@ -1118,9 +1125,93 @@ def _is_known_lineage_query_id(cls, query_id: QueryId) -> bool: # never conflict with a real query fingerprint. 
return query_id.startswith("known_") + def _gen_remaining_queries( + self, queries_generated: Set[QueryId] + ) -> Iterable[MetadataChangeProposalWrapper]: + if not self.generate_queries or not self.generate_query_usage_statistics: + return + + for query_id in self._query_usage_counts: + if query_id in queries_generated: + continue + queries_generated.add(query_id) + + yield from self._gen_query(self._query_map[query_id]) + def can_generate_query(self, query_id: QueryId) -> bool: return self.generate_queries and not self._is_known_lineage_query_id(query_id) + def _gen_query( + self, query: QueryMetadata, downstream_urn: Optional[str] = None + ) -> Iterable[MetadataChangeProposalWrapper]: + query_id = query.query_id + if not self.can_generate_query(query_id): + return + + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=self._query_urn(query_id), + aspects=[ + models.QueryPropertiesClass( + statement=models.QueryStatementClass( + value=query.formatted_query_string, + language=models.QueryLanguageClass.SQL, + ), + source=models.QuerySourceClass.SYSTEM, + created=query.make_created_audit_stamp(), + lastModified=query.make_last_modified_audit_stamp(), + ), + models.QuerySubjectsClass( + subjects=[ + models.QuerySubjectClass(entity=dataset_urn) + for dataset_urn in itertools.chain( + [downstream_urn], query.upstreams + ) + ] + ), + models.DataPlatformInstanceClass( + platform=self.platform.urn(), + ), + ], + ) + self.report.num_queries_entities_generated += 1 + + if self._query_usage_counts is not None: + assert self.usage_config is not None + + # This is slightly lossy, since we only store one unique + # user per query instead of tracking all of them. + # We also lose information because we don't keep track + # of users / lastExecutedAt timestamps per bucket. + user = query.actor + + query_counter = self._query_usage_counts.get(query_id) + if not query_counter: + return + for bucket in self.usage_config.buckets(): + count = query_counter.get(bucket) + if not count: + continue + + yield MetadataChangeProposalWrapper( + entityUrn=self._query_urn(query_id), + aspect=models.QueryUsageStatisticsClass( + timestampMillis=make_ts_millis(bucket), + eventGranularity=models.TimeWindowSizeClass( + unit=self.usage_config.bucket_duration, multiple=1 + ), + queryCount=count, + uniqueUserCount=1, + userCounts=[ + models.DatasetUserUsageCountsClass( + user=user.urn(), + count=count, + ) + ], + ), + ) + + self.report.num_query_usage_stats_generated += 1 + def _resolve_query_with_temp_tables( self, base_query: QueryMetadata, diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl new file mode 100644 index 00000000000000..275077e5e6a475 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryUsageStatistics.pdl @@ -0,0 +1,43 @@ +namespace com.linkedin.query + +import com.linkedin.timeseries.TimeseriesAspectBase +import com.linkedin.dataset.DatasetUserUsageCounts + +/** + * Stats corresponding to dataset's usage. 
+ */ +@Aspect = { + "name": "queryUsageStatistics", + "type": "timeseries", +} +record QueryUsageStatistics includes TimeseriesAspectBase { + /** + * Total query count in this bucket + */ + @TimeseriesField = {} + queryCount: optional int + + /** + * Query cost for this query and bucket + */ + @TimeseriesField = {} + queryCost: optional double + + /** + * Last executed timestamp + */ + @TimeseriesField = {} + lastExecutedAt: optional long + + /** + * Unique user count + */ + @TimeseriesField = {} + uniqueUserCount: optional int + + /** + * Users within this bucket, with frequency counts + */ + @TimeseriesFieldCollection = {"key":"user"} + userCounts: optional array[DatasetUserUsageCounts] +} diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 6a6683418bf386..693588de482dab 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -504,6 +504,7 @@ entities: aspects: - queryProperties - querySubjects + - queryUsageStatistics - status - dataPlatformInstance - subTypes From 8bed4dcabb23ceca32df0c4b3257cdf2f0e92bba Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 2 Jul 2024 21:54:34 -0700 Subject: [PATCH 03/32] emit columns in query subjects --- .../source/snowflake/snowflake_queries.py | 9 +++-- .../sql_parsing/sql_parsing_aggregator.py | 36 ++++++++++++++++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index b2e165218fff0c..d21b12b30fb0f1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -4,7 +4,7 @@ import pathlib import tempfile from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict, Iterable, List, Optional, Union import pydantic @@ -267,6 +267,9 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: # TODO implement email address mapping user = CorpUserUrn(res["user_name"]) + timestamp: datetime = res["query_start_time"] + timestamp = timestamp.astimezone(timezone.utc) + # TODO need to map snowflake query types to ours query_type = SNOWFLAKE_QUERY_TYPE_MAPPING.get( res["query_type"], QueryType.UNKNOWN @@ -283,7 +286,7 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: confidence_score=1, query_count=res["query_count"], user=user, - timestamp=res["query_start_time"], + timestamp=timestamp, session_id=res["session_id"], query_type=query_type, ) @@ -332,7 +335,7 @@ def _build_enriched_audit_log_query( SELECT *, DATE_TRUNC( - 'DAY', + {time_bucket_size}, CONVERT_TIMEZONE('UTC', start_time) ) AS bucket_start_time, COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint) AS query_count, diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 66ac26bb5eea55..d73c914f17e9b6 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -104,6 +104,7 @@ class QueryMetadata: upstreams: List[UrnStr] # this is direct upstreams, which may be temp tables column_lineage: List[ColumnLineageInfo] + 
column_usage: Dict[UrnStr, Set[UrnStr]] confidence_score: float used_temp_tables: bool = True @@ -129,6 +130,7 @@ class KnownQueryLineageInfo: downstream: UrnStr upstreams: List[UrnStr] column_lineage: Optional[List[ColumnLineageInfo]] = None + column_usage: Optional[Dict[UrnStr, Set[UrnStr]]] = None timestamp: Optional[datetime] = None session_id: Optional[str] = None @@ -487,6 +489,7 @@ def add_known_query_lineage( actor=None, upstreams=known_query_lineage.upstreams, column_lineage=known_query_lineage.column_lineage or [], + column_usage=known_query_lineage.column_usage or {}, confidence_score=1.0, ), merge_lineage=merge_lineage, @@ -539,6 +542,7 @@ def add_known_lineage_mapping( actor=None, upstreams=[upstream_urn], column_lineage=[], + column_usage={}, confidence_score=1.0, ) ) @@ -663,6 +667,11 @@ def add_preparsed_query( if not _is_internal: self.report.num_preparsed_queries += 1 + if parsed.timestamp: + # Sanity check - some of our usage subroutines require the timestamp to be in UTC. + # Ideally we'd actually reject missing tzinfo too, but we can tighten that later. + assert parsed.timestamp.tzinfo in {None, timezone.utc} + query_fingerprint = parsed.query_id if not query_fingerprint: query_fingerprint = get_query_fingerprint( @@ -718,6 +727,7 @@ def add_preparsed_query( actor=parsed.user, upstreams=parsed.upstreams, column_lineage=parsed.column_lineage or [], + column_usage=parsed.column_usage or {}, confidence_score=parsed.confidence_score, used_temp_tables=session_has_temp_tables, ) @@ -851,6 +861,7 @@ def _process_view_definition( actor=None, upstreams=parsed.in_tables, column_lineage=parsed.column_lineage or [], + column_usage=compute_upstream_fields(parsed), confidence_score=parsed.debug_info.confidence, ) ) @@ -932,6 +943,7 @@ def _add_to_query_map( # here just in case more schemas got registered in the interim. current.upstreams = new.upstreams current.column_lineage = new.column_lineage + current.column_usage = new.column_usage current.confidence_score = new.confidence_score else: # In the case of known query lineage, we might get things one at a time. 
@@ -1148,6 +1160,22 @@ def _gen_query( if not self.can_generate_query(query_id): return + query_subject_urns: List[UrnStr] = [] + for upstream in query.upstreams: + query_subject_urns.append(upstream) + for column in query.column_usage.get(upstream, []): + query_subject_urns.append( + builder.make_schema_field_urn(upstream, column) + ) + if downstream_urn: + query_subject_urns.append(downstream_urn) + for column_lineage in query.column_lineage: + query_subject_urns.append( + builder.make_schema_field_urn( + downstream_urn, column_lineage.downstream.column + ) + ) + yield from MetadataChangeProposalWrapper.construct_many( entityUrn=self._query_urn(query_id), aspects=[ @@ -1162,10 +1190,8 @@ def _gen_query( ), models.QuerySubjectsClass( subjects=[ - models.QuerySubjectClass(entity=dataset_urn) - for dataset_urn in itertools.chain( - [downstream_urn], query.upstreams - ) + models.QuerySubjectClass(entity=urn) + for urn in query_subject_urns ] ), models.DataPlatformInstanceClass( @@ -1210,7 +1236,7 @@ def _gen_query( ), ) - self.report.num_query_usage_stats_generated += 1 + self.report.num_query_usage_stats_generated += 1 def _resolve_query_with_temp_tables( self, From 5ad296398eef0e989c7388e516c7395c8f718d5c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 2 Jul 2024 21:55:30 -0700 Subject: [PATCH 04/32] add setup.py --- metadata-ingestion/setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e8508a6e7c827c..e1aecb2c44b808 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -409,6 +409,7 @@ "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "snowflake-summary": snowflake_common | usage_common | sqlglot_lib, + "snowflake-queries": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, "sql-queries": usage_common | sqlglot_lib, "slack": slack, @@ -661,6 +662,7 @@ "slack = datahub.ingestion.source.slack.slack:SlackSource", "snowflake = datahub.ingestion.source.snowflake.snowflake_v2:SnowflakeV2Source", "snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource", + "snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource", "superset = datahub.ingestion.source.superset:SupersetSource", "tableau = datahub.ingestion.source.tableau:TableauSource", "openapi = datahub.ingestion.source.openapi:OpenApiSource", From bc625dc9af266c0bb9f299c37d76728008ba0ad3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jul 2024 16:15:46 -0700 Subject: [PATCH 05/32] fix some lint issues --- .../source/snowflake/snowflake_lineage_v2.py | 4 +++- .../source/snowflake/snowflake_utils.py | 6 ++---- .../sql_parsing/sql_parsing_aggregator.py | 17 +++++++++++------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index e29e85e515fefe..ccefc240f99526 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -358,7 +358,7 @@ def _process_external_lineage_result_row( db_row["DOWNSTREAM_TABLE_NAME"] ) if key not in discovered_tables: - return + return None if db_row["UPSTREAM_LOCATIONS"] is not None: external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) @@ -371,6 +371,8 
@@ def _process_external_lineage_result_row( downstream_urn=self.dataset_urn_builder(key), ) + return None + def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: query: str = SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=int(self.start_time.timestamp() * 1000), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 5d4cc38469f7ce..4adcc470d0d5b6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -257,12 +257,10 @@ def warn_if_stateful_else_error( self.report_error(key, reason) def report_warning(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.report_warning(key, reason) - self.logger.warning(f"{key} => {reason}") + self.report.warning(key, reason) def report_error(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.report_failure(key, reason) - self.logger.error(f"{key} => {reason}") + self.report.failure(key, reason) class SnowflakeConnectionProtocol(SnowflakeLoggingProtocol, Protocol): diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index d73c914f17e9b6..e810be58b95ee8 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1143,6 +1143,7 @@ def _gen_remaining_queries( if not self.generate_queries or not self.generate_query_usage_statistics: return + assert self._query_usage_counts is not None for query_id in self._query_usage_counts: if query_id in queries_generated: continue @@ -1227,12 +1228,16 @@ def _gen_query( ), queryCount=count, uniqueUserCount=1, - userCounts=[ - models.DatasetUserUsageCountsClass( - user=user.urn(), - count=count, - ) - ], + userCounts=( + [ + models.DatasetUserUsageCountsClass( + user=user.urn(), + count=count, + ) + ] + if user + else None + ), ), ) From a8b1c7bf7aca95d36042df3263130bfba6d1d20c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jul 2024 17:08:11 -0700 Subject: [PATCH 06/32] add a SnowflakeConnection wrapper class --- .../ingestion/source/fivetran/config.py | 4 +- .../source/snowflake/snowflake_assertion.py | 20 ++--- .../source/snowflake/snowflake_config.py | 4 +- .../source/snowflake/snowflake_connection.py | 47 ++++++++++ .../source/snowflake/snowflake_data_reader.py | 4 +- .../source/snowflake/snowflake_lineage_v2.py | 33 ++++--- .../source/snowflake/snowflake_profiler.py | 2 +- .../source/snowflake/snowflake_queries.py | 8 +- .../source/snowflake/snowflake_schema.py | 39 ++++----- .../source/snowflake/snowflake_schema_gen.py | 16 ++-- .../source/snowflake/snowflake_summary.py | 33 ++----- .../source/snowflake/snowflake_usage_v2.py | 38 ++++---- .../source/snowflake/snowflake_utils.py | 87 +------------------ .../source/snowflake/snowflake_v2.py | 59 ++++++------- .../ingestion/source_config/sql/snowflake.py | 31 ++++++- .../snowflake/test_snowflake_failures.py | 10 ++- 16 files changed, 186 insertions(+), 249 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 
f55d9f89ad97f1..46780b9eef1365 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -20,7 +20,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig +from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -66,7 +66,7 @@ class Constant: } -class SnowflakeDestinationConfig(BaseSnowflakeConfig): +class SnowflakeDestinationConfig(SnowflakeConnectionConfig): database: str = Field(description="The fivetran connector log database.") log_schema: str = Field(description="The fivetran connector log schema.") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py index a28a81cc5b955d..5fc1a45709296e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -12,13 +12,10 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, -) +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionResult, AssertionResultType, @@ -40,30 +37,25 @@ class DataQualityMonitoringResult(BaseModel): VALUE: int -class SnowflakeAssertionsHandler( - SnowflakeCommonMixin, SnowflakeQueryMixin, SnowflakeConnectionMixin -): +class SnowflakeAssertionsHandler(SnowflakeCommonMixin): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, + connection: SnowflakeConnection, dataset_urn_builder: Callable[[str], str], ) -> None: self.config = config self.report = report self.logger = logger self.dataset_urn_builder = dataset_urn_builder - self.connection = None + self.connection = connection self._urns_processed: List[str] = [] def get_assertion_workunits( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - cur = self.query( + cur = self.connection.query( SnowflakeQuery.dmf_assertion_results( datetime_to_ts_millis(self.config.start_time), datetime_to_ts_millis(self.config.end_time), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 588187e8e11c28..907cde64faf7b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -19,8 +19,8 @@ StatefulUsageConfigMixin, ) from datahub.ingestion.source_config.sql.snowflake import ( - BaseSnowflakeConfig, 
SnowflakeConfig, + SnowflakeConnectionConfig, ) from datahub.ingestion.source_config.usage.snowflake_usage import SnowflakeUsageConfig from datahub.utilities.global_warning_util import add_global_warning @@ -259,7 +259,7 @@ def get_sql_alchemy_url( password: Optional[SecretStr] = None, role: Optional[str] = None, ) -> str: - return BaseSnowflakeConfig.get_sql_alchemy_url( + return SnowflakeConnectionConfig.get_sql_alchemy_url( self, database=database, username=username, password=password, role=role ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py new file mode 100644 index 00000000000000..d1424ff2dd783a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -0,0 +1,47 @@ +import logging +from typing import Any + +from snowflake.connector import SnowflakeConnection as NativeSnowflakeConnection +from snowflake.connector.cursor import DictCursor + +from datahub.configuration.common import MetaError + +logger = logging.getLogger(__name__) + + +class SnowflakeConnection: + _connection: NativeSnowflakeConnection + + def __init__(self, connection: NativeSnowflakeConnection): + self._connection = connection + + def native_connection(self) -> NativeSnowflakeConnection: + return self._connection + + def query(self, query: str) -> Any: + try: + logger.info(f"Query: {query}", stacklevel=2) + resp = self._connection.cursor(DictCursor).execute(query) + return resp + + except Exception as e: + if _is_permission_error(e): + raise SnowflakePermissionError(e) from e + raise + + def is_closed(self) -> bool: + return self._connection.is_closed() + + def close(self): + self._connection.close() + + +def _is_permission_error(e: Exception) -> bool: + msg = str(e) + # 002003 (02000): SQL compilation error: Database/SCHEMA 'XXXX' does not exist or not authorized. 
+ # Insufficient privileges to operate on database 'XXXX' + return "Insufficient privileges" in msg or "not authorized" in msg + + +class SnowflakePermissionError(MetaError): + """A permission error has happened""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py index 9fa81cb1bd20cb..c9615bb498fe48 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_data_reader.py @@ -2,9 +2,9 @@ from typing import Any, Callable, Dict, List import pandas as pd -from snowflake.connector import SnowflakeConnection from datahub.ingestion.source.common.data_reader import DataReader +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.utilities.perf_timer import PerfTimer logger = logging.Logger(__name__) @@ -39,7 +39,7 @@ def get_sample_data_for_table( logger.debug( f"Collecting sample values for table {db_name}.{schema_name}.{table_name}" ) - with PerfTimer() as timer, self.conn.cursor() as cursor: + with PerfTimer() as timer, self.conn.native_connection().cursor() as cursor: sql = f'select * from "{db_name}"."{schema_name}"."{table_name}" sample ({sample_size} rows);' cursor.execute(sql) dat = cursor.fetchall() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index ccefc240f99526..3e702593a89bb5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -5,9 +5,9 @@ from typing import Any, Callable, Collection, Iterable, List, Optional, Set, Tuple, Type from pydantic import BaseModel, validator -from snowflake.connector import SnowflakeConnection from datahub.configuration.datetimes import parse_absolute_time +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( @@ -15,14 +15,13 @@ SnowflakeEdition, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, SnowflakePermissionError, - SnowflakeQueryMixin, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) @@ -102,9 +101,7 @@ class SnowflakeColumnId: object_domain: Optional[str] = None -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): +class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable): """ Extracts Lineage from Snowflake. Following lineage edges are considered. 
@@ -121,6 +118,7 @@ def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, + connection: SnowflakeConnection, dataset_urn_builder: Callable[[str], str], redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler], sql_aggregator: SqlParsingAggregator, @@ -129,7 +127,7 @@ def __init__( self.report = report self.logger = logger self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None + self.connection = connection self.sql_aggregator = sql_aggregator self.redundant_run_skip_handler = redundant_run_skip_handler @@ -166,10 +164,6 @@ def get_workunits( if not self._should_ingest_lineage(): return - self.connection = self.create_connection() - if self.connection is None: - return - # s3 dataset -> snowflake table self._populate_external_upstreams(discovered_tables) @@ -293,7 +287,7 @@ def _populate_external_lineage_from_show_query( ) -> Iterable[KnownLineageMapping]: external_tables_query: str = SnowflakeQuery.show_external_tables() try: - for db_row in self.query(external_tables_query): + for db_row in self.connection.query(external_tables_query): key = self.get_dataset_identifier( db_row["name"], db_row["schema_name"], db_row["database_name"] ) @@ -332,7 +326,7 @@ def _populate_external_lineage_from_copy_history( ) try: - for db_row in self.query(query): + for db_row in self.connection.query(query): known_lineage_mapping = self._process_external_lineage_result_row( db_row, discovered_tables ) @@ -382,7 +376,7 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: include_column_lineage=self.config.include_column_lineage, ) try: - for db_row in self.query(query): + for db_row in self.connection.query(query): edge = self._process_upstream_lineage_row(db_row) if edge: yield edge @@ -561,3 +555,6 @@ def _should_ingest_lineage(self) -> bool: def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: self.redundant_run_skip_handler.report_current_run_status(step, status) + + def close(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 5e6ade29344eb0..4deeb9f96f48eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -164,7 +164,7 @@ def callable_for_db_connection(self, db_name: str) -> Callable: schema_name = self.database_default_schema.get(db_name) def get_db_connection(): - conn = self.config.get_connection() + conn = self.config.get_native_connection() conn.cursor().execute(SnowflakeQuery.use_database(db_name)) # As mentioned here - https://docs.snowflake.com/en/sql-reference/sql/use-database#usage-notes diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index d21b12b30fb0f1..f5f8cb9cf1c578 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -8,7 +8,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union import pydantic -from snowflake.connector.cursor import DictCursor from typing_extensions import Self from datahub.configuration.source_common import ( @@ -27,7 +26,7 @@ from datahub.ingestion.source.snowflake.snowflake_query 
import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.usage.usage_common import BaseUsageConfig -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig +from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.metadata._urns.urn_defs import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import ( KnownLineageMapping, @@ -49,7 +48,7 @@ class SnowflakeQueriesConfig( PlatformInstanceConfigMixin, EnvConfigMixin, LowerCaseDatasetUrnConfigMixin ): - connection: BaseSnowflakeConfig + connection: SnowflakeConnectionConfig # TODO: Support stateful ingestion for the time windows. window: BaseTimeWindowConfig = BaseTimeWindowConfig() @@ -187,8 +186,7 @@ def fetch_audit_log( ) conn = self.config.connection.get_connection() - resp = conn.cursor(DictCursor).execute(audit_log_query) - assert resp is not None + resp = conn.query(audit_log_query) for row in resp: assert isinstance(row, dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 4bc684a22514c4..ce8f20d23aa6b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -5,15 +5,13 @@ from datetime import datetime from typing import Callable, Dict, Iterable, List, MutableMapping, Optional -from snowflake.connector import SnowflakeConnection - from datahub.ingestion.api.report import SupportsAsObj from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import ( SHOW_VIEWS_MAX_PAGE_SIZE, SnowflakeQuery, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeQueryMixin from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.prefix_batch_builder import build_prefix_batches @@ -185,19 +183,12 @@ def get_column_tags_for_table( ) -class SnowflakeDataDictionary(SnowflakeQueryMixin, SupportsAsObj): - def __init__(self) -> None: +class SnowflakeDataDictionary(SupportsAsObj): + def __init__(self, connection: SnowflakeConnection) -> None: self.logger = logger - self.connection: Optional[SnowflakeConnection] = None - def set_connection(self, connection: SnowflakeConnection) -> None: self.connection = connection - def get_connection(self) -> SnowflakeConnection: - # Connection is already present by the time this is called - assert self.connection is not None - return self.connection - def as_obj(self) -> Dict[str, Dict[str, int]]: # TODO: Move this into a proper report type that gets computed. 
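A short usage sketch of the constructor-injected SnowflakeDataDictionary shown above, assuming a `connection` (SnowflakeConnection) has already been obtained from the connection config; "ANALYTICS" is a placeholder database name:

from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDataDictionary

data_dictionary = SnowflakeDataDictionary(connection=connection)
databases = data_dictionary.show_databases()                     # SHOW DATABASES via connection.query(...)
schemas = data_dictionary.get_schemas_for_database("ANALYTICS")  # schema listing for one database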
@@ -221,7 +212,7 @@ def as_obj(self) -> Dict[str, Dict[str, int]]: def show_databases(self) -> List[SnowflakeDatabase]: databases: List[SnowflakeDatabase] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_databases(), ) @@ -238,7 +229,7 @@ def show_databases(self) -> List[SnowflakeDatabase]: def get_databases(self, db_name: str) -> List[SnowflakeDatabase]: databases: List[SnowflakeDatabase] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_databases(db_name), ) @@ -256,7 +247,7 @@ def get_databases(self, db_name: str) -> List[SnowflakeDatabase]: def get_schemas_for_database(self, db_name: str) -> List[SnowflakeSchema]: snowflake_schemas = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.schemas_for_database(db_name), ) @@ -276,7 +267,7 @@ def get_tables_for_database( ) -> Optional[Dict[str, List[SnowflakeTable]]]: tables: Dict[str, List[SnowflakeTable]] = {} try: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.tables_for_database(db_name), ) except Exception as e: @@ -309,7 +300,7 @@ def get_tables_for_schema( ) -> List[SnowflakeTable]: tables: List[SnowflakeTable] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.tables_for_schema(schema_name, db_name), ) @@ -337,7 +328,7 @@ def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]] first_iteration = True view_pagination_marker: Optional[str] = None while first_iteration or view_pagination_marker is not None: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_views_for_database( db_name, limit=page_limit, @@ -406,7 +397,7 @@ def get_columns_for_schema( schema_name, db_name, object_batch ) - cur = self.query(query) + cur = self.connection.query(query) for column in cur: if column["TABLE_NAME"] not in columns: @@ -430,7 +421,7 @@ def get_pk_constraints_for_schema( self, schema_name: str, db_name: str ) -> Dict[str, SnowflakePK]: constraints: Dict[str, SnowflakePK] = {} - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_primary_keys_for_schema(schema_name, db_name), ) @@ -449,7 +440,7 @@ def get_fk_constraints_for_schema( constraints: Dict[str, List[SnowflakeFK]] = {} fk_constraints_map: Dict[str, SnowflakeFK] = {} - cur = self.query( + cur = self.connection.query( SnowflakeQuery.show_foreign_keys_for_schema(schema_name, db_name), ) @@ -481,7 +472,7 @@ def get_tags_for_database_without_propagation( self, db_name: str, ) -> _SnowflakeTagCache: - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_all_tags_in_database_without_propagation(db_name) ) @@ -536,7 +527,7 @@ def get_tags_for_object_with_propagation( ) -> List[SnowflakeTag]: tags: List[SnowflakeTag] = [] - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_all_tags_on_object_with_propagation( db_name, quoted_identifier, domain ), @@ -557,7 +548,7 @@ def get_tags_on_columns_for_table( self, quoted_table_name: str, db_name: str ) -> Dict[str, List[SnowflakeTag]]: tags: Dict[str, List[SnowflakeTag]] = defaultdict(list) - cur = self.query( + cur = self.connection.query( SnowflakeQuery.get_tags_on_columns_with_propagation( db_name, quoted_table_name ), diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index b6f16cd671b8d3..00fd597251d595 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -4,8 +4,6 @@ import queue from typing import Dict, Iterable, List, Optional, Union -from snowflake.connector import SnowflakeConnection - from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -32,6 +30,10 @@ SnowflakeV2Config, TagOption, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakePermissionError, +) from datahub.ingestion.source.snowflake.snowflake_data_reader import SnowflakeDataReader from datahub.ingestion.source.snowflake.snowflake_profiler import SnowflakeProfiler from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -51,9 +53,6 @@ from datahub.ingestion.source.snowflake.snowflake_utils import ( SnowflakeCommonMixin, SnowflakeCommonProtocol, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, ) from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, @@ -141,8 +140,6 @@ class SnowflakeSchemaGenerator( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, SnowflakeCommonMixin, SnowflakeCommonProtocol, ): @@ -161,8 +158,9 @@ def __init__( self.connection: SnowflakeConnection = connection self.logger = logger - self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary() - self.data_dictionary.set_connection(self.connection) + self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary( + connection=self.connection + ) self.report.data_dictionary_cache = self.data_dictionary self.domain_registry: Optional[DomainRegistry] = domain_registry diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index cd6f17092e810a..1706ea88bd9ea1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -4,7 +4,6 @@ from typing import Dict, Iterable, List, Optional import pydantic -from snowflake.connector import SnowflakeConnection from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import LowerCaseDatasetUrnConfigMixin @@ -20,18 +19,14 @@ from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, -) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyList class SnowflakeSummaryConfig( - BaseSnowflakeConfig, BaseTimeWindowConfig, LowerCaseDatasetUrnConfigMixin + SnowflakeConnectionConfig, BaseTimeWindowConfig, LowerCaseDatasetUrnConfigMixin ): # Copied from SnowflakeConfig. 
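Since SnowflakeSummaryConfig now inherits its connection settings from SnowflakeConnectionConfig, a minimal instantiation might look like the sketch below; the field names are taken from the connection config in this patch, while all values are placeholders:

from datahub.ingestion.source.snowflake.snowflake_summary import SnowflakeSummaryConfig

config = SnowflakeSummaryConfig(
    account_id="xy12345.us-east-2.aws",  # placeholder account identifier
    username="datahub_reader",           # placeholder credentials
    password="example-password",
    warehouse="COMPUTE_WH",
    role="DATAHUB_ROLE",
)
connection = config.get_connection()  # raises SnowflakePermissionError / ConfigurationError on failure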
@@ -81,8 +76,6 @@ def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: @config_class(SnowflakeSummaryConfig) @support_status(SupportStatus.INCUBATING) class SnowflakeSummarySource( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, SnowflakeCommonMixin, Source, ): @@ -90,24 +83,12 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig): super().__init__(ctx) self.config: SnowflakeSummaryConfig = config self.report: SnowflakeSummaryReport = SnowflakeSummaryReport() - - self.data_dictionary = SnowflakeDataDictionary() - self.connection: Optional[SnowflakeConnection] = None self.logger = logging.getLogger(__name__) - def create_connection(self) -> Optional[SnowflakeConnection]: - # TODO: Eventually we'll want to use the implementation from SnowflakeConnectionMixin, - # since it has better error reporting. - # return super().create_connection() - return self.config.get_connection() + self.connection = self.config.get_connection() + self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self.data_dictionary.set_connection(self.connection) - # Databases. databases: List[SnowflakeDatabase] = [] for database in self.get_databases() or []: # type: ignore @@ -139,7 +120,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Queries for usage. start_time_millis = self.config.start_time.timestamp() * 1000 end_time_millis = self.config.end_time.timestamp() * 1000 - for row in self.query( + for row in self.connection.query( f"""\ SELECT COUNT(*) AS CNT FROM snowflake.account_usage.query_history @@ -150,7 +131,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.num_snowflake_queries = row["CNT"] # Queries for lineage/operations. 
- for row in self.query( + for row in self.connection.query( f"""\ SELECT COUNT(*) AS CNT FROM diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index e8b56a01944ad2..ed483d99a23b49 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -5,23 +5,22 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple import pydantic -from snowflake.connector import SnowflakeConnection from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.emitter.mce_builder import make_user_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.constants import SnowflakeEdition from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, SnowflakePermissionError, - SnowflakeQueryMixin, ) +from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, ) @@ -107,13 +106,12 @@ class SnowflakeJoinedAccessEvent(PermissiveModel): role_name: str -class SnowflakeUsageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): +class SnowflakeUsageExtractor(SnowflakeCommonMixin, Closeable): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, + connection: SnowflakeConnection, dataset_urn_builder: Callable[[str], str], redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler], ) -> None: @@ -121,7 +119,7 @@ def __init__( self.report: SnowflakeV2Report = report self.dataset_urn_builder = dataset_urn_builder self.logger = logger - self.connection: Optional[SnowflakeConnection] = None + self.connection = connection self.redundant_run_skip_handler = redundant_run_skip_handler self.start_time, self.end_time = ( @@ -144,11 +142,6 @@ def get_usage_workunits( return self.report.set_ingestion_stage("*", USAGE_EXTRACTION_USAGE_AGGREGATION) - - self.connection = self.create_connection() - if self.connection is None: - return - if self.report.edition == SnowflakeEdition.STANDARD.value: logger.info( "Snowflake Account is Standard Edition. Usage and Operation History Feature is not supported." 
@@ -207,7 +200,7 @@ def _get_workunits_internal( with PerfTimer() as timer: logger.info("Getting aggregated usage statistics") try: - results = self.query( + results = self.connection.query( SnowflakeQuery.usage_per_object_per_time_bucket_for_time_window( start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), @@ -376,7 +369,8 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: with PerfTimer() as timer: query = self._make_operations_query() try: - results = self.query(query) + assert self.connection is not None + results = self.connection.query(query) except Exception as e: logger.debug(e, exc_info=e) self.warn_if_stateful_else_error( @@ -398,7 +392,10 @@ def _make_operations_query(self) -> str: def _check_usage_date_ranges(self) -> None: with PerfTimer() as timer: try: - results = self.query(SnowflakeQuery.get_access_history_date_range()) + assert self.connection is not None + results = self.connection.query( + SnowflakeQuery.get_access_history_date_range() + ) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = "Failed to get usage. Please grant imported privileges on SNOWFLAKE database. " @@ -590,3 +587,6 @@ def _should_ingest_usage(self) -> bool: def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: self.redundant_run_skip_handler.report_current_run_status(step, status) + + def close(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 4adcc470d0d5b6..3e348e08804a9c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,15 +1,11 @@ import logging -from typing import Any, Optional +from typing import Optional -from snowflake.connector import SnowflakeConnection -from snowflake.connector.cursor import DictCursor from typing_extensions import Protocol -from datahub.configuration.common import MetaError from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.ingestion.source.snowflake.constants import ( - GENERIC_PERMISSION_ERROR_KEY, SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, SnowflakeCloudProvider, SnowflakeObjectDomain, @@ -20,34 +16,12 @@ logger: logging.Logger = logging.getLogger(__name__) -class SnowflakePermissionError(MetaError): - """A permission error has happened""" - - # Required only for mypy, since we are using mixin classes, and not inheritance. # Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes class SnowflakeLoggingProtocol(Protocol): logger: logging.Logger -class SnowflakeQueryProtocol(SnowflakeLoggingProtocol, Protocol): - def get_connection(self) -> SnowflakeConnection: - ... 
- - -class SnowflakeQueryMixin: - def query(self: SnowflakeQueryProtocol, query: str) -> Any: - try: - self.logger.info(f"Query : {query}", stacklevel=2) - resp = self.get_connection().cursor(DictCursor).execute(query) - return resp - - except Exception as e: - if is_permission_error(e): - raise SnowflakePermissionError(e) from e - raise - - class SnowflakeCommonProtocol(SnowflakeLoggingProtocol, Protocol): platform: str = "snowflake" @@ -261,62 +235,3 @@ def report_warning(self: SnowflakeCommonProtocol, key: str, reason: str) -> None def report_error(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: self.report.failure(key, reason) - - -class SnowflakeConnectionProtocol(SnowflakeLoggingProtocol, Protocol): - connection: Optional[SnowflakeConnection] - config: SnowflakeV2Config - report: SnowflakeV2Report - - def create_connection(self) -> Optional[SnowflakeConnection]: - ... - - def report_error(self, key: str, reason: str) -> None: - ... - - -class SnowflakeConnectionMixin: - def get_connection(self: SnowflakeConnectionProtocol) -> SnowflakeConnection: - if self.connection is None: - # Ideally this is never called here - self.logger.info("Did you forget to initialize connection for module?") - self.connection = self.create_connection() - - # Connection is already present by the time its used for query - # Every module initializes the connection or fails and returns - assert self.connection is not None - return self.connection - - # If connection succeeds, return connection, else return None and report failure - def create_connection( - self: SnowflakeConnectionProtocol, - ) -> Optional[SnowflakeConnection]: - try: - conn = self.config.get_connection() - except Exception as e: - logger.debug(e, exc_info=e) - if "not granted to this user" in str(e): - self.report_error( - GENERIC_PERMISSION_ERROR_KEY, - f"Failed to connect with snowflake due to error {e}", - ) - else: - logger.debug(e, exc_info=e) - self.report_error( - "snowflake-connection", - f"Failed to connect to snowflake instance due to error {e}.", - ) - return None - else: - return conn - - def close(self: SnowflakeConnectionProtocol) -> None: - if self.connection is not None and not self.connection.is_closed(): - self.connection.close() - - -def is_permission_error(e: Exception) -> bool: - msg = str(e) - # 002003 (02000): SQL compilation error: Database/SCHEMA 'XXXX' does not exist or not authorized. 
- # Insufficient privileges to operate on database 'XXXX' - return "Insufficient privileges" in msg or "not authorized" in msg diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index f39620b79cfd43..96447b2b2a21b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -7,8 +7,6 @@ from dataclasses import dataclass from typing import Dict, Iterable, List, Optional, Union -from snowflake.connector import SnowflakeConnection - from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -36,6 +34,7 @@ SnowflakeAssertionsHandler, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -52,11 +51,7 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, ) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakeQueryMixin, -) +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, @@ -68,7 +63,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig +from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.ingestion.source_report.ingestion_stage import ( LINEAGE_EXTRACTION, METADATA_EXTRACTION, @@ -119,8 +114,6 @@ supported=True, ) class SnowflakeV2Source( - SnowflakeQueryMixin, - SnowflakeConnectionMixin, SnowflakeCommonMixin, StatefulIngestionSourceBase, TestableSource, @@ -130,7 +123,8 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = SnowflakeV2Report() self.logger = logger - self.connection: Optional[SnowflakeConnection] = None + + self.connection = self.config.get_connection() self.domain_registry: Optional[DomainRegistry] = None if self.config.domain: @@ -139,7 +133,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): ) # For database, schema, tables, views, etc - self.data_dictionary = SnowflakeDataDictionary() + self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) self.lineage_extractor: Optional[SnowflakeLineageExtractor] = None self.aggregator: Optional[SqlParsingAggregator] = None @@ -180,6 +174,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.lineage_extractor = SnowflakeLineageExtractor( config, self.report, + connection=self.connection, dataset_urn_builder=self.gen_dataset_urn, redundant_run_skip_handler=redundant_lineage_run_skip_handler, sql_aggregator=self.aggregator, @@ -200,6 +195,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.usage_extractor = SnowflakeUsageExtractor( config, self.report, + connection=self.connection, dataset_urn_builder=self.gen_dataset_urn, 
redundant_run_skip_handler=redundant_usage_run_skip_handler, ) @@ -232,7 +228,9 @@ def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() try: - connection_conf = BaseSnowflakeConfig.parse_obj_allow_extras(config_dict) + connection_conf = SnowflakeConnectionConfig.parse_obj_allow_extras( + config_dict + ) connection: SnowflakeConnection = connection_conf.get_connection() assert connection @@ -258,7 +256,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: @staticmethod def check_capabilities( - conn: SnowflakeConnection, connection_conf: BaseSnowflakeConfig + conn: SnowflakeConnection, connection_conf: SnowflakeConnectionConfig ) -> Dict[Union[SourceCapability, str], CapabilityReport]: # Currently only overall capabilities are reported. # Resource level variations in capabilities are not considered. @@ -269,19 +267,14 @@ class SnowflakePrivilege: object_name: str object_type: str - def query(query): - logger.info(f"Query : {query}") - resp = conn.cursor().execute(query) - return resp - _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict() privileges: List[SnowflakePrivilege] = [] capabilities: List[SourceCapability] = [c.capability for c in SnowflakeV2Source.get_capabilities() if c.capability not in (SourceCapability.PLATFORM_INSTANCE, SourceCapability.DOMAINS, SourceCapability.DELETION_DETECTION)] # type: ignore - cur = query("select current_role()") + cur = conn.query("select current_role()") current_role = [row[0] for row in cur][0] - cur = query("select current_secondary_roles()") + cur = conn.query("select current_secondary_roles()") secondary_roles_str = json.loads([row[0] for row in cur][0])["roles"] secondary_roles = ( [] if secondary_roles_str == "" else secondary_roles_str.split(",") @@ -298,7 +291,7 @@ def query(query): role = roles[i] i = i + 1 # for some roles, quoting is necessary. 
for example test-role - cur = query(f'show grants to role "{role}"') + cur = conn.query(f'show grants to role "{role}"') for row in cur: privilege = SnowflakePrivilege( privilege=row[1], object_type=row[2], object_name=row[3] @@ -363,7 +356,7 @@ def query(query): ): roles.append(privilege.object_name) - cur = query("select current_warehouse()") + cur = conn.query("select current_warehouse()") current_warehouse = [row[0] for row in cur][0] default_failure_messages = { @@ -425,11 +418,11 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.create_connection() + self.connection = self.config.get_connection() if self.connection is None: return - self.inspect_session_metadata() + self.inspect_session_metadata(self.connection) snowsight_base_url = None if self.config.include_external_url: @@ -499,7 +492,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.include_assertion_results: yield from SnowflakeAssertionsHandler( - self.config, self.report, self.gen_dataset_urn + self.config, self.report, self.connection, self.gen_dataset_urn ).get_assertion_workunits(discovered_datasets) def report_warehouse_failure(self) -> None: @@ -536,22 +529,22 @@ def add_config_to_report(self) -> None: self.config.end_time, ) - def inspect_session_metadata(self) -> None: + def inspect_session_metadata(self, connection: SnowflakeConnection) -> None: try: logger.info("Checking current version") - for db_row in self.query(SnowflakeQuery.current_version()): + for db_row in connection.query(SnowflakeQuery.current_version()): self.report.saas_version = db_row["CURRENT_VERSION()"] except Exception as e: self.report_error("version", f"Error: {e}") try: logger.info("Checking current role") - for db_row in self.query(SnowflakeQuery.current_role()): + for db_row in connection.query(SnowflakeQuery.current_role()): self.report.role = db_row["CURRENT_ROLE()"] except Exception as e: self.report_error("version", f"Error: {e}") try: logger.info("Checking current warehouse") - for db_row in self.query(SnowflakeQuery.current_warehouse()): + for db_row in connection.query(SnowflakeQuery.current_warehouse()): self.report.default_warehouse = db_row["CURRENT_WAREHOUSE()"] except Exception as e: self.report_error("current_warehouse", f"Error: {e}") @@ -568,10 +561,10 @@ def inspect_session_metadata(self) -> None: def get_snowsight_base_url(self) -> Optional[str]: try: # See https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#finding-the-region-and-locator-for-an-account - for db_row in self.query(SnowflakeQuery.current_account()): + for db_row in self.connection.query(SnowflakeQuery.current_account()): account_locator = db_row["CURRENT_ACCOUNT()"] - for db_row in self.query(SnowflakeQuery.current_region()): + for db_row in self.connection.query(SnowflakeQuery.current_region()): region = db_row["CURRENT_REGION()"] self.report.account_locator = account_locator @@ -604,7 +597,7 @@ def get_snowsight_base_url(self) -> Optional[str]: def is_standard_edition(self) -> bool: try: - self.query(SnowflakeQuery.show_tags()) + self.connection.query(SnowflakeQuery.show_tags()) return False except Exception as e: if "Unsupported feature 'TAG'" in str(e): diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 521e755b6a00c5..5aec5b80810aac 100644 
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -12,7 +12,11 @@ OAUTH_AUTHENTICATOR, ) -from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.common import ( + AllowDenyPattern, + ConfigModel, + ConfigurationError, +) from datahub.configuration.connection_resolver import auto_connection_resolver from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.time_window_config import BaseTimeWindowConfig @@ -21,6 +25,10 @@ CLIENT_PREFETCH_THREADS, CLIENT_SESSION_KEEP_ALIVE, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakePermissionError, +) from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.utilities.config_clean import ( @@ -43,7 +51,7 @@ SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" -class BaseSnowflakeConfig(ConfigModel): +class SnowflakeConnectionConfig(ConfigModel): # Note: this config model is also used by the snowflake-usage source. _connection = auto_connection_resolver() @@ -310,7 +318,7 @@ def get_key_pair_connection(self) -> snowflake.connector.SnowflakeConnection: **connect_args, ) - def get_connection(self) -> snowflake.connector.SnowflakeConnection: + def get_native_connection(self) -> snowflake.connector.SnowflakeConnection: connect_args = self.get_options()["connect_args"] if self.authentication_type == "DEFAULT_AUTHENTICATOR": return snowflake.connector.connect( @@ -341,8 +349,23 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: # not expected to be here raise Exception("Not expected to be here.") + def get_connection(self) -> SnowflakeConnection: + try: + return SnowflakeConnection(self.get_native_connection()) + except Exception as e: + logger.debug(e, exc_info=e) + + if "not granted to this user" in str(e): + raise SnowflakePermissionError( + f"Permissions error when connecting to snowflake: {e}" + ) from e + + raise ConfigurationError( + f"Failed to connect to snowflake instance: {e}" + ) from e + -class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig): +class SnowflakeConfig(SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommonConfig): include_table_lineage: bool = pydantic.Field( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. 
Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 23f5c10b10f8e8..2b7bf6a29e1adb 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -1,11 +1,12 @@ from datetime import datetime, timezone from unittest import mock +import pytest from freezegun import freeze_time from pytest import fixture from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig from datahub.ingestion.source.snowflake import snowflake_query from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config @@ -72,9 +73,10 @@ def test_snowflake_missing_role_access_causes_pipeline_failure( "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. PUBLIC" ) - pipeline = Pipeline(snowflake_pipeline_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() + with pytest.raises(PipelineInitError, match="Permissions error"): + pipeline = Pipeline(snowflake_pipeline_config) + pipeline.run() + pipeline.raise_from_status() @freeze_time(FROZEN_TIME) From 890d63f126dee75a4930b051fc96f166ff840f2e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jul 2024 17:40:51 -0700 Subject: [PATCH 07/32] remove SnowflakeLoggingProtocol --- .../ingestion/source/snowflake/snowflake_usage_v2.py | 8 ++++---- .../ingestion/source/snowflake/snowflake_utils.py | 9 +-------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index ed483d99a23b49..b50764a4231656 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -286,7 +286,7 @@ def build_usage_statistics_for_dataset( f"Failed to parse usage statistics for dataset {dataset_identifier} due to error {e}.", exc_info=e, ) - self.report_warning( + self.report.warning( "Failed to parse usage statistics for dataset", dataset_identifier ) @@ -404,7 +404,7 @@ def _check_usage_date_ranges(self) -> None: ) else: logger.debug(e, exc_info=e) - self.report_warning( + self.report.warning( "usage", f"Extracting the date range for usage data from Snowflake failed due to error {e}.", ) @@ -416,7 +416,7 @@ def _check_usage_date_ranges(self) -> None: or db_row["MIN_TIME"] is None or db_row["MAX_TIME"] is None ): - self.report_warning( + self.report.warning( "check-usage-data", f"Missing data for access_history {db_row}.", ) @@ -502,7 +502,7 @@ def _process_snowflake_history_row( yield event except Exception as e: self.report.rows_parsing_error += 1 - self.report_warning( + self.report.warning( "operation", f"Failed to parse operation history row {event_dict}, {e}", ) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 3e348e08804a9c..b4405f3431d2aa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,4 +1,3 @@ -import logging from typing import Optional from typing_extensions import Protocol @@ -13,16 +12,10 @@ from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -logger: logging.Logger = logging.getLogger(__name__) - # Required only for mypy, since we are using mixin classes, and not inheritance. # Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes -class SnowflakeLoggingProtocol(Protocol): - logger: logging.Logger - - -class SnowflakeCommonProtocol(SnowflakeLoggingProtocol, Protocol): +class SnowflakeCommonProtocol(Protocol): platform: str = "snowflake" config: SnowflakeV2Config From ae86094f944abacc512d8c8c029fe70e8cbd65c0 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jul 2024 17:42:14 -0700 Subject: [PATCH 08/32] make snowflake_connection have config info + stop using source_config/... --- .../ingestion/source/fivetran/config.py | 4 +- .../source/snowflake/snowflake_config.py | 40 +- .../source/snowflake/snowflake_connection.py | 361 +++++++++++++++- .../source/snowflake/snowflake_queries.py | 4 +- .../source/snowflake/snowflake_summary.py | 4 +- .../source/snowflake/snowflake_v2.py | 6 +- .../ingestion/source_config/sql/__init__.py | 0 .../ingestion/source_config/sql/snowflake.py | 395 ------------------ 8 files changed, 404 insertions(+), 410 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py delete mode 100644 metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 46780b9eef1365..f8b1c6dd93d6d9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -13,6 +13,9 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryConnectionConfig, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, StatefulStaleMetadataRemovalConfig, @@ -20,7 +23,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) -from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 907cde64faf7b8..dfe860b250c009 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -4,24 +4,26 @@ from enum import Enum from typing import Dict, List, Optional, Set, cast +import pydantic from pydantic import Field, SecretStr, root_validator, validator from 
datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX +from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.glossary.classification_mixin import ( ClassificationSourceConfigMixin, ) +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, StatefulProfilingConfigMixin, StatefulUsageConfigMixin, ) -from datahub.ingestion.source_config.sql.snowflake import ( - SnowflakeConfig, - SnowflakeConnectionConfig, -) from datahub.ingestion.source_config.usage.snowflake_usage import SnowflakeUsageConfig from datahub.utilities.global_warning_util import add_global_warning @@ -73,6 +75,36 @@ def source_database(self) -> DatabaseId: return DatabaseId(self.database, self.platform_instance) +class SnowflakeConfig(SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommonConfig): + include_table_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", + ) + include_view_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", + ) + + database_pattern: AllowDenyPattern = AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] + ) + + ignore_start_time_lineage: bool = False + upstream_lineage_in_report: bool = False + + @pydantic.root_validator(skip_on_failure=True) + def validate_include_view_lineage(cls, values): + if ( + "include_table_lineage" in values + and not values.get("include_table_lineage") + and values.get("include_view_lineage") + ): + raise ValueError( + "include_table_lineage must be True for include_view_lineage to be set." 
+ ) + return values + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index d1424ff2dd783a..e981ed3e2e6650 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -1,13 +1,366 @@ import logging -from typing import Any +from typing import Any, Dict, Optional +import pydantic +import snowflake.connector +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import serialization from snowflake.connector import SnowflakeConnection as NativeSnowflakeConnection from snowflake.connector.cursor import DictCursor +from snowflake.connector.network import ( + DEFAULT_AUTHENTICATOR, + EXTERNAL_BROWSER_AUTHENTICATOR, + KEY_PAIR_AUTHENTICATOR, + OAUTH_AUTHENTICATOR, +) -from datahub.configuration.common import MetaError +from datahub.configuration.common import ConfigModel, ConfigurationError, MetaError +from datahub.configuration.connection_resolver import auto_connection_resolver +from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider +from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.source.snowflake.constants import ( + CLIENT_PREFETCH_THREADS, + CLIENT_SESSION_KEEP_ALIVE, +) +from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator +from datahub.ingestion.source.sql.sql_config import make_sqlalchemy_uri +from datahub.utilities.config_clean import ( + remove_protocol, + remove_suffix, + remove_trailing_slashes, +) logger = logging.getLogger(__name__) +_APPLICATION_NAME: str = "acryl_datahub" + +_VALID_AUTH_TYPES: Dict[str, str] = { + "DEFAULT_AUTHENTICATOR": DEFAULT_AUTHENTICATOR, + "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR, + "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR, + "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR, +} + +_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" + + +class SnowflakePermissionError(MetaError): + """A permission error has happened""" + + +class SnowflakeConnectionConfig(ConfigModel): + # Note: this config model is also used by the snowflake-usage source. + + _connection = auto_connection_resolver() + + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) + + scheme: str = "snowflake" + username: Optional[str] = pydantic.Field( + default=None, description="Snowflake username." + ) + password: Optional[pydantic.SecretStr] = pydantic.Field( + default=None, exclude=True, description="Snowflake password." + ) + private_key: Optional[str] = pydantic.Field( + default=None, + description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' if using key pair authentication. Encrypted version of private key will be in a form of '-----BEGIN ENCRYPTED PRIVATE KEY-----\\nencrypted-private-key\\n-----END ENCRYPTED PRIVATE KEY-----\\n' See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", + ) + + private_key_path: Optional[str] = pydantic.Field( + default=None, + description="The path to the private key if using key pair authentication. 
Ignored if `private_key` is set. See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", + ) + private_key_password: Optional[pydantic.SecretStr] = pydantic.Field( + default=None, + exclude=True, + description="Password for your private key. Required if using key pair authentication with encrypted private key.", + ) + + oauth_config: Optional[OAuthConfiguration] = pydantic.Field( + default=None, + description="oauth configuration - https://docs.snowflake.com/en/user-guide/python-connector-example.html#connecting-with-oauth", + ) + authentication_type: str = pydantic.Field( + default="DEFAULT_AUTHENTICATOR", + description='The type of authenticator to use when connecting to Snowflake. Supports "DEFAULT_AUTHENTICATOR", "OAUTH_AUTHENTICATOR", "EXTERNAL_BROWSER_AUTHENTICATOR" and "KEY_PAIR_AUTHENTICATOR".', + ) + account_id: str = pydantic.Field( + description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details.", + ) + warehouse: Optional[str] = pydantic.Field( + default=None, description="Snowflake warehouse." + ) + role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.") + connect_args: Optional[Dict[str, Any]] = pydantic.Field( + default=None, + description="Connect args to pass to Snowflake SqlAlchemy driver", + exclude=True, + ) + + def get_account(self) -> str: + assert self.account_id + return self.account_id + + rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id") + + @pydantic.validator("account_id") + def validate_account_id(cls, account_id: str) -> str: + account_id = remove_protocol(account_id) + account_id = remove_trailing_slashes(account_id) + account_id = remove_suffix(account_id, _SNOWFLAKE_HOST_SUFFIX) + return account_id + + @pydantic.validator("authentication_type", always=True) + def authenticator_type_is_valid(cls, v, values): + if v not in _VALID_AUTH_TYPES.keys(): + raise ValueError( + f"unsupported authenticator type '{v}' was provided," + f" use one of {list(_VALID_AUTH_TYPES.keys())}" + ) + if ( + values.get("private_key") is not None + or values.get("private_key_path") is not None + ) and v != "KEY_PAIR_AUTHENTICATOR": + raise ValueError( + f"Either `private_key` and `private_key_path` is set but `authentication_type` is {v}. " + f"Should be set to 'KEY_PAIR_AUTHENTICATOR' when using key pair authentication" + ) + if v == "KEY_PAIR_AUTHENTICATOR": + # If we are using key pair auth, we need the private key path and password to be set + if ( + values.get("private_key") is None + and values.get("private_key_path") is None + ): + raise ValueError( + f"Both `private_key` and `private_key_path` are none. " + f"At least one should be set when using {v} authentication" + ) + elif v == "OAUTH_AUTHENTICATOR": + cls._check_oauth_config(values.get("oauth_config")) + logger.info(f"using authenticator type '{v}'") + return v + + @staticmethod + def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: + if oauth_config is None: + raise ValueError( + "'oauth_config' is none but should be set when using OAUTH_AUTHENTICATOR authentication" + ) + if oauth_config.use_certificate is True: + if oauth_config.provider == OAuthIdentityProvider.OKTA: + raise ValueError( + "Certificate authentication is not supported for Okta." 
+ ) + if oauth_config.encoded_oauth_private_key is None: + raise ValueError( + "'base64_encoded_oauth_private_key' was none " + "but should be set when using certificate for oauth_config" + ) + if oauth_config.encoded_oauth_public_key is None: + raise ValueError( + "'base64_encoded_oauth_public_key' was none" + "but should be set when using use_certificate true for oauth_config" + ) + elif oauth_config.client_secret is None: + raise ValueError( + "'oauth_config.client_secret' was none " + "but should be set when using use_certificate false for oauth_config" + ) + + def get_sql_alchemy_url( + self, + database: Optional[str] = None, + username: Optional[str] = None, + password: Optional[pydantic.SecretStr] = None, + role: Optional[str] = None, + ) -> str: + if username is None: + username = self.username + if password is None: + password = self.password + if role is None: + role = self.role + return make_sqlalchemy_uri( + self.scheme, + username, + password.get_secret_value() if password else None, + self.account_id, + f'"{database}"' if database is not None else database, + uri_opts={ + # Drop the options if value is None. + key: value + for (key, value) in { + "authenticator": _VALID_AUTH_TYPES.get(self.authentication_type), + "warehouse": self.warehouse, + "role": role, + "application": _APPLICATION_NAME, + }.items() + if value + }, + ) + + _computed_connect_args: Optional[dict] = None + + def get_connect_args(self) -> dict: + """ + Builds connect args, adding defaults and reading a private key from the file if needed. + Caches the results in a private instance variable to avoid reading the file multiple times. + """ + + if self._computed_connect_args is not None: + return self._computed_connect_args + + connect_args: Dict[str, Any] = { + # Improves performance and avoids timeout errors for larger query result + CLIENT_PREFETCH_THREADS: 10, + CLIENT_SESSION_KEEP_ALIVE: True, + # Let user override the default config values + **(self.connect_args or {}), + } + + if ( + "private_key" not in connect_args + and self.authentication_type == "KEY_PAIR_AUTHENTICATOR" + ): + if self.private_key is not None: + pkey_bytes = self.private_key.replace("\\n", "\n").encode() + else: + assert ( + self.private_key_path + ), "missing required private key path to read key from" + with open(self.private_key_path, "rb") as key: + pkey_bytes = key.read() + + p_key = serialization.load_pem_private_key( + pkey_bytes, + password=self.private_key_password.get_secret_value().encode() + if self.private_key_password is not None + else None, + backend=default_backend(), + ) + + pkb: bytes = p_key.private_bytes( + encoding=serialization.Encoding.DER, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ) + + connect_args["private_key"] = pkb + + self._computed_connect_args = connect_args + return connect_args + + def get_options(self) -> dict: + options_connect_args: Dict = self.get_connect_args() + options_connect_args.update(self.options.get("connect_args", {})) + self.options["connect_args"] = options_connect_args + return self.options + + def get_oauth_connection(self) -> NativeSnowflakeConnection: + assert ( + self.oauth_config + ), "oauth_config should be provided if using oauth based authentication" + generator = OAuthTokenGenerator( + client_id=self.oauth_config.client_id, + authority_url=self.oauth_config.authority_url, + provider=self.oauth_config.provider, + username=self.username, + password=self.password, + ) + if self.oauth_config.use_certificate: + response = 
generator.get_token_with_certificate( + private_key_content=str(self.oauth_config.encoded_oauth_public_key), + public_key_content=str(self.oauth_config.encoded_oauth_private_key), + scopes=self.oauth_config.scopes, + ) + else: + assert self.oauth_config.client_secret + response = generator.get_token_with_secret( + secret=str(self.oauth_config.client_secret.get_secret_value()), + scopes=self.oauth_config.scopes, + ) + try: + token = response["access_token"] + except KeyError: + raise ValueError( + f"access_token not found in response {response}. " + "Please check your OAuth configuration." + ) + connect_args = self.get_options()["connect_args"] + return snowflake.connector.connect( + user=self.username, + account=self.account_id, + token=token, + role=self.role, + warehouse=self.warehouse, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, + **connect_args, + ) + + def get_key_pair_connection(self) -> NativeSnowflakeConnection: + connect_args = self.get_options()["connect_args"] + + return snowflake.connector.connect( + user=self.username, + account=self.account_id, + warehouse=self.warehouse, + role=self.role, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, + **connect_args, + ) + + def get_native_connection(self) -> NativeSnowflakeConnection: + connect_args = self.get_options()["connect_args"] + if self.authentication_type == "DEFAULT_AUTHENTICATOR": + return snowflake.connector.connect( + user=self.username, + password=self.password.get_secret_value() if self.password else None, + account=self.account_id, + warehouse=self.warehouse, + role=self.role, + application=_APPLICATION_NAME, + **connect_args, + ) + elif self.authentication_type == "OAUTH_AUTHENTICATOR": + return self.get_oauth_connection() + elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR": + return self.get_key_pair_connection() + elif self.authentication_type == "EXTERNAL_BROWSER_AUTHENTICATOR": + return snowflake.connector.connect( + user=self.username, + password=self.password.get_secret_value() if self.password else None, + account=self.account_id, + warehouse=self.warehouse, + role=self.role, + authenticator=_VALID_AUTH_TYPES.get(self.authentication_type), + application=_APPLICATION_NAME, + **connect_args, + ) + else: + # not expected to be here + raise Exception("Not expected to be here.") + + def get_connection(self) -> "SnowflakeConnection": + try: + return SnowflakeConnection(self.get_native_connection()) + except Exception as e: + logger.debug(e, exc_info=e) + + if "not granted to this user" in str(e): + raise SnowflakePermissionError( + f"Permissions error when connecting to snowflake: {e}" + ) from e + + raise ConfigurationError( + f"Failed to connect to snowflake instance: {e}" + ) from e + class SnowflakeConnection: _connection: NativeSnowflakeConnection @@ -41,7 +394,3 @@ def _is_permission_error(e: Exception) -> bool: # 002003 (02000): SQL compilation error: Database/SCHEMA 'XXXX' does not exist or not authorized. 
# Insufficient privileges to operate on database 'XXXX' return "Insufficient privileges" in msg or "not authorized" in msg - - -class SnowflakePermissionError(MetaError): - """A permission error has happened""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index f5f8cb9cf1c578..e26555cada5a87 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -23,10 +23,12 @@ from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.usage.usage_common import BaseUsageConfig -from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.metadata._urns.urn_defs import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import ( KnownLineageMapping, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index 1706ea88bd9ea1..9094e9c9feee46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -12,6 +12,9 @@ from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnectionConfig, +) from datahub.ingestion.source.snowflake.snowflake_schema import ( SnowflakeDatabase, SnowflakeDataDictionary, @@ -20,7 +23,6 @@ SnowflakeSchemaGenerator, ) from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin -from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyList diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 96447b2b2a21b9..18cff222cffabb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -34,7 +34,10 @@ SnowflakeAssertionsHandler, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection +from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, + SnowflakeConnectionConfig, +) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -63,7 +66,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.ingestion.source_config.sql.snowflake import SnowflakeConnectionConfig 
from datahub.ingestion.source_report.ingestion_stage import ( LINEAGE_EXTRACTION, METADATA_EXTRACTION, diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/__init__.py deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py deleted file mode 100644 index 5aec5b80810aac..00000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ /dev/null @@ -1,395 +0,0 @@ -import logging -from typing import Any, Dict, Optional - -import pydantic -import snowflake.connector -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import serialization -from snowflake.connector.network import ( - DEFAULT_AUTHENTICATOR, - EXTERNAL_BROWSER_AUTHENTICATOR, - KEY_PAIR_AUTHENTICATOR, - OAUTH_AUTHENTICATOR, -) - -from datahub.configuration.common import ( - AllowDenyPattern, - ConfigModel, - ConfigurationError, -) -from datahub.configuration.connection_resolver import auto_connection_resolver -from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider -from datahub.configuration.time_window_config import BaseTimeWindowConfig -from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.ingestion.source.snowflake.constants import ( - CLIENT_PREFETCH_THREADS, - CLIENT_SESSION_KEEP_ALIVE, -) -from datahub.ingestion.source.snowflake.snowflake_connection import ( - SnowflakeConnection, - SnowflakePermissionError, -) -from datahub.ingestion.source.sql.oauth_generator import OAuthTokenGenerator -from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri -from datahub.utilities.config_clean import ( - remove_protocol, - remove_suffix, - remove_trailing_slashes, -) - -logger: logging.Logger = logging.getLogger(__name__) - -APPLICATION_NAME: str = "acryl_datahub" - -VALID_AUTH_TYPES: Dict[str, str] = { - "DEFAULT_AUTHENTICATOR": DEFAULT_AUTHENTICATOR, - "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR, - "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR, - "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR, -} - -SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" - - -class SnowflakeConnectionConfig(ConfigModel): - # Note: this config model is also used by the snowflake-usage source. - - _connection = auto_connection_resolver() - - options: dict = pydantic.Field( - default_factory=dict, - description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", - ) - - scheme: str = "snowflake" - username: Optional[str] = pydantic.Field( - default=None, description="Snowflake username." - ) - password: Optional[pydantic.SecretStr] = pydantic.Field( - default=None, exclude=True, description="Snowflake password." - ) - private_key: Optional[str] = pydantic.Field( - default=None, - description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' if using key pair authentication. 
Encrypted version of private key will be in a form of '-----BEGIN ENCRYPTED PRIVATE KEY-----\\nencrypted-private-key\\n-----END ENCRYPTED PRIVATE KEY-----\\n' See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", - ) - - private_key_path: Optional[str] = pydantic.Field( - default=None, - description="The path to the private key if using key pair authentication. Ignored if `private_key` is set. See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", - ) - private_key_password: Optional[pydantic.SecretStr] = pydantic.Field( - default=None, - exclude=True, - description="Password for your private key. Required if using key pair authentication with encrypted private key.", - ) - - oauth_config: Optional[OAuthConfiguration] = pydantic.Field( - default=None, - description="oauth configuration - https://docs.snowflake.com/en/user-guide/python-connector-example.html#connecting-with-oauth", - ) - authentication_type: str = pydantic.Field( - default="DEFAULT_AUTHENTICATOR", - description='The type of authenticator to use when connecting to Snowflake. Supports "DEFAULT_AUTHENTICATOR", "OAUTH_AUTHENTICATOR", "EXTERNAL_BROWSER_AUTHENTICATOR" and "KEY_PAIR_AUTHENTICATOR".', - ) - account_id: str = pydantic.Field( - description="Snowflake account identifier. e.g. xy12345, xy12345.us-east-2.aws, xy12345.us-central1.gcp, xy12345.central-us.azure, xy12345.us-west-2.privatelink. Refer [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#format-2-legacy-account-locator-in-a-region) for more details.", - ) - warehouse: Optional[str] = pydantic.Field( - default=None, description="Snowflake warehouse." - ) - role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.") - connect_args: Optional[Dict[str, Any]] = pydantic.Field( - default=None, - description="Connect args to pass to Snowflake SqlAlchemy driver", - exclude=True, - ) - - def get_account(self) -> str: - assert self.account_id - return self.account_id - - rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id") - - @pydantic.validator("account_id") - def validate_account_id(cls, account_id: str) -> str: - account_id = remove_protocol(account_id) - account_id = remove_trailing_slashes(account_id) - account_id = remove_suffix(account_id, SNOWFLAKE_HOST_SUFFIX) - return account_id - - @pydantic.validator("authentication_type", always=True) - def authenticator_type_is_valid(cls, v, values): - if v not in VALID_AUTH_TYPES.keys(): - raise ValueError( - f"unsupported authenticator type '{v}' was provided," - f" use one of {list(VALID_AUTH_TYPES.keys())}" - ) - if ( - values.get("private_key") is not None - or values.get("private_key_path") is not None - ) and v != "KEY_PAIR_AUTHENTICATOR": - raise ValueError( - f"Either `private_key` and `private_key_path` is set but `authentication_type` is {v}. " - f"Should be set to 'KEY_PAIR_AUTHENTICATOR' when using key pair authentication" - ) - if v == "KEY_PAIR_AUTHENTICATOR": - # If we are using key pair auth, we need the private key path and password to be set - if ( - values.get("private_key") is None - and values.get("private_key_path") is None - ): - raise ValueError( - f"Both `private_key` and `private_key_path` are none. 
" - f"At least one should be set when using {v} authentication" - ) - elif v == "OAUTH_AUTHENTICATOR": - cls._check_oauth_config(values.get("oauth_config")) - logger.info(f"using authenticator type '{v}'") - return v - - @staticmethod - def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: - if oauth_config is None: - raise ValueError( - "'oauth_config' is none but should be set when using OAUTH_AUTHENTICATOR authentication" - ) - if oauth_config.use_certificate is True: - if oauth_config.provider == OAuthIdentityProvider.OKTA: - raise ValueError( - "Certificate authentication is not supported for Okta." - ) - if oauth_config.encoded_oauth_private_key is None: - raise ValueError( - "'base64_encoded_oauth_private_key' was none " - "but should be set when using certificate for oauth_config" - ) - if oauth_config.encoded_oauth_public_key is None: - raise ValueError( - "'base64_encoded_oauth_public_key' was none" - "but should be set when using use_certificate true for oauth_config" - ) - elif oauth_config.client_secret is None: - raise ValueError( - "'oauth_config.client_secret' was none " - "but should be set when using use_certificate false for oauth_config" - ) - - def get_sql_alchemy_url( - self, - database: Optional[str] = None, - username: Optional[str] = None, - password: Optional[pydantic.SecretStr] = None, - role: Optional[str] = None, - ) -> str: - if username is None: - username = self.username - if password is None: - password = self.password - if role is None: - role = self.role - return make_sqlalchemy_uri( - self.scheme, - username, - password.get_secret_value() if password else None, - self.account_id, - f'"{database}"' if database is not None else database, - uri_opts={ - # Drop the options if value is None. - key: value - for (key, value) in { - "authenticator": VALID_AUTH_TYPES.get(self.authentication_type), - "warehouse": self.warehouse, - "role": role, - "application": APPLICATION_NAME, - }.items() - if value - }, - ) - - _computed_connect_args: Optional[dict] = None - - def get_connect_args(self) -> dict: - """ - Builds connect args, adding defaults and reading a private key from the file if needed. - Caches the results in a private instance variable to avoid reading the file multiple times. 
- """ - - if self._computed_connect_args is not None: - return self._computed_connect_args - - connect_args: Dict[str, Any] = { - # Improves performance and avoids timeout errors for larger query result - CLIENT_PREFETCH_THREADS: 10, - CLIENT_SESSION_KEEP_ALIVE: True, - # Let user override the default config values - **(self.connect_args or {}), - } - - if ( - "private_key" not in connect_args - and self.authentication_type == "KEY_PAIR_AUTHENTICATOR" - ): - if self.private_key is not None: - pkey_bytes = self.private_key.replace("\\n", "\n").encode() - else: - assert ( - self.private_key_path - ), "missing required private key path to read key from" - with open(self.private_key_path, "rb") as key: - pkey_bytes = key.read() - - p_key = serialization.load_pem_private_key( - pkey_bytes, - password=self.private_key_password.get_secret_value().encode() - if self.private_key_password is not None - else None, - backend=default_backend(), - ) - - pkb: bytes = p_key.private_bytes( - encoding=serialization.Encoding.DER, - format=serialization.PrivateFormat.PKCS8, - encryption_algorithm=serialization.NoEncryption(), - ) - - connect_args["private_key"] = pkb - - self._computed_connect_args = connect_args - return connect_args - - def get_options(self) -> dict: - options_connect_args: Dict = self.get_connect_args() - options_connect_args.update(self.options.get("connect_args", {})) - self.options["connect_args"] = options_connect_args - return self.options - - def get_oauth_connection(self) -> snowflake.connector.SnowflakeConnection: - assert ( - self.oauth_config - ), "oauth_config should be provided if using oauth based authentication" - generator = OAuthTokenGenerator( - client_id=self.oauth_config.client_id, - authority_url=self.oauth_config.authority_url, - provider=self.oauth_config.provider, - username=self.username, - password=self.password, - ) - if self.oauth_config.use_certificate: - response = generator.get_token_with_certificate( - private_key_content=str(self.oauth_config.encoded_oauth_public_key), - public_key_content=str(self.oauth_config.encoded_oauth_private_key), - scopes=self.oauth_config.scopes, - ) - else: - assert self.oauth_config.client_secret - response = generator.get_token_with_secret( - secret=str(self.oauth_config.client_secret.get_secret_value()), - scopes=self.oauth_config.scopes, - ) - try: - token = response["access_token"] - except KeyError: - raise ValueError( - f"access_token not found in response {response}. " - "Please check your OAuth configuration." 
- ) - connect_args = self.get_options()["connect_args"] - return snowflake.connector.connect( - user=self.username, - account=self.account_id, - token=token, - role=self.role, - warehouse=self.warehouse, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, - **connect_args, - ) - - def get_key_pair_connection(self) -> snowflake.connector.SnowflakeConnection: - connect_args = self.get_options()["connect_args"] - - return snowflake.connector.connect( - user=self.username, - account=self.account_id, - warehouse=self.warehouse, - role=self.role, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, - **connect_args, - ) - - def get_native_connection(self) -> snowflake.connector.SnowflakeConnection: - connect_args = self.get_options()["connect_args"] - if self.authentication_type == "DEFAULT_AUTHENTICATOR": - return snowflake.connector.connect( - user=self.username, - password=self.password.get_secret_value() if self.password else None, - account=self.account_id, - warehouse=self.warehouse, - role=self.role, - application=APPLICATION_NAME, - **connect_args, - ) - elif self.authentication_type == "OAUTH_AUTHENTICATOR": - return self.get_oauth_connection() - elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR": - return self.get_key_pair_connection() - elif self.authentication_type == "EXTERNAL_BROWSER_AUTHENTICATOR": - return snowflake.connector.connect( - user=self.username, - password=self.password.get_secret_value() if self.password else None, - account=self.account_id, - warehouse=self.warehouse, - role=self.role, - authenticator=VALID_AUTH_TYPES.get(self.authentication_type), - application=APPLICATION_NAME, - **connect_args, - ) - else: - # not expected to be here - raise Exception("Not expected to be here.") - - def get_connection(self) -> SnowflakeConnection: - try: - return SnowflakeConnection(self.get_native_connection()) - except Exception as e: - logger.debug(e, exc_info=e) - - if "not granted to this user" in str(e): - raise SnowflakePermissionError( - f"Permissions error when connecting to snowflake: {e}" - ) from e - - raise ConfigurationError( - f"Failed to connect to snowflake instance: {e}" - ) from e - - -class SnowflakeConfig(SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommonConfig): - include_table_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", - ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) - - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - - ignore_start_time_lineage: bool = False - upstream_lineage_in_report: bool = False - - @pydantic.root_validator(skip_on_failure=True) - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." 
- ) - return values From 8827d10d92fd14e2c60cd840be5249d5bdfdbfb4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jul 2024 20:02:02 -0700 Subject: [PATCH 09/32] tweak type annotations --- .../datahub/ingestion/source/snowflake/snowflake_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index b4405f3431d2aa..7b61ab81aac292 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple from typing_extensions import Protocol @@ -78,7 +78,10 @@ def create_snowsight_base_url( return url @staticmethod - def get_cloud_region_from_snowflake_region_id(region): + def get_cloud_region_from_snowflake_region_id( + region: str, + ) -> Tuple[str, str]: + cloud: str if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING.keys(): cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region] elif region.startswith(("aws_", "gcp_", "azure_")): From 3d211106d738bb1cded9cf806e22f26ab0b3d05e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 8 Jul 2024 12:22:38 -0700 Subject: [PATCH 10/32] add configurability --- .../source/snowflake/snowflake_queries.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index e26555cada5a87..0c31f8a6829331 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -69,6 +69,12 @@ class SnowflakeQueriesConfig( description="Whether to convert dataset urns to lowercase.", ) + include_lineage: bool = True + include_queries: bool = True + include_usage_statistics: bool = True + include_query_usage_statistics: bool = False + include_operations: bool = True + @dataclass class SnowflakeQueriesReport(SourceReport): @@ -90,18 +96,17 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesConfig): platform_instance=self.config.platform_instance, env=self.config.env, # graph=self.ctx.graph, - # TODO: Make these configurable. 
- generate_lineage=True, - generate_queries=True, - generate_usage_statistics=True, - generate_query_usage_statistics=True, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, usage_config=BaseUsageConfig( bucket_duration=self.config.window.bucket_duration, start_time=self.config.window.start_time, end_time=self.config.window.end_time, # TODO make the rest of the fields configurable ), - generate_operations=True, + generate_operations=self.config.include_operations, format_queries=False, ) self.report.sql_aggregator = self.aggregator.report From 2205961802dbf1761a8a9e9407df0d340fb63f80 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 8 Jul 2024 14:51:02 -0700 Subject: [PATCH 11/32] improve domain filtering --- .../source/snowflake/snowflake_queries.py | 15 +++++++++------ .../source/snowflake/snowflake_query.py | 16 +++++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 0c31f8a6829331..ffd3ff524492b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -227,8 +227,8 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: dataset = self.gen_dataset_urn(self.snowflake_identifier(obj["objectName"])) columns = set() - for column in obj["columns"]: - columns.add(self.snowflake_identifier(column["columnName"])) + for modified_column in obj["columns"]: + columns.add(self.snowflake_identifier(modified_column["columnName"])) upstreams.append(dataset) column_usage[dataset] = columns @@ -243,12 +243,14 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: self.snowflake_identifier(obj["objectName"]) ) column_lineage = [] - for column in obj["columns"]: + for modified_column in obj["columns"]: column_lineage.append( ColumnLineageInfo( downstream=DownstreamColumnRef( dataset=downstream, - column=self.snowflake_identifier(column["columnName"]), + column=self.snowflake_identifier( + modified_column["columnName"] + ), ), upstreams=[ ColumnRef( @@ -259,8 +261,9 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: upstream["columnName"] ), ) - for upstream in column["directSources"] - # TODO Check object domain. 
+ for upstream in modified_column["directSources"] + if upstream["objectDomain"] + in SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS ], ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 9f655b34177fc6..f2eb8efcf2f797 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -28,13 +28,15 @@ def create_deny_regex_sql_filter( class SnowflakeQuery: - ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = ( - "(" - f"'{SnowflakeObjectDomain.TABLE.capitalize()}'," - f"'{SnowflakeObjectDomain.EXTERNAL_TABLE.capitalize()}'," - f"'{SnowflakeObjectDomain.VIEW.capitalize()}'," - f"'{SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize()}'" - ")" + ACCESS_HISTORY_TABLE_VIEW_DOMAINS = { + SnowflakeObjectDomain.TABLE.capitalize(), + SnowflakeObjectDomain.EXTERNAL_TABLE.capitalize(), + SnowflakeObjectDomain.VIEW.capitalize(), + SnowflakeObjectDomain.MATERIALIZED_VIEW.capitalize(), + } + + ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER = "({})".format( + ",".join(f"'{domain}'" for domain in ACCESS_HISTORY_TABLE_VIEW_DOMAINS) ) ACCESS_HISTORY_TABLE_DOMAINS_FILTER = ( "(" From 4edfce298876515b22c4719a23e368af4e9e44b4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 8 Jul 2024 15:07:59 -0700 Subject: [PATCH 12/32] add parser exception handling --- .../ingestion/source/snowflake/snowflake_queries.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index ffd3ff524492b9..363fa23c69e083 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -197,8 +197,16 @@ def fetch_audit_log( for row in resp: assert isinstance(row, dict) - entry = self._parse_audit_log_response(row) - yield entry + try: + entry = self._parse_audit_log_response(row) + except Exception as e: + self.report.warning( + "Error parsing audit log row", + context=f"{e}", + exc=e, + ) + else: + yield entry # HACK: This makes mypy happy with our usage of the mixin methods. 
gen_dataset_urn = SnowflakeCommonMixin.gen_dataset_urn From 8b1eaf16c54ddfd4f38372c3dbb833cea5c2d870 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 09:56:36 -0700 Subject: [PATCH 13/32] add additional lines --- .../datahub/ingestion/source/snowflake/snowflake_queries.py | 5 ++++- .../src/datahub/sql_parsing/sql_parsing_aggregator.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 363fa23c69e083..53ab87ba81b1d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -195,7 +195,10 @@ def fetch_audit_log( conn = self.config.connection.get_connection() resp = conn.query(audit_log_query) - for row in resp: + for i, row in enumerate(resp): + if i % 1000 == 0: + logger.info(f"Processed {i} audit log rows") + assert isinstance(row, dict) try: entry = self._parse_audit_log_response(row) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index e810be58b95ee8..7770417a20bad8 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -218,7 +218,7 @@ class SqlAggregatorReport(Report): schema_resolver_count: Optional[int] = None num_unique_query_fingerprints: Optional[int] = None num_urns_with_lineage: Optional[int] = None - num_queries_entities_generated: int = 0 + num_query_entities_generated: int = 0 # Usage-related. usage_skipped_missing_timestamp: int = 0 @@ -1200,7 +1200,7 @@ def _gen_query( ), ], ) - self.report.num_queries_entities_generated += 1 + self.report.num_query_entities_generated += 1 if self._query_usage_counts is not None: assert self.usage_config is not None From 605279a3ce3775224df8602bb20e8786f39220d6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 14:44:30 -0700 Subject: [PATCH 14/32] refactor snowsight url generation, create SnowsightUrlBuilder --- .../source/snowflake/snowflake_schema_gen.py | 47 ++++++------------- .../source/snowflake/snowflake_utils.py | 40 +++++++++++++--- .../source/snowflake/snowflake_v2.py | 36 +++++++------- .../tests/unit/test_snowflake_source.py | 43 ++++++++--------- 4 files changed, 84 insertions(+), 82 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 00fd597251d595..0c3a2c84362b95 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -53,6 +53,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import ( SnowflakeCommonMixin, SnowflakeCommonProtocol, + SnowsightUrlBuilder, ) from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, @@ -151,7 +152,7 @@ def __init__( domain_registry: Optional[DomainRegistry], profiler: Optional[SnowflakeProfiler], aggregator: Optional[SqlParsingAggregator], - snowsight_base_url: Optional[str], + snowsight_url_builder: Optional[SnowsightUrlBuilder], ) -> None: self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report @@ -169,7 +170,9 @@ def 
__init__( config, self.data_dictionary, self.report ) self.profiler: Optional[SnowflakeProfiler] = profiler - self.snowsight_base_url: Optional[str] = snowsight_base_url + self.snowsight_url_builder: Optional[ + SnowsightUrlBuilder + ] = snowsight_url_builder # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] @@ -770,7 +773,7 @@ def get_dataset_properties( qualifiedName=f"{db_name}.{schema_name}.{table.name}", customProperties={}, externalUrl=( - self.get_external_url_for_table( + self.snowsight_url_builder.get_external_url_for_table( table.name, schema_name, db_name, @@ -780,7 +783,7 @@ def get_dataset_properties( else SnowflakeObjectDomain.VIEW ), ) - if self.config.include_external_url + if self.snowsight_url_builder else None ), ) @@ -909,8 +912,8 @@ def gen_database_containers( domain_registry=self.domain_registry, domain_config=self.config.domain, external_url=( - self.get_external_url_for_database(database.name) - if self.config.include_external_url + self.snowsight_url_builder.get_external_url_for_database(database.name) + if self.snowsight_url_builder else None ), description=database.comment, @@ -965,8 +968,10 @@ def gen_schema_containers( domain_registry=self.domain_registry, description=schema.comment, external_url=( - self.get_external_url_for_schema(schema.name, db_name) - if self.config.include_external_url + self.snowsight_url_builder.get_external_url_for_schema( + schema.name, db_name + ) + if self.snowsight_url_builder else None ), created=( @@ -977,11 +982,7 @@ def gen_schema_containers( last_modified=( int(schema.last_altered.timestamp() * 1000) if schema.last_altered is not None - else ( - int(schema.created.timestamp() * 1000) - if schema.created is not None - else None - ) + else None ), tags=( [self.snowflake_identifier(tag.identifier()) for tag in schema.tags] @@ -1046,23 +1047,3 @@ def get_fk_constraints_for_table( # Access to table but none of its constraints - is this possible ? return constraints.get(table_name, []) - - # domain is either "view" or "table" - def get_external_url_for_table( - self, table_name: str, schema_name: str, db_name: str, domain: str - ) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/" - return None - - def get_external_url_for_schema( - self, schema_name: str, db_name: str - ) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/" - return None - - def get_external_url_for_database(self, db_name: str) -> Optional[str]: - if self.snowsight_base_url is not None: - return f"{self.snowsight_base_url}#/data/databases/{db_name}/" - return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 7b61ab81aac292..fb6d15f6874a83 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import ClassVar, Literal, Optional, Tuple from typing_extensions import Protocol @@ -39,10 +39,8 @@ def report_error(self, key: str, reason: str) -> None: ... 
-class SnowflakeCommonMixin: - platform = "snowflake" - - CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX = [ +class SnowsightUrlBuilder: + CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX: ClassVar = [ "us-west-2", "us-east-1", "eu-west-1", @@ -51,6 +49,14 @@ class SnowflakeCommonMixin: "ap-southeast-2", ] + snowsight_base_url: str + + def __init__(self, account_locator: str, region: str, privatelink: bool = False): + cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id(region) + self.snowsight_base_url = self.create_snowsight_base_url( + account_locator, cloud_region_id, cloud, privatelink + ) + @staticmethod def create_snowsight_base_url( account_locator: str, @@ -66,7 +72,7 @@ def create_snowsight_base_url( # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region if ( cloud_region_id - in SnowflakeCommonMixin.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX + in SnowsightUrlBuilder.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX ): url_cloud_provider_suffix = "" else: @@ -92,6 +98,28 @@ def get_cloud_region_from_snowflake_region_id( raise Exception(f"Unknown snowflake region {region}") return cloud, cloud_region_id + # domain is either "view" or "table" + def get_external_url_for_table( + self, + table_name: str, + schema_name: str, + db_name: str, + domain: Literal[SnowflakeObjectDomain.TABLE, SnowflakeObjectDomain.VIEW], + ) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/{domain}/{table_name}/" + + def get_external_url_for_schema( + self, schema_name: str, db_name: str + ) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/schemas/{schema_name}/" + + def get_external_url_for_database(self, db_name: str) -> Optional[str]: + return f"{self.snowsight_base_url}#/data/databases/{db_name}/" + + +class SnowflakeCommonMixin: + platform = "snowflake" + def _is_dataset_pattern_allowed( self: SnowflakeCommonProtocol, dataset_name: Optional[str], diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 18cff222cffabb..e4b9f73125d1c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -54,7 +54,10 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeCommonMixin, + SnowsightUrlBuilder, +) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, @@ -426,9 +429,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.inspect_session_metadata(self.connection) - snowsight_base_url = None + snowsight_url_builder = None if self.config.include_external_url: - snowsight_base_url = self.get_snowsight_base_url() + snowsight_url_builder = self.get_snowsight_url_builder() if self.report.default_warehouse is None: self.report_warehouse_failure() @@ -441,7 +444,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: domain_registry=self.domain_registry, profiler=self.profiler, aggregator=self.aggregator, - snowsight_base_url=snowsight_base_url, + snowsight_url_builder=snowsight_url_builder, ) 
self.report.set_ingestion_stage("*", METADATA_EXTRACTION) @@ -560,7 +563,7 @@ def inspect_session_metadata(self, connection: SnowflakeConnection) -> None: except Exception: self.report.edition = None - def get_snowsight_base_url(self) -> Optional[str]: + def get_snowsight_url_builder(self) -> Optional[SnowsightUrlBuilder]: try: # See https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#finding-the-region-and-locator-for-an-account for db_row in self.connection.query(SnowflakeQuery.current_account()): @@ -576,24 +579,19 @@ def get_snowsight_base_url(self) -> Optional[str]: region = region.split(".")[-1].lower() account_locator = account_locator.lower() - cloud, cloud_region_id = self.get_cloud_region_from_snowflake_region_id( - region - ) - - # For privatelink, account identifier ends with .privatelink - # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls - return self.create_snowsight_base_url( + return SnowsightUrlBuilder( account_locator, - cloud_region_id, - cloud, - self.config.account_id.endswith(".privatelink"), # type:ignore + region, + # For privatelink, account identifier ends with .privatelink + # See https://docs.snowflake.com/en/user-guide/organizations-connect.html#private-connectivity-urls + privatelink=self.config.account_id.endswith(".privatelink"), ) except Exception as e: - self.warn( - self.logger, - "snowsight url", - f"unable to get snowsight base url due to an error -> {e}", + self.report.warning( + title="External URL Generation Failed", + message="We were unable to infer the Snowsight base URL for your Snowflake account. External URLs will not be generated.", + exc=e, ) return None diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 69a7510692df1d..2d9be91d94deb6 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -24,7 +24,7 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeObjectAccessEntry, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import SnowsightUrlBuilder from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source from tests.test_helpers import test_connection_helpers @@ -445,7 +445,9 @@ def test_aws_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("aws_ca_central_1") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "aws_ca_central_1" + ) assert cloud == SnowflakeCloudProvider.AWS assert cloud_region_id == "ca-central-1" @@ -453,7 +455,9 @@ def test_aws_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("aws_us_east_1_gov") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "aws_us_east_1_gov" + ) assert cloud == SnowflakeCloudProvider.AWS assert cloud_region_id == "us-east-1" @@ -463,7 +467,9 @@ def test_google_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id("gcp_europe_west2") + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( + "gcp_europe_west2" + ) assert cloud == SnowflakeCloudProvider.GCP assert cloud_region_id == "europe-west2" @@ -473,7 +479,7 @@ def 
test_azure_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "azure_switzerlandnorth" ) @@ -483,7 +489,7 @@ def test_azure_cloud_region_from_snowflake_region_id(): ( cloud, cloud_region_id, - ) = SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + ) = SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "azure_centralindia" ) @@ -493,7 +499,7 @@ def test_azure_cloud_region_from_snowflake_region_id(): def test_unknown_cloud_region_from_snowflake_region_id(): with pytest.raises(Exception, match="Unknown snowflake region"): - SnowflakeV2Source.get_cloud_region_from_snowflake_region_id( + SnowsightUrlBuilder.get_cloud_region_from_snowflake_region_id( "somecloud_someregion" ) @@ -588,26 +594,15 @@ def test_email_filter_query_generation_with_case_insensitive_filter(): def test_create_snowsight_base_url_us_west(): - ( - cloud, - cloud_region_id, - ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id("aws_us_west_2") - - result = SnowflakeCommonMixin.create_snowsight_base_url( - "account_locator", cloud_region_id, cloud, False - ) + result = SnowsightUrlBuilder( + "account_locator", "aws_us_west_2", privatelink=False + ).snowsight_base_url assert result == "https://app.snowflake.com/us-west-2/account_locator/" def test_create_snowsight_base_url_ap_northeast_1(): - ( - cloud, - cloud_region_id, - ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id( - "aws_ap_northeast_1" - ) + result = SnowsightUrlBuilder( + "account_locator", "aws_ap_northeast_1", privatelink=False + ).snowsight_base_url - result = SnowflakeCommonMixin.create_snowsight_base_url( - "account_locator", cloud_region_id, cloud, False - ) assert result == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/" From de8f10887da630be0a3a499c71a15c85170ff2a5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 14:53:20 -0700 Subject: [PATCH 15/32] add schema fields to query subjects --- .../snowflake/snowflake_golden.json | 473 +++++++++++++++++- .../snowflake_privatelink_golden.json | 124 ++++- 2 files changed, 570 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index 82b29c051114a7..5cba4e8b338223 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -4513,9 +4513,6 @@ "aspect": { "json": { "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" - }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" }, @@ -4524,13 +4521,49 @@ }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,other_db.other_schema.table_1,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5102,13 +5135,43 @@ }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5136,18 +5199,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_2)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5659,18 +5752,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -5862,18 +5985,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_1)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6265,18 +6418,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6674,18 +6857,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6827,18 +7040,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6850,18 +7093,78 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -6962,18 +7265,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_2)" + 
}, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -7099,18 +7432,48 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, @@ -7122,18 +7485,78 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_10)" } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2024_03_01-16_29_41", + "runId": "snowflake-2024_07_09-14_45_18", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 78d3b920767f72..71a74f883bf1bf 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -3892,11 +3892,71 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_4)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_10)" } ] } @@ -4111,11 +4171,71 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_6)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_10)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_8)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_2)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_3)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_4)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_5)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_6)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_7)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_8)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_9)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_10)" } ] } From 0de982c3ad685127f3016f8648e2aa4d625d42be Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 17:45:16 -0700 Subject: [PATCH 16/32] fix unexpected queries in tests --- .../ingestion/source/snowflake/snowflake_lineage_v2.py | 5 ++++- metadata-ingestion/tests/integration/snowflake/common.py | 8 +++++--- .../integration/snowflake/test_snowflake_failures.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 3e702593a89bb5..80b79240088d2c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -372,7 +372,10 @@ def _fetch_upstream_lineages_for_tables(self) -> Iterable[UpstreamLineageEdge]: start_time_millis=int(self.start_time.timestamp() * 1000), end_time_millis=int(self.end_time.timestamp() * 1000), upstreams_deny_pattern=self.config.temporary_tables_pattern, - 
include_view_lineage=self.config.include_view_lineage, + # The self.config.include_view_lineage setting is about fetching upstreams of views. + # We always generate lineage pointing at views from tables, even if self.config.include_view_lineage is False. + # TODO: Remove this `include_view_lineage` flag, since it's effectively dead code. + include_view_lineage=True, include_column_lineage=self.config.include_column_lineage, ) try: diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 1d3e2c8b95af3a..5ef2eb420b8ed5 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -528,7 +528,7 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=False, + include_view_lineage=True, include_column_lineage=False, ), ): @@ -604,8 +604,10 @@ def default_query_results( # noqa: C901 snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), snowflake_query.SnowflakeQuery.copy_lineage_history( - 1654473600000, - 1654586220000, + start_time_millis=1654473600000, end_time_millis=1654621200000 + ), + snowflake_query.SnowflakeQuery.copy_lineage_history( + start_time_millis=1654473600000, end_time_millis=1654586220000 ), ]: return [] diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 3c3b1aac7601b4..0b838b0bb59c3a 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -227,7 +227,7 @@ def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2( start_time_millis=1654473600000, end_time_millis=1654586220000, - include_view_lineage=False, + include_view_lineage=True, include_column_lineage=True, ) ], From 287d2b622158e589dd6d2facad09d7ce7bce1a7c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 16:55:26 -0700 Subject: [PATCH 17/32] refactor snowflake configs --- .../source/bigquery_v2/bigquery_audit.py | 2 +- .../source/snowflake/snowflake_config.py | 50 +++++++++++++------ .../ingestion/source/sql/sql_config.py | 48 ++++++++++-------- 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 0e7e98b0e5e8f0..f8fcea7c57545c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -192,7 +192,7 @@ def from_string_name(cls, ref: str) -> "BigQueryTableRef": def from_urn(cls, urn: str) -> "BigQueryTableRef": """Raises: ValueError if urn is not a valid BigQuery table URN.""" dataset_urn = DatasetUrn.create_from_string(urn) - split = dataset_urn.get_dataset_name().rsplit(".", 3) + split = dataset_urn.name.rsplit(".", 3) if len(split) == 3: project, dataset, table = split else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 
dfe860b250c009..581be6286216c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -18,7 +18,7 @@ from datahub.ingestion.source.snowflake.snowflake_connection import ( SnowflakeConnectionConfig, ) -from datahub.ingestion.source.sql.sql_config import SQLCommonConfig +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulLineageConfigMixin, StatefulProfilingConfigMixin, @@ -75,7 +75,40 @@ def source_database(self) -> DatabaseId: return DatabaseId(self.database, self.platform_instance) -class SnowflakeConfig(SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommonConfig): +class SnowflakeFilterConfig(SQLFilterConfig): + database_pattern: AllowDenyPattern = Field( + AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"], + ), + description="Regex patterns for databases to filter in ingestion.", + ) + + schema_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for schemas to filter in ingestion. Will match against the full `database.schema` name if `match_fully_qualified_names` is enabled.", + ) + # table_pattern and view_pattern are inherited from SQLFilterConfig + + match_fully_qualified_names: bool = Field( + default=False, + description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", + ) + + +class SnowflakeIdentifierConfig(ConfigModel): + convert_urns_to_lowercase: bool = Field( + default=True, + ) + + +class SnowflakeConfig( + SnowflakeConnectionConfig, + BaseTimeWindowConfig, + SQLCommonConfig, + # SnowflakeFilterConfig must come after SQLCommon config, so that the documentation overrides are applied. + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, +): include_table_lineage: bool = pydantic.Field( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", @@ -85,10 +118,6 @@ class SnowflakeConfig(SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommon description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", ) - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - ignore_start_time_lineage: bool = False upstream_lineage_in_report: bool = False @@ -113,10 +142,6 @@ class SnowflakeV2Config( StatefulProfilingConfigMixin, ClassificationSourceConfigMixin, ): - convert_urns_to_lowercase: bool = Field( - default=True, - ) - include_usage_stats: bool = Field( default=True, description="If enabled, populates the snowflake usage statistics. 
Requires appropriate grants given to the role.", @@ -165,11 +190,6 @@ class SnowflakeV2Config( description="Whether to populate Snowsight url for Snowflake Objects", ) - match_fully_qualified_names: bool = Field( - default=False, - description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", - ) - _use_legacy_lineage_method_removed = pydantic_removed_field( "use_legacy_lineage_method" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index dd7592f6a8a5e3..3956c617a5f226 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -30,17 +30,7 @@ logger: logging.Logger = logging.getLogger(__name__) -class SQLCommonConfig( - StatefulIngestionConfigBase, - DatasetSourceConfigMixin, - LowerCaseDatasetUrnConfigMixin, - IncrementalLineageConfigMixin, - ClassificationSourceConfigMixin, -): - options: dict = pydantic.Field( - default_factory=dict, - description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", - ) +class SQLFilterConfig(ConfigModel): # Although the 'table_pattern' enables you to skip everything from certain schemas, # having another option to allow/deny on schema level is an optimization for the case when there is a large number # of schemas that one wants to skip and you want to avoid the time to needlessly fetch those tables only to filter @@ -57,6 +47,31 @@ class SQLCommonConfig( default=AllowDenyPattern.allow_all(), description="Regex patterns for views to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", ) + + @pydantic.root_validator(pre=True) + def view_pattern_is_table_pattern_unless_specified( + cls, values: Dict[str, Any] + ) -> Dict[str, Any]: + view_pattern = values.get("view_pattern") + table_pattern = values.get("table_pattern") + if table_pattern and not view_pattern: + logger.info(f"Applying table_pattern {table_pattern} to view_pattern.") + values["view_pattern"] = table_pattern + return values + + +class SQLCommonConfig( + StatefulIngestionConfigBase, + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, + IncrementalLineageConfigMixin, + ClassificationSourceConfigMixin, + SQLFilterConfig, +): + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) profile_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), description="Regex patterns to filter tables (or specific columns) for profiling during ingestion. 
Note that only tables allowed by the `table_pattern` will be considered.", @@ -103,17 +118,6 @@ def is_profiling_enabled(self) -> bool: self.profiling.operation_config ) - @pydantic.root_validator(pre=True) - def view_pattern_is_table_pattern_unless_specified( - cls, values: Dict[str, Any] - ) -> Dict[str, Any]: - view_pattern = values.get("view_pattern") - table_pattern = values.get("table_pattern") - if table_pattern and not view_pattern: - logger.info(f"Applying table_pattern {table_pattern} to view_pattern.") - values["view_pattern"] = table_pattern - return values - @pydantic.root_validator(skip_on_failure=True) def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] From 83ba873721ed501786ae6de3148736283c30c882 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 18:03:45 -0700 Subject: [PATCH 18/32] tweak ordering of classes --- .../datahub/ingestion/source/snowflake/snowflake_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 581be6286216c2..6849ba5a18a944 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -102,12 +102,12 @@ class SnowflakeIdentifierConfig(ConfigModel): class SnowflakeConfig( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, + # SnowflakeFilterConfig must come before (higher precedence) the SQLCommon config, so that the documentation overrides are applied. SnowflakeConnectionConfig, BaseTimeWindowConfig, SQLCommonConfig, - # SnowflakeFilterConfig must come after SQLCommon config, so that the documentation overrides are applied. 
- SnowflakeFilterConfig, - SnowflakeIdentifierConfig, ): include_table_lineage: bool = pydantic.Field( default=True, From 1794feb8a070e60d9f2197eae4c9009c9cb0ccd3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 18:18:35 -0700 Subject: [PATCH 19/32] more config refactoring --- .../source/snowflake/snowflake_config.py | 46 ++++--- .../source/snowflake/snowflake_lineage_v2.py | 20 +-- .../source/snowflake/snowflake_summary.py | 31 +---- .../source/snowflake/snowflake_utils.py | 122 ++++++++++++------ 4 files changed, 126 insertions(+), 93 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 6849ba5a18a944..a4a310253ceb9a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -94,6 +94,30 @@ class SnowflakeFilterConfig(SQLFilterConfig): description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) + @root_validator(pre=False, skip_on_failure=True) + def validate_legacy_schema_pattern(cls, values: Dict) -> Dict: + schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern") + match_fully_qualified_names = values.get("match_fully_qualified_names") + + if ( + schema_pattern is not None + and schema_pattern != AllowDenyPattern.allow_all() + and match_fully_qualified_names is not None + and not match_fully_qualified_names + ): + logger.warning( + "Please update `schema_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." + "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " + "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + ) + + # Always exclude reporting metadata for INFORMATION_SCHEMA schema + if schema_pattern is not None and schema_pattern: + logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.") + cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$") + + return values + class SnowflakeIdentifierConfig(ConfigModel): convert_urns_to_lowercase: bool = Field( @@ -101,6 +125,7 @@ class SnowflakeIdentifierConfig(ConfigModel): ) +# TODO: SnowflakeConfig is unused except for this inheritance. We should collapse the config inheritance hierarchy. class SnowflakeConfig( SnowflakeFilterConfig, SnowflakeIdentifierConfig, @@ -262,27 +287,6 @@ def validate_unsupported_configs(cls, values: Dict) -> Dict: "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.", ) - match_fully_qualified_names = values.get("match_fully_qualified_names") - - schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern") - - if ( - schema_pattern is not None - and schema_pattern != AllowDenyPattern.allow_all() - and match_fully_qualified_names is not None - and not match_fully_qualified_names - ): - logger.warning( - "Please update `schema_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." - "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " - "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." 
- ) - - # Always exclude reporting metadata for INFORMATION_SCHEMA schema - if schema_pattern is not None and schema_pattern: - logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.") - cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$") - include_technical_schema = values.get("include_technical_schema") include_profiles = ( values.get("profiling") is not None and values["profiling"].enabled diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 80b79240088d2c..5302382814f5a4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -420,10 +420,12 @@ def map_query_result_upstreams( upstream_name = self.get_dataset_identifier_from_qualified_name( upstream_table.upstream_object_name ) - if upstream_name and self._is_dataset_pattern_allowed( - upstream_name, - upstream_table.upstream_object_domain, - is_upstream=True, + if upstream_name and ( + not self.config.validate_upstreams_against_patterns + or self._is_dataset_pattern_allowed( + upstream_name, + upstream_table.upstream_object_domain, + ) ): upstreams.append(self.dataset_urn_builder(upstream_name)) except Exception as e: @@ -504,10 +506,12 @@ def build_finegrained_lineage_upstreams( if ( upstream_col.object_name and upstream_col.column_name - and self._is_dataset_pattern_allowed( - upstream_col.object_name, - upstream_col.object_domain, - is_upstream=True, + and ( + not self.config.validate_upstreams_against_patterns + or self._is_dataset_pattern_allowed( + upstream_col.object_name, + upstream_col.object_domain, + ) ) ): upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index 9094e9c9feee46..6f26d35aa976df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -3,15 +3,13 @@ from collections import defaultdict from typing import Dict, Iterable, List, Optional -import pydantic - -from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import LowerCaseDatasetUrnConfigMixin from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeFilterConfig from datahub.ingestion.source.snowflake.snowflake_connection import ( SnowflakeConnectionConfig, ) @@ -28,29 +26,12 @@ class SnowflakeSummaryConfig( - SnowflakeConnectionConfig, BaseTimeWindowConfig, LowerCaseDatasetUrnConfigMixin + SnowflakeFilterConfig, + SnowflakeConnectionConfig, + BaseTimeWindowConfig, + LowerCaseDatasetUrnConfigMixin, ): - - # Copied from SnowflakeConfig. 
- database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - schema_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for schemas to filter in ingestion. Specify regex to only match the schema name. e.g. to match all tables in schema analytics, use the regex 'analytics'", - ) - table_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in database.schema.table format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", - ) - view_pattern: AllowDenyPattern = pydantic.Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for views to filter in ingestion. Note: Defaults to table_pattern if not specified. Specify regex to match the entire view name in database.schema.view format. e.g. to match all views starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'", - ) - match_fully_qualified_names: bool = pydantic.Field( - default=True, - description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", - ) + pass @dataclasses.dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index fb6d15f6874a83..2fd837d8d2f957 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -1,18 +1,37 @@ +import abc from typing import ClassVar, Literal, Optional, Tuple from typing_extensions import Protocol from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.snowflake.constants import ( SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, SnowflakeCloudProvider, SnowflakeObjectDomain, ) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeV2Config, +) from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +class SnowflakeStructuredReportMixin(abc.ABC): + @property + @abc.abstractmethod + def structured_reporter(self) -> SourceReport: + ... + + # TODO: Eventually I want to deprecate these methods and use the structured_reporter directly. + def report_warning(self, key: str, reason: str) -> None: + self.structured_reporter.warning(key, reason) + + def report_error(self, key: str, reason: str) -> None: + self.structured_reporter.failure(key, reason) + + # Required only for mypy, since we are using mixin classes, and not inheritance. # Reference - https://mypy.readthedocs.io/en/latest/more_types.html#mixin-classes class SnowflakeCommonProtocol(Protocol): @@ -26,6 +45,9 @@ def get_dataset_identifier( ) -> str: ... + def cleanup_qualified_name(self, qualified_name: str) -> str: + ... + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: ... 
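For illustration, the reporting mixin introduced above boils down to the following self-contained sketch. The _StubReport and ToyExtractor names are hypothetical stand-ins (the real code uses DataHub's SourceReport); only the abstract-property delegation mirrors the actual SnowflakeStructuredReportMixin.

import abc


class _StubReport:
    # Stand-in for DataHub's SourceReport, used only to keep this sketch runnable.
    def warning(self, key: str, reason: str) -> None:
        print(f"WARN  {key}: {reason}")

    def failure(self, key: str, reason: str) -> None:
        print(f"ERROR {key}: {reason}")


class StructuredReportMixin(abc.ABC):
    # Concrete classes only expose their report object; the wrappers delegate to it.
    @property
    @abc.abstractmethod
    def structured_reporter(self) -> _StubReport:
        ...

    def report_warning(self, key: str, reason: str) -> None:
        self.structured_reporter.warning(key, reason)

    def report_error(self, key: str, reason: str) -> None:
        self.structured_reporter.failure(key, reason)


class ToyExtractor(StructuredReportMixin):
    def __init__(self) -> None:
        self.report = _StubReport()

    @property
    def structured_reporter(self) -> _StubReport:
        return self.report


ToyExtractor().report_warning("example-key", "something looked off")

The same shape is what the rest of this series relies on: each concrete generator or extractor satisfies the abstract property by returning its own report instance.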
@@ -63,7 +85,7 @@ def create_snowsight_base_url( cloud_region_id: str, cloud: str, privatelink: bool = False, - ) -> Optional[str]: + ) -> str: if cloud: url_cloud_provider_suffix = f".{cloud}" @@ -117,17 +139,23 @@ def get_external_url_for_database(self, db_name: str) -> Optional[str]: return f"{self.snowsight_base_url}#/data/databases/{db_name}/" -class SnowflakeCommonMixin: - platform = "snowflake" +class SnowflakeFilterMixin(SnowflakeStructuredReportMixin): + @property + @abc.abstractmethod + def filter_config(self) -> SnowflakeFilterConfig: + ... + + @staticmethod + def _combine_identifier_parts( + table_name: str, schema_name: str, db_name: str + ) -> str: + return f"{db_name}.{schema_name}.{table_name}" def _is_dataset_pattern_allowed( - self: SnowflakeCommonProtocol, + self, dataset_name: Optional[str], dataset_type: Optional[str], - is_upstream: bool = False, ) -> bool: - if is_upstream and not self.config.validate_upstreams_against_patterns: - return True if not dataset_type or not dataset_name: return True dataset_params = dataset_name.split(".") @@ -146,33 +174,65 @@ def _is_dataset_pattern_allowed( # NOTE: this case returned `True` earlier when extracting lineage return False - if not self.config.database_pattern.allowed( + if not self.filter_config.database_pattern.allowed( dataset_params[0].strip('"') ) or not is_schema_allowed( - self.config.schema_pattern, + self.filter_config.schema_pattern, dataset_params[1].strip('"'), dataset_params[0].strip('"'), - self.config.match_fully_qualified_names, + self.filter_config.match_fully_qualified_names, ): return False if dataset_type.lower() in { SnowflakeObjectDomain.TABLE - } and not self.config.table_pattern.allowed( - self.get_dataset_identifier_from_qualified_name(dataset_name) + } and not self.filter_config.table_pattern.allowed( + self.cleanup_qualified_name(dataset_name) ): return False if dataset_type.lower() in { - "view", - "materialized_view", - } and not self.config.view_pattern.allowed( - self.get_dataset_identifier_from_qualified_name(dataset_name) + SnowflakeObjectDomain.VIEW, + SnowflakeObjectDomain.MATERIALIZED_VIEW, + } and not self.filter_config.view_pattern.allowed( + self.cleanup_qualified_name(dataset_name) ): return False return True + # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, + # For example "test-database"."test-schema".test_table + # whereas we generate urns without quotes even for quoted identifiers for backward compatibility + # and also unavailability of utility function to identify whether current table/schema/database + # name should be quoted in above method get_dataset_identifier + def cleanup_qualified_name(self, qualified_name: str) -> str: + name_parts = qualified_name.split(".") + if len(name_parts) != 3: + self.structured_reporter.report_warning( + title="Unexpected dataset pattern", + message="We failed to parse a Snowflake qualified name into its constituent parts. 
DB/schema/table filtering may not work as expected.", + context=f"{qualified_name} has {len(name_parts)} parts", + ) + return qualified_name.replace('"', "") + return SnowflakeFilterMixin._combine_identifier_parts( + table_name=name_parts[2].strip('"'), + schema_name=name_parts[1].strip('"'), + db_name=name_parts[0].strip('"'), + ) + + +class SnowflakeCommonMixin(SnowflakeFilterMixin): + platform = "snowflake" + + @property + def structured_reporter(self: SnowflakeCommonProtocol) -> SourceReport: + return self.report + + @property + def filter_config(self: SnowflakeCommonProtocol) -> SnowflakeFilterConfig: + return self.config + def snowflake_identifier(self: SnowflakeCommonProtocol, identifier: str) -> str: # to be in in sync with older connector, convert name to lowercase if self.config.convert_urns_to_lowercase: @@ -202,26 +262,16 @@ def get_quoted_identifier_for_table(db_name, schema_name, table_name): def get_dataset_identifier( self: SnowflakeCommonProtocol, table_name: str, schema_name: str, db_name: str ) -> str: - return self.snowflake_identifier(f"{db_name}.{schema_name}.{table_name}") + return self.snowflake_identifier( + SnowflakeCommonMixin._combine_identifier_parts( + table_name=table_name, schema_name=schema_name, db_name=db_name + ) + ) - # Qualified Object names from snowflake audit logs have quotes for for snowflake quoted identifiers, - # For example "test-database"."test-schema".test_table - # whereas we generate urns without quotes even for quoted identifiers for backward compatibility - # and also unavailability of utility function to identify whether current table/schema/database - # name should be quoted in above method get_dataset_identifier def get_dataset_identifier_from_qualified_name( self: SnowflakeCommonProtocol, qualified_name: str ) -> str: - name_parts = qualified_name.split(".") - if len(name_parts) != 3: - self.report.report_warning( - "invalid-dataset-pattern", - f"Found non-parseable {name_parts} for {qualified_name}", - ) - return self.snowflake_identifier(qualified_name.replace('"', "")) - return self.get_dataset_identifier( - name_parts[2].strip('"'), name_parts[1].strip('"'), name_parts[0].strip('"') - ) + return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) # Note - decide how to construct user urns. # Historically urns were created using part before @ from user's email. 
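To make the quote-stripping behaviour described in the comment above concrete, a simplified standalone version might look like the sketch below. It omits the structured warning and the lowercasing that get_dataset_identifier_from_qualified_name applies afterwards via snowflake_identifier().

def cleanup_qualified_name(qualified_name: str) -> str:
    parts = qualified_name.split(".")
    if len(parts) != 3:
        # The real method reports a structured warning before falling back.
        return qualified_name.replace('"', "")
    db, schema, table = (part.strip('"') for part in parts)
    return f"{db}.{schema}.{table}"


assert (
    cleanup_qualified_name('"test-database"."test-schema".test_table')
    == "test-database.test-schema.test_table"
)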
@@ -253,9 +303,3 @@ def warn_if_stateful_else_error( self.report_warning(key, reason) else: self.report_error(key, reason) - - def report_warning(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.warning(key, reason) - - def report_error(self: SnowflakeCommonProtocol, key: str, reason: str) -> None: - self.report.failure(key, reason) From c52cf0512086d493a381f68b5fedc76697161a2c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 18:46:08 -0700 Subject: [PATCH 20/32] more refactoring --- .../source/snowflake/snowflake_assertion.py | 13 +++-- .../source/snowflake/snowflake_config.py | 4 +- .../source/snowflake/snowflake_schema_gen.py | 45 ++++++++++------ .../source/snowflake/snowflake_summary.py | 42 +++++++-------- .../source/snowflake/snowflake_utils.py | 51 +++++++++++-------- .../source/snowflake/snowflake_v2.py | 1 + 6 files changed, 91 insertions(+), 65 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py index 5fc1a45709296e..8c7fbf41219e5b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -11,11 +11,14 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeIdentifierConfig, + SnowflakeV2Config, +) from datahub.ingestion.source.snowflake.snowflake_connection import SnowflakeConnection from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeIdentifierMixin from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionResult, AssertionResultType, @@ -37,7 +40,7 @@ class DataQualityMonitoringResult(BaseModel): VALUE: int -class SnowflakeAssertionsHandler(SnowflakeCommonMixin): +class SnowflakeAssertionsHandler(SnowflakeIdentifierMixin): def __init__( self, config: SnowflakeV2Config, @@ -52,6 +55,10 @@ def __init__( self.connection = connection self._urns_processed: List[str] = [] + @property + def identifier_config(self) -> SnowflakeIdentifierConfig: + return self.config + def get_assertion_workunits( self, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index a4a310253ceb9a..3f569b10d97d46 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -9,6 +9,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX +from datahub.configuration.source_common import LowerCaseDatasetUrnConfigMixin from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import 
pydantic_renamed_field @@ -119,7 +120,8 @@ def validate_legacy_schema_pattern(cls, values: Dict) -> Dict: return values -class SnowflakeIdentifierConfig(ConfigModel): +class SnowflakeIdentifierConfig(LowerCaseDatasetUrnConfigMixin): + # Changing default value here. convert_urns_to_lowercase: bool = Field( default=True, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 0c3a2c84362b95..a1b427544ef97e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -2,7 +2,7 @@ import itertools import logging import queue -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Union from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import ( @@ -12,6 +12,7 @@ make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.glossary.classification_mixin import ( ClassificationHandler, @@ -27,6 +28,8 @@ SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, SnowflakeV2Config, TagOption, ) @@ -51,8 +54,8 @@ ) from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeCommonProtocol, + SnowflakeFilterMixin, + SnowflakeIdentifierMixin, SnowsightUrlBuilder, ) from datahub.ingestion.source.sql.sql_utils import ( @@ -140,15 +143,13 @@ } -class SnowflakeSchemaGenerator( - SnowflakeCommonMixin, - SnowflakeCommonProtocol, -): +class SnowflakeSchemaGenerator(SnowflakeFilterMixin, SnowflakeIdentifierMixin): def __init__( self, config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, + dataset_urn_builder: Callable[[str], str], domain_registry: Optional[DomainRegistry], profiler: Optional[SnowflakeProfiler], aggregator: Optional[SqlParsingAggregator], @@ -157,7 +158,7 @@ def __init__( self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report self.connection: SnowflakeConnection = connection - self.logger = logger + self.dataset_urn_builder = dataset_urn_builder self.data_dictionary: SnowflakeDataDictionary = SnowflakeDataDictionary( connection=self.connection @@ -181,11 +182,23 @@ def __init__( def get_connection(self) -> SnowflakeConnection: return self.connection + @property + def structured_reporter(self) -> SourceReport: + return self.report + + @property + def filter_config(self) -> SnowflakeFilterConfig: + return self.config + + @property + def identifier_config(self) -> SnowflakeIdentifierConfig: + return self.config + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.databases = [] for database in self.get_databases() or []: self.report.report_entity_scanned(database.name, "database") - if not self.config.database_pattern.allowed(database.name): + if not self.filter_config.database_pattern.allowed(database.name): self.report.report_dropped(f"{database.name}.*") else: self.databases.append(database) @@ -349,10 +362,10 @@ def fetch_schemas_for_database( for schema in 
self.data_dictionary.get_schemas_for_database(db_name): self.report.report_entity_scanned(schema.name, "schema") if not is_schema_allowed( - self.config.schema_pattern, + self.filter_config.schema_pattern, schema.name, db_name, - self.config.match_fully_qualified_names, + self.filter_config.match_fully_qualified_names, ): self.report.report_dropped(f"{db_name}.{schema.name}.*") else: @@ -433,7 +446,7 @@ def _process_schema( ) if view.view_definition: self.aggregator.add_view_definition( - view_urn=self.gen_dataset_urn(view_identifier), + view_urn=self.dataset_urn_builder(view_identifier), view_definition=view.view_definition, default_db=db_name, default_schema=schema_name, @@ -463,7 +476,7 @@ def fetch_views_for_schema( self.report.report_entity_scanned(view_name, "view") - if not self.config.view_pattern.allowed(view_name): + if not self.filter_config.view_pattern.allowed(view_name): self.report.report_dropped(view_name) else: views.append(view) @@ -496,7 +509,7 @@ def fetch_tables_for_schema( table.name, schema_name, db_name ) self.report.report_entity_scanned(table_identifier) - if not self.config.table_pattern.allowed(table_identifier): + if not self.filter_config.table_pattern.allowed(table_identifier): self.report.report_dropped(table_identifier) else: tables.append(table) @@ -665,7 +678,7 @@ def gen_dataset_workunits( yield from self._process_tag(tag) dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.gen_dataset_urn(dataset_name) + dataset_urn = self.dataset_urn_builder(dataset_name) status = Status(removed=False) yield MetadataChangeProposalWrapper( @@ -807,7 +820,7 @@ def gen_schema_metadata( db_name: str, ) -> SchemaMetadata: dataset_name = self.get_dataset_identifier(table.name, schema_name, db_name) - dataset_urn = self.gen_dataset_urn(dataset_name) + dataset_urn = self.dataset_urn_builder(dataset_name) foreign_keys: Optional[List[ForeignKeyConstraint]] = None if isinstance(table, SnowflakeTable) and len(table.foreign_keys) > 0: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index 6f26d35aa976df..8d1455b4a6059e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -13,14 +13,10 @@ from datahub.ingestion.source.snowflake.snowflake_connection import ( SnowflakeConnectionConfig, ) -from datahub.ingestion.source.snowflake.snowflake_schema import ( - SnowflakeDatabase, - SnowflakeDataDictionary, -) +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase from datahub.ingestion.source.snowflake.snowflake_schema_gen import ( SnowflakeSchemaGenerator, ) -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.utilities.lossy_collections import LossyList @@ -58,10 +54,7 @@ def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: @config_class(SnowflakeSummaryConfig) @support_status(SupportStatus.INCUBATING) -class SnowflakeSummarySource( - SnowflakeCommonMixin, - Source, -): +class SnowflakeSummarySource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig): super().__init__(ctx) self.config: SnowflakeSummaryConfig = config @@ -69,12 +62,22 @@ def __init__(self, ctx: PipelineContext, config: 
SnowflakeSummaryConfig): self.logger = logging.getLogger(__name__) self.connection = self.config.get_connection() - self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + schema_generator = SnowflakeSchemaGenerator( + config=self.config, # type: ignore + report=self.report, # type: ignore + connection=self.connection, + dataset_urn_builder=lambda x: "", + domain_registry=None, + profiler=None, + aggregator=None, + snowsight_url_builder=None, + ) + # Databases. databases: List[SnowflakeDatabase] = [] - for database in self.get_databases() or []: # type: ignore + for database in schema_generator.get_databases() or []: # TODO: Support database_patterns. if not self.config.database_pattern.allowed(database.name): self.report.report_dropped(f"{database.name}.*") @@ -83,16 +86,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Schemas. for database in databases: - self.fetch_schemas_for_database(database, database.name) # type: ignore + schema_generator.fetch_schemas_for_database(database, database.name) self.report.schema_counters[database.name] = len(database.schemas) for schema in database.schemas: # Tables/views. - tables = self.fetch_tables_for_schema( # type: ignore + tables = schema_generator.fetch_tables_for_schema( schema, database.name, schema.name ) - views = self.fetch_views_for_schema( # type: ignore + views = schema_generator.fetch_views_for_schema( schema, database.name, schema.name ) @@ -130,16 +133,5 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # This source doesn't produce any metadata itself. All important information goes into the report. yield from [] - # This is a bit of a hack, but lets us reuse the code from the main ingestion source. - # Mypy doesn't really know how to deal with it though, which is why we have all these - # type ignore comments. - get_databases = SnowflakeSchemaGenerator.get_databases - get_databases_from_ischema = SnowflakeSchemaGenerator.get_databases_from_ischema - fetch_schemas_for_database = SnowflakeSchemaGenerator.fetch_schemas_for_database - fetch_tables_for_schema = SnowflakeSchemaGenerator.fetch_tables_for_schema - fetch_views_for_schema = SnowflakeSchemaGenerator.fetch_views_for_schema - get_tables_for_schema = SnowflakeSchemaGenerator.get_tables_for_schema - get_views_for_schema = SnowflakeSchemaGenerator.get_views_for_schema - def get_report(self) -> SnowflakeSummaryReport: return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 2fd837d8d2f957..6e671d223cd5ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -13,6 +13,7 @@ ) from datahub.ingestion.source.snowflake.snowflake_config import ( SnowflakeFilterConfig, + SnowflakeIdentifierConfig, SnowflakeV2Config, ) from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -222,9 +223,32 @@ def cleanup_qualified_name(self, qualified_name: str) -> str: ) -class SnowflakeCommonMixin(SnowflakeFilterMixin): +class SnowflakeIdentifierMixin(abc.ABC): platform = "snowflake" + @property + @abc.abstractmethod + def identifier_config(self) -> SnowflakeIdentifierConfig: + ... 
+ + def snowflake_identifier(self, identifier: str) -> str: + # to be in in sync with older connector, convert name to lowercase + if self.identifier_config.convert_urns_to_lowercase: + return identifier.lower() + return identifier + + def get_dataset_identifier( + self, table_name: str, schema_name: str, db_name: str + ) -> str: + return self.snowflake_identifier( + SnowflakeCommonMixin._combine_identifier_parts( + table_name=table_name, schema_name=schema_name, db_name=db_name + ) + ) + + +# TODO: We're most of the way there on fully removing SnowflakeCommonProtocol. +class SnowflakeCommonMixin(SnowflakeFilterMixin, SnowflakeIdentifierMixin): @property def structured_reporter(self: SnowflakeCommonProtocol) -> SourceReport: return self.report @@ -233,11 +257,9 @@ def structured_reporter(self: SnowflakeCommonProtocol) -> SourceReport: def filter_config(self: SnowflakeCommonProtocol) -> SnowflakeFilterConfig: return self.config - def snowflake_identifier(self: SnowflakeCommonProtocol, identifier: str) -> str: - # to be in in sync with older connector, convert name to lowercase - if self.config.convert_urns_to_lowercase: - return identifier.lower() - return identifier + @property + def identifier_config(self: SnowflakeCommonProtocol) -> SnowflakeIdentifierConfig: + return self.config def gen_dataset_urn(self: SnowflakeCommonProtocol, dataset_identifier: str) -> str: return make_dataset_urn_with_platform_instance( @@ -255,24 +277,13 @@ def get_quoted_identifier_for_database(db_name): def get_quoted_identifier_for_schema(db_name, schema_name): return f'"{db_name}"."{schema_name}"' + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: + return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) + @staticmethod def get_quoted_identifier_for_table(db_name, schema_name, table_name): return f'"{db_name}"."{schema_name}"."{table_name}"' - def get_dataset_identifier( - self: SnowflakeCommonProtocol, table_name: str, schema_name: str, db_name: str - ) -> str: - return self.snowflake_identifier( - SnowflakeCommonMixin._combine_identifier_parts( - table_name=table_name, schema_name=schema_name, db_name=db_name - ) - ) - - def get_dataset_identifier_from_qualified_name( - self: SnowflakeCommonProtocol, qualified_name: str - ) -> str: - return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) - # Note - decide how to construct user urns. # Historically urns were created using part before @ from user's email. # Users without email were skipped from both user entries as well as aggregates. 
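A condensed sketch of what the identifier mixin centralises follows; the toy config and class names are illustrative stand-ins, but the lowercasing default matches the convert_urns_to_lowercase=True default set earlier in this series.

class _ToyIdentifierConfig:
    convert_urns_to_lowercase = True


class _ToyIdentifierMixin:
    identifier_config = _ToyIdentifierConfig()

    def snowflake_identifier(self, identifier: str) -> str:
        # Lowercase to stay consistent with URNs produced by the older connector.
        if self.identifier_config.convert_urns_to_lowercase:
            return identifier.lower()
        return identifier

    def get_dataset_identifier(
        self, table_name: str, schema_name: str, db_name: str
    ) -> str:
        return self.snowflake_identifier(f"{db_name}.{schema_name}.{table_name}")


assert (
    _ToyIdentifierMixin().get_dataset_identifier("Orders", "PUBLIC", "SALES_DB")
    == "sales_db.public.orders"
)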
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index e4b9f73125d1c4..d4b43bfb420e6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -445,6 +445,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: profiler=self.profiler, aggregator=self.aggregator, snowsight_url_builder=snowsight_url_builder, + dataset_urn_builder=self.gen_dataset_urn, ) self.report.set_ingestion_stage("*", METADATA_EXTRACTION) From a15f966dc3d3195012fa39e503f44887cc223442 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 9 Jul 2024 19:51:50 -0700 Subject: [PATCH 21/32] add gen urn to identifier mixin --- .../source/snowflake/snowflake_assertion.py | 6 ++--- .../source/snowflake/snowflake_config.py | 12 +++++++--- .../source/snowflake/snowflake_queries.py | 24 +++++++++++++++---- .../source/snowflake/snowflake_summary.py | 1 + .../source/snowflake/snowflake_utils.py | 16 ++++++------- .../source/snowflake/snowflake_v2.py | 2 +- .../ingestion/source/sql/sql_config.py | 6 +++-- 7 files changed, 44 insertions(+), 23 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py index 8c7fbf41219e5b..2a1d18c83e6fa8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_assertion.py @@ -1,6 +1,6 @@ import logging from datetime import datetime -from typing import Callable, Iterable, List, Optional +from typing import Iterable, List, Optional from pydantic import BaseModel @@ -46,12 +46,10 @@ def __init__( config: SnowflakeV2Config, report: SnowflakeV2Report, connection: SnowflakeConnection, - dataset_urn_builder: Callable[[str], str], ) -> None: self.config = config self.report = report self.logger = logger - self.dataset_urn_builder = dataset_urn_builder self.connection = connection self._urns_processed: List[str] = [] @@ -109,7 +107,7 @@ def _process_result_row( aspect=AssertionRunEvent( timestampMillis=datetime_to_ts_millis(result.MEASUREMENT_TIME), runId=result.MEASUREMENT_TIME.strftime("%Y-%m-%dT%H:%M:%SZ"), - asserteeUrn=self.dataset_urn_builder(assertee), + asserteeUrn=self.gen_dataset_urn(assertee), status=AssertionRunStatus.COMPLETE, assertionUrn=make_assertion_urn(assertion_guid), result=AssertionResult( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 3f569b10d97d46..7c88b4689e10e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -9,7 +9,11 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX -from datahub.configuration.source_common import LowerCaseDatasetUrnConfigMixin +from datahub.configuration.source_common import ( + EnvConfigMixin, + LowerCaseDatasetUrnConfigMixin, + PlatformInstanceConfigMixin, +) from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_removal import pydantic_removed_field from 
datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -120,7 +124,9 @@ def validate_legacy_schema_pattern(cls, values: Dict) -> Dict: return values -class SnowflakeIdentifierConfig(LowerCaseDatasetUrnConfigMixin): +class SnowflakeIdentifierConfig( + PlatformInstanceConfigMixin, EnvConfigMixin, LowerCaseDatasetUrnConfigMixin +): # Changing default value here. convert_urns_to_lowercase: bool = Field( default=True, @@ -129,8 +135,8 @@ class SnowflakeIdentifierConfig(LowerCaseDatasetUrnConfigMixin): # TODO: SnowflakeConfig is unused except for this inheritance. We should collapse the config inheritance hierarchy. class SnowflakeConfig( - SnowflakeFilterConfig, SnowflakeIdentifierConfig, + SnowflakeFilterConfig, # SnowflakeFilterConfig must come before (higher precedence) the SQLCommon config, so that the documentation overrides are applied. SnowflakeConnectionConfig, BaseTimeWindowConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 53ab87ba81b1d4..f07ac38397cdf1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -19,6 +19,7 @@ BaseTimeWindowConfig, BucketDuration, ) +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit @@ -27,7 +28,10 @@ SnowflakeConnectionConfig, ) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.ingestion.source.snowflake.snowflake_utils import ( + SnowflakeFilterMixin, + SnowflakeIdentifierMixin, +) from datahub.ingestion.source.usage.usage_common import BaseUsageConfig from datahub.metadata._urns.urn_defs import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import ( @@ -83,7 +87,13 @@ class SnowflakeQueriesReport(SourceReport): sql_aggregator: Optional[SqlAggregatorReport] = None -class SnowflakeQueriesSource(Source, SnowflakeCommonMixin): +class SnowflakeQueriesExtractor(SnowflakeFilterMixin, SnowflakeIdentifierMixin): + def __init__(self, config: SnowflakeQueriesConfig, report: SnowflakeQueriesReport): + self.config = config + self.report = report + + +class SnowflakeQueriesSource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesConfig): self.ctx = ctx self.config = config @@ -211,9 +221,13 @@ def fetch_audit_log( else: yield entry - # HACK: This makes mypy happy with our usage of the mixin methods. 
- gen_dataset_urn = SnowflakeCommonMixin.gen_dataset_urn - snowflake_identifier = SnowflakeCommonMixin.snowflake_identifier + def gen_dataset_urn(self, dataset_identifier: str) -> str: + return make_dataset_urn_with_platform_instance( + platform=self.platform, + name=dataset_identifier, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: json_fields = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py index 8d1455b4a6059e..f78ae70291f8a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_summary.py @@ -65,6 +65,7 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig): def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: schema_generator = SnowflakeSchemaGenerator( + # This is a hack, but we just hope that the config / report have all the fields we need. config=self.config, # type: ignore report=self.report, # type: ignore connection=self.connection, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 6e671d223cd5ad..1f3fd89bf9d712 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -246,6 +246,14 @@ def get_dataset_identifier( ) ) + def gen_dataset_urn(self, dataset_identifier: str) -> str: + return make_dataset_urn_with_platform_instance( + platform=self.platform, + name=dataset_identifier, + platform_instance=self.identifier_config.platform_instance, + env=self.identifier_config.env, + ) + # TODO: We're most of the way there on fully removing SnowflakeCommonProtocol. 
class SnowflakeCommonMixin(SnowflakeFilterMixin, SnowflakeIdentifierMixin): @@ -261,14 +269,6 @@ def filter_config(self: SnowflakeCommonProtocol) -> SnowflakeFilterConfig: def identifier_config(self: SnowflakeCommonProtocol) -> SnowflakeIdentifierConfig: return self.config - def gen_dataset_urn(self: SnowflakeCommonProtocol, dataset_identifier: str) -> str: - return make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_identifier, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) - @staticmethod def get_quoted_identifier_for_database(db_name): return f'"{db_name}"' diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index d4b43bfb420e6f..d8eda98da422b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -498,7 +498,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.include_assertion_results: yield from SnowflakeAssertionsHandler( - self.config, self.report, self.connection, self.gen_dataset_urn + self.config, self.report, self.connection ).get_assertion_workunits(discovered_datasets) def report_warehouse_failure(self) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 3956c617a5f226..93c7025aeee4ea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -8,8 +8,9 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import ( - DatasetSourceConfigMixin, + EnvConfigMixin, LowerCaseDatasetUrnConfigMixin, + PlatformInstanceConfigMixin, ) from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.ingestion.api.incremental_lineage_helper import ( @@ -62,7 +63,8 @@ def view_pattern_is_table_pattern_unless_specified( class SQLCommonConfig( StatefulIngestionConfigBase, - DatasetSourceConfigMixin, + PlatformInstanceConfigMixin, + EnvConfigMixin, LowerCaseDatasetUrnConfigMixin, IncrementalLineageConfigMixin, ClassificationSourceConfigMixin, From ca5c0bc9ce8f1866e7f5f058c18edffe538d1b77 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 11:50:21 -0700 Subject: [PATCH 22/32] add SnowflakeQueriesExtractor interface --- .../source/snowflake/snowflake_queries.py | 129 ++++++++++++------ .../source/snowflake/snowflake_utils.py | 3 +- 2 files changed, 88 insertions(+), 44 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index f07ac38397cdf1..b312e1883df7d2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -10,21 +10,21 @@ import pydantic from typing_extensions import Self -from datahub.configuration.source_common import ( - EnvConfigMixin, - LowerCaseDatasetUrnConfigMixin, - PlatformInstanceConfigMixin, -) from datahub.configuration.time_window_config import ( BaseTimeWindowConfig, BucketDuration, ) -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.ingestion.api.common import 
PipelineContext +from datahub.ingestion.api.report import Report from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import ( + SnowflakeFilterConfig, + SnowflakeIdentifierConfig, +) from datahub.ingestion.source.snowflake.snowflake_connection import ( + SnowflakeConnection, SnowflakeConnectionConfig, ) from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery @@ -51,11 +51,7 @@ logger = logging.getLogger(__name__) -class SnowflakeQueriesConfig( - PlatformInstanceConfigMixin, EnvConfigMixin, LowerCaseDatasetUrnConfigMixin -): - connection: SnowflakeConnectionConfig - +class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilterConfig): # TODO: Support stateful ingestion for the time windows. window: BaseTimeWindowConfig = BaseTimeWindowConfig() @@ -64,8 +60,13 @@ class SnowflakeQueriesConfig( # TODO: support temporary_tables_pattern - local_temp_path: Optional[pathlib.Path] = None - # TODO: support copying files to s3 + local_temp_path: Optional[pathlib.Path] = pydantic.Field( + default=None, + description="Local path to store the audit log.", + # TODO: For now, this is simply an advanced config to make local testing easier. + # Eventually, we will want to store date-specific files in the directory and use it as a cache. + hidden_from_docs=True, + ) convert_urns_to_lowercase: bool = pydantic.Field( # Override the default. @@ -80,26 +81,34 @@ class SnowflakeQueriesConfig( include_operations: bool = True +class SnowflakeQueriesSourceConfig(SnowflakeQueriesExtractorConfig): + connection: SnowflakeConnectionConfig + + @dataclass -class SnowflakeQueriesReport(SourceReport): +class SnowflakeQueriesExtractorReport(Report): window: Optional[BaseTimeWindowConfig] = None sql_aggregator: Optional[SqlAggregatorReport] = None -class SnowflakeQueriesExtractor(SnowflakeFilterMixin, SnowflakeIdentifierMixin): - def __init__(self, config: SnowflakeQueriesConfig, report: SnowflakeQueriesReport): - self.config = config - self.report = report +@dataclass +class SnowflakeQueriesSourceReport(SourceReport): + queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesSource(Source): - def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesConfig): - self.ctx = ctx - self.config = config - self.report = SnowflakeQueriesReport() +class SnowflakeQueriesExtractor(SnowflakeFilterMixin, SnowflakeIdentifierMixin): + def __init__( + self, + connection: SnowflakeConnection, + config: SnowflakeQueriesExtractorConfig, + structured_report: SourceReport, + ): + self.connection = connection - self.platform = "snowflake" + self.config = config + self.report = SnowflakeQueriesExtractorReport() + self._structured_report = structured_report self.aggregator = SqlParsingAggregator( platform=self.platform, @@ -121,10 +130,17 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesConfig): ) self.report.sql_aggregator = self.aggregator.report - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Self: - config = SnowflakeQueriesConfig.parse_obj(config_dict) - return cls(ctx, config) + @property + def structured_reporter(self) -> SourceReport: + return self._structured_report + + @property + def filter_config(self) -> SnowflakeFilterConfig: + return self.config + + @property + def identifier_config(self) -> 
SnowflakeIdentifierConfig: + return self.config @functools.cached_property def local_temp_path(self) -> pathlib.Path: @@ -146,6 +162,7 @@ def get_workunits_internal( audit_log_file = self.local_temp_path / "audit_log.sqlite" use_cached_audit_log = audit_log_file.exists() + queries: FileBackedList[Union[KnownLineageMapping, PreparsedQuery]] if use_cached_audit_log: logger.info("Using cached audit log") shared_connection = ConnectionWrapper(audit_log_file) @@ -202,8 +219,7 @@ def fetch_audit_log( deny_usernames=self.config.deny_usernames, ) - conn = self.config.connection.get_connection() - resp = conn.query(audit_log_query) + resp = self.connection.query(audit_log_query) for i, row in enumerate(resp): if i % 1000 == 0: @@ -213,21 +229,17 @@ def fetch_audit_log( try: entry = self._parse_audit_log_response(row) except Exception as e: - self.report.warning( + self.structured_reporter.warning( "Error parsing audit log row", - context=f"{e}", + context=f"{row}", exc=e, ) else: yield entry - def gen_dataset_urn(self, dataset_identifier: str) -> str: - return make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_identifier, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) + def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str: + # Copied from SnowflakeCommonMixin. + return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: json_fields = { @@ -249,7 +261,9 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: column_usage = {} for obj in direct_objects_accessed: - dataset = self.gen_dataset_urn(self.snowflake_identifier(obj["objectName"])) + dataset = self.gen_dataset_urn( + self.get_dataset_identifier_from_qualified_name(obj["objectName"]) + ) columns = set() for modified_column in obj["columns"]: @@ -265,7 +279,7 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: # TODO: Warn if that happens. 
downstream = self.gen_dataset_urn( - self.snowflake_identifier(obj["objectName"]) + self.get_dataset_identifier_from_qualified_name(obj["objectName"]) ) column_lineage = [] for modified_column in obj["columns"]: @@ -280,7 +294,9 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: upstreams=[ ColumnRef( table=self.gen_dataset_urn( - self.snowflake_identifier(upstream["objectName"]) + self.get_dataset_identifier_from_qualified_name( + upstream["objectName"] + ) ), column=self.snowflake_identifier( upstream["columnName"] @@ -325,7 +341,34 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: ) return entry - def get_report(self) -> SnowflakeQueriesReport: + +class SnowflakeQueriesSource(Source): + def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): + self.ctx = ctx + self.config = config + self.report = SnowflakeQueriesSourceReport() + + self.platform = "snowflake" + + self.connection = self.config.connection.get_connection() + + self.queries_extractor = SnowflakeQueriesExtractor( + connection=self.connection, + config=self.config, + structured_report=self.report, + ) + self.report.queries_extractor = self.queries_extractor.report + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Self: + config = SnowflakeQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + # TODO: Disable auto status processor? + return self.queries_extractor.get_workunits_internal() + + def get_report(self) -> SnowflakeQueriesSourceReport: return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 1f3fd89bf9d712..da0e6c04869c65 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -212,7 +212,8 @@ def cleanup_qualified_name(self, qualified_name: str) -> str: if len(name_parts) != 3: self.structured_reporter.report_warning( title="Unexpected dataset pattern", - message="We failed to parse a Snowflake qualified name into its constituent parts. DB/schema/table filtering may not work as expected.", + message="We failed to parse a Snowflake qualified name into its constituent parts. 
" + "DB/schema/table filtering may not work as expected on these entities.", context=f"{qualified_name} has {len(name_parts)} parts", ) return qualified_name.replace('"', "") From f0b8e79886a640a4f34596304d5231c5345cf579 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 13:16:50 -0700 Subject: [PATCH 23/32] improve warnings --- metadata-ingestion/src/datahub/ingestion/api/source.py | 5 +++++ .../ingestion/source/snowflake/snowflake_queries.py | 10 +++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index d78500b4401e5f..ad1b312ef445c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -45,6 +45,8 @@ logger = logging.getLogger(__name__) +_MAX_CONTEXT_STRING_LENGTH = 300 + class SourceCapability(Enum): PLATFORM_INSTANCE = "Platform Instance" @@ -112,6 +114,9 @@ def report_log( log_key = f"{title}-{message}" entries = self._entries[level] + if context and len(context) > _MAX_CONTEXT_STRING_LENGTH: + context = f"{context[:_MAX_CONTEXT_STRING_LENGTH]} ..." + log_content = f"{message} => {context}" if context else message if exc: log_content += f"{log_content}: {exc}" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index b312e1883df7d2..40322e317b8efd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -227,7 +227,7 @@ def fetch_audit_log( assert isinstance(row, dict) try: - entry = self._parse_audit_log_response(row) + entry = self._parse_audit_log_row(row) except Exception as e: self.structured_reporter.warning( "Error parsing audit log row", @@ -241,7 +241,7 @@ def get_dataset_identifier_from_qualified_name(self, qualified_name: str) -> str # Copied from SnowflakeCommonMixin. return self.snowflake_identifier(self.cleanup_qualified_name(qualified_name)) - def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: + def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: json_fields = { "DIRECT_OBJECTS_ACCESSED", "OBJECTS_MODIFIED", @@ -276,7 +276,11 @@ def _parse_audit_log_response(self, row: Dict[str, Any]) -> PreparsedQuery: column_lineage = None for obj in objects_modified: # We don't expect there to be more than one object modified. - # TODO: Warn if that happens. 
+ if downstream: + self.structured_reporter.report_warning( + message="Unexpectedly got multiple downstream entities from the Snowflake audit log.", + context=f"{row}", + ) downstream = self.gen_dataset_urn( self.get_dataset_identifier_from_qualified_name(obj["objectName"]) From a76e0150fbe0c16824a377f8752ee940e0aeb0d1 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 16:05:27 -0700 Subject: [PATCH 24/32] add filters support --- .../ingestion/source/redshift/lineage_v2.py | 2 +- .../source/snowflake/snowflake_lineage_v2.py | 4 +- .../source/snowflake/snowflake_queries.py | 20 +++++- .../source/snowflake/snowflake_usage_v2.py | 4 +- .../source/snowflake/snowflake_utils.py | 2 +- .../sql_parsing/sql_parsing_aggregator.py | 69 ++++++++++++++----- 6 files changed, 76 insertions(+), 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 062a99de6b7358..31efb6c5038a02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -105,7 +105,7 @@ def build( for schema, tables in schemas.items() for table in tables } - self.aggregator.is_temp_table = lambda urn: urn not in self.known_urns + self.aggregator._is_temp_table = lambda urn: urn not in self.known_urns # Handle all the temp tables up front. if self.config.resolve_temp_table_in_lineage: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 5302382814f5a4..3e65f062004189 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -422,7 +422,7 @@ def map_query_result_upstreams( ) if upstream_name and ( not self.config.validate_upstreams_against_patterns - or self._is_dataset_pattern_allowed( + or self.is_dataset_pattern_allowed( upstream_name, upstream_table.upstream_object_domain, ) @@ -508,7 +508,7 @@ def build_finegrained_lineage_upstreams( and upstream_col.column_name and ( not self.config.validate_upstreams_against_patterns - or self._is_dataset_pattern_allowed( + or self.is_dataset_pattern_allowed( upstream_col.object_name, upstream_col.object_domain, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 40322e317b8efd..6b4196367fcd49 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -2,6 +2,7 @@ import json import logging import pathlib +import re import tempfile from dataclasses import dataclass from datetime import datetime, timezone @@ -19,7 +20,9 @@ from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import ( + DEFAULT_TABLES_DENY_LIST, SnowflakeFilterConfig, SnowflakeIdentifierConfig, ) @@ -58,7 +61,12 @@ class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilter # TODO: make this a proper 
allow/deny pattern deny_usernames: List[str] = [] - # TODO: support temporary_tables_pattern + temporary_tables_pattern: List[str] = pydantic.Field( + default=DEFAULT_TABLES_DENY_LIST, + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to " + "match the entire table name in database.schema.table format. Defaults are to set in such a way " + "to ignore the temporary staging tables created by known ETL tools.", + ) local_temp_path: Optional[pathlib.Path] = pydantic.Field( default=None, @@ -126,6 +134,8 @@ def __init__( # TODO make the rest of the fields configurable ), generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, format_queries=False, ) self.report.sql_aggregator = self.aggregator.report @@ -153,6 +163,14 @@ def local_temp_path(self) -> pathlib.Path: logger.info(f"Using local temp path: {path}") return path + def is_temp_table(self, name: str) -> bool: + return any( + re.match(pattern, name) for pattern in self.config.temporary_tables_pattern + ) + + def is_allowed_table(self, name: str) -> bool: + return self.is_dataset_pattern_allowed(name, SnowflakeObjectDomain.TABLE) + def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index b50764a4231656..c5e0994059f2e4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -232,7 +232,7 @@ def _get_workunits_internal( logger.debug(f"Processing usage row number {results.rownumber}") logger.debug(self.report.usage_aggregation.as_string()) - if not self._is_dataset_pattern_allowed( + if not self.is_dataset_pattern_allowed( row["OBJECT_NAME"], row["OBJECT_DOMAIN"], ): @@ -561,7 +561,7 @@ def _is_unsupported_object_accessed(self, obj: Dict[str, Any]) -> bool: def _is_object_valid(self, obj: Dict[str, Any]) -> bool: if self._is_unsupported_object_accessed( obj - ) or not self._is_dataset_pattern_allowed( + ) or not self.is_dataset_pattern_allowed( obj.get("objectName"), obj.get("objectDomain") ): return False diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index da0e6c04869c65..c33fbb3d0bfc80 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -152,7 +152,7 @@ def _combine_identifier_parts( ) -> str: return f"{db_name}.{schema_name}.{table_name}" - def _is_dataset_pattern_allowed( + def is_dataset_pattern_allowed( self, dataset_name: Optional[str], dataset_type: Optional[str], diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 7770417a20bad8..68c289f8329695 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -218,12 +218,17 @@ class SqlAggregatorReport(Report): schema_resolver_count: Optional[int] = None num_unique_query_fingerprints: Optional[int] = None num_urns_with_lineage: Optional[int] = None - num_query_entities_generated: int = 0 + 
num_queries_entities_generated: int = 0 + num_lineage_skipped_due_to_filters: int = 0 # Usage-related. usage_skipped_missing_timestamp: int = 0 num_query_usage_stats_generated: int = 0 + # Operation-related. + num_operations_generated: int = 0 + num_operations_skipped_due_to_filters: int = 0 + def compute_stats(self) -> None: self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() self.num_unique_query_fingerprints = len(self._aggregator._query_map) @@ -245,11 +250,13 @@ def __init__( graph: Optional[DataHubGraph] = None, generate_lineage: bool = True, generate_queries: bool = True, + generate_query_subject_fields: bool = False, generate_usage_statistics: bool = False, generate_query_usage_statistics: bool = False, generate_operations: bool = False, usage_config: Optional[BaseUsageConfig] = None, is_temp_table: Optional[Callable[[UrnStr], bool]] = None, + is_allowed_table: Optional[Callable[[UrnStr], bool]] = None, format_queries: bool = True, query_log: QueryLogSetting = _DEFAULT_QUERY_LOG_SETTING, ) -> None: @@ -259,6 +266,7 @@ def __init__( self.generate_lineage = generate_lineage self.generate_queries = generate_queries + self.generate_query_subject_fields = generate_query_subject_fields self.generate_usage_statistics = generate_usage_statistics self.generate_query_usage_statistics = generate_query_usage_statistics self.generate_operations = generate_operations @@ -274,7 +282,8 @@ def __init__( self.report = SqlAggregatorReport(_aggregator=self) # can be used by BQ where we have a "temp_table_dataset_prefix" - self.is_temp_table = is_temp_table + self._is_temp_table = is_temp_table + self._is_allowed_table = is_allowed_table self.format_queries = format_queries self.query_log = query_log @@ -436,6 +445,18 @@ def _maybe_format_query(self, query: str) -> str: return try_format_query(query, self.platform.platform_name) return query + def is_temp_table(self, urn: UrnStr) -> bool: + if self._is_temp_table is None: + return False + return self._is_temp_table(urn) + + def is_allowed_table(self, urn: UrnStr) -> bool: + if self.is_temp_table(urn): + return False + if self._is_allowed_table is None: + return True + return self._is_allowed_table(urn) + def add( self, item: Union[KnownQueryLineageInfo, KnownLineageMapping, PreparsedQuery] ) -> None: @@ -691,8 +712,8 @@ def add_preparsed_query( else: upstream_fields = parsed.column_usage or {} for upstream_urn in parsed.upstreams: - # If the upstream table is a temp table, don't log usage for it. - if (self.is_temp_table and self.is_temp_table(upstream_urn)) or ( + # If the upstream table is a temp table or otherwise denied by filters, don't log usage for it. + if not self.is_allowed_table(upstream_urn) or ( require_out_table_schema and not self._schema_resolver.has_urn(upstream_urn) ): @@ -753,7 +774,7 @@ def add_preparsed_query( or ( not is_renamed_table and ( - (self.is_temp_table and self.is_temp_table(out_table)) + self.is_temp_table(out_table) or ( require_out_table_schema and not self._schema_resolver.has_urn(out_table) @@ -772,9 +793,10 @@ def add_preparsed_query( else: # Non-temp tables immediately generate lineage. - self._lineage_map.for_mutation(out_table, OrderedSet()).add( - query_fingerprint - ) + if self.is_allowed_table(out_table): + self._lineage_map.for_mutation(out_table, OrderedSet()).add( + query_fingerprint + ) def add_table_rename( self, @@ -980,6 +1002,10 @@ def _gen_lineage_mcps( # Generate lineage and queries. 
for downstream_urn in sorted(self._lineage_map): + if not self.is_allowed_table(downstream_urn): + self.report.num_lineage_skipped_due_to_filters += 1 + continue + yield from self._gen_lineage_for_downstream( downstream_urn, queries_generated=queries_generated ) @@ -1164,18 +1190,20 @@ def _gen_query( query_subject_urns: List[UrnStr] = [] for upstream in query.upstreams: query_subject_urns.append(upstream) - for column in query.column_usage.get(upstream, []): - query_subject_urns.append( - builder.make_schema_field_urn(upstream, column) - ) + if self.generate_query_subject_fields: + for column in query.column_usage.get(upstream, []): + query_subject_urns.append( + builder.make_schema_field_urn(upstream, column) + ) if downstream_urn: query_subject_urns.append(downstream_urn) - for column_lineage in query.column_lineage: - query_subject_urns.append( - builder.make_schema_field_urn( - downstream_urn, column_lineage.downstream.column + if self.generate_query_subject_fields: + for column_lineage in query.column_lineage: + query_subject_urns.append( + builder.make_schema_field_urn( + downstream_urn, column_lineage.downstream.column + ) ) - ) yield from MetadataChangeProposalWrapper.construct_many( entityUrn=self._query_urn(query_id), @@ -1200,7 +1228,7 @@ def _gen_query( ), ], ) - self.report.num_query_entities_generated += 1 + self.report.num_queries_entities_generated += 1 if self._query_usage_counts is not None: assert self.usage_config is not None @@ -1407,6 +1435,11 @@ def _gen_operation_for_downstream( # We don't generate operations for SELECTs. return + if not self.is_allowed_table(downstream_urn): + self.report.num_operations_skipped_due_to_filters += 1 + return + + self.report.num_operations_generated += 1 aspect = models.OperationClass( timestampMillis=make_ts_millis(datetime.now(tz=timezone.utc)), operationType=operation_type, From b789e3fbbabc638f5d263879a75802ab9454850d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 16:57:18 -0700 Subject: [PATCH 25/32] add schema fields to aggregator goldens --- .../sql_parsing/sql_parsing_aggregator.py | 2 +- .../test_add_known_query_lineage.json | 11 +++++- .../test_basic_lineage.json | 14 +++++++- .../test_column_lineage_deduplication.json | 34 +++++++++++++++++-- .../test_multistep_temp_table.json | 13 +++++-- .../test_overlapping_inserts.json | 28 +++++++++++++-- ..._overlapping_inserts_from_temp_tables.json | 31 ++++++++++++++--- .../aggregator_goldens/test_table_rename.json | 28 +++++++++++++-- .../aggregator_goldens/test_temp_table.json | 22 ++++++++++-- .../aggregator_goldens/test_view_lineage.json | 14 +++++++- 10 files changed, 178 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 68c289f8329695..eff6ba6075fd49 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -250,7 +250,7 @@ def __init__( graph: Optional[DataHubGraph] = None, generate_lineage: bool = True, generate_queries: bool = True, - generate_query_subject_fields: bool = False, + generate_query_subject_fields: bool = True, generate_usage_statistics: bool = False, generate_query_usage_statistics: bool = False, generate_operations: bool = False, diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json 
b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json index 3893b649bd5c8a..94c8947dba9ff1 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_add_known_query_lineage.json @@ -113,11 +113,20 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json index 2eb3753473d7d9..839a224a41b63c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_basic_lineage.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json index dd110a5d928df0..d3ec3843168188 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_column_lineage_deduplication.json @@ -93,11 +93,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } @@ -145,11 +157,29 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),c)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json index 5f3e81b7b9eb9b..a9b5a3a7cbefac 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_multistep_temp_table.json @@ -106,14 +106,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),a)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),b)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.prod_foo,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json index 49458e06b0bb91..fcbe0ec5aeb839 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts.json @@ -118,11 +118,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD),c)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream2,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),c)" } ] } @@ -170,11 +182,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + }, + { + "entity": 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.upstream1,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.downstream,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json index 9567aef095f9ac..48aecb90151804 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_overlapping_inserts_from_temp_tables.json @@ -168,9 +168,6 @@ "aspect": { "json": { "subjects": [ - { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" - }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" }, @@ -179,6 +176,21 @@ }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.online_survey,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_email)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_reason)" } ] } @@ -226,14 +238,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.customer,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_id)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.in_person_returns,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),customer_email)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.all_returns,PROD),return_date)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json index fc5e5ef879fe14..9a4d405e50a7a4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_table_rename.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": 
"urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.baz,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" } ] } @@ -184,11 +196,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json index 377e3e02c970ea..743e2738fc70c6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json +++ b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_temp_table.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } @@ -185,11 +197,17 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_session2,PROD),c)" } ] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json index bf2296c99356e8..973813dae6073c 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json +++ 
b/metadata-ingestion/tests/unit/sql_parsing/aggregator_goldens/test_view_lineage.json @@ -81,11 +81,23 @@ "aspect": { "json": { "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD),b)" + }, { "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD)" }, { - "entity": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)" + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)" } ] } From 69e782a7ab2b7ab286b5d4b4219a15ccf285e6a3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 17:17:26 -0700 Subject: [PATCH 26/32] fix test mock --- .../state/test_redundant_run_skip_handler.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 0400bd6a72aa5f..d3617acfa87aab 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone +from typing import Iterable from unittest import mock import pytest @@ -24,7 +25,7 @@ @pytest.fixture -def stateful_source(mock_datahub_graph: DataHubGraph) -> SnowflakeV2Source: +def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Source]: pipeline_name = "test_redundant_run_lineage" run_id = "test_redundant_run" ctx = PipelineContext( @@ -43,8 +44,9 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> SnowflakeV2Source: ), ), ) - source = SnowflakeV2Source(ctx=ctx, config=config) - return source + + with mock.patch("snowflake.connector.connect") as mock_connect: + yield SnowflakeV2Source(ctx=ctx, config=config) def test_redundant_run_job_ids(stateful_source: SnowflakeV2Source) -> None: From 0533c833ea70c752b510cd77b7b40d8bf39bd9cc Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 18:19:03 -0700 Subject: [PATCH 27/32] fix table filtering logic --- .../ingestion/source/redshift/lineage_v2.py | 10 +++- .../source/snowflake/snowflake_config.py | 1 + .../source/snowflake/snowflake_queries.py | 3 +- .../sql_parsing/sql_parsing_aggregator.py | 46 +++++++++++++------ 4 files changed, 45 insertions(+), 15 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 31efb6c5038a02..526e5e2cf12d02 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -105,7 +105,15 @@ def build( for schema, tables in schemas.items() for table in tables } - self.aggregator._is_temp_table = lambda urn: urn not in self.known_urns + self.aggregator._is_temp_table = ( + lambda name: DatasetUrn.create_from_ids( + self.platform, + name, + env=self.config.env, + platform_instance=self.config.platform_instance, + ).urn() + not in self.known_urns + ) # Handle all the temp tables up 
front. if self.config.resolve_temp_table_in_lineage: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 7c88b4689e10e6..eba9f63a44e904 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -44,6 +44,7 @@ r".*__DBT_TMP$", # dbt rf".*\.SEGMENT_{UUID_REGEX}", # segment rf".*\.STAGING_.*_{UUID_REGEX}", # stitch + r".*\.(ge_tmp_|ge_temp_|gx_temp_)[0-9a-f]{8}", # great expectations ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 6b4196367fcd49..d6fa134c4dd8f0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -165,7 +165,8 @@ def local_temp_path(self) -> pathlib.Path: def is_temp_table(self, name: str) -> bool: return any( - re.match(pattern, name) for pattern in self.config.temporary_tables_pattern + re.match(pattern, name, flags=re.IGNORECASE) + for pattern in self.config.temporary_tables_pattern ) def is_allowed_table(self, name: str) -> bool: diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index eff6ba6075fd49..d842705dd2f834 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1,6 +1,7 @@ import contextlib import dataclasses import enum +import functools import itertools import json import logging @@ -218,9 +219,12 @@ class SqlAggregatorReport(Report): schema_resolver_count: Optional[int] = None num_unique_query_fingerprints: Optional[int] = None num_urns_with_lineage: Optional[int] = None - num_queries_entities_generated: int = 0 num_lineage_skipped_due_to_filters: int = 0 + # Queries. + num_queries_entities_generated: int = 0 + num_queries_skipped_due_to_filters: int = 0 + # Usage-related. usage_skipped_missing_timestamp: int = 0 num_query_usage_stats_generated: int = 0 @@ -255,8 +259,8 @@ def __init__( generate_query_usage_statistics: bool = False, generate_operations: bool = False, usage_config: Optional[BaseUsageConfig] = None, - is_temp_table: Optional[Callable[[UrnStr], bool]] = None, - is_allowed_table: Optional[Callable[[UrnStr], bool]] = None, + is_temp_table: Optional[Callable[[str], bool]] = None, + is_allowed_table: Optional[Callable[[str], bool]] = None, format_queries: bool = True, query_log: QueryLogSetting = _DEFAULT_QUERY_LOG_SETTING, ) -> None: @@ -445,17 +449,27 @@ def _maybe_format_query(self, query: str) -> str: return try_format_query(query, self.platform.platform_name) return query + @functools.lru_cache(maxsize=128) + def _name_from_urn(self, urn: UrnStr) -> str: + name = DatasetUrn.from_string(urn).name + if ( + platform_instance := self._schema_resolver.platform_instance + ) and name.startswith(platform_instance): + # Remove the platform instance from the name. 
+ name = name[len(platform_instance) + 1 :] + return name + def is_temp_table(self, urn: UrnStr) -> bool: if self._is_temp_table is None: return False - return self._is_temp_table(urn) + return self._is_temp_table(self._name_from_urn(urn)) def is_allowed_table(self, urn: UrnStr) -> bool: if self.is_temp_table(urn): return False if self._is_allowed_table is None: return True - return self._is_allowed_table(urn) + return self._is_allowed_table(self._name_from_urn(urn)) def add( self, item: Union[KnownQueryLineageInfo, KnownLineageMapping, PreparsedQuery] @@ -793,10 +807,9 @@ def add_preparsed_query( else: # Non-temp tables immediately generate lineage. - if self.is_allowed_table(out_table): - self._lineage_map.for_mutation(out_table, OrderedSet()).add( - query_fingerprint - ) + self._lineage_map.for_mutation(out_table, OrderedSet()).add( + query_fingerprint + ) def add_table_rename( self, @@ -1002,10 +1015,6 @@ def _gen_lineage_mcps( # Generate lineage and queries. for downstream_urn in sorted(self._lineage_map): - if not self.is_allowed_table(downstream_urn): - self.report.num_lineage_skipped_due_to_filters += 1 - continue - yield from self._gen_lineage_for_downstream( downstream_urn, queries_generated=queries_generated ) @@ -1027,6 +1036,10 @@ def _query_type_precedence(cls, query_type: str) -> int: def _gen_lineage_for_downstream( self, downstream_urn: str, queries_generated: Set[QueryId] ) -> Iterable[MetadataChangeProposalWrapper]: + if not self.is_allowed_table(downstream_urn): + self.report.num_lineage_skipped_due_to_filters += 1 + return + query_ids = self._lineage_map[downstream_urn] queries: List[QueryMetadata] = [ self._resolve_query_with_temp_tables(self._query_map[query_id]) @@ -1187,6 +1200,13 @@ def _gen_query( if not self.can_generate_query(query_id): return + # If a query doesn't involve any allowed tables, skip it. 
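+        # Queries that feed an explicit downstream table are already filtered by the
+        # is_allowed_table() check added to _gen_lineage_for_downstream above, so this
+        # guard only needs to cover standalone queries with no downstream.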
+ if downstream_urn is None and not any( + self.is_allowed_table(urn) for urn in query.upstreams + ): + self.report.num_queries_skipped_due_to_filters += 1 + return + query_subject_urns: List[UrnStr] = [] for upstream in query.upstreams: query_subject_urns.append(upstream) From c8c9ac541a8d88dff0aa6018000ec5d953ec95f8 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 18:23:23 -0700 Subject: [PATCH 28/32] fix filtering logic --- .../ingestion/source/snowflake/snowflake_config.py | 6 +++--- .../ingestion/source/snowflake/snowflake_queries.py | 4 ++-- .../ingestion/source/snowflake/snowflake_query.py | 10 ++++++---- metadata-ingestion/tests/unit/test_snowflake_source.py | 8 +++++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index eba9f63a44e904..f6247eb949417b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -39,12 +39,12 @@ # # DBT incremental models create temporary tables ending with __dbt_tmp # Ref - https://discourse.getdbt.com/t/handling-bigquery-incremental-dbt-tmp-tables/7540 -DEFAULT_TABLES_DENY_LIST = [ +DEFAULT_TEMP_TABLES_PATTERNS = [ r".*\.FIVETRAN_.*_STAGING\..*", # fivetran r".*__DBT_TMP$", # dbt rf".*\.SEGMENT_{UUID_REGEX}", # segment rf".*\.STAGING_.*_{UUID_REGEX}", # stitch - r".*\.(ge_tmp_|ge_temp_|gx_temp_)[0-9a-f]{8}", # great expectations + r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}", # great expectations ] @@ -240,7 +240,7 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( - default=DEFAULT_TABLES_DENY_LIST, + default=DEFAULT_TEMP_TABLES_PATTERNS, description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to " "match the entire table name in database.schema.table format. Defaults are to set in such a way " "to ignore the temporary staging tables created by known ETL tools.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index d6fa134c4dd8f0..93c18f562dbb9e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -22,7 +22,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import ( - DEFAULT_TABLES_DENY_LIST, + DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeFilterConfig, SnowflakeIdentifierConfig, ) @@ -62,7 +62,7 @@ class SnowflakeQueriesExtractorConfig(SnowflakeIdentifierConfig, SnowflakeFilter deny_usernames: List[str] = [] temporary_tables_pattern: List[str] = pydantic.Field( - default=DEFAULT_TABLES_DENY_LIST, + default=DEFAULT_TEMP_TABLES_PATTERNS, description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to " "match the entire table name in database.schema.table format. 
Defaults are to set in such a way " "to ignore the temporary staging tables created by known ETL tools.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index f2eb8efcf2f797..a2e18a64d9a809 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -3,7 +3,9 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain -from datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST +from datahub.ingestion.source.snowflake.snowflake_config import ( + DEFAULT_TEMP_TABLES_PATTERNS, +) from datahub.utilities.prefix_batch_builder import PrefixGroup SHOW_VIEWS_MAX_PAGE_SIZE = 10000 @@ -358,7 +360,7 @@ def table_to_table_lineage_history_v2( end_time_millis: int, include_view_lineage: bool = True, include_column_lineage: bool = True, - upstreams_deny_pattern: List[str] = DEFAULT_TABLES_DENY_LIST, + upstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: if include_column_lineage: return SnowflakeQuery.table_upstreams_with_column_lineage( @@ -409,7 +411,7 @@ def show_external_tables() -> str: def copy_lineage_history( start_time_millis: int, end_time_millis: int, - downstreams_deny_pattern: List[str] = DEFAULT_TABLES_DENY_LIST, + downstreams_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: temp_table_filter = create_deny_regex_sql_filter( downstreams_deny_pattern, @@ -452,7 +454,7 @@ def usage_per_object_per_time_bucket_for_time_window( include_top_n_queries: bool, email_domain: Optional[str], email_filter: AllowDenyPattern, - table_deny_pattern: List[str] = DEFAULT_TABLES_DENY_LIST, + table_deny_pattern: List[str] = DEFAULT_TEMP_TABLES_PATTERNS, ) -> str: if not include_top_n_queries: top_n_queries = 0 diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 2d9be91d94deb6..3353e74449c957 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -14,7 +14,7 @@ SnowflakeCloudProvider, ) from datahub.ingestion.source.snowflake.snowflake_config import ( - DEFAULT_TABLES_DENY_LIST, + DEFAULT_TEMP_TABLES_PATTERNS, SnowflakeV2Config, ) from datahub.ingestion.source.snowflake.snowflake_query import ( @@ -535,8 +535,10 @@ def test_snowflake_query_create_deny_regex_sql(): ) assert ( - create_deny_regex_sql_filter(DEFAULT_TABLES_DENY_LIST, ["upstream_table_name"]) - == r"NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i') AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') AND NOT RLIKE(upstream_table_name,'.*\.SEGMENT_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.STAGING_.*_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i')" + create_deny_regex_sql_filter( + DEFAULT_TEMP_TABLES_PATTERNS, ["upstream_table_name"] + ) + == r"NOT RLIKE(upstream_table_name,'.*\.FIVETRAN_.*_STAGING\..*','i') AND NOT RLIKE(upstream_table_name,'.*__DBT_TMP$','i') AND NOT RLIKE(upstream_table_name,'.*\.SEGMENT_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT 
RLIKE(upstream_table_name,'.*\.STAGING_.*_[a-f0-9]{8}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{4}[-_][a-f0-9]{12}','i') AND NOT RLIKE(upstream_table_name,'.*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}','i')" ) From 4c4fb2275f6b2bdb512bcbebd5b4f87e4f08b9c9 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 18:25:18 -0700 Subject: [PATCH 29/32] fix dup subjects for merge statements --- .../src/datahub/sql_parsing/sql_parsing_aggregator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index d842705dd2f834..677b96269fe586 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -1207,19 +1207,19 @@ def _gen_query( self.report.num_queries_skipped_due_to_filters += 1 return - query_subject_urns: List[UrnStr] = [] + query_subject_urns = OrderedSet[UrnStr]() for upstream in query.upstreams: - query_subject_urns.append(upstream) + query_subject_urns.add(upstream) if self.generate_query_subject_fields: for column in query.column_usage.get(upstream, []): - query_subject_urns.append( + query_subject_urns.add( builder.make_schema_field_urn(upstream, column) ) if downstream_urn: - query_subject_urns.append(downstream_urn) + query_subject_urns.add(downstream_urn) if self.generate_query_subject_fields: for column_lineage in query.column_lineage: - query_subject_urns.append( + query_subject_urns.add( builder.make_schema_field_urn( downstream_urn, column_lineage.downstream.column ) From 2e31235b2605da236a381a4efe6960ccdb7c3fa4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 19:53:14 -0700 Subject: [PATCH 30/32] fix lint --- .../stateful_ingestion/state/test_redundant_run_skip_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index d3617acfa87aab..be6efd3e121ff1 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -45,7 +45,7 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect") as mock_connect: + with mock.patch("snowflake.connector.connect"): yield SnowflakeV2Source(ctx=ctx, config=config) From a23f86cbcfa53f05ef46983b91bb2e1bb185fc28 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 10 Jul 2024 21:29:54 -0700 Subject: [PATCH 31/32] fix(build): upgrade vercel builds to Node 20.x --- docs-website/vercel-setup.sh | 10 +++++----- metadata-ingestion/scripts/install_deps.sh | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs-website/vercel-setup.sh b/docs-website/vercel-setup.sh index 01a1ee65b34f01..4bb40eaddf4775 100755 --- a/docs-website/vercel-setup.sh +++ b/docs-website/vercel-setup.sh @@ -4,6 +4,10 @@ set -euxo pipefail ./metadata-ingestion/scripts/install_deps.sh +# Set up java version for gradle +yum install java-17-amazon-corretto -y +java --version + # Build python from source. # Amazon Linux 2 has Python 3.8, but it's version of OpenSSL is super old and hence it # doesn't work with the packages we use. 
As such, we have to build Python from source. @@ -11,8 +15,7 @@ set -euxo pipefail # for reuse. yum groupinstall "Development Tools" -y -yum erase openssl-devel -y -yum install openssl11 openssl11-devel libffi-devel bzip2-devel wget nodejs -y +yum install openssl openssl-devel libffi-devel bzip2-devel wget nodejs -y wget https://www.python.org/ftp/python/3.10.11/Python-3.10.11.tgz tar -xf Python-3.10.11.tgz @@ -29,6 +32,3 @@ rm "$py3" ln "$(which python3.10)" "$py3" python3 --version -# Set up java version for gradle -yum install java-17-amazon-corretto -java --version \ No newline at end of file diff --git a/metadata-ingestion/scripts/install_deps.sh b/metadata-ingestion/scripts/install_deps.sh index bae0278056ebbd..80a07cb04cb447 100755 --- a/metadata-ingestion/scripts/install_deps.sh +++ b/metadata-ingestion/scripts/install_deps.sh @@ -18,7 +18,8 @@ else sqlite-devel \ xz-devel \ libxml2-devel \ - libxslt-devel + libxslt-devel \ + krb5-devel else $sudo_cmd apt-get update && $sudo_cmd apt-get install -y \ python3-ldap \ From 9d4334358c5dccc1e435222e2ef7d94f4e5f3449 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 11 Jul 2024 12:59:19 -0700 Subject: [PATCH 32/32] fix import --- .../src/datahub/ingestion/source/snowflake/snowflake_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 93c18f562dbb9e..c647a624a54673 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -36,7 +36,7 @@ SnowflakeIdentifierMixin, ) from datahub.ingestion.source.usage.usage_common import BaseUsageConfig -from datahub.metadata._urns.urn_defs import CorpUserUrn +from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import ( KnownLineageMapping, PreparsedQuery,
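
A minimal, self-contained sketch of the name-based temp-table filtering introduced in this series (patches 27-28). It is illustrative only: the helper name looks_like_temp_table and the abridged pattern list are assumptions for this example, not code from the patches.

import re

# Abridged from DEFAULT_TEMP_TABLES_PATTERNS in snowflake_config.py; the second
# entry is the great-expectations pattern added in this series.
TEMP_TABLE_PATTERNS = [
    r".*__DBT_TMP$",  # dbt incremental models
    r".*\.(GE_TMP_|GE_TEMP_|GX_TEMP_)[0-9A-F]{8}",  # great expectations
]


def looks_like_temp_table(name: str) -> bool:
    # Case-insensitive matching mirrors is_temp_table() in snowflake_queries.py,
    # so lowercased names still hit the uppercase patterns.
    return any(
        re.match(pattern, name, flags=re.IGNORECASE)
        for pattern in TEMP_TABLE_PATTERNS
    )


assert looks_like_temp_table("analytics.public.orders__dbt_tmp")
assert looks_like_temp_table("DB.SCHEMA.GE_TMP_1A2B3C4D")
assert looks_like_temp_table("db.schema.gx_temp_deadbeef")
assert not looks_like_temp_table("db.schema.orders")

Keeping the patterns uppercase matches Snowflake's uppercase identifier convention, while re.IGNORECASE presumably keeps lowercased identifiers, as emitted by the identifier config, matching as well.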