diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 5c9f13a9c8879..5b896bc41cbf5 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -12,6 +12,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Other notable Changes +- #6611 - Snowflake `schema_pattern` now accepts pattern for fully qualified schema name in format `.` by setting config `match_fully_qualified_names : True`. Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + ## 0.9.3 ### Breaking Changes diff --git a/metadata-ingestion/src/datahub/configuration/pattern_utils.py b/metadata-ingestion/src/datahub/configuration/pattern_utils.py new file mode 100644 index 0000000000000..313e68c41812f --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/pattern_utils.py @@ -0,0 +1,13 @@ +from datahub.configuration.common import AllowDenyPattern + + +def is_schema_allowed( + schema_pattern: AllowDenyPattern, + schema_name: str, + db_name: str, + match_fully_qualified_schema_name: bool, +) -> bool: + if match_fully_qualified_schema_name: + return schema_pattern.allowed(f"{db_name}.{schema_name}") + else: + return schema_pattern.allowed(schema_name) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 43a14228e0efd..643ba4f1db579 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -49,6 +49,11 @@ class SnowflakeV2Config(SnowflakeConfig, SnowflakeUsageConfig): description="Whether to populate Snowsight url for Snowflake Objects", ) + match_fully_qualified_names = bool = Field( + default=False, + description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", + ) + @root_validator(pre=False) def validate_unsupported_configs(cls, values: Dict) -> Dict: @@ -70,11 +75,26 @@ def validate_unsupported_configs(cls, values: Dict) -> Dict: "include_read_operational_stats is not supported. Set `include_read_operational_stats` to False.", ) + match_fully_qualified_names = values.get("match_fully_qualified_names") + + schema_pattern: Optional[AllowDenyPattern] = values.get("schema_pattern") + + if ( + schema_pattern is not None + and schema_pattern != AllowDenyPattern.allow_all() + and match_fully_qualified_names is not None + and not match_fully_qualified_names + ): + logger.warning( + "Please update `schema_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." + "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " + "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + ) + # Always exclude reporting metadata for INFORMATION_SCHEMA schema - schema_pattern = values.get("schema_pattern") if schema_pattern is not None and schema_pattern: logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.") - cast(AllowDenyPattern, schema_pattern).deny.append(r"^INFORMATION_SCHEMA$") + cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$") include_technical_schema = values.get("include_technical_schema") include_profiles = ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 9fe9696a0f2a0..2cc2c9100199c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -7,6 +7,7 @@ from sqlalchemy import create_engine, inspect from sqlalchemy.sql import sqltypes +from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance from datahub.ingestion.api.common import WorkUnit from datahub.ingestion.source.ge_data_profiler import ( @@ -55,7 +56,12 @@ def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit continue profile_requests = [] for schema in db.schemas: - if not self.config.schema_pattern.allowed(schema.name): + if not is_schema_allowed( + self.config.schema_pattern, + schema.name, + db.name, + self.config.match_fully_qualified_names, + ): continue for table in schema.tables: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index dadae620956c7..c3b9be555a4f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -7,6 +7,7 @@ from snowflake.connector.cursor import DictCursor from typing_extensions import Protocol +from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config @@ -136,7 +137,12 @@ def _is_dataset_pattern_allowed( if not self.config.database_pattern.allowed( dataset_params[0].strip('"') - ) or not self.config.schema_pattern.allowed(dataset_params[1].strip('"')): + ) or not is_schema_allowed( + self.config.schema_pattern, + dataset_params[1].strip('"'), + dataset_params[0].strip('"'), + self.config.match_fully_qualified_names, + ): return False if dataset_type.lower() in {"table"} and not self.config.table_pattern.allowed( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 352da03ce83f5..4c96e0fb32edb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -7,6 +7,7 @@ import pydantic from snowflake.connector import SnowflakeConnection +from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter.mce_builder import ( make_container_urn, make_data_platform_urn, @@ -508,7 +509,12 @@ def _process_database( self.report.report_entity_scanned(snowflake_schema.name, "schema") - if not self.config.schema_pattern.allowed(snowflake_schema.name): + if not is_schema_allowed( + self.config.schema_pattern, + snowflake_schema.name, + db_name, + self.config.match_fully_qualified_names, + ): self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*") continue diff --git a/metadata-ingestion/tests/integration/snowflake-beta/test_snowflake_beta.py b/metadata-ingestion/tests/integration/snowflake-beta/test_snowflake_beta.py index cb09b529b0d38..7945b7f1fa152 100644 --- a/metadata-ingestion/tests/integration/snowflake-beta/test_snowflake_beta.py +++ b/metadata-ingestion/tests/integration/snowflake-beta/test_snowflake_beta.py @@ -55,7 +55,13 @@ def default_query_results(query): "CREATED": datetime(2021, 6, 8, 0, 0, 0, 0), "LAST_ALTERED": datetime(2021, 6, 8, 0, 0, 0, 0), "COMMENT": "comment for TEST_DB.TEST_SCHEMA", - } + }, + { + "SCHEMA_NAME": "TEST2_SCHEMA", + "CREATED": datetime(2021, 6, 8, 0, 0, 0, 0), + "LAST_ALTERED": datetime(2021, 6, 8, 0, 0, 0, 0), + "COMMENT": "comment for TEST_DB.TEST_SCHEMA", + }, ] elif query == SnowflakeQuery.tables_for_database("TEST_DB"): return [ @@ -339,7 +345,8 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): username="TST_USR", password="TST_PWD", include_views=False, - table_pattern=AllowDenyPattern(allow=["test_db.test_schema.*"]), + match_fully_qualified_names=True, + schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_technical_schema=True, include_table_lineage=True, include_view_lineage=False, @@ -408,7 +415,7 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ username="TST_USR", password="TST_PWD", include_views=False, - table_pattern=AllowDenyPattern(allow=["test_db.test_schema.*"]), + schema_pattern=AllowDenyPattern(allow=["test_schema"]), include_technical_schema=True, include_table_lineage=False, include_view_lineage=False,