feat(ingest): bigquery/snowflake - Store last profile date in state #6832

Merged
merged 10 commits on Dec 28, 2022
@@ -62,6 +62,10 @@
from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor
from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler
from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
from datahub.ingestion.source.state.redundant_run_skip_handler import (
RedundantRunSkipHandler,
)
from datahub.ingestion.source.state.sql_common_state import (
BaseSQLAlchemyCheckpointState,
)
@@ -112,6 +116,7 @@
auto_stale_entity_removal,
auto_status_aspect,
)
from datahub.utilities.time import datetime_to_ts_millis

logger: logging.Logger = logging.getLogger(__name__)

@@ -206,7 +211,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
# For database, schema, tables, views, etc
self.lineage_extractor = BigqueryLineageExtractor(config, self.report)
self.usage_extractor = BigQueryUsageExtractor(config, self.report)
self.profiler = BigqueryProfiler(config, self.report)

# Currently caching using instance variables
# TODO - rewrite cache for readability or use out of the box solution
@@ -231,6 +235,25 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
cached_domains=[k for k in self.config.domain], graph=self.ctx.graph
)

self.redundant_run_skip_handler = RedundantRunSkipHandler(
source=self,
config=self.config,
pipeline_name=self.ctx.pipeline_name,
run_id=self.ctx.run_id,
)

self.profiling_state_handler: Optional[ProfilingHandler] = None
if self.config.enable_profiling_state:
self.profiling_state_handler = ProfilingHandler(
source=self,
config=self.config,
pipeline_name=self.ctx.pipeline_name,
run_id=self.ctx.run_id,
)
self.profiler = BigqueryProfiler(
config, self.report, self.profiling_state_handler
)

atexit.register(cleanup, config)
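
For orientation, here is a self-contained toy sketch of the idea behind the profiling state handler wired up above: remember, per dataset URN, when it was last profiled, so that a later run can decide whether profiling is still fresh. This class is purely illustrative and is not DataHub's ProfilingHandler.

import time
from typing import Dict, Optional

class ToyProfilingState:
    """Toy stand-in for a profiling checkpoint: dataset URN mapped to last profile time (epoch millis)."""

    def __init__(self) -> None:
        self._last_profiled: Dict[str, int] = {}

    def add_to_state(self, dataset_urn: str, profile_time_millis: int) -> None:
        # Record when this dataset was last profiled.
        self._last_profiled[dataset_urn] = profile_time_millis

    def last_profiled(self, dataset_urn: str) -> Optional[int]:
        return self._last_profiled.get(dataset_urn)

    def is_fresh(self, dataset_urn: str, max_age_millis: int) -> bool:
        # A later run could skip re-profiling datasets profiled recently enough.
        last = self._last_profiled.get(dataset_urn)
        if last is None:
            return False
        age_millis = int(time.time() * 1000) - last
        return max_age_millis > age_millis

The real handler persists its state through DataHub's stateful ingestion checkpointing rather than an in-memory dictionary.
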

@classmethod
@@ -602,9 +625,48 @@ def _process_project(
continue

if self.config.include_table_lineage:
if (
self.config.enable_lineage_extraction_state
and self.redundant_run_skip_handler.should_skip_this_run(
cur_start_time_millis=datetime_to_ts_millis(self.config.start_time)
)
):
# Skip this run
self.report.report_warning(
"lineage-extraction",
f"Skip this run as there was a run later than the current start time: {self.config.start_time}",
)
return

if self.config.enable_lineage_extraction_state:
# Update the checkpoint state for this run.
self.redundant_run_skip_handler.update_state(
start_time_millis=datetime_to_ts_millis(self.config.start_time),
end_time_millis=datetime_to_ts_millis(self.config.end_time),
)

yield from self.generate_lineage(project_id)

if self.config.include_usage_statistics:
if (
self.config.enable_usage_extraction_state
and self.redundant_run_skip_handler.should_skip_this_run(
cur_start_time_millis=datetime_to_ts_millis(self.config.start_time)
)
):
self.report.report_warning(
"usage-extraction",
f"Skip this run as there was a run later than the current start time: {self.config.start_time}",
)
return

if self.config.enable_usage_extraction_state:
# Update the checkpoint state for this run.
self.redundant_run_skip_handler.update_state(
start_time_millis=datetime_to_ts_millis(self.config.start_time),
end_time_millis=datetime_to_ts_millis(self.config.end_time),
)

yield from self.generate_usage_statistics(project_id)
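
The guards added above follow one pattern for both lineage and usage extraction: before processing a time window, ask the skip handler whether a previous run already covered a window starting at or after the current start time; if not, record the new window. A minimal, self-contained toy of that pattern (not the actual RedundantRunSkipHandler implementation):

from datetime import datetime, timezone
from typing import Optional

class ToyRunSkipState:
    """Toy stand-in for the redundant-run checkpoint: remembers the last extracted window."""

    def __init__(self) -> None:
        self.last_start_millis: Optional[int] = None
        self.last_end_millis: Optional[int] = None

    def should_skip_this_run(self, cur_start_time_millis: int) -> bool:
        # Skip when a previous run already started at or after the current start time.
        return (
            self.last_start_millis is not None
            and self.last_start_millis >= cur_start_time_millis
        )

    def update_state(self, start_time_millis: int, end_time_millis: int) -> None:
        self.last_start_millis = start_time_millis
        self.last_end_millis = end_time_millis

state = ToyRunSkipState()
start_millis = int(datetime(2022, 12, 1, tzinfo=timezone.utc).timestamp() * 1000)
end_millis = int(datetime(2022, 12, 2, tzinfo=timezone.utc).timestamp() * 1000)
if not state.should_skip_this_run(start_millis):
    state.update_state(start_millis, end_millis)
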

def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]:
@@ -4,7 +4,12 @@

from pydantic import Field, PositiveInt, root_validator

from datahub.configuration.common import AllowDenyPattern, LineageConfig
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.source.state.stateful_ingestion_base import (
LineageStatefulIngestionConfig,
ProfilingStatefulIngestionConfig,
UsageStatefulIngestionConfig,
)
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
from datahub.ingestion.source_config.sql.bigquery import BigQueryConfig

@@ -23,7 +28,12 @@ class BigQueryUsageConfig(BaseUsageConfig):
)


class BigQueryV2Config(BigQueryConfig, LineageConfig):
class BigQueryV2Config(
BigQueryConfig,
LineageStatefulIngestionConfig,
UsageStatefulIngestionConfig,
ProfilingStatefulIngestionConfig,
):
project_id_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Regex patterns for project_id to filter in ingestion.",
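
With the stateful-ingestion mixins added to BigQueryV2Config, the flags referenced in the source above (enable_lineage_extraction_state, enable_usage_extraction_state, enable_profiling_state) become ordinary config options. The construction below is hypothetical and for illustration only; the module path is an assumption and a real recipe would also need connection details and the usual required settings.

from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

# Hypothetical: the stateful flag names come from this diff; everything else
# shown here is a minimal placeholder, not a complete working recipe.
config = BigQueryV2Config.parse_obj(
    {
        "project_id_pattern": {"allow": ["my-project.*"]},
        "enable_lineage_extraction_state": True,
        "enable_usage_extraction_state": True,
        "enable_profiling_state": True,
    }
)
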
@@ -20,6 +20,7 @@
GenericProfiler,
TableProfilerRequest,
)
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler

logger = logging.getLogger(__name__)

@@ -34,8 +35,13 @@ class BigqueryProfiler(GenericProfiler):
config: BigQueryV2Config
report: BigQueryV2Report

def __init__(self, config: BigQueryV2Config, report: BigQueryV2Report) -> None:
super().__init__(config, report, "bigquery")
def __init__(
self,
config: BigQueryV2Config,
report: BigQueryV2Report,
state_handler: Optional[ProfilingHandler] = None,
) -> None:
super().__init__(config, report, "bigquery", state_handler)
self.config = config
self.report = report

@@ -164,17 +170,16 @@ def get_workunits(
continue

for table in tables[project][dataset]:
normalized_table_name = BigqueryTableIdentifier(
project_id=project, dataset=dataset, table=table.name
).get_table_name()
for column in table.columns:
# Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
# We also filter columns without data type as it means that column is part of a complex type.
if not column.data_type or any(
word in column.data_type.lower()
for word in ["array", "struct", "geography", "json"]
):
normalized_table_name = BigqueryTableIdentifier(
project_id=project, dataset=dataset, table=table.name
).get_table_name()

self.config.profile_pattern.deny.append(
f"^{normalized_table_name}.{column.field_path}$"
)
@@ -188,40 +193,51 @@

if len(profile_requests) == 0:
continue
table_profile_requests = cast(List[TableProfilerRequest], profile_requests)
for request, profile in self.generate_profiles(
table_profile_requests,
self.config.profiling.max_workers,
platform=self.platform,
profiler_args=self.get_profile_args(),
):
if request is None or profile is None:
continue
yield from self.generate_wu_from_profile_requests(profile_requests)

def generate_wu_from_profile_requests(
self, profile_requests: List[BigqueryProfilerRequest]
) -> Iterable[MetadataWorkUnit]:
table_profile_requests = cast(List[TableProfilerRequest], profile_requests)
for request, profile in self.generate_profiles(
table_profile_requests,
self.config.profiling.max_workers,
platform=self.platform,
profiler_args=self.get_profile_args(),
):
if request is None or profile is None:
continue

request = cast(BigqueryProfilerRequest, request)
profile.sizeInBytes = request.table.size_in_bytes
# If table is partitioned we profile only one partition (if nothing set then the last one)
# but for table level we can use the rows_count from the table metadata
# This way even though column statistics only reflects one partition data but the rows count
# shows the proper count.
if profile.partitionSpec and profile.partitionSpec.partition:
profile.rowCount = request.table.rows_count
request = cast(BigqueryProfilerRequest, request)
profile.sizeInBytes = request.table.size_in_bytes
# If table is partitioned we profile only one partition (if nothing set then the last one)
# but for table level we can use the rows_count from the table metadata
# This way even though column statistics only reflects one partition data but the rows count
# shows the proper count.
if profile.partitionSpec and profile.partitionSpec.partition:
profile.rowCount = request.table.rows_count

dataset_name = request.pretty_name
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
dataset_name,
self.config.platform_instance,
self.config.env,
)
wu = wrap_aspect_as_workunit(
"dataset",
dataset_urn,
"datasetProfile",
profile,
dataset_name = request.pretty_name
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
dataset_name,
self.config.platform_instance,
self.config.env,
)
# We don't add to the profiler state if we only do table level profiling as it always happens
if self.state_handler and not request.profile_table_level_only:
self.state_handler.add_to_state(
dataset_urn, int(datetime.datetime.utcnow().timestamp() * 1000)
)
self.report.report_workunit(wu)
yield wu

wu = wrap_aspect_as_workunit(
"dataset",
dataset_urn,
"datasetProfile",
profile,
)
self.report.report_workunit(wu)
yield wu
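
The value recorded in the profiling state above is an epoch timestamp in milliseconds taken when the profile work unit is emitted (the diff uses datetime.utcnow()). A small illustrative sketch of producing such a value with timezone-aware datetimes and converting it back for display:

import datetime

# Millisecond epoch timestamp, comparable to the value stored as the last profile date.
profile_time_millis = int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1000)

# Converting back, e.g. to report when a dataset was last profiled.
last_profiled_at = datetime.datetime.fromtimestamp(
    profile_time_millis / 1000, tz=datetime.timezone.utc
)
print(last_profiled_at.isoformat())
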

def get_bigquery_profile_request(
self, project: str, dataset: str, table: BigqueryTable
@@ -5,6 +5,10 @@

from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.glossary.classifier import ClassificationConfig
from datahub.ingestion.source.state.stateful_ingestion_base import (
ProfilingStatefulIngestionConfig,
UsageStatefulIngestionConfig,
)
from datahub.ingestion.source_config.sql.snowflake import (
BaseSnowflakeConfig,
SnowflakeConfig,
@@ -15,7 +19,12 @@
logger = logging.Logger(__name__)


class SnowflakeV2Config(SnowflakeConfig, SnowflakeUsageConfig):
class SnowflakeV2Config(
SnowflakeConfig,
SnowflakeUsageConfig,
UsageStatefulIngestionConfig,
ProfilingStatefulIngestionConfig,
):
convert_urns_to_lowercase: bool = Field(
default=True,
)
@@ -54,7 +63,7 @@ class SnowflakeV2Config(SnowflakeConfig, SnowflakeUsageConfig):
description="Whether to populate Snowsight url for Snowflake Objects",
)

match_fully_qualified_names = bool = Field(
match_fully_qualified_names: bool = Field(
default=False,
description="Whether `schema_pattern` is matched against fully qualified schema name `<catalog>.<schema>`.",
)
@@ -1,5 +1,6 @@
import dataclasses
import logging
from datetime import datetime
from typing import Callable, Iterable, List, Optional, cast

from snowflake.sqlalchemy import snowdialect
@@ -28,6 +29,7 @@
GenericProfiler,
TableProfilerRequest,
)
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler

snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType

@@ -44,8 +46,13 @@ class SnowflakeProfiler(SnowflakeCommonMixin, GenericProfiler, SnowflakeCommonPr
config: SnowflakeV2Config
report: SnowflakeV2Report

def __init__(self, config: SnowflakeV2Config, report: SnowflakeV2Report) -> None:
super().__init__(config, report, self.platform)
def __init__(
self,
config: SnowflakeV2Config,
report: SnowflakeV2Report,
state_handler: Optional[ProfilingHandler] = None,
) -> None:
super().__init__(config, report, self.platform, state_handler)
self.config = config
self.report = report
self.logger = logger
@@ -102,6 +109,13 @@ def get_workunits(self, databases: List[SnowflakeDatabase]) -> Iterable[WorkUnit
self.config.platform_instance,
self.config.env,
)

# We don't add to the profiler state if we only do table level profiling as it always happens
if self.state_handler:
self.state_handler.add_to_state(
dataset_urn, int(datetime.utcnow().timestamp() * 1000)
)

yield self.wrap_aspect_as_workunit(
"dataset",
dataset_urn,
@@ -68,6 +68,7 @@
SnowflakeQueryMixin,
)
from datahub.ingestion.source.sql.sql_common import SqlContainerSubTypes
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
from datahub.ingestion.source.state.redundant_run_skip_handler import (
RedundantRunSkipHandler,
)
@@ -225,9 +226,20 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config):
# For usage stats
self.usage_extractor = SnowflakeUsageExtractor(config, self.report)

self.profiling_state_handler: Optional[ProfilingHandler] = None
if self.config.enable_profiling_state:
self.profiling_state_handler = ProfilingHandler(
source=self,
config=self.config,
pipeline_name=self.ctx.pipeline_name,
run_id=self.ctx.run_id,
)

if config.profiling.enabled:
# For profiling
self.profiler = SnowflakeProfiler(config, self.report)
self.profiler = SnowflakeProfiler(
config, self.report, self.profiling_state_handler
)

if self.is_classification_enabled():
self.classifiers = self.get_classifiers()
@@ -455,17 +467,25 @@ def get_workunits(self) -> Iterable[WorkUnit]:
yield from self.profiler.get_workunits(databases)

if self.config.include_usage_stats or self.config.include_operational_stats:
if self.redundant_run_skip_handler.should_skip_this_run(
cur_start_time_millis=datetime_to_ts_millis(self.config.start_time)
if (
self.config.enable_usage_extraction_state
and self.redundant_run_skip_handler.should_skip_this_run(
cur_start_time_millis=datetime_to_ts_millis(self.config.start_time)
)
):
# Skip this run
self.report.report_warning(
"usage-extraction",
f"Skip this run as there was a run later than the current start time: {self.config.start_time}",
)
return

# Update the checkpoint state for this run.
self.redundant_run_skip_handler.update_state(
start_time_millis=datetime_to_ts_millis(self.config.start_time),
end_time_millis=datetime_to_ts_millis(self.config.end_time),
)
if self.config.enable_usage_extraction_state:
# Update the checkpoint state for this run.
self.redundant_run_skip_handler.update_state(
start_time_millis=datetime_to_ts_millis(self.config.start_time),
end_time_millis=datetime_to_ts_millis(self.config.end_time),
)

discovered_datasets: List[str] = [
self.get_dataset_identifier(table.name, schema.name, db.name)
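
Both sources convert the configured start_time/end_time to epoch milliseconds via datetime_to_ts_millis before checking or updating the checkpoint state. Its implementation is not shown in this diff; an assumed, roughly equivalent helper for timezone-aware datetimes might look like this:

from datetime import datetime, timezone

def datetime_to_ts_millis(dt: datetime) -> int:
    # Assumed behavior: epoch milliseconds for a timezone-aware datetime.
    return int(round(dt.timestamp() * 1000))

window_start = datetime(2022, 12, 27, tzinfo=timezone.utc)
window_end = datetime(2022, 12, 28, tzinfo=timezone.utc)
print(datetime_to_ts_millis(window_start), datetime_to_ts_millis(window_end))
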