From ec243d179baa2cb4f53d1c7e925b2306f7cf35c8 Mon Sep 17 00:00:00 2001
From: treff7es
Date: Tue, 6 Dec 2022 16:13:13 +0100
Subject: [PATCH] Disable running the SQL parser in a separate process by default

Fix adding views to the global view list
---
 .../src/datahub/configuration/common.py       |   5 +
 .../ingestion/source/bigquery_v2/bigquery.py  |   7 +-
 .../ingestion/source/bigquery_v2/lineage.py   |   8 +-
 .../ingestion/source/bigquery_v2/profiler2.py | 272 ------------------
 .../datahub/utilities/bigquery_sql_parser.py  |   4 +-
 5 files changed, 18 insertions(+), 278 deletions(-)
 delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler2.py

diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py
index 95d852bbe7b606..72982bce93a6fd 100644
--- a/metadata-ingestion/src/datahub/configuration/common.py
+++ b/metadata-ingestion/src/datahub/configuration/common.py
@@ -224,3 +224,8 @@ class LineageConfig(ConfigModel):
         default=True,
         description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.",
     )
+
+    sql_parser_use_external_process: bool = Field(
+        default=False,
+        description="When enabled, the SQL parser runs isolated in a separate process. This can affect processing time but protects against the SQL parser's memory leak.",
+    )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
index a48c05a7b8fe3c..1ad8512b9ef2d0 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -698,6 +698,11 @@ def _process_view(
             conn, table_identifier, column_limit=self.config.column_limit
         )
 
+        if dataset_name not in self.db_views[project_id]:
+            self.db_views[project_id][dataset_name] = []
+
+        self.db_views[project_id][dataset_name].append(view)
+
         view_workunits = self.gen_view_dataset_workunits(view, project_id, dataset_name)
         for wu in view_workunits:
             self.report.report_workunit(wu)
@@ -1142,8 +1147,6 @@ def get_views_for_dataset(
 
         views = self.db_views.get(project_id)
 
-        # get all views for database failed,
-        # falling back to get views for schema
         if not views:
             return BigQueryDataDictionary.get_views_for_dataset(
                 conn, project_id, dataset_name, self.config.profiling.enabled
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
index 16505f4d27dc91..a4dd52e8d834a7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py
@@ -431,7 +431,9 @@ def _create_lineage_map(self, entries: Iterable[QueryEvent]) -> Dict[str, Set[st
                 # in the references. There is no distinction between direct/base objects accessed. So doing sql parsing
                 # to ensure we only use direct objects accessed for lineage
                 try:
-                    parser = BigQuerySQLParser(e.query)
+                    parser = BigQuerySQLParser(
+                        e.query, self.config.sql_parser_use_external_process
+                    )
                     referenced_objs = set(
                         map(lambda x: x.split(".")[-1], parser.get_tables())
                     )
@@ -468,7 +470,9 @@ def parse_view_lineage(
         parsed_tables = set()
         if view.ddl:
             try:
-                parser = BigQuerySQLParser(view.ddl)
+                parser = BigQuerySQLParser(
+                    view.ddl, self.config.sql_parser_use_external_process
+                )
                 tables = parser.get_tables()
             except Exception as ex:
                 logger.debug(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler2.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler2.py
deleted file mode 100644
index 5d01aad1779322..00000000000000
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler2.py
+++ /dev/null
@@ -1,272 +0,0 @@
-import dataclasses
-import datetime
-import logging
-from typing import Dict, Iterable, List, Optional, Tuple, cast
-
-from dateutil.relativedelta import relativedelta
-from sqlalchemy import create_engine, inspect
-from sqlalchemy.engine.reflection import Inspector
-
-from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
-from datahub.emitter.mcp_builder import wrap_aspect_as_workunit
-from datahub.ingestion.api.common import WorkUnit
-from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
-from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config
-from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report
-from datahub.ingestion.source.bigquery_v2.bigquery_schema import (
-    BigqueryColumn,
-    BigqueryTable,
-)
-from datahub.ingestion.source.ge_data_profiler import (
-    DatahubGEProfiler,
-    GEProfilerRequest,
-)
-from datahub.ingestion.source.sql.sql_generic_profiler import (
-    GenericProfiler,
-    TableProfilerRequest,
-)
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
-from datahub.metadata.schema_classes import DatasetProfileClass
-
-logger = logging.getLogger(__name__)
-
-
-@dataclasses.dataclass
-class BigqueryProfilerRequest(GEProfilerRequest):
-    table: BigqueryTable
-    profile_table_level_only: bool = False
-
-
-class BigqueryProfiler(GenericProfiler):
-    def __init__(self, config: BigQueryV2Config, report: BigQueryV2Report) -> None:
-        super().__init__(config, report, "bigquery")
-        self.config = config
-        self.report = report
-
-    @staticmethod
-    def get_partition_range_from_partition_id(
-        partition_id: str, partition_datetime: Optional[datetime.datetime]
-    ) -> Tuple[datetime.datetime, datetime.datetime]:
-        partition_range_map: Dict[int, Tuple[relativedelta, str]] = {
-            4: (relativedelta(years=1), "%Y"),
-            6: (relativedelta(months=1), "%Y%m"),
-            8: (relativedelta(days=1), "%Y%m%d"),
-            10: (relativedelta(hours=1), "%Y%m%d%H"),
-        }
-
-        duration: relativedelta
-        if partition_range_map.get(len(partition_id)):
-            (delta, format) = partition_range_map[len(partition_id)]
-            duration = delta
-            if not partition_datetime:
-                partition_datetime = datetime.datetime.strptime(partition_id, format)
-        else:
-            raise ValueError(
-                f"check your partition_id {partition_id}. It must be yearly/monthly/daily/hourly."
-            )
-        upper_bound_partition_datetime = partition_datetime + duration
-        return partition_datetime, upper_bound_partition_datetime
-
-    def generate_partition_profiler_query(
-        self,
-        project: str,
-        schema: str,
-        table: BigqueryTable,
-        partition_datetime: Optional[datetime.datetime],
-    ) -> Tuple[Optional[str], Optional[str]]:
-        """
-        Method returns partition id if table is partitioned or sharded and generate custom partition query for
-        partitioned table.
-        See more about partitioned tables at https://cloud.google.com/bigquery/docs/partitioned-tables
-        """
-        logger.debug(
-            f"generate partition profiler query for project: {project} schema: {schema} and table {table.name}, partition_datetime: {partition_datetime}"
-        )
-        partition = table.max_partition_id
-        if partition:
-            partition_where_clause: str
-
-            if not table.time_partitioning:
-                partition_column: Optional[BigqueryColumn] = None
-                for column in table.columns:
-                    if column.is_partition_column:
-                        partition_column = column
-                        break
-                if partition_column:
-                    partition_where_clause = f"{partition_column.name} >= {partition}"
-                else:
-                    logger.warning(
-                        f"Partitioned table {table.name} without partiton column"
-                    )
-                    return None, None
-            else:
-                logger.debug(
-                    f"{table.name} is partitioned and partition column is {partition}"
-                )
-                try:
-                    (
-                        partition_datetime,
-                        upper_bound_partition_datetime,
-                    ) = self.get_partition_range_from_partition_id(
-                        partition, partition_datetime
-                    )
-                except ValueError as e:
-                    logger.error(
-                        f"Unable to get partition range for partition id: {partition} it failed with exception {e}"
-                    )
-                    self.report.invalid_partition_ids[
-                        f"{schema}.{table.name}"
-                    ] = partition
-                    return None, None
-
-                partition_column_type: str = "DATE"
-                for c in table.columns:
-                    if c.is_partition_column:
-                        partition_column_type = c.data_type
-
-                if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"):
-                    partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN {partition_column_type}('{partition_datetime}') AND {partition_column_type}('{upper_bound_partition_datetime}')"
-                elif table.time_partitioning.type_ in ("HOUR"):
-                    partition_where_clause = f"`{table.time_partitioning.field}` BETWEEN '{partition_datetime}' AND '{upper_bound_partition_datetime}'"
-                else:
-                    logger.warning(
-                        f"Not supported partition type {table.time_partitioning.type_}"
-                    )
-                    return None, None
-            custom_sql = """
-SELECT
-    *
-FROM
-    `{table_catalog}.{table_schema}.{table_name}`
-WHERE
-    {partition_where_clause}
-            """.format(
-                table_catalog=project,
-                table_schema=schema,
-                table_name=table.name,
-                partition_where_clause=partition_where_clause,
-            )
-
-            return (partition, custom_sql)
-        if table.max_shard_id:
-            # For sharded table we want to get the partition id but not needed to generate custom query
-            return table.max_shard_id, None
-
-        return None, None
-
-    def get_workunits(
-        self, tables: Dict[str, Dict[str, List[BigqueryTable]]]
-    ) -> Iterable[WorkUnit]:
-
-        # Otherwise, if column level profiling is enabled, use GE profiler.
-        for project in tables.keys():
-            if not self.config.project_id_pattern.allowed(project):
-                continue
-            profile_requests = []
-
-            for dataset in tables[project]:
-                if not self.config.schema_pattern.allowed(dataset):
-                    continue
-
-                for table in tables[project][dataset]:
-                    # Emit the profile work unit
-                    profile_request = self.get_bigquery_profile_request(
-                        project=project, dataset=dataset, table=table
-                    )
-                    if profile_request is not None:
-                        profile_requests.append(profile_request)
-
-            if len(profile_requests) == 0:
-                continue
-            profile_requests = cast(List[TableProfilerRequest], profile_requests)
-            for request, profile in self.generate_profiles(
-                profile_requests,
-                self.config.profiling.max_workers,
-                platform=self.platform,
-                profiler_args=self.get_profile_args(),
-            ):
-                if request is None or profile is None:
-                    continue
-
-                request = cast(BigqueryProfilerRequest, request)
-                profile.sizeInBytes = request.table.size_in_bytes
-                # If table is partitioned we profile only one partition (if nothing set then the last one)
-                # but for table level we can use the rows_count from the table metadata
-                # This way even though column statistics only reflects one partition data but the rows count
-                # shows the proper count.
-                if profile.partitionSpec and profile.partitionSpec.partition:
-                    profile.rowCount = request.table.rows_count
-
-                dataset_name = request.pretty_name
-                dataset_urn = make_dataset_urn_with_platform_instance(
-                    self.platform,
-                    dataset_name,
-                    self.config.platform_instance,
-                    self.config.env,
-                )
-                wu = wrap_aspect_as_workunit(
-                    "dataset",
-                    dataset_urn,
-                    "datasetProfile",
-                    profile,
-                )
-                self.report.report_workunit(wu)
-                yield wu
-
-    def get_bigquery_profile_request(
-        self, project: str, dataset: str, table: BigqueryTable
-    ) -> Optional[BigqueryProfilerRequest]:
-        skip_profiling = False
-        profile_table_level_only = self.config.profiling.profile_table_level_only
-        dataset_name = BigqueryTableIdentifier(
-            project_id=project, dataset=dataset, table=table.name
-        ).get_table_name()
-        if not self.is_dataset_eligible_for_profiling(
-            dataset_name, table.last_altered, table.size_in_bytes, table.rows_count
-        ):
-            profile_table_level_only = True
-            self.report.num_tables_not_eligible_profiling[dataset] = (
-                self.report.num_tables_not_eligible_profiling.get(dataset, 0) + 1
-            )
-
-        if not table.columns:
-            skip_profiling = True
-
-        if skip_profiling:
-            if self.config.profiling.report_dropped_profiles:
-                self.report.report_dropped(f"profile of {dataset_name}")
-            return None
-        (partition, custom_sql) = self.generate_partition_profiler_query(
-            project, dataset, table, self.config.profiling.partition_datetime
-        )
-
-        if partition is None and table.time_partitioning:
-            self.report.report_warning(
-                "profile skipped as partitioned table is empty or partition id was invalid",
-                dataset_name,
-            )
-            return None
-
-        if (
-            partition is not None
-            and not self.config.profiling.partition_profiling_enabled
-        ):
-            logger.debug(
-                f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
-            )
-            return None
-
-        self.report.report_entity_profiled(dataset_name)
-        logger.debug(f"Preparing profiling request for {dataset_name}")
-        profile_request = BigqueryProfilerRequest(
-            pretty_name=dataset_name,
-            batch_kwargs=dict(
-                schema=project,
-                table=f"{dataset}.{table.name}",
-                custom_sql=custom_sql,
-                partition=partition,
-            ),
-            table=table,
-            profile_table_level_only=profile_table_level_only,
-        )
-        return profile_request
diff --git a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
index f84fe6cd7cb964..ca23a60fab8ae5 100644
--- a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
+++ b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py
@@ -9,11 +9,11 @@ class BigQuerySQLParser(SQLParser):
     parser: SQLParser
 
-    def __init__(self, sql_query: str) -> None:
+    def __init__(self, sql_query: str, use_external_process: bool = False) -> None:
         super().__init__(sql_query)
 
         self._parsed_sql_query = self.parse_sql_query(sql_query)
-        self.parser = SqlLineageSQLParser(self._parsed_sql_query)
+        self.parser = SqlLineageSQLParser(self._parsed_sql_query, use_external_process)
 
     def parse_sql_query(self, sql_query: str) -> str:
         sql_query = BigQuerySQLParser._parse_bigquery_comment_sign(sql_query)
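
Note (not part of the patch): below is a minimal sketch of the idea behind the new sql_parser_use_external_process option that this change adds to LineageConfig with default=False. Running the parser in a short-lived child process means any memory it leaks is reclaimed by the operating system when that process exits, so the long-running ingestion process stays flat; the cost is extra per-parse overhead, which is why the flag is off by default. The helper names below (run_parser, extract_tables) are hypothetical stand-ins rather than DataHub APIs; in the patch itself the flag is simply threaded from the config into BigQuerySQLParser and SqlLineageSQLParser as shown in the hunks above.

# Illustrative sketch only -- hypothetical helper names, not DataHub code.
import multiprocessing
from typing import List


def extract_tables(sql_query: str) -> List[str]:
    # Stand-in for a real SQL parser call that may leak memory on each invocation.
    return sorted({tok for tok in sql_query.replace(",", " ").split() if "." in tok})


def run_parser(sql_query: str, use_external_process: bool = False) -> List[str]:
    if not use_external_process:
        # Default path: parse in-process, no extra process overhead.
        return extract_tables(sql_query)
    # Opt-in path: parse in a short-lived child process so any leaked memory
    # is released back to the OS when the child exits.
    with multiprocessing.Pool(processes=1) as pool:
        return pool.apply(extract_tables, (sql_query,))


if __name__ == "__main__":
    print(run_parser("SELECT a.x FROM proj.ds.t AS a", use_external_process=True))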