datahub-project · treff7es · Nov 28, 2022 · Nov 28, 2022 · hsheth2 · Nov 28, 2022
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -492,12 +492,19 @@ def get_workunits(self) -> Iterable[WorkUnit]:
         conn: bigquery.Client = self.get_bigquery_client()
         self.add_config_to_report()
 
-        projects: List[BigqueryProject] = BigQueryDataDictionary.get_projects(conn)
-        if len(projects) == 0:
-            logger.warning(
-                "Get projects didn't return any project. Maybe resourcemanager.projects.get permission is missing for the service account. You can assign predefined roles/bigquery.metadataViewer role to your service account."
+        projects: List[BigqueryProject]
+        if self.config.project_id:
+            project = BigqueryProject(
+                id=self.config.project_id, name=self.config.project_id
             )
-            return
+            projects = [project]
+        else:
+            projects = BigQueryDataDictionary.get_projects(conn)
+            if len(projects) == 0:
+                logger.warning(
+                    "Get projects didn't return any project. Maybe resourcemanager.projects.get permission is missing for the service account. You can assign predefined roles/bigquery.metadataViewer role to your service account."
+                )
+                return
 
         for project_id in projects:
             if not self.config.project_id_pattern.allowed(project_id.id):

diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -64,7 +64,7 @@ class BigQueryV2Config(BigQueryConfig, LineageConfig):
     # The inheritance hierarchy is wonky here, but these options need modifications.
     project_id: Optional[str] = Field(
         default=None,
-        description="[deprecated] Use project_id_pattern instead.",
+        description="[deprecated] Use project_id_pattern instead. You can use this property if you only want to ingest one project and don't want to give project resourcemanager.projects.list to your service account",
     )
     storage_project_id: None = Field(default=None, hidden_from_schema=True)
 
@@ -97,14 +97,13 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
 
         if project_id_pattern == AllowDenyPattern.allow_all() and project_id:
             logging.warning(
-                "project_id_pattern is not set but project_id is set, setting project_id as project_id_pattern. project_id will be deprecated, please use project_id_pattern instead."
+                "project_id_pattern is not set but project_id is set, source will only ingest the project_id project. project_id will be deprecated, please use project_id_pattern instead."
             )
             values["project_id_pattern"] = AllowDenyPattern(allow=[f"^{project_id}$"])
         elif project_id_pattern != AllowDenyPattern.allow_all() and project_id:
             logging.warning(
-                "project_id will be ignored in favour of project_id_pattern. project_id will be deprecated, please use project_id only."
+                "use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
             )
-            values.pop("project_id")
 
         dataset_pattern = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")

diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py
@@ -186,6 +186,13 @@ def get_workunits(
 
                 request = cast(BigqueryProfilerRequest, request)
                 profile.sizeInBytes = request.table.size_in_bytes
+                # If the table is partitioned, we profile only one partition (the last one if none is set),
+                # but at the table level we can use the rows_count from the table metadata.
+                # This way, even though the column statistics reflect only one partition's data,
+                # the row count still shows the proper table-level count.
+                if profile.partitionSpec and profile.partitionSpec.partition:
+                    profile.rowCount = request.table.rows_count
+
                 dataset_name = request.pretty_name
                 dataset_urn = make_dataset_urn_with_platform_instance(
                     self.platform,
@@ -250,7 +257,10 @@ def get_bigquery_profile_request(
         profile_request = BigqueryProfilerRequest(
             pretty_name=dataset_name,
             batch_kwargs=dict(
-                schema=project, table=f"{dataset}.{table.name}", custom_sql=custom_sql
+                schema=project,
+                table=f"{dataset}.{table.name}",
+                custom_sql=custom_sql,
+                partition=partition,
             ),
             table=table,
             profile_table_level_only=profile_table_level_only,