Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest): bigquery - Setting partition id for profiling data and project_id fix #6558

Merged
merged 1 commit into from
Nov 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -492,12 +492,19 @@ def get_workunits(self) -> Iterable[WorkUnit]:
conn: bigquery.Client = self.get_bigquery_client()
self.add_config_to_report()

projects: List[BigqueryProject] = BigQueryDataDictionary.get_projects(conn)
if len(projects) == 0:
logger.warning(
"Get projects didn't return any project. Maybe resourcemanager.projects.get permission is missing for the service account. You can assign predefined roles/bigquery.metadataViewer role to your service account."
projects: List[BigqueryProject]
if self.config.project_id:
project = BigqueryProject(
id=self.config.project_id, name=self.config.project_id
)
return
projects = [project]
else:
projects = BigQueryDataDictionary.get_projects(conn)
if len(projects) == 0:
logger.warning(
"Get projects didn't return any project. Maybe resourcemanager.projects.get permission is missing for the service account. You can assign predefined roles/bigquery.metadataViewer role to your service account."
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1- this is much more useful!

)
return

for project_id in projects:
if not self.config.project_id_pattern.allowed(project_id.id):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class BigQueryV2Config(BigQueryConfig, LineageConfig):
# The inheritance hierarchy is wonky here, but these options need modifications.
project_id: Optional[str] = Field(
default=None,
description="[deprecated] Use project_id_pattern instead.",
description="[deprecated] Use project_id_pattern instead. You can use this property if you only want to ingest one project and don't want to give project resourcemanager.projects.list to your service account",
)
storage_project_id: None = Field(default=None, hidden_from_schema=True)

Expand Down Expand Up @@ -97,14 +97,13 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:

if project_id_pattern == AllowDenyPattern.allow_all() and project_id:
logging.warning(
"project_id_pattern is not set but project_id is set, setting project_id as project_id_pattern. project_id will be deprecated, please use project_id_pattern instead."
"project_id_pattern is not set but project_id is set, source will only ingest the project_id project. project_id will be deprecated, please use project_id_pattern instead."
)
values["project_id_pattern"] = AllowDenyPattern(allow=[f"^{project_id}$"])
elif project_id_pattern != AllowDenyPattern.allow_all() and project_id:
logging.warning(
"project_id will be ignored in favour of project_id_pattern. project_id will be deprecated, please use project_id only."
"use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
)
values.pop("project_id")

dataset_pattern = values.get("dataset_pattern")
schema_pattern = values.get("schema_pattern")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ def get_workunits(

request = cast(BigqueryProfilerRequest, request)
profile.sizeInBytes = request.table.size_in_bytes
# If the table is partitioned, we profile only one partition (the last one if nothing is set),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you! very good comments..

# but at the table level we can use the rows_count from the table metadata.
# This way, even though the column statistics reflect only one partition's data, the row count
# shows the proper count.
if profile.partitionSpec and profile.partitionSpec.partition:
profile.rowCount = request.table.rows_count

dataset_name = request.pretty_name
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
Expand Down Expand Up @@ -250,7 +257,10 @@ def get_bigquery_profile_request(
profile_request = BigqueryProfilerRequest(
pretty_name=dataset_name,
batch_kwargs=dict(
schema=project, table=f"{dataset}.{table.name}", custom_sql=custom_sql
schema=project,
table=f"{dataset}.{table.name}",
custom_sql=custom_sql,
partition=partition,
),
table=table,
profile_table_level_only=profile_table_level_only,
Expand Down