-
Notifications
You must be signed in to change notification settings - Fork 3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(ingest): bigquery - Fixing querying non-date partition columns in profiling #6554
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -115,11 +115,17 @@ def generate_partition_profiler_query( | |
] = partition | ||
return None, None | ||
|
||
partition_column_type: str = "DATE" | ||
for c in table.columns: | ||
if c.is_partition_column: | ||
partition_column_type = c.data_type | ||
|
||
if table.time_partitioning.type_ in ("DAY", "MONTH", "YEAR"): | ||
partition_where_clause = "{column_name} BETWEEN DATE('{partition_id}') AND DATE('{upper_bound_partition_id}')".format( | ||
partition_where_clause = "{column_name} BETWEEN {partition_column_type}('{partition_id}') AND {partition_column_type}('{upper_bound_partition_id}')".format( | ||
column_name=table.time_partitioning.field, | ||
partition_id=partition_datetime, | ||
upper_bound_partition_id=upper_bound_partition_datetime, | ||
partition_column_type=partition_column_type, | ||
) | ||
elif table.time_partitioning.type_ in ("HOUR"): | ||
partition_where_clause = "{column_name} BETWEEN '{partition_id}' AND '{upper_bound_partition_id}'".format( | ||
|
@@ -216,14 +222,10 @@ def get_bigquery_profile_request( | |
if not self.is_dataset_eligible_for_profiling( | ||
dataset_name, table.last_altered, table.size_in_bytes, table.rows_count | ||
): | ||
# Profile only table level if dataset is filtered from profiling | ||
# due to size limits alone | ||
if self.is_dataset_eligible_for_profiling( | ||
dataset_name, table.last_altered, 0, 0 | ||
): | ||
profile_table_level_only = True | ||
else: | ||
skip_profiling = True | ||
profile_table_level_only = True | ||
self.report.num_tables_not_eligible_profiling[dataset] = ( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. suggestion: make [inline code lost in extraction] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We try to use [inline code lost in extraction] in report structs that can't grow indefinitely, and I think this way won't work there :( |
||
self.report.num_tables_not_eligible_profiling.get(dataset, 0) + 1 | ||
) | ||
|
||
if not table.columns: | ||
skip_profiling = True | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,7 +66,7 @@ class OperationalDataMeta: | |
def bigquery_audit_metadata_query_template( | ||
dataset: str, | ||
use_date_sharded_tables: bool, | ||
table_allow_filter: str = None, | ||
table_allow_filter: Optional[str] = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Was this change necessary? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This field should be Optional, and mypy (the linter) was complaining about it. |
||
limit: Optional[int] = None, | ||
) -> str: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: you could use f-strings here and avoid `.format`.