Skip to content

Commit

Permalink
fix: Update the key format of set 'grouped_tables' (#421)
Browse files Browse the repository at this point in the history
Signed-off-by: xuans <[email protected]>
  • Loading branch information
xuan616 authored Dec 7, 2020
1 parent 0dcf07e commit 4c9e5f7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
1 change: 1 addition & 0 deletions databuilder/extractor/base_bigquery_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class BaseBigQueryExtractor(Extractor):
DEFAULT_PAGE_SIZE = 300
NUM_RETRIES = 3
DATE_LENGTH = 8
+ SHARDED_TABLE_KEY_FORMAT = '{dataset_id}/{table_id}'

def init(self, conf: ConfigTree) -> None:
# should use key_path, or cred_key if the former doesn't exist
Expand Down
9 changes: 6 additions & 3 deletions databuilder/extractor/bigquery_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,16 @@ def _retrieve_tables(self, dataset: DatasetRef) -> Any:
# If the last eight characters are digits, we assume the table is of a table date range type
# and then we only need one schema definition
table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
- if table_prefix in self.grouped_tables:
+ table_id = table_prefix
+ sharded_table_key = BigQueryMetadataExtractor.SHARDED_TABLE_KEY_FORMAT.format(
+ dataset_id=tableRef['datasetId'],
+ table_id=table_id)
+ if sharded_table_key in self.grouped_tables:
# If one table in the date range is processed, then ignore other ones
# (it adds too much metadata)
continue

- table_id = table_prefix
- self.grouped_tables.add(table_prefix)
+ self.grouped_tables.add(sharded_table_key)

table = self.bigquery_service.tables().get(
projectId=tableRef['projectId'],
Expand Down

0 comments on commit 4c9e5f7

Please sign in to comment.