From 3cfaf4871c2c20b5d6acb7699c1818555fe683a0 Mon Sep 17 00:00:00 2001 From: Anna Scholtz Date: Wed, 8 Nov 2023 16:30:11 -0800 Subject: [PATCH] Make base tables configurable in glean_usage generator (#4534) * Make base tables configurable in glean_usage generator * Fix event extras unnesting in event monitoring --- sql_generators/glean_usage/__init__.py | 32 ++++++++++--------- sql_generators/glean_usage/common.py | 9 +++--- .../glean_usage/event_monitoring_live.py | 7 ++-- .../event_monitoring_aggregates_v1.query.sql | 3 +- .../event_monitoring_live_v1.init.sql | 3 +- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/sql_generators/glean_usage/__init__.py b/sql_generators/glean_usage/__init__.py index 6d7cd109fd3..1229ee4d9fa 100644 --- a/sql_generators/glean_usage/__init__.py +++ b/sql_generators/glean_usage/__init__.py @@ -21,7 +21,7 @@ metrics_clients_daily, metrics_clients_last_seen, ) -from sql_generators.glean_usage.common import get_app_info, list_baseline_tables +from sql_generators.glean_usage.common import get_app_info, list_tables # list of methods for generating queries GLEAN_TABLES = [ @@ -97,19 +97,21 @@ def generate( elif exclude: table_filter = partial(table_matches_patterns, exclude, True) - baseline_tables = list_baseline_tables( - project_id=target_project, - only_tables=[only] if only else None, - table_filter=table_filter, - ) - - # filter out skipped apps - baseline_tables = [ - baseline_table - for baseline_table in baseline_tables - if baseline_table.split(".")[1] - not in [f"{skipped_app}_stable" for skipped_app in SKIP_APPS] - ] + def get_tables(table_name="baseline_v1"): + baseline_tables = list_tables( + project_id=target_project, + only_tables=[only] if only else None, + table_filter=table_filter, + table_name=table_name, + ) + + # filter out skipped apps + return [ + baseline_table + for baseline_table in baseline_tables + if baseline_table.split(".")[1] + not in [f"{skipped_app}_stable" for skipped_app in SKIP_APPS] + ] output_dir = Path(output_dir) / target_project @@ -134,8 +136,8 @@ def generate( ), baseline_table, ) - for baseline_table in baseline_tables for table in GLEAN_TABLES + for baseline_table in get_tables(table_name=table.base_table_name) ] # Parameters to generate per-app datasets consist of the function to be called diff --git a/sql_generators/glean_usage/common.py b/sql_generators/glean_usage/common.py index d22c0853358..f45ac0431cb 100644 --- a/sql_generators/glean_usage/common.py +++ b/sql_generators/glean_usage/common.py @@ -51,13 +51,13 @@ def write_dataset_metadata(output_dir, full_table_id, derived_dataset_metadata=F target.write_text(rendered) -def list_baseline_tables(project_id, only_tables, table_filter): +def list_tables(project_id, only_tables, table_filter, table_name="baseline_v1"): """Return names of all matching baseline tables in shared-prod.""" prod_baseline_tables = [ s.stable_table for s in get_stable_table_schemas() if s.schema_id == "moz://mozilla.org/schemas/glean/ping/1" - and s.bq_table == "baseline_v1" + and s.bq_table == table_name ] prod_datasets_with_baseline = [t.split(".")[0] for t in prod_baseline_tables] stable_datasets = prod_datasets_with_baseline @@ -78,9 +78,9 @@ def list_baseline_tables(project_id, only_tables, table_filter): if d.endswith("_stable") and d in prod_datasets_with_baseline } return [ - f"{project_id}.{d}.baseline_v1" + f"{project_id}.{d}.{table_name}" for d in stable_datasets - if table_filter(f"{d}.baseline_v1") + if table_filter(f"{d}.{table_name}") ] @@ -163,6 +163,7 @@ def __init__(self): self.per_app_enabled = True self.across_apps_enabled = True self.cross_channel_template = "cross_channel.view.sql" + self.base_table_name = "baseline_v1" def skip_existing(self, output_dir="sql/", project_id="moz-fx-data-shared-prod"): """Existing files configured not to be overridden during generation.""" diff --git a/sql_generators/glean_usage/event_monitoring_live.py b/sql_generators/glean_usage/event_monitoring_live.py index 69e4bbb8441..227fb359369 100644 --- a/sql_generators/glean_usage/event_monitoring_live.py +++ b/sql_generators/glean_usage/event_monitoring_live.py @@ -33,6 +33,7 @@ def __init__(self) -> None: self.prefix = PREFIX self.target_table_id = TARGET_TABLE_ID self.custom_render_kwargs = {} + self.base_table_name = "events_v1" def generate_per_app_id( self, project_id, baseline_table, output_dir=None, use_cloud_function=True @@ -106,11 +107,11 @@ def generate_across_apps( if not self.across_apps_enabled: return - prod_datasets_with_baseline = [ + prod_datasets_with_event = [ s.bq_dataset_family for s in get_stable_table_schemas() if s.schema_id == "moz://mozilla.org/schemas/glean/ping/1" - and s.bq_table == "baseline_v1" + and s.bq_table == "events_v1" ] aggregate_table = "event_monitoring_aggregates_v1" @@ -124,7 +125,7 @@ def generate_across_apps( table=target_view_name, target_table=f"{TARGET_DATASET_CROSS_APP}_derived.{aggregate_table}", apps=apps, - prod_datasets=prod_datasets_with_baseline, + prod_datasets=prod_datasets_with_event, ) render_kwargs.update(self.custom_render_kwargs) diff --git a/sql_generators/glean_usage/templates/event_monitoring_aggregates_v1.query.sql b/sql_generators/glean_usage/templates/event_monitoring_aggregates_v1.query.sql index b845034f695..a06ae45c755 100644 --- a/sql_generators/glean_usage/templates/event_monitoring_aggregates_v1.query.sql +++ b/sql_generators/glean_usage/templates/event_monitoring_aggregates_v1.query.sql @@ -68,10 +68,11 @@ `{{ project_id }}.{{ dataset['bq_dataset_family'] }}_stable.events_v1` CROSS JOIN UNNEST(events) AS event, - UNNEST(event.extra) AS event_extra, -- Iterator for accessing experiments. -- Add one more for aggregating events across all experiments UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index + LEFT JOIN + UNNEST(event.extra) AS event_extra WHERE DATE(submission_timestamp) = @submission_date GROUP BY diff --git a/sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql b/sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql index 9850467966c..141c3fa893d 100644 --- a/sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql +++ b/sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql @@ -88,10 +88,11 @@ IF `{{ project_id }}.{{ dataset }}_live.events_v1` CROSS JOIN UNNEST(events) AS event, - UNNEST(event.extra) AS event_extra, -- Iterator for accessing experiments. -- Add one more for aggregating events across all experiments UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index + LEFT JOIN + UNNEST(event.extra) AS event_extra {% elif dataset_id in ["accounts_frontend", "accounts_backend"] %} -- FxA uses custom pings to send events without a category and extras. SELECT