-
Notifications
You must be signed in to change notification settings - Fork 104
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This table aggregates session-level data from GA.
- Loading branch information
Showing
12 changed files
with
727 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1298,3 +1298,19 @@ bqetl_ads: | |
tags: | ||
- impact/tier_1 | ||
- repo/bigquery-etl | ||
|
||
bqetl_mozilla_org_derived: | ||
schedule_interval: 0 2 * * * | ||
default_args: | ||
depends_on_past: false | ||
email: | ||
- [email protected] | ||
- [email protected] | ||
email_on_failure: true | ||
email_on_retry: true | ||
owner: [email protected] | ||
retries: 2 | ||
retry_delay: 30m | ||
start_date: "2023-11-13" | ||
tags: | ||
- impact/tier_1 |
14 changes: 14 additions & 0 deletions
14
sql/moz-fx-data-shared-prod/mozilla_org/dataset_metadata.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
friendly_name: Mozilla.org | ||
description: |- | ||
Mozilla.org data, usually derived from GA. | ||
dataset_base_acl: view | ||
user_facing: true | ||
labels: {} | ||
default_table_workgroup_access: | ||
- role: roles/bigquery.dataViewer | ||
members: | ||
- workgroup:mozilla-confidential | ||
workgroup_access: | ||
- role: roles/bigquery.dataViewer | ||
members: | ||
- workgroup:mozilla-confidential |
13 changes: 13 additions & 0 deletions
13
sql/moz-fx-data-shared-prod/mozilla_org/ga_sessions/view.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
-- View over the derived GA sessions table (ga_sessions_v1).
-- Every column is passed through unchanged except the five attribution
-- strings below, which are run through mozdata.analysis.ga_nullify_string.
-- NOTE(review): presumably that UDF turns GA placeholder strings into NULL;
-- confirm against the UDF definition.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.mozilla_org.ga_sessions`
AS
SELECT
  * REPLACE (
    -- The order of REPLACE entries does not affect output column order.
    mozdata.analysis.ga_nullify_string(source) AS source,
    mozdata.analysis.ga_nullify_string(medium) AS medium,
    mozdata.analysis.ga_nullify_string(campaign) AS campaign,
    mozdata.analysis.ga_nullify_string(content) AS content,
    mozdata.analysis.ga_nullify_string(term) AS term
  )
FROM
  `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v1`
14 changes: 14 additions & 0 deletions
14
sql/moz-fx-data-shared-prod/mozilla_org_derived/dataset_metadata.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
friendly_name: Mozilla.org Derived | ||
description: |- | ||
Mozilla.org data, usually derived from Google Analytics | ||
dataset_base_acl: derived | ||
user_facing: false | ||
labels: {} | ||
default_table_workgroup_access: | ||
- role: roles/bigquery.dataViewer | ||
members: | ||
- workgroup:mozilla-confidential | ||
workgroup_access: | ||
- role: roles/bigquery.dataViewer | ||
members: | ||
- workgroup:mozilla-confidential |
7 changes: 7 additions & 0 deletions
7
sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_sessions_v1/checks.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#fail | ||
-- ga_session_id should be unique across all partitions | ||
{{ is_unique(["ga_session_id"]) }} | ||
|
||
#fail | ||
{{ min_row_count(10000) }} | ||
|
20 changes: 20 additions & 0 deletions
20
sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_sessions_v1/metadata.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
friendly_name: Ga Sessions | ||
description: |- | ||
One row for each GA session. | ||
owners: | ||
- [email protected] | ||
labels: | ||
incremental: true | ||
owner1: [email protected] | ||
scheduling: | ||
dag_name: bqetl_mozilla_org_derived | ||
bigquery: | ||
time_partitioning: | ||
type: day | ||
field: 'session_date' | ||
require_partition_filter: true | ||
expiration_days: null | ||
clustering: | ||
fields: ["country"] | ||
references: {} | ||
deprecated: false |
154 changes: 154 additions & 0 deletions
154
sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_sessions_v1/query.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
-- First note: This table is meant to be forwards-compatible | ||
-- with the GA4 schema: https://support.google.com/analytics/answer/7029846 | ||
-- But that's harder, since some of the data is contained within events there (e.g. session_start is an event_param, with the value as the session_id) | ||
-- See https://www.ga4bigquery.com/sessions-dimensions-metrics-ga4/ | ||
-- Second note: We do not store user_ids, only client_ids | ||
-- After migration client_ids will be called pseudo_user_ids, see | ||
-- https://louder.com.au/2022/06/27/client-id-in-ga4-what-is-it-and-how-to-get-it-in-your-report/ | ||
-- Third note: The only non-forwards-compatible field is mobileDeviceInfo | ||
-- in GA4, that will be split into its components (model, manufacturer, etc.) | ||
-- I think we can simply handle this in the view using some UDFs | ||
-- Fourth note: Data is updated up to three days after the event happens, see | ||
-- https://support.google.com/analytics/answer/7029846?#tables | ||
-- Maps an install-target label (used below on the eventLabel of
-- 'Firefox Download' events) to a normalized target name such as
-- "desktop_release" or "android_beta".
--
-- Args:
--   target: the raw label; matched by prefix.
-- Returns:
--   The normalized target name, or NULL when the label is NULL or matches
--   none of the known prefixes.
--
-- See https://sql.telemetry.mozilla.org/queries/95883/source
CREATE TEMP FUNCTION normalize_install_target(target STRING)
RETURNS STRING AS (
  CASE
    WHEN target LIKE "Firefox for Desktop%"
      THEN "desktop_release"
    WHEN target LIKE "Firefox ESR%"
      THEN "desktop_esr"
    WHEN target LIKE "Firefox Developer Edition%"
      THEN "desktop_developer_edition"
    -- Must be tested before the broader "Firefox Beta%" prefix: CASE returns
    -- the first matching branch, and "Firefox Beta%" also matches every
    -- "Firefox Beta Android..." label, which would otherwise be reported as
    -- desktop_beta and leave android_beta unreachable.
    WHEN target LIKE "Firefox Beta Android%"
      THEN "android_beta"
    WHEN target LIKE "Firefox Beta%"
      THEN "desktop_beta"
    WHEN target LIKE "Firefox Nightly Edition%"
      THEN "desktop_nightly"
    WHEN target LIKE "Firefox for Android%"
      THEN "android_release"
    WHEN target LIKE "Firefox for iOS%"
      THEN "ios_release"
    ELSE NULL
  END
);
|
||
WITH daily_sessions AS (
  -- Aggregates the raw GA export rows into one row per (client, session) for
  -- sessions whose earliest date is @session_date. A session can appear in
  -- more than one daily export table, so per-session attributes take the value
  -- from the row with the smallest visitStartTime.
  SELECT
    mozdata.analysis.ga_nullify_string(clientId) AS ga_client_id,
    -- visitId (or sessionId in GA4) is guaranteed unique only among one client, look at visitId here https://support.google.com/analytics/answer/3437719?hl=en
    CONCAT(mozdata.analysis.ga_nullify_string(clientId), CAST(visitId AS STRING)) AS ga_session_id,
    MIN(PARSE_DATE('%Y%m%d', date)) AS session_date,
    MIN(visitNumber) = 1 AS is_first_session,
    MIN(visitNumber) AS session_number,
    -- Hits from all rows of the session are merged; they are unpacked by the
    -- outer query and dropped from the final output.
    ARRAY_CONCAT_AGG(hits) AS hits,
    SUM(totals.timeOnSite) AS time_on_site,
    SUM(totals.pageviews) AS pageviews,
    /* Geos */
    MIN_BY(geoNetwork.country, visitStartTime) AS country,
    MIN_BY(geoNetwork.region, visitStartTime) AS region,
    MIN_BY(geoNetwork.city, visitStartTime) AS city,
    /* Attribution */
    MIN_BY(
      CAST(trafficSource.adwordsClickInfo.campaignId AS STRING),
      visitStartTime
    ) AS campaign_id,
    MIN_BY(trafficSource.campaign, visitStartTime) AS campaign,
    MIN_BY(trafficSource.source, visitStartTime) AS source,
    MIN_BY(trafficSource.medium, visitStartTime) AS medium,
    MIN_BY(trafficSource.keyword, visitStartTime) AS term,
    MIN_BY(trafficSource.adContent, visitStartTime) AS content,
    -- Earliest non-null gclid. The ORDER BY makes the pick deterministic when
    -- a session has several input rows; without it ARRAY_AGG element order is
    -- unspecified and re-runs could select a different value.
    ARRAY_AGG(
      mozdata.analysis.ga_nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS
      ORDER BY
        visitStartTime
    )[0] AS gclid,
    /* Device */
    MIN_BY(device.deviceCategory, visitStartTime) AS device_category,
    MIN_BY(device.mobileDeviceModel, visitStartTime) AS mobile_device_model,
    MIN_BY(device.mobileDeviceInfo, visitStartTime) AS mobile_device_string,
    MIN_BY(device.operatingSystem, visitStartTime) AS os,
    MIN_BY(device.operatingSystemVersion, visitStartTime) AS os_version,
    MIN_BY(device.language, visitStartTime) AS language,
    MIN_BY(device.browser, visitStartTime) AS browser,
    MIN_BY(device.browserVersion, visitStartTime) AS browser_version,
  FROM
    `moz-fx-data-marketing-prod.65789850.ga_sessions_*`
  WHERE
    -- This table is partitioned, so we only process the data from session_date
    -- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
    -- as separate Airflow tasks (or via bqetl backfill, I haven't decided yet)
    --
    -- Here, we need to take data from yesterday, just in case some of our sessions from today
    -- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
    _TABLE_SUFFIX
    BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
    -- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
    -- will be present in two days, with the same ids. A session should never span more
    -- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
    -- If one does, our uniqueness check will alert us
    AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
  GROUP BY
    ga_client_id,
    ga_session_id
  HAVING
    -- Don't include entries from today that started yesterday
    session_date = @session_date
)
-- Final projection: every daily_sessions column except the raw hits array,
-- plus per-session fields computed by scanning that session's hits.
SELECT
  * EXCEPT (hits),
  -- TRUE when at least one hit is a 'Firefox Download' event.
  (
    SELECT
      LOGICAL_OR(hit.type = 'EVENT' AND hit.eventInfo.eventAction = 'Firefox Download')
    FROM
      UNNEST(hits) AS hit
  ) AS had_download_event,
  -- Normalized install target of the download event with the highest
  -- hitNumber; labels that do not normalize are skipped.
  (
    SELECT
      MAX_BY(normalize_install_target(hit.eventInfo.eventLabel), hit.hitNumber)
    FROM
      UNNEST(hits) AS hit
    WHERE
      hit.type = 'EVENT'
      AND hit.eventInfo.eventAction = 'Firefox Download'
      AND normalize_install_target(hit.eventInfo.eventLabel) IS NOT NULL
  ) AS last_reported_install_target,
  -- Distinct normalized install targets across all download events.
  (
    SELECT
      ARRAY_AGG(DISTINCT normalize_install_target(hit.eventInfo.eventLabel))
    FROM
      UNNEST(hits) AS hit
    WHERE
      hit.type = 'EVENT'
      AND hit.eventInfo.eventAction = 'Firefox Download'
      AND normalize_install_target(hit.eventInfo.eventLabel) IS NOT NULL
  ) AS all_reported_install_targets,
  -- Label of the 'Stub Session ID' event with the highest hitNumber, ignoring
  -- labels that ga_nullify_string maps to NULL.
  (
    SELECT
      MAX_BY(hit.eventInfo.eventLabel, hit.hitNumber)
    FROM
      UNNEST(hits) AS hit
    WHERE
      hit.type = 'EVENT'
      AND hit.eventInfo.eventAction = 'Stub Session ID'
      AND mozdata.analysis.ga_nullify_string(hit.eventInfo.eventLabel) IS NOT NULL
  ) AS last_reported_stub_session_id,
  -- Distinct labels across all 'Stub Session ID' events, same filter as above.
  (
    SELECT
      ARRAY_AGG(DISTINCT hit.eventInfo.eventLabel)
    FROM
      UNNEST(hits) AS hit
    WHERE
      hit.type = 'EVENT'
      AND hit.eventInfo.eventAction = 'Stub Session ID'
      AND mozdata.analysis.ga_nullify_string(hit.eventInfo.eventLabel) IS NOT NULL
  ) AS all_reported_stub_session_ids,
  (
    -- Most sessions only have 1 landing screen
    -- https://sql.telemetry.mozilla.org/queries/95884/source
    -- so the value from the hit with the lowest hitNumber is used.
    SELECT
      MIN_BY(hit.appInfo.landingScreenName, hit.hitNumber)
    FROM
      UNNEST(hits) AS hit
  ) AS landing_screen,
FROM
  daily_sessions
127 changes: 127 additions & 0 deletions
127
sql/moz-fx-data-shared-prod/mozilla_org_derived/ga_sessions_v1/schema.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
fields: | ||
- name: ga_client_id | ||
mode: NULLABLE | ||
type: STRING | ||
description: "Uniquely identifies a GA client, using a cookie on moz.org." | ||
- name: ga_session_id | ||
mode: NULLABLE | ||
type: STRING | ||
description: "Uniquely identifies a GA session." | ||
- name: session_date | ||
mode: NULLABLE | ||
type: DATE | ||
description: "The date of the session. Some sessions span two days: if it does, we take the earlier date." | ||
- name: is_first_session | ||
mode: NULLABLE | ||
type: BOOLEAN | ||
description: "Whether this is the first session for the client." | ||
- name: session_number | ||
mode: NULLABLE | ||
type: INTEGER | ||
description: "The session number for this client. Starts at 1, consecutively increasing." | ||
- name: time_on_site | ||
mode: NULLABLE | ||
type: INTEGER | ||
description: "Amount of time the user was on the site for this session." | ||
- name: pageviews | ||
mode: NULLABLE | ||
type: INTEGER | ||
description: "Total pageviews for this session." | ||
- name: country | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported country for a GA user." | ||
- name: region | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported region for a GA user." | ||
- name: city | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported city for a GA user." | ||
- name: campaign_id | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported campaign ID. Usually associated with AdWords." | ||
- name: campaign | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported campaign value. Usually set by the utm_campaign URL parameter." | ||
- name: source | ||
type: STRING | ||
mode: NULLABLE | ||
description: > | ||
First reported source of the traffic. Could be the name of the search engine, | ||
the referring hostname, or a value of the utm_source URL parameter. | ||
- name: medium | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported medium of the traffic source. Could be 'organic', 'cpc', 'referral', or the value of the utm_medium URL parameter." | ||
- name: term | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported term, or keyword, value. If this was a search results page, this is the keyword entered." | ||
- name: content | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported ad content of the traffic source. Can be set by the utm_content URL parameter." | ||
- name: gclid | ||
type: STRING | ||
mode: NULLABLE | ||
description: "A Google Click ID, which uniquely represents an ad click for Google ads." | ||
- name: device_category | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported device category value. The type of device (Mobile, Tablet, Desktop)." | ||
- name: mobile_device_model | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported device model value." | ||
- name: mobile_device_string | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported mobile device string. The branding, model, and marketing name used to identify the mobile device." | ||
- name: os | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported operating system of the device (e.g., 'Macintosh' or 'Windows')." | ||
- name: os_version | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported os_version value." | ||
- name: language | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported language the device is set to use. Expressed as the IETF language code." | ||
- name: browser | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported browser used (e.g., 'Chrome' or 'Firefox')." | ||
- name: browser_version | ||
type: STRING | ||
mode: NULLABLE | ||
description: "First reported browser_version value." | ||
- name: had_download_event | ||
type: BOOLEAN | ||
mode: NULLABLE | ||
description: "Whether this session had a download event for Firefox." | ||
- name: last_reported_install_target | ||
type: STRING | ||
mode: NULLABLE | ||
description: "The last reported install target for this session (e.g. 'desktop_release' or 'android_beta')." | ||
- name: all_reported_install_targets | ||
type: STRING | ||
mode: REPEATED | ||
description: "All install targets reported for this session (e.g. 'desktop_release' or 'android_beta')." | ||
- name: last_reported_stub_session_id | ||
type: STRING | ||
mode: NULLABLE | ||
description: "The last reported Stub Session ID for this session. Can be used to join with `dl_ga_triplets` to get dl_tokens." | ||
- name: all_reported_stub_session_ids | ||
type: STRING | ||
mode: REPEATED | ||
description: "All reported Stub Session IDs for this session. Can be used to join with `dl_ga_triplets` to get dl_tokens." | ||
- name: landing_screen | ||
type: STRING | ||
mode: NULLABLE | ||
description: "The first reported landing screen for this session. Most sessions only have one, so this is a safe value to use." |
Oops, something went wrong.