Add ga_sessions_v1 table & view (#4554)

* Add ga_sessions_v1 table & view

This table aggregates session-level data from GA.

* Rename nullify string func

* Apply suggestions from code review

Co-authored-by: Alexander <anicholson@mozilla.com>

* Add upstream backfill deps

* Move depends_on to correct section

---------

Co-authored-by: Alexander <anicholson@mozilla.com>
This commit is contained in:
Frank Bertsch 2023-11-16 15:58:33 -05:00 коммит произвёл GitHub
Родитель 8d7e0fa264
Коммит cbb843e455
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
12 изменённых файлов: 738 добавлений и 0 удалений

Просмотреть файл

@ -1298,3 +1298,19 @@ bqetl_ads:
tags:
- impact/tier_1
- repo/bigquery-etl
bqetl_mozilla_org_derived:
schedule_interval: 0 2 * * *
default_args:
depends_on_past: false
email:
- frank@mozilla.com
- telemetry-alerts@mozilla.com
email_on_failure: true
email_on_retry: true
owner: frank@mozilla.com
retries: 2
retry_delay: 30m
start_date: "2023-11-13"
tags:
- impact/tier_1

Просмотреть файл

@ -0,0 +1,14 @@
friendly_name: Mozilla.org
description: |-
Mozilla.org data, usually derived from Google Analytics (GA).
dataset_base_acl: view
user_facing: true
labels: {}
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,13 @@
-- View over mozilla_org_derived.ga_sessions_v1.
-- Every column is passed through unchanged (SELECT *), except the five
-- attribution columns listed in REPLACE, which are each run through
-- mozfun.ga.nullify_string before being exposed under the same name.
-- NOTE(review): nullify_string presumably turns GA placeholder strings
-- (e.g. empty or "(not set)") into NULL; confirm against the UDF definition.
CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.mozilla_org.ga_sessions`
AS
SELECT
* REPLACE (
mozfun.ga.nullify_string(campaign) AS campaign,
mozfun.ga.nullify_string(source) AS source,
mozfun.ga.nullify_string(medium) AS medium,
mozfun.ga.nullify_string(term) AS term,
mozfun.ga.nullify_string(content) AS content
)
FROM
`moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v1`

Просмотреть файл

@ -0,0 +1,14 @@
friendly_name: Mozilla.org Derived
description: |-
Mozilla.org data, usually derived from Google Analytics
dataset_base_acl: derived
user_facing: false
labels: {}
default_table_workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,7 @@
#fail
-- ga_session_id should be unique across all partitions
{{ is_unique(["ga_session_id"]) }}
#fail
-- Guard against an empty or badly truncated load.
-- NOTE(review): presumably fails when fewer than 10000 rows are found in the
-- data being checked; confirm the exact scope against the min_row_count macro.
{{ min_row_count(10000) }}

Просмотреть файл

@ -0,0 +1,31 @@
friendly_name: Ga Sessions
description: |-
One row for each GA session.
owners:
- frank@mozilla.com
labels:
incremental: true
owner1: frank@mozilla.com
scheduling:
dag_name: bqetl_mozilla_org_derived
date_partition_parameter: session_date
depends_on:
- task_id: "mozilla_org_derived__ga_sessions__v1__backfill_-1"
dag_name: ga_sessions_backfill
execution_delta: 1h
- task_id: "mozilla_org_derived__ga_sessions__v1__backfill_-2"
dag_name: ga_sessions_backfill
execution_delta: 1h
- task_id: "mozilla_org_derived__ga_sessions__v1__backfill_-3"
dag_name: ga_sessions_backfill
execution_delta: 1h
bigquery:
time_partitioning:
type: day
field: 'session_date'
require_partition_filter: true
expiration_days: null
clustering:
fields: ["country"]
references: {}
deprecated: false

Просмотреть файл

@ -0,0 +1,154 @@
-- First note: This table is meant to be forwards-compatible
-- with the GA4 schema: https://support.google.com/analytics/answer/7029846
-- But that's harder, since some of the data is contained within events there (e.g. session_start is an event_param, with the value as the session_id)
-- See https://www.ga4bigquery.com/sessions-dimensions-metrics-ga4/
-- Second note: We do not store user_ids, only client_ids
-- After migration client_ids will be called pseudo_user_ids, see
-- https://louder.com.au/2022/06/27/client-id-in-ga4-what-is-it-and-how-to-get-it-in-your-report/
-- Third note: The only non-forwards-compatible field is mobileDeviceInfo
-- in GA4, that will be split into its components (model, manufacturer, etc.)
-- I think we can simply handle this in the view using some UDFs
-- Fourth note: Data is updated up to three days after the event happens, see
-- https://support.google.com/analytics/answer/7029846?#tables
-- Maps the label of a GA 'Firefox Download' event to a normalized install
-- target of the form <platform>_<channel> (e.g. "desktop_release").
-- Matching is by label prefix; returns NULL for any label that matches none
-- of the known prefixes (including a NULL label).
CREATE TEMP FUNCTION normalize_install_target(target STRING)
RETURNS STRING AS (
  -- See https://sql.telemetry.mozilla.org/queries/95883/source
  CASE
    WHEN target LIKE "Firefox for Desktop%"
      THEN "desktop_release"
    WHEN target LIKE "Firefox ESR%"
      THEN "desktop_esr"
    WHEN target LIKE "Firefox Developer Edition%"
      THEN "desktop_developer_edition"
    -- CASE returns the first matching branch, so the Android beta prefix must
    -- be tested before the generic "Firefox Beta%" prefix; otherwise every
    -- Android beta label would be classified as desktop_beta.
    WHEN target LIKE "Firefox Beta Android%"
      THEN "android_beta"
    WHEN target LIKE "Firefox Beta%"
      THEN "desktop_beta"
    WHEN target LIKE "Firefox Nightly Edition%"
      THEN "desktop_nightly"
    WHEN target LIKE "Firefox for Android%"
      THEN "android_release"
    WHEN target LIKE "Firefox for iOS%"
      THEN "ios_release"
    ELSE NULL
  END
);
-- Produces one row per GA session whose earliest reported date is @session_date.
-- daily_sessions collapses all export rows of a session (a session can show up
-- in more than one daily table) into a single row; the final SELECT then derives
-- the download / stub-session / landing-screen fields from the combined hits.
WITH daily_sessions AS (
  SELECT
    mozfun.ga.nullify_string(clientId) AS ga_client_id,
    -- visitId (or sessionId in GA4) is guaranteed unique only among one client, look at visitId here https://support.google.com/analytics/answer/3437719?hl=en
    CONCAT(mozfun.ga.nullify_string(clientId), CAST(visitId AS STRING)) AS ga_session_id,
    -- Earliest date on which any row of the session was reported
    MIN(PARSE_DATE('%Y%m%d', date)) AS session_date,
    MIN(visitNumber) = 1 AS is_first_session,
    MIN(visitNumber) AS session_number,
    -- Hits from all rows of the session, unpacked again in the final SELECT
    ARRAY_CONCAT_AGG(hits) AS hits,
    SUM(totals.timeOnSite) AS time_on_site,
    SUM(totals.pageviews) AS pageviews,
    /* Geos: value from the row with the earliest visitStartTime */
    MIN_BY(geoNetwork.country, visitStartTime) AS country,
    MIN_BY(geoNetwork.region, visitStartTime) AS region,
    MIN_BY(geoNetwork.city, visitStartTime) AS city,
    /* Attribution: value from the row with the earliest visitStartTime */
    MIN_BY(
      CAST(trafficSource.adwordsClickInfo.campaignId AS STRING),
      visitStartTime
    ) AS campaign_id,
    MIN_BY(trafficSource.campaign, visitStartTime) AS campaign,
    MIN_BY(trafficSource.source, visitStartTime) AS source,
    MIN_BY(trafficSource.medium, visitStartTime) AS medium,
    MIN_BY(trafficSource.keyword, visitStartTime) AS term,
    MIN_BY(trafficSource.adContent, visitStartTime) AS content,
    -- First non-null click ID of the session. ARRAY_AGG without ORDER BY returns
    -- elements in an unspecified order, so order by visitStartTime to make the
    -- pick deterministic and consistent with the MIN_BY fields above.
    -- Yields NULL when the session has no non-null click ID.
    ARRAY_AGG(
      mozfun.ga.nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS
      ORDER BY
        visitStartTime
      LIMIT
        1
    )[SAFE_OFFSET(0)] AS gclid,
    /* Device: value from the row with the earliest visitStartTime */
    MIN_BY(device.deviceCategory, visitStartTime) AS device_category,
    MIN_BY(device.mobileDeviceModel, visitStartTime) AS mobile_device_model,
    MIN_BY(device.mobileDeviceInfo, visitStartTime) AS mobile_device_string,
    MIN_BY(device.operatingSystem, visitStartTime) AS os,
    MIN_BY(device.operatingSystemVersion, visitStartTime) AS os_version,
    MIN_BY(device.language, visitStartTime) AS language,
    MIN_BY(device.browser, visitStartTime) AS browser,
    MIN_BY(device.browserVersion, visitStartTime) AS browser_version,
  FROM
    `moz-fx-data-marketing-prod.65789850.ga_sessions_*`
  WHERE
    -- This table is partitioned, so we only process the data from session_date
    -- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
    -- as separate Airflow tasks (or via bqetl backfill, I haven't decided yet)
    --
    -- Here, we need to take data from yesterday, just in case some of our sessions from today
    -- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
    _TABLE_SUFFIX
    BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
    -- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
    -- will be present in two days, with the same ids. A session should never span more
    -- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
    -- If one does, our uniqueness check will alert us
    AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
  GROUP BY
    ga_client_id,
    ga_session_id
  HAVING
    -- Don't include entries from today that started yesterday
    session_date = @session_date
)
SELECT
  * EXCEPT (hits),
  -- TRUE when at least one hit is a 'Firefox Download' event
  (
    SELECT
      LOGICAL_OR(type = 'EVENT' AND eventInfo.eventAction = 'Firefox Download')
    FROM
      UNNEST(hits)
  ) AS had_download_event,
  -- Install target of the download event with the highest hitNumber,
  -- ignoring labels that normalize_install_target does not recognise
  (
    SELECT
      MAX_BY(normalize_install_target(eventInfo.eventLabel), hitNumber)
    FROM
      UNNEST(hits)
    WHERE
      type = 'EVENT'
      AND eventInfo.eventAction = 'Firefox Download'
      AND normalize_install_target(eventInfo.eventLabel) IS NOT NULL
  ) AS last_reported_install_target,
  -- Distinct recognised install targets across all download events
  (
    SELECT
      ARRAY_AGG(DISTINCT normalize_install_target(eventInfo.eventLabel))
    FROM
      UNNEST(hits)
    WHERE
      type = 'EVENT'
      AND eventInfo.eventAction = 'Firefox Download'
      AND normalize_install_target(eventInfo.eventLabel) IS NOT NULL
  ) AS all_reported_install_targets,
  -- Label of the 'Stub Session ID' event with the highest hitNumber
  (
    SELECT
      MAX_BY(eventInfo.eventLabel, hitNumber)
    FROM
      UNNEST(hits)
    WHERE
      type = 'EVENT'
      AND eventInfo.eventAction = 'Stub Session ID'
      AND mozfun.ga.nullify_string(eventInfo.eventLabel) IS NOT NULL
  ) AS last_reported_stub_session_id,
  -- Distinct labels across all 'Stub Session ID' events
  (
    SELECT
      ARRAY_AGG(DISTINCT eventInfo.eventLabel)
    FROM
      UNNEST(hits)
    WHERE
      type = 'EVENT'
      AND eventInfo.eventAction = 'Stub Session ID'
      AND mozfun.ga.nullify_string(eventInfo.eventLabel) IS NOT NULL
  ) AS all_reported_stub_session_ids,
  (
    -- Most sessions only have 1 landing screen
    -- https://sql.telemetry.mozilla.org/queries/95884/source
    -- so take the one from the hit with the lowest hitNumber
    SELECT
      MIN_BY(appInfo.landingScreenName, hitNumber)
    FROM
      UNNEST(hits)
  ) AS landing_screen,
FROM
  daily_sessions

Просмотреть файл

@ -0,0 +1,127 @@
fields:
- name: ga_client_id
mode: NULLABLE
type: STRING
description: "Uniquely identifies a GA client, using a cookie on moz.org."
- name: ga_session_id
mode: NULLABLE
type: STRING
description: "Uniquely identifies a GA session."
- name: session_date
mode: NULLABLE
type: DATE
description: "The date of the session. Some sessions span two days: if one does, we take the earlier date."
- name: is_first_session
mode: NULLABLE
type: BOOLEAN
description: "Whether this is the first session for the client."
- name: session_number
mode: NULLABLE
type: INTEGER
description: "The session number for this client. Starts at 1, consecutively increasing."
- name: time_on_site
mode: NULLABLE
type: INTEGER
description: "Amount of time the user was on the site for this session."
- name: pageviews
mode: NULLABLE
type: INTEGER
description: "Total pageviews for this session."
- name: country
type: STRING
mode: NULLABLE
description: "First reported country for a GA user."
- name: region
type: STRING
mode: NULLABLE
description: "First reported region for a GA user."
- name: city
type: STRING
mode: NULLABLE
description: "First reported city for a GA user."
- name: campaign_id
type: STRING
mode: NULLABLE
description: "First reported campaign ID. Usually associated with AdWords."
- name: campaign
type: STRING
mode: NULLABLE
description: "First reported campaign value. Usually set by the utm_campaign URL parameter."
- name: source
type: STRING
mode: NULLABLE
description: >
First reported source of the traffic. Could be the name of the search engine,
the referring hostname, or a value of the utm_source URL parameter.
- name: medium
type: STRING
mode: NULLABLE
description: "First reported medium of the traffic source. Could be 'organic', 'cpc', 'referral', or the value of the utm_medium URL parameter."
- name: term
type: STRING
mode: NULLABLE
description: "First reported term, or keyword, value. If this was a search results page, this is the keyword entered."
- name: content
type: STRING
mode: NULLABLE
description: "First reported ad content of the traffic source. Can be set by the utm_content URL parameter."
- name: gclid
type: STRING
mode: NULLABLE
description: "A Google Click ID, which uniquely represent an ad click for Google ads."
- name: device_category
type: STRING
mode: NULLABLE
description: "First reported device category value. The type of device (Mobile, Tablet, Desktop)."
- name: mobile_device_model
type: STRING
mode: NULLABLE
description: "First reported device model value."
- name: mobile_device_string
type: STRING
mode: NULLABLE
description: "First reported mobile device string. The branding, model, and marketing name used to identify the mobile device."
- name: os
type: STRING
mode: NULLABLE
description: "First reported operating system of the device (e.g., 'Macintosh' or 'Windows')."
- name: os_version
type: STRING
mode: NULLABLE
description: "First reported os_version value."
- name: language
type: STRING
mode: NULLABLE
description: "First reported language the device is set to use. Expressed as the IETF language code."
- name: browser
type: STRING
mode: NULLABLE
description: "First reported browser used (e.g., 'Chrome' or 'Firefox')."
- name: browser_version
type: STRING
mode: NULLABLE
description: "First reported browser_version value."
- name: had_download_event
type: BOOLEAN
mode: NULLABLE
description: "Whether this session had a download event for Firefox."
- name: last_reported_install_target
type: STRING
mode: NULLABLE
description: "The last reported install target for this session (e.g. 'desktop_release' or 'android_beta')."
- name: all_reported_install_targets
type: STRING
mode: REPEATED
description: "All install targets reported for this session (e.g. 'desktop_release' or 'android_beta')."
- name: last_reported_stub_session_id
type: STRING
mode: NULLABLE
description: "The last reported Stub Session ID for this session. Can be used to join with `dl_ga_triplets` to get dl_tokens."
- name: all_reported_stub_session_ids
type: STRING
mode: REPEATED
description: "All reported Stub Session IDs for this session. Can be used to join with `dl_ga_triplets` to get dl_tokens."
- name: landing_screen
type: STRING
mode: NULLABLE
description: "The first reported landing screen for this session. Most sessions only have one, so this is a safe value to use."

Просмотреть файл

@ -0,0 +1,225 @@
[
{
"mode": "NULLABLE",
"name": "visitId",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitNumber",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitStartTime",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "date",
"type": "STRING"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "timeOnSite",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "pageviews",
"type": "INTEGER"
}
],
"mode": "NULLABLE",
"name": "totals",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "campaign",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "source",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "medium",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adContent",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "keyword",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adwordsClickInfo",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "gclId",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "campaignId",
"type": "INTEGER"
}
]
}
],
"mode": "NULLABLE",
"name": "trafficSource",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "browser",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "browserVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystem",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystemVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "language",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "deviceCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceModel",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceInfo",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "device",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "country",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "region",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "city",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "geoNetwork",
"type": "RECORD"
},
{
"fields": [
{
"fields": [
{
"mode": "NULLABLE",
"name": "pagePath",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "page",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "landingScreenName",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "appInfo",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "eventCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventAction",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventLabel",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "eventInfo",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "type",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "hitNumber",
"type": "INTEGER"
}
],
"mode": "REPEATED",
"name": "hits",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "clientId",
"type": "STRING"
}
]

Просмотреть файл

@ -0,0 +1,31 @@
- ga_client_id: client1
ga_session_id: client11
session_date: 2023-03-31
is_first_session: true
session_number: 1
time_on_site: 33
pageviews: 2
country: earlierCountry
region: earlierRegion
city: earlierCity
campaign_id: "1"
gclid: "earlierGclid"
campaign: "earlierCampaign"
source: "earlierSource"
medium: "earlierMedium"
content: "earlierContent"
term: "earlierKeyword"
device_category: "earlierDeviceCategory"
mobile_device_model: "earlierMobileDeviceModel"
mobile_device_string: "earlierMobileDeviceInfo"
os: "earlierOperatingSystem"
os_version: "earlierOperatingSystemVersion"
language: "earlierLanguage"
browser: "earlierBrowser"
browser_version: "earlierBrowserVersion"
had_download_event: true
last_reported_install_target: "android_release"
all_reported_install_targets: ["desktop_release", "android_release"]
last_reported_stub_session_id: "laterStubSessionId"
all_reported_stub_session_ids: ["earlierStubSessionId", "laterStubSessionId"]
landing_screen: first

Просмотреть файл

@ -0,0 +1,102 @@
- clientId: client1
visitId: '1'
visitNumber: 1
date: '20230331'
visitStartTime: 1
geoNetwork:
country: "earlierCountry"
region: "earlierRegion"
city: "earlierCity"
trafficSource:
adwordsClickInfo:
campaignId: 1
gclId: "earlierGclid"
campaign: "earlierCampaign"
source: "earlierSource"
medium: "earlierMedium"
adContent: "earlierContent"
keyword: "earlierKeyword"
device:
deviceCategory: "earlierDeviceCategory"
mobileDeviceModel: "earlierMobileDeviceModel"
mobileDeviceInfo: "earlierMobileDeviceInfo"
operatingSystem: "earlierOperatingSystem"
operatingSystemVersion: "earlierOperatingSystemVersion"
language: "earlierLanguage"
browser: "earlierBrowser"
browserVersion: "earlierBrowserVersion"
totals:
pageviews: 1
timeOnSite: 11
hits:
- page:
pagePath: "/en-GB/firefox/session"
appInfo:
landingScreenName: first
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Stub Session ID
eventLabel: earlierStubSessionId
hitNumber: 1
type: EVENT
- page:
pagePath: "/en-GB/firefox/session"
appInfo:
landingScreenName: second
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Stub Session ID
eventLabel: laterStubSessionId
hitNumber: 2
type: EVENT
- page:
pagePath: "/en-GB/firefox/"
appInfo:
landingScreenName: third
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Firefox Download
eventLabel: Firefox for Desktop
hitNumber: 3
type: EVENT
- clientId: client1
visitId: '1'
visitNumber: 1
date: '20230331'
visitStartTime: 2
geoNetwork:
country: "laterCountry"
region: "laterRegion"
city: "laterCity"
trafficSource:
adwordsClickInfo:
campaignId: 2
gclId: "laterGclid"
campaign: "laterCampaign"
source: "laterSource"
medium: "laterMedium"
adContent: "laterContent"
keyword: "laterKeyword"
device:
deviceCategory: "laterDeviceCategory"
mobileDeviceModel: "laterMobileDeviceModel"
mobileDeviceInfo: "laterMobileDeviceInfo"
operatingSystem: "laterOperatingSystem"
operatingSystemVersion: "laterOperatingSystemVersion"
language: "laterLanguage"
browser: "laterBrowser"
browserVersion: "laterBrowserVersion"
totals:
pageviews: 1
timeOnSite: 22
hits:
- page:
pagePath: "/en-GB/firefox/"
appInfo:
landingScreenName: fourth
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Firefox Download
eventLabel: Firefox for Android
hitNumber: 4
type: EVENT

Просмотреть файл

@ -0,0 +1,4 @@
---
- name: session_date
type: DATE
value: 2023-03-31