Include GA intraday sessions tables (#4582)

* Include GA intraday sessions tables

* Update doc string on backfilling ga_sessions

* Don't dry-run stub_attribution view
This commit is contained in:
Frank Bertsch 2023-11-20 11:58:45 -05:00 коммит произвёл GitHub
Родитель f3b13c652e
Коммит 05fed88b07
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 356 добавлений и 20 удалений

Просмотреть файл

@ -178,6 +178,7 @@ dry_run:
- sql/moz-fx-data-shared-prod/firefox_desktop/top_sites/view.sql
- sql/moz-fx-data-shared-prod/firefox_desktop/quick_suggest/view.sql
- sql/moz-fx-data-shared-prod/stub_attribution_service_derived/dl_token_ga_attribution_lookup_v1/query.sql
- sql/moz-fx-data-shared-prod/stub_attribution_service/dl_token_ga_attribution_lookup/view.sql
# Materialized views
- sql/moz-fx-data-shared-prod/telemetry_derived/experiment_search_events_live_v1/init.sql
- sql/moz-fx-data-shared-prod/telemetry_derived/experiment_events_live_v1/init.sql

Просмотреть файл

@ -34,7 +34,39 @@ RETURNS STRING AS (
END
);
WITH daily_sessions AS (
WITH historic_and_intraday AS (
SELECT
*
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_*`
WHERE
-- This table is partitioned, so we only process the data from session_date
-- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
-- as separate Airflow tasks
--
-- Here, we need to take data from yesterday, just in case some of our sessions from today
-- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
-- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
-- will be present in two days, with the same ids. A session should never span more
-- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
-- If one does, our uniqueness check will alert us
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
UNION ALL
-- Intraday sessions are "real-time" exports of the current day's sessions.
-- Usually we wouldn't need these, but sometimes GA is slow in adding the
-- intraday sessions back into ga_sessions
SELECT
*
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_intraday_*`
WHERE
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
),
daily_sessions AS (
SELECT
mozfun.ga.nullify_string(clientId) AS ga_client_id,
-- visitId (or sessionId in GA4) is guaranteed unique only among one client, look at visitId here https://support.google.com/analytics/answer/3437719?hl=en
@ -59,9 +91,9 @@ WITH daily_sessions AS (
MIN_BY(trafficSource.medium, visitStartTime) AS medium,
MIN_BY(trafficSource.keyword, visitStartTime) AS term,
MIN_BY(trafficSource.adContent, visitStartTime) AS content,
ARRAY_AGG(
mozfun.ga.nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS
)[0] AS gclid,
ARRAY_AGG(mozfun.ga.nullify_string(trafficSource.adwordsClickInfo.gclId) IGNORE NULLS)[
0
] AS gclid,
/* Device */
MIN_BY(device.deviceCategory, visitStartTime) AS device_category,
MIN_BY(device.mobileDeviceModel, visitStartTime) AS mobile_device_model,
@ -72,26 +104,12 @@ WITH daily_sessions AS (
MIN_BY(device.browser, visitStartTime) AS browser,
MIN_BY(device.browserVersion, visitStartTime) AS browser_version,
FROM
`moz-fx-data-marketing-prod.65789850.ga_sessions_*`
WHERE
-- This table is partitioned, so we only process the data from session_date
-- To handle late-arriving data, we process 3 days of data each day (re-processing the past 2)
-- as separate Airflow tasks (or via bqetl backfill, I haven't decided yet)
--
-- Here, we need to take data from yesterday, just in case some of our sessions from today
-- actually started yesterday. If they did, they'll be filtered out in the HAVING clause
_TABLE_SUFFIX
BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(@session_date, INTERVAL 1 DAY))
-- However, we have data for today that will arrive _tomorrow_! Some inter-day sessions
-- will be present in two days, with the same ids. A session should never span more
-- than two days though, see https://sql.telemetry.mozilla.org/queries/95882/source
-- If one does, our uniqueness check will alert us
AND FORMAT_DATE('%Y%m%d', DATE_ADD(@session_date, INTERVAL 1 DAY))
historic_and_intraday
GROUP BY
ga_client_id,
ga_session_id
HAVING
-- Don't include entries from today that started yesterday
-- Don't include entries that started yesterday or tomorrow
session_date = @session_date
)
SELECT

Просмотреть файл

@ -0,0 +1,225 @@
[
{
"mode": "NULLABLE",
"name": "visitId",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitNumber",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "visitStartTime",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "date",
"type": "STRING"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "timeOnSite",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "pageviews",
"type": "INTEGER"
}
],
"mode": "NULLABLE",
"name": "totals",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "campaign",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "source",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "medium",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adContent",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "keyword",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "adwordsClickInfo",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "gclId",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "campaignId",
"type": "INTEGER"
}
]
}
],
"mode": "NULLABLE",
"name": "trafficSource",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "browser",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "browserVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystem",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "operatingSystemVersion",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "language",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "deviceCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceModel",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "mobileDeviceInfo",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "device",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "country",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "region",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "city",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "geoNetwork",
"type": "RECORD"
},
{
"fields": [
{
"fields": [
{
"mode": "NULLABLE",
"name": "pagePath",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "page",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "landingScreenName",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "appInfo",
"type": "RECORD"
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "eventCategory",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventAction",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "eventLabel",
"type": "STRING"
}
],
"mode": "NULLABLE",
"name": "eventInfo",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "type",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "hitNumber",
"type": "INTEGER"
}
],
"mode": "REPEATED",
"name": "hits",
"type": "RECORD"
},
{
"mode": "NULLABLE",
"name": "clientId",
"type": "STRING"
}
]

Просмотреть файл

@ -29,3 +29,34 @@
last_reported_stub_session_id: "laterStubSessionId"
all_reported_stub_session_ids: ["earlierStubSessionId", "laterStubSessionId"]
landing_screen: first
- ga_client_id: clientIntraday
ga_session_id: clientIntraday1
session_date: 2023-03-31
is_first_session: true
session_number: 1
time_on_site: 11
pageviews: 1
country: earlierCountry
region: earlierRegion
city: earlierCity
campaign_id: "1"
gclid: "earlierGclid"
campaign: "earlierCampaign"
source: "earlierSource"
medium: "earlierMedium"
content: "earlierContent"
term: "earlierKeyword"
device_category: "earlierDeviceCategory"
mobile_device_model: "earlierMobileDeviceModel"
mobile_device_string: "earlierMobileDeviceInfo"
os: "earlierOperatingSystem"
os_version: "earlierOperatingSystemVersion"
language: "earlierLanguage"
browser: "earlierBrowser"
browser_version: "earlierBrowserVersion"
had_download_event: true
last_reported_install_target: "desktop_release"
all_reported_install_targets: ["desktop_release"]
last_reported_stub_session_id: "laterStubSessionId"
all_reported_stub_session_ids: ["earlierStubSessionId", "laterStubSessionId"]
landing_screen: first

Просмотреть файл

@ -0,0 +1,61 @@
- clientId: clientIntraday
visitId: '1'
visitNumber: 1
date: '20230331'
visitStartTime: 1
geoNetwork:
country: "earlierCountry"
region: "earlierRegion"
city: "earlierCity"
trafficSource:
adwordsClickInfo:
campaignId: 1
gclId: "earlierGclid"
campaign: "earlierCampaign"
source: "earlierSource"
medium: "earlierMedium"
adContent: "earlierContent"
keyword: "earlierKeyword"
device:
deviceCategory: "earlierDeviceCategory"
mobileDeviceModel: "earlierMobileDeviceModel"
mobileDeviceInfo: "earlierMobileDeviceInfo"
operatingSystem: "earlierOperatingSystem"
operatingSystemVersion: "earlierOperatingSystemVersion"
language: "earlierLanguage"
browser: "earlierBrowser"
browserVersion: "earlierBrowserVersion"
totals:
pageviews: 1
timeOnSite: 11
hits:
- page:
pagePath: "/en-GB/firefox/session"
appInfo:
landingScreenName: first
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Stub Session ID
eventLabel: earlierStubSessionId
hitNumber: 1
type: EVENT
- page:
pagePath: "/en-GB/firefox/session"
appInfo:
landingScreenName: second
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Stub Session ID
eventLabel: laterStubSessionId
hitNumber: 2
type: EVENT
- page:
pagePath: "/en-GB/firefox/"
appInfo:
landingScreenName: third
eventInfo:
eventCategory: "/firefox/ Interactions"
eventAction: Firefox Download
eventLabel: Firefox for Desktop
hitNumber: 3
type: EVENT