-- bigquery-etl/bigquery_etl/glean_usage/templates/baseline_clients_daily_v1.sql
--
-- 141 lines
-- 4.5 KiB
-- SQL

{{ header }}
{% if init %}
-- Init mode only: create the destination table if it does not exist yet and
-- populate it from the query that follows. The table is partitioned by
-- submission_date (a partition filter is mandatory for readers) and clustered
-- by normalized_channel, then sample_id.
CREATE TABLE IF NOT EXISTS `{{ daily_table }}`
PARTITION BY submission_date
CLUSTER BY normalized_channel, sample_id
OPTIONS (require_partition_filter = TRUE)
AS
{% endif %}
WITH base AS (
  -- One row per ping read from the baseline table, with the raw fields
  -- normalized into the columns the later stages work on.
  SELECT
    -- Submission time and the client identity.
    submission_timestamp,
    DATE(submission_timestamp) AS submission_date,
    LOWER(client_info.client_id) AS client_id,
    sample_id,
    -- Only the first 10 characters (the YYYY-MM-DD part) of first_run_date are
    -- parsed; SAFE. turns an unparseable value into NULL instead of an error.
    SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10)) AS first_run_date,
    -- end_time is parsed at minute resolution with a UTC offset; values that do
    -- not match the format become NULL.
    SAFE.PARSE_TIMESTAMP('%FT%H:%M%Ez', ping_info.end_time) AS parsed_end_time,
    -- The result is used as a number of seconds further down (INTERVAL duration SECOND).
    udf.glean_timespan_seconds(metrics.timespan.glean_baseline_duration) AS duration,
    -- Client and device dimensions.
    client_info.android_sdk_version AS android_sdk_version,
    client_info.app_build AS app_build,
    client_info.app_channel AS app_channel,
    client_info.app_display_version AS app_display_version,
    client_info.architecture AS architecture,
    client_info.device_manufacturer AS device_manufacturer,
    client_info.device_model AS device_model,
    client_info.telemetry_sdk_build AS telemetry_sdk_build,
    -- Prefer the locale from client_info; fall back to the baseline metric.
    COALESCE(client_info.locale, metrics.string.glean_baseline_locale) AS locale,
    -- Geo and normalized dimensions.
    metadata.geo.city AS city,
    metadata.geo.country AS country,
    normalized_channel,
    normalized_os,
    normalized_os_version
  FROM
    `{{ baseline_table }}`
  -- No filter on ping reason is applied here on purpose.
  -- Baseline pings with 'foreground' reason were first introduced in early April 2020;
  -- we initially excluded them from baseline_clients_daily so that we could measure
  -- effects on KPIs. On 2020-08-25, we removed the filter on reason and backfilled. See:
  -- https://bugzilla.mozilla.org/show_bug.cgi?id=1627286
  -- https://jira.mozilla.com/browse/DS-1018
),
--
with_dates AS (
  -- Adds the calendar dates on which each ping's session started and ended.
  SELECT
    *,
    -- For explanation of session start time calculation, see Glean docs:
    -- https://mozilla.github.io/glean/book/user/pings/baseline.html#contents
    --
    -- duration is client-reported. SAFE. makes a subtraction whose result falls
    -- outside the valid TIMESTAMP range yield NULL instead of aborting the whole
    -- query because of a single malformed ping. A NULL session_start_date leads to
    -- a NULL offset, which sets no bit in days_seen_session_start_bits.
    DATE(SAFE.TIMESTAMP_SUB(parsed_end_time, INTERVAL duration SECOND)) AS session_start_date,
    -- NULL when end_time could not be parsed.
    DATE(parsed_end_time) AS session_end_date,
  FROM
    base
),
--
with_date_offsets AS (
  -- Expresses the session dates as whole-day distances from the submission date:
  -- 0 means the same day, a positive value means the session date lies that many
  -- days before submission_date, a negative value means it lies after it.
  SELECT
    *,
    DATE_DIFF(submission_date, session_start_date, DAY) AS session_start_date_offset,
    DATE_DIFF(submission_date, session_end_date, DAY) AS session_end_date_offset
  FROM
    with_dates
),
--
windowed AS (
  -- Computes per-group aggregates over all pings sharing the same
  -- (sample_id, client_id, submission_date) using window functions, so every row
  -- of a group carries the same aggregate values. _n numbers the rows of each
  -- group by submission_timestamp so that the outer query can keep a single one.
  SELECT
    submission_date,
    client_id,
    sample_id,
    ROW_NUMBER() OVER client_day AS _n,
    --
    -- Take the earliest first_run_date if ambiguous.
    MIN(first_run_date) OVER client_day_all_rows AS first_run_date,
    --
    -- Sums over distinct baseline pings.
    -- Durations outside [0, 100000] are counted as 0.
    SUM(
      IF(duration BETWEEN 0 AND 100000, duration, 0)
    ) OVER client_day_all_rows AS durations,
    --
    -- Bit patterns capturing activity dates relative to the submission date:
    -- bit i is set when a session date lies i days before submission_date.
    -- Offsets outside 0..27 (and NULL offsets) map to NULL and set no bit.
    BIT_OR(
      1 << IF(session_start_date_offset BETWEEN 0 AND 27, session_start_date_offset, NULL)
    ) OVER client_day_all_rows AS days_seen_session_start_bits,
    BIT_OR(
      1 << IF(session_end_date_offset BETWEEN 0 AND 27, session_end_date_offset, NULL)
    ) OVER client_day_all_rows AS days_seen_session_end_bits,
    --
    -- For all other dimensions, we use the mode of observed values in the day.
    -- NOTE(review): tie-breaking is up to udf.mode_last; the values are handed to
    -- it ordered by submission_timestamp.
    udf.mode_last(ARRAY_AGG(normalized_channel) OVER client_day_all_rows) AS normalized_channel,
    udf.mode_last(ARRAY_AGG(normalized_os) OVER client_day_all_rows) AS normalized_os,
    udf.mode_last(
      ARRAY_AGG(normalized_os_version) OVER client_day_all_rows
    ) AS normalized_os_version,
    udf.mode_last(
      ARRAY_AGG(android_sdk_version) OVER client_day_all_rows
    ) AS android_sdk_version,
    udf.mode_last(ARRAY_AGG(locale) OVER client_day_all_rows) AS locale,
    udf.mode_last(ARRAY_AGG(city) OVER client_day_all_rows) AS city,
    udf.mode_last(ARRAY_AGG(country) OVER client_day_all_rows) AS country,
    udf.mode_last(ARRAY_AGG(app_build) OVER client_day_all_rows) AS app_build,
    udf.mode_last(ARRAY_AGG(app_channel) OVER client_day_all_rows) AS app_channel,
    udf.mode_last(
      ARRAY_AGG(app_display_version) OVER client_day_all_rows
    ) AS app_display_version,
    udf.mode_last(ARRAY_AGG(architecture) OVER client_day_all_rows) AS architecture,
    udf.mode_last(
      ARRAY_AGG(device_manufacturer) OVER client_day_all_rows
    ) AS device_manufacturer,
    udf.mode_last(ARRAY_AGG(device_model) OVER client_day_all_rows) AS device_model,
    udf.mode_last(
      ARRAY_AGG(telemetry_sdk_build) OVER client_day_all_rows
    ) AS telemetry_sdk_build
  FROM
    with_date_offsets
  WHERE
    -- Init mode processes all history from 2018 onwards; otherwise only the
    -- single day given by the @submission_date query parameter is processed.
    {% if init %}
    submission_date >= '2018-01-01'
    {% else %}
    submission_date = @submission_date
    {% endif %}
  WINDOW
    -- Base window without a frame clause: ROW_NUMBER cannot accept a frame clause.
    client_day AS (
      PARTITION BY sample_id, client_id, submission_date
      ORDER BY submission_timestamp
    ),
    -- Same partitioning and ordering, with the frame widened to the whole
    -- partition so that every row sees all pings of its group.
    client_day_all_rows AS (
      client_day
      ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    )
)
--
-- Every row of a (sample_id, client_id, submission_date) group carries the same
-- aggregated values, so keep only the first row of each group and drop the
-- helper row counter from the output.
SELECT
  * EXCEPT (_n)
FROM
  windowed
WHERE
  _n = 1