fix(glam): add missing fields for percentiles and probe counts with non-norm data (#5769)

This commit is contained in:
Eduardo Filho 2024-06-07 13:53:12 -04:00 коммит произвёл GitHub
Родитель aaf75a37de
Коммит 99d982af0d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
2 изменённых файлов: 91 добавлений и 47 удалений

Просмотреть файл

@ -1,12 +1,16 @@
{{ header }}
WITH final_probe_extract AS ( SELECT
WITH final_probe_extract AS (
SELECT
channel,
app_version as version,
app_version AS version,
ping_type,
os,
app_build_id as build_id,
IF(app_build_id="*", NULL, SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)) as build_date,
app_build_id AS build_id,
IF(
app_build_id = "*",
NULL,
SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)
) AS build_date,
metric,
metric_type,
-- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -14,11 +18,17 @@ WITH final_probe_extract AS ( SELECT
-- length.
SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
client_agg_type,
MAX(total_users) as total_users,
MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) as histogram,
MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)) as percentiles,
MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_histogram,
MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_percentiles
MAX(total_users) AS total_users,
MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram,
MAX(
IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)
) AS percentiles,
MAX(
IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
) AS non_norm_histogram,
MAX(
IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
) AS non_norm_percentiles
FROM
`{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
WHERE
@ -37,16 +47,24 @@ GROUP BY
),
-- to populate total_sample for agg_type other than 'count'
glam_sample_counts AS (
SELECT fsc1.os,
SELECT
fsc1.os,
fsc1.app_version,
fsc1.app_build_id,
fsc1.metric,
fsc1.key,
fsc1.ping_type,
fsc1.agg_type,
CASE WHEN fsc1.agg_type in ('max','min','sum','avg') AND fsc2.agg_type = 'count' THEN fsc2.total_sample ELSE fsc1.total_sample END as total_sample
FROM `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
INNER JOIN `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
CASE
WHEN fsc1.agg_type IN ('max', 'min', 'sum', 'avg')
AND fsc2.agg_type = 'count'
THEN fsc2.total_sample
ELSE fsc1.total_sample
END AS total_sample
FROM
`{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
INNER JOIN
`{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
ON fsc1.os = fsc2.os
AND fsc1.app_build_id = fsc2.app_build_id
AND fsc1.app_version = fsc2.app_version
@ -55,7 +73,8 @@ glam_sample_counts AS (
AND fsc1.ping_type = fsc2.ping_type
),
-- get all the records from view_probe_counts and the matching rows from view_sample_counts
ranked_data AS (SELECT
ranked_data AS (
SELECT
cp.channel,
cp.version,
cp.os,
@ -69,14 +88,36 @@ ranked_data AS (SELECT
total_users,
histogram,
percentiles,
CASE WHEN client_agg_type = '' THEN 0 ELSE total_sample END AS total_sample,
ROW_NUMBER() OVER (PARTITION BY cp.version, cp.os, cp.build_id,cp.ping_type, cp.metric, cp.metric_key, cp.client_agg_type,cp.metric_type,histogram, percentiles
ORDER BY total_users, total_sample DESC) as rnk
non_norm_histogram,
non_norm_percentiles,
CASE
WHEN client_agg_type = ''
THEN 0
ELSE total_sample
END AS total_sample,
ROW_NUMBER() OVER (
PARTITION BY
cp.version,
cp.os,
cp.build_id,
cp.ping_type,
cp.metric,
cp.metric_key,
cp.client_agg_type,
cp.metric_type,
histogram,
percentiles,
non_norm_histogram,
non_norm_percentiles
ORDER BY
total_users,
total_sample DESC
) AS rnk
FROM
final_probe_extract cp
LEFT JOIN glam_sample_counts sc
ON
sc.os = cp.os
LEFT JOIN
glam_sample_counts sc
ON sc.os = cp.os
AND sc.app_build_id = cp.build_id
AND sc.app_version = cp.version
AND sc.metric = cp.metric
@ -85,7 +126,8 @@ LEFT JOIN glam_sample_counts sc
AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type = '')
)
-- remove duplicates: keep only the row with rnk = 1 from ranked_data
SELECT channel,
SELECT
channel,
version,
ping_type,
os,
@ -100,6 +142,8 @@ SELECT channel,
percentiles,
non_norm_histogram,
non_norm_percentiles,
CAST(total_sample as INT) total_sample
FROM ranked_data
WHERE rnk = 1
CAST(total_sample AS INT) total_sample
FROM
ranked_data
WHERE
rnk = 1

Просмотреть файл

@ -1,6 +1,6 @@
{{ header }}
SELECT
* EXCEPT (aggregates) REPLACE('percentiles' AS agg_type),
* EXCEPT (aggregates, non_norm_aggregates) REPLACE('percentiles' AS agg_type),
ARRAY<STRUCT<key STRING, value FLOAT64>>[
('0.1', mozfun.glam.percentile(0.1, aggregates, metric_type)),
('1', mozfun.glam.percentile(1, aggregates, metric_type)),