From 99d982af0d0c76422a63a6af13ed389aa940f52e Mon Sep 17 00:00:00 2001
From: Eduardo Filho
Date: Fri, 7 Jun 2024 13:53:12 -0400
Subject: [PATCH] fix(glam): add missing fields for percentiles and probe counts with non-norm data (#5769)

---
Review notes (text between the "---" line and the diff is not part of the
commit message):
 * The trailing hunk for histogram_percentiles_v1.sql has been dropped. Its
   only change was inserting a space between `{{ prefix }}` and
   `__histogram_probe_counts_v1`, which makes the rendered FROM clause read
   the table `glam_etl.<prefix>` with `__histogram_probe_counts_v1` as its
   alias instead of the table `glam_etl.<prefix>__histogram_probe_counts_v1`.
 * The diffstat below reflects that change; the post-image blob id on that
   file's "index" line was not recomputed.
 * Leading whitespace in the hunks was reconstructed; confirm it against the
   target files before applying.

 .../templates/extract_probe_counts_v1.sql     | 134 ++++++++++++------
 .../templates/histogram_percentiles_v1.sql    |   2 +-
 2 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
index 287f43ae53..f4249222f7 100644
--- a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
@@ -1,12 +1,16 @@
 {{ header }}
-
-WITH final_probe_extract AS ( SELECT
+WITH final_probe_extract AS (
+  SELECT
     channel,
-    app_version as version,
+    app_version AS version,
     ping_type,
     os,
-    app_build_id as build_id,
-    IF(app_build_id="*", NULL, SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)) as build_date,
+    app_build_id AS build_id,
+    IF(
+      app_build_id = "*",
+      NULL,
+      SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)
+    ) AS build_date,
     metric,
     metric_type,
     -- BigQuery has some null unicode characters which Postgresql doesn't like,
@@ -14,17 +18,23 @@ WITH final_probe_extract AS ( SELECT
     -- length.
     SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
     client_agg_type,
-    MAX(total_users) as total_users,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) as histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)) as percentiles,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_percentiles
-FROM
+    MAX(total_users) AS total_users,
+    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)
+    ) AS percentiles,
+    MAX(
+      IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_percentiles
+  FROM
     `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
-WHERE
+  WHERE
     total_users > {{ total_users }}
     AND app_version NOT IN (2015815747, 2015819723, 2015828803, 2015829155, 3015815747)
-GROUP BY
+  GROUP BY
     channel,
     app_version,
     ping_type,
@@ -37,17 +47,25 @@ GROUP BY
 ),
 -- to populate total_sample for agg_type other than 'count'
 glam_sample_counts AS (
-  SELECT fsc1.os,
+  SELECT
+    fsc1.os,
     fsc1.app_version,
     fsc1.app_build_id,
     fsc1.metric,
     fsc1.key,
     fsc1.ping_type,
     fsc1.agg_type,
-    CASE WHEN fsc1.agg_type in ('max','min','sum','avg') AND fsc2.agg_type = 'count' THEN fsc2.total_sample ELSE fsc1.total_sample END as total_sample
-  FROM `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
-  INNER JOIN `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
-  ON fsc1.os = fsc2.os
+    CASE
+      WHEN fsc1.agg_type IN ('max', 'min', 'sum', 'avg')
+        AND fsc2.agg_type = 'count'
+        THEN fsc2.total_sample
+      ELSE fsc1.total_sample
+    END AS total_sample
+  FROM
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
+  INNER JOIN
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
+    ON fsc1.os = fsc2.os
     AND fsc1.app_build_id = fsc2.app_build_id
     AND fsc1.app_version = fsc2.app_version
     AND fsc1.metric = fsc2.metric
@@ -55,37 +73,61 @@ glam_sample_counts AS (
     AND fsc1.ping_type = fsc2.ping_type
 ),
 -- get all the rcords from view_probe_counts and the matching from view_sample_counts
-ranked_data AS (SELECT
-  cp.channel,
-  cp.version,
-  cp.os,
-  cp.ping_type,
-  cp.build_id,
-  cp.build_date,
-  cp.metric,
-  cp.metric_key,
-  cp.client_agg_type,
-  cp.metric_type,
-  total_users,
-  histogram,
-  percentiles,
-  CASE WHEN client_agg_type = '' THEN 0 ELSE total_sample END AS total_sample,
-  ROW_NUMBER() OVER (PARTITION BY cp.version, cp.os, cp.build_id,cp.ping_type, cp.metric, cp.metric_key, cp.client_agg_type,cp.metric_type,histogram, percentiles
-    ORDER BY total_users, total_sample DESC) as rnk
-FROM
-  final_probe_extract cp
-LEFT JOIN glam_sample_counts sc
-  ON
-  sc.os = cp.os
+ranked_data AS (
+  SELECT
+    cp.channel,
+    cp.version,
+    cp.os,
+    cp.ping_type,
+    cp.build_id,
+    cp.build_date,
+    cp.metric,
+    cp.metric_key,
+    cp.client_agg_type,
+    cp.metric_type,
+    total_users,
+    histogram,
+    percentiles,
+    non_norm_histogram,
+    non_norm_percentiles,
+    CASE
+      WHEN client_agg_type = ''
+        THEN 0
+      ELSE total_sample
+    END AS total_sample,
+    ROW_NUMBER() OVER (
+      PARTITION BY
+        cp.version,
+        cp.os,
+        cp.build_id,
+        cp.ping_type,
+        cp.metric,
+        cp.metric_key,
+        cp.client_agg_type,
+        cp.metric_type,
+        histogram,
+        percentiles,
+        non_norm_histogram,
+        non_norm_percentiles
+      ORDER BY
+        total_users,
+        total_sample DESC
+    ) AS rnk
+  FROM
+    final_probe_extract cp
+  LEFT JOIN
+    glam_sample_counts sc
+    ON sc.os = cp.os
     AND sc.app_build_id = cp.build_id
     AND sc.app_version = cp.version
     AND sc.metric = cp.metric
     AND sc.key = cp.metric_key
     AND total_sample IS NOT NULL
-    AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type='')
+    AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type = '')
 )
 --remove duplicates
-SELECT channel,
+SELECT
+  channel,
   version,
   ping_type,
   os,
@@ -100,6 +142,8 @@ SELECT channel,
   percentiles,
   non_norm_histogram,
   non_norm_percentiles,
-  CAST(total_sample as INT) total_sample
-FROM ranked_data
-WHERE rnk = 1
+  CAST(total_sample AS INT) total_sample
+FROM
+  ranked_data
+WHERE
+  rnk = 1
diff --git a/bigquery_etl/glam/templates/histogram_percentiles_v1.sql b/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
index d0bb4781fe..4a67f354e7 100644
--- a/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
+++ b/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
@@ -1,6 +1,6 @@
 {{ header }}
 SELECT
-  * EXCEPT (aggregates) REPLACE('percentiles' AS agg_type),
+  * EXCEPT (aggregates, non_norm_aggregates) REPLACE('percentiles' AS agg_type),
   ARRAY<STRUCT<key STRING, value FLOAT64>>[
     ('0.1', mozfun.glam.percentile(0.1, aggregates, metric_type)),
     ('1', mozfun.glam.percentile(1, aggregates, metric_type)),