fix(glam): add missing fields for percentiles and probe counts with non-norm data (#5769)

2024-06-07 13:53:12 -04:00 · 2024-06-07 13:53:12 -04:00 · 99d982af0d
--- a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
@ -1,12 +1,16 @@
 {{ header }}
-
-WITH final_probe_extract AS ( SELECT
+WITH final_probe_extract AS (
+  SELECT
    channel,
-    app_version as version,
+    app_version AS version,
    ping_type,
    os,
-    app_build_id as build_id,
-    IF(app_build_id="*", NULL, SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)) as build_date,
+    app_build_id AS build_id,
+    IF(
+      app_build_id = "*",
+      NULL,
+      SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)
+    ) AS build_date,
    metric,
    metric_type,
    -- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -14,11 +18,17 @@ WITH final_probe_extract AS ( SELECT
    -- length.
    SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
    client_agg_type,
-    MAX(total_users) as total_users,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) as histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)) as percentiles,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_percentiles
+    MAX(total_users) AS total_users,
+    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)
+    ) AS percentiles,
+    MAX(
+      IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_percentiles
  FROM
    `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
  WHERE
@ -37,16 +47,24 @@ GROUP BY
 ),
 -- to populate total_sample for agg_type other than 'count'
 glam_sample_counts AS (
-  SELECT fsc1.os,
+  SELECT
+    fsc1.os,
    fsc1.app_version,
    fsc1.app_build_id,
    fsc1.metric,
    fsc1.key,
    fsc1.ping_type,
    fsc1.agg_type,
-    CASE WHEN fsc1.agg_type in ('max','min','sum','avg') AND fsc2.agg_type = 'count' THEN fsc2.total_sample ELSE fsc1.total_sample END as total_sample
-  FROM `{{ dataset }}.{{ prefix }}__view_sample_counts_v1`  fsc1
-  INNER JOIN `{{ dataset }}.{{ prefix }}__view_sample_counts_v1`  fsc2
+    CASE
+      WHEN fsc1.agg_type IN ('max', 'min', 'sum', 'avg')
+        AND fsc2.agg_type = 'count'
+        THEN fsc2.total_sample
+      ELSE fsc1.total_sample
+    END AS total_sample
+  FROM
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
+  INNER JOIN
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
    ON fsc1.os = fsc2.os
    AND fsc1.app_build_id = fsc2.app_build_id
    AND fsc1.app_version = fsc2.app_version
@ -55,7 +73,8 @@ glam_sample_counts AS (
    AND fsc1.ping_type = fsc2.ping_type
 ),
 -- get all the records from view_probe_counts and the matching rows from view_sample_counts
-ranked_data AS (SELECT
+ranked_data AS (
+  SELECT
    cp.channel,
    cp.version,
    cp.os,
@ -69,14 +88,36 @@ ranked_data AS (SELECT
    total_users,
    histogram,
    percentiles,
-  CASE WHEN client_agg_type = '' THEN 0 ELSE total_sample END AS total_sample,
-  ROW_NUMBER() OVER (PARTITION BY cp.version, cp.os, cp.build_id,cp.ping_type, cp.metric, cp.metric_key, cp.client_agg_type,cp.metric_type,histogram, percentiles
-                    ORDER BY total_users, total_sample DESC) as rnk
+    non_norm_histogram,
+    non_norm_percentiles,
+    CASE
+      WHEN client_agg_type = ''
+        THEN 0
+      ELSE total_sample
+    END AS total_sample,
+    ROW_NUMBER() OVER (
+      PARTITION BY
+        cp.version,
+        cp.os,
+        cp.build_id,
+        cp.ping_type,
+        cp.metric,
+        cp.metric_key,
+        cp.client_agg_type,
+        cp.metric_type,
+        histogram,
+        percentiles,
+        non_norm_histogram,
+        non_norm_percentiles
+      ORDER BY
+        total_users,
+        total_sample DESC
+    ) AS rnk
  FROM
    final_probe_extract cp
-LEFT JOIN glam_sample_counts sc
-  ON
-    sc.os = cp.os
+  LEFT JOIN
+    glam_sample_counts sc
+    ON sc.os = cp.os
    AND sc.app_build_id = cp.build_id
    AND sc.app_version = cp.version
    AND sc.metric = cp.metric
@ -85,7 +126,8 @@ LEFT JOIN glam_sample_counts sc
    AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type = '')
 )
 --remove duplicates
-SELECT channel,
+SELECT
+  channel,
  version,
  ping_type,
  os,
@ -100,6 +142,8 @@ SELECT channel,
  percentiles,
  non_norm_histogram,
  non_norm_percentiles,
-  CAST(total_sample as INT) total_sample
-FROM ranked_data
-WHERE rnk = 1
+  CAST(total_sample AS INT) total_sample
+FROM
+  ranked_data
+WHERE
+  rnk = 1
--- a/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
+++ b/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
@ -1,6 +1,6 @@
 {{ header }}
 SELECT
-  * EXCEPT (aggregates) REPLACE('percentiles' AS agg_type),
+  * EXCEPT (aggregates, non_norm_aggregates) REPLACE('percentiles' AS agg_type),
  ARRAY<STRUCT<key STRING, value FLOAT64>>[
    ('0.1', mozfun.glam.percentile(0.1, aggregates, metric_type)),
    ('1', mozfun.glam.percentile(1, aggregates, metric_type)),