From 99d982af0d0c76422a63a6af13ed389aa940f52e Mon Sep 17 00:00:00 2001
From: Eduardo Filho <edugomfilho@gmail.com>
Date: Fri, 7 Jun 2024 13:53:12 -0400
Subject: [PATCH] fix(glam): add missing fields for percentiles and probe
 counts with non-norm data (#5769)

---
 .../templates/extract_probe_counts_v1.sql     | 134 ++++++++++++------
 .../templates/histogram_percentiles_v1.sql    |   4 +-
 2 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
index 287f43ae53..f4249222f7 100644
--- a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
@@ -1,12 +1,16 @@
 {{ header }}
-
-WITH final_probe_extract AS ( SELECT
+WITH final_probe_extract AS (
+  SELECT
     channel,
-    app_version as version,
+    app_version AS version,
     ping_type,
     os,
-    app_build_id as build_id,
-    IF(app_build_id="*", NULL, SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)) as build_date,
+    app_build_id AS build_id,
+    IF(
+      app_build_id = "*",
+      NULL,
+      SAFE_CAST({{ build_date_udf }}(app_build_id) AS STRING)
+    ) AS build_date,
     metric,
     metric_type,
     -- BigQuery has some null unicode characters which Postgresql doesn't like,
@@ -14,17 +18,23 @@ WITH final_probe_extract AS ( SELECT
     -- length.
     SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
     client_agg_type,
-    MAX(total_users) as total_users,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) as histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)) as percentiles,
-    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_histogram,
-    MAX(IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)) as non_norm_percentiles
-FROM
+    MAX(total_users) AS total_users,
+    MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)
+    ) AS percentiles,
+    MAX(
+      IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_histogram,
+    MAX(
+      IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(non_norm_aggregates), NULL)
+    ) AS non_norm_percentiles
+  FROM
     `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
-WHERE
+  WHERE
     total_users > {{ total_users }}
     AND app_version NOT IN (2015815747, 2015819723, 2015828803, 2015829155, 3015815747)
-GROUP BY
+  GROUP BY
     channel,
     app_version,
     ping_type,
@@ -37,17 +47,25 @@ GROUP BY
 ),
 -- to populate total_sample for agg_type other than 'count'
 glam_sample_counts AS (
-  SELECT fsc1.os,
+  SELECT
+    fsc1.os,
     fsc1.app_version,
     fsc1.app_build_id,
     fsc1.metric,
     fsc1.key,
     fsc1.ping_type,
     fsc1.agg_type,
-    CASE WHEN fsc1.agg_type in ('max','min','sum','avg') AND fsc2.agg_type = 'count' THEN fsc2.total_sample ELSE fsc1.total_sample END as total_sample
-  FROM `{{ dataset }}.{{ prefix }}__view_sample_counts_v1`  fsc1
-  INNER JOIN `{{ dataset }}.{{ prefix }}__view_sample_counts_v1`  fsc2
-  ON fsc1.os = fsc2.os
+    CASE
+      WHEN fsc1.agg_type IN ('max', 'min', 'sum', 'avg')
+        AND fsc2.agg_type = 'count'
+        THEN fsc2.total_sample
+      ELSE fsc1.total_sample
+    END AS total_sample
+  FROM
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc1
+  INNER JOIN
+    `{{ dataset }}.{{ prefix }}__view_sample_counts_v1` fsc2
+    ON fsc1.os = fsc2.os
     AND fsc1.app_build_id = fsc2.app_build_id
     AND fsc1.app_version = fsc2.app_version
     AND fsc1.metric = fsc2.metric
@@ -55,37 +73,61 @@ glam_sample_counts AS (
     AND fsc1.ping_type = fsc2.ping_type
 ),
 -- get all the rcords from view_probe_counts and the matching from view_sample_counts
-ranked_data AS (SELECT
-  cp.channel,
-  cp.version,
-  cp.os,
-  cp.ping_type,
-  cp.build_id,
-  cp.build_date,
-  cp.metric,
-  cp.metric_key,
-  cp.client_agg_type,
-  cp.metric_type,
-  total_users,
-  histogram,
-  percentiles,
-  CASE WHEN client_agg_type = '' THEN 0 ELSE total_sample END AS total_sample,
-  ROW_NUMBER() OVER (PARTITION BY cp.version, cp.os, cp.build_id,cp.ping_type, cp.metric, cp.metric_key, cp.client_agg_type,cp.metric_type,histogram, percentiles
-                    ORDER BY total_users, total_sample DESC) as rnk
-FROM
-  final_probe_extract cp
-LEFT JOIN glam_sample_counts sc
-  ON
-    sc.os = cp.os
+ranked_data AS (
+  SELECT
+    cp.channel,
+    cp.version,
+    cp.os,
+    cp.ping_type,
+    cp.build_id,
+    cp.build_date,
+    cp.metric,
+    cp.metric_key,
+    cp.client_agg_type,
+    cp.metric_type,
+    total_users,
+    histogram,
+    percentiles,
+    non_norm_histogram,
+    non_norm_percentiles,
+    CASE
+      WHEN client_agg_type = ''
+        THEN 0
+      ELSE total_sample
+    END AS total_sample,
+    ROW_NUMBER() OVER (
+      PARTITION BY
+        cp.version,
+        cp.os,
+        cp.build_id,
+        cp.ping_type,
+        cp.metric,
+        cp.metric_key,
+        cp.client_agg_type,
+        cp.metric_type,
+        histogram,
+        percentiles,
+        non_norm_histogram,
+        non_norm_percentiles
+      ORDER BY
+        total_users,
+        total_sample DESC
+    ) AS rnk
+  FROM
+    final_probe_extract cp
+  LEFT JOIN
+    glam_sample_counts sc
+    ON sc.os = cp.os
     AND sc.app_build_id = cp.build_id
     AND sc.app_version = cp.version
     AND sc.metric = cp.metric
     AND sc.key = cp.metric_key
     AND total_sample IS NOT NULL
-    AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type='')
+    AND (sc.agg_type = cp.client_agg_type OR cp.client_agg_type = '')
 )
 --remove duplicates
-SELECT channel,
+SELECT
+  channel,
   version,
   ping_type,
   os,
@@ -100,6 +142,8 @@ SELECT channel,
   percentiles,
   non_norm_histogram,
   non_norm_percentiles,
-  CAST(total_sample as INT) total_sample
-FROM ranked_data
-WHERE rnk = 1
+  CAST(total_sample AS INT) total_sample
+FROM
+  ranked_data
+WHERE
+  rnk = 1
diff --git a/bigquery_etl/glam/templates/histogram_percentiles_v1.sql b/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
index d0bb4781fe..4a67f354e7 100644
--- a/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
+++ b/bigquery_etl/glam/templates/histogram_percentiles_v1.sql
@@ -1,6 +1,6 @@
 {{ header }}
 SELECT
-  * EXCEPT (aggregates) REPLACE('percentiles' AS agg_type),
+  * EXCEPT (aggregates, non_norm_aggregates) REPLACE('percentiles' AS agg_type),
   ARRAY<STRUCT<key STRING, value FLOAT64>>[
     ('0.1', mozfun.glam.percentile(0.1, aggregates, metric_type)),
     ('1', mozfun.glam.percentile(1, aggregates, metric_type)),
@@ -24,4 +24,4 @@ SELECT
     ('99.9', mozfun.glam.percentile(99.9, non_norm_aggregates, metric_type))
   ] AS non_norm_aggregates
 FROM
-  glam_etl.{{ prefix }}__histogram_probe_counts_v1
+  glam_etl.{{ prefix }} __histogram_probe_counts_v1