[glam-etl] Use transpose logic for fenix extracts (#1011)

2020-05-28 11:57:55 -07:00 · 2020-05-28 11:57:55 -07:00 · 1f7c218729
--- a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
@ -1,37 +1,23 @@
 {{ header }}
-- TODO: Remove deduping when dupes are fixed.
-WITH deduped AS (
-    SELECT
-        *,
-        ROW_NUMBER() OVER(
-            PARTITION BY
-                ping_type,
-                os,
-                app_version,
-                app_build_id,
-                channel,
-                metric,
-                metric_type,
-                key,
-                client_agg_type,
-                agg_type
-            ORDER BY
-                total_users DESC
-        ) AS rank
-    FROM
-        `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
-    WHERE
-        channel IS NOT NULL
-        AND app_version IS NOT NULL
-        AND total_users >= 100
-)
+
+-- Serializes an array of (key, value) structs into a JSON object string, e.g.
+-- [("0", 0.5), ("1", 0.25)] -> '{"0":0.5,"1":0.25}'. Values are rounded to 4
+-- decimal places. A NULL array yields NULL and a NULL value is kept as JSON
+-- null instead of raising a JavaScript TypeError. If a key repeats, the last
+-- entry wins.
+CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
+RETURNS STRING
+LANGUAGE js
+AS
+  '''
+    if (histogram === null || histogram === undefined) {
+        return null;
+    }
+    const obj = {};
+    histogram.forEach(function(r) {
+        // toFixed() returns a string; parseFloat() drops the trailing zeros.
+        obj[r.key] = r.value === null ? null : parseFloat(r.value.toFixed(4));
+    });
+    return JSON.stringify(obj);
+''';

 SELECT
    channel,
    app_version as version,
-    COALESCE(ping_type, "*") as ping_type,
-    COALESCE(os, "*") AS os,
-    COALESCE(app_build_id, "*") AS build_id,
+    ping_type,  -- ping_type, os and app_build_id are passed through as-is (NULLs included)
+    os,
+    app_build_id as build_id,
    metric,
    metric_type,
    -- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -39,10 +25,18 @@ SELECT
    -- length.
    SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
    client_agg_type,
-    agg_type,
-    total_users,
-    TO_JSON_STRING(aggregates) AS data
+    MAX(total_users) as total_users,  -- largest total_users among the rows merged into this group
+    MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) as histogram,  -- flattened JSON from the group's "histogram" rows; NULL if the group has none
+    MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) as percentiles,  -- flattened JSON from the group's "percentiles" rows; NULL if the group has none
 FROM
-    deduped
-WHERE
-    rank = 1
+    `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`  -- NOTE(review): no row filter is applied in this query; presumably the view handles NULL channel/app_version and low user counts - confirm
+GROUP BY  -- one output row per combination below; agg_type is left out so its rows pivot into the histogram/percentiles columns
+    channel,
+    app_version,
+    ping_type,
+    os,
+    app_build_id,
+    metric,
+    metric_type,
+    key,  -- grouped on the raw key, not on the cleaned/truncated metric_key
+    client_agg_type
--- a/script/glam/export_csv
+++ b/script/glam/export_csv
@ -8,15 +8,15 @@ src_project=${SRC_PROJECT:-glam-fenix-dev}
 # TODO: glam-dev-bespoke-nonprod-dataops-mozgcp-net
 dst_project=${DST_PROJECT:-$src_project}
 dataset=${DATASET:-glam_etl_dev}
-product=${PRODUCT:-fenix}   # TODO: set default to org_mozilla_fenix
+product=${PRODUCT:-org_mozilla_fenix}  # falls back to org_mozilla_fenix when PRODUCT is unset or empty

 bucket="gs://${dst_project}"
 gsutil ls "$bucket" > /dev/null

 bq extract --destination_format CSV --noprint_header \
-    "${src_project}:${dataset}.${product}_extract_probe_counts_v1" \
+    "${src_project}:${dataset}.${product}__extract_probe_counts_v1" \
    "$bucket/glam-extract-${product}-*.csv"

 bq extract --destination_format CSV --noprint_header \
-    "${src_project}:${dataset}.${product}_extract_user_counts_v1" \
+    "${src_project}:${dataset}.${product}__extract_user_counts_v1" \
    "$bucket/glam-extract-${product}-counts.csv"
--- a/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql
+++ b/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql
@ -1,36 +1,22 @@
 -- query for org_mozilla_fenix__extract_probe_counts_v1;
-- TODO: Remove deduping when dupes are fixed.
-WITH deduped AS (
-  SELECT
-    *,
-    ROW_NUMBER() OVER (
-      PARTITION BY
-        ping_type,
-        os,
-        app_version,
-        app_build_id,
-        channel,
-        metric,
-        metric_type,
-        key,
-        client_agg_type,
-        agg_type
-      ORDER BY
-        total_users DESC
-    ) AS rank
-  FROM
-    `glam_etl.org_mozilla_fenix__view_probe_counts_v1`
-  WHERE
-    channel IS NOT NULL
-    AND app_version IS NOT NULL
-    AND total_users >= 100
-)
+-- Serializes an array of (key, value) structs into a JSON object string, e.g.
+-- [("0", 0.5), ("1", 0.25)] -> '{"0":0.5,"1":0.25}'. Values are rounded to 4
+-- decimal places. A NULL array yields NULL and a NULL value is kept as JSON
+-- null instead of raising a JavaScript TypeError. If a key repeats, the last
+-- entry wins.
+CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
+RETURNS STRING
+LANGUAGE js
+AS
+  '''
+    if (histogram === null || histogram === undefined) {
+        return null;
+    }
+    const obj = {};
+    histogram.forEach(function(r) {
+        // toFixed() returns a string; parseFloat() drops the trailing zeros.
+        obj[r.key] = r.value === null ? null : parseFloat(r.value.toFixed(4));
+    });
+    return JSON.stringify(obj);
+''';
+
 SELECT
  channel,
  app_version AS version,
-  COALESCE(ping_type, "*") AS ping_type,
-  COALESCE(os, "*") AS os,
-  COALESCE(app_build_id, "*") AS build_id,
+  ping_type,  -- ping_type, os and app_build_id are passed through as-is (NULLs included)
+  os,
+  app_build_id AS build_id,
  metric,
  metric_type,
    -- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -38,10 +24,18 @@ SELECT
    -- length.
  SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
  client_agg_type,
-  agg_type,
-  total_users,
-  TO_JSON_STRING(aggregates) AS data
+  MAX(total_users) AS total_users,  -- largest total_users among the rows merged into this group
+  MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) AS histogram,  -- flattened JSON from the group's "histogram" rows; NULL if the group has none
+  MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) AS percentiles,  -- flattened JSON from the group's "percentiles" rows; NULL if the group has none
 FROM
-  deduped
-WHERE
-  rank = 1
+  `glam_etl.org_mozilla_fenix__view_probe_counts_v1`  -- NOTE(review): no row filter is applied in this query; presumably the view handles NULL channel/app_version and low user counts - confirm
+GROUP BY  -- one output row per combination below; agg_type is left out so its rows pivot into the histogram/percentiles columns
+  channel,
+  app_version,
+  ping_type,
+  os,
+  app_build_id,
+  metric,
+  metric_type,
+  key,  -- grouped on the raw key, not on the cleaned/truncated metric_key
+  client_agg_type