Patch summary (text before the first diff header is ignored by patch tools):

  * extract_probe_counts_v1.sql (template) and the generated
    org_mozilla_fenix query: replace the ROW_NUMBER()-based "deduped" CTE
    with a GROUP BY over channel, app_version, ping_type, os, app_build_id,
    metric, metric_type, key and client_agg_type. total_users becomes
    MAX(total_users) per group, and the single agg_type/data pair is pivoted
    into two JSON string columns, "histogram" and "percentiles".
  * New JavaScript temp UDF udf_js_flatten: folds an array of {key, value}
    records into one JSON object string, rounding each value to 4 decimals.
    NOTE(review): the parameter type is written here as
    ARRAY<STRUCT<key STRING, value FLOAT64>>, inferred from the body's use of
    r.key and r.value.toFixed(); confirm against the schema of
    view_probe_counts_v1.aggregates. r.value must be non-NULL or toFixed()
    will throw.
  * The COALESCE(..., "*") defaults on ping_type, os and app_build_id and the
    filter (channel IS NOT NULL, app_version IS NOT NULL, total_users >= 100)
    are removed by this patch. NOTE(review): confirm the equivalent
    filtering/defaulting now happens upstream or in the consumer.
  * script/glam/export_csv: PRODUCT now defaults to org_mozilla_fenix and the
    extracted table names use the double-underscore "<product>__extract_*"
    form.

diff --git a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
index 6f5e729d94..8341896423 100644
--- a/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
+++ b/bigquery_etl/glam/templates/extract_probe_counts_v1.sql
@@ -1,37 +1,23 @@
 {{ header }}
--- TODO: Remove deduping when dupes are fixed.
-WITH deduped AS (
-  SELECT
-    *,
-    ROW_NUMBER() OVER(
-        PARTITION BY
-            ping_type,
-            os,
-            app_version,
-            app_build_id,
-            channel,
-            metric,
-            metric_type,
-            key,
-            client_agg_type,
-            agg_type
-        ORDER BY
-            total_users DESC
-    ) AS rank
-  FROM
-    `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
-  WHERE
-    channel IS NOT NULL
-    AND app_version IS NOT NULL
-    AND total_users >= 100
-)
+
+CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
+RETURNS STRING
+LANGUAGE js
+AS
+  '''
+    let obj = {};
+    histogram.map(function(r) {
+        obj[r.key] = parseFloat(r.value.toFixed(4));
+    });
+    return JSON.stringify(obj);
+''';
 
 SELECT
   channel,
   app_version as version,
-  COALESCE(ping_type, "*") as ping_type,
-  COALESCE(os, "*") AS os,
-  COALESCE(app_build_id, "*") AS build_id,
+  ping_type,
+  os,
+  app_build_id as build_id,
   metric,
   metric_type,
   -- BigQuery has some null unicode characters which Postgresql doesn't like,
@@ -39,10 +25,18 @@ SELECT
   -- length.
   SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
   client_agg_type,
-  agg_type,
-  total_users,
-  TO_JSON_STRING(aggregates) AS data
+  MAX(total_users) as total_users,
+  MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) as histogram,
+  MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) as percentiles,
 FROM
-  deduped
-WHERE
-  rank = 1
+  `{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
+GROUP BY
+  channel,
+  app_version,
+  ping_type,
+  os,
+  app_build_id,
+  metric,
+  metric_type,
+  key,
+  client_agg_type
diff --git a/script/glam/export_csv b/script/glam/export_csv
index b20f87b4de..c13e57f1aa 100755
--- a/script/glam/export_csv
+++ b/script/glam/export_csv
@@ -8,15 +8,15 @@ src_project=${SRC_PROJECT:-glam-fenix-dev}
 # TODO: glam-dev-bespoke-nonprod-dataops-mozgcp-net
 dst_project=${DST_PROJECT:-$src_project}
 dataset=${DATASET:-glam_etl_dev}
-product=${PRODUCT:-fenix}  # TODO: set default to org_mozilla_fenix
+product=${PRODUCT:-org_mozilla_fenix}
 bucket="gs://${dst_project}"
 
 gsutil ls "$bucket" > /dev/null
 
 bq extract --destination_format CSV --noprint_header \
-    "${src_project}:${dataset}.${product}_extract_probe_counts_v1" \
+    "${src_project}:${dataset}.${product}__extract_probe_counts_v1" \
     "$bucket/glam-extract-${product}-*.csv"
 
 bq extract --destination_format CSV --noprint_header \
-    "${src_project}:${dataset}.${product}_extract_user_counts_v1" \
+    "${src_project}:${dataset}.${product}__extract_user_counts_v1" \
     "$bucket/glam-extract-${product}-counts.csv"
diff --git a/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql b/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql
index 54c50c44b2..71a836adcb 100644
--- a/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql
+++ b/sql/glam_etl/org_mozilla_fenix__extract_probe_counts_v1/query.sql
@@ -1,36 +1,22 @@
 -- query for org_mozilla_fenix__extract_probe_counts_v1;
--- TODO: Remove deduping when dupes are fixed.
-WITH deduped AS (
-  SELECT
-    *,
-    ROW_NUMBER() OVER (
-      PARTITION BY
-        ping_type,
-        os,
-        app_version,
-        app_build_id,
-        channel,
-        metric,
-        metric_type,
-        key,
-        client_agg_type,
-        agg_type
-      ORDER BY
-        total_users DESC
-    ) AS rank
-  FROM
-    `glam_etl.org_mozilla_fenix__view_probe_counts_v1`
-  WHERE
-    channel IS NOT NULL
-    AND app_version IS NOT NULL
-    AND total_users >= 100
-)
+CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
+RETURNS STRING
+LANGUAGE js
+AS
+  '''
+    let obj = {};
+    histogram.map(function(r) {
+        obj[r.key] = parseFloat(r.value.toFixed(4));
+    });
+    return JSON.stringify(obj);
+''';
+
 SELECT
   channel,
   app_version AS version,
-  COALESCE(ping_type, "*") AS ping_type,
-  COALESCE(os, "*") AS os,
-  COALESCE(app_build_id, "*") AS build_id,
+  ping_type,
+  os,
+  app_build_id AS build_id,
   metric,
   metric_type,
   -- BigQuery has some null unicode characters which Postgresql doesn't like,
@@ -38,10 +24,18 @@ SELECT
   -- length.
   SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
   client_agg_type,
-  agg_type,
-  total_users,
-  TO_JSON_STRING(aggregates) AS data
+  MAX(total_users) AS total_users,
+  MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) AS histogram,
+  MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) AS percentiles,
 FROM
-  deduped
-WHERE
-  rank = 1
+  `glam_etl.org_mozilla_fenix__view_probe_counts_v1`
+GROUP BY
+  channel,
+  app_version,
+  ping_type,
+  os,
+  app_build_id,
+  metric,
+  metric_type,
+  key,
+  client_agg_type