[glam-etl] Use transpose logic for fenix extracts (#1011)

This commit is contained in:
Anthony Miyaguchi 2020-05-28 11:57:55 -07:00 коммит произвёл GitHub
Родитель 2e68006911
Коммит 1f7c218729
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 61 добавлений и 73 удалений

Просмотреть файл

@ -1,37 +1,23 @@
{{ header }}
-- TODO: Remove deduping when dupes are fixed.
WITH deduped AS (
SELECT
*,
ROW_NUMBER() OVER(
PARTITION BY
ping_type,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
client_agg_type,
agg_type
ORDER BY
total_users DESC
) AS rank
FROM
`{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
WHERE
channel IS NOT NULL
AND app_version IS NOT NULL
AND total_users >= 100
)
-- Flattens a histogram, given as an array of (key, value) structs, into a JSON
-- object string that maps each key to its value rounded to 4 decimal places,
-- e.g. [("0", 0.25), ("1", 0.33333)] -> '{"0":0.25,"1":0.3333}'.
-- A NULL histogram yields NULL and a NULL bucket value is emitted as JSON null,
-- rather than raising a JavaScript TypeError that would fail the whole query.
CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
RETURNS STRING
LANGUAGE js
AS
'''
  // NULL in, NULL out: nothing to flatten.
  if (histogram === null || histogram === undefined) {
    return null;
  }
  const flattened = {};
  // forEach rather than map: the loop is run only for its side effect.
  histogram.forEach(function(bucket) {
    const value = bucket.value;
    // toFixed() returns a string; parseFloat() turns it back into a number so
    // the JSON carries numeric values without trailing zeros.
    flattened[bucket.key] =
      (value === null || value === undefined) ? null : parseFloat(value.toFixed(4));
  });
  return JSON.stringify(flattened);
''';
SELECT
channel,
app_version as version,
COALESCE(ping_type, "*") as ping_type,
COALESCE(os, "*") AS os,
COALESCE(app_build_id, "*") AS build_id,
ping_type,
os,
app_build_id as build_id,
metric,
metric_type,
-- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -39,10 +25,18 @@ SELECT
-- length.
SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
client_agg_type,
agg_type,
total_users,
TO_JSON_STRING(aggregates) AS data
MAX(total_users) as total_users,
MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) as histogram,
MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) as percentiles,
FROM
deduped
WHERE
rank = 1
`{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
GROUP BY
channel,
app_version,
ping_type,
os,
app_build_id,
metric,
metric_type,
key,
client_agg_type

Просмотреть файл

@ -8,15 +8,15 @@ src_project=${SRC_PROJECT:-glam-fenix-dev}
# TODO: glam-dev-bespoke-nonprod-dataops-mozgcp-net
dst_project=${DST_PROJECT:-$src_project}
dataset=${DATASET:-glam_etl_dev}
product=${PRODUCT:-fenix} # TODO: set default to org_mozilla_fenix
product=${PRODUCT:-org_mozilla_fenix}
bucket="gs://${dst_project}"
gsutil ls "$bucket" > /dev/null
bq extract --destination_format CSV --noprint_header \
"${src_project}:${dataset}.${product}_extract_probe_counts_v1" \
"${src_project}:${dataset}.${product}__extract_probe_counts_v1" \
"$bucket/glam-extract-${product}-*.csv"
bq extract --destination_format CSV --noprint_header \
"${src_project}:${dataset}.${product}_extract_user_counts_v1" \
"${src_project}:${dataset}.${product}__extract_user_counts_v1" \
"$bucket/glam-extract-${product}-counts.csv"

Просмотреть файл

@ -1,36 +1,22 @@
-- query for org_mozilla_fenix__extract_probe_counts_v1;
-- TODO: Remove deduping when dupes are fixed.
WITH deduped AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY
ping_type,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
client_agg_type,
agg_type
ORDER BY
total_users DESC
) AS rank
FROM
`glam_etl.org_mozilla_fenix__view_probe_counts_v1`
WHERE
channel IS NOT NULL
AND app_version IS NOT NULL
AND total_users >= 100
)
-- Flattens a histogram, given as an array of (key, value) structs, into a JSON
-- object string that maps each key to its value rounded to 4 decimal places,
-- e.g. [("0", 0.25), ("1", 0.33333)] -> '{"0":0.25,"1":0.3333}'.
-- A NULL histogram yields NULL and a NULL bucket value is emitted as JSON null,
-- rather than raising a JavaScript TypeError that would fail the whole query.
CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
RETURNS STRING
LANGUAGE js
AS
'''
  // NULL in, NULL out: nothing to flatten.
  if (histogram === null || histogram === undefined) {
    return null;
  }
  const flattened = {};
  // forEach rather than map: the loop is run only for its side effect.
  histogram.forEach(function(bucket) {
    const value = bucket.value;
    // toFixed() returns a string; parseFloat() turns it back into a number so
    // the JSON carries numeric values without trailing zeros.
    flattened[bucket.key] =
      (value === null || value === undefined) ? null : parseFloat(value.toFixed(4));
  });
  return JSON.stringify(flattened);
''';
SELECT
channel,
app_version AS version,
COALESCE(ping_type, "*") AS ping_type,
COALESCE(os, "*") AS os,
COALESCE(app_build_id, "*") AS build_id,
ping_type,
os,
app_build_id AS build_id,
metric,
metric_type,
-- BigQuery has some null unicode characters which Postgresql doesn't like,
@ -38,10 +24,18 @@ SELECT
-- length.
SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
client_agg_type,
agg_type,
total_users,
TO_JSON_STRING(aggregates) AS data
MAX(total_users) AS total_users,
MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) AS histogram,
MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) AS percentiles,
FROM
deduped
WHERE
rank = 1
`glam_etl.org_mozilla_fenix__view_probe_counts_v1`
GROUP BY
channel,
app_version,
ping_type,
os,
app_build_id,
metric,
metric_type,
key,
client_agg_type