[glam-etl] Use transpose logic for fenix extracts (#1011)
This commit is contained in:
Родитель
2e68006911
Коммит
1f7c218729
|
@ -1,37 +1,23 @@
|
|||
{{ header }}
|
||||
-- TODO: Remove deduping when dupes are fixed.
|
||||
WITH deduped AS (
|
||||
SELECT
|
||||
*,
|
||||
ROW_NUMBER() OVER(
|
||||
PARTITION BY
|
||||
ping_type,
|
||||
os,
|
||||
app_version,
|
||||
app_build_id,
|
||||
channel,
|
||||
metric,
|
||||
metric_type,
|
||||
key,
|
||||
client_agg_type,
|
||||
agg_type
|
||||
ORDER BY
|
||||
total_users DESC
|
||||
) AS rank
|
||||
FROM
|
||||
`{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
|
||||
WHERE
|
||||
channel IS NOT NULL
|
||||
AND app_version IS NOT NULL
|
||||
AND total_users >= 100
|
||||
)
|
||||
|
||||
CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
|
||||
RETURNS STRING
|
||||
LANGUAGE js
|
||||
AS
|
||||
'''
|
||||
let obj = {};
|
||||
histogram.map(function(r) {
|
||||
obj[r.key] = parseFloat(r.value.toFixed(4));
|
||||
});
|
||||
return JSON.stringify(obj);
|
||||
''';
|
||||
|
||||
SELECT
|
||||
channel,
|
||||
app_version as version,
|
||||
COALESCE(ping_type, "*") as ping_type,
|
||||
COALESCE(os, "*") AS os,
|
||||
COALESCE(app_build_id, "*") AS build_id,
|
||||
ping_type,
|
||||
os,
|
||||
app_build_id as build_id,
|
||||
metric,
|
||||
metric_type,
|
||||
-- BigQuery has some null unicode characters which Postgresql doesn't like,
|
||||
|
@ -39,10 +25,18 @@ SELECT
|
|||
-- length.
|
||||
SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
|
||||
client_agg_type,
|
||||
agg_type,
|
||||
total_users,
|
||||
TO_JSON_STRING(aggregates) AS data
|
||||
MAX(total_users) as total_users,
|
||||
MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) as histogram,
|
||||
MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) as percentiles,
|
||||
FROM
|
||||
deduped
|
||||
WHERE
|
||||
rank = 1
|
||||
`{{ dataset }}.{{ prefix }}__view_probe_counts_v1`
|
||||
GROUP BY
|
||||
channel,
|
||||
app_version,
|
||||
ping_type,
|
||||
os,
|
||||
app_build_id,
|
||||
metric,
|
||||
metric_type,
|
||||
key,
|
||||
client_agg_type
|
||||
|
|
|
@ -8,15 +8,15 @@ src_project=${SRC_PROJECT:-glam-fenix-dev}
|
|||
# TODO: glam-dev-bespoke-nonprod-dataops-mozgcp-net
|
||||
dst_project=${DST_PROJECT:-$src_project}
|
||||
dataset=${DATASET:-glam_etl_dev}
|
||||
product=${PRODUCT:-fenix} # TODO: set default to org_mozilla_fenix
|
||||
product=${PRODUCT:-org_mozilla_fenix}
|
||||
|
||||
bucket="gs://${dst_project}"
|
||||
gsutil ls "$bucket" > /dev/null
|
||||
|
||||
bq extract --destination_format CSV --noprint_header \
|
||||
"${src_project}:${dataset}.${product}_extract_probe_counts_v1" \
|
||||
"${src_project}:${dataset}.${product}__extract_probe_counts_v1" \
|
||||
"$bucket/glam-extract-${product}-*.csv"
|
||||
|
||||
bq extract --destination_format CSV --noprint_header \
|
||||
"${src_project}:${dataset}.${product}_extract_user_counts_v1" \
|
||||
"${src_project}:${dataset}.${product}__extract_user_counts_v1" \
|
||||
"$bucket/glam-extract-${product}-counts.csv"
|
||||
|
|
|
@ -1,36 +1,22 @@
|
|||
-- query for org_mozilla_fenix__extract_probe_counts_v1;
|
||||
-- TODO: Remove deduping when dupes are fixed.
|
||||
WITH deduped AS (
|
||||
SELECT
|
||||
*,
|
||||
ROW_NUMBER() OVER (
|
||||
PARTITION BY
|
||||
ping_type,
|
||||
os,
|
||||
app_version,
|
||||
app_build_id,
|
||||
channel,
|
||||
metric,
|
||||
metric_type,
|
||||
key,
|
||||
client_agg_type,
|
||||
agg_type
|
||||
ORDER BY
|
||||
total_users DESC
|
||||
) AS rank
|
||||
FROM
|
||||
`glam_etl.org_mozilla_fenix__view_probe_counts_v1`
|
||||
WHERE
|
||||
channel IS NOT NULL
|
||||
AND app_version IS NOT NULL
|
||||
AND total_users >= 100
|
||||
)
|
||||
CREATE TEMP FUNCTION udf_js_flatten(histogram ARRAY<STRUCT<key STRING, value FLOAT64>>)
|
||||
RETURNS STRING
|
||||
LANGUAGE js
|
||||
AS
|
||||
'''
|
||||
let obj = {};
|
||||
histogram.map(function(r) {
|
||||
obj[r.key] = parseFloat(r.value.toFixed(4));
|
||||
});
|
||||
return JSON.stringify(obj);
|
||||
''';
|
||||
|
||||
SELECT
|
||||
channel,
|
||||
app_version AS version,
|
||||
COALESCE(ping_type, "*") AS ping_type,
|
||||
COALESCE(os, "*") AS os,
|
||||
COALESCE(app_build_id, "*") AS build_id,
|
||||
ping_type,
|
||||
os,
|
||||
app_build_id AS build_id,
|
||||
metric,
|
||||
metric_type,
|
||||
-- BigQuery has some null unicode characters which Postgresql doesn't like,
|
||||
|
@ -38,10 +24,18 @@ SELECT
|
|||
-- length.
|
||||
SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
|
||||
client_agg_type,
|
||||
agg_type,
|
||||
total_users,
|
||||
TO_JSON_STRING(aggregates) AS data
|
||||
MAX(total_users) AS total_users,
|
||||
MAX(IF(agg_type = "histogram", udf_js_flatten(aggregates), NULL)) AS histogram,
|
||||
MAX(IF(agg_type = "percentiles", udf_js_flatten(aggregates), NULL)) AS percentiles,
|
||||
FROM
|
||||
deduped
|
||||
WHERE
|
||||
rank = 1
|
||||
`glam_etl.org_mozilla_fenix__view_probe_counts_v1`
|
||||
GROUP BY
|
||||
channel,
|
||||
app_version,
|
||||
ping_type,
|
||||
os,
|
||||
app_build_id,
|
||||
metric,
|
||||
metric_type,
|
||||
key,
|
||||
client_agg_type
|
||||
|
|
Загрузка…
Ссылка в новой задаче