* firefox_desktop_to_glam

added sql files part 1

* update with the latest mozfun function
This commit is contained in:
Alekhya 2021-11-22 13:45:19 -05:00 committed by GitHub
Parent ca6b93a403
Commit 80af7b96df
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
35 changed files: 3275 additions and 0 deletions

Просмотреть файл

@ -173,6 +173,24 @@ def main():
"num_versions_to_keep": 3,
"total_users": 90000,
},
"firefox_desktop_glam_nightly": {
"build_date_udf": "mozfun.glam.build_hour_to_datetime",
"filter_version": True,
"num_versions_to_keep": 3,
"total_users": 10,
},
"firefox_desktop_glam_beta": {
"build_date_udf": "mozfun.glam.build_hour_to_datetime",
"filter_version": True,
"num_versions_to_keep": 3,
"total_users": 10,
},
"firefox_desktop_glam_release": {
"build_date_udf": "mozfun.glam.build_hour_to_datetime",
"filter_version": True,
"num_versions_to_keep": 3,
"total_users": 10,
},
}
validate(instance=config, schema=config_schema)

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view: restrict the firefox_desktop daily histogram aggregates
-- to the beta channel, normalize app_build_id to hourly resolution (via
-- mozfun.glam.build_seconds_to_hour — presumably a build-id truncation;
-- confirm against the mozfun docs), and collapse channel to the GLAM
-- wildcard "*".
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_beta__view_clients_daily_histogram_aggregates_v1
AS
SELECT
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_histogram_aggregates_v1
WHERE
  channel = 'beta'

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view: restrict the firefox_desktop daily scalar aggregates to
-- the beta channel, normalize app_build_id to hourly resolution, and
-- collapse channel to the GLAM wildcard "*".
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_beta__view_clients_daily_scalar_aggregates_v1
AS
SELECT
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_scalar_aggregates_v1
WHERE
  channel = 'beta'

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view: restrict the firefox_desktop daily histogram aggregates
-- to the nightly channel, normalize app_build_id to hourly resolution, and
-- collapse channel to the GLAM wildcard "*".
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_nightly__view_clients_daily_histogram_aggregates_v1
AS
SELECT
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_histogram_aggregates_v1
WHERE
  channel = 'nightly'

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view: restrict the firefox_desktop daily scalar aggregates to
-- the nightly channel, normalize app_build_id to hourly resolution, and
-- collapse channel to the GLAM wildcard "*".
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_nightly__view_clients_daily_scalar_aggregates_v1
AS
SELECT
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_scalar_aggregates_v1
WHERE
  channel = 'nightly'

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view for the release channel.
-- FIX(review): the original definition filtered on channel = 'release' but
-- reused the *_glam_nightly view name, which would silently overwrite the
-- nightly view on deploy. Renamed to firefox_desktop_glam_release__... to
-- match the "firefox_desktop_glam_release" entry registered in the GLAM
-- ETL config.
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_release__view_clients_daily_histogram_aggregates_v1
AS
WITH extracted AS (
  -- Only release-channel rows from the shared per-app view.
  SELECT
    *
  FROM
    `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_histogram_aggregates_v1
  WHERE
    channel = 'release'
)
SELECT
  -- Replace the raw build id with an hourly build id and widen channel to
  -- the GLAM wildcard "*" so all release rows aggregate together.
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  extracted

Просмотреть файл

@ -0,0 +1,17 @@
-- Logical-app view for the release channel.
-- FIX(review): the original definition filtered on channel = 'release' but
-- reused the *_glam_nightly view name, which would silently overwrite the
-- nightly scalar view on deploy. Renamed to
-- firefox_desktop_glam_release__... to match the
-- "firefox_desktop_glam_release" entry registered in the GLAM ETL config.
CREATE OR REPLACE VIEW
  `{{ project }}`.glam_etl.firefox_desktop_glam_release__view_clients_daily_scalar_aggregates_v1
AS
WITH extracted AS (
  -- Only release-channel rows from the shared per-app view.
  SELECT
    *
  FROM
    `{{ project }}`.glam_etl.firefox_desktop__view_clients_daily_scalar_aggregates_v1
  WHERE
    channel = 'release'
)
SELECT
  -- Replace the raw build id with an hourly build id and widen channel to
  -- the GLAM wildcard "*" so all release rows aggregate together.
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  extracted

Просмотреть файл

@ -0,0 +1,54 @@
#!/bin/bash
# Generate SQL for checking into the repository and for testing the workflow.
#
# Environment overrides:
#   PROJECT       - target GCP project (default: glam-fenix-dev)
#   SKIP_GENERATE - "true" to skip SQL generation
#   SKIP_DAILY    - "true" to skip running the daily-stage queries
#   GENERATE_ONLY - anything but "false" generates SQL + schemas, runs nothing
set -e
project=${PROJECT:-glam-fenix-dev}
skip_generate=${SKIP_GENERATE:-false}
skip_daily=${SKIP_DAILY:-false}
generate_only=${GENERATE_ONLY:-false}
# NOTE: there are three app_ids that we must look at for historical context. For
# the purpose of this script, it is sufficient to look only at what is currently
# "firefox desktop". We must keep at least one scalar/histogram table for
# each of the tables referenced in the view. We'll keep all pings for
# firefox_desktop, and only the metrics ping for the others.
app_ids=(
    "firefox_desktop"
)
logical_app_id="firefox_desktop_glam_nightly"
dir="$(dirname "$0")/.."
sql_dir="$dir/../../sql/$project/glam_etl"
if [[ $skip_generate == false ]]; then
    # Generate the per-app daily SQL in parallel, then wait for all jobs.
    for app_id in "${app_ids[@]}"; do
        PRODUCT=$app_id STAGE=daily "$dir/generate_glean_sql" &
    done
    wait
    # remove tables to reduce noise of checked-in queries
    for app_id in "${app_ids[@]}"; do
        if [[ $app_id == "firefox_desktop" ]]; then
            continue
        fi
        for path in "${sql_dir}/${app_id}__clients"*; do
            if [[ $path == "${sql_dir}/${app_id}__clients"*metrics* ]]; then
                continue
            fi
            # Quote $path so paths containing spaces/globs survive
            # word-splitting (ShellCheck SC2086); the original `rm -r $path`
            # was unsafe.
            rm -r "$path"
        done
    done
    PRODUCT=$logical_app_id STAGE=incremental "$dir/generate_glean_sql"
fi
if [[ $generate_only != false ]]; then
    bqetl glam glean update-schemas
    exit
fi
if [[ $skip_daily == false ]]; then
    for app_id in "${app_ids[@]}"; do
        PRODUCT=$app_id STAGE=daily "$dir/run_glam_sql"
    done
fi
PRODUCT=$logical_app_id STAGE=incremental "$dir/run_glam_sql"
bqetl glam glean update-schemas

Просмотреть файл

@ -0,0 +1,148 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_histogram_aggregates --source-table firefox_desktop_stable.metrics_v1
-- Per-client daily histogram aggregates for the firefox_desktop "metrics"
-- ping: for one @submission_date, each client's histogram buckets are
-- summed per metric. Machine-generated; regenerate with the command above
-- rather than editing by hand.
-- extracted: pull the GLAM dimensions (ping_type, os, app_version,
-- app_build_id, channel) out of client_info for one day of pings.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"metrics" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.metrics_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- histograms: collect every distribution metric of interest as
-- (metric, metric_type, key/value bucket array) so the set of metrics is
-- data-driven from here on.
histograms AS (
SELECT
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
"fog_ipc_buffer_sizes",
"memory_distribution",
metrics.memory_distribution.fog_ipc_buffer_sizes.values
),
(
"fog_ipc_flush_durations",
"timing_distribution",
metrics.timing_distribution.fog_ipc_flush_durations.values
),
(
"glean_database_size",
"memory_distribution",
metrics.memory_distribution.glean_database_size.values
),
(
"glean_upload_discarded_exceeding_pings_size",
"memory_distribution",
metrics.memory_distribution.glean_upload_discarded_exceeding_pings_size.values
),
(
"glean_upload_pending_pings_directory_size",
"memory_distribution",
metrics.memory_distribution.glean_upload_pending_pings_directory_size.values
),
(
"paint_build_displaylist_time",
"timing_distribution",
metrics.timing_distribution.paint_build_displaylist_time.values
)
] AS metadata
FROM
extracted
),
-- flattened_histograms: one row per (client, metric), dropping metrics the
-- ping did not report (NULL bucket arrays).
flattened_histograms AS (
SELECT
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metadata.*
FROM
histograms,
UNNEST(metadata) AS metadata
WHERE
value IS NOT NULL
),
-- ARRAY_CONCAT_AGG may fail if the array of records exceeds 20 MB when
-- serialized and shuffled. This may exhibit itself in a pathological case where
-- the a single client sends *many* pings in a single day. However, this case
-- has not been observed. If this does occur, each histogram should be unnested
-- aggregated. This will force more shuffles and is inefficient. This may be
-- mitigated by removing all of the empty entries which are sent to keep bucket
-- ranges contiguous.
--
-- Tested via org_mozilla_fenix.metrics_v1 for 2020-02-23, unnest vs concat
-- Slot consumed: 00:50:15 vs 00:06:45, Shuffled: 27.5GB vs 6.0 GB
aggregated AS (
SELECT
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
mozfun.map.sum(ARRAY_CONCAT_AGG(value)) AS value
FROM
flattened_histograms
GROUP BY
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type
)
-- Re-pack the per-metric sums into a single histogram_aggregates array per
-- client/dimension row; agg_type is fixed to 'summed_histogram' and key is
-- empty (unkeyed histograms).
SELECT
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_AGG(
STRUCT<
metric STRING,
metric_type STRING,
key STRING,
agg_type STRING,
value ARRAY<STRUCT<key STRING, value INT64>>
>(metric, metric_type, '', 'summed_histogram', value)
) AS histogram_aggregates
FROM
aggregated
GROUP BY
sample_id,
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
Просмотреть файл

@ -0,0 +1,239 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates --source-table firefox_desktop_stable.baseline_v1
-- Per-client daily scalar aggregates for the firefox_desktop "baseline"
-- ping. Unlabeled scalars and labeled counters are aggregated separately
-- and unioned at the end. Machine-generated; regenerate with the command
-- above rather than editing by hand.
-- extracted: pull the GLAM dimensions out of client_info for one day.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"baseline" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.baseline_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- unlabeled_metrics: one row per client/dimension combination; each
-- unlabeled scalar contributes avg/count/max/min/sum entries to the
-- scalar_aggregates array ('count' is NULL when the metric was never set).
unlabeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(
'glean_baseline_duration',
'timespan',
'',
'avg',
avg(CAST(metrics.timespan.glean_baseline_duration.value AS INT64))
),
(
'glean_baseline_duration',
'timespan',
'',
'count',
IF(MIN(metrics.timespan.glean_baseline_duration.value) IS NULL, NULL, COUNT(*))
),
(
'glean_baseline_duration',
'timespan',
'',
'max',
max(CAST(metrics.timespan.glean_baseline_duration.value AS INT64))
),
(
'glean_baseline_duration',
'timespan',
'',
'min',
min(CAST(metrics.timespan.glean_baseline_duration.value AS INT64))
),
(
'glean_baseline_duration',
'timespan',
'',
'sum',
sum(CAST(metrics.timespan.glean_baseline_duration.value AS INT64))
),
(
'glean_validation_metrics_ping_count',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_validation_metrics_ping_count AS INT64))
),
(
'glean_validation_metrics_ping_count',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_validation_metrics_ping_count) IS NULL, NULL, COUNT(*))
),
(
'glean_validation_metrics_ping_count',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_validation_metrics_ping_count AS INT64))
),
(
'glean_validation_metrics_ping_count',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_validation_metrics_ping_count AS INT64))
),
(
'glean_validation_metrics_ping_count',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_validation_metrics_ping_count AS INT64))
)
] AS scalar_aggregates
FROM
extracted
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
),
-- grouped_labeled_metrics: collect each labeled_counter metric as a
-- (name, type, key/value array) struct for later flattening.
grouped_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<name STRING, type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
'glean_error_invalid_label',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_label
),
(
'glean_error_invalid_overflow',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_overflow
),
(
'glean_error_invalid_state',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_state
),
(
'glean_error_invalid_value',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_value
),
(
'glean_validation_pings_submitted',
'labeled_counter',
metrics.labeled_counter.glean_validation_pings_submitted
)
] AS metrics
FROM
extracted
),
-- flattened_labeled_metrics: explode to one row per metric and label key.
flattened_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metrics.name AS metric,
metrics.type AS metric_type,
value.key AS key,
value.value AS value
FROM
grouped_labeled_metrics
CROSS JOIN
UNNEST(metrics) AS metrics,
UNNEST(metrics.value) AS value
),
-- aggregated_labeled_metrics: aggregate each client's values per
-- metric/key over the day.
aggregated_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
MAX(value) AS max,
MIN(value) AS min,
AVG(value) AS avg,
SUM(value) AS sum,
IF(MIN(value) IS NULL, NULL, COUNT(*)) AS count
FROM
flattened_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key
),
-- labeled_metrics: pivot the per-key aggregates back into a
-- scalar_aggregates array with one entry per aggregation type, matching
-- the shape produced by unlabeled_metrics.
labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_CONCAT_AGG(
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(metric, metric_type, key, 'max', max),
(metric, metric_type, key, 'min', min),
(metric, metric_type, key, 'avg', avg),
(metric, metric_type, key, 'sum', sum),
(metric, metric_type, key, 'count', count)
]
) AS scalar_aggregates
FROM
aggregated_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
)
-- Union both halves; schemas are identical (dimensions + scalar_aggregates).
SELECT
*
FROM
unlabeled_metrics
UNION ALL
SELECT
*
FROM
labeled_metrics
Просмотреть файл

@ -0,0 +1,164 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates --source-table firefox_desktop_stable.deletion_request_v1
-- Per-client daily scalar aggregates for the firefox_desktop
-- "deletion-request" ping. Machine-generated; regenerate with the command
-- above rather than editing by hand.
-- extracted: pull the GLAM dimensions out of client_info for one day.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"deletion-request" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.deletion_request_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- unlabeled_metrics: this ping defines no unlabeled scalar metrics; the
-- empty array keeps the output shape union-compatible with labeled_metrics.
unlabeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
] AS scalar_aggregates
FROM
extracted
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
),
-- grouped_labeled_metrics: collect each labeled_counter metric as a
-- (name, type, key/value array) struct for later flattening.
grouped_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<name STRING, type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
'glean_error_invalid_label',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_label
),
(
'glean_error_invalid_overflow',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_overflow
),
(
'glean_error_invalid_state',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_state
),
(
'glean_error_invalid_value',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_value
)
] AS metrics
FROM
extracted
),
-- flattened_labeled_metrics: explode to one row per metric and label key.
flattened_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metrics.name AS metric,
metrics.type AS metric_type,
value.key AS key,
value.value AS value
FROM
grouped_labeled_metrics
CROSS JOIN
UNNEST(metrics) AS metrics,
UNNEST(metrics.value) AS value
),
-- aggregated_labeled_metrics: aggregate each client's values per
-- metric/key over the day.
aggregated_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
MAX(value) AS max,
MIN(value) AS min,
AVG(value) AS avg,
SUM(value) AS sum,
IF(MIN(value) IS NULL, NULL, COUNT(*)) AS count
FROM
flattened_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key
),
-- labeled_metrics: pivot the per-key aggregates back into a
-- scalar_aggregates array with one entry per aggregation type.
labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_CONCAT_AGG(
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(metric, metric_type, key, 'max', max),
(metric, metric_type, key, 'min', min),
(metric, metric_type, key, 'avg', avg),
(metric, metric_type, key, 'sum', sum),
(metric, metric_type, key, 'count', count)
]
) AS scalar_aggregates
FROM
aggregated_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
)
-- Union both halves; schemas are identical (dimensions + scalar_aggregates).
SELECT
*
FROM
unlabeled_metrics
UNION ALL
SELECT
*
FROM
labeled_metrics
Просмотреть файл

@ -0,0 +1,164 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates --source-table firefox_desktop_stable.events_v1
-- Per-client daily scalar aggregates for the firefox_desktop "events"
-- ping. Machine-generated; regenerate with the command above rather than
-- editing by hand.
-- extracted: pull the GLAM dimensions out of client_info for one day.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"events" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.events_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- unlabeled_metrics: this ping defines no unlabeled scalar metrics; the
-- empty array keeps the output shape union-compatible with labeled_metrics.
unlabeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
] AS scalar_aggregates
FROM
extracted
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
),
-- grouped_labeled_metrics: collect each labeled_counter metric as a
-- (name, type, key/value array) struct for later flattening.
grouped_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<name STRING, type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
'glean_error_invalid_label',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_label
),
(
'glean_error_invalid_overflow',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_overflow
),
(
'glean_error_invalid_state',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_state
),
(
'glean_error_invalid_value',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_value
)
] AS metrics
FROM
extracted
),
-- flattened_labeled_metrics: explode to one row per metric and label key.
flattened_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metrics.name AS metric,
metrics.type AS metric_type,
value.key AS key,
value.value AS value
FROM
grouped_labeled_metrics
CROSS JOIN
UNNEST(metrics) AS metrics,
UNNEST(metrics.value) AS value
),
-- aggregated_labeled_metrics: aggregate each client's values per
-- metric/key over the day.
aggregated_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
MAX(value) AS max,
MIN(value) AS min,
AVG(value) AS avg,
SUM(value) AS sum,
IF(MIN(value) IS NULL, NULL, COUNT(*)) AS count
FROM
flattened_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key
),
-- labeled_metrics: pivot the per-key aggregates back into a
-- scalar_aggregates array with one entry per aggregation type.
labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_CONCAT_AGG(
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(metric, metric_type, key, 'max', max),
(metric, metric_type, key, 'min', min),
(metric, metric_type, key, 'avg', avg),
(metric, metric_type, key, 'sum', sum),
(metric, metric_type, key, 'count', count)
]
) AS scalar_aggregates
FROM
aggregated_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
)
-- Union both halves; schemas are identical (dimensions + scalar_aggregates).
SELECT
*
FROM
unlabeled_metrics
UNION ALL
SELECT
*
FROM
labeled_metrics
Просмотреть файл

@ -0,0 +1,178 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates --source-table firefox_desktop_stable.fog_validation_v1
-- Per-client daily scalar aggregates for the firefox_desktop
-- "fog-validation" ping. Machine-generated; regenerate with the command
-- above rather than editing by hand.
-- extracted: pull the GLAM dimensions out of client_info for one day.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"fog-validation" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.fog_validation_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- unlabeled_metrics: booleans are aggregated as daily true/false counts
-- (CAST to INT64 turns TRUE into 1, so SUM counts occurrences).
unlabeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(
'fog_validation_profile_disk_is_ssd',
'boolean',
'',
'false',
SUM(CAST(NOT metrics.boolean.fog_validation_profile_disk_is_ssd AS INT64))
),
(
'fog_validation_profile_disk_is_ssd',
'boolean',
'',
'true',
SUM(CAST(metrics.boolean.fog_validation_profile_disk_is_ssd AS INT64))
)
] AS scalar_aggregates
FROM
extracted
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
),
-- grouped_labeled_metrics: collect each labeled_counter metric as a
-- (name, type, key/value array) struct for later flattening.
grouped_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<name STRING, type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
'glean_error_invalid_label',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_label
),
(
'glean_error_invalid_overflow',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_overflow
),
(
'glean_error_invalid_state',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_state
),
(
'glean_error_invalid_value',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_value
)
] AS metrics
FROM
extracted
),
-- flattened_labeled_metrics: explode to one row per metric and label key.
flattened_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metrics.name AS metric,
metrics.type AS metric_type,
value.key AS key,
value.value AS value
FROM
grouped_labeled_metrics
CROSS JOIN
UNNEST(metrics) AS metrics,
UNNEST(metrics.value) AS value
),
-- aggregated_labeled_metrics: aggregate each client's values per
-- metric/key over the day.
aggregated_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
MAX(value) AS max,
MIN(value) AS min,
AVG(value) AS avg,
SUM(value) AS sum,
IF(MIN(value) IS NULL, NULL, COUNT(*)) AS count
FROM
flattened_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key
),
-- labeled_metrics: pivot the per-key aggregates back into a
-- scalar_aggregates array with one entry per aggregation type.
labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_CONCAT_AGG(
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(metric, metric_type, key, 'max', max),
(metric, metric_type, key, 'min', min),
(metric, metric_type, key, 'avg', avg),
(metric, metric_type, key, 'sum', sum),
(metric, metric_type, key, 'count', count)
]
) AS scalar_aggregates
FROM
aggregated_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
)
-- Union both halves; schemas are identical (dimensions + scalar_aggregates).
SELECT
*
FROM
unlabeled_metrics
UNION ALL
SELECT
*
FROM
labeled_metrics
Просмотреть файл

@ -0,0 +1,591 @@
-- Query generated by: python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates --source-table firefox_desktop_stable.metrics_v1
-- Per-client daily scalar aggregates for the firefox_desktop "metrics"
-- ping. Counters/timespans contribute avg/count/max/min/sum; booleans
-- contribute daily true/false counts. Machine-generated; regenerate with
-- the command above rather than editing by hand.
-- extracted: pull the GLAM dimensions out of client_info for one day.
WITH extracted AS (
SELECT
*,
DATE(submission_timestamp) AS submission_date,
client_info.client_id,
"metrics" AS ping_type,
COALESCE(
SAFE_CAST(SPLIT(client_info.app_display_version, '.')[OFFSET(0)] AS INT64),
0
) AS app_version,
client_info.os AS os,
client_info.app_build AS app_build_id,
client_info.app_channel AS channel
FROM
`moz-fx-data-shared-prod.firefox_desktop_stable.metrics_v1`
WHERE
DATE(submission_timestamp) = @submission_date
AND client_info.client_id IS NOT NULL
),
-- unlabeled_metrics: one row per client/dimension combination; the 'count'
-- entry is NULL when the metric was never set during the day.
unlabeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(
'browser_ui_proton_enabled',
'boolean',
'',
'false',
SUM(CAST(NOT metrics.boolean.browser_ui_proton_enabled AS INT64))
),
(
'browser_ui_proton_enabled',
'boolean',
'',
'true',
SUM(CAST(metrics.boolean.browser_ui_proton_enabled AS INT64))
),
(
'fog_failed_idle_registration',
'boolean',
'',
'false',
SUM(CAST(NOT metrics.boolean.fog_failed_idle_registration AS INT64))
),
(
'fog_failed_idle_registration',
'boolean',
'',
'true',
SUM(CAST(metrics.boolean.fog_failed_idle_registration AS INT64))
),
(
'fog_initialization',
'timespan',
'',
'avg',
avg(CAST(metrics.timespan.fog_initialization.value AS INT64))
),
(
'fog_initialization',
'timespan',
'',
'count',
IF(MIN(metrics.timespan.fog_initialization.value) IS NULL, NULL, COUNT(*))
),
(
'fog_initialization',
'timespan',
'',
'max',
max(CAST(metrics.timespan.fog_initialization.value AS INT64))
),
(
'fog_initialization',
'timespan',
'',
'min',
min(CAST(metrics.timespan.fog_initialization.value AS INT64))
),
(
'fog_initialization',
'timespan',
'',
'sum',
sum(CAST(metrics.timespan.fog_initialization.value AS INT64))
),
(
'fog_ipc_replay_failures',
'counter',
'',
'avg',
avg(CAST(metrics.counter.fog_ipc_replay_failures AS INT64))
),
(
'fog_ipc_replay_failures',
'counter',
'',
'count',
IF(MIN(metrics.counter.fog_ipc_replay_failures) IS NULL, NULL, COUNT(*))
),
(
'fog_ipc_replay_failures',
'counter',
'',
'max',
max(CAST(metrics.counter.fog_ipc_replay_failures AS INT64))
),
(
'fog_ipc_replay_failures',
'counter',
'',
'min',
min(CAST(metrics.counter.fog_ipc_replay_failures AS INT64))
),
(
'fog_ipc_replay_failures',
'counter',
'',
'sum',
sum(CAST(metrics.counter.fog_ipc_replay_failures AS INT64))
),
(
'glean_core_migration_successful',
'boolean',
'',
'false',
SUM(CAST(NOT metrics.boolean.glean_core_migration_successful AS INT64))
),
(
'glean_core_migration_successful',
'boolean',
'',
'true',
SUM(CAST(metrics.boolean.glean_core_migration_successful AS INT64))
),
('glean_error_io', 'counter', '', 'avg', avg(CAST(metrics.counter.glean_error_io AS INT64))),
(
'glean_error_io',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_error_io) IS NULL, NULL, COUNT(*))
),
('glean_error_io', 'counter', '', 'max', max(CAST(metrics.counter.glean_error_io AS INT64))),
('glean_error_io', 'counter', '', 'min', min(CAST(metrics.counter.glean_error_io AS INT64))),
('glean_error_io', 'counter', '', 'sum', sum(CAST(metrics.counter.glean_error_io AS INT64))),
(
'glean_error_preinit_tasks_overflow',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_error_preinit_tasks_overflow AS INT64))
),
(
'glean_error_preinit_tasks_overflow',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_error_preinit_tasks_overflow) IS NULL, NULL, COUNT(*))
),
(
'glean_error_preinit_tasks_overflow',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_error_preinit_tasks_overflow AS INT64))
),
(
'glean_error_preinit_tasks_overflow',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_error_preinit_tasks_overflow AS INT64))
),
(
'glean_error_preinit_tasks_overflow',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_error_preinit_tasks_overflow AS INT64))
),
(
'glean_error_preinit_tasks_timeout',
'boolean',
'',
'false',
SUM(CAST(NOT metrics.boolean.glean_error_preinit_tasks_timeout AS INT64))
),
(
'glean_error_preinit_tasks_timeout',
'boolean',
'',
'true',
SUM(CAST(metrics.boolean.glean_error_preinit_tasks_timeout AS INT64))
),
(
'glean_time_invalid_timezone_offset',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_time_invalid_timezone_offset AS INT64))
),
(
'glean_time_invalid_timezone_offset',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_time_invalid_timezone_offset) IS NULL, NULL, COUNT(*))
),
(
'glean_time_invalid_timezone_offset',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_time_invalid_timezone_offset AS INT64))
),
(
'glean_time_invalid_timezone_offset',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_time_invalid_timezone_offset AS INT64))
),
(
'glean_time_invalid_timezone_offset',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_time_invalid_timezone_offset AS INT64))
),
(
'glean_upload_deleted_pings_after_quota_hit',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_upload_deleted_pings_after_quota_hit AS INT64))
),
(
'glean_upload_deleted_pings_after_quota_hit',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_upload_deleted_pings_after_quota_hit) IS NULL, NULL, COUNT(*))
),
(
'glean_upload_deleted_pings_after_quota_hit',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_upload_deleted_pings_after_quota_hit AS INT64))
),
(
'glean_upload_deleted_pings_after_quota_hit',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_upload_deleted_pings_after_quota_hit AS INT64))
),
(
'glean_upload_deleted_pings_after_quota_hit',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_upload_deleted_pings_after_quota_hit AS INT64))
),
(
'glean_upload_pending_pings',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_upload_pending_pings AS INT64))
),
(
'glean_upload_pending_pings',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_upload_pending_pings) IS NULL, NULL, COUNT(*))
),
(
'glean_upload_pending_pings',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_upload_pending_pings AS INT64))
),
(
'glean_upload_pending_pings',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_upload_pending_pings AS INT64))
),
(
'glean_upload_pending_pings',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_upload_pending_pings AS INT64))
),
(
'glean_validation_app_forceclosed_count',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_validation_app_forceclosed_count AS INT64))
),
(
'glean_validation_app_forceclosed_count',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_validation_app_forceclosed_count) IS NULL, NULL, COUNT(*))
),
(
'glean_validation_app_forceclosed_count',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_validation_app_forceclosed_count AS INT64))
),
(
'glean_validation_app_forceclosed_count',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_validation_app_forceclosed_count AS INT64))
),
(
'glean_validation_app_forceclosed_count',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_validation_app_forceclosed_count AS INT64))
),
(
'glean_validation_baseline_ping_count',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_validation_baseline_ping_count AS INT64))
),
(
'glean_validation_baseline_ping_count',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_validation_baseline_ping_count) IS NULL, NULL, COUNT(*))
),
(
'glean_validation_baseline_ping_count',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_validation_baseline_ping_count AS INT64))
),
(
'glean_validation_baseline_ping_count',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_validation_baseline_ping_count AS INT64))
),
(
'glean_validation_baseline_ping_count',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_validation_baseline_ping_count AS INT64))
),
(
'glean_validation_foreground_count',
'counter',
'',
'avg',
avg(CAST(metrics.counter.glean_validation_foreground_count AS INT64))
),
(
'glean_validation_foreground_count',
'counter',
'',
'count',
IF(MIN(metrics.counter.glean_validation_foreground_count) IS NULL, NULL, COUNT(*))
),
(
'glean_validation_foreground_count',
'counter',
'',
'max',
max(CAST(metrics.counter.glean_validation_foreground_count AS INT64))
),
(
'glean_validation_foreground_count',
'counter',
'',
'min',
min(CAST(metrics.counter.glean_validation_foreground_count AS INT64))
),
(
'glean_validation_foreground_count',
'counter',
'',
'sum',
sum(CAST(metrics.counter.glean_validation_foreground_count AS INT64))
),
(
'power_total_cpu_time_ms',
'counter',
'',
'avg',
avg(CAST(metrics.counter.power_total_cpu_time_ms AS INT64))
),
(
'power_total_cpu_time_ms',
'counter',
'',
'count',
IF(MIN(metrics.counter.power_total_cpu_time_ms) IS NULL, NULL, COUNT(*))
),
(
'power_total_cpu_time_ms',
'counter',
'',
'max',
max(CAST(metrics.counter.power_total_cpu_time_ms AS INT64))
),
(
'power_total_cpu_time_ms',
'counter',
'',
'min',
min(CAST(metrics.counter.power_total_cpu_time_ms AS INT64))
),
(
'power_total_cpu_time_ms',
'counter',
'',
'sum',
sum(CAST(metrics.counter.power_total_cpu_time_ms AS INT64))
)
] AS scalar_aggregates
FROM
extracted
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
),
-- grouped_labeled_metrics: collect each labeled_counter metric as a
-- (name, type, key/value array) struct for later flattening.
grouped_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY<STRUCT<name STRING, type STRING, value ARRAY<STRUCT<key STRING, value INT64>>>>[
(
'glean_error_invalid_label',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_label
),
(
'glean_error_invalid_overflow',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_overflow
),
(
'glean_error_invalid_state',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_state
),
(
'glean_error_invalid_value',
'labeled_counter',
metrics.labeled_counter.glean_error_invalid_value
),
(
'glean_upload_ping_upload_failure',
'labeled_counter',
metrics.labeled_counter.glean_upload_ping_upload_failure
),
(
'glean_validation_pings_submitted',
'labeled_counter',
metrics.labeled_counter.glean_validation_pings_submitted
)
] AS metrics
FROM
extracted
),
-- flattened_labeled_metrics: explode to one row per metric and label key.
flattened_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metrics.name AS metric,
metrics.type AS metric_type,
value.key AS key,
value.value AS value
FROM
grouped_labeled_metrics
CROSS JOIN
UNNEST(metrics) AS metrics,
UNNEST(metrics.value) AS value
),
-- aggregated_labeled_metrics: aggregate each client's values per
-- metric/key over the day.
aggregated_labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key,
MAX(value) AS max,
MIN(value) AS min,
AVG(value) AS avg,
SUM(value) AS sum,
IF(MIN(value) IS NULL, NULL, COUNT(*)) AS count
FROM
flattened_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
metric,
metric_type,
key
),
-- labeled_metrics: pivot the per-key aggregates back into a
-- scalar_aggregates array with one entry per aggregation type.
labeled_metrics AS (
SELECT
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel,
ARRAY_CONCAT_AGG(
ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>[
(metric, metric_type, key, 'max', max),
(metric, metric_type, key, 'min', min),
(metric, metric_type, key, 'avg', avg),
(metric, metric_type, key, 'sum', sum),
(metric, metric_type, key, 'count', count)
]
) AS scalar_aggregates
FROM
aggregated_labeled_metrics
GROUP BY
client_id,
ping_type,
submission_date,
os,
app_version,
app_build_id,
channel
)
-- Union both halves; schemas are identical (dimensions + scalar_aggregates).
SELECT
*
FROM
unlabeled_metrics
UNION ALL
SELECT
*
FROM
labeled_metrics
Просмотреть файл

@ -0,0 +1,10 @@
-- view for firefox_desktop__view_clients_daily_histogram_aggregates_v1;
-- View for histogram aggregates that handles time-partitioning
CREATE OR REPLACE VIEW
  `glam-fenix-dev.glam_etl.firefox_desktop__view_clients_daily_histogram_aggregates_v1`
AS
SELECT
  * EXCEPT (submission_date),
  -- _PARTITIONTIME is a pseudo-column that only exists on direct scans of the
  -- partitioned tables (here via the wildcard suffix), so it is surfaced as a
  -- regular submission_date column for downstream consumers.
  DATE(_PARTITIONTIME) AS submission_date
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop__clients_daily_histogram_aggregates*`

Просмотреть файл

@ -0,0 +1,10 @@
-- view for firefox_desktop__view_clients_daily_scalar_aggregates_v1;
-- View to union daily scalar aggregates with date partitioning
CREATE OR REPLACE VIEW
  `glam-fenix-dev.glam_etl.firefox_desktop__view_clients_daily_scalar_aggregates_v1`
AS
SELECT
  * EXCEPT (submission_date),
  -- _PARTITIONTIME is a pseudo-column that only exists on direct scans of the
  -- partitioned tables (here via the wildcard suffix), so it is surfaced as a
  -- regular submission_date column for downstream consumers.
  DATE(_PARTITIONTIME) AS submission_date
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop__clients_daily_scalar_aggregates*`

Просмотреть файл

@ -0,0 +1,26 @@
-- init for firefox_desktop_glam_nightly__clients_histogram_aggregates_v1;
-- Cumulative per-client histogram aggregates, updated incrementally by the
-- matching query. One row per client and attribute combination; each
-- histogram_aggregates entry carries the summed bucket counts for one probe.
CREATE TABLE IF NOT EXISTS
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1`(
    sample_id INT64,
    client_id STRING,
    ping_type STRING,
    os STRING,
    app_version INT64,
    app_build_id STRING,
    channel STRING,
    histogram_aggregates ARRAY<
      STRUCT<
        metric STRING,
        metric_type STRING,
        key STRING,
        agg_type STRING,
        -- map of histogram bucket -> count
        value ARRAY<STRUCT<key STRING, value INT64>>
      >
    >
  )
-- One partition per sample_id (assumed to lie in [0, 100)); the incremental
-- query processes a @min_sample_id..@max_sample_id range per run, so this
-- partitioning limits each run's scan to the affected partitions.
PARTITION BY
  RANGE_BUCKET(sample_id, GENERATE_ARRAY(0, 100, 1))
CLUSTER BY
  app_version,
  channel,
  client_id

Просмотреть файл

@ -0,0 +1,212 @@
-- query for firefox_desktop_glam_nightly__clients_histogram_aggregates_v1;
-- Merge a client's previously accumulated histogram aggregates with today's
-- rows: bucket counts are summed per (metric, metric_type, key, agg_type).
CREATE TEMP FUNCTION udf_merged_user_data(aggs ANY TYPE)
RETURNS ARRAY<
  STRUCT<
    metric STRING,
    metric_type STRING,
    key STRING,
    agg_type STRING,
    value ARRAY<STRUCT<key STRING, value INT64>>
  >
> AS (
  (
    WITH unnested AS (
      SELECT
        *
      FROM
        UNNEST(aggs)
    ),
    aggregated_data AS (
      SELECT AS STRUCT
        metric,
        metric_type,
        key,
        agg_type,
        -- mozfun.map.sum adds values of entries that share a map key, i.e. it
        -- sums per-bucket counts across all input aggregates.
        mozfun.map.sum(ARRAY_CONCAT_AGG(value)) AS value
      FROM
        unnested
      GROUP BY
        metric,
        metric_type,
        key,
        agg_type
    )
    SELECT
      ARRAY_AGG((metric, metric_type, key, agg_type, value))
    FROM
      aggregated_data
  )
);
-- Sum bucket counts per bucket key while discarding anomalous buckets.
-- Prevents overflows by only keeping buckets whose value is at most 2^40,
-- which leaves headroom for 2^24 entries. The threshold was chosen somewhat
-- arbitrarily; typically the max histogram value is on the order of ~20 bits.
CREATE TEMP FUNCTION filter_values(aggs ARRAY<STRUCT<key STRING, value INT64>>)
RETURNS ARRAY<STRUCT<key STRING, value INT64>> AS (
  ARRAY(
    SELECT AS STRUCT
      bucket.key,
      SUM(bucket.value) AS value
    FROM
      UNNEST(aggs) AS bucket
    WHERE
      bucket.value <= POW(2, 40)
    GROUP BY
      bucket.key
  )
);
WITH extracted_accumulated AS (
  -- Previously accumulated aggregates for the sample range handled by this run.
  SELECT
    *
  FROM
    glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1
  WHERE
    sample_id >= @min_sample_id
    AND sample_id <= @max_sample_id
),
filtered_accumulated AS (
  -- Drop accumulated rows from implausibly new builds, builds older than a
  -- year, or versions outside the retention window of the latest 3 versions.
  SELECT
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    histogram_aggregates
  FROM
    extracted_accumulated
  LEFT JOIN
    glam_etl.firefox_desktop_glam_nightly__latest_versions_v1
  USING
    (channel)
  WHERE
    -- allow for builds to be slighly ahead of the current submission date, to
    -- account for a reasonable amount of clock skew
    mozfun.glam.build_hour_to_datetime(app_build_id) < DATE_ADD(@submission_date, INTERVAL 3 day)
    -- only keep builds from the last year
    AND mozfun.glam.build_hour_to_datetime(app_build_id) > DATE_SUB(
      @submission_date,
      INTERVAL 365 day
    )
    AND app_version > (latest_version - 3)
),
-- unnest the daily data
extracted_daily AS (
  SELECT
    * EXCEPT (app_version, histogram_aggregates),
    CAST(app_version AS INT64) AS app_version,
    unnested_histogram_aggregates AS histogram_aggregates
  FROM
    glam_etl.firefox_desktop_glam_nightly__view_clients_daily_histogram_aggregates_v1,
    UNNEST(histogram_aggregates) unnested_histogram_aggregates
  WHERE
    submission_date = @submission_date
    AND value IS NOT NULL
    AND ARRAY_LENGTH(value) > 0
),
filtered_daily AS (
  -- Apply the same build-date and version filters as filtered_accumulated so
  -- old and new rows cover the same population.
  SELECT
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    histogram_aggregates.*
  FROM
    extracted_daily
  LEFT JOIN
    glam_etl.firefox_desktop_glam_nightly__latest_versions_v1
  USING
    (channel)
  WHERE
    -- allow for builds to be slighly ahead of the current submission date, to
    -- account for a reasonable amount of clock skew
    mozfun.glam.build_hour_to_datetime(app_build_id) < DATE_ADD(@submission_date, INTERVAL 3 day)
    -- only keep builds from the last year
    AND mozfun.glam.build_hour_to_datetime(app_build_id) > DATE_SUB(
      @submission_date,
      INTERVAL 365 day
    )
    AND app_version > (latest_version - 3)
),
-- re-aggregate based on the latest version
aggregated_daily AS (
  SELECT
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    -- filter_values drops overflow-prone buckets before the map-wise sum
    mozfun.map.sum(ARRAY_CONCAT_AGG(filter_values(value))) AS value
  FROM
    filtered_daily
  GROUP BY
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type
),
-- note: this seems costly, if it's just going to be unnested again
transformed_daily AS (
  SELECT
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    ARRAY_AGG(
      STRUCT<
        metric STRING,
        metric_type STRING,
        key STRING,
        agg_type STRING,
        aggregates ARRAY<STRUCT<key STRING, value INT64>>
      >(metric, metric_type, key, agg_type, value)
    ) AS histogram_aggregates
  FROM
    aggregated_daily
  GROUP BY
    sample_id,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel
)
SELECT
  COALESCE(accumulated.sample_id, daily.sample_id) AS sample_id,
  COALESCE(accumulated.client_id, daily.client_id) AS client_id,
  COALESCE(accumulated.ping_type, daily.ping_type) AS ping_type,
  COALESCE(accumulated.os, daily.os) AS os,
  COALESCE(accumulated.app_version, daily.app_version) AS app_version,
  COALESCE(accumulated.app_build_id, daily.app_build_id) AS app_build_id,
  COALESCE(accumulated.channel, daily.channel) AS channel,
  udf_merged_user_data(
    ARRAY_CONCAT(
      -- COALESCE to empty arrays: the FULL OUTER JOIN leaves one side NULL for
      -- clients present on only one side, and ARRAY_CONCAT returns NULL when
      -- any argument is NULL, which would silently discard the client's data.
      -- This mirrors the handling in the scalar aggregates query.
      COALESCE(accumulated.histogram_aggregates, []),
      COALESCE(daily.histogram_aggregates, [])
    )
  ) AS histogram_aggregates
FROM
  filtered_accumulated AS accumulated
FULL OUTER JOIN
  transformed_daily AS daily
USING
  (sample_id, client_id, ping_type, os, app_version, app_build_id, channel)

Просмотреть файл

@ -0,0 +1,19 @@
-- init for firefox_desktop_glam_nightly__clients_scalar_aggregates_v1;
-- Cumulative per-client scalar aggregates, updated incrementally by the
-- matching query. One row per client and attribute combination.
CREATE TABLE IF NOT EXISTS
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1`(
    client_id STRING,
    ping_type STRING,
    os STRING,
    app_version INT64,
    app_build_id STRING,
    channel STRING,
    -- one entry per (metric, key, agg_type); value holds the running
    -- max/min/sum/count for that aggregation type
    scalar_aggregates ARRAY<
      STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
    >
  )
-- NOTE(review): the partition range caps app_version at 100; Firefox desktop
-- versions at or above 100 will all land in the out-of-range partition and
-- lose partition pruning — confirm whether this range should be widened.
PARTITION BY
  RANGE_BUCKET(app_version, GENERATE_ARRAY(0, 100, 1))
CLUSTER BY
  app_version,
  channel,
  client_id

Просмотреть файл

@ -0,0 +1,258 @@
-- query for firefox_desktop_glam_nightly__clients_scalar_aggregates_v1;
-- Merge a client's previously accumulated scalar aggregates with today's
-- rows. max/min keep their extreme, count/sum/false/true are added, and avg
-- is recomputed from the merged sum and count so averages stay exact.
CREATE TEMP FUNCTION udf_merged_user_data(
  aggs ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>
)
RETURNS ARRAY<
  STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
> AS (
  (
    WITH unnested AS (
      SELECT
        *
      FROM
        UNNEST(aggs)
      -- stored "avg" rows are dropped; they are rederived below from sum/count
      WHERE
        agg_type != "avg"
    ),
    aggregated AS (
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        --format:off
        CASE agg_type
          WHEN 'max' THEN max(value)
          WHEN 'min' THEN min(value)
          WHEN 'count' THEN sum(value)
          WHEN 'sum' THEN sum(value)
          WHEN 'false' THEN sum(value)
          WHEN 'true' THEN sum(value)
        END AS value
        --format:on
      FROM
        unnested
      WHERE
        value IS NOT NULL
      GROUP BY
        metric,
        metric_type,
        key,
        agg_type
    ),
    scalar_count_and_sum AS (
      -- Pivot each probe's count and sum rows side by side so the average can
      -- be computed as total sum / total count.
      SELECT
        metric,
        metric_type,
        key,
        'avg' AS agg_type,
        --format:off
        CASE WHEN agg_type = 'count' THEN value ELSE 0 END AS count,
        CASE WHEN agg_type = 'sum' THEN value ELSE 0 END AS sum
        --format:on
      FROM
        aggregated
      WHERE
        agg_type IN ('sum', 'count')
    ),
    scalar_averages AS (
      SELECT
        * EXCEPT (count, sum),
        SUM(sum) / SUM(count) AS agg_value
      FROM
        scalar_count_and_sum
      GROUP BY
        metric,
        metric_type,
        key,
        agg_type
    ),
    merged_data AS (
      -- UNION ALL matches columns by position: agg_value lines up with value.
      SELECT
        *
      FROM
        aggregated
      UNION ALL
      SELECT
        *
      FROM
        scalar_averages
    )
    SELECT
      ARRAY_AGG((metric, metric_type, key, agg_type, value))
    FROM
      merged_data
  )
);
WITH filtered_date_channel AS (
  -- Daily client rows for the submission date being processed.
  SELECT
    *
  FROM
    glam_etl.firefox_desktop_glam_nightly__view_clients_daily_scalar_aggregates_v1
  WHERE
    submission_date = @submission_date
),
filtered_aggregates AS (
  -- Flatten the per-client aggregate arrays into one row per probe value.
  SELECT
    submission_date,
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    value
  FROM
    filtered_date_channel
  CROSS JOIN
    UNNEST(scalar_aggregates)
  WHERE
    value IS NOT NULL
),
version_filtered_new AS (
  -- Keep only rows from plausible builds: not too far in the future, not
  -- older than a year, and within the latest 3 versions for the channel.
  SELECT
    submission_date,
    scalar_aggs.client_id,
    scalar_aggs.ping_type,
    scalar_aggs.os,
    scalar_aggs.app_version,
    scalar_aggs.app_build_id,
    scalar_aggs.channel,
    metric,
    metric_type,
    key,
    agg_type,
    value
  FROM
    filtered_aggregates AS scalar_aggs
  LEFT JOIN
    glam_etl.firefox_desktop_glam_nightly__latest_versions_v1
  USING
    (channel)
  WHERE
    -- allow for builds to be slighly ahead of the current submission date, to
    -- account for a reasonable amount of clock skew
    mozfun.glam.build_hour_to_datetime(app_build_id) < DATE_ADD(@submission_date, INTERVAL 3 day)
    -- only keep builds from the last year
    AND mozfun.glam.build_hour_to_datetime(app_build_id) > DATE_SUB(
      @submission_date,
      INTERVAL 365 day
    )
    AND app_version > (latest_version - 3)
),
scalar_aggregates_new AS (
  -- Collapse the day's rows to one value per probe and aggregation type.
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    --format:off
    CASE agg_type
      WHEN 'max' THEN max(value)
      WHEN 'min' THEN min(value)
      WHEN 'count' THEN sum(value)
      WHEN 'sum' THEN sum(value)
      WHEN 'false' THEN sum(value)
      WHEN 'true' THEN sum(value)
    END AS value
    --format:on
  FROM
    version_filtered_new
  WHERE
    -- avoid overflows from very large numbers that are typically anomalies
    value <= POW(2, 40)
  GROUP BY
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type
),
filtered_new AS (
  -- Re-pack today's aggregates into one array per client/attribute row.
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    ARRAY_AGG((metric, metric_type, key, agg_type, value)) AS scalar_aggregates
  FROM
    scalar_aggregates_new
  GROUP BY
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel
),
filtered_old AS (
  -- Previously accumulated rows, restricted by the same build/version filters.
  SELECT
    scalar_aggs.client_id,
    scalar_aggs.ping_type,
    scalar_aggs.os,
    scalar_aggs.app_version,
    scalar_aggs.app_build_id,
    scalar_aggs.channel,
    scalar_aggregates
  FROM
    glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1 AS scalar_aggs
  LEFT JOIN
    glam_etl.firefox_desktop_glam_nightly__latest_versions_v1
  USING
    (channel)
  WHERE
    -- allow for builds to be slighly ahead of the current submission date, to
    -- account for a reasonable amount of clock skew
    mozfun.glam.build_hour_to_datetime(app_build_id) < DATE_ADD(@submission_date, INTERVAL 3 day)
    -- only keep builds from the last year
    AND mozfun.glam.build_hour_to_datetime(app_build_id) > DATE_SUB(
      @submission_date,
      INTERVAL 365 day
    )
    AND app_version > (latest_version - 3)
),
joined_new_old AS (
  SELECT
    COALESCE(old_data.client_id, new_data.client_id) AS client_id,
    COALESCE(old_data.ping_type, new_data.ping_type) AS ping_type,
    COALESCE(old_data.os, new_data.os) AS os,
    COALESCE(old_data.app_version, new_data.app_version) AS app_version,
    COALESCE(old_data.app_build_id, new_data.app_build_id) AS app_build_id,
    COALESCE(old_data.channel, new_data.channel) AS channel,
    -- COALESCE to [] so ARRAY_CONCAT below never receives a NULL side from
    -- the FULL OUTER JOIN (ARRAY_CONCAT returns NULL on any NULL argument)
    COALESCE(old_data.scalar_aggregates, []) AS old_aggs,
    COALESCE(new_data.scalar_aggregates, []) AS new_aggs
  FROM
    filtered_new AS new_data
  FULL OUTER JOIN
    filtered_old AS old_data
  USING
    (client_id, ping_type, os, app_version, app_build_id, channel)
)
SELECT
  client_id,
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  udf_merged_user_data(ARRAY_CONCAT(old_aggs, new_aggs)) AS scalar_aggregates
FROM
  joined_new_old

Просмотреть файл

@ -0,0 +1,38 @@
-- query for firefox_desktop_glam_nightly__extract_probe_counts_v1;
-- Export per-probe histogram and percentile counts for loading into the GLAM
-- frontend database.
SELECT
  channel,
  app_version AS version,
  ping_type,
  os,
  app_build_id AS build_id,
  -- "*" denotes the all-builds rollup, which has no single build date
  IF(
    app_build_id = "*",
    NULL,
    SAFE_CAST(mozfun.glam.build_hour_to_datetime(app_build_id) AS STRING)
  ) AS build_date,
  metric,
  metric_type,
  -- BigQuery has some null unicode characters which Postgresql doesn't like,
  -- so we remove those here. Also limit string length to 200 to match column
  -- length.
  SUBSTR(REPLACE(key, r"\x00", ""), 0, 200) AS metric_key,
  client_agg_type,
  MAX(total_users) AS total_users,
  MAX(IF(agg_type = "histogram", mozfun.glam.histogram_cast_json(aggregates), NULL)) AS histogram,
  MAX(
    IF(agg_type = "percentiles", mozfun.glam.histogram_cast_json(aggregates), NULL)
  ) AS percentiles
FROM
  `glam_etl.firefox_desktop_glam_nightly__view_probe_counts_v1`
WHERE
  -- drop rows below the minimum population threshold
  total_users > 10
GROUP BY
  channel,
  app_version,
  ping_type,
  os,
  app_build_id,
  metric,
  metric_type,
  key,
  client_agg_type

Просмотреть файл

@ -0,0 +1,17 @@
-- query for firefox_desktop_glam_nightly__extract_sample_counts_v1;
-- Export per-probe sample counts for loading into the GLAM frontend database.
SELECT
  channel,
  app_version,
  metric,
  key,
  -- NULL attribute values represent the "*" (all values) rollup downstream
  COALESCE(ping_type, "*") AS ping_type,
  COALESCE(app_build_id, "*") AS app_build_id,
  -- "*" denotes the all-builds rollup, which has no single build date
  IF(
    app_build_id = "*",
    NULL,
    SAFE_CAST(mozfun.glam.build_hour_to_datetime(app_build_id) AS STRING)
  ) AS build_date,
  COALESCE(os, "*") AS os,
  total_sample
FROM
  `glam_etl.firefox_desktop_glam_nightly__view_sample_counts_v1`

Просмотреть файл

@ -0,0 +1,33 @@
-- query for firefox_desktop_glam_nightly__extract_user_counts_v1;
-- Export user counts per attribute combination for the GLAM frontend,
-- keeping the single highest count per combination.
WITH deduped AS (
  SELECT
    *,
    -- rank rows so only the largest total_users per combination survives
    ROW_NUMBER() OVER (
      PARTITION BY
        channel,
        app_version,
        ping_type,
        app_build_id,
        os
      ORDER BY
        total_users DESC
    ) AS rank
  FROM
    `glam_etl.firefox_desktop_glam_nightly__view_user_counts_v1`
)
SELECT
  channel,
  app_version,
  -- NULL attribute values represent the "*" (all values) rollup downstream
  COALESCE(ping_type, "*") AS ping_type,
  COALESCE(app_build_id, "*") AS app_build_id,
  -- "*" denotes the all-builds rollup, which has no single build date
  IF(
    app_build_id = "*",
    NULL,
    SAFE_CAST(mozfun.glam.build_hour_to_datetime(app_build_id) AS STRING)
  ) AS build_date,
  COALESCE(os, "*") AS os,
  total_users
FROM
  deduped
WHERE
  rank = 1

Просмотреть файл

@ -0,0 +1,254 @@
-- query for firefox_desktop_glam_nightly__histogram_bucket_counts_v1;
WITH
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
-- with a combination of m attributes will generate a new table with n*m rows.
-- The glob ("*") symbol can be understood as selecting all of values belonging
-- to that group.
static_combos AS (
  SELECT
    combos.*
  FROM
    UNNEST(
      ARRAY<STRUCT<ping_type STRING, os STRING, app_build_id STRING>>[
        (NULL, NULL, NULL),
        (NULL, NULL, "*"),
        (NULL, "*", NULL),
        ("*", NULL, NULL),
        (NULL, "*", "*"),
        ("*", NULL, "*"),
        ("*", "*", NULL),
        ("*", "*", "*")
      ]
    ) AS combos
),
all_combos AS (
  -- NULL combo entries keep the row's own attribute; "*" replaces it with the
  -- rollup marker.
  SELECT
    table.* EXCEPT (ping_type, os, app_build_id),
    COALESCE(combo.ping_type, table.ping_type) AS ping_type,
    COALESCE(combo.os, table.os) AS os,
    COALESCE(combo.app_build_id, table.app_build_id) AS app_build_id
  FROM
    glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1 table
  CROSS JOIN
    static_combos combo
),
normalized_histograms AS (
  -- Normalize each client's histogram so every client contributes equally
  -- (total weight 1.0) regardless of how many samples they submitted.
  SELECT
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    ARRAY(
      SELECT AS STRUCT
        metric,
        metric_type,
        key,
        agg_type,
        mozfun.glam.histogram_normalized_sum(value, 1.0) AS aggregates
      FROM
        UNNEST(histogram_aggregates)
    ) AS histogram_aggregates
  FROM
    all_combos
),
unnested AS (
  -- One row per (attributes, probe, histogram bucket).
  SELECT
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    histogram_aggregates.metric AS metric,
    histogram_aggregates.metric_type AS metric_type,
    histogram_aggregates.key AS key,
    histogram_aggregates.agg_type AS agg_type,
    aggregates.key AS bucket,
    aggregates.value
  FROM
    normalized_histograms,
    UNNEST(histogram_aggregates) AS histogram_aggregates,
    UNNEST(aggregates) AS aggregates
),
-- Find information that can be used to construct the bucket range. Most of the
-- distributions follow a bucketing rule of 8*log2(n). This doesn't apply to the
-- custom distributions e.g. GeckoView, which needs to incorporate information
-- from the probe info service.
-- See: https://mozilla.github.io/glean/book/user/metrics/custom_distribution.html
distribution_metadata AS (
  SELECT
    *
  FROM
    UNNEST(
      [
        STRUCT(
          "custom_distribution" AS metric_type,
          "geckoview_document_site_origins" AS metric,
          0 AS range_min,
          100 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "geckoview_per_document_site_origins" AS metric,
          0 AS range_min,
          100 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_checkerboard_peak_pixel_count" AS metric,
          1 AS range_min,
          66355200 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_checkerboard_severity" AS metric,
          1 AS range_min,
          1073741824 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_content_frame_time_from_paint" AS metric,
          1 AS range_min,
          5000 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_content_frame_time_from_vsync" AS metric,
          8 AS range_min,
          792 AS range_max,
          100 AS bucket_count,
          "linear" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_content_frame_time_with_svg" AS metric,
          1 AS range_min,
          5000 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_content_frame_time_without_resource_upload" AS metric,
          1 AS range_min,
          5000 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "gfx_content_frame_time_without_upload" AS metric,
          1 AS range_min,
          5000 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "js_baseline_compile_percentage" AS metric,
          0 AS range_min,
          100 AS range_max,
          20 AS bucket_count,
          "linear" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "js_delazification_percentage" AS metric,
          0 AS range_min,
          100 AS range_max,
          20 AS bucket_count,
          "linear" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "js_execution_percentage" AS metric,
          0 AS range_min,
          100 AS range_max,
          20 AS bucket_count,
          "linear" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "js_xdr_encode_percentage" AS metric,
          0 AS range_min,
          100 AS range_max,
          20 AS bucket_count,
          "linear" AS histogram_type
        ),
        STRUCT(
          "custom_distribution" AS metric_type,
          "performance_clone_deserialize_items" AS metric,
          1 AS range_min,
          2147483646 AS range_max,
          50 AS bucket_count,
          "exponential" AS histogram_type
        )
      ]
    )
  UNION ALL
  -- For non-custom distributions the range maximum is derived from the data:
  -- the largest bucket observed for the metric.
  SELECT
    metric_type,
    metric,
    NULL AS range_min,
    MAX(SAFE_CAST(bucket AS INT64)) AS range_max,
    NULL AS bucket_count,
    NULL AS histogram_type
  FROM
    unnested
  WHERE
    metric_type <> "custom_distribution"
  GROUP BY
    metric_type,
    metric
),
records AS (
  -- Sum the normalized per-client weights into a single count per bucket.
  SELECT
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    STRUCT<key STRING, value FLOAT64>(CAST(bucket AS STRING), 1.0 * SUM(value)) AS record
  FROM
    unnested
  GROUP BY
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    bucket
)
SELECT
  * EXCEPT (metric_type, histogram_type),
  -- Suffix `custom_distribution` with bucketing type
  IF(
    histogram_type IS NOT NULL,
    CONCAT(metric_type, "_", histogram_type),
    metric_type
  ) AS metric_type
FROM
  records
LEFT OUTER JOIN
  distribution_metadata
USING
  (metric_type, metric)

Просмотреть файл

@ -0,0 +1,14 @@
-- query for firefox_desktop_glam_nightly__histogram_percentiles_v1;
-- Derive percentile records from aggregated histograms by reading a fixed
-- set of percentiles off each histogram; the output rows mirror the probe
-- counts rows with agg_type rewritten to 'percentiles'.
SELECT
  * EXCEPT (aggregates) REPLACE('percentiles' AS agg_type),
  ARRAY<STRUCT<key STRING, value FLOAT64>>[
    ('5', mozfun.glam.percentile(5, aggregates, metric_type)),
    ('25', mozfun.glam.percentile(25, aggregates, metric_type)),
    ('50', mozfun.glam.percentile(50, aggregates, metric_type)),
    ('75', mozfun.glam.percentile(75, aggregates, metric_type)),
    ('95', mozfun.glam.percentile(95, aggregates, metric_type)),
    ('99', mozfun.glam.percentile(99, aggregates, metric_type)),
    ('99.9', mozfun.glam.percentile(99.9, aggregates, metric_type))
  ] AS aggregates
FROM
  glam_etl.firefox_desktop_glam_nightly__histogram_probe_counts_v1

Просмотреть файл

@ -0,0 +1,79 @@
-- query for firefox_desktop_glam_nightly__histogram_probe_counts_v1;
-- Generate the complete set of bucket boundaries for a metric. Glean
-- functional distributions use fixed log bases; custom distributions use the
-- range/bucket metadata attached to each row. Unknown types get no buckets.
CREATE TEMP FUNCTION udf_get_buckets(
  metric_type STRING,
  range_min INT64,
  range_max INT64,
  bucket_count INT64
)
RETURNS ARRAY<INT64> AS (
  (
    WITH buckets AS (
      SELECT
        CASE
          WHEN
            metric_type = 'timing_distribution'
          THEN
            -- https://mozilla.github.io/glean/book/user/metrics/timing_distribution.html
            mozfun.glam.histogram_generate_functional_buckets(2, 8, range_max)
          WHEN
            metric_type = 'memory_distribution'
          THEN
            -- https://mozilla.github.io/glean/book/user/metrics/memory_distribution.html
            mozfun.glam.histogram_generate_functional_buckets(2, 16, range_max)
          WHEN
            metric_type = 'custom_distribution_exponential'
          THEN
            mozfun.glam.histogram_generate_exponential_buckets(range_min, range_max, bucket_count)
          WHEN
            metric_type = 'custom_distribution_linear'
          THEN
            mozfun.glam.histogram_generate_linear_buckets(range_min, range_max, bucket_count)
          ELSE
            []
        END
        AS arr
    )
    SELECT
      ARRAY_AGG(CAST(item AS INT64))
    FROM
      buckets
    CROSS JOIN
      UNNEST(arr) AS item
  )
);
-- Aggregate normalized bucket counts into a full histogram per probe and
-- attribute combination, filling in missing buckets with a Dirichlet prior.
SELECT
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  metric,
  metric_type,
  key,
  agg_type AS client_agg_type,
  'histogram' AS agg_type,
  -- each client contributes a normalized total weight of ~1.0 upstream, so
  -- the rounded sum approximates the number of contributing clients
  CAST(ROUND(SUM(record.value)) AS INT64) AS total_users,
  mozfun.glam.histogram_fill_buckets_dirichlet(
    mozfun.map.sum(ARRAY_AGG(record)),
    mozfun.glam.histogram_buckets_cast_string_array(
      udf_get_buckets(metric_type, range_min, range_max, bucket_count)
    ),
    CAST(ROUND(SUM(record.value)) AS INT64)
  ) AS aggregates
FROM
  glam_etl.firefox_desktop_glam_nightly__histogram_bucket_counts_v1
GROUP BY
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  range_min,
  range_max,
  bucket_count,
  metric,
  metric_type,
  key,
  client_agg_type,
  agg_type

Просмотреть файл

@ -0,0 +1,36 @@
-- query for firefox_desktop_glam_nightly__latest_versions_v1;
-- Compute the latest app version seen per channel over a trailing 28-day
-- window, used downstream to limit aggregation to recent versions.
WITH extracted AS (
  SELECT
    client_id,
    channel,
    app_version
  FROM
    glam_etl.firefox_desktop_glam_nightly__view_clients_daily_scalar_aggregates_v1
  WHERE
    submission_date
    BETWEEN DATE_SUB(@submission_date, INTERVAL 28 DAY)
    AND @submission_date
    AND channel IS NOT NULL
),
transformed AS (
  -- Keep only (channel, app_version) pairs reported by more than 5 distinct
  -- clients, to ignore spurious or forged version strings.
  -- The original ORDER BY inside this CTE was removed: ordering a subquery
  -- without LIMIT has no effect on the final result.
  SELECT
    channel,
    app_version
  FROM
    extracted
  GROUP BY
    channel,
    app_version
  HAVING
    COUNT(DISTINCT client_id) > 5
)
SELECT
  channel,
  MAX(app_version) AS latest_version
FROM
  transformed
GROUP BY
  channel

Просмотреть файл

@ -0,0 +1,273 @@
-- query for firefox_desktop_glam_nightly__scalar_bucket_counts_v1;
-- Classify a client's boolean probes into the categorical buckets "always",
-- "never", or "sometimes" based on their accumulated true/false counts.
CREATE TEMP FUNCTION udf_boolean_buckets(
  scalar_aggs ARRAY<
    STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
  >
)
RETURNS ARRAY<
  STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, bucket STRING>
> AS (
  (
    WITH boolean_columns AS (
      -- Pivot the 'true'/'false' agg_type rows into side-by-side columns.
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        CASE
          agg_type
        WHEN
          'true'
        THEN
          value
        ELSE
          0
        END
        AS bool_true,
        CASE
          agg_type
        WHEN
          'false'
        THEN
          value
        ELSE
          0
        END
        AS bool_false
      FROM
        UNNEST(scalar_aggs)
      WHERE
        metric_type IN ("boolean")
    ),
    summed_bools AS (
      SELECT
        metric,
        metric_type,
        key,
        '' AS agg_type,
        SUM(bool_true) AS bool_true,
        SUM(bool_false) AS bool_false
      FROM
        boolean_columns
      GROUP BY
        1,
        2,
        3,
        4
    ),
    booleans AS (
      -- Map the count pair to a categorical bucket; rows with no true and no
      -- false observations are dropped entirely.
      SELECT
        * EXCEPT (bool_true, bool_false),
        CASE
        WHEN
          bool_true > 0
          AND bool_false > 0
        THEN
          "sometimes"
        WHEN
          bool_true > 0
          AND bool_false = 0
        THEN
          "always"
        WHEN
          bool_true = 0
          AND bool_false > 0
        THEN
          "never"
        END
        AS bucket
      FROM
        summed_bools
      WHERE
        bool_true > 0
        OR bool_false > 0
    )
    SELECT
      ARRAY_AGG((metric, metric_type, key, agg_type, bucket))
    FROM
      booleans
  )
);
WITH
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
-- with a combination of m attributes will generate a new table with n*m rows.
-- The glob ("*") symbol can be understood as selecting all of values belonging
-- to that group.
static_combos AS (
  SELECT
    combos.*
  FROM
    UNNEST(
      ARRAY<STRUCT<ping_type STRING, os STRING, app_build_id STRING>>[
        (NULL, NULL, NULL),
        (NULL, NULL, "*"),
        (NULL, "*", NULL),
        ("*", NULL, NULL),
        (NULL, "*", "*"),
        ("*", NULL, "*"),
        ("*", "*", NULL),
        ("*", "*", "*")
      ]
    ) AS combos
),
all_combos AS (
  -- NULL combo entries keep the row's own attribute; "*" replaces it with the
  -- rollup marker.
  SELECT
    table.* EXCEPT (ping_type, os, app_build_id),
    COALESCE(combo.ping_type, table.ping_type) AS ping_type,
    COALESCE(combo.os, table.os) AS os,
    COALESCE(combo.app_build_id, table.app_build_id) AS app_build_id
  FROM
    glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1 table
  CROSS JOIN
    static_combos combo
),
bucketed_booleans AS (
  -- Boolean probes get categorical buckets; ranges do not apply to them.
  -- (Removed the trailing comma after the last SELECT item, which is a syntax
  -- error in engines without trailing-comma support and inconsistent with the
  -- rest of the file.)
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    NULL AS range_min,
    NULL AS range_max,
    NULL AS bucket_count,
    udf_boolean_buckets(scalar_aggregates) AS scalar_aggregates
  FROM
    all_combos
),
log_min_max AS (
  -- Per-metric log2 range used to generate scalar buckets; values <= 0 are
  -- clamped to 1 before taking the logarithm.
  SELECT
    metric,
    key,
    LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) AS range_min,
    LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) AS range_max,
    100 AS bucket_count
  FROM
    all_combos
  CROSS JOIN
    UNNEST(scalar_aggregates)
  WHERE
    metric_type <> "boolean"
  GROUP BY
    1,
    2
),
buckets_by_metric AS (
  SELECT
    *,
    ARRAY(
      SELECT
        FORMAT("%.*f", 2, bucket)
      FROM
        UNNEST(
          mozfun.glam.histogram_generate_scalar_buckets(range_min, range_max, bucket_count)
        ) AS bucket
    ) AS buckets
  FROM
    log_min_max
),
bucketed_scalars AS (
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    range_min,
    range_max,
    bucket_count,
    -- Keep two decimal places before converting bucket to a string
    SAFE_CAST(
      FORMAT("%.*f", 2, mozfun.glam.histogram_bucket_from_value(buckets, value) + 0.0001) AS STRING
    ) AS bucket
  FROM
    all_combos
  CROSS JOIN
    UNNEST(scalar_aggregates)
  LEFT JOIN
    buckets_by_metric
  USING
    (metric, key)
  WHERE
    metric_type IN ("counter", "quantity", "labeled_counter", "timespan")
),
booleans_and_scalars AS (
  -- Combine both probe families into a single bucketed relation.
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    range_min,
    range_max,
    bucket_count,
    bucket
  FROM
    bucketed_booleans
  CROSS JOIN
    UNNEST(scalar_aggregates)
  UNION ALL
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type,
    range_min,
    range_max,
    bucket_count,
    bucket
  FROM
    bucketed_scalars
)
SELECT
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  metric,
  metric_type,
  key,
  agg_type AS client_agg_type,
  'histogram' AS agg_type,
  range_min,
  range_max,
  bucket_count,
  bucket,
  -- we could rely on count(*) because there is one row per client and bucket
  COUNT(DISTINCT client_id) AS count
FROM
  booleans_and_scalars
GROUP BY
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  metric,
  metric_type,
  key,
  client_agg_type,
  range_min,
  range_max,
  bucket_count,
  bucket

Просмотреть файл

@ -0,0 +1,78 @@
-- query for firefox_desktop_glam_nightly__scalar_percentiles_v1;
WITH flat_clients_scalar_aggregates AS (
  -- One row per client, attribute combination, and scalar aggregate.
  SELECT
    * EXCEPT (scalar_aggregates)
  FROM
    glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1
  CROSS JOIN
    UNNEST(scalar_aggregates)
),
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
-- with a combination of m attributes will generate a new table with n*m rows.
-- The glob ("*") symbol can be understood as selecting all of values belonging
-- to that group.
static_combos AS (
  SELECT
    combos.*
  FROM
    UNNEST(
      ARRAY<STRUCT<ping_type STRING, os STRING, app_build_id STRING>>[
        (NULL, NULL, NULL),
        (NULL, NULL, "*"),
        (NULL, "*", NULL),
        ("*", NULL, NULL),
        (NULL, "*", "*"),
        ("*", NULL, "*"),
        ("*", "*", NULL),
        ("*", "*", "*")
      ]
    ) AS combos
),
all_combos AS (
  -- NULL combo entries keep the row's own attribute; "*" replaces it with the
  -- rollup marker.
  SELECT
    table.* EXCEPT (ping_type, os, app_build_id),
    COALESCE(combo.ping_type, table.ping_type) AS ping_type,
    COALESCE(combo.os, table.os) AS os,
    COALESCE(combo.app_build_id, table.app_build_id) AS app_build_id
  FROM
    flat_clients_scalar_aggregates table
  CROSS JOIN
    static_combos combo
),
percentiles AS (
  -- 1000 approximate quantile boundaries per probe; specific percentiles are
  -- read off by array offset in the final SELECT.
  SELECT
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    agg_type AS client_agg_type,
    'percentiles' AS agg_type,
    COUNT(DISTINCT(client_id)) AS total_users,
    APPROX_QUANTILES(value, 1000) AS aggregates
  FROM
    all_combos
  GROUP BY
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    metric,
    metric_type,
    key,
    client_agg_type
)
SELECT
  * REPLACE (
    mozfun.glam.map_from_array_offsets_precise(
      [5.0, 25.0, 50.0, 75.0, 95.0, 99.0, 99.9],
      aggregates
    ) AS aggregates
  )
FROM
  percentiles

Просмотреть файл

@ -0,0 +1,53 @@
-- query for firefox_desktop_glam_nightly__scalar_probe_counts_v1;
-- Aggregate per-client bucket counts into full histograms per probe and
-- attribute combination, filling missing buckets with a Dirichlet prior.
SELECT
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  metric,
  metric_type,
  key,
  client_agg_type,
  agg_type,
  -- count is distinct clients per bucket upstream, so the sum over buckets is
  -- the total number of contributing clients
  SUM(count) AS total_users,
  mozfun.glam.histogram_fill_buckets_dirichlet(
    mozfun.map.sum(ARRAY_AGG(STRUCT<key STRING, value FLOAT64>(bucket, count))),
    -- numeric probes use generated scalar buckets; boolean probes use the
    -- three fixed categorical buckets
    CASE
    WHEN
      metric_type IN ("counter", "quantity", "labeled_counter", "timespan")
    THEN
      ARRAY(
        SELECT
          FORMAT("%.*f", 2, bucket)
        FROM
          UNNEST(
            mozfun.glam.histogram_generate_scalar_buckets(range_min, range_max, bucket_count)
          ) AS bucket
        ORDER BY
          bucket
      )
    WHEN
      metric_type IN ("boolean")
    THEN
      ['always', 'never', 'sometimes']
    END
    ,
    SUM(count)
  ) AS aggregates
FROM
  glam_etl.firefox_desktop_glam_nightly__scalar_bucket_counts_v1
GROUP BY
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  range_min,
  range_max,
  bucket_count,
  metric,
  metric_type,
  key,
  client_agg_type,
  agg_type

Просмотреть файл

@ -0,0 +1,17 @@
-- view for firefox_desktop_glam_nightly__view_clients_daily_histogram_aggregates_v1;
-- Restrict the desktop daily histogram aggregates to the nightly channel,
-- normalize the build id from seconds to an hour string, and collapse the
-- channel column to the "*" rollup marker.
CREATE OR REPLACE VIEW
  `glam-fenix-dev`.glam_etl.firefox_desktop_glam_nightly__view_clients_daily_histogram_aggregates_v1
AS
SELECT
  * EXCEPT (app_build_id, channel),
  `mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
  "*" AS channel
FROM
  `glam-fenix-dev`.glam_etl.firefox_desktop__view_clients_daily_histogram_aggregates_v1
WHERE
  -- the WHERE clause resolves against the source table's channel column, so
  -- filtering here is equivalent to the former channel-filtering CTE
  channel = 'nightly'

Просмотреть файл

@ -0,0 +1,17 @@
CREATE OR REPLACE VIEW
`glam-fenix-dev`.glam_etl.firefox_desktop_glam_nightly__view_clients_daily_scalar_aggregates_v1
AS
WITH extracted AS (
SELECT
*
FROM
`glam-fenix-dev`.glam_etl.firefox_desktop__view_clients_daily_scalar_aggregates_v1
WHERE
channel = 'nightly'
)
SELECT
* EXCEPT (app_build_id, channel),
`mozfun.glam.build_seconds_to_hour`(app_build_id) AS app_build_id,
"*" AS channel
FROM
extracted

Просмотреть файл

@ -0,0 +1,29 @@
-- view for firefox_desktop_glam_nightly__view_probe_counts_v1;
-- Union of every probe-count flavor (scalar/histogram counts and their
-- percentile derivatives) into a single relation for the extract queries.
CREATE OR REPLACE VIEW
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__view_probe_counts_v1`
AS
SELECT
  *
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__scalar_probe_counts_v1`
UNION ALL
SELECT
  *
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__histogram_probe_counts_v1`
UNION ALL
SELECT
  *
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__scalar_percentiles_v1`
UNION ALL
SELECT
  *
FROM
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__histogram_percentiles_v1`

Просмотреть файл

@ -0,0 +1,71 @@
-- view for firefox_desktop_glam_nightly__view_sample_counts_v1;
-- Total histogram sample counts (sum of all bucket values) per probe and
-- attribute combination, including the "*" rollup combinations.
CREATE OR REPLACE VIEW
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__view_sample_counts_v1`
AS
WITH all_clients AS (
  -- One row per client row and histogram aggregate; value is the bucket map.
  SELECT
    ping_type,
    os,
    app_version,
    app_build_id,
    channel,
    key,
    metric,
    value
  FROM
    `glam-fenix-dev`.glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1,
    UNNEST(histogram_aggregates) h1
),
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
-- with a combination of m attributes will generate a new table with n*m rows.
-- The glob ("*") symbol can be understood as selecting all of values belonging
-- to that group.
static_combos AS (
  SELECT
    combos.*
  FROM
    UNNEST(
      ARRAY<STRUCT<ping_type STRING, os STRING, app_build_id STRING>>[
        (NULL, NULL, NULL),
        (NULL, NULL, "*"),
        (NULL, "*", NULL),
        ("*", NULL, NULL),
        (NULL, "*", "*"),
        ("*", NULL, "*"),
        ("*", "*", NULL),
        ("*", "*", "*")
      ]
    ) AS combos
),
all_combos AS (
  -- NULL combo entries keep the row's own attribute; "*" replaces it with the
  -- rollup marker.
  SELECT
    table.* EXCEPT (ping_type, os, app_build_id),
    COALESCE(combo.ping_type, table.ping_type) AS ping_type,
    COALESCE(combo.os, table.os) AS os,
    COALESCE(combo.app_build_id, table.app_build_id) AS app_build_id
  FROM
    all_clients table
  CROSS JOIN
    static_combos combo
)
SELECT
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  -- qualified to disambiguate from v1.key introduced by UNNEST(value) below
  all_combos.key,
  metric,
  -- sum of all bucket counts = total number of recorded samples
  SUM(v1.value) AS total_sample
FROM
  all_combos,
  UNNEST(value) AS v1
GROUP BY
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  key,
  metric

Просмотреть файл

@ -0,0 +1,73 @@
-- view for firefox_desktop_glam_nightly__view_user_counts_v1;
-- Distinct client counts per attribute combination, including the "*" rollup
-- combinations, over the union of scalar and histogram client populations.
CREATE OR REPLACE VIEW
  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__view_user_counts_v1`
AS
WITH all_clients AS (
  -- UNION ALL: a client appearing in both tables is still counted once by the
  -- final COUNT(DISTINCT client_id).
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel
  FROM
    `glam-fenix-dev`.glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1
  UNION ALL
  SELECT
    client_id,
    ping_type,
    os,
    app_version,
    app_build_id,
    channel
  FROM
    `glam-fenix-dev`.glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1
),
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
-- with a combination of m attributes will generate a new table with n*m rows.
-- The glob ("*") symbol can be understood as selecting all of values belonging
-- to that group.
static_combos AS (
  SELECT
    combos.*
  FROM
    UNNEST(
      ARRAY<STRUCT<ping_type STRING, os STRING, app_build_id STRING>>[
        (NULL, NULL, NULL),
        (NULL, NULL, "*"),
        (NULL, "*", NULL),
        ("*", NULL, NULL),
        (NULL, "*", "*"),
        ("*", NULL, "*"),
        ("*", "*", NULL),
        ("*", "*", "*")
      ]
    ) AS combos
),
all_combos AS (
  -- NULL combo entries keep the row's own attribute; "*" replaces it with the
  -- rollup marker.
  SELECT
    table.* EXCEPT (ping_type, os, app_build_id),
    COALESCE(combo.ping_type, table.ping_type) AS ping_type,
    COALESCE(combo.os, table.os) AS os,
    COALESCE(combo.app_build_id, table.app_build_id) AS app_build_id
  FROM
    all_clients table
  CROSS JOIN
    static_combos combo
)
SELECT
  ping_type,
  os,
  app_version,
  app_build_id,
  channel,
  COUNT(DISTINCT client_id) AS total_users
FROM
  all_combos
GROUP BY
  ping_type,
  os,
  app_version,
  app_build_id,
  channel