Update exponential histogram doc

- Use normalized_channel instead of application.channel
- Remove date filter, use current_date instead
   - this is b/c main_nightly only has 6 months of data
- Update doclines to clarify table use
This commit is contained in:
Frank Bertsch 2021-01-27 09:55:26 -05:00
Родитель a90b7de437
Коммит b0e8bf99d7
1 изменённых файлов: 24 добавлений и 30 удалений

Просмотреть файл

@ -32,10 +32,10 @@ As of this writing, each main ping histogram is encoded as a JSON string inside
SELECT
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS AS histogram_json,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
sample_id = 42
AND normalized_channel = 'nightly'
AND normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND DATE(submission_timestamp) = '2020-04-20'
AND payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS IS NOT NULL
LIMIT
@ -63,10 +63,10 @@ WITH intermediate AS (
SELECT
udf.json_extract_histogram(payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS) AS histogram,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
sample_id = 42
AND normalized_channel = 'nightly'
AND normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND DATE(submission_timestamp) = '2020-04-20'
AND payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS IS NOT NULL
LIMIT
@ -104,9 +104,9 @@ WITH merged_histogram AS (
ARRAY_AGG(udf.json_extract_histogram(payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS))
) AS spinner_visible_ms,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
application.channel = 'nightly'
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND DATE(submission_timestamp) = '2020-04-20'
),
@ -148,27 +148,25 @@ Knowing the approximate distribution of results on a given day is sort of intere
We can do this simply by _grouping by_ the build id field, and then merging the histograms corresponding to each:
```sql
DECLARE four_twenty DEFAULT DATE('2020-04-20');
WITH per_build_day AS (
SELECT
PARSE_DATETIME("%Y%m%d%H%M%S", application.build_id) AS build_id,
KEY,
SUM(value) AS value,
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data,
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels,
UNNEST(
udf.json_extract_histogram(
payload.histograms.FX_TAB_SWITCH_SPINNER_VISIBLE_MS
).VALUES
)
WHERE
application.channel = 'nightly'
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(four_twenty, INTERVAL 2 WEEK))
AND application.build_id <= FORMAT_DATE("%Y%m%d", four_twenty)
AND DATE(submission_timestamp) >= DATE_SUB(four_twenty, INTERVAL 2 WEEK)
AND DATE(submission_timestamp) <= four_twenty
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE, INTERVAL 2 WEEK))
AND application.build_id <= FORMAT_DATE("%Y%m%d", CURRENT_DATE)
AND DATE(submission_timestamp) >= DATE_SUB(CURRENT_DATE, INTERVAL 2 WEEK)
AND DATE(submission_timestamp) <= CURRENT_DATE
GROUP BY
KEY,
build_id
@ -216,8 +214,6 @@ A solution used by GLAM is to give each client "one vote": that is, the aggregat
We can reproduce this approach by using the [`mozfun.hist.normalize`](https://mozilla.github.io/bigquery-etl/mozfun/hist/#normalize) UDF, which explicitly takes a set of histograms and makes sure that the values for each one sum up to exactly one:
```sql
DECLARE four_twenty DEFAULT DATE('2020-04-20');
WITH per_build_client_day AS (
SELECT
PARSE_DATETIME("%Y%m%d%H%M%S", application.build_id) AS build_id,
@ -232,14 +228,14 @@ WITH per_build_client_day AS (
)
) AS tab_switch_visible_ms
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
application.channel = 'nightly'
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(four_twenty, INTERVAL 14 DAY))
AND application.build_id <= FORMAT_DATE("%Y%m%d", four_twenty)
AND DATE(submission_timestamp) >= DATE_SUB(four_twenty, INTERVAL 14 DAY)
AND DATE(submission_timestamp) <= four_twenty
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY))
AND application.build_id <= FORMAT_DATE("%Y%m%d", CURRENT_DATE)
AND DATE(submission_timestamp) >= DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY)
AND DATE(submission_timestamp) <= CURRENT_DATE
GROUP BY
build_id,
client_id
@ -302,8 +298,6 @@ OK, so we've reproduced GLAM, but that isn't particularly exciting in and of its
We can filter our query to _just_ that group of users by adding a `AND normalized_os_version="6.1"` clause to our query above:
```sql
DECLARE four_twenty DEFAULT DATE('2020-04-20');
WITH per_build_client_day AS (
SELECT
PARSE_DATETIME("%Y%m%d%H%M%S", application.build_id) AS build_id,
@ -318,15 +312,15 @@ WITH per_build_client_day AS (
)
) AS tab_switch_visible_ms
FROM
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample of non-nightly data
telemetry.main_nightly -- Use telemetry.main_1pct for a 1% sample across channels
WHERE
application.channel = 'nightly'
normalized_channel = 'nightly' -- Only technically necessary if using telemetry.main or telemetry.main_1pct (see above)
AND normalized_os = 'Windows'
AND normalized_os_version = "6.1"
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(four_twenty, INTERVAL 14 DAY))
AND application.build_id <= FORMAT_DATE("%Y%m%d", four_twenty)
AND DATE(submission_timestamp) >= DATE_SUB(four_twenty, INTERVAL 14 DAY)
AND DATE(submission_timestamp) <= four_twenty
AND application.build_id > FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY))
AND application.build_id <= FORMAT_DATE("%Y%m%d", CURRENT_DATE)
AND DATE(submission_timestamp) >= DATE_SUB(CURRENT_DATE, INTERVAL 14 DAY)
AND DATE(submission_timestamp) <= CURRENT_DATE
GROUP BY
build_id,
client_id