Use days_since_* and new table name (#41)

This commit is contained in:
Daniel Thorn 2019-03-28 10:39:14 -07:00 коммит произвёл GitHub
Родитель 904f25f242
Коммит 51378e8049
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 120 добавлений и 57 удалений

Просмотреть файл

@ -9,7 +9,8 @@ Recommended practices
=== ===
- Queries - Queries
- Should be defined in files named as `sql/table_version.sql` e.g. `sql/clients_daily_v6.sql` - Should be defined in files named as `sql/table_version.sql` e.g.
`sql/clients_daily_v6.sql`
- Should not specify a project or dataset in table names to simplify testing - Should not specify a project or dataset in table names to simplify testing
- Should be [incremental](#incremental-queries) - Should be [incremental](#incremental-queries)
- Should filter input tables on partition and clustering columns - Should filter input tables on partition and clustering columns
@ -48,10 +49,10 @@ Incremental queries have these properties:
- Should be impacted by values from a finite number of preceding partitions - Should be impacted by values from a finite number of preceding partitions
- This allows for backfilling in chunks instead of serially for all time - This allows for backfilling in chunks instead of serially for all time
and limiting backfills to a certain number of days following updated data and limiting backfills to a certain number of days following updated data
- For example `sql/nondesktop_clients_last_seen_v1.sql` can be run serially - For example `sql/clients_last_seen_v1.sql` can be run serially on any 28 day
on any 28 day period and the last day will be the same whether or not the period and the last day will be the same whether or not the partition
partition preceding the first day was missing because values are only preceding the first day was missing because values are only impacted by
impacted by 27 preceding days 27 preceding days
Tests Tests
===== =====

Просмотреть файл

@ -1,30 +1,43 @@
WITH current_sample AS ( WITH
_current AS (
SELECT SELECT
submission_date_s3 AS last_seen_date, * EXCEPT (submission_date_s3),
* EXCEPT (submission_date_s3) 0 AS days_since_seen,
-- For measuring Active MAU, where this is the number of days since this
-- client_id was an Active User as defined by
-- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
IF(scalar_parent_browser_engagement_total_uri_count_sum >= 5,
0,
NULL) AS days_since_visited_5_uri
FROM FROM
clients_daily_v6 clients_daily_v6
WHERE WHERE
submission_date_s3 = @submission_date submission_date_s3 = @submission_date ),
), previous AS ( _previous AS (
SELECT SELECT
* EXCEPT (submission_date, * EXCEPT (submission_date) REPLACE(
generated_time) -- omit values outside 28 day window
IF(days_since_visited_5_uri < 27,
days_since_visited_5_uri,
NULL) AS days_since_visited_5_uri)
FROM FROM
analysis.clients_last_seen_v1 clients_last_seen_v1
WHERE WHERE
submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY) submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
AND last_seen_date > DATE_SUB(@submission_date, INTERVAL 28 DAY) AND clients_last_seen_v1.days_since_seen < 27 )
)
SELECT SELECT
@submission_date AS submission_date, @submission_date AS submission_date,
CURRENT_DATETIME() AS generated_time, IF(_current.client_id IS NOT NULL,
IF(current_sample.client_id IS NOT NULL, _current,
current_sample, _previous).* EXCEPT (days_since_seen,
previous).* days_since_visited_5_uri),
COALESCE(_current.days_since_seen,
_previous.days_since_seen + 1) AS days_since_seen,
COALESCE(_current.days_since_visited_5_uri,
_previous.days_since_visited_5_uri + 1) AS days_since_visited_5_uri
FROM FROM
current_sample _current
FULL JOIN FULL JOIN
previous _previous
USING USING
(client_id) (client_id)

Просмотреть файл

@ -1,18 +1,15 @@
WITH
inactive_days AS (
SELECT
*,
DATE_DIFF(submission_date, last_seen_date, DAY) AS _inactive_days
FROM
clients_last_seen_v1
)
SELECT SELECT
submission_date, submission_date,
CURRENT_DATETIME() AS generated_time, COUNTIF(days_since_seen < 28) AS mau,
COUNTIF(_inactive_days < 28) AS mau, COUNTIF(days_since_seen < 7) AS wau,
COUNTIF(_inactive_days < 7) AS wau, COUNTIF(days_since_seen < 1) AS dau,
COUNTIF(_inactive_days < 1) AS dau, -- Active MAU counts all Active Users on any day in the last 28 days not just
-- the most recent day, making COUNTIF(days_since_seen < 28 AND visited_5_uri)
-- incorrect. Instead we track days_since_visited_5_uri and use that.
-- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
COUNTIF(days_since_visited_5_uri < 28) AS visited_5_uri_mau,
COUNTIF(days_since_visited_5_uri < 7) AS visited_5_uri_wau,
COUNTIF(days_since_visited_5_uri < 1) AS visited_5_uri_dau,
-- We hash client_ids into 20 buckets to aid in computing -- We hash client_ids into 20 buckets to aid in computing
-- confidence intervals for mau/wau/dau sums; the particular hash -- confidence intervals for mau/wau/dau sums; the particular hash
-- function and number of buckets is subject to change in the future. -- function and number of buckets is subject to change in the future.
@ -25,7 +22,7 @@ SELECT
country, country,
distribution_id distribution_id
FROM FROM
inactive_days clients_last_seen_v1
WHERE WHERE
client_id IS NOT NULL client_id IS NOT NULL
AND submission_date = @submission_date AND submission_date = @submission_date

Просмотреть файл

@ -1,12 +1,14 @@
CREATE OR REPLACE VIEW
firefox_desktop_exact_mau28_v1 AS
SELECT SELECT
submission_date, submission_date,
CURRENT_DATETIME() AS generated_time,
SUM(mau) AS mau, SUM(mau) AS mau,
SUM(wau) AS wau, SUM(wau) AS wau,
SUM(dau) AS dau SUM(dau) AS dau,
SUM(visited_5_uri_mau) AS visited_5_uri_mau,
SUM(visited_5_uri_wau) AS visited_5_uri_wau,
SUM(visited_5_uri_dau) AS visited_5_uri_dau
FROM FROM
firefox_desktop_exact_mau28_by_dimensions_v1 `moz-fx-data-derived-datasets.telemetry.firefox_desktop_exact_mau28_by_dimensions_v1`
WHERE
submission_date = @submission_date
GROUP BY GROUP BY
submission_date submission_date

Просмотреть файл

@ -0,0 +1,5 @@
id,v,day,last_seen
a,0,2019-01-01,2019-01-01
c,0,2019-01-01,2019-01-01
e,0,2019-01-01,2019-01-01
g,0,2019-01-01,2019-01-01
1 id v day last_seen
2 a 0 2019-01-01 2019-01-01
3 c 0 2019-01-01 2019-01-01
4 e 0 2019-01-01 2019-01-01
5 g 0 2019-01-01 2019-01-01

Просмотреть файл

@ -0,0 +1,9 @@
id,v,day
a,2,2019-01-03
b,2,2019-01-03
c,2,2019-01-03
d,2,2019-01-03
c,1,2019-01-02
d,1,2019-01-02
e,1,2019-01-02
f,1,2019-01-02
1 id v day
2 a 2 2019-01-03
3 b 2 2019-01-03
4 c 2 2019-01-03
5 d 2 2019-01-03
6 c 1 2019-01-02
7 d 1 2019-01-02
8 e 1 2019-01-02
9 f 1 2019-01-02

Просмотреть файл

@ -0,0 +1,14 @@
id,v,day,last_seen
a,0,2019-01-02,2019-01-01
a,2,2019-01-03,2019-01-03
b,2,2019-01-03,2019-01-03
c,1,2019-01-02,2019-01-02
c,2,2019-01-03,2019-01-03
d,1,2019-01-02,2019-01-02
d,2,2019-01-03,2019-01-03
e,1,2019-01-02,2019-01-02
e,1,2019-01-02,2019-01-02
f,1,2019-01-02,2019-01-02
f,1,2019-01-02,2019-01-02
g,0,2019-01-02,2019-01-01
g,0,2019-01-02,2019-01-01
1 id v day last_seen
2 a 0 2019-01-02 2019-01-01
3 a 2 2019-01-03 2019-01-03
4 b 2 2019-01-03 2019-01-03
5 c 1 2019-01-02 2019-01-02
6 c 2 2019-01-03 2019-01-03
7 d 1 2019-01-02 2019-01-02
8 d 2 2019-01-03 2019-01-03
9 e 1 2019-01-02 2019-01-02
10 e 1 2019-01-02 2019-01-02
11 f 1 2019-01-02 2019-01-02
12 f 1 2019-01-02 2019-01-02
13 g 0 2019-01-02 2019-01-01
14 g 0 2019-01-02 2019-01-01

Просмотреть файл

@ -0,0 +1,12 @@
loads:
- source: ''
destination: clients_daily_v6
job_config:
schema:
fields:
query:
queryParameters:
- name: submission_date
parameterType: {type: DATE}
parameterValue: {value: 2019-1-2}

Просмотреть файл

@ -1,2 +0,0 @@
{"submission_date":"2019-01-01","generated_time":"2019-01-02T01:00:00","last_seen_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a"}
{"submission_date":"2019-01-01","generated_time":"2019-01-02T01:00:00","last_seen_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b"}

Просмотреть файл

@ -5,6 +5,11 @@
"mode": "REQUIRED", "mode": "REQUIRED",
"description": "time_partitioning_field" "description": "time_partitioning_field"
}, },
{
"name": "client_id",
"type": "STRING",
"mode": "REQUIRED"
},
{ {
"name": "active_hours_sum", "name": "active_hours_sum",
"type": "FLOAT", "type": "FLOAT",
@ -23,8 +28,8 @@
] ]
}, },
{ {
"name": "client_id", "name": "scalar_parent_browser_engagement_total_uri_count_sum",
"type": "STRING", "type": "INT64",
"mode": "REQUIRED" "mode": "NULLABLE"
} }
] ]

Просмотреть файл

@ -0,0 +1,2 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","days_since_seen":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","days_since_seen":0}

Просмотреть файл

@ -5,13 +5,8 @@
"mode": "REQUIRED" "mode": "REQUIRED"
}, },
{ {
"name": "generated_time", "name": "client_id",
"type": "DATETIME", "type": "STRING",
"mode": "REQUIRED"
},
{
"name": "last_seen_date",
"type": "DATE",
"mode": "REQUIRED" "mode": "REQUIRED"
}, },
{ {
@ -32,8 +27,18 @@
] ]
}, },
{ {
"name": "client_id", "name": "scalar_parent_browser_engagement_total_uri_count_sum",
"type": "STRING", "type": "INT64",
"mode": "NULLABLE"
},
{
"name": "days_since_seen",
"type": "INT64",
"mode": "REQUIRED" "mode": "REQUIRED"
},
{
"name": "days_since_visited_5_uri",
"type": "INT64",
"mode": "NULLABLE"
} }
] ]

Просмотреть файл

@ -1,3 +1,3 @@
{"submission_date":"2019-01-02","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","last_seen_date":"2019-01-01"} {"submission_date":"2019-01-02","client_id":"a","active_hours_sum":0.0,"attribution":{"source":"prev"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":1,"days_since_visited_5_uri":null}
{"submission_date":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"b","last_seen_date":"2019-01-02"} {"submission_date":"2019-01-02","client_id":"b","active_hours_sum":1.0,"attribution":{"source":"test"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":0,"days_since_visited_5_uri":null}
{"submission_date":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"c","last_seen_date":"2019-01-02"} {"submission_date":"2019-01-02","client_id":"c","active_hours_sum":1.0,"attribution":{"source":"test"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":0,"days_since_visited_5_uri":null}