Use days_since_* and new table name (#41)

This commit is contained in:
Daniel Thorn 2019-03-28 10:39:14 -07:00 коммит произвёл GitHub
Родитель 904f25f242
Коммит 51378e8049
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 120 добавлений и 57 удалений

Просмотреть файл

@ -9,7 +9,8 @@ Recommended practices
===
- Queries
- Should be defined in files named as `sql/table_version.sql` e.g. `sql/clients_daily_v6.sql`
- Should be defined in files named as `sql/table_version.sql` e.g.
`sql/clients_daily_v6.sql`
- Should not specify a project or dataset in table names to simplify testing
- Should be [incremental](#incremental-queries)
- Should filter input tables on partition and clustering columns
@ -48,10 +49,10 @@ Incremental queries have these properties:
- Should be impacted by values from a finite number of preceding partitions
- This allows for backfilling in chunks instead of serially for all time
and limiting backfills to a certain number of days following updated data
- For example `sql/nondesktop_clients_last_seen_v1.sql` can be run serially
on any 28 day period and the last day will be the same whether or not the
partition preceding the first day was missing because values are only
impacted by 27 preceding days
- For example, `sql/clients_last_seen_v1.sql` can be run serially on any 28-day
period, and the last day will be the same whether or not the partition
preceding the first day was missing, because values are only impacted by
the 27 preceding days
Tests
=====

Просмотреть файл

@ -1,30 +1,43 @@
WITH current_sample AS (
WITH
_current AS (
SELECT
submission_date_s3 AS last_seen_date,
* EXCEPT (submission_date_s3)
* EXCEPT (submission_date_s3),
0 AS days_since_seen,
-- For measuring Active MAU: the number of days since this client_id
-- was last an Active User, as defined by
-- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
IF(scalar_parent_browser_engagement_total_uri_count_sum >= 5,
0,
NULL) AS days_since_visited_5_uri
FROM
clients_daily_v6
WHERE
submission_date_s3 = @submission_date
), previous AS (
submission_date_s3 = @submission_date ),
_previous AS (
SELECT
* EXCEPT (submission_date,
generated_time)
* EXCEPT (submission_date) REPLACE(
-- omit values outside 28 day window
IF(days_since_visited_5_uri < 27,
days_since_visited_5_uri,
NULL) AS days_since_visited_5_uri)
FROM
analysis.clients_last_seen_v1
clients_last_seen_v1
WHERE
submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
AND last_seen_date > DATE_SUB(@submission_date, INTERVAL 28 DAY)
)
AND clients_last_seen_v1.days_since_seen < 27 )
SELECT
@submission_date AS submission_date,
CURRENT_DATETIME() AS generated_time,
IF(current_sample.client_id IS NOT NULL,
current_sample,
previous).*
IF(_current.client_id IS NOT NULL,
_current,
_previous).* EXCEPT (days_since_seen,
days_since_visited_5_uri),
COALESCE(_current.days_since_seen,
_previous.days_since_seen + 1) AS days_since_seen,
COALESCE(_current.days_since_visited_5_uri,
_previous.days_since_visited_5_uri + 1) AS days_since_visited_5_uri
FROM
current_sample
_current
FULL JOIN
previous
_previous
USING
(client_id)

Просмотреть файл

@ -1,18 +1,15 @@
WITH
inactive_days AS (
SELECT
*,
DATE_DIFF(submission_date, last_seen_date, DAY) AS _inactive_days
FROM
clients_last_seen_v1
)
SELECT
submission_date,
CURRENT_DATETIME() AS generated_time,
COUNTIF(_inactive_days < 28) AS mau,
COUNTIF(_inactive_days < 7) AS wau,
COUNTIF(_inactive_days < 1) AS dau,
COUNTIF(days_since_seen < 28) AS mau,
COUNTIF(days_since_seen < 7) AS wau,
COUNTIF(days_since_seen < 1) AS dau,
-- Active MAU counts all Active Users on any day in the last 28 days, not just
-- the most recent day, making COUNTIF(days_since_seen < 28 AND visited_5_uri)
-- incorrect. Instead we track days_since_visited_5_uri and use that.
-- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
COUNTIF(days_since_visited_5_uri < 28) AS visited_5_uri_mau,
COUNTIF(days_since_visited_5_uri < 7) AS visited_5_uri_wau,
COUNTIF(days_since_visited_5_uri < 1) AS visited_5_uri_dau,
-- We hash client_ids into 20 buckets to aid in computing
-- confidence intervals for mau/wau/dau sums; the particular hash
-- function and number of buckets are subject to change in the future.
@ -25,7 +22,7 @@ SELECT
country,
distribution_id
FROM
inactive_days
clients_last_seen_v1
WHERE
client_id IS NOT NULL
AND submission_date = @submission_date

Просмотреть файл

@ -1,12 +1,14 @@
CREATE OR REPLACE VIEW
firefox_desktop_exact_mau28_v1 AS
SELECT
submission_date,
CURRENT_DATETIME() AS generated_time,
SUM(mau) AS mau,
SUM(wau) AS wau,
SUM(dau) AS dau
SUM(dau) AS dau,
SUM(visited_5_uri_mau) AS visited_5_uri_mau,
SUM(visited_5_uri_wau) AS visited_5_uri_wau,
SUM(visited_5_uri_dau) AS visited_5_uri_dau
FROM
firefox_desktop_exact_mau28_by_dimensions_v1
WHERE
submission_date = @submission_date
`moz-fx-data-derived-datasets.telemetry.firefox_desktop_exact_mau28_by_dimensions_v1`
GROUP BY
submission_date

Просмотреть файл

@ -0,0 +1,5 @@
id,v,day,last_seen
a,0,2019-01-01,2019-01-01
c,0,2019-01-01,2019-01-01
e,0,2019-01-01,2019-01-01
g,0,2019-01-01,2019-01-01
1 id v day last_seen
2 a 0 2019-01-01 2019-01-01
3 c 0 2019-01-01 2019-01-01
4 e 0 2019-01-01 2019-01-01
5 g 0 2019-01-01 2019-01-01

Просмотреть файл

@ -0,0 +1,9 @@
id,v,day
a,2,2019-01-03
b,2,2019-01-03
c,2,2019-01-03
d,2,2019-01-03
c,1,2019-01-02
d,1,2019-01-02
e,1,2019-01-02
f,1,2019-01-02
1 id v day
2 a 2 2019-01-03
3 b 2 2019-01-03
4 c 2 2019-01-03
5 d 2 2019-01-03
6 c 1 2019-01-02
7 d 1 2019-01-02
8 e 1 2019-01-02
9 f 1 2019-01-02

Просмотреть файл

@ -0,0 +1,14 @@
id,v,day,last_seen
a,0,2019-01-02,2019-01-01
a,2,2019-01-03,2019-01-03
b,2,2019-01-03,2019-01-03
c,1,2019-01-02,2019-01-02
c,2,2019-01-03,2019-01-03
d,1,2019-01-02,2019-01-02
d,2,2019-01-03,2019-01-03
e,1,2019-01-02,2019-01-02
e,1,2019-01-02,2019-01-02
f,1,2019-01-02,2019-01-02
f,1,2019-01-02,2019-01-02
g,0,2019-01-02,2019-01-01
g,0,2019-01-02,2019-01-01
1 id v day last_seen
2 a 0 2019-01-02 2019-01-01
3 a 2 2019-01-03 2019-01-03
4 b 2 2019-01-03 2019-01-03
5 c 1 2019-01-02 2019-01-02
6 c 2 2019-01-03 2019-01-03
7 d 1 2019-01-02 2019-01-02
8 d 2 2019-01-03 2019-01-03
9 e 1 2019-01-02 2019-01-02
10 e 1 2019-01-02 2019-01-02
11 f 1 2019-01-02 2019-01-02
12 f 1 2019-01-02 2019-01-02
13 g 0 2019-01-02 2019-01-01
14 g 0 2019-01-02 2019-01-01

Просмотреть файл

@ -0,0 +1,12 @@
loads:
- source: ''
destination: clients_daily_v6
job_config:
schema:
fields:
query:
queryParameters:
- name: submission_date
parameterType: {type: DATE}
parameterValue: {value: 2019-1-2}

Просмотреть файл

@ -1,2 +0,0 @@
{"submission_date":"2019-01-01","generated_time":"2019-01-02T01:00:00","last_seen_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a"}
{"submission_date":"2019-01-01","generated_time":"2019-01-02T01:00:00","last_seen_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b"}

Просмотреть файл

@ -5,6 +5,11 @@
"mode": "REQUIRED",
"description": "time_partitioning_field"
},
{
"name": "client_id",
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "active_hours_sum",
"type": "FLOAT",
@ -23,8 +28,8 @@
]
},
{
"name": "client_id",
"type": "STRING",
"mode": "REQUIRED"
"name": "scalar_parent_browser_engagement_total_uri_count_sum",
"type": "INT64",
"mode": "NULLABLE"
}
]

Просмотреть файл

@ -0,0 +1,2 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","days_since_seen":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","days_since_seen":0}

Просмотреть файл

@ -5,13 +5,8 @@
"mode": "REQUIRED"
},
{
"name": "generated_time",
"type": "DATETIME",
"mode": "REQUIRED"
},
{
"name": "last_seen_date",
"type": "DATE",
"name": "client_id",
"type": "STRING",
"mode": "REQUIRED"
},
{
@ -32,8 +27,18 @@
]
},
{
"name": "client_id",
"type": "STRING",
"name": "scalar_parent_browser_engagement_total_uri_count_sum",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "days_since_seen",
"type": "INT64",
"mode": "REQUIRED"
},
{
"name": "days_since_visited_5_uri",
"type": "INT64",
"mode": "NULLABLE"
}
]

Просмотреть файл

@ -1,3 +1,3 @@
{"submission_date":"2019-01-02","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","last_seen_date":"2019-01-01"}
{"submission_date":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"b","last_seen_date":"2019-01-02"}
{"submission_date":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"c","last_seen_date":"2019-01-02"}
{"submission_date":"2019-01-02","client_id":"a","active_hours_sum":0.0,"attribution":{"source":"prev"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":1,"days_since_visited_5_uri":null}
{"submission_date":"2019-01-02","client_id":"b","active_hours_sum":1.0,"attribution":{"source":"test"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":0,"days_since_visited_5_uri":null}
{"submission_date":"2019-01-02","client_id":"c","active_hours_sum":1.0,"attribution":{"source":"test"},"scalar_parent_browser_engagement_total_uri_count_sum":null,"days_since_seen":0,"days_since_visited_5_uri":null}