Use bit arrays in clients_last_seen

This methodology was developed in draft #80

The result here is that `clients_last_seen_v1` becomes a view that should
be compatible with existing usage (it has all the same columns as the existing
table), but also has underlying bit patterns for each usage criterion,
which can be used for calculating engagement, retention, and new profile
metrics (those will be in a later PR).
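
For orientation, here is a small illustrative query (not part of the diff
below) showing how a 28-day bit pattern is read: the rightmost bit of a
`*_bits` column represents the most recent day, the next bit the day before,
and so on, so the position of the lowest set bit recovers a `days_since_*`
value (the same expression the new view uses).

SELECT
  bits,
  -- Position of the lowest set bit; NULL when no bits are set.
  CAST(SAFE.LOG(bits & -bits, 2) AS INT64) AS days_since_seen,
  -- Any bit set within the lowest 28 positions means "seen in the last 28 days".
  (bits & 0x0FFFFFFF) > 0 AS seen_in_last_28_days
FROM
  UNNEST([1, 6, 8, 0]) AS bits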

In the next day or two, there will be similar PRs for nondesktop and FxA.
Jeff Klukas 2019-05-10 16:12:22 -04:00
Parent c36204de2f
Commit 789cabe1b3
21 changed files with 238 additions and 71 deletions


@@ -27,6 +27,7 @@ Recommended practices
- Should be [incremental]
- Should filter input tables on partition and clustering columns
- Should use `_` prefix in generated column names not meant for output
- Should use `_bits` suffix for any integer column that represents a bit pattern
- Should not use `DATETIME` type, due to incompatibility with
[spark-bigquery-connector]
- Should use the earliest row for each `document_id` by `submission_timestamp`


@@ -0,0 +1,24 @@
CREATE TABLE
clients_last_seen_raw_v1
PARTITION BY
submission_date
CLUSTER BY
sample_id,
client_id AS
SELECT
CAST(NULL AS DATE) AS submission_date,
0 AS days_seen_bits,
0 AS days_visited_5_uri_bits,
0 AS days_opened_dev_tools_bits,
0 AS days_since_created_profile,
CAST(NULL AS BOOLEAN) AS ping_seen_within_6_days_of_profile_creation,
-- We make sure to delay * until the end so that as new columns are added
-- to clients_daily, we can add those columns in the same order to the end
-- of this schema, which may be necessary for the daily join query between
-- the two tables to validate.
* EXCEPT (submission_date_s3)
FROM
clients_daily_v6
WHERE
-- Output empty table and read no input rows
FALSE


@@ -0,0 +1,74 @@
-- Equivalent to, but more efficient than, calling udf_bitmask_range(1, 28)
CREATE TEMP FUNCTION bitmask_lowest_28() AS (0x0FFFFFFF);
--
CREATE TEMP FUNCTION shift_one_day(x INT64) AS (IFNULL((x << 1) & bitmask_lowest_28(), 0));
--
CREATE TEMP FUNCTION combine_days(prev INT64, curr INT64) AS (shift_one_day(prev) + IFNULL(curr, 0));
--
WITH
_current AS (
SELECT
-- In this raw table, we capture the history of activity over the past
-- 28 days for each usage criterion as a single 64-bit integer. The
-- rightmost bit represents whether the user was active in the current day.
CAST(TRUE AS INT64) AS days_seen_bits,
-- For measuring Active MAU, where this is the days since this
-- client_id was an Active User as defined by
-- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
CAST(scalar_parent_browser_engagement_total_uri_count_sum >= 5 AS INT64) AS days_visited_5_uri_bits,
CAST(devtools_toolbox_opened_count_sum > 0 AS INT64) AS days_opened_dev_tools_bits,
DATE_DIFF(submission_date_s3, SAFE.PARSE_DATE("%F", SUBSTR(profile_creation_date, 0, 10)), DAY) AS days_since_created_profile,
CAST(NULL AS BOOLEAN) AS ping_seen_within_6_days_of_profile_creation,
* EXCEPT (submission_date_s3)
FROM
clients_daily_v6
WHERE
submission_date_s3 = @submission_date ),
--
_previous AS (
SELECT
* EXCEPT (submission_date)
REPLACE (
-- Scrub values outside 28 day window.
IF(days_since_created_profile BETWEEN 0 AND 26, days_since_created_profile, NULL) AS days_since_created_profile)
FROM
clients_last_seen_raw_v1 AS cls
WHERE
submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
-- Filter out rows from yesterday that have now fallen outside the 28-day window.
AND shift_one_day(days_seen_bits) > 0),
--
_joined AS (
SELECT
@submission_date AS submission_date,
IF(_current.client_id IS NOT NULL,
_current,
_previous).* REPLACE (
combine_days(_previous.days_seen_bits, _current.days_seen_bits) AS days_seen_bits,
combine_days(_previous.days_visited_5_uri_bits, _current.days_visited_5_uri_bits) AS days_visited_5_uri_bits,
combine_days(_previous.days_opened_dev_tools_bits, _current.days_opened_dev_tools_bits) AS days_opened_dev_tools_bits,
-- We want to base new profile creation date on the first profile_creation_date
-- value we observe, so we propagate an existing non-null value in preference
-- to a non-null value on today's observation.
COALESCE(_previous.days_since_created_profile + 1,
_current.days_since_created_profile) AS days_since_created_profile,
-- We only trust profile_creation_date if we see a ping within one week,
-- so we calculate this on day 6 and propagate to subsequent days.
IF(COALESCE(_previous.days_since_created_profile + 1,
_current.days_since_created_profile) = 6, TRUE, _previous.ping_seen_within_6_days_of_profile_creation) AS ping_seen_within_6_days_of_profile_creation)
FROM
_current
FULL JOIN
_previous
USING
-- Include sample_id to match the clustering of the tables, which may improve
-- join performance.
(sample_id, client_id))
--
SELECT
* REPLACE (
-- Null out any fields that may contain data leaked from beyond our 28 day window.
IF(days_since_created_profile BETWEEN 0 AND 27, days_since_created_profile, NULL) AS days_since_created_profile,
IF(days_since_created_profile BETWEEN 0 AND 27, ping_seen_within_6_days_of_profile_creation, NULL) AS ping_seen_within_6_days_of_profile_creation)
FROM
_joined
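
To make the day-over-day rollup above concrete, here is a standalone
illustration (not part of the commit) that re-pastes the three temp functions
from this query: yesterday's pattern is shifted left by one day, bit 28 is
masked off, and today's 0/1 flag is added in the rightmost position.

CREATE TEMP FUNCTION bitmask_lowest_28() AS (0x0FFFFFFF);
CREATE TEMP FUNCTION shift_one_day(x INT64) AS (IFNULL((x << 1) & bitmask_lowest_28(), 0));
CREATE TEMP FUNCTION combine_days(prev INT64, curr INT64) AS (shift_one_day(prev) + IFNULL(curr, 0));
SELECT
  shift_one_day(5) AS shifted,        -- 10 (0b1010): every observed day moves back one position
  combine_days(5, 1) AS seen_today,   -- 11 (0b1011): today's bit is set in the rightmost position
  combine_days(5, NULL) AS not_seen   -- 10: a client with no clients_daily row today just shifts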


@@ -1,10 +0,0 @@
SELECT
DATE(NULL) AS submission_date,
* EXCEPT (submission_date_s3),
0 AS days_since_seen,
NULL AS days_since_visited_5_uri
FROM
clients_daily_v6
WHERE
-- Output empty table and read no input rows
FALSE


@@ -1,44 +1,10 @@
-WITH
-_current AS (
-SELECT
-* EXCEPT (submission_date_s3, fxa_configured),
-0 AS days_since_seen,
--- For measuring Active MAU, where this is the days since this
--- client_id was an Active User as defined by
--- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
-IF(scalar_parent_browser_engagement_total_uri_count_sum >= 5,
-0,
-NULL) AS days_since_visited_5_uri,
-fxa_configured
-FROM
-clients_daily_v6
-WHERE
-submission_date_s3 = @submission_date ),
-_previous AS (
-SELECT
-* EXCEPT (submission_date) REPLACE(
--- omit values outside 28 day window
-IF(days_since_visited_5_uri < 27,
-days_since_visited_5_uri,
-NULL) AS days_since_visited_5_uri)
-FROM
-clients_last_seen_v1
-WHERE
-submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
-AND clients_last_seen_v1.days_since_seen < 27 )
+CREATE OR REPLACE VIEW
+`moz-fx-data-derived-datasets.telemetry.clients_last_seen_v1` AS
 SELECT
-@submission_date AS submission_date,
-IF(_current.client_id IS NOT NULL,
-_current,
-_previous).* EXCEPT (days_since_seen,
-days_since_visited_5_uri),
-COALESCE(_current.days_since_seen,
-_previous.days_since_seen + 1) AS days_since_seen,
-COALESCE(_current.days_since_visited_5_uri,
-_previous.days_since_visited_5_uri + 1) AS days_since_visited_5_uri
+-- We cannot use UDFs in a view, so we paste the body of udf_bitpos(bits) literally here.
+CAST(SAFE.LOG(days_seen_bits & -days_seen_bits, 2) AS INT64) AS days_since_seen,
+CAST(SAFE.LOG(days_visited_5_uri_bits & -days_visited_5_uri_bits, 2) AS INT64) AS days_since_visited_5_uri,
+CAST(SAFE.LOG(days_opened_dev_tools_bits & -days_opened_dev_tools_bits, 2) AS INT64) AS days_since_opened_dev_tools,
+*
 FROM
-_current
-FULL JOIN
-_previous
-USING
-(client_id)
+`moz-fx-data-derived-datasets.telemetry.clients_last_seen_raw_v1`


@ -0,0 +1,2 @@
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":2.0,"profile_creation_date":"2018-12-27 00:00:00","attribution":{"source":"test"},"client_id":"b","sample_id":0}
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":0.0,"profile_creation_date":"2018-09-01 00:00:00","attribution":{"source":"test"},"client_id":"c","sample_id":0}


@@ -10,11 +10,26 @@
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "sample_id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "active_hours_sum",
"type": "FLOAT",
"mode": "REQUIRED"
},
{
"name": "devtools_toolbox_opened_count_sum",
"type": "FLOAT",
"mode": "REQUIRED"
},
{
"name": "profile_creation_date",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "attribution",
"type": "RECORD",


@ -0,0 +1,3 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"a","sample_id":0,"days_seen_bits":3,"days_opened_dev_tools_bits":1}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","sample_id":0,"days_seen_bits":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"d","sample_id":0,"days_seen_bits":0}


@@ -1,4 +1,29 @@
[
{
"name": "days_seen_bits",
"type": "INT64",
"mode": "REQUIRED"
},
{
"name": "days_visited_5_uri_bits",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "days_opened_dev_tools_bits",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "days_since_created_profile",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "ping_seen_within_6_days_of_profile_creation",
"type": "BOOLEAN",
"mode": "NULLABLE"
},
{
"name": "submission_date",
"type": "DATE",
@@ -9,11 +34,26 @@
"type": "STRING",
"mode": "REQUIRED"
},
{
"name": "sample_id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "active_hours_sum",
"type": "FLOAT",
"mode": "REQUIRED"
},
{
"name": "devtools_toolbox_opened_count_sum",
"type": "FLOAT",
"mode": "REQUIRED"
},
{
"name": "profile_creation_date",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "attribution",
"type": "RECORD",
@@ -31,16 +71,6 @@
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "days_since_seen",
"type": "INT64",
"mode": "REQUIRED"
},
{
"name": "days_since_visited_5_uri",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "fxa_configured",
"type": "BOOLEAN",


@@ -0,0 +1,3 @@
{"submission_date": "2019-01-02", "days_seen_bits": 6, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 2, "client_id": "a", "sample_id": 0, "active_hours_sum": 0.0, "devtools_toolbox_opened_count_sum": 2.0, "attribution": {"source": "prev"}}
{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 1, "days_since_created_profile": 6, "ping_seen_within_6_days_of_profile_creation": true, "client_id": "b", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 2.0, "profile_creation_date": "2018-12-27 00:00:00", "attribution": {"source": "test"}}
{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 0, "client_id": "c", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 0.0, "profile_creation_date": "2018-09-01 00:00:00", "attribution": {"source": "test"}}


@@ -1,2 +0,0 @@
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"b"}
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"c"}


@@ -1,2 +0,0 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","days_since_seen":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","days_since_seen":0}


@@ -1,3 +0,0 @@
{"submission_date":"2019-01-02","client_id":"a","active_hours_sum":0.0,"attribution":{"source":"prev"},"days_since_seen":1}
{"submission_date":"2019-01-02","client_id":"b","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0}
{"submission_date":"2019-01-02","client_id":"c","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0}


@@ -3,7 +3,7 @@
# file, you can obtain one at http://mozilla.org/MPL/2.0/.
"""Automatically generated tests."""
from google.api_core.exceptions import NotFound
from google.api_core.exceptions import BadRequest, NotFound
from google.cloud import bigquery
from .util import coerce_result, generate_tests
@@ -58,7 +58,11 @@ def tables(bq, dataset, generated_test):
break # stop because there can only be one time partitioning field
with open(table.source_path, "rb") as file_obj:
job = bq.load_table_from_file(file_obj, destination, job_config=job_config)
job.result()
try:
job.result()
except BadRequest:
print(job.errors)
raise
# clean up handled by default_dataset fixture
@@ -76,5 +80,6 @@ def test_generated(bq, dataset, generated_test):
job = bq.query(generated_test.modified_query, job_config=job_config)
result = list(coerce_result(*job.result()))
result.sort(key=lambda row: json.dumps(row))
generated_test.expect.sort(key=lambda row: json.dumps(row))
assert generated_test.expect == result

udf/udf_bitmask_range.sql (new file)

@@ -0,0 +1,31 @@
CREATE TEMP FUNCTION
udf_bitmask_range( start_ordinal INT64,
_length INT64) AS ((
SELECT
SUM(1 << (_n - 1))
FROM
UNNEST(GENERATE_ARRAY(start_ordinal, start_ordinal + _length - 1)) AS _n ));
/*
Returns a bitmask that can be used to return a subset of an integer representing
a bit array. The start_ordinal argument is an integer specifying the starting
position of the slice, with start_ordinal = 1 indicating the first bit.
The length argument is the number of bits to include in the mask.
The arguments were chosen to match the semantics of the SUBSTR function; see
https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#substr
Examples:
SELECT udf_bitmask_range(1, 1);
1
SELECT udf_bitmask_range(2, 4);
30
-- Taking just the second and third bits (from the right) of binary 11011 should give us 00010 (decimal 2)
SELECT ((1 << 4) + (1 << 3) + (1 << 1) + (1 << 0)) & udf_bitmask_range(2, 2);
2
*/
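
As a hypothetical usage sketch (not part of this commit, and assuming
udf_bitmask_range has been pasted into the same script as a temp function),
the mask can slice a recent window out of the 28-day pattern, for example to
count clients active in the last 7 days:

SELECT
  COUNTIF((days_seen_bits & udf_bitmask_range(1, 7)) > 0) AS clients_seen_last_7_days
FROM
  clients_last_seen_raw_v1
WHERE
  submission_date = @submission_date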

udf/udf_bitpos.sql (new file)

@@ -0,0 +1,30 @@
CREATE TEMP FUNCTION
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
/*
Returns a 0-based index of the rightmost set bit in the passed bit pattern
or null if no bits are set (bits = 0).
To determine this position, we take a bitwise AND of the bit pattern and
its arithmetic negation (-bits), which isolates the lowest set bit, then we
determine that bit's position via base-2 logarithm;
see https://stackoverflow.com/a/42747608/1260237
Examples:
SELECT udf_bitpos(0);
null
SELECT udf_bitpos(1);
0
SELECT udf_bitpos(2);
1
SELECT udf_bitpos(8);
3
SELECT udf_bitpos(8 + 1);
0
*/