Use bit arrays in clients_last_seen
This methodology was developed in draft #80. The result here is that `clients_last_seen_v1` becomes a view that should be compatible with existing usage (it has all the same columns as the existing table), but it also exposes underlying bit patterns for each usage criterion, which can be used for calculating engagement, retention, and new-profile metrics (those will be in a later PR). In the next day or two, there will be similar PRs for nondesktop and FxA.
This commit is contained in:
Parent: c36204de2f
Commit: 789cabe1b3
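For orientation before the diff: each usage criterion is stored in the new raw table as a 28-day bit pattern packed into an INT64, with the rightmost bit meaning "active on the current day". Each daily run shifts yesterday's pattern left by one and adds today's 0/1 observation, using the temp functions defined in the raw-table query below. A minimal sketch (the literal values come from this commit's test fixtures; this snippet is illustrative and not additional code in the PR):

-- The same helper functions that the raw-table query below defines.
CREATE TEMP FUNCTION bitmask_lowest_28() AS (0x0FFFFFFF);
CREATE TEMP FUNCTION shift_one_day(x INT64) AS (IFNULL((x << 1) & bitmask_lowest_28(), 0));
CREATE TEMP FUNCTION combine_days(prev INT64, curr INT64) AS (shift_one_day(prev) + IFNULL(curr, 0));
SELECT
  combine_days(3, NULL) AS absent_today,    -- 0b011 -> 0b110 = 6; client "a" in the tests goes 3 -> 6
  combine_days(NULL, 1) AS first_day_seen,  -- no previous row -> 0b001 = 1; client "c" appears for the first time
  combine_days(1, 1) AS seen_both_days      -- 0b001 -> 0b011 = 3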
@@ -27,6 +27,7 @@ Recommended practices
- Should be [incremental]
- Should filter input tables on partition and clustering columns
- Should use `_` prefix in generated column names not meant for output
- Should use `_bits` suffix for any integer column that represents a bit pattern
- Should not use `DATETIME` type, due to incompatibility with
  [spark-bigquery-connector]
- Should use the earliest row for each `document_id` by `submission_timestamp`
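A hypothetical query fragment illustrating the two naming conventions above; the table and column names here are invented for the example and do not appear in this PR:

-- `_offset` is an intermediate generated column not meant for output, so it
-- carries a leading underscore; the packed INT64 gets a `_bits` suffix.
SELECT
  client_id,
  BIT_OR(1 << _offset) AS days_active_bits
FROM
  hypothetical_daily_activity
CROSS JOIN
  UNNEST([DATE_DIFF(@submission_date, activity_date, DAY)]) AS _offset
WHERE
  activity_date BETWEEN DATE_SUB(@submission_date, INTERVAL 27 DAY) AND @submission_date
GROUP BY
  client_id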
@@ -0,0 +1,24 @@
CREATE TABLE
  clients_last_seen_raw_v1
PARTITION BY
  submission_date
CLUSTER BY
  sample_id,
  client_id AS
SELECT
  CAST(NULL AS DATE) AS submission_date,
  0 AS days_seen_bits,
  0 AS days_visited_5_uri_bits,
  0 AS days_opened_dev_tools_bits,
  0 AS days_since_created_profile,
  CAST(NULL AS BOOLEAN) ping_seen_within_6_days_of_profile_creation,
  -- We make sure to delay * until the end so that as new columns are added
  -- to clients_daily, we can add those columns in the same order to the end
  -- of this schema, which may be necessary for the daily join query between
  -- the two tables to validate.
  * EXCEPT (submission_date_s3)
FROM
  clients_daily_v6
WHERE
  -- Output empty table and read no input rows
  FALSE
@@ -0,0 +1,74 @@
-- Equivalent to, but more efficient than, calling udf_bitmask_range(1, 28)
CREATE TEMP FUNCTION bitmask_lowest_28() AS (0x0FFFFFFF);
--
CREATE TEMP FUNCTION shift_one_day(x INT64) AS (IFNULL((x << 1) & bitmask_lowest_28(), 0));
--
CREATE TEMP FUNCTION combine_days(prev INT64, curr INT64) AS (shift_one_day(prev) + IFNULL(curr, 0));
--
WITH
  _current AS (
  SELECT
    -- In this raw table, we capture the history of activity over the past
    -- 28 days for each usage criterion as a single 64-bit integer. The
    -- rightmost bit represents whether the user was active in the current day.
    CAST(TRUE AS INT64) AS days_seen_bits,
    -- For measuring Active MAU, where this is the days since this
    -- client_id was an Active User as defined by
    -- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
    CAST(scalar_parent_browser_engagement_total_uri_count_sum >= 5 AS INT64) AS days_visited_5_uri_bits,
    CAST(devtools_toolbox_opened_count_sum > 0 AS INT64) AS days_opened_dev_tools_bits,
    DATE_DIFF(submission_date_s3, SAFE.PARSE_DATE("%F", SUBSTR(profile_creation_date, 0, 10)), DAY) AS days_since_created_profile,
    CAST(NULL AS BOOLEAN) AS ping_seen_within_6_days_of_profile_creation,
    * EXCEPT (submission_date_s3)
  FROM
    clients_daily_v6
  WHERE
    submission_date_s3 = @submission_date ),
  --
  _previous AS (
  SELECT
    * EXCEPT (submission_date)
    REPLACE (
      -- Scrub values outside 28 day window.
      IF(days_since_created_profile BETWEEN 0 AND 26, days_since_created_profile, NULL) AS days_since_created_profile)
  FROM
    clients_last_seen_raw_v1 AS cls
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    -- Filter out rows from yesterday that have now fallen outside the 28-day window.
    AND shift_one_day(days_seen_bits) > 0),
  --
  _joined AS (
  SELECT
    @submission_date AS submission_date,
    IF(_current.client_id IS NOT NULL,
      _current,
      _previous).* REPLACE (
      combine_days(_previous.days_seen_bits, _current.days_seen_bits) AS days_seen_bits,
      combine_days(_previous.days_visited_5_uri_bits, _current.days_visited_5_uri_bits) AS days_visited_5_uri_bits,
      combine_days(_previous.days_opened_dev_tools_bits, _current.days_opened_dev_tools_bits) AS days_opened_dev_tools_bits,
      -- We want to base new profile creation date on the first profile_creation_date
      -- value we observe, so we propagate an existing non-null value in preference
      -- to a non-null value on today's observation.
      COALESCE(_previous.days_since_created_profile + 1,
        _current.days_since_created_profile) AS days_since_created_profile,
      -- We only trust profile_creation_date if we see a ping within one week,
      -- so we calculate this on day 6 and propagate to subsequent days.
      IF(COALESCE(_previous.days_since_created_profile + 1,
        _current.days_since_created_profile) = 6, TRUE, _previous.ping_seen_within_6_days_of_profile_creation) AS ping_seen_within_6_days_of_profile_creation)
  FROM
    _current
  FULL JOIN
    _previous
  USING
    -- Include sample_id to match the clustering of the tables, which may improve
    -- join performance.
    (sample_id, client_id))
--
SELECT
  * REPLACE (
    -- Null out any fields that may contain data leaked from beyond our 28 day window.
    IF(days_since_created_profile BETWEEN 0 AND 27, days_since_created_profile, NULL) AS days_since_created_profile,
    IF(days_since_created_profile BETWEEN 0 AND 27, ping_seen_within_6_days_of_profile_creation, NULL) AS ping_seen_within_6_days_of_profile_creation)
FROM
  _joined
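To make the profile-creation logic above concrete, here is a worked check using this commit's test fixture values (illustrative only, not part of the query): client "b" has `profile_creation_date` 2018-12-27 and sends a ping on 2019-01-02, exactly six days later, so `days_since_created_profile` is 6 and `ping_seen_within_6_days_of_profile_creation` flips to TRUE, which is what the expected-output fixture asserts.

-- Day-6 rule with literal fixture values:
SELECT
  DATE_DIFF(DATE "2019-01-02",
    SAFE.PARSE_DATE("%F", SUBSTR("2018-12-27 00:00:00", 0, 10)), DAY) AS days_since_created_profile,  -- 6
  DATE_DIFF(DATE "2019-01-02",
    SAFE.PARSE_DATE("%F", SUBSTR("2018-12-27 00:00:00", 0, 10)), DAY) = 6 AS ping_seen_within_6_days_of_profile_creation  -- TRUE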
@@ -1,10 +0,0 @@
SELECT
  DATE(NULL) AS submission_date,
  * EXCEPT (submission_date_s3),
  0 AS days_since_seen,
  NULL AS days_since_visited_5_uri
FROM
  clients_daily_v6
WHERE
  -- Output empty table and read no input rows
  FALSE
@@ -1,44 +1,10 @@
WITH
  _current AS (
  SELECT
    * EXCEPT (submission_date_s3, fxa_configured),
    0 AS days_since_seen,
    -- For measuring Active MAU, where this is the days since this
    -- client_id was an Active User as defined by
    -- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html
    IF(scalar_parent_browser_engagement_total_uri_count_sum >= 5,
      0,
      NULL) AS days_since_visited_5_uri,
    fxa_configured
  FROM
    clients_daily_v6
  WHERE
    submission_date_s3 = @submission_date ),
  _previous AS (
  SELECT
    * EXCEPT (submission_date) REPLACE(
      -- omit values outside 28 day window
      IF(days_since_visited_5_uri < 27,
        days_since_visited_5_uri,
        NULL) AS days_since_visited_5_uri)
  FROM
    clients_last_seen_v1
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    AND clients_last_seen_v1.days_since_seen < 27 )
CREATE OR REPLACE VIEW
  `moz-fx-data-derived-datasets.telemetry.clients_last_seen_v1` AS
SELECT
  @submission_date AS submission_date,
  IF(_current.client_id IS NOT NULL,
    _current,
    _previous).* EXCEPT (days_since_seen,
    days_since_visited_5_uri),
  COALESCE(_current.days_since_seen,
    _previous.days_since_seen + 1) AS days_since_seen,
  COALESCE(_current.days_since_visited_5_uri,
    _previous.days_since_visited_5_uri + 1) AS days_since_visited_5_uri
  -- We cannot use UDFs in a view, so we paste the body of udf_bitpos(bits) literally here.
  CAST(SAFE.LOG(days_seen_bits & -days_seen_bits, 2) AS INT64) AS days_since_seen,
  CAST(SAFE.LOG(days_visited_5_uri_bits & -days_visited_5_uri_bits, 2) AS INT64) AS days_since_visited_5_uri,
  CAST(SAFE.LOG(days_opened_dev_tools_bits & -days_opened_dev_tools_bits, 2) AS INT64) AS days_since_opened_dev_tools,
  *
FROM
  _current
FULL JOIN
  _previous
USING
  (client_id)
  `moz-fx-data-derived-datasets.telemetry.clients_last_seen_raw_v1`
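The `SAFE.LOG(bits & -bits, 2)` expressions in the new view are the inlined body of `udf_bitpos`, defined later in this commit. A quick sanity check with literal values (6 is the `days_seen_bits` that client "a" ends up with in the test fixtures; this snippet is illustrative, not part of the view):

-- 6 is binary 110: seen one and two days ago, but not today.
-- `6 & -6` isolates the lowest set bit (2), and log2(2) = 1,
-- so the client was last seen 1 day ago.
SELECT
  CAST(SAFE.LOG(6 & -6, 2) AS INT64) AS days_since_seen,  -- 1
  CAST(SAFE.LOG(0 & -0, 2) AS INT64) AS no_bits_set       -- NULL when the pattern is all zeros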
@@ -0,0 +1,2 @@
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":2.0,"profile_creation_date":"2018-12-27 00:00:00","attribution":{"source":"test"},"client_id":"b","sample_id":0}
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":0.0,"profile_creation_date":"2018-09-01 00:00:00","attribution":{"source":"test"},"client_id":"c","sample_id":0}
@@ -10,11 +10,26 @@
    "type": "STRING",
    "mode": "REQUIRED"
  },
  {
    "name": "sample_id",
    "type": "INTEGER",
    "mode": "REQUIRED"
  },
  {
    "name": "active_hours_sum",
    "type": "FLOAT",
    "mode": "REQUIRED"
  },
  {
    "name": "devtools_toolbox_opened_count_sum",
    "type": "FLOAT",
    "mode": "REQUIRED"
  },
  {
    "name": "profile_creation_date",
    "type": "STRING",
    "mode": "NULLABLE"
  },
  {
    "name": "attribution",
    "type": "RECORD",
@@ -0,0 +1,3 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"a","sample_id":0,"days_seen_bits":3,"days_opened_dev_tools_bits":1}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","sample_id":0,"days_seen_bits":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"d","sample_id":0,"days_seen_bits":0}
@@ -1,4 +1,29 @@
[
  {
    "name": "days_seen_bits",
    "type": "INT64",
    "mode": "REQUIRED"
  },
  {
    "name": "days_visited_5_uri_bits",
    "type": "INT64",
    "mode": "NULLABLE"
  },
  {
    "name": "days_opened_dev_tools_bits",
    "type": "INT64",
    "mode": "NULLABLE"
  },
  {
    "name": "days_since_created_profile",
    "type": "INT64",
    "mode": "NULLABLE"
  },
  {
    "name": "ping_seen_within_6_days_of_profile_creation",
    "type": "BOOLEAN",
    "mode": "NULLABLE"
  },
  {
    "name": "submission_date",
    "type": "DATE",
@@ -9,11 +34,26 @@
    "type": "STRING",
    "mode": "REQUIRED"
  },
  {
    "name": "sample_id",
    "type": "INTEGER",
    "mode": "REQUIRED"
  },
  {
    "name": "active_hours_sum",
    "type": "FLOAT",
    "mode": "REQUIRED"
  },
  {
    "name": "devtools_toolbox_opened_count_sum",
    "type": "FLOAT",
    "mode": "REQUIRED"
  },
  {
    "name": "profile_creation_date",
    "type": "STRING",
    "mode": "NULLABLE"
  },
  {
    "name": "attribution",
    "type": "RECORD",
@@ -31,16 +71,6 @@
    "type": "INT64",
    "mode": "NULLABLE"
  },
  {
    "name": "days_since_seen",
    "type": "INT64",
    "mode": "REQUIRED"
  },
  {
    "name": "days_since_visited_5_uri",
    "type": "INT64",
    "mode": "NULLABLE"
  },
  {
    "name": "fxa_configured",
    "type": "BOOLEAN",
@@ -0,0 +1,3 @@
{"submission_date": "2019-01-02", "days_seen_bits": 6, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 2, "client_id": "a", "sample_id": 0, "active_hours_sum": 0.0, "devtools_toolbox_opened_count_sum": 2.0, "attribution": {"source": "prev"}}
{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 1, "days_since_created_profile": 6, "ping_seen_within_6_days_of_profile_creation": true, "client_id": "b", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 2.0, "profile_creation_date": "2018-12-27 00:00:00", "attribution": {"source": "test"}}
{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 0, "client_id": "c", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 0.0, "profile_creation_date": "2018-09-01 00:00:00", "attribution": {"source": "test"}}
@@ -1,2 +0,0 @@
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"b"}
{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"c"}
@@ -1,2 +0,0 @@
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","days_since_seen":0}
{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","days_since_seen":0}
@@ -1,3 +0,0 @@
{"submission_date":"2019-01-02","client_id":"a","active_hours_sum":0.0,"attribution":{"source":"prev"},"days_since_seen":1}
{"submission_date":"2019-01-02","client_id":"b","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0}
{"submission_date":"2019-01-02","client_id":"c","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0}
@@ -3,7 +3,7 @@
# file, you can obtain one at http://mozilla.org/MPL/2.0/.
"""Automatically generated tests."""

from google.api_core.exceptions import NotFound
from google.api_core.exceptions import BadRequest, NotFound
from google.cloud import bigquery
from .util import coerce_result, generate_tests
@@ -58,7 +58,11 @@ def tables(bq, dataset, generated_test):
            break  # stop because there can only be one time partitioning field
    with open(table.source_path, "rb") as file_obj:
        job = bq.load_table_from_file(file_obj, destination, job_config=job_config)
        job.result()
        try:
            job.result()
        except BadRequest:
            print(job.errors)
            raise
    # clean up handled by default_dataset fixture
@@ -76,5 +80,6 @@ def test_generated(bq, dataset, generated_test):
    job = bq.query(generated_test.modified_query, job_config=job_config)
    result = list(coerce_result(*job.result()))
    result.sort(key=lambda row: json.dumps(row))
    generated_test.expect.sort(key=lambda row: json.dumps(row))

    assert generated_test.expect == result
@@ -0,0 +1,31 @@
CREATE TEMP FUNCTION
  udf_bitmask_range( start_ordinal INT64,
    _length INT64) AS ((
    SELECT
      SUM(1 << (_n - 1))
    FROM
      UNNEST(GENERATE_ARRAY(start_ordinal, start_ordinal + _length - 1)) AS _n ));

/*

Returns a bitmask that can be used to return a subset of an integer representing
a bit array. The start_ordinal argument is an integer specifying the starting
position of the slice, with start_ordinal = 1 indicating the first bit.
The length argument is the number of bits to include in the mask.

The arguments were chosen to match the semantics of the SUBSTR function; see
https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#substr

Examples:

SELECT udf_bitmask_range(1, 1);
1

SELECT udf_bitmask_range(2, 4);
30

-- Taking just the second and third bits (from the right) of binary 11011 should give us 00010 (decimal 2)
SELECT ((1 << 4) + (1 << 3) + (1 << 1) + (1 << 0)) & udf_bitmask_range(2, 2);
2

*/
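A hedged sketch of how `udf_bitmask_range` might combine with the new `_bits` columns for the engagement metrics mentioned in the commit message; this usage is not part of the PR, and it assumes the UDF is pasted into the same script, since BigQuery temp functions are script-scoped:

-- Count how many of the 7 most recent days each client was active by
-- masking the low 7 bits of days_seen_bits and counting the set bits.
SELECT
  client_id,
  BIT_COUNT(days_seen_bits & udf_bitmask_range(1, 7)) AS days_active_last_7
FROM
  clients_last_seen_raw_v1
WHERE
  submission_date = @submission_date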
@@ -0,0 +1,30 @@
CREATE TEMP FUNCTION
  udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));

/*

Returns a 0-based index of the rightmost set bit in the passed bit pattern
or null if no bits are set (bits = 0).

To determine this position, we take a bitwise AND of the bit pattern and
its arithmetic negation (which isolates the lowest set bit), then we determine
the position of that bit via base-2 logarithm;
see https://stackoverflow.com/a/42747608/1260237

Examples:

SELECT udf_bitpos(0);
null

SELECT udf_bitpos(1);
0

SELECT udf_bitpos(2);
1

SELECT udf_bitpos(8);
3

SELECT udf_bitpos(8 + 1);
0

*/