diff --git a/README.md b/README.md index 4b8203dfaf..7b62189a59 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Recommended practices - Should be [incremental] - Should filter input tables on partition and clustering columns - Should use `_` prefix in generated column names not meant for output +- Should use `_bits` suffix for any integer column that represents a bit pattern - Should not use `DATETIME` type, due to incompatiblity with [spark-bigquery-connector] - Should use the earliest row for each `document_id` by `submission_timestamp` diff --git a/sql/clients_last_seen_raw_v1.init.sql b/sql/clients_last_seen_raw_v1.init.sql new file mode 100644 index 0000000000..3a86490a40 --- /dev/null +++ b/sql/clients_last_seen_raw_v1.init.sql @@ -0,0 +1,24 @@ +CREATE TABLE + clients_last_seen_raw_v1 +PARTITION BY + submission_date +CLUSTER BY + sample_id, + client_id AS +SELECT + CAST(NULL AS DATE) AS submission_date, + 0 AS days_seen_bits, + 0 AS days_visited_5_uri_bits, + 0 AS days_opened_dev_tools_bits, + 0 AS days_since_created_profile, + CAST(NULL AS BOOLEAN) ping_seen_within_6_days_of_profile_creation, + -- We make sure to delay * until the end so that as new columns are added + -- to clients_daily, we can add those columns in the same order to the end + -- of this schema, which may be necessary for the daily join query between + -- the two tables to validate. 
+ * EXCEPT (submission_date_s3) +FROM + clients_daily_v6 +WHERE + -- Output empty table and read no input rows + FALSE diff --git a/sql/clients_last_seen_raw_v1.sql b/sql/clients_last_seen_raw_v1.sql new file mode 100644 index 0000000000..7efa8e1d6b --- /dev/null +++ b/sql/clients_last_seen_raw_v1.sql @@ -0,0 +1,74 @@ +-- Equivalent to, but more efficient than, calling udf_bitmask_range(1, 28) +CREATE TEMP FUNCTION bitmask_lowest_28() AS (0x0FFFFFFF); +-- +CREATE TEMP FUNCTION shift_one_day(x INT64) AS (IFNULL((x << 1) & bitmask_lowest_28(), 0)); +-- +CREATE TEMP FUNCTION combine_days(prev INT64, curr INT64) AS (shift_one_day(prev) + IFNULL(curr, 0)); +-- +WITH + _current AS ( + SELECT + -- In this raw table, we capture the history of activity over the past + -- 28 days for each usage criterion as a single 64-bit integer. The + -- rightmost bit represents whether the user was active in the current day. + CAST(TRUE AS INT64) AS days_seen_bits, + -- For measuring Active MAU: a bit pattern of the days on which this + -- client_id was an Active User as defined by + -- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html + CAST(scalar_parent_browser_engagement_total_uri_count_sum >= 5 AS INT64) AS days_visited_5_uri_bits, + CAST(devtools_toolbox_opened_count_sum > 0 AS INT64) AS days_opened_dev_tools_bits, + DATE_DIFF(submission_date_s3, SAFE.PARSE_DATE("%F", SUBSTR(profile_creation_date, 0, 10)), DAY) AS days_since_created_profile, + CAST(NULL AS BOOLEAN) AS ping_seen_within_6_days_of_profile_creation, + * EXCEPT (submission_date_s3) + FROM + clients_daily_v6 + WHERE + submission_date_s3 = @submission_date ), + -- + _previous AS ( + SELECT + * EXCEPT (submission_date) + REPLACE ( + -- Scrub values outside 28 day window. 
+ IF(days_since_created_profile BETWEEN 0 AND 26, days_since_created_profile, NULL) AS days_since_created_profile) + FROM + clients_last_seen_raw_v1 AS cls + WHERE + submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY) + -- Filter out rows from yesterday that have now fallen outside the 28-day window. + AND shift_one_day(days_seen_bits) > 0), + -- + _joined AS ( + SELECT + @submission_date AS submission_date, + IF(_current.client_id IS NOT NULL, + _current, + _previous).* REPLACE ( + combine_days(_previous.days_seen_bits, _current.days_seen_bits) AS days_seen_bits, + combine_days(_previous.days_visited_5_uri_bits, _current.days_visited_5_uri_bits) AS days_visited_5_uri_bits, + combine_days(_previous.days_opened_dev_tools_bits, _current.days_opened_dev_tools_bits) AS days_opened_dev_tools_bits, + -- We want to base new profile creation date on the first profile_creation_date + -- value we observe, so we propagate an existing non-null value in preference + -- to a non-null value on today's observation. + COALESCE(_previous.days_since_created_profile + 1, + _current.days_since_created_profile) AS days_since_created_profile, + -- We only trust profile_creation_date if we see a ping within one week, + -- so we calculate this on day 6 and propagate to subsequent days. + IF(COALESCE(_previous.days_since_created_profile + 1, + _current.days_since_created_profile) = 6, TRUE, _previous.ping_seen_within_6_days_of_profile_creation) AS ping_seen_within_6_days_of_profile_creation) + FROM + _current + FULL JOIN + _previous + USING + -- Include sample_id to match the clustering of the tables, which may improve + -- join performance. + (sample_id, client_id)) + -- +SELECT + * REPLACE ( + -- Null out any fields that may contain data leaked from beyond our 28 day window. 
+ IF(days_since_created_profile BETWEEN 0 AND 27, days_since_created_profile, NULL) AS days_since_created_profile, + IF(days_since_created_profile BETWEEN 0 AND 27, ping_seen_within_6_days_of_profile_creation, NULL) AS ping_seen_within_6_days_of_profile_creation) +FROM + _joined diff --git a/sql/clients_last_seen_v1.init.sql b/sql/clients_last_seen_v1.init.sql deleted file mode 100644 index 9b966a19c2..0000000000 --- a/sql/clients_last_seen_v1.init.sql +++ /dev/null @@ -1,10 +0,0 @@ -SELECT - DATE(NULL) AS submission_date, - * EXCEPT (submission_date_s3), - 0 AS days_since_seen, - NULL AS days_since_visited_5_uri -FROM - clients_daily_v6 -WHERE - -- Output empty table and read no input rows - FALSE diff --git a/sql/clients_last_seen_v1.sql b/sql/clients_last_seen_v1.sql index 24cfec6fcb..9904ef6793 100644 --- a/sql/clients_last_seen_v1.sql +++ b/sql/clients_last_seen_v1.sql @@ -1,44 +1,10 @@ -WITH - _current AS ( - SELECT - * EXCEPT (submission_date_s3, fxa_configured), - 0 AS days_since_seen, - -- For measuring Active MAU, where this is the days since this - -- client_id was an Active User as defined by - -- https://docs.telemetry.mozilla.org/cookbooks/active_dau.html - IF(scalar_parent_browser_engagement_total_uri_count_sum >= 5, - 0, - NULL) AS days_since_visited_5_uri, - fxa_configured - FROM - clients_daily_v6 - WHERE - submission_date_s3 = @submission_date ), - _previous AS ( - SELECT - * EXCEPT (submission_date) REPLACE( - -- omit values outside 28 day window - IF(days_since_visited_5_uri < 27, - days_since_visited_5_uri, - NULL) AS days_since_visited_5_uri) - FROM - clients_last_seen_v1 - WHERE - submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY) - AND clients_last_seen_v1.days_since_seen < 27 ) +CREATE OR REPLACE VIEW + `moz-fx-data-derived-datasets.telemetry.clients_last_seen_v1` AS SELECT - @submission_date AS submission_date, - IF(_current.client_id IS NOT NULL, - _current, - _previous).* EXCEPT (days_since_seen, - days_since_visited_5_uri), - 
COALESCE(_current.days_since_seen, - _previous.days_since_seen + 1) AS days_since_seen, - COALESCE(_current.days_since_visited_5_uri, - _previous.days_since_visited_5_uri + 1) AS days_since_visited_5_uri + -- We cannot use UDFs in a view, so we paste the body of udf_bitpos(bits) literally here. + CAST(SAFE.LOG(days_seen_bits & -days_seen_bits, 2) AS INT64) AS days_since_seen, + CAST(SAFE.LOG(days_visited_5_uri_bits & -days_visited_5_uri_bits, 2) AS INT64) AS days_since_visited_5_uri, + CAST(SAFE.LOG(days_opened_dev_tools_bits & -days_opened_dev_tools_bits, 2) AS INT64) AS days_since_opened_dev_tools, + * FROM - _current -FULL JOIN - _previous -USING - (client_id) + `moz-fx-data-derived-datasets.telemetry.clients_last_seen_raw_v1` diff --git a/tests/clients_last_seen_v1/test_backfill/clients_last_seen_v1.csv b/tests/clients_last_seen_raw_v1/test_backfill/clients_last_seen_raw_v1.csv similarity index 100% rename from tests/clients_last_seen_v1/test_backfill/clients_last_seen_v1.csv rename to tests/clients_last_seen_raw_v1/test_backfill/clients_last_seen_raw_v1.csv diff --git a/tests/clients_last_seen_v1/test_backfill/daily.csv b/tests/clients_last_seen_raw_v1/test_backfill/daily.csv similarity index 100% rename from tests/clients_last_seen_v1/test_backfill/daily.csv rename to tests/clients_last_seen_raw_v1/test_backfill/daily.csv diff --git a/tests/clients_last_seen_v1/test_backfill/expect.csv b/tests/clients_last_seen_raw_v1/test_backfill/expect.csv similarity index 100% rename from tests/clients_last_seen_v1/test_backfill/expect.csv rename to tests/clients_last_seen_raw_v1/test_backfill/expect.csv diff --git a/tests/clients_last_seen_v1/test_empty_input.yaml b/tests/clients_last_seen_raw_v1/test_empty_input.yaml similarity index 100% rename from tests/clients_last_seen_v1/test_empty_input.yaml rename to tests/clients_last_seen_raw_v1/test_empty_input.yaml diff --git a/tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.ndjson 
b/tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.ndjson new file mode 100644 index 0000000000..ca2e7f64a7 --- /dev/null +++ b/tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.ndjson @@ -0,0 +1,2 @@ +{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":2.0,"profile_creation_date":"2018-12-27 00:00:00","attribution":{"source":"test"},"client_id":"b","sample_id":0} +{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"devtools_toolbox_opened_count_sum":0.0,"profile_creation_date":"2018-09-01 00:00:00","attribution":{"source":"test"},"client_id":"c","sample_id":0} diff --git a/tests/clients_last_seen_v1/test_single_day/clients_daily_v6.schema.json b/tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.schema.json similarity index 72% rename from tests/clients_last_seen_v1/test_single_day/clients_daily_v6.schema.json rename to tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.schema.json index e1aee2ec0b..9e00de403b 100644 --- a/tests/clients_last_seen_v1/test_single_day/clients_daily_v6.schema.json +++ b/tests/clients_last_seen_raw_v1/test_single_day/clients_daily_v6.schema.json @@ -10,11 +10,26 @@ "type": "STRING", "mode": "REQUIRED" }, + { + "name": "sample_id", + "type": "INTEGER", + "mode": "REQUIRED" + }, { "name": "active_hours_sum", "type": "FLOAT", "mode": "REQUIRED" }, + { + "name": "devtools_toolbox_opened_count_sum", + "type": "FLOAT", + "mode": "REQUIRED" + }, + { + "name": "profile_creation_date", + "type": "STRING", + "mode": "NULLABLE" + }, { "name": "attribution", "type": "RECORD", diff --git a/tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.ndjson b/tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.ndjson new file mode 100644 index 0000000000..464efbf599 --- /dev/null +++ b/tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.ndjson @@ -0,0 +1,3 @@ 
+{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"a","sample_id":0,"days_seen_bits":3,"days_opened_dev_tools_bits":1} +{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","sample_id":0,"days_seen_bits":0} +{"submission_date":"2019-01-01","active_hours_sum":0.0,"devtools_toolbox_opened_count_sum":2.0,"attribution":{"source":"prev"},"client_id":"d","sample_id":0,"days_seen_bits":0} diff --git a/tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.schema.json b/tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.schema.json similarity index 54% rename from tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.schema.json rename to tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.schema.json index cb093ffbfe..d16d25c4eb 100644 --- a/tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.schema.json +++ b/tests/clients_last_seen_raw_v1/test_single_day/clients_last_seen_raw_v1.schema.json @@ -1,4 +1,29 @@ [ + { + "name": "days_seen_bits", + "type": "INT64", + "mode": "REQUIRED" + }, + { + "name": "days_visited_5_uri_bits", + "type": "INT64", + "mode": "NULLABLE" + }, + { + "name": "days_opened_dev_tools_bits", + "type": "INT64", + "mode": "NULLABLE" + }, + { + "name": "days_since_created_profile", + "type": "INT64", + "mode": "NULLABLE" + }, + { + "name": "ping_seen_within_6_days_of_profile_creation", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "submission_date", "type": "DATE", @@ -9,11 +34,26 @@ "type": "STRING", "mode": "REQUIRED" }, + { + "name": "sample_id", + "type": "INTEGER", + "mode": "REQUIRED" + }, { "name": "active_hours_sum", "type": "FLOAT", "mode": "REQUIRED" }, + { + "name": "devtools_toolbox_opened_count_sum", + "type": "FLOAT", + "mode": "REQUIRED" + }, + { + "name": 
"profile_creation_date", + "type": "STRING", + "mode": "NULLABLE" + }, { "name": "attribution", "type": "RECORD", @@ -31,16 +71,6 @@ "type": "INT64", "mode": "NULLABLE" }, - { - "name": "days_since_seen", - "type": "INT64", - "mode": "REQUIRED" - }, - { - "name": "days_since_visited_5_uri", - "type": "INT64", - "mode": "NULLABLE" - }, { "name": "fxa_configured", "type": "BOOLEAN", diff --git a/tests/clients_last_seen_raw_v1/test_single_day/expect.ndjson b/tests/clients_last_seen_raw_v1/test_single_day/expect.ndjson new file mode 100644 index 0000000000..32f6d0c597 --- /dev/null +++ b/tests/clients_last_seen_raw_v1/test_single_day/expect.ndjson @@ -0,0 +1,3 @@ +{"submission_date": "2019-01-02", "days_seen_bits": 6, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 2, "client_id": "a", "sample_id": 0, "active_hours_sum": 0.0, "devtools_toolbox_opened_count_sum": 2.0, "attribution": {"source": "prev"}} +{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 1, "days_since_created_profile": 6, "ping_seen_within_6_days_of_profile_creation": true, "client_id": "b", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 2.0, "profile_creation_date": "2018-12-27 00:00:00", "attribution": {"source": "test"}} +{"submission_date": "2019-01-02", "days_seen_bits": 1, "days_visited_5_uri_bits": 0, "days_opened_dev_tools_bits": 0, "client_id": "c", "sample_id": 0, "active_hours_sum": 1.0, "devtools_toolbox_opened_count_sum": 0.0, "profile_creation_date": "2018-09-01 00:00:00", "attribution": {"source": "test"}} diff --git a/tests/clients_last_seen_v1/test_single_day/query_params.yaml b/tests/clients_last_seen_raw_v1/test_single_day/query_params.yaml similarity index 100% rename from tests/clients_last_seen_v1/test_single_day/query_params.yaml rename to tests/clients_last_seen_raw_v1/test_single_day/query_params.yaml diff --git 
a/tests/clients_last_seen_v1/test_single_day/clients_daily_v6.ndjson b/tests/clients_last_seen_v1/test_single_day/clients_daily_v6.ndjson deleted file mode 100644 index 02bfeaa181..0000000000 --- a/tests/clients_last_seen_v1/test_single_day/clients_daily_v6.ndjson +++ /dev/null @@ -1,2 +0,0 @@ -{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"b"} -{"submission_date_s3":"2019-01-02","active_hours_sum":1.0,"attribution":{"source":"test"},"client_id":"c"} diff --git a/tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.ndjson b/tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.ndjson deleted file mode 100644 index d834d53943..0000000000 --- a/tests/clients_last_seen_v1/test_single_day/clients_last_seen_v1.ndjson +++ /dev/null @@ -1,2 +0,0 @@ -{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"a","days_since_seen":0} -{"submission_date":"2019-01-01","active_hours_sum":0.0,"attribution":{"source":"prev"},"client_id":"b","days_since_seen":0} diff --git a/tests/clients_last_seen_v1/test_single_day/expect.ndjson b/tests/clients_last_seen_v1/test_single_day/expect.ndjson deleted file mode 100644 index 0079fc4f4f..0000000000 --- a/tests/clients_last_seen_v1/test_single_day/expect.ndjson +++ /dev/null @@ -1,3 +0,0 @@ -{"submission_date":"2019-01-02","client_id":"a","active_hours_sum":0.0,"attribution":{"source":"prev"},"days_since_seen":1} -{"submission_date":"2019-01-02","client_id":"b","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0} -{"submission_date":"2019-01-02","client_id":"c","active_hours_sum":1.0,"attribution":{"source":"test"},"days_since_seen":0} diff --git a/tests/test_generated.py b/tests/test_generated.py index a7a69ff714..4132fb28e6 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -3,7 +3,7 @@ # file, you can obtain one at http://mozilla.org/MPL/2.0/. 
"""Automatically generated tests.""" -from google.api_core.exceptions import NotFound +from google.api_core.exceptions import BadRequest, NotFound from google.cloud import bigquery from .util import coerce_result, generate_tests @@ -58,7 +58,11 @@ def tables(bq, dataset, generated_test): break # stop because there can only be one time partitioning field with open(table.source_path, "rb") as file_obj: job = bq.load_table_from_file(file_obj, destination, job_config=job_config) - job.result() + try: + job.result() + except BadRequest: + print(job.errors) + raise # clean up handled by default_dataset fixture @@ -76,5 +80,6 @@ def test_generated(bq, dataset, generated_test): job = bq.query(generated_test.modified_query, job_config=job_config) result = list(coerce_result(*job.result())) result.sort(key=lambda row: json.dumps(row)) + generated_test.expect.sort(key=lambda row: json.dumps(row)) assert generated_test.expect == result diff --git a/udf/udf_bitmask_range.sql b/udf/udf_bitmask_range.sql new file mode 100644 index 0000000000..09e74ed2d8 --- /dev/null +++ b/udf/udf_bitmask_range.sql @@ -0,0 +1,31 @@ +CREATE TEMP FUNCTION + udf_bitmask_range( start_ordinal INT64, + _length INT64) AS (( + SELECT + SUM(1 << (_n - 1)) + FROM + UNNEST(GENERATE_ARRAY(start_ordinal, start_ordinal + _length - 1)) AS _n )); + +/* + +Returns a bitmask that can be used to return a subset of an integer representing +a bit array. The start_ordinal argument is an integer specifying the starting +position of the slice, with start_ordinal = 1 indicating the first bit. +The length argument is the number of bits to include in the mask. 
+ +The arguments were chosen to match the semantics of the SUBSTR function; see +https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#substr + +Examples: + +SELECT udf_bitmask_range(1, 1); +1 + +SELECT udf_bitmask_range(2, 4); +30 + +-- Taking just the second and third bits (from the right) of binary 11011 should give us 00010 (decimal 2) +SELECT ((1 << 4) + (1 << 3) + (1 << 1) + (1 << 0)) & udf_bitmask_range(2, 2); +2 + +*/ diff --git a/udf/udf_bitpos.sql b/udf/udf_bitpos.sql new file mode 100644 index 0000000000..9939e73898 --- /dev/null +++ b/udf/udf_bitpos.sql @@ -0,0 +1,30 @@ +CREATE TEMP FUNCTION + udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64)); + +/* + +Returns a 0-based index of the rightmost set bit in the passed bit pattern +or null if no bits are set (bits = 0). + +To determine this position, we take a bitwise AND of the bit pattern and +its two's complement negation (-bits), then we determine the position of the bit via base-2 logarithm; +see https://stackoverflow.com/a/42747608/1260237 + +Examples: + +SELECT udf_bitpos(0); +null + +SELECT udf_bitpos(1); +0 + +SELECT udf_bitpos(2); +1 + +SELECT udf_bitpos(8); +3 + +SELECT udf_bitpos(8 + 1); +0 + +*/