* Add new UDFs for BYTE column/day_seen

Rename bitpos to align with the new convention.

* Add search_rfm dataset for LTV

* Move RFM calculations to UDF

* Address review feedback

* Fully escape UDFs

* Fix _actual_ missing UDF

* Don't dryrun; access denied
This commit is contained in:
Frank Bertsch 2019-12-19 18:01:44 -05:00 коммит произвёл GitHub
Родитель 65053ad5e1
Коммит de80cfd652
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 181 добавлений и 16 удалений

Просмотреть файл

@ -38,6 +38,7 @@ SKIP = {
"sql/telemetry/fxa_oauth_events_v1/query.sql",
"sql/search_derived/search_clients_last_seen_v1/init.sql",
"sql/search_derived/search_clients_last_seen_v1/query.sql",
"sql/search/search_rfm/view.sql",
# Already exists (and lacks an "OR REPLACE" clause)
"sql/org_mozilla_fenix_derived/clients_last_seen_v1/init.sql",
"sql/telemetry_derived/core_clients_last_seen_v1/init.sql",

Просмотреть файл

@ -0,0 +1,19 @@
-- RFM-style summary of search activity: one output row per row of
-- search_clients_last_seen_v1. Each *_bytes activity column is converted by
-- udf.days_seen_bytes_to_rfm and exposed under its name without the _bytes
-- suffix; all remaining source columns pass through unchanged.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.search.search_rfm`
AS
SELECT
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_seen_bytes
  ) AS days_seen,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_searched_bytes
  ) AS days_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_tagged_searched_bytes
  ) AS days_tagged_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_searched_with_ads_bytes
  ) AS days_searched_with_ads,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_clicked_ads_bytes
  ) AS days_clicked_ads,
  -- Profile creation is reduced to a single day count, not an RFM value.
  `moz-fx-data-shared-prod.udf.bits_to_days_since_first_seen`(
    clients_last_seen.days_created_profile_bytes
  ) AS days_since_created_profile,
  -- Drop the raw byte columns that were converted above.
  clients_last_seen.* EXCEPT (
    days_seen_bytes,
    days_searched_bytes,
    days_tagged_searched_bytes,
    days_searched_with_ads_bytes,
    days_clicked_ads_bytes,
    days_created_profile_bytes
  )
FROM
  `moz-fx-data-shared-prod.search_derived.search_clients_last_seen_v1` AS clients_last_seen

Просмотреть файл

@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
BIT_COUNT(x & udf_bitmask_lowest_7())
);
CREATE TEMP FUNCTION
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
CREATE TEMP FUNCTION
udf_smoot_usage_from_28_bits(
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
unnested AS (
SELECT
days_active_bits AS bits,
udf_bitpos(days_created_profile_bits) AS dnp,
udf_bitpos(days_active_bits) AS days_since_active,
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
FROM
UNNEST(bit_arrays) )

Просмотреть файл

@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
BIT_COUNT(x & udf_bitmask_lowest_7())
);
CREATE TEMP FUNCTION
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
CREATE TEMP FUNCTION
udf_smoot_usage_from_28_bits(
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
unnested AS (
SELECT
days_active_bits AS bits,
udf_bitpos(days_created_profile_bits) AS dnp,
udf_bitpos(days_active_bits) AS days_since_active,
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
FROM
UNNEST(bit_arrays) )

Просмотреть файл

@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
BIT_COUNT(x & udf_bitmask_lowest_7())
);
CREATE TEMP FUNCTION
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
CREATE TEMP FUNCTION
udf_smoot_usage_from_28_bits(
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
unnested AS (
SELECT
days_active_bits AS bits,
udf_bitpos(days_created_profile_bits) AS dnp,
udf_bitpos(days_active_bits) AS days_since_active,
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
FROM
UNNEST(bit_arrays) )

Просмотреть файл

@ -0,0 +1,19 @@
-- RFM-style summary of search activity: one output row per row of
-- search_clients_last_seen_v1. Each *_bytes activity column is converted by
-- udf.days_seen_bytes_to_rfm and exposed under its name without the _bytes
-- suffix; all remaining source columns pass through unchanged.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.search.search_rfm`
AS
SELECT
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_seen_bytes
  ) AS days_seen,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_searched_bytes
  ) AS days_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_tagged_searched_bytes
  ) AS days_tagged_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_searched_with_ads_bytes
  ) AS days_searched_with_ads,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(
    clients_last_seen.days_clicked_ads_bytes
  ) AS days_clicked_ads,
  -- Profile creation is reduced to a single day count, not an RFM value.
  `moz-fx-data-shared-prod.udf.bits_to_days_since_first_seen`(
    clients_last_seen.days_created_profile_bytes
  ) AS days_since_created_profile,
  -- Drop the raw byte columns that were converted above.
  clients_last_seen.* EXCEPT (
    days_seen_bytes,
    days_searched_bytes,
    days_tagged_searched_bytes,
    days_searched_with_ads_bytes,
    days_clicked_ads_bytes,
    days_created_profile_bytes
  )
FROM
  `moz-fx-data-shared-prod.search_derived.search_clients_last_seen_v1` AS clients_last_seen

Просмотреть файл

@ -0,0 +1,9 @@
/*
Count the days on which the user was seen.

The input is a BYTES bit pattern; the result is the number of set bits
across the whole value.
A NULL input yields a NULL result.
*/
CREATE TEMP FUNCTION udf_bits_to_days_seen(b BYTES) AS (
  BIT_COUNT(b)
);

Просмотреть файл

@ -0,0 +1,42 @@
/*
Given a BYTES, return the number of days since the
client was first seen.

Day 0 is the least significant bit of the last byte, so the result is the
position of the most significant set bit in the whole value.

If no bits are set, returns NULL, indicating we don't know.
Otherwise the result is 0-indexed, meaning that for \x01,
it will return 0.

Results showed this being between 5-10x faster than the simpler alternative:
CREATE TEMP FUNCTION
  udf_bits_to_days_since_first_seen(b BYTES) AS ((
    SELECT MAX(n)
    FROM UNNEST(GENERATE_ARRAY(0, 8 * BYTE_LENGTH(b))) AS n
    WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b'\x01') > 0));

See also: bits_to_days_since_seen.sql
*/
CREATE TEMP FUNCTION
  udf_bits_to_days_since_first_seen(b BYTES) AS ((
    WITH leading AS (
      -- Extract the leading 0 bytes and first set byte.
      -- Trimming forces NULL for bytes with no set bits.
      -- The (?s) flag makes "." match every byte, including newline (\x0A);
      -- without it a first set byte of \x0A is not matched and the
      -- function wrongly returns NULL.
      SELECT REGEXP_EXTRACT(RTRIM(b, b'\x00'), CAST('(?s)(^\x00*.)' AS BYTES)) AS head
    )
    SELECT
      -- The remaining bytes in b, after head, are all days after first seen
      (8 * (BYTE_LENGTH(b) - BYTE_LENGTH(head)))
      -- Add the position of the leading set bit in the final byte of head,
      -- for the additional days within that byte
      + udf_pos_of_leading_set_bit(TO_CODE_POINTS(SUBSTR(head, -1, 1))[OFFSET(0)])
    FROM leading
  ));
-- Tests
SELECT
  assert_equals(0, udf_bits_to_days_since_first_seen(b'\x00\x01')),
  assert_equals(0, udf_bits_to_days_since_first_seen(b'\x00\x00\x00\x01')),
  assert_equals(8, udf_bits_to_days_since_first_seen(b'\x01\x00')),
  assert_equals(NULL, udf_bits_to_days_since_first_seen(b'\x00\x00')),
  assert_equals(1, udf_bits_to_days_since_first_seen(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03')),
  assert_equals(79, udf_bits_to_days_since_first_seen(b'\xF0\x00\x00\x00\x00\x00\x00\x00\x00\x00')),
  -- Regression: the first set byte is the newline byte \x0A
  assert_equals(3, udf_bits_to_days_since_first_seen(b'\x0A')),
  assert_equals(11, udf_bits_to_days_since_first_seen(b'\x0A\x01')),
  assert_equals(11, udf_bits_to_days_since_first_seen(b'\x00\x00\x0A\x01'));

Просмотреть файл

@ -0,0 +1,41 @@
/*
Given a BYTES, return the number of days since the client was
last seen.

Day 0 is the least significant bit of the last byte, so the result is the
position of the least significant set bit in the whole value.

If no bits are set, returns NULL, indicating we don't know.
Otherwise the results are 0-indexed, meaning \x01 will return 0.

Tests showed this being 5-10x faster than the simpler alternative:
CREATE TEMP FUNCTION
  udf_bits_to_days_since_seen(b BYTES) AS ((
    SELECT MIN(n)
    FROM UNNEST(GENERATE_ARRAY(0, 364)) AS n
    WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b'\x01') > 0));

See also: bits_to_days_since_first_seen.sql
*/
CREATE TEMP FUNCTION
  udf_bits_to_days_since_seen(b BYTES) AS ((
    WITH trailing AS (
      -- Extract the last set byte together with the zero bytes that follow it.
      -- LTRIM forces NULL for bytes with no set bits.
      -- The (?s) flag makes "." match every byte, including newline (\x0A);
      -- without it a last set byte of \x0A is not matched and the
      -- function wrongly returns NULL.
      SELECT REGEXP_EXTRACT(LTRIM(b, b'\x00'), CAST('(?s)(.\x00*$)' AS BYTES)) AS tail
    )
    SELECT
      -- Every trailing zero byte accounts for 8 days without activity
      (8 * (BYTE_LENGTH(tail) - 1))
      -- Add the position of the trailing set bit within the last set byte
      + udf_pos_of_trailing_set_bit(TO_CODE_POINTS(SUBSTR(tail, 1, 1))[OFFSET(0)])
    FROM trailing
  ));
-- Tests
SELECT
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x01')),
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x00\x00\x01')),
  assert_equals(8, udf_bits_to_days_since_seen(b'\x01\x00')),
  assert_equals(NULL, udf_bits_to_days_since_seen(b'\x00\x00')),
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03')),
  assert_equals(76, udf_bits_to_days_since_seen(b'\xF0\x00\x00\x00\x00\x00\x00\x00\x00\x00')),
  -- Regression: the last set byte is the newline byte \x0A
  assert_equals(1, udf_bits_to_days_since_seen(b'\x0A')),
  assert_equals(9, udf_bits_to_days_since_seen(b'\x0A\x00')),
  assert_equals(1, udf_bits_to_days_since_seen(b'\x01\x0A'));

Просмотреть файл

@ -0,0 +1,17 @@
/*
Summarise a days-seen BYTES bit pattern as a struct with three fields:
  frequency - udf_bits_to_days_seen of the input
  T         - udf_bits_to_days_since_first_seen of the input
  recency   - T minus udf_bits_to_days_since_seen of the input
*/
CREATE TEMP FUNCTION
  udf_days_seen_bytes_to_rfm(days_seen_bytes BYTES) AS ((
    SELECT AS STRUCT
      udf_bits_to_days_seen(days_seen_bytes) AS frequency,
      days_since_first_seen AS T,
      days_since_first_seen - udf_bits_to_days_since_seen(days_seen_bytes) AS recency
    FROM (
      -- Named once here because both T and recency are derived from it.
      SELECT udf_bits_to_days_since_first_seen(days_seen_bytes) AS days_since_first_seen
    )
  ));
-- Tests
SELECT
  assert_equals(
    STRUCT(2 AS frequency, 4 AS T, 2 AS recency),
    udf_days_seen_bytes_to_rfm(b'\x14')
  )

Просмотреть файл

@ -0,0 +1,17 @@
/*
Returns the 0-based position of the leading (most significant) set bit,
counted from the least significant bit; e.g. 1 -> 0, 2 -> 1, 3 -> 1.

No set bits (i = 0) returns NULL. SAFE.LOG yields NULL rather than an error
where the logarithm is undefined, so the result is also NULL in that case.

NOTE(review): the computation goes through floating-point LOG. The caller in
bits_to_days_since_first_seen only passes single-byte values (0-255);
confirm precision before relying on this for large INT64 inputs.
*/
CREATE TEMP FUNCTION
  udf_pos_of_leading_set_bit(i INT64) AS (
    -- CEIL(log2(i + 1)) is the number of bits needed to represent i, so
    -- subtracting 1 gives the index of the highest set bit. For i = 0 this
    -- evaluates to -1, which NULLIF converts to NULL.
    NULLIF(CAST(CEIL(SAFE.LOG(i + 1, 2)) AS INT64) - 1, -1)
  );
-- Tests (expected value first, matching the other UDF tests)
SELECT
  assert_null(udf_pos_of_leading_set_bit(0)),
  assert_equals(0, udf_pos_of_leading_set_bit(1)),
  assert_equals(1, udf_pos_of_leading_set_bit(2)),
  assert_equals(1, udf_pos_of_leading_set_bit(3)),
  assert_equals(2, udf_pos_of_leading_set_bit(4)),
  assert_equals(7, udf_pos_of_leading_set_bit(128)),
  assert_equals(7, udf_pos_of_leading_set_bit(255));

Просмотреть файл

@ -10,12 +10,12 @@ see https://stackoverflow.com/a/42747608/1260237
*/
CREATE TEMP FUNCTION
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
-- Tests
SELECT
assert_null(udf_bitpos(0)),
assert_equals(0, udf_bitpos(1)),
assert_equals(3, udf_bitpos(8)),
assert_equals(0, udf_bitpos(8 + 1))
assert_null(udf_pos_of_trailing_set_bit(0)),
assert_equals(0, udf_pos_of_trailing_set_bit(1)),
assert_equals(3, udf_pos_of_trailing_set_bit(8)),
assert_equals(0, udf_pos_of_trailing_set_bit(8 + 1))

Просмотреть файл

@ -13,8 +13,8 @@ CREATE TEMP FUNCTION
unnested AS (
SELECT
days_active_bits AS bits,
udf_bitpos(days_created_profile_bits) AS dnp,
udf_bitpos(days_active_bits) AS days_since_active,
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
FROM
UNNEST(bit_arrays) )