RFM View for LTV (#611)
* Add new UDFs for BYTE column/day_seen Rename bitpos to align with the new convention. * Add search_rfm dataset for LTV * Move RFM calculations to UDF * Address review feedback * Fully escape UDFs * Fix _actual_ missing UDF * Don't dryrun; access denied
This commit is contained in:
Родитель
65053ad5e1
Коммит
de80cfd652
|
@ -38,6 +38,7 @@ SKIP = {
|
|||
"sql/telemetry/fxa_oauth_events_v1/query.sql",
|
||||
"sql/search_derived/search_clients_last_seen_v1/init.sql",
|
||||
"sql/search_derived/search_clients_last_seen_v1/query.sql",
|
||||
"sql/search/search_rfm/view.sql",
|
||||
# Already exists (and lacks an "OR REPLACE" clause)
|
||||
"sql/org_mozilla_fenix_derived/clients_last_seen_v1/init.sql",
|
||||
"sql/telemetry_derived/core_clients_last_seen_v1/init.sql",
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
-- View exposing RFM-style summaries of the search clients-last-seen table.
-- Each activity *_bytes column is passed through udf.days_seen_bytes_to_rfm and
-- exposed under the same name without the _bytes suffix; every other source
-- column is passed through unchanged.
CREATE OR REPLACE VIEW `moz-fx-data-shared-prod.search.search_rfm` AS
SELECT
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_seen_bytes) AS days_seen,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_searched_bytes) AS days_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_tagged_searched_bytes) AS days_tagged_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_searched_with_ads_bytes) AS days_searched_with_ads,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_clicked_ads_bytes) AS days_clicked_ads,
  -- Profile creation is reduced to a single day offset rather than an RFM value.
  `moz-fx-data-shared-prod.udf.bits_to_days_since_first_seen`(days_created_profile_bytes) AS days_since_created_profile,
  -- Pass through the remaining columns; the raw byte columns consumed above are dropped.
  * EXCEPT (
    days_seen_bytes,
    days_searched_bytes,
    days_tagged_searched_bytes,
    days_searched_with_ads_bytes,
    days_clicked_ads_bytes,
    days_created_profile_bytes
  )
FROM
  `moz-fx-data-shared-prod.search_derived.search_clients_last_seen_v1`
|
|
@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
|
|||
BIT_COUNT(x & udf_bitmask_lowest_7())
|
||||
);
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
CREATE TEMP FUNCTION
|
||||
udf_smoot_usage_from_28_bits(
|
||||
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
|
||||
|
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
|
|||
unnested AS (
|
||||
SELECT
|
||||
days_active_bits AS bits,
|
||||
udf_bitpos(days_created_profile_bits) AS dnp,
|
||||
udf_bitpos(days_active_bits) AS days_since_active,
|
||||
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
|
||||
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
|
||||
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
|
||||
FROM
|
||||
UNNEST(bit_arrays) )
|
||||
|
|
|
@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
|
|||
BIT_COUNT(x & udf_bitmask_lowest_7())
|
||||
);
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
CREATE TEMP FUNCTION
|
||||
udf_smoot_usage_from_28_bits(
|
||||
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
|
||||
|
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
|
|||
unnested AS (
|
||||
SELECT
|
||||
days_active_bits AS bits,
|
||||
udf_bitpos(days_created_profile_bits) AS dnp,
|
||||
udf_bitpos(days_active_bits) AS days_since_active,
|
||||
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
|
||||
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
|
||||
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
|
||||
FROM
|
||||
UNNEST(bit_arrays) )
|
||||
|
|
|
@ -11,7 +11,7 @@ CREATE TEMP FUNCTION
|
|||
BIT_COUNT(x & udf_bitmask_lowest_7())
|
||||
);
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
CREATE TEMP FUNCTION
|
||||
udf_smoot_usage_from_28_bits(
|
||||
bit_arrays ARRAY<STRUCT<days_created_profile_bits INT64, days_active_bits INT64>>)
|
||||
|
@ -20,8 +20,8 @@ CREATE TEMP FUNCTION
|
|||
unnested AS (
|
||||
SELECT
|
||||
days_active_bits AS bits,
|
||||
udf_bitpos(days_created_profile_bits) AS dnp,
|
||||
udf_bitpos(days_active_bits) AS days_since_active,
|
||||
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
|
||||
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
|
||||
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
|
||||
FROM
|
||||
UNNEST(bit_arrays) )
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
-- View exposing RFM-style summaries of the search clients-last-seen table.
-- Each activity *_bytes column is passed through udf.days_seen_bytes_to_rfm and
-- exposed under the same name without the _bytes suffix; every other source
-- column is passed through unchanged.
CREATE OR REPLACE VIEW `moz-fx-data-shared-prod.search.search_rfm` AS
SELECT
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_seen_bytes) AS days_seen,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_searched_bytes) AS days_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_tagged_searched_bytes) AS days_tagged_searched,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_searched_with_ads_bytes) AS days_searched_with_ads,
  `moz-fx-data-shared-prod.udf.days_seen_bytes_to_rfm`(days_clicked_ads_bytes) AS days_clicked_ads,
  -- Profile creation is reduced to a single day offset rather than an RFM value.
  `moz-fx-data-shared-prod.udf.bits_to_days_since_first_seen`(days_created_profile_bytes) AS days_since_created_profile,
  -- Pass through the remaining columns; the raw byte columns consumed above are dropped.
  * EXCEPT (
    days_seen_bytes,
    days_searched_bytes,
    days_tagged_searched_bytes,
    days_searched_with_ads_bytes,
    days_clicked_ads_bytes,
    days_created_profile_bytes
  )
FROM
  `moz-fx-data-shared-prod.search_derived.search_clients_last_seen_v1`
|
|
@ -0,0 +1,9 @@
|
|||
/*
Given a BYTES value, return the number of set bits in it, i.e. the number of
days the user was seen.

NULL input returns NULL output.
*/
CREATE TEMP FUNCTION udf_bits_to_days_seen(b BYTES) AS (
  BIT_COUNT(b)
);
|
|
@ -0,0 +1,42 @@
|
|||
/*
|
||||
Given a BYTES, return the number of days since the
|
||||
client was first seen.
|
||||
|
||||
If no bits are set, returns NULL, indicating we don't know.
|
||||
Otherwise the result is 0-indexed, meaning that for \x01,
|
||||
it will return 0.
|
||||
|
||||
Results showed this being between 5-10x faster than the simpler alternative:
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bits_to_days_since_first_seen(b BYTES) AS ((
|
||||
SELECT MAX(n)
|
||||
FROM UNNEST(GENERATE_ARRAY(0, 8 * BYTE_LENGTH(b))) AS n
|
||||
WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b'\x01') > 0));
|
||||
|
||||
See also: bits_to_days_since_seen.sql
|
||||
*/
|
||||
|
||||
-- Returns the 0-based position of the highest set bit in b, read as one
-- big-endian bit string: 8 bits for every byte that follows the first set
-- byte, plus the position of the leading set bit inside that byte.
-- Returns NULL when b is NULL or has no set bits.
CREATE TEMP FUNCTION udf_bits_to_days_since_first_seen(b BYTES) AS (
  (
    WITH leading AS (
      -- Extract the leading 0 bytes and the first set byte.
      -- Trimming the trailing 0 bytes leaves an empty value when no bit is
      -- set, so the regex finds no match and head (and the result) is NULL.
      -- The (?s) flag makes `.` match every byte value; without it `.` does
      -- not match the newline byte \x0A, so a first set byte of \x0A
      -- produced NULL instead of a position.
      SELECT
        REGEXP_EXTRACT(RTRIM(b, b'\x00'), CAST('(?s)(^\x00*.)' AS BYTES)) AS head
    )
    SELECT
      -- The remaining bytes in b, after head, are all days after first seen
      (8 * (BYTE_LENGTH(b) - BYTE_LENGTH(head)))
      -- Add the position of the leading set bit in the final byte of head
      + udf_pos_of_leading_set_bit(TO_CODE_POINTS(SUBSTR(head, -1, 1))[OFFSET(0)])
    FROM
      leading
  )
);

-- Tests
SELECT
  assert_equals(0, udf_bits_to_days_since_first_seen(b'\x00\x01')),
  assert_equals(0, udf_bits_to_days_since_first_seen(b'\x00\x00\x00\x01')),
  assert_equals(8, udf_bits_to_days_since_first_seen(b'\x01\x00')),
  assert_equals(NULL, udf_bits_to_days_since_first_seen(b'\x00\x00')),
  assert_equals(1, udf_bits_to_days_since_first_seen(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03')),
  assert_equals(79, udf_bits_to_days_since_first_seen(b'\xF0\x00\x00\x00\x00\x00\x00\x00\x00\x00')),
  -- Regression: the first set byte is the newline byte \x0A (bits 00001010)
  assert_equals(3, udf_bits_to_days_since_first_seen(b'\x0A')),
  assert_equals(11, udf_bits_to_days_since_first_seen(b'\x0A\x00'));
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
Given a BYTES, return the number of days since the client was
|
||||
last seen.
|
||||
|
||||
If no bits are set, returns NULL, indicating we don't know.
|
||||
Otherwise the results are 0-indexed, meaning \x01 will return 0.
|
||||
|
||||
Tests showed this being 5-10x faster than the simpler alternative:
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bits_to_days_since_seen(b BYTES) AS ((
|
||||
SELECT MIN(n)
|
||||
FROM UNNEST(GENERATE_ARRAY(0, 364)) AS n
|
||||
WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b'\x01') > 0));
|
||||
|
||||
See also: bits_to_days_since_first_seen.sql
|
||||
*/
|
||||
|
||||
-- Returns the 0-based position of the lowest set bit in b, read as one
-- big-endian bit string: 8 bits for every trailing 0 byte, plus the position
-- of the trailing set bit inside the last set byte.
-- Returns NULL when b is NULL or has no set bits.
CREATE TEMP FUNCTION udf_bits_to_days_since_seen(b BYTES) AS (
  (
    WITH trailing AS (
      -- Extract the last set byte together with the trailing 0 bytes.
      -- LTRIM leaves an empty value when no bit is set, so the regex finds
      -- no match and tail (and the result) is NULL.
      -- The (?s) flag makes `.` match every byte value; without it `.` does
      -- not match the newline byte \x0A, so a last set byte of \x0A was
      -- skipped and the function returned NULL.
      SELECT
        REGEXP_EXTRACT(LTRIM(b, b'\x00'), CAST('(?s)(.\x00*$)' AS BYTES)) AS tail
    )
    SELECT
      -- Sum all trailing zeroes
      (8 * (BYTE_LENGTH(tail) - 1))
      -- Add the position of the trailing set bit in the first byte of tail
      + udf_pos_of_trailing_set_bit(TO_CODE_POINTS(SUBSTR(tail, 1, 1))[OFFSET(0)])
    FROM
      trailing
  )
);

-- Tests
SELECT
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x01')),
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x00\x00\x01')),
  assert_equals(8, udf_bits_to_days_since_seen(b'\x01\x00')),
  assert_equals(NULL, udf_bits_to_days_since_seen(b'\x00\x00')),
  assert_equals(0, udf_bits_to_days_since_seen(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03')),
  assert_equals(76, udf_bits_to_days_since_seen(b'\xF0\x00\x00\x00\x00\x00\x00\x00\x00\x00')),
  -- Regression: the last set byte is the newline byte \x0A (bits 00001010)
  assert_equals(1, udf_bits_to_days_since_seen(b'\x0A')),
  assert_equals(9, udf_bits_to_days_since_seen(b'\x0A\x00'));
|
|
@ -0,0 +1,17 @@
|
|||
/*
Return a STRUCT of frequency, T, and recency computed from a BYTES value.

  frequency: result of udf_bits_to_days_seen
  T:         result of udf_bits_to_days_since_first_seen
  recency:   T minus the result of udf_bits_to_days_since_seen
*/
CREATE TEMP FUNCTION udf_days_seen_bytes_to_rfm(days_seen_bytes BYTES) AS (
  STRUCT(
    udf_bits_to_days_seen(days_seen_bytes) AS frequency,
    udf_bits_to_days_since_first_seen(days_seen_bytes) AS T,
    udf_bits_to_days_since_first_seen(days_seen_bytes) - udf_bits_to_days_since_seen(days_seen_bytes) AS recency
  )
);

-- Tests
SELECT
  assert_equals(
    STRUCT(2 AS frequency, 4 AS T, 2 AS recency),
    udf_days_seen_bytes_to_rfm(b'\x14')
  )
|
|
@ -0,0 +1,17 @@
|
|||
/*
Returns the 0-based index of the leading (most significant) set bit.

For example 1 -> 0, 2 -> 1, 3 -> 1, 128 -> 7.

No set bits returns NULL.
*/
CREATE TEMP FUNCTION udf_pos_of_leading_set_bit(i INT64) AS (
  -- CEIL(LOG(i + 1, 2)) is the number of bits needed to represent i, so
  -- subtracting 1 gives the index of the highest set bit. For i = 0 this
  -- evaluates to -1, which NULLIF turns into NULL.
  -- NOTE(review): this relies on LOG being exact at powers of two; confirm
  -- before using it on values wider than a single byte.
  NULLIF(CAST(CEIL(SAFE.LOG(i + 1, 2)) AS INT64) - 1, -1)
);

-- Tests (expected value first, matching the other UDF test files)
SELECT
  assert_equals(NULL, udf_pos_of_leading_set_bit(0)),
  assert_equals(0, udf_pos_of_leading_set_bit(1)),
  assert_equals(1, udf_pos_of_leading_set_bit(2)),
  assert_equals(1, udf_pos_of_leading_set_bit(3)),
  assert_equals(7, udf_pos_of_leading_set_bit(128)),
  assert_equals(7, udf_pos_of_leading_set_bit(255));
|
|
@ -10,12 +10,12 @@ see https://stackoverflow.com/a/42747608/1260237
|
|||
*/
|
||||
|
||||
CREATE TEMP FUNCTION
|
||||
udf_bitpos( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
udf_pos_of_trailing_set_bit( bits INT64 ) AS ( CAST(SAFE.LOG(bits & -bits, 2) AS INT64));
|
||||
|
||||
-- Tests
|
||||
|
||||
SELECT
|
||||
assert_null(udf_bitpos(0)),
|
||||
assert_equals(0, udf_bitpos(1)),
|
||||
assert_equals(3, udf_bitpos(8)),
|
||||
assert_equals(0, udf_bitpos(8 + 1))
|
||||
assert_null(udf_pos_of_trailing_set_bit(0)),
|
||||
assert_equals(0, udf_pos_of_trailing_set_bit(1)),
|
||||
assert_equals(3, udf_pos_of_trailing_set_bit(8)),
|
||||
assert_equals(0, udf_pos_of_trailing_set_bit(8 + 1))
|
|
@ -13,8 +13,8 @@ CREATE TEMP FUNCTION
|
|||
unnested AS (
|
||||
SELECT
|
||||
days_active_bits AS bits,
|
||||
udf_bitpos(days_created_profile_bits) AS dnp,
|
||||
udf_bitpos(days_active_bits) AS days_since_active,
|
||||
udf_pos_of_trailing_set_bit(days_created_profile_bits) AS dnp,
|
||||
udf_pos_of_trailing_set_bit(days_active_bits) AS days_since_active,
|
||||
udf_bitcount_lowest_7(days_active_bits) AS active_days_in_week
|
||||
FROM
|
||||
UNNEST(bit_arrays) )
|
||||
|
|
Загрузка…
Ссылка в новой задаче