Create tables that have state values per day (#4634)

* Create tables that have state values per day

* Change Airflow DAG

* Move markov states to cols rather than array

* Move bot/bad client filter to materialized table

* Add install_source and consecutive_days_seen features

* Add field to CTE

* Use jinja vars instead of sql variables

* Use correct UDF incantation
This commit is contained in:
Frank Bertsch 2023-12-06 12:54:46 -05:00 коммит произвёл GitHub
Родитель b7b6c58e23
Коммит 16bdbcbcc8
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 210 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,56 @@
{% set max_weeks = 32 %}
{% set death_time = 168 %}
{% set lookback = 28 %}
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.fenix.ltv_states`
AS
-- Exposes every column of fenix_derived.ltv_states_v1 plus two derived activity
-- counters and a `markov_states` struct holding one state per LTV state UDF.
-- The template variables above (max_weeks, death_time, lookback) are passed
-- through to the bit extraction and the UDF calls below; their units are
-- defined by those UDFs (NOTE(review): presumably weeks / days -- confirm).
WITH states_with_activity AS (
  SELECT
    *,
    -- Number of set bits in the slice extract_bits returns for
    -- (offset = -lookback, length = lookback) of days_seen_bytes.
    -- NOTE(review): presumably the trailing `lookback`-day window -- confirm
    -- against the mozfun.bytes.extract_bits documentation.
    BIT_COUNT(
      `mozfun`.bytes.extract_bits(days_seen_bytes, -{{ lookback }}, {{ lookback }})
    ) AS activity_pattern,
    -- Same count for a single bit at offset -1 (yields 0 or 1).
    BIT_COUNT(
      `mozfun`.bytes.extract_bits(days_seen_bytes, -1, 1)
    ) AS active_on_this_date,
  FROM
    `moz-fx-data-shared-prod.fenix_derived.ltv_states_v1`
)
SELECT
  client_id,
  submission_date,
  -- One field per state definition, so new definitions can be added as columns.
  STRUCT(
    mozfun.ltv.android_states_v1(
      adjust_network,
      days_since_first_seen,
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_v1,
    mozfun.ltv.android_states_with_paid_v1(
      adjust_network,
      days_since_first_seen,
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_with_paid_v1,
    -- v2 additionally takes days_since_seen and the death_time threshold.
    mozfun.ltv.android_states_with_paid_v2(
      adjust_network,
      days_since_first_seen,
      days_since_seen,
      {{ death_time }},
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_with_paid_v2
  ) AS markov_states,
  -- Remaining columns, including the two derived counters, pass through as-is.
  * EXCEPT (client_id, submission_date)
FROM
  states_with_activity

Просмотреть файл

@ -0,0 +1,12 @@
#fail
-- is_unique check on client_id, restricted to the partition for @submission_date.
{{ is_unique("client_id", "submission_date = @submission_date") }}
#fail
-- min_row_count check with threshold 10000 on the partition for @submission_date.
{{ min_row_count(10000, "submission_date = @submission_date") }}
#fail
-- Same threshold, counting only rows whose first_seen_date is the partition date
-- (clients first seen on that day).
{{ min_row_count(10000, "submission_date = @submission_date AND first_seen_date = @submission_date") }}
#fail
-- Same threshold, counting only rows with days_since_seen = 0
-- (clients seen on the partition date itself).
{{ min_row_count(10000, "submission_date = @submission_date AND days_since_seen = 0") }}

Просмотреть файл

@ -0,0 +1,20 @@
# Table metadata for the LTV states table: ownership, scheduling and the
# BigQuery partitioning/clustering layout.
friendly_name: Ltv States
description: |-
  Fields required for retrieving LTV for clients.
owners:
- frank@mozilla.com
labels:
  incremental: true
  owner1: frank@mozilla.com
scheduling:
  dag_name: bqetl_org_mozilla_firefox_derived
bigquery:
  # Daily partitions on submission_date; queries must filter on the partition
  # column, and partitions never expire (expiration_days is null).
  time_partitioning:
    type: day
    field: submission_date
    require_partition_filter: true
    expiration_days: null
  clustering:
    fields: [first_reported_country, first_seen_date]
references: {}
deprecated: false

Просмотреть файл

@ -0,0 +1,63 @@
-- Per-client, per-day fields needed to derive LTV states: activity history from
-- fenix.clients_yearly, first-reported attributes from
-- fenix.firefox_android_clients, and ad-click counts from
-- fenix.client_adclicks_history.
WITH clients_yearly AS (
  SELECT
    client_id,
    sample_id,
    submission_date,
    first_seen_date,
    days_since_first_seen,
    days_since_seen,
    consecutive_days_seen,
    days_seen_bytes,
  FROM
    fenix.clients_yearly
  WHERE
    -- On table initialisation read all history; otherwise only the partition
    -- for the run's @submission_date.
    {% if is_init() %}
      submission_date >= "2010-01-01"
    {% else %}
      submission_date = @submission_date
    {% endif %}
)
SELECT
  client_id,
  sample_id,
  clients_yearly.submission_date,
  clients_yearly.first_seen_date,
  clients_yearly.days_since_first_seen,
  clients_yearly.days_since_seen,
  clients_yearly.consecutive_days_seen,
  clients_yearly.days_seen_bytes,
  -- Ad clicks recorded for exactly this submission_date, capped at 10000.
  -- NULL when the client has no ad-click history row or no entry for the date.
  (
    SELECT
      LEAST(value, 10000)
    FROM
      UNNEST(ad_click_history)
    WHERE
      key = clients_yearly.submission_date
  ) AS ad_clicks_on_date,
  -- Sum of the per-day capped clicks up to and including this submission_date.
  -- NULL (not 0) when there are no matching history entries.
  (
    SELECT
      SUM(LEAST(value, 10000))
    FROM
      UNNEST(ad_click_history)
    WHERE
      key <= clients_yearly.submission_date
  ) AS total_historic_ad_clicks,
  firefox_android_clients.first_reported_country,
  firefox_android_clients.first_reported_isp,
  firefox_android_clients.adjust_network,
  firefox_android_clients.install_source,
FROM
  clients_yearly
JOIN
  fenix.firefox_android_clients
USING
  (sample_id, client_id)
-- LEFT JOIN: clients without any ad-click history are kept.
LEFT JOIN
  fenix.client_adclicks_history
USING
  (sample_id, client_id)
WHERE
  -- BrowserStack clients are bots, we don't want to accidentally report on them.
  -- COALESCE keeps clients whose ISP is NULL: a bare `!=` evaluates to NULL for
  -- them and would silently drop those rows as well.
  COALESCE(firefox_android_clients.first_reported_isp, "") != "BrowserStack"
  -- Remove clients who are new on this day, but have more/less than 1 day of activity
  AND NOT (
    clients_yearly.days_since_first_seen = 0
    AND BIT_COUNT(clients_yearly.days_seen_bytes) != 1
  )

Просмотреть файл

@ -0,0 +1,59 @@
# Column schema for the LTV states table; one entry per output column of the
# query, in the same order as its SELECT list.
fields:
- name: client_id
  mode: NULLABLE
  type: STRING
  description: "Client ID; uniquely identifies a client. Joinable with fenix.firefox_android_clients."
- name: sample_id
  mode: NULLABLE
  type: INTEGER
  description: "A number, 0-99, that samples by client_id."
- name: submission_date
  mode: NULLABLE
  type: DATE
  description: "Submission date, the date of the data. Also the partition key."
- name: first_seen_date
  mode: NULLABLE
  type: DATE
  description: "First submission date that this client was seen on."
- name: days_since_first_seen
  mode: NULLABLE
  type: INTEGER
  description: "Number of days since this client was first seen."
- name: days_since_seen
  mode: NULLABLE
  type: INTEGER
  description: "Number of days since this client was last seen. For example, if they were seen yesterday, days_since_seen would be 1."
- name: consecutive_days_seen
  mode: NULLABLE
  type: INTEGER
  description: >
    Number of consecutive days this client has been seen.
    For example, if they were missing two days ago but present yesterday & today, consecutive_days_seen would be 2.
- name: days_seen_bytes
  mode: NULLABLE
  type: BYTES
  description: "Days seen over the past year, represented as bytes."
- name: ad_clicks_on_date
  mode: NULLABLE
  type: INTEGER
  description: "Number of ad clicks by this client on this submission date."
- name: total_historic_ad_clicks
  mode: NULLABLE
  type: INTEGER
  description: "Total historic ad clicks by this client up to this date (inclusive of this date)."
- name: first_reported_country
  mode: NULLABLE
  type: STRING
  description: "First country reported by this client."
- name: first_reported_isp
  mode: NULLABLE
  type: STRING
  description: "First ISP reported by this client."
- name: adjust_network
  mode: NULLABLE
  type: STRING
  description: "First Adjust Network reported by this client."
- name: install_source
  mode: NULLABLE
  type: STRING
  description: "First install source reported by this client."