Create tables that have state values per day (#4634)
* Create tables that have state values per day
* Change Airflow DAG
* Move markov states to cols rather than array
* Move bot/bad client filter to materialized table
* Add install_source and consecutive_days_seen features
* Add field to CTE
* Use jinja vars instead of sql variables
* Use correct UDF incantation
This commit is contained in:
Родитель
b7b6c58e23
Коммит
16bdbcbcc8
|
@ -0,0 +1,56 @@
|
|||
{% set max_weeks = 32 %}
{% set death_time = 168 %}
{% set lookback = 28 %}
-- User-facing view over fenix_derived.ltv_states_v1.
-- Adds a `markov_states` struct holding the output of each mozfun.ltv
-- android state UDF, followed by every column of the underlying table.
--
-- Template constants (set above):
--   lookback   : how many trailing bits of days_seen_bytes are counted
--   max_weeks  : forwarded unchanged to every state UDF
--   death_time : forwarded to android_states_with_paid_v2 only
--                (units are not visible here -- confirm against the UDF)
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.fenix.ltv_states`
AS
WITH ltv_with_activity AS (
  SELECT
    *,
    -- Number of set bits in the bit range (start = -lookback, length = lookback)
    -- of days_seen_bytes; presumably the most recent `lookback` days -- confirm
    -- against the mozfun.bytes.extract_bits documentation.
    BIT_COUNT(
      mozfun.bytes.extract_bits(days_seen_bytes, -{{ lookback }}, {{ lookback }})
    ) AS activity_pattern,
    -- Same extraction with start = -1, length = 1: a single bit, so 0 or 1.
    BIT_COUNT(mozfun.bytes.extract_bits(days_seen_bytes, -1, 1)) AS active_on_this_date,
  FROM
    `moz-fx-data-shared-prod.fenix_derived.ltv_states_v1`
)
SELECT
  client_id,
  submission_date,
  -- One struct field per state-function version; field order is part of the
  -- view's schema and must not be rearranged.
  STRUCT(
    mozfun.ltv.android_states_v1(
      adjust_network,
      days_since_first_seen,
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_v1,
    mozfun.ltv.android_states_with_paid_v1(
      adjust_network,
      days_since_first_seen,
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_with_paid_v1,
    -- v2 additionally takes days_since_seen and the death_time constant.
    mozfun.ltv.android_states_with_paid_v2(
      adjust_network,
      days_since_first_seen,
      days_since_seen,
      {{ death_time }},
      submission_date,
      first_seen_date,
      activity_pattern,
      active_on_this_date,
      {{ max_weeks }},
      first_reported_country
    ) AS android_states_with_paid_v2
  ) AS markov_states,
  -- Remaining columns, including the two derived activity columns.
  * EXCEPT (client_id, submission_date)
FROM
  ltv_with_activity
|
|
@ -0,0 +1,12 @@
|
|||
#fail
-- client_id is expected to be unique within the submission_date partition.
{{ is_unique("client_id", "submission_date = @submission_date") }}

#fail
-- The partition as a whole must hold at least 10000 rows.
{{ min_row_count(10000, "submission_date = @submission_date") }}

#fail
-- At least 10000 rows must belong to clients first seen on this date.
{{ min_row_count(10000, "submission_date = @submission_date AND first_seen_date = @submission_date") }}

#fail
-- At least 10000 rows must belong to clients seen on this date (days_since_seen = 0).
{{ min_row_count(10000, "submission_date = @submission_date AND days_since_seen = 0") }}
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
# Metadata for the per-day LTV states table.
friendly_name: Ltv States
description: |-
  Fields required for retrieving LTV for clients.
owners:
  - frank@mozilla.com
labels:
  incremental: true
  # NOTE(review): BigQuery label values may only contain lowercase letters,
  # digits, underscores and dashes, so an e-mail address is not a valid label
  # value -- confirm whether this entry is needed or is a tooling artifact.
  owner1: frank@mozilla.com
scheduling:
  dag_name: bqetl_org_mozilla_firefox_derived
bigquery:
  # Daily partitions on submission_date; queries must filter on the partition.
  time_partitioning:
    type: day
    field: submission_date
    require_partition_filter: true
    expiration_days: null
  clustering:
    fields:
      - first_reported_country
      - first_seen_date
references: {}
deprecated: false
|
|
@ -0,0 +1,63 @@
|
|||
-- Per-client, per-day inputs for LTV state computation.
--
-- Takes the activity columns from fenix.clients_yearly, attaches the
-- first-reported client attributes from fenix.firefox_android_clients
-- (inner join: clients without a row there are dropped) and ad-click
-- figures from fenix.client_adclicks_history (left join: optional).
--
-- Output: one row per (client_id, submission_date) that survives the
-- bot / bad-client filters at the bottom.
WITH clients_yearly AS (
  SELECT
    client_id,
    sample_id,
    submission_date,
    first_seen_date,
    days_since_first_seen,
    days_since_seen,
    consecutive_days_seen,
    days_seen_bytes,
  FROM
    fenix.clients_yearly
  WHERE
    {% if is_init() %}
      -- Initial run: no @submission_date restriction, take every partition.
      submission_date >= "2010-01-01"
    {% else %}
      submission_date = @submission_date
    {% endif %}
)
SELECT
  client_id,
  sample_id,
  clients_yearly.submission_date,
  clients_yearly.first_seen_date,
  clients_yearly.days_since_first_seen,
  clients_yearly.days_since_seen,
  clients_yearly.consecutive_days_seen,
  clients_yearly.days_seen_bytes,
  -- Ad clicks recorded for exactly this submission date, capped at 10000.
  -- NULL when the client has no ad-click history row or no entry for the date.
  -- NOTE(review): a scalar subquery fails if more than one history entry
  -- shares the same key -- assumes keys are unique per client; confirm.
  (
    SELECT
      LEAST(value, 10000)
    FROM
      UNNEST(ad_click_history)
    WHERE
      key = clients_yearly.submission_date
  ) AS ad_clicks_on_date,
  -- Running total of (per-day capped) ad clicks up to and including this date.
  -- SUM over zero rows is NULL, so clients without history get NULL, not 0.
  (
    SELECT
      SUM(LEAST(value, 10000))
    FROM
      UNNEST(ad_click_history)
    WHERE
      key <= clients_yearly.submission_date
  ) AS total_historic_ad_clicks,
  firefox_android_clients.first_reported_country,
  firefox_android_clients.first_reported_isp,
  firefox_android_clients.adjust_network,
  firefox_android_clients.install_source,
FROM
  clients_yearly
JOIN
  fenix.firefox_android_clients
USING
  (sample_id, client_id)
LEFT JOIN
  fenix.client_adclicks_history
USING
  (sample_id, client_id)
WHERE
  -- BrowserStack clients are bots, we don't want to accidentally report on them.
  -- COALESCE keeps clients whose ISP is NULL: a bare `!=` evaluates to NULL for
  -- them, and WHERE would silently drop those rows along with the bots.
  COALESCE(firefox_android_clients.first_reported_isp, "") != "BrowserStack"
  -- Remove clients who are new on this day, but have more/less than 1 day of activity
  AND NOT (
    clients_yearly.days_since_first_seen = 0
    AND BIT_COUNT(clients_yearly.days_seen_bytes) != 1
  )
|
|
@ -0,0 +1,59 @@
|
|||
# Column schema for the per-day LTV states table; order matches the query's SELECT list.
fields:
  - name: client_id
    mode: NULLABLE
    type: STRING
    description: "Client ID; uniquely identifies a client. Joinable with fenix.firefox_android_clients."
  - name: sample_id
    mode: NULLABLE
    type: INTEGER
    description: "A number, 0-99, that samples by client_id."
  - name: submission_date
    mode: NULLABLE
    type: DATE
    description: "Submission date, the date of the data. Also the partition key."
  - name: first_seen_date
    mode: NULLABLE
    type: DATE
    description: "First submission date that this client was seen on."
  - name: days_since_first_seen
    mode: NULLABLE
    type: INTEGER
    description: "Number of days since this client was first seen."
  - name: days_since_seen
    mode: NULLABLE
    type: INTEGER
    description: "Number of days since this client was last seen. For example, if they were seen yesterday, days_since_seen would be 1."
  - name: consecutive_days_seen
    mode: NULLABLE
    type: INTEGER
    description: >
      Number of consecutive days this client has been seen.
      For example, if they were missing two days ago but present yesterday & today, consecutive_days_seen would be 2.
  - name: days_seen_bytes
    mode: NULLABLE
    type: BYTES
    description: "Days seen over the past year, represented as bytes."
  - name: ad_clicks_on_date
    mode: NULLABLE
    type: INTEGER
    description: "Number of ad clicks by this client on this submission date."
  - name: total_historic_ad_clicks
    mode: NULLABLE
    type: INTEGER
    description: "Total historic ad clicks by this client up to this date (inclusive of this date)."
  - name: first_reported_country
    mode: NULLABLE
    type: STRING
    description: "First country reported by this client."
  - name: first_reported_isp
    mode: NULLABLE
    type: STRING
    description: "First ISP reported by this client."
  - name: adjust_network
    mode: NULLABLE
    type: STRING
    description: "First Adjust Network reported by this client."
  - name: install_source
    mode: NULLABLE
    type: STRING
    description: "First install source reported by this client."
|
Загрузка…
Ссылка в новой задаче