Bug 1803425 - Create clients_yearly table for Fenix clients (#3383)
* Create clients_yearly table for Fenix clients. This table contains a year of history for Fenix. It uses logic similar to search.search_clients_last_seen, but its naming reflects the year-long (365-bit) bytes field. It currently only looks at Fenix data, which migrated ~2020-09-01, so a full year of data is only available beginning 2021-09-01. * Add schema for baseline_clients_daily * Move to referenced tables * Remove duplicates from baseline_clients_daily * Add dependency on baseline_clients_daily
This commit is contained in:
Родитель
26413d5ecd
Коммит
a212c66163
16
dags.yaml
16
dags.yaml
|
@ -343,6 +343,22 @@ bqetl_org_mozilla_fenix_derived:
|
||||||
tags:
|
tags:
|
||||||
- impact/tier_1
|
- impact/tier_1
|
||||||
|
|
||||||
|
bqetl_org_mozilla_firefox_derived:
|
||||||
|
schedule_interval: 0 2 * * *
|
||||||
|
default_args:
|
||||||
|
depends_on_past: false
|
||||||
|
email:
|
||||||
|
- frank@mozilla.com
|
||||||
|
- telemetry-alerts@mozilla.com
|
||||||
|
email_on_failure: true
|
||||||
|
email_on_retry: true
|
||||||
|
owner: frank@mozilla.com
|
||||||
|
retries: 2
|
||||||
|
retry_delay: 30m
|
||||||
|
start_date: "2022-11-30"
|
||||||
|
tags:
|
||||||
|
- impact/tier_1
|
||||||
|
|
||||||
bqetl_google_analytics_derived:
|
bqetl_google_analytics_derived:
|
||||||
schedule_interval: 0 23 * * *
|
schedule_interval: 0 23 * * *
|
||||||
description: |
|
description: |
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
|
||||||
|
|
||||||
|
from airflow import DAG
|
||||||
|
from airflow.sensors.external_task import ExternalTaskMarker
|
||||||
|
from airflow.sensors.external_task import ExternalTaskSensor
|
||||||
|
from airflow.utils.task_group import TaskGroup
|
||||||
|
import datetime
|
||||||
|
from utils.constants import ALLOWED_STATES, FAILED_STATES
|
||||||
|
from utils.gcp import bigquery_etl_query, gke_command
|
||||||
|
|
||||||
|
# Markdown description of this DAG; handed to Airflow below as `doc_md`.
docs = """
### bqetl_org_mozilla_firefox_derived

Built from bigquery-etl repo, [`dags/bqetl_org_mozilla_firefox_derived.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_org_mozilla_firefox_derived.py)

#### Owner

frank@mozilla.com
"""


# Defaults applied to every task in the DAG unless the task overrides them.
default_args = {
    "owner": "frank@mozilla.com",
    "email": ["frank@mozilla.com", "telemetry-alerts@mozilla.com"],
    "email_on_failure": True,
    "email_on_retry": True,
    "start_date": datetime.datetime(2022, 11, 30),
    "end_date": None,
    "depends_on_past": False,
    "retries": 2,
    "retry_delay": datetime.timedelta(minutes=30),
}

tags = ["impact/tier_1", "repo/bigquery-etl"]

# Runs daily at 02:00 (cron "0 2 * * *").
with DAG(
    "bqetl_org_mozilla_firefox_derived",
    default_args=default_args,
    schedule_interval="0 2 * * *",
    doc_md=docs,
    tags=tags,
) as dag:

    # Block until the `baseline_clients_first_seen` task of the
    # `copy_deduplicate` DAG has finished. `execution_delta` of one hour points
    # the sensor at the upstream run whose logical date is one hour earlier
    # than this DAG's run (01:00 for this 02:00 schedule).
    wait_for_baseline_clients_first_seen = ExternalTaskSensor(
        task_id="wait_for_baseline_clients_first_seen",
        external_dag_id="copy_deduplicate",
        external_task_id="baseline_clients_first_seen",
        execution_delta=datetime.timedelta(hours=1),
        check_existence=True,
        mode="reschedule",
        allowed_states=ALLOWED_STATES,
        failed_states=FAILED_STATES,
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    # Populates one `submission_date` partition of
    # `moz-fx-data-shared-prod.fenix_derived.clients_yearly_v1`.
    # `depends_on_past=True` overrides the DAG-level default of False, so a
    # run only starts after the previous day's run of this task succeeded.
    fenix_derived__clients_yearly__v1 = bigquery_etl_query(
        task_id="fenix_derived__clients_yearly__v1",
        destination_table="clients_yearly_v1",
        dataset_id="fenix_derived",
        project_id="moz-fx-data-shared-prod",
        owner="frank@mozilla.com",
        email=["frank@mozilla.com", "telemetry-alerts@mozilla.com"],
        date_partition_parameter="submission_date",
        depends_on_past=True,
    )

    # The query task runs only after the external sensor succeeds.
    wait_for_baseline_clients_first_seen >> fenix_derived__clients_yearly__v1
|
|
@ -0,0 +1,18 @@
|
||||||
|
-- Bootstrap statement for fenix_derived.clients_yearly_v1.
-- Creates the table (only if it does not exist yet) with the columns of
-- fenix.baseline_clients_daily plus a leading BYTES column `days_seen_bytes`.
-- No rows are copied: the SELECT exists purely to define the column list.
CREATE TABLE IF NOT EXISTS
  fenix_derived.clients_yearly_v1
PARTITION BY
  submission_date
CLUSTER BY
  sample_id,
  client_id
OPTIONS
  -- Every query against the table must filter on the partition column.
  (require_partition_filter = TRUE)
AS
SELECT
  -- Typed NULL so the new column is created as BYTES.
  CAST(NULL AS BYTES) AS days_seen_bytes,
  *,
FROM
  fenix.baseline_clients_daily
WHERE
  -- Output empty table and read no input rows
  FALSE
|
|
@ -0,0 +1,31 @@
|
||||||
|
friendly_name: Clients Yearly
|
||||||
|
description: |
|
||||||
|
Captures activity of each Fenix client
|
||||||
|
in the past 365 days for each submission date.
|
||||||
|
|
||||||
|
Exposed to users as view `fenix.clients_yearly` and used
|
||||||
|
as the basis for LTV calculations.
|
||||||
|
owners:
|
||||||
|
- frank@mozilla.com
|
||||||
|
labels:
|
||||||
|
schedule: daily
|
||||||
|
scheduling:
|
||||||
|
dag_name: bqetl_org_mozilla_firefox_derived
|
||||||
|
depends_on_past: true
|
||||||
|
referenced_tables:
|
||||||
|
- - 'moz-fx-data-shared-prod'
|
||||||
|
- 'org_mozilla_firefox'
|
||||||
|
- 'baseline_clients_daily_v1'
|
||||||
|
depends_on:
|
||||||
|
- task_id: baseline_clients_first_seen
|
||||||
|
dag_name: copy_deduplicate
|
||||||
|
execution_delta: 1h
|
||||||
|
bigquery:
|
||||||
|
time_partitioning:
|
||||||
|
field: submission_date
|
||||||
|
type: day
|
||||||
|
require_partition_filter: true
|
||||||
|
clustering:
|
||||||
|
fields:
|
||||||
|
- sample_id
|
||||||
|
- client_id
|
|
@ -0,0 +1,56 @@
|
||||||
|
-- Builds the @submission_date partition of fenix_derived.clients_yearly_v1 by
-- merging today's rows from fenix.baseline_clients_daily with yesterday's
-- partition of this same table.
WITH base AS (
  -- There are duplicates now in `baseline_clients_daily` because of the join with
  -- `clients_first_seen`, so we rank the rows of each (client_id, submission_date)
  -- and later keep only the one with the earliest first_seen_date, breaking ties
  -- by the earliest first_run_date.
  SELECT
    *,
    ROW_NUMBER() OVER (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        -- BigQuery sorts NULLs first in ascending order. The `IS NULL` keys
        -- (FALSE sorts before TRUE) push NULL dates last so that a missing
        -- date never wins over a real minimum.
        first_seen_date IS NULL,
        first_seen_date ASC,
        first_run_date IS NULL,
        first_run_date ASC
    ) AS rn,
  FROM
    fenix.baseline_clients_daily
  WHERE
    submission_date = @submission_date
),
_current AS (
  SELECT
    -- In this raw table, we capture the history of activity over the past
    -- 365 days for each usage criterion as an array of bytes. The
    -- rightmost bit represents whether the user was active in the current day.
    udf.bool_to_365_bits(TRUE) AS days_seen_bytes,
    -- submission_date is re-added in the final SELECT; rn is only a dedup helper.
    * EXCEPT (submission_date, rn),
  FROM
    base
  WHERE
    -- Keep exactly one row per client for the day.
    rn = 1
),
_previous AS (
  SELECT
    * EXCEPT (submission_date)
  FROM
    fenix_derived.clients_yearly_v1
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    -- Filter out rows from yesterday that have now fallen outside the 365-day window.
    AND BIT_COUNT(udf.shift_365_bits_one_day(days_seen_bytes)) > 0
)
SELECT
  @submission_date AS submission_date,
  -- Use today's attribute values when the client was seen today; otherwise
  -- carry forward yesterday's row.
  IF(_current.client_id IS NOT NULL, _current, _previous).* REPLACE (
    -- NOTE(review): presumably shifts yesterday's history by one day and merges
    -- in today's bit; confirm against the UDF definition.
    udf.combine_adjacent_days_365_bits(
      _previous.days_seen_bytes,
      _current.days_seen_bytes
    ) AS days_seen_bytes
  )
FROM
  _current
FULL OUTER JOIN
  _previous
USING
  -- Include sample_id to match the clustering of the tables, which may improve
  -- join performance.
  (sample_id, client_id)
|
|
@ -0,0 +1,72 @@
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: submission_date
|
||||||
|
type: DATE
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: client_id
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: sample_id
|
||||||
|
type: INTEGER
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: first_run_date
|
||||||
|
type: DATE
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: durations
|
||||||
|
type: INTEGER
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: days_seen_session_start_bits
|
||||||
|
type: INTEGER
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: days_seen_session_end_bits
|
||||||
|
type: INTEGER
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: normalized_channel
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: normalized_os
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: normalized_os_version
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: android_sdk_version
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: locale
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: city
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: country
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: app_build
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: app_channel
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: app_display_version
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: architecture
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: device_manufacturer
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: device_model
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: telemetry_sdk_build
|
||||||
|
type: STRING
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: first_seen_date
|
||||||
|
type: DATE
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: is_new_profile
|
||||||
|
type: BOOLEAN
|
||||||
|
- mode: NULLABLE
|
||||||
|
name: isp
|
||||||
|
type: STRING
|
Загрузка…
Ссылка в новой задаче