Bug 1803425 - Create clients_yearly table for Fenix clients (#3383)

* Create clients_yearly table for Fenix clients

This table holds a rolling year of activity history for each Fenix client.
It uses logic similar to search.search_clients_last_seen,
but the column name reflects the year-long bytes field (days_seen_bytes).

Currently this only covers Fenix data, which migrated around 2020-09-01,
so a full 365 days of history is only available from 2021-09-01 onward.

* Add schema for baseline_clients_daily

* Move to referenced tables

* Remove duplicates from baseline_clients_daily

* Add dependency on baseline_clients_daily
This commit is contained in:
Frank Bertsch 2022-12-01 11:21:54 -05:00 коммит произвёл GitHub
Родитель 26413d5ecd
Коммит a212c66163
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 260 добавлений и 0 удалений

Просмотреть файл

@ -343,6 +343,22 @@ bqetl_org_mozilla_fenix_derived:
tags: tags:
- impact/tier_1 - impact/tier_1
# DAG referenced by the clients_yearly_v1 metadata (`dag_name`).
# Cron expression "0 2 * * *": minute 0, hour 2, every day.
bqetl_org_mozilla_firefox_derived:
  schedule_interval: "0 2 * * *"
  default_args:
    depends_on_past: false
    email: [frank@mozilla.com, telemetry-alerts@mozilla.com]
    email_on_failure: true
    email_on_retry: true
    owner: frank@mozilla.com
    # Up to two retries, 30 minutes apart.
    retries: 2
    retry_delay: 30m
    start_date: "2022-11-30"
  tags: [impact/tier_1]
bqetl_google_analytics_derived: bqetl_google_analytics_derived:
schedule_interval: 0 23 * * * schedule_interval: 0 23 * * *
description: | description: |

Просмотреть файл

@ -0,0 +1,67 @@
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
from airflow import DAG
from airflow.sensors.external_task import ExternalTaskMarker
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.task_group import TaskGroup
import datetime
from utils.constants import ALLOWED_STATES, FAILED_STATES
from utils.gcp import bigquery_etl_query, gke_command
# Markdown description of this DAG; passed to the DAG constructor as `doc_md`.
docs = """
### bqetl_org_mozilla_firefox_derived
Built from bigquery-etl repo, [`dags/bqetl_org_mozilla_firefox_derived.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_org_mozilla_firefox_derived.py)
#### Owner
frank@mozilla.com
"""
# Task defaults handed to the DAG constructor as `default_args`.
default_args = dict(
    owner="frank@mozilla.com",
    start_date=datetime.datetime(2022, 11, 30),
    end_date=None,
    email=["frank@mozilla.com", "telemetry-alerts@mozilla.com"],
    depends_on_past=False,
    # 30 minutes between retry attempts.
    retry_delay=datetime.timedelta(minutes=30),
    email_on_failure=True,
    email_on_retry=True,
    retries=2,
)

# Labels attached to the DAG.
tags = ["impact/tier_1", "repo/bigquery-etl"]
# DAG definition: one query task gated by one external-task sensor.
with DAG(
    dag_id="bqetl_org_mozilla_firefox_derived",
    default_args=default_args,
    schedule_interval="0 2 * * *",
    doc_md=docs,
    tags=tags,
) as dag:
    # Writes the `submission_date` partition of fenix_derived.clients_yearly_v1.
    # depends_on_past=True here overrides the DAG-level default of False.
    fenix_derived__clients_yearly__v1 = bigquery_etl_query(
        task_id="fenix_derived__clients_yearly__v1",
        destination_table="clients_yearly_v1",
        dataset_id="fenix_derived",
        project_id="moz-fx-data-shared-prod",
        owner="frank@mozilla.com",
        email=["frank@mozilla.com", "telemetry-alerts@mozilla.com"],
        date_partition_parameter="submission_date",
        depends_on_past=True,
    )

    # Waits on the `copy_deduplicate` DAG run whose logical date is one hour
    # before this DAG's (execution_delta).
    # NOTE(review): the clients_yearly_v1 query reads fenix.baseline_clients_daily,
    # but this sensor waits on the `baseline_clients_first_seen` task — confirm that
    # task finishing guarantees baseline_clients_daily is already populated.
    wait_for_baseline_clients_first_seen = ExternalTaskSensor(
        task_id="wait_for_baseline_clients_first_seen",
        external_dag_id="copy_deduplicate",
        external_task_id="baseline_clients_first_seen",
        execution_delta=datetime.timedelta(hours=1),
        check_existence=True,
        mode="reschedule",
        allowed_states=ALLOWED_STATES,
        failed_states=FAILED_STATES,
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    # The query runs only after the sensor succeeds.
    wait_for_baseline_clients_first_seen >> fenix_derived__clients_yearly__v1

Просмотреть файл

@ -0,0 +1,18 @@
-- Bootstrap DDL for fenix_derived.clients_yearly_v1.
-- The table starts empty; its columns are a leading BYTES column
-- `days_seen_bytes` followed by every column of fenix.baseline_clients_daily.
CREATE TABLE IF NOT EXISTS
  fenix_derived.clients_yearly_v1
PARTITION BY submission_date
CLUSTER BY sample_id, client_id
OPTIONS (require_partition_filter = TRUE)
AS
SELECT
  CAST(NULL AS BYTES) AS days_seen_bytes,
  baseline.*
FROM
  fenix.baseline_clients_daily AS baseline
WHERE
  -- Constant-false filter: output an empty table and read no input rows.
  FALSE

Просмотреть файл

@ -0,0 +1,31 @@
# Table metadata, scheduling and storage layout for the Clients Yearly table.
friendly_name: Clients Yearly
description: |
  Captures activity of each fenix client
  in the past 365 days for each submission date.
  Exposed to users as view `fenix.clients_yearly` and used
  as the basis for LTV calculations.
owners: [frank@mozilla.com]
labels:
  schedule: daily
scheduling:
  dag_name: bqetl_org_mozilla_firefox_derived
  # The query reads the previous day's partition of this same table.
  depends_on_past: true
  # Each entry is [project, dataset, table].
  referenced_tables:
    - [moz-fx-data-shared-prod, org_mozilla_firefox, baseline_clients_daily_v1]
  # NOTE(review): the referenced table is baseline_clients_daily_v1, but the
  # explicit dependency is on the `baseline_clients_first_seen` task — confirm
  # that task finishing guarantees baseline_clients_daily is already populated.
  depends_on:
    - task_id: baseline_clients_first_seen
      dag_name: copy_deduplicate
      execution_delta: 1h
bigquery:
  time_partitioning:
    field: submission_date
    type: day
    require_partition_filter: true
  clustering:
    fields: [sample_id, client_id]

Просмотреть файл

@ -0,0 +1,56 @@
-- Builds one @submission_date partition by merging today's deduplicated
-- fenix.baseline_clients_daily rows with the previous day's partition of
-- fenix_derived.clients_yearly_v1.
WITH today AS (
  SELECT
    -- History of activity over the past 365 days, stored as an array of bytes.
    -- The rightmost bit represents whether the client was active on the current day.
    udf.bool_to_365_bits(TRUE) AS days_seen_bytes,
    * EXCEPT (submission_date),
  FROM
    fenix.baseline_clients_daily
  WHERE
    submission_date = @submission_date
  -- `baseline_clients_daily` now contains duplicates because of its join with
  -- `clients_first_seen`. Keep one row per client and day: the one with the
  -- earliest first_seen_date, then the earliest first_run_date.
  QUALIFY
    ROW_NUMBER() OVER (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        first_seen_date,
        first_run_date
    ) = 1
),
yesterday AS (
  SELECT
    * EXCEPT (submission_date)
  FROM
    fenix_derived.clients_yearly_v1
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    -- Drop rows whose history has no bits left after shifting by one day,
    -- i.e. clients that have now fallen outside the 365-day window.
    AND BIT_COUNT(udf.shift_365_bits_one_day(days_seen_bytes)) > 0
)
SELECT
  @submission_date AS submission_date,
  -- Take today's row when the client was seen today, otherwise carry forward
  -- yesterday's row; in both cases replace the history with the combined bits.
  IF(today.client_id IS NOT NULL, today, yesterday).* REPLACE (
    udf.combine_adjacent_days_365_bits(
      yesterday.days_seen_bytes,
      today.days_seen_bytes
    ) AS days_seen_bytes
  )
FROM
  today
FULL OUTER JOIN
  yesterday
USING
  -- sample_id is included to match the clustering of the tables, which may
  -- improve join performance.
  (sample_id, client_id)

Просмотреть файл

@ -0,0 +1,72 @@
# Column schema: 24 NULLABLE fields, one per line.
# NOTE(review): there is no `days_seen_bytes` field here, so this looks like the
# baseline_clients_daily schema rather than clients_yearly_v1 — confirm against
# the file path.
- {name: submission_date, type: DATE, mode: NULLABLE}
- {name: client_id, type: STRING, mode: NULLABLE}
- {name: sample_id, type: INTEGER, mode: NULLABLE}
- {name: first_run_date, type: DATE, mode: NULLABLE}
- {name: durations, type: INTEGER, mode: NULLABLE}
- {name: days_seen_session_start_bits, type: INTEGER, mode: NULLABLE}
- {name: days_seen_session_end_bits, type: INTEGER, mode: NULLABLE}
- {name: normalized_channel, type: STRING, mode: NULLABLE}
- {name: normalized_os, type: STRING, mode: NULLABLE}
- {name: normalized_os_version, type: STRING, mode: NULLABLE}
- {name: android_sdk_version, type: STRING, mode: NULLABLE}
- {name: locale, type: STRING, mode: NULLABLE}
- {name: city, type: STRING, mode: NULLABLE}
- {name: country, type: STRING, mode: NULLABLE}
- {name: app_build, type: STRING, mode: NULLABLE}
- {name: app_channel, type: STRING, mode: NULLABLE}
- {name: app_display_version, type: STRING, mode: NULLABLE}
- {name: architecture, type: STRING, mode: NULLABLE}
- {name: device_manufacturer, type: STRING, mode: NULLABLE}
- {name: device_model, type: STRING, mode: NULLABLE}
- {name: telemetry_sdk_build, type: STRING, mode: NULLABLE}
- {name: first_seen_date, type: DATE, mode: NULLABLE}
- {name: is_new_profile, type: BOOLEAN, mode: NULLABLE}
- {name: isp, type: STRING, mode: NULLABLE}