* Add bqetl_fxa_events DAG

Replaces the fxa_events DAG in telemetry-airflow

Co-authored-by: Anna Scholtz <anna@scholtzan.net>
This commit is contained in:
Jeff Klukas 2020-07-14 07:58:11 -04:00 коммит произвёл GitHub
Родитель d5bbfe297c
Коммит b0efb898c5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
19 изменённых файлов: 563 добавлений и 0 удалений

Просмотреть файл

@ -71,6 +71,15 @@ bqetl_mobile_search:
retries: 1 retries: 1
retry_delay: 5m retry_delay: 5m
bqetl_fxa_events:
schedule_interval: 30 1 * * *
default_args:
owner: jklukas@mozilla.com
start_date: '2019-03-01'
email: ['telemetry-alerts@mozilla.com', 'jklukas@mozilla.com']
retries: 1
retry_delay: 10m
bqetl_gud: bqetl_gud:
schedule_interval: 0 3 * * * schedule_interval: 0 3 * * *
default_args: default_args:

226
dags/bqetl_fxa_events.py Normal file
Просмотреть файл

@ -0,0 +1,226 @@
# Generated via https://github.com/mozilla/bigquery-etl/blob/master/bigquery_etl/query_scheduling/generate_airflow_dags.py
from airflow import DAG
from airflow.operators.sensors import ExternalTaskSensor
import datetime
from utils.gcp import bigquery_etl_query
default_args = {
"owner": "jklukas@mozilla.com",
"start_date": datetime.datetime(2019, 3, 1, 0, 0),
"email": ["telemetry-alerts@mozilla.com", "jklukas@mozilla.com"],
"depends_on_past": False,
"retry_delay": datetime.timedelta(seconds=600),
"email_on_failure": True,
"email_on_retry": True,
"retries": 1,
}
with DAG(
"bqetl_fxa_events", default_args=default_args, schedule_interval="30 1 * * *"
) as dag:
firefox_accounts_derived__fxa_users_daily__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_daily__v1",
destination_table="fxa_users_daily_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_users_services_last_seen__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_services_last_seen__v1",
destination_table="fxa_users_services_last_seen_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
start_date=datetime.datetime(2019, 10, 8, 0, 0),
date_partition_parameter="submission_date",
depends_on_past=True,
dag=dag,
)
firefox_accounts_derived__fxa_auth_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_auth_events__v1",
destination_table="fxa_auth_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
arguments=["--schema_update_option=ALLOW_FIELD_ADDITION"],
dag=dag,
)
firefox_accounts_derived__exact_mau28__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__exact_mau28__v1",
destination_table="exact_mau28_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_users_services_first_seen__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_services_first_seen__v1",
destination_table="fxa_users_services_first_seen_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter=None,
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_users_services_daily__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_services_daily__v1",
destination_table="fxa_users_services_daily_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_users_last_seen__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_last_seen__v1",
destination_table="fxa_users_last_seen_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
start_date=datetime.datetime(2019, 4, 23, 0, 0),
date_partition_parameter="submission_date",
depends_on_past=True,
dag=dag,
)
firefox_accounts_derived__fxa_content_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_content_events__v1",
destination_table="fxa_content_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
arguments=["--schema_update_option=ALLOW_FIELD_ADDITION"],
dag=dag,
)
firefox_accounts_derived__fxa_auth_bounce_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_auth_bounce_events__v1",
destination_table="fxa_auth_bounce_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
arguments=["--schema_update_option=ALLOW_FIELD_ADDITION"],
dag=dag,
)
firefox_accounts_derived__fxa_log_content_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_log_content_events__v1",
destination_table="fxa_log_content_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_log_device_command_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_log_device_command_events__v1",
destination_table="fxa_log_device_command_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_delete_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_delete_events__v1",
destination_table="fxa_delete_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_log_auth_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_log_auth_events__v1",
destination_table="fxa_log_auth_events_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
firefox_accounts_derived__fxa_users_daily__v1.set_upstream(
firefox_accounts_derived__fxa_auth_bounce_events__v1
)
firefox_accounts_derived__fxa_users_daily__v1.set_upstream(
firefox_accounts_derived__fxa_auth_events__v1
)
firefox_accounts_derived__fxa_users_daily__v1.set_upstream(
firefox_accounts_derived__fxa_content_events__v1
)
firefox_accounts_derived__fxa_users_services_last_seen__v1.set_upstream(
firefox_accounts_derived__fxa_users_services_daily__v1
)
firefox_accounts_derived__fxa_users_services_last_seen__v1.set_upstream(
firefox_accounts_derived__fxa_users_services_first_seen__v1
)
firefox_accounts_derived__exact_mau28__v1.set_upstream(
firefox_accounts_derived__fxa_users_last_seen__v1
)
firefox_accounts_derived__fxa_users_services_first_seen__v1.set_upstream(
firefox_accounts_derived__fxa_auth_events__v1
)
firefox_accounts_derived__fxa_users_services_first_seen__v1.set_upstream(
firefox_accounts_derived__fxa_content_events__v1
)
firefox_accounts_derived__fxa_users_services_daily__v1.set_upstream(
firefox_accounts_derived__fxa_auth_events__v1
)
firefox_accounts_derived__fxa_users_services_daily__v1.set_upstream(
firefox_accounts_derived__fxa_content_events__v1
)
firefox_accounts_derived__fxa_users_last_seen__v1.set_upstream(
firefox_accounts_derived__fxa_users_daily__v1
)

Просмотреть файл

@ -42,6 +42,18 @@ with DAG("bqetl_gud", default_args=default_args, schedule_interval="0 3 * * *")
dag=dag, dag=dag,
) )
telemetry_derived__smoot_usage_fxa__v2 = bigquery_etl_query(
task_id="telemetry_derived__smoot_usage_fxa__v2",
destination_table="smoot_usage_fxa_v2",
dataset_id="telemetry_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
telemetry_derived__smoot_usage_desktop_compressed__v2 = bigquery_etl_query( telemetry_derived__smoot_usage_desktop_compressed__v2 = bigquery_etl_query(
task_id="telemetry_derived__smoot_usage_desktop_compressed__v2", task_id="telemetry_derived__smoot_usage_desktop_compressed__v2",
destination_table="smoot_usage_desktop_compressed_v2", destination_table="smoot_usage_desktop_compressed_v2",
@ -78,6 +90,18 @@ with DAG("bqetl_gud", default_args=default_args, schedule_interval="0 3 * * *")
dag=dag, dag=dag,
) )
telemetry_derived__smoot_usage_fxa_compressed__v2 = bigquery_etl_query(
task_id="telemetry_derived__smoot_usage_fxa_compressed__v2",
destination_table="smoot_usage_fxa_compressed_v2",
dataset_id="telemetry_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com", "telemetry-alerts@mozilla.com"],
date_partition_parameter="submission_date",
depends_on_past=False,
dag=dag,
)
telemetry_derived__smoot_usage_nondesktop__v2 = bigquery_etl_query( telemetry_derived__smoot_usage_nondesktop__v2 = bigquery_etl_query(
task_id="telemetry_derived__smoot_usage_nondesktop__v2", task_id="telemetry_derived__smoot_usage_nondesktop__v2",
destination_table="smoot_usage_nondesktop_v2", destination_table="smoot_usage_nondesktop_v2",
@ -107,10 +131,27 @@ with DAG("bqetl_gud", default_args=default_args, schedule_interval="0 3 * * *")
telemetry_derived__smoot_usage_desktop__v2 telemetry_derived__smoot_usage_desktop__v2
) )
telemetry_derived__smoot_usage_new_profiles__v2.set_upstream(
telemetry_derived__smoot_usage_fxa__v2
)
telemetry_derived__smoot_usage_new_profiles__v2.set_upstream( telemetry_derived__smoot_usage_new_profiles__v2.set_upstream(
telemetry_derived__smoot_usage_nondesktop__v2 telemetry_derived__smoot_usage_nondesktop__v2
) )
wait_for_firefox_accounts_derived__fxa_users_last_seen__v1 = ExternalTaskSensor(
task_id="wait_for_firefox_accounts_derived__fxa_users_last_seen__v1",
external_dag_id="bqetl_fxa_events",
external_task_id="firefox_accounts_derived__fxa_users_last_seen__v1",
execution_delta=datetime.timedelta(seconds=5400),
check_existence=True,
mode="reschedule",
)
telemetry_derived__smoot_usage_fxa__v2.set_upstream(
wait_for_firefox_accounts_derived__fxa_users_last_seen__v1
)
telemetry_derived__smoot_usage_desktop_compressed__v2.set_upstream( telemetry_derived__smoot_usage_desktop_compressed__v2.set_upstream(
telemetry_derived__smoot_usage_desktop__v2 telemetry_derived__smoot_usage_desktop__v2
) )
@ -123,6 +164,10 @@ with DAG("bqetl_gud", default_args=default_args, schedule_interval="0 3 * * *")
telemetry_derived__smoot_usage_nondesktop__v2 telemetry_derived__smoot_usage_nondesktop__v2
) )
telemetry_derived__smoot_usage_fxa_compressed__v2.set_upstream(
telemetry_derived__smoot_usage_fxa__v2
)
wait_for_telemetry_derived__core_clients_last_seen__v1 = ExternalTaskSensor( wait_for_telemetry_derived__core_clients_last_seen__v1 = ExternalTaskSensor(
task_id="wait_for_telemetry_derived__core_clients_last_seen__v1", task_id="wait_for_telemetry_derived__core_clients_last_seen__v1",
external_dag_id="bqetl_core", external_dag_id="bqetl_core",

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Firefox Accounts Exact Mau 28
description: Base table for exact FxA MAU by dimensions
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: FxA Auth Bounce Events
description: Selected Amplitude events extracted from FxA auth_bounce server logs
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
arguments: ['--schema_update_option=ALLOW_FIELD_ADDITION']

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: FxA Auth Events
description: Selected Amplitude events extracted from FxA auth server logs
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
arguments: ['--schema_update_option=ALLOW_FIELD_ADDITION']

Просмотреть файл

@ -0,0 +1,11 @@
friendly_name: FxA Content Events
description: Selected Amplitude events extracted from FxA content server logs
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
arguments: ['--schema_update_option=ALLOW_FIELD_ADDITION']

Просмотреть файл

@ -0,0 +1,15 @@
friendly_name: FxA Delete Events
description: Deletion events extracted from FxA auth server logs used as signal for Mozilla to delete analysis data associated with the user
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
# This query references secret keys that are not available for dry runs,
# so we must explicitly write out dependencies. In this case, the query
# depends only on fxa logs produced via Stackdriver integration, so no other
# scheduled tasks are involved and the referenced_tables list is empty.
referenced_tables: []

Просмотреть файл

@ -9,3 +9,5 @@ labels:
application: fxa application: fxa
incremental: true incremental: true
schedule: daily schedule: daily
scheduling:
dag_name: bqetl_fxa_events

Просмотреть файл

@ -9,3 +9,5 @@ labels:
application: fxa application: fxa
incremental: true incremental: true
schedule: daily schedule: daily
scheduling:
dag_name: bqetl_fxa_events

Просмотреть файл

@ -9,3 +9,10 @@ labels:
application: fxa application: fxa
incremental: true incremental: true
schedule: daily schedule: daily
scheduling:
dag_name: bqetl_fxa_events
# This query references secret keys that are not available for dry runs,
# so we must explicitly write out dependencies. In this case, the query
# depends only on fxa logs produced via Stackdriver integration, so no other
# scheduled tasks are involved and the referenced_tables list is empty.
referenced_tables: []

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: FxA Users Daily
description: Usage aggregations per FxA user per day
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events

Просмотреть файл

@ -0,0 +1,12 @@
friendly_name: FxA Users Last Seen
description: Usage aggregations per FxA user per day over a 28-day window
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
depends_on_past: true
start_date: '2019-04-23'

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: FxA Users Services Daily
description: Usage aggregations per FxA user per FxA service per day
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events

Просмотреть файл

@ -0,0 +1,14 @@
friendly_name: FxA Users Services First Seen
description: Usage aggregations describing when each FxA user was first seen
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
# We completely recreate this table every day;
# making it incremental is possible but nuanced since it windows over
# events that may cross the midnight boundary.
date_partition_parameter: null

Просмотреть файл

@ -0,0 +1,136 @@
WITH base AS (
SELECT
* REPLACE (
IF(service IS NULL AND event_type = 'fxa_activity - cert_signed', 'sync', service) AS service
)
FROM
firefox_accounts.fxa_content_auth_oauth_events
),
-- use a window function to look within each USER and SERVICE for the first value of service, os, and country.
-- also, get the first value of flow_id for later use and create a boolean column that is true if the first instance of a service usage includes a registration.
-- [kimmy] the variable first_service_timestamp_last is named so because it is actually the last timestamp recorder in the user's first flow,
-- NOT the first timestamp in their first flow.
-- it's used later on to order by service, so i'm keeping it here and just renaming it.
first_services AS (
SELECT
ROW_NUMBER() OVER w1_unframed AS _n,
user_id,
service,
-- using mode_last with w1_reversed to get mode_first
udf.mode_last(ARRAY_AGG(`timestamp`) OVER w1_reversed) AS first_service_timestamp_last,
udf.mode_last(ARRAY_AGG(flow_id) OVER w1_reversed) AS first_service_flow,
LOGICAL_OR(IFNULL(event_type = 'fxa_reg - complete', FALSE)) OVER w1_reversed AS did_register
FROM
base
WHERE
(
(event_type IN ('fxa_login - complete', 'fxa_reg - complete') AND service IS NOT NULL)
OR (event_type LIKE 'fxa_activity%')
)
AND DATE(`timestamp`) >= '2019-03-01'
AND user_id IS NOT NULL
WINDOW
-- We must provide a window with `ORDER BY timestamp DESC` so that udf.mode_last actually aggregates mode first.
w1_reversed AS (
PARTITION BY
user_id,
service
ORDER BY
`timestamp` DESC
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
),
-- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
w1_unframed AS (
PARTITION BY
user_id,
service
ORDER BY
`timestamp`
)
),
-- we need this next section because `did_register` will be BOTH true and false within the flows that the user registered on.
-- this dedupes the rows from above and sets did_register to true only on flows that included a registration
-- I've verified that `date(first_service_timestamp), count(distinct user_id) where did_register = true group by 1` matches the counts of registrations per day in amplitude.
first_services_g AS (
SELECT
* EXCEPT (_n)
FROM
first_services
WHERE
_n = 1
),
-- sadly, `entrypoint` is null on registration complete and login complete events.
-- this means we have to use first_service_flow to join back on the original source table's flow_id,
-- and take the first occurrence of `entrypoint` within the flow that the user first appeared in the service on.
flows AS (
SELECT
DISTINCT s.first_service_flow,
FIRST_VALUE(f.entrypoint) OVER (
PARTITION BY
f.flow_id
ORDER BY
f.`timestamp`
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
) AS first_service_entrypoint,
FIRST_VALUE(f.timestamp) OVER (
PARTITION BY
f.flow_id
ORDER BY
f.`timestamp`
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
) AS first_service_timestamp,
FIRST_VALUE(f.country) OVER (
PARTITION BY
f.flow_id
ORDER BY
f.`timestamp`
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
) AS first_service_country,
FIRST_VALUE(f.os_name) OVER (
PARTITION BY
f.flow_id
ORDER BY
f.`timestamp`
ROWS BETWEEN
UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
) AS first_service_os
FROM
first_services_g s
INNER JOIN
firefox_accounts.fxa_content_auth_oauth_events AS f
ON
s.first_service_flow = f.flow_id
WHERE
f.entrypoint IS NOT NULL
AND s.first_service_flow IS NOT NULL
AND DATE(f.`timestamp`) >= '2019-03-01'
)
-- finally take the entrypoint data and join it back on the other information (os, country etc).
-- also, add a row number that indicates the order in which the user signed up for their services.
SELECT
s.user_id,
s.service,
s.first_service_flow,
s.did_register,
f.first_service_entrypoint AS entrypoint,
f.first_service_timestamp,
f.first_service_country,
f.first_service_os,
ROW_NUMBER() OVER (PARTITION BY s.user_id ORDER BY first_service_timestamp_last) AS service_number
FROM
first_services_g s
LEFT JOIN
flows f
USING
(first_service_flow)
WHERE
first_service_flow IS NOT NULL

Просмотреть файл

@ -0,0 +1,12 @@
friendly_name: FxA Users Services Last Seen
description: Usage aggregations per FxA user per FxA service per day over a 28-day window
owners:
- jklukas@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
depends_on_past: true
start_date: '2019-10-08'

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Smoot Usage FxA Compressed
description: Compressed usage metrics for FxA users.
owners:
- jklukas@mozilla.com
labels:
application: desktop
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_gud

Просмотреть файл

@ -0,0 +1,10 @@
friendly_name: Smoot Usage FxA
description: Usage metrics for FxA clients.
owners:
- jklukas@mozilla.com
labels:
application: desktop
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_gud