feat(DENG-351): fxa_users_services_devices_daily_v1 model added to firefox_accounts_derived dataset (#3378)

* fxa_users_services_devices_daily_events_v1 model added to firefox_accounts_derived dataset

* added schema.yaml for fxa_users_services_devices_daily_events_v1

* removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1

* removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1

* added bigquery table configuration

* added more comments as requested by @gkabbz in PR#3378

* regenerated dags/bqetl_fxa_events.py

* made some tweaks to the query to make it more explicit

* fixed fxa_events DAG

* fixed fxa_users_services_devices_daily_events_v1 schema

* updated table description for fxa_users_services_devices_daily_events_v1

* renamed fxa_users_services_devices_daily_events_v1 to fxa_users_services_devices_daily_v1
This commit is contained in:
kik-kik 2022-11-30 14:16:07 +00:00 коммит произвёл GitHub
Родитель 6371122594
Коммит b62df40396
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 327 добавлений и 0 удалений

Просмотреть файл

@ -300,6 +300,21 @@ with DAG(
depends_on_past=False,
)
# Task for the fxa_users_services_devices_daily_v1 query. Its destination is
# moz-fx-data-shared-prod.firefox_accounts_derived.fxa_users_services_devices_daily_v1.
# NOTE(review): this DAG file is regenerated from the query's metadata.yaml
# (per the commit message), so hand edits here are presumably overwritten.
firefox_accounts_derived__fxa_users_services_devices_daily__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__fxa_users_services_devices_daily__v1",
destination_table="fxa_users_services_devices_daily_v1",
dataset_id="firefox_accounts_derived",
project_id="moz-fx-data-shared-prod",
owner="kignasiak@mozilla.com",
email=[
"dthorn@mozilla.com",
"kignasiak@mozilla.com",
"telemetry-alerts@mozilla.com",
],
# Name of the query parameter carrying the run date; the query references it
# as @submission_date.
date_partition_parameter="submission_date",
# Presumably the standard Airflow flag: a run does not wait on the previous
# run of this task - confirm against bigquery_etl_query.
depends_on_past=False,
)
firefox_accounts_derived__nonprod_fxa_auth_events__v1 = bigquery_etl_query(
task_id="firefox_accounts_derived__nonprod_fxa_auth_events__v1",
destination_table="nonprod_fxa_auth_events_v1",
@ -367,3 +382,11 @@ with DAG(
# fxa_users_services_daily_v1 is scheduled downstream of the content events task.
firefox_accounts_derived__fxa_users_services_daily__v1.set_upstream(
firefox_accounts_derived__fxa_content_events__v1
)
# fxa_users_services_devices_daily_v1 is scheduled downstream of both the auth
# events task and the content events task.
firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream(
firefox_accounts_derived__fxa_auth_events__v1
)
firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream(
firefox_accounts_derived__fxa_content_events__v1
)

Просмотреть файл

@ -0,0 +1,38 @@
---
friendly_name: FxA Users Services Devices Daily Events
description: |
Contains one entry for each user, service, device combination observed
on a given day. Only 'fxa_login - complete' and 'fxa_reg - complete'
events are considered.
This model also serves as the basis for:
- fxa_users_services_devices_first_seen_events_v1
- fxa_users_services_devices_last_seen_events_v1
For more info on FxA data check:
- https://docs.telemetry.mozilla.org/datasets/fxa.html
- https://mozilla.github.io/ecosystem-platform/
owners:
- kignasiak@mozilla.com
labels:
application: fxa
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_fxa_events
date_partition_parameter: submission_date
bigquery:
time_partitioning:
field: timestamp
type: day
require_partition_filter: true
clustering:
fields:
- service
- os_name
- country
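# Other notes:
# The query reads the day before submission_date as well as submission_date
# itself, because it uses windows over events that may cross the midnight boundary.
# TODO: maybe should mention this inside the query...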

Просмотреть файл

@ -0,0 +1,152 @@
-- For @submission_date, keeps the earliest 'fxa_login - complete' or
-- 'fxa_reg - complete' event per (user_id, service, device_id) and attaches the
-- entrypoint and utm attributes resolved for that event's flow.
WITH recent_events AS (
  -- Candidate events for the submission date and the day before it.
  -- The extra day lets the per-flow lookups below see events of flows
  -- that cross the midnight boundary.
  SELECT
    `timestamp`,
    user_id,
    -- cert_signed events that carry no service are attributed to 'sync'
    CASE
      WHEN service IS NULL AND event_type = 'fxa_activity - cert_signed'
        THEN 'sync'
      ELSE service
    END AS service,
    device_id,
    os_name,
    flow_id,
    event_type,
    country,
    `language`,
    entrypoint,
    utm_term,
    utm_medium,
    utm_source,
    utm_campaign,
    utm_content,
    ua_version,
    ua_browser
  FROM
    `moz-fx-data-shared-prod.firefox_accounts.fxa_content_auth_oauth_events` -- TODO: this will need updated to fxa_all_events once unified
  WHERE
    DATE(`timestamp`) >= DATE_SUB(@submission_date, INTERVAL 1 DAY)
    AND DATE(`timestamp`) <= @submission_date
    -- Exclusion list shared with users_services_daily_v1 to keep the models
    -- consistent; it should be re-evaluated at some point in the future.
    AND event_type NOT IN (
      'fxa_email - bounced',
      'fxa_email - click',
      'fxa_email - sent',
      'fxa_reg - password_blocked',
      'fxa_reg - password_common',
      'fxa_reg - password_enrolled',
      'fxa_reg - password_missing',
      'fxa_sms - sent',
      'mktg - email_click',
      'mktg - email_open',
      'mktg - email_sent',
      'sync - repair_success',
      'sync - repair_triggered'
    )
),
flow_entrypoints AS (
  -- Earliest non-null entrypoint seen for each flow (one row per flow_id).
  SELECT
    flow_id,
    entrypoint
  FROM
    recent_events
  WHERE
    -- a record missing either value cannot be used for the mapping
    flow_id IS NOT NULL
    AND entrypoint IS NOT NULL
  QUALIFY
    ROW_NUMBER() OVER (PARTITION BY flow_id ORDER BY `timestamp` ASC) = 1
),
flow_utms AS (
  -- utm_* values taken from the first event of each flow. Every row of a flow
  -- gets the same values, so DISTINCT collapses them to one row per flow_id.
  SELECT DISTINCT
    flow_id,
    FIRST_VALUE(utm_term) OVER (flow_events) AS utm_term,
    FIRST_VALUE(utm_medium) OVER (flow_events) AS utm_medium,
    FIRST_VALUE(utm_source) OVER (flow_events) AS utm_source,
    FIRST_VALUE(utm_campaign) OVER (flow_events) AS utm_campaign,
    FIRST_VALUE(utm_content) OVER (flow_events) AS utm_content
  FROM
    recent_events
  WHERE
    flow_id IS NOT NULL
  WINDOW
    flow_events AS (
      PARTITION BY flow_id
      ORDER BY `timestamp` ASC
      ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    )
),
daily_completions AS (
  -- Login / registration completion events that happened on the submission date.
  SELECT DISTINCT
    `timestamp`,
    user_id,
    service,
    device_id,
    os_name,
    flow_id,
    event_type,
    country,
    `language`,
    ua_version,
    ua_browser
  FROM
    recent_events
  WHERE
    DATE(`timestamp`) = @submission_date
    -- Restricted to these events to stay consistent with the logic used by
    -- fxa_users_daily_v1 and fxa_users_services_daily_v1
    AND event_type IN ('fxa_login - complete', 'fxa_reg - complete')
    AND service IS NOT NULL
)
SELECT
  -- event attributes
  daily.`timestamp`,
  daily.flow_id,
  daily.event_type,
  daily.user_id,
  daily.service,
  daily.device_id,
  daily.os_name,
  daily.country,
  daily.`language`,
  daily.ua_version,
  daily.ua_browser,
  -- flow attributes
  flow_entrypoints.entrypoint,
  flow_utms.utm_term,
  flow_utms.utm_medium,
  flow_utms.utm_source,
  flow_utms.utm_campaign,
  flow_utms.utm_content
FROM
  daily_completions AS daily
LEFT JOIN
  flow_entrypoints
  ON daily.flow_id = flow_entrypoints.flow_id
LEFT JOIN
  flow_utms
  ON daily.flow_id = flow_utms.flow_id
WHERE
  -- the user must be registered
  daily.user_id IS NOT NULL
  -- flow attributes are looked up by flow_id, so it has to be set
  AND daily.flow_id IS NOT NULL
  -- a record without a service or a device_id is useless for this model
  AND daily.service IS NOT NULL
  AND daily.device_id IS NOT NULL
QUALIFY
  ROW_NUMBER() OVER (
    PARTITION BY daily.user_id, daily.service, daily.device_id
    ORDER BY daily.`timestamp` ASC
  ) = 1 -- this could be partitioned by first_flow_id to handle multiple device_ids being attached to a single flow_id

Просмотреть файл

@ -0,0 +1,114 @@
fields:
- mode: NULLABLE
name: timestamp
type: TIMESTAMP
description: |
Datetime value when the event occurred.
- mode: NULLABLE
name: flow_id
type: STRING
description: |
The flow identifier. A randomly-generated opaque id.
- mode: NULLABLE
name: event_type
type: STRING
description: |
Type of the FxA event recorded.
- mode: NULLABLE
name: user_id
type: STRING
description:
- mode: NULLABLE
name: service
type: STRING
description: |
The service identifier. For Sync it may be empty or sync.
For OAuth reliers it is their hex client id.
- mode: NULLABLE
name: device_id
type: STRING
description: |
The most granular field.
The id of the device record.
This does correlate back to a record in the FxA user db.
- mode: NULLABLE
name: os_name
type: STRING
description:
- mode: NULLABLE
name: country
type: STRING
description:
- mode: NULLABLE
name: language
type: STRING
description:
- mode: NULLABLE
name: ua_version
type: STRING
description: |
The user's browser version.
- mode: NULLABLE
name: ua_browser
type: STRING
description: |
The user's web browser, e.g. 'Firefox' or 'Chrome'.
- mode: NULLABLE
name: entrypoint
type: STRING
description: |
The entrypoint of the flow session.
Typically a UI touchpoint like "preferences".
- mode: NULLABLE
name: utm_term
type: STRING
description: |
Marketing campaign search term
for the flow session.
Not stored if the DNT request header was 1.
- mode: NULLABLE
name: utm_medium
type: STRING
description: |
Marketing campaign medium for the flow session.
Not stored if the DNT request header was 1.
- mode: NULLABLE
name: utm_source
type: STRING
description: |
Marketing campaign source for
the flow session.
Not stored if the DNT request header was 1.
- mode: NULLABLE
name: utm_campaign
type: STRING
description: |
Marketing campaign identifier
for the flow session.
Not stored if the DNT request header was 1.
- mode: NULLABLE
name: utm_content
type: STRING
description: |
Marketing campaign content identifier
for the flow session.
Not stored if the DNT request header was 1.