feat(DENG-351): fxa_users_services_devices_daily_v1 model added to firefox_accounts_derived dataset (#3378)
* fxa_users_services_devices_daily_events_v1 model added to firefox_accounts_derived dataset * added schema.yaml for fxa_users_services_devices_daily_events_v1 * removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1 * removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1 * added bigquery table configuration * added more comments as requested by @gkabbz in PR#3378 * regenerated dags/bqetl_fxa_events.py * made some tweaks to the query to make it more explicit * fixed fxa_events DAG * fixed fxa_users_services_devices_daily_events_v1 schema * updated table description for fxa_users_services_devices_daily_events_v1 * renamed fxa_users_services_devices_daily_events_v1 to fxa_users_services_devices_daily_v1
This commit is contained in:
Родитель
6371122594
Коммит
b62df40396
|
@ -300,6 +300,21 @@ with DAG(
|
|||
depends_on_past=False,
|
||||
)
|
||||
|
||||
firefox_accounts_derived__fxa_users_services_devices_daily__v1 = bigquery_etl_query(
|
||||
task_id="firefox_accounts_derived__fxa_users_services_devices_daily__v1",
|
||||
destination_table="fxa_users_services_devices_daily_v1",
|
||||
dataset_id="firefox_accounts_derived",
|
||||
project_id="moz-fx-data-shared-prod",
|
||||
owner="kignasiak@mozilla.com",
|
||||
email=[
|
||||
"dthorn@mozilla.com",
|
||||
"kignasiak@mozilla.com",
|
||||
"telemetry-alerts@mozilla.com",
|
||||
],
|
||||
date_partition_parameter="submission_date",
|
||||
depends_on_past=False,
|
||||
)
|
||||
|
||||
firefox_accounts_derived__nonprod_fxa_auth_events__v1 = bigquery_etl_query(
|
||||
task_id="firefox_accounts_derived__nonprod_fxa_auth_events__v1",
|
||||
destination_table="nonprod_fxa_auth_events_v1",
|
||||
|
@ -367,3 +382,11 @@ with DAG(
|
|||
firefox_accounts_derived__fxa_users_services_daily__v1.set_upstream(
|
||||
firefox_accounts_derived__fxa_content_events__v1
|
||||
)
|
||||
|
||||
firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream(
|
||||
firefox_accounts_derived__fxa_auth_events__v1
|
||||
)
|
||||
|
||||
firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream(
|
||||
firefox_accounts_derived__fxa_content_events__v1
|
||||
)
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
---
|
||||
|
||||
friendly_name: FxA Users Services Devices Daily
|
||||
description: |
|
||||
Contains one entry for each user, service, device combination observed
|
||||
on a daily basis. Only 'fxa_login - complete' and 'fxa_reg - complete'
|
||||
events are considered.
|
||||
|
||||
This model also serves as the basis for:
|
||||
- fxa_users_services_devices_first_seen_events_v1
|
||||
- fxa_users_services_devices_last_seen_events_v1
|
||||
|
||||
For more info on FxA data check:
|
||||
- https://docs.telemetry.mozilla.org/datasets/fxa.html
|
||||
- https://mozilla.github.io/ecosystem-platform/
|
||||
owners:
|
||||
- kignasiak@mozilla.com
|
||||
labels:
|
||||
application: fxa
|
||||
incremental: true
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_fxa_events
|
||||
date_partition_parameter: submission_date
|
||||
bigquery:
|
||||
time_partitioning:
|
||||
field: timestamp
|
||||
type: day
|
||||
require_partition_filter: true
|
||||
clustering:
|
||||
fields:
|
||||
- service
|
||||
- os_name
|
||||
- country
|
||||
|
||||
# Other notes:
|
||||
# The query also reads the previous day's events so that per-flow windows can cover flows that cross the midnight boundary.
|
||||
# TODO: maybe should mention this inside the query...
|
|
@ -0,0 +1,152 @@
|
|||
-- fxa_events: events dated @submission_date or the day before, minus the
-- event types excluded below. The later CTEs read only from this one:
-- entrypoints and utms use both days, device_service_users_entries narrows
-- back down to @submission_date.
WITH fxa_events AS (
|
||||
SELECT
|
||||
`timestamp`,
|
||||
user_id,
|
||||
-- a missing service on 'fxa_activity - cert_signed' events is reported as 'sync'
IF(service IS NULL AND event_type = 'fxa_activity - cert_signed', 'sync', service) AS service,
|
||||
device_id,
|
||||
os_name,
|
||||
flow_id,
|
||||
event_type,
|
||||
country,
|
||||
`language`,
|
||||
entrypoint,
|
||||
utm_term,
|
||||
utm_medium,
|
||||
utm_source,
|
||||
utm_campaign,
|
||||
utm_content,
|
||||
ua_version,
|
||||
ua_browser,
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.firefox_accounts.fxa_content_auth_oauth_events` -- TODO: this will need to be updated to fxa_all_events once unified
|
||||
WHERE
|
||||
DATE(`timestamp`)
|
||||
BETWEEN DATE_SUB(@submission_date, INTERVAL 1 DAY)
|
||||
AND @submission_date
|
||||
-- re-using the filter from users_services_daily_v1 for consistency across the models
|
||||
-- at some point in the future we should re-evaluate this list
|
||||
AND event_type NOT IN ( -- event types excluded from this model
|
||||
'fxa_email - bounced',
|
||||
'fxa_email - click',
|
||||
'fxa_email - sent',
|
||||
'fxa_reg - password_blocked',
|
||||
'fxa_reg - password_common',
|
||||
'fxa_reg - password_enrolled',
|
||||
'fxa_reg - password_missing',
|
||||
'fxa_sms - sent',
|
||||
'mktg - email_click',
|
||||
'mktg - email_open',
|
||||
'mktg - email_sent',
|
||||
'sync - repair_success',
|
||||
'sync - repair_triggered'
|
||||
)
|
||||
),
|
||||
-- entrypoints: maps each flow_id to the entrypoint of its earliest event
-- (by `timestamp`) that has both flow_id and entrypoint set.
-- NOTE(review): QUALIFY already keeps a single row per flow_id, so the
-- DISTINCT looks redundant here; confirm before removing.
entrypoints AS (
|
||||
SELECT DISTINCT
|
||||
flow_id,
|
||||
entrypoint
|
||||
FROM
|
||||
fxa_events
|
||||
WHERE
|
||||
-- if both values are not set then the record
|
||||
-- cannot be used for mapping
|
||||
flow_id IS NOT NULL
|
||||
AND entrypoint IS NOT NULL
|
||||
QUALIFY
|
||||
ROW_NUMBER() OVER (PARTITION BY flow_id ORDER BY `timestamp` ASC) = 1
|
||||
),
|
||||
-- utms: maps each flow_id to the utm_* values of the earliest event (by
-- `timestamp`) recorded for that flow. The window frame spans the whole
-- partition, so every row of a flow gets the same values and DISTINCT
-- collapses them.
-- NOTE(review): FIRST_VALUE is used without IGNORE NULLS and only flow_id is
-- filtered, so a flow whose first event has no utm_* values yields NULLs even
-- if later events carry them; confirm this is intended.
utms AS (
|
||||
SELECT DISTINCT
|
||||
flow_id,
|
||||
FIRST_VALUE(utm_term) OVER (_window) AS utm_term,
|
||||
FIRST_VALUE(utm_medium) OVER (_window) AS utm_medium,
|
||||
FIRST_VALUE(utm_source) OVER (_window) AS utm_source,
|
||||
FIRST_VALUE(utm_campaign) OVER (_window) AS utm_campaign,
|
||||
FIRST_VALUE(utm_content) OVER (_window) AS utm_content,
|
||||
FROM
|
||||
fxa_events
|
||||
WHERE
|
||||
flow_id IS NOT NULL
|
||||
WINDOW
|
||||
_window AS (
|
||||
PARTITION BY
|
||||
flow_id
|
||||
ORDER BY
|
||||
`timestamp` ASC
|
||||
ROWS BETWEEN
|
||||
UNBOUNDED PRECEDING
|
||||
AND UNBOUNDED FOLLOWING
|
||||
)
|
||||
),
|
||||
-- device_service_users_entries: distinct 'fxa_login - complete' and
-- 'fxa_reg - complete' events dated @submission_date that have a service set.
-- Entrypoint and utm_* columns are left out here; the final SELECT joins them
-- back in by flow_id.
device_service_users_entries AS (
|
||||
SELECT DISTINCT
|
||||
`timestamp`,
|
||||
user_id,
|
||||
service,
|
||||
device_id,
|
||||
os_name,
|
||||
flow_id,
|
||||
event_type,
|
||||
country,
|
||||
`language`,
|
||||
ua_version,
|
||||
ua_browser,
|
||||
FROM
|
||||
fxa_events
|
||||
WHERE
|
||||
DATE(`timestamp`) = @submission_date
|
||||
-- Filtering out for these specific events to be consistent with the logic used by
|
||||
-- fxa_users_daily_v1 and fxa_users_services_daily_v1
|
||||
AND ((event_type IN ('fxa_login - complete', 'fxa_reg - complete') AND service IS NOT NULL))
|
||||
)
|
||||
-- Final result: one row per (user_id, service, device_id), taken from the
-- earliest row in device_service_users_entries for that combination and
-- enriched with the entrypoint and utm_* values looked up by flow_id.
-- The LEFT JOINs keep rows whose flow has no entrypoint or utm record.
-- NOTE(review): ROW_NUMBER orders by `timestamp` only, so rows that share a
-- timestamp within a partition are presumably picked arbitrarily; confirm
-- whether a tie-breaker is needed for reproducible reruns.
SELECT
|
||||
-- device_service_users_entries
|
||||
device_service_users_entries.`timestamp`,
|
||||
device_service_users_entries.flow_id,
|
||||
device_service_users_entries.event_type,
|
||||
device_service_users_entries.user_id,
|
||||
device_service_users_entries.service,
|
||||
device_service_users_entries.device_id,
|
||||
device_service_users_entries.os_name,
|
||||
device_service_users_entries.country,
|
||||
device_service_users_entries.`language`,
|
||||
device_service_users_entries.ua_version,
|
||||
device_service_users_entries.ua_browser,
|
||||
-- entrypoints
|
||||
entrypoints.entrypoint,
|
||||
-- utms
|
||||
utms.utm_term,
|
||||
utms.utm_medium,
|
||||
utms.utm_source,
|
||||
utms.utm_campaign,
|
||||
utms.utm_content,
|
||||
FROM
|
||||
device_service_users_entries
|
||||
LEFT JOIN
|
||||
entrypoints
|
||||
USING
|
||||
(flow_id)
|
||||
LEFT JOIN
|
||||
utms
|
||||
USING
|
||||
(flow_id)
|
||||
WHERE
|
||||
-- making sure the user is registered
|
||||
user_id IS NOT NULL
|
||||
-- making sure there is a flow_id associated with this session
|
||||
-- the current logic relies on this value being set to retrieve
|
||||
-- its attributes correctly
|
||||
AND flow_id IS NOT NULL
|
||||
-- if either service or device_id is null then the record
|
||||
-- is useless for this model
|
||||
AND service IS NOT NULL
|
||||
AND device_id IS NOT NULL
|
||||
QUALIFY
|
||||
ROW_NUMBER() OVER (
|
||||
PARTITION BY
|
||||
user_id,
|
||||
service,
|
||||
device_id
|
||||
ORDER BY
|
||||
`timestamp` ASC
|
||||
) = 1 -- this could be partitioned by first_flow_id to handle multiple device_ids being attached to a single flow_id
|
|
@ -0,0 +1,114 @@
|
|||
fields:
|
||||
|
||||
- mode: NULLABLE
|
||||
name: timestamp
|
||||
type: TIMESTAMP
|
||||
description: |
|
||||
Datetime value when the event occurred.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: flow_id
|
||||
type: STRING
|
||||
description: |
|
||||
The flow identifier. A randomly-generated opaque id.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: event_type
|
||||
type: STRING
|
||||
description: |
|
||||
Type of the FxA event recorded.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: user_id
|
||||
type: STRING
|
||||
description:
|
||||
|
||||
- mode: NULLABLE
|
||||
name: service
|
||||
type: STRING
|
||||
description: |
|
||||
The service identifier. For Sync it may be empty or sync.
|
||||
For OAuth reliers it is their hex client id.
|
||||
|
||||
- mode: NULLABLE
|
||||
|
||||
|
||||
name: device_id
|
||||
type: STRING
|
||||
description: |
|
||||
The most granular field.
|
||||
The id of the device record.
|
||||
This does correlate back to a record in the FxA user db.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: os_name
|
||||
type: STRING
|
||||
description:
|
||||
|
||||
- mode: NULLABLE
|
||||
name: country
|
||||
type: STRING
|
||||
description:
|
||||
|
||||
- mode: NULLABLE
|
||||
name: language
|
||||
type: STRING
|
||||
description:
|
||||
|
||||
- mode: NULLABLE
|
||||
name: ua_version
|
||||
type: STRING
|
||||
description: |
|
||||
The user's browser version.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: ua_browser
|
||||
type: STRING
|
||||
description: |
|
||||
The user's web browser, e.g. 'Firefox' or 'Chrome'.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: entrypoint
|
||||
type: STRING
|
||||
description: |
|
||||
The entrypoint of the flow session.
|
||||
Typically a UI touchpoint like "preferences".
|
||||
|
||||
- mode: NULLABLE
|
||||
name: utm_term
|
||||
type: STRING
|
||||
description: |
|
||||
Marketing campaign search term
|
||||
for the flow session.
|
||||
Not stored if the DNT request header was 1.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: utm_medium
|
||||
type: STRING
|
||||
description: |
|
||||
Marketing campaign medium for the flow session.
|
||||
Not stored if the DNT request header was 1.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: utm_source
|
||||
type: STRING
|
||||
description: |
|
||||
Marketing campaign source for
|
||||
the flow session.
|
||||
Not stored if the DNT request header was 1.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: utm_campaign
|
||||
type: STRING
|
||||
description: |
|
||||
Marketing campaign identifier
|
||||
for the flow session.
|
||||
Not stored if the DNT request header was 1.
|
||||
|
||||
- mode: NULLABLE
|
||||
name: utm_content
|
||||
type: STRING
|
||||
description: |
|
||||
Marketing campaign content identifier
|
||||
for the flow session.
|
||||
Not stored if the DNT request header was 1.
|
Загрузка…
Ссылка в новой задаче