From b62df40396ab9ad922160a2114f6195dd7701e6f Mon Sep 17 00:00:00 2001 From: kik-kik <42538694+kik-kik@users.noreply.github.com> Date: Wed, 30 Nov 2022 14:16:07 +0000 Subject: [PATCH] feat(DENG-351): fxa_users_services_devices_daily_v1 model added to firefox_accounts_derived dataset (#3378) * fxa_users_services_devices_daily_events_v1 model added to firefox_accounts_derived dataset * added schema.yaml for fxa_users_services_devices_daily_events_v1 * removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1 * removed invalid comment from metadata.yaml for fxa_users_services_devices_daily_events_v1 * added bigquery table configuration * added more comments as requested by @gkabbz in PR#3378 * regenerated dags/bqetl_fxa_events.py * made some tweaks to the query to make it more explicit * fixed fxa_events DAG * fixed fxa_users_services_devices_daily_events_v1 schema * updated table description for fxa_users_services_devices_daily_events_v1 * renamed fxa_users_services_devices_daily_events_v1 to fxa_users_services_devices_daily_v1 --- dags/bqetl_fxa_events.py | 23 +++ .../metadata.yaml | 38 +++++ .../query.sql | 152 ++++++++++++++++++ .../schema.yaml | 114 +++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/metadata.yaml create mode 100644 sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/query.sql create mode 100644 sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/schema.yaml diff --git a/dags/bqetl_fxa_events.py b/dags/bqetl_fxa_events.py index c6baca8afb..4810e81f3f 100644 --- a/dags/bqetl_fxa_events.py +++ b/dags/bqetl_fxa_events.py @@ -300,6 +300,21 @@ with DAG( depends_on_past=False, ) + firefox_accounts_derived__fxa_users_services_devices_daily__v1 = bigquery_etl_query( + task_id="firefox_accounts_derived__fxa_users_services_devices_daily__v1", + 
destination_table="fxa_users_services_devices_daily_v1", + dataset_id="firefox_accounts_derived", + project_id="moz-fx-data-shared-prod", + owner="kignasiak@mozilla.com", + email=[ + "dthorn@mozilla.com", + "kignasiak@mozilla.com", + "telemetry-alerts@mozilla.com", + ], + date_partition_parameter="submission_date", + depends_on_past=False, + ) + firefox_accounts_derived__nonprod_fxa_auth_events__v1 = bigquery_etl_query( task_id="firefox_accounts_derived__nonprod_fxa_auth_events__v1", destination_table="nonprod_fxa_auth_events_v1", @@ -367,3 +382,11 @@ with DAG( firefox_accounts_derived__fxa_users_services_daily__v1.set_upstream( firefox_accounts_derived__fxa_content_events__v1 ) + + firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream( + firefox_accounts_derived__fxa_auth_events__v1 + ) + + firefox_accounts_derived__fxa_users_services_devices_daily__v1.set_upstream( + firefox_accounts_derived__fxa_content_events__v1 + ) diff --git a/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/metadata.yaml b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/metadata.yaml new file mode 100644 index 0000000000..4734321e30 --- /dev/null +++ b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/metadata.yaml @@ -0,0 +1,38 @@ +--- + +friendly_name: FxA Users Services Devices Daily Events +description: | + Contains entries for user, service, device combination observed + on daily basis. Only 'fxa_login - complete' and 'fxa_reg - complete' + events are considered. 
+ + This model also serves as the basis for: + - fxa_users_services_devices_first_seen_events_v1 + - fxa_users_services_devices_last_seen_events_v1 + + For more info on FxA data check: + - https://docs.telemetry.mozilla.org/datasets/fxa.html + - https://mozilla.github.io/ecosystem-platform/ +owners: + - kignasiak@mozilla.com +labels: + application: fxa + incremental: true + schedule: daily +scheduling: + dag_name: bqetl_fxa_events + date_partition_parameter: submission_date +bigquery: + time_partitioning: + field: timestamp + type: day + require_partition_filter: true + clustering: + fields: + - service + - os_name + - country + +# Other notes: +# windows over events that may cross the midnight boundary. +# TODO: maybe should mention this inside the query... diff --git a/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/query.sql b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/query.sql new file mode 100644 index 0000000000..6d95d9c197 --- /dev/null +++ b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/query.sql @@ -0,0 +1,152 @@ +WITH fxa_events AS ( + SELECT + `timestamp`, + user_id, + IF(service IS NULL AND event_type = 'fxa_activity - cert_signed', 'sync', service) AS service, + device_id, + os_name, + flow_id, + event_type, + country, + `language`, + entrypoint, + utm_term, + utm_medium, + utm_source, + utm_campaign, + utm_content, + ua_version, + ua_browser, + FROM + `moz-fx-data-shared-prod.firefox_accounts.fxa_content_auth_oauth_events` -- TODO: this will need to be updated to fxa_all_events once unified + WHERE + DATE(`timestamp`) + BETWEEN DATE_SUB(@submission_date, INTERVAL 1 DAY) + AND @submission_date + -- re-using the filter from users_services_daily_v1 for consistency across the models + -- at some point in the future we should re-evaluate this list + AND event_type NOT IN ( -- + 'fxa_email - bounced', + 'fxa_email - click', + 
'fxa_email - sent', + 'fxa_reg - password_blocked', + 'fxa_reg - password_common', + 'fxa_reg - password_enrolled', + 'fxa_reg - password_missing', + 'fxa_sms - sent', + 'mktg - email_click', + 'mktg - email_open', + 'mktg - email_sent', + 'sync - repair_success', + 'sync - repair_triggered' + ) +), +entrypoints AS ( + SELECT DISTINCT + flow_id, + entrypoint + FROM + fxa_events + WHERE + -- if both values are not set then the record + -- cannot be used for mapping + flow_id IS NOT NULL + AND entrypoint IS NOT NULL + QUALIFY + ROW_NUMBER() OVER (PARTITION BY flow_id ORDER BY `timestamp` ASC) = 1 +), +utms AS ( + SELECT DISTINCT + flow_id, + FIRST_VALUE(utm_term) OVER (_window) AS utm_term, + FIRST_VALUE(utm_medium) OVER (_window) AS utm_medium, + FIRST_VALUE(utm_source) OVER (_window) AS utm_source, + FIRST_VALUE(utm_campaign) OVER (_window) AS utm_campaign, + FIRST_VALUE(utm_content) OVER (_window) AS utm_content, + FROM + fxa_events + WHERE + flow_id IS NOT NULL + WINDOW + _window AS ( + PARTITION BY + flow_id + ORDER BY + `timestamp` ASC + ROWS BETWEEN + UNBOUNDED PRECEDING + AND UNBOUNDED FOLLOWING + ) +), +device_service_users_entries AS ( + SELECT DISTINCT + `timestamp`, + user_id, + service, + device_id, + os_name, + flow_id, + event_type, + country, + `language`, + ua_version, + ua_browser, + FROM + fxa_events + WHERE + DATE(`timestamp`) = @submission_date + -- Filtering out for these specific events to be consistent with the logic used by + -- fxa_users_daily_v1 and fxa_users_services_daily_v1 + AND ((event_type IN ('fxa_login - complete', 'fxa_reg - complete') AND service IS NOT NULL)) +) +SELECT + -- device_service_users_entries + device_service_users_entries.`timestamp`, + device_service_users_entries.flow_id, + device_service_users_entries.event_type, + device_service_users_entries.user_id, + device_service_users_entries.service, + device_service_users_entries.device_id, + device_service_users_entries.os_name, + device_service_users_entries.country, + 
device_service_users_entries.`language`, + device_service_users_entries.ua_version, + device_service_users_entries.ua_browser, + -- entrypoints + entrypoints.entrypoint, + -- utms + utms.utm_term, + utms.utm_medium, + utms.utm_source, + utms.utm_campaign, + utms.utm_content, +FROM + device_service_users_entries +LEFT JOIN + entrypoints +USING + (flow_id) +LEFT JOIN + utms +USING + (flow_id) +WHERE + -- making sure the user is registered + user_id IS NOT NULL + -- making sure there is a flow_id associated with this session + -- the current logic relies on this value being set to retrieve + -- its attributes correctly + AND flow_id IS NOT NULL + -- if either service or device_id is null then the record + -- is useless for this model + AND service IS NOT NULL + AND device_id IS NOT NULL +QUALIFY + ROW_NUMBER() OVER ( + PARTITION BY + user_id, + service, + device_id + ORDER BY + `timestamp` ASC + ) = 1 -- this could be partitioned by first_flow_id to handle multiple device_ids being attached to a single flow_id diff --git a/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/schema.yaml b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/schema.yaml new file mode 100644 index 0000000000..3d1bdecb25 --- /dev/null +++ b/sql/moz-fx-data-shared-prod/firefox_accounts_derived/fxa_users_services_devices_daily_v1/schema.yaml @@ -0,0 +1,114 @@ +fields: + +- mode: NULLABLE + name: timestamp + type: TIMESTAMP + description: | + Datetime value when the event occurred. + +- mode: NULLABLE + name: flow_id + type: STRING + description: | + The flow identifier. A randomly-generated opaque id. + +- mode: NULLABLE + name: event_type + type: STRING + description: | + Type of the FxA event recorded. + +- mode: NULLABLE + name: user_id + type: STRING + description: + +- mode: NULLABLE + name: service + type: STRING + description: | + The service identifier. For Sync it may be empty or sync. 
+ For OAuth reliers it is their hex client id. + +- mode: NULLABLE + + + name: device_id + type: STRING + description: | + The most granular field. + The id of the device record. + This does correlate back to a record in the FxA user db. + +- mode: NULLABLE + name: os_name + type: STRING + description: + +- mode: NULLABLE + name: country + type: STRING + description: + +- mode: NULLABLE + name: language + type: STRING + description: + +- mode: NULLABLE + name: ua_version + type: STRING + description: | + The user's browser version. + +- mode: NULLABLE + name: ua_browser + type: STRING + description: | + The user's web browser, e.g. 'Firefox' or 'Chrome'. + +- mode: NULLABLE + name: entrypoint + type: STRING + description: | + The entrypoint of the flow session. + Typically a UI touchpoint like "preferences". + +- mode: NULLABLE + name: utm_term + type: STRING + description: | + Marketing campaign search term + for the flow session. + Not stored if the DNT request header was 1. + +- mode: NULLABLE + name: utm_medium + type: STRING + description: | + Marketing campaign medium for the flow session. + Not stored if the DNT request header was 1. + +- mode: NULLABLE + name: utm_source + type: STRING + description: | + Marketing campaign source for + the flow session. + Not stored if the DNT request header was 1. + +- mode: NULLABLE + name: utm_campaign + type: STRING + description: | + Marketing campaign identifier + for the flow session. + Not stored if the DNT request header was 1. + +- mode: NULLABLE + name: utm_content + type: STRING + description: | + Marketing campaign content identifier + for the flow session. + Not stored if the DNT request header was 1.