RS-306 - ETL for newtab interactions (#3073)

This commit is contained in:
Alexander 2022-07-18 16:32:03 -04:00 коммит произвёл GitHub
Родитель b6e38e240b
Коммит f486dd1526
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 353 добавлений и 0 удалений

Просмотреть файл

@ -766,3 +766,21 @@ bqetl_fog_decision_support:
tags:
- impact/tier_3
- repo/bigquery-etl
# DAG definition for the newtab ETL; the Airflow DAG file dags/bqetl_newtab.py
# is generated from this entry.
bqetl_newtab:
  default_args:
    depends_on_past: false
    email:
    - telemetry-alerts@mozilla.com
    - anicholson@mozilla.com
    email_on_failure: true
    email_on_retry: true
    end_date: null
    owner: anicholson@mozilla.com
    retries: 2
    retry_delay: 30m
    start_date: '2022-07-01'
  description: Schedules newtab related queries.
  schedule_interval: daily
  tags:
  - impact/tier_1
  # Keep in sync with the generated DAG, which declares both tags.
  - repo/bigquery-etl

72
dags/bqetl_newtab.py Normal file
Просмотреть файл

@ -0,0 +1,72 @@
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
from airflow import DAG
from airflow.sensors.external_task import ExternalTaskMarker
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.task_group import TaskGroup
import datetime
from utils.constants import ALLOWED_STATES, FAILED_STATES
from utils.gcp import bigquery_etl_query, gke_command
docs = """
### bqetl_newtab
Built from bigquery-etl repo, [`dags/bqetl_newtab.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_newtab.py)
#### Description
Schedules newtab related queries.
#### Owner
anicholson@mozilla.com
"""

# Defaults handed to the DAG constructor as `default_args`.
default_args = dict(
    owner="anicholson@mozilla.com",
    start_date=datetime.datetime(2022, 7, 1),
    end_date=None,
    email=["telemetry-alerts@mozilla.com", "anicholson@mozilla.com"],
    depends_on_past=False,
    retry_delay=datetime.timedelta(minutes=30),
    email_on_failure=True,
    email_on_retry=True,
    retries=2,
)

tags = ["impact/tier_1", "repo/bigquery-etl"]

with DAG(
    dag_id="bqetl_newtab",
    default_args=default_args,
    schedule_interval="@daily",
    doc_md=docs,
    tags=tags,
) as dag:
    # Query task writing telemetry_derived.newtab_interactions_v1, with the run
    # date passed through the `submission_date` partition parameter.
    newtab_interactions_task = bigquery_etl_query(
        task_id="telemetry_derived__newtab_interactions__v1",
        destination_table="newtab_interactions_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="anicholson@mozilla.com",
        email=["anicholson@mozilla.com", "telemetry-alerts@mozilla.com"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
    )

    # Sensor on copy_deduplicate.copy_deduplicate_all. The execution_delta is
    # minus one hour (identical to timedelta(days=-1, seconds=82800)).
    # NOTE(review): presumably this targets the external run scheduled one hour
    # after this DAG's logical date; confirm against copy_deduplicate's schedule.
    copy_deduplicate_sensor = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(hours=-1),
        check_existence=True,
        mode="reschedule",
        allowed_states=ALLOWED_STATES,
        failed_states=FAILED_STATES,
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    # The query only starts once the sensor has succeeded.
    copy_deduplicate_sensor >> newtab_interactions_task

Просмотреть файл

@ -0,0 +1,7 @@
-- View over the derived newtab_interactions_v1 table; passes every column
-- through unchanged.
CREATE OR REPLACE VIEW `moz-fx-data-shared-prod.telemetry.newtab_interactions` AS
SELECT *
FROM `moz-fx-data-shared-prod.telemetry_derived.newtab_interactions_v1`

Просмотреть файл

@ -0,0 +1,22 @@
# Table metadata for the newtab interactions query.
friendly_name: Newtab Interactions
description: |-
  Interactions on the newtab page, roughly one row per newtab "visit" per-day
  (identified by the visit id).
owners:
- anicholson@mozilla.com
labels:
  application: firefox
  incremental: true
  schedule: daily
# Airflow DAG that runs this query.
scheduling:
  dag_name: bqetl_newtab
bigquery:
  # Day-partitioned on submission_date; queries must filter on the partition
  # column and partitions do not expire.
  time_partitioning:
    field: submission_date
    type: day
    require_partition_filter: true
    expiration_days: null
  clustering:
    fields:
    - channel
    - country_code

Просмотреть файл

@ -0,0 +1,131 @@
-- Aggregates one day of Firefox Desktop `newtab` ping events into newtab
-- interactions. Rows are grouped by (newtab_visit_id, client_id,
-- submission_date, search_engine, search_access_point), so one visit can yield
-- several rows when its events carry different search_engine or
-- search_access_point values.
WITH events_unnested AS (
  -- One row per event of the newtab pings submitted on @submission_date,
  -- limited to the event categories and names used downstream.
  SELECT
    DATE(submission_timestamp) AS submission_date,
    category AS event_category,
    name AS event_name,
    timestamp AS event_timestamp,
    client_info,
    metadata,
    normalized_os,
    normalized_os_version,
    normalized_country_code,
    normalized_channel,
    ping_info,
    extra AS event_details,
    metrics
  FROM
    -- https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/newtab
    `moz-fx-data-shared-prod.firefox_desktop_stable.newtab_v1`
  CROSS JOIN
    UNNEST(events)
  WHERE
    DATE(submission_timestamp) = @submission_date
    AND category IN ('newtab', 'topsites', 'newtab.search', 'newtab.search.ad')
    AND name IN ('closed', 'opened', 'impression', 'issued', 'click')
),
categorized_events AS (
  -- Classifies every event with boolean flags (counted in the final SELECT)
  -- and carries along the client and visit attributes.
  SELECT
    -- Unique identifiers
    client_info.client_id,
    mozfun.map.get_key(event_details, 'newtab_visit_id') AS newtab_visit_id,
    -- Search metrics
    -- TODO: is_tagged_search and is_follow_on_search are not derived yet.
    (event_name = 'issued' AND event_category = 'newtab.search') AS is_search_issued,
    (
      event_name = 'impression'
      AND event_category = 'newtab.search.ad'
      AND mozfun.map.get_key(event_details, 'is_tagged') = 'true'
    ) AS is_tagged_search_ad_impression,
    (
      event_name = 'impression'
      AND event_category = 'newtab.search.ad'
      AND mozfun.map.get_key(event_details, 'is_follow_on') = 'true'
    ) AS is_follow_on_search_ad_impression,
    (
      event_name = 'click'
      AND event_category = 'newtab.search.ad'
      AND mozfun.map.get_key(event_details, 'is_tagged') = 'true'
    ) AS is_tagged_search_ad_click,
    (
      event_name = 'click'
      AND event_category = 'newtab.search.ad'
      AND mozfun.map.get_key(event_details, 'is_follow_on') = 'true'
    ) AS is_follow_on_search_ad_click,
    -- Topsite metrics
    (event_category = 'topsites' AND event_name = 'impression') AS is_topsite_impression,
    (
      event_category = 'topsites'
      AND event_name = 'impression'
      AND mozfun.map.get_key(event_details, 'is_sponsored') = 'true'
    ) AS is_sponsored_topsite_impression,
    (event_category = 'topsites' AND event_name = 'click') AS is_topsite_click,
    (
      event_category = 'topsites'
      AND event_name = 'click'
      AND mozfun.map.get_key(event_details, 'is_sponsored') = 'true'
    ) AS is_sponsored_topsite_click,
    -- Visit boundaries: only `opened` / `closed` events contribute a timestamp.
    IF(event_name = 'opened', event_timestamp, NULL) AS newtab_visit_started_at,
    IF(event_name = 'closed', event_timestamp, NULL) AS newtab_visit_ended_at,
    -- Client/session-unique attributes
    normalized_os,
    normalized_os_version,
    normalized_country_code,
    normalized_channel,
    client_info.app_display_version,
    mozfun.map.get_key(event_details, 'source') AS newtab_open_source,
    -- Each alias must match the metric it reads: the default-engine metric is
    -- the default search engine, the private-engine metric is the default
    -- private search engine.
    metrics.string.search_engine_default_engine_id AS default_search_engine,
    metrics.string.search_engine_private_engine_id AS default_private_search_engine,
    ping_info.experiments,
    -- TODO: private_browsing_mode is not derived yet.
    -- Partially unique session attributes
    mozfun.map.get_key(event_details, 'telemetry_id') AS search_engine,
    mozfun.map.get_key(event_details, 'search_access_point') AS search_access_point,
    -- TODO: topsite_advertiser_id and topsite_position are not derived yet; when
    -- they are, add them to the final SELECT and its GROUP BY as well.
    submission_date
  FROM
    events_unnested
)
SELECT
  newtab_visit_id,
  client_id,
  submission_date,
  search_engine,
  search_access_point,
  ANY_VALUE(experiments) AS experiments,
  ANY_VALUE(default_private_search_engine) AS default_private_search_engine,
  ANY_VALUE(default_search_engine) AS default_search_engine,
  ANY_VALUE(normalized_os) AS os,
  ANY_VALUE(normalized_os_version) AS os_version,
  ANY_VALUE(normalized_country_code) AS country_code,
  ANY_VALUE(normalized_channel) AS channel,
  ANY_VALUE(app_display_version) AS browser_version,
  'Firefox Desktop' AS browser_name,
  ANY_VALUE(newtab_open_source) AS newtab_open_source,
  MIN(newtab_visit_started_at) AS newtab_visit_started_at,
  MIN(newtab_visit_ended_at) AS newtab_visit_ended_at,
  COUNTIF(is_topsite_click) AS topsite_clicks,
  COUNTIF(is_sponsored_topsite_click) AS sponsored_topsite_clicks,
  COUNTIF(is_topsite_impression) AS topsite_impressions,
  COUNTIF(is_sponsored_topsite_impression) AS sponsored_topsite_impressions,
  COUNTIF(is_search_issued) AS searches,
  COUNTIF(is_tagged_search_ad_click) AS tagged_search_ad_clicks,
  COUNTIF(is_tagged_search_ad_impression) AS tagged_search_ad_impressions,
  COUNTIF(is_follow_on_search_ad_click) AS follow_on_search_ad_clicks,
  COUNTIF(is_follow_on_search_ad_impression) AS follow_on_search_ad_impressions,
  COUNTIF(
    is_tagged_search_ad_click
    AND is_follow_on_search_ad_click
  ) AS tagged_follow_on_search_ad_clicks,
  COUNTIF(
    is_tagged_search_ad_impression
    AND is_follow_on_search_ad_impression
  ) AS tagged_follow_on_search_ad_impressions
FROM
  categorized_events
GROUP BY
  newtab_visit_id,
  client_id,
  submission_date,
  search_engine,
  search_access_point

Просмотреть файл

@ -0,0 +1,103 @@
# Column schema; names and order follow the SELECT list of the newtab
# interactions query.
fields:
# Grouping keys
- mode: NULLABLE
  name: newtab_visit_id
  type: STRING
- mode: NULLABLE
  name: client_id
  type: STRING
- mode: NULLABLE
  name: submission_date
  type: DATE
- mode: NULLABLE
  name: search_engine
  type: STRING
- mode: NULLABLE
  name: search_access_point
  type: STRING
# Repeated record of {key, value: {branch, extra: {type}}}
- fields:
  - mode: NULLABLE
    name: key
    type: STRING
  - fields:
    - mode: NULLABLE
      name: branch
      type: STRING
    - fields:
      - mode: NULLABLE
        name: type
        type: STRING
      mode: NULLABLE
      name: extra
      type: RECORD
    mode: NULLABLE
    name: value
    type: RECORD
  mode: REPEATED
  name: experiments
  type: RECORD
# Client and visit attributes
- mode: NULLABLE
  name: default_private_search_engine
  type: STRING
- mode: NULLABLE
  name: default_search_engine
  type: STRING
- mode: NULLABLE
  name: os
  type: STRING
- mode: NULLABLE
  name: os_version
  type: STRING
- mode: NULLABLE
  name: country_code
  type: STRING
- mode: NULLABLE
  name: channel
  type: STRING
- mode: NULLABLE
  name: browser_version
  type: STRING
- mode: NULLABLE
  name: browser_name
  type: STRING
- mode: NULLABLE
  name: newtab_open_source
  type: STRING
# Visit boundaries, stored as integer event timestamps
- mode: NULLABLE
  name: newtab_visit_started_at
  type: INTEGER
- mode: NULLABLE
  name: newtab_visit_ended_at
  type: INTEGER
# Event counts
- mode: NULLABLE
  name: topsite_clicks
  type: INTEGER
- mode: NULLABLE
  name: sponsored_topsite_clicks
  type: INTEGER
- mode: NULLABLE
  name: topsite_impressions
  type: INTEGER
- mode: NULLABLE
  name: sponsored_topsite_impressions
  type: INTEGER
- mode: NULLABLE
  name: searches
  type: INTEGER
- mode: NULLABLE
  name: tagged_search_ad_clicks
  type: INTEGER
- mode: NULLABLE
  name: tagged_search_ad_impressions
  type: INTEGER
- mode: NULLABLE
  name: follow_on_search_ad_clicks
  type: INTEGER
- mode: NULLABLE
  name: follow_on_search_ad_impressions
  type: INTEGER
- mode: NULLABLE
  name: tagged_follow_on_search_ad_clicks
  type: INTEGER
- mode: NULLABLE
  name: tagged_follow_on_search_ad_impressions
  type: INTEGER