RS-306 - ETL for newtab interactions (#3073)
This commit is contained in:
Родитель
b6e38e240b
Коммит
f486dd1526
18
dags.yaml
18
dags.yaml
|
@ -766,3 +766,21 @@ bqetl_fog_decision_support:
|
|||
tags:
|
||||
- impact/tier_3
|
||||
- repo/bigquery-etl
|
||||
|
||||
bqetl_newtab:
|
||||
default_args:
|
||||
depends_on_past: false
|
||||
email:
|
||||
- telemetry-alerts@mozilla.com
|
||||
- anicholson@mozilla.com
|
||||
email_on_failure: true
|
||||
email_on_retry: true
|
||||
end_date: null
|
||||
owner: anicholson@mozilla.com
|
||||
retries: 2
|
||||
retry_delay: 30m
|
||||
start_date: '2022-07-01'
|
||||
description: Schedules newtab related queries.
|
||||
schedule_interval: daily
|
||||
tags:
|
||||
- impact/tier_1
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
|
||||
|
||||
from airflow import DAG
|
||||
from airflow.sensors.external_task import ExternalTaskMarker
|
||||
from airflow.sensors.external_task import ExternalTaskSensor
|
||||
from airflow.utils.task_group import TaskGroup
|
||||
import datetime
|
||||
from utils.constants import ALLOWED_STATES, FAILED_STATES
|
||||
from utils.gcp import bigquery_etl_query, gke_command
|
||||
|
||||
# Markdown description of the DAG; handed to the DAG constructor below as `doc_md`.
docs = """
### bqetl_newtab

Built from bigquery-etl repo, [`dags/bqetl_newtab.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_newtab.py)

#### Description

Schedules newtab related queries.
#### Owner

anicholson@mozilla.com
"""


# Defaults passed to the DAG as `default_args`.
default_args = {
    "owner": "anicholson@mozilla.com",
    "start_date": datetime.datetime(2022, 7, 1, 0, 0),
    "end_date": None,
    "email": ["telemetry-alerts@mozilla.com", "anicholson@mozilla.com"],
    "depends_on_past": False,
    # 1800 seconds = 30 minutes between retries.
    "retry_delay": datetime.timedelta(seconds=1800),
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 2,
}

tags = ["impact/tier_1", "repo/bigquery-etl"]

# DAG definition: one query task gated on an external-task sensor.
with DAG(
    "bqetl_newtab",
    default_args=default_args,
    schedule_interval="@daily",
    doc_md=docs,
    tags=tags,
) as dag:

    # Query task built by the project helper `bigquery_etl_query` (utils.gcp).
    # NOTE(review): presumably runs the newtab_interactions_v1 query and writes the
    # `submission_date` partition of telemetry_derived.newtab_interactions_v1 —
    # confirm against the helper's implementation.
    telemetry_derived__newtab_interactions__v1 = bigquery_etl_query(
        task_id="telemetry_derived__newtab_interactions__v1",
        destination_table="newtab_interactions_v1",
        dataset_id="telemetry_derived",
        project_id="moz-fx-data-shared-prod",
        owner="anicholson@mozilla.com",
        email=["anicholson@mozilla.com", "telemetry-alerts@mozilla.com"],
        date_partition_parameter="submission_date",
        depends_on_past=False,
    )

    # Sensor on task `copy_deduplicate_all` of the external DAG `copy_deduplicate`.
    wait_for_copy_deduplicate_all = ExternalTaskSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        # timedelta(days=-1, seconds=82800) is -1 hour (-86400 s + 82800 s).
        # NOTE(review): presumably this offsets to the upstream DAG's own schedule
        # time — confirm against copy_deduplicate's schedule.
        execution_delta=datetime.timedelta(days=-1, seconds=82800),
        check_existence=True,
        mode="reschedule",
        allowed_states=ALLOWED_STATES,
        failed_states=FAILED_STATES,
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    # Declare the sensor as the upstream dependency of the query task.
    telemetry_derived__newtab_interactions__v1.set_upstream(
        wait_for_copy_deduplicate_all
    )
|
|
@ -0,0 +1,7 @@
|
|||
-- View in the `telemetry` dataset that selects every column, unchanged, from the
-- derived table `telemetry_derived.newtab_interactions_v1`.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.telemetry.newtab_interactions`
AS
SELECT
  *
FROM
  `moz-fx-data-shared-prod.telemetry_derived.newtab_interactions_v1`
|
|
@ -0,0 +1,22 @@
|
|||
friendly_name: Newtab Interactions
|
||||
description: |-
|
||||
Interactions on the newtab page, roughly one row per newtab "visit" per-day
|
||||
(identified by the visit id).
|
||||
owners:
|
||||
- anicholson@mozilla.com
|
||||
labels:
|
||||
application: firefox
|
||||
incremental: true
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_newtab
|
||||
bigquery:
|
||||
time_partitioning:
|
||||
field: submission_date
|
||||
type: day
|
||||
require_partition_filter: true
|
||||
expiration_days: null
|
||||
clustering:
|
||||
fields:
|
||||
- channel
|
||||
- country_code
|
|
@ -0,0 +1,131 @@
|
|||
-- Aggregates the events of the Firefox Desktop `newtab` ping for @submission_date
-- into one row per (newtab_visit_id, client_id, submission_date, search_engine,
-- search_access_point), with counts of searches, search-ad and topsite
-- impressions/clicks plus client attributes taken from the ping.
WITH events_unnested AS (
  -- One row per event: flatten the ping's `events` array and keep only the
  -- event categories/names that the metrics below are derived from.
  SELECT
    DATE(submission_timestamp) AS submission_date,
    category AS event_category,
    name AS event_name,
    timestamp AS event_timestamp,
    client_info,
    metadata,
    normalized_os,
    normalized_os_version,
    normalized_country_code,
    normalized_channel,
    ping_info,
    extra AS event_details,
    metrics
  FROM
    -- https://dictionary.telemetry.mozilla.org/apps/firefox_desktop/pings/newtab
    `moz-fx-data-shared-prod.firefox_desktop_stable.newtab_v1`
  CROSS JOIN
    -- Explicit form of the original implicit comma join; same result.
    UNNEST(events)
  WHERE
    DATE(submission_timestamp) = @submission_date
    AND category IN ('newtab', 'topsites', 'newtab.search', 'newtab.search.ad')
    AND name IN ('closed', 'opened', 'impression', 'issued', 'click')
),
categorized_events AS (
  -- One row per event, tagged with boolean flags for each metric so the final
  -- SELECT can aggregate them with COUNTIF.
  SELECT
    -- Unique Identifiers
    client_info.client_id,
    mozfun.map.get_key(event_details, "newtab_visit_id") AS newtab_visit_id,
    -- Metrics
    event_name = "issued"
    AND event_category = "newtab.search" AS is_search_issued,
    -- TODO: is_tagged_search (not yet derived)
    -- TODO: is_follow_on_search (not yet derived)
    event_name = "impression"
    AND event_category = 'newtab.search.ad'
    AND mozfun.map.get_key(event_details, "is_tagged") = "true" AS is_tagged_search_ad_impression,
    event_name = "impression"
    AND event_category = 'newtab.search.ad'
    AND mozfun.map.get_key(
      event_details,
      "is_follow_on"
    ) = "true" AS is_follow_on_search_ad_impression,
    event_name = "click"
    AND event_category = 'newtab.search.ad'
    AND mozfun.map.get_key(event_details, "is_tagged") = "true" AS is_tagged_search_ad_click,
    event_name = "click"
    AND event_category = 'newtab.search.ad'
    AND mozfun.map.get_key(event_details, "is_follow_on") = "true" AS is_follow_on_search_ad_click,
    event_category = 'topsites'
    AND event_name = 'impression' AS is_topsite_impression,
    event_category = 'topsites'
    AND event_name = 'impression'
    AND mozfun.map.get_key(
      event_details,
      "is_sponsored"
    ) = "true" AS is_sponsored_topsite_impression,
    event_category = 'topsites'
    AND event_name = 'click' AS is_topsite_click,
    event_category = 'topsites'
    AND event_name = 'click'
    AND mozfun.map.get_key(event_details, "is_sponsored") = "true" AS is_sponsored_topsite_click,
    -- Event timestamps are only kept on the events that mark the visit boundaries.
    IF(event_name = "opened", event_timestamp, NULL) AS newtab_visit_started_at,
    IF(event_name = "closed", event_timestamp, NULL) AS newtab_visit_ended_at,
    -- Client/Session-unique attributes
    normalized_os,
    normalized_os_version,
    normalized_country_code,
    normalized_channel,
    client_info.app_display_version,
    mozfun.map.get_key(event_details, "source") AS newtab_open_source,
    -- Fixed: these two aliases were previously swapped (the private engine id
    -- was exposed as default_search_engine and vice versa).
    metrics.string.search_engine_default_engine_id AS default_search_engine,
    metrics.string.search_engine_private_engine_id AS default_private_search_engine,
    ping_info.experiments,
    -- TODO: private_browsing_mode (not yet derived)
    -- Partially unique session attributes
    mozfun.map.get_key(event_details, "telemetry_id") AS search_engine,
    mozfun.map.get_key(event_details, "search_access_point") AS search_access_point,
    -- TODO: topsite_advertiser_id (not yet derived)
    -- TODO: topsite_position (not yet derived)
    submission_date
  FROM
    events_unnested
)
SELECT
  newtab_visit_id,
  client_id,
  submission_date,
  search_engine,
  search_access_point,
  -- TODO: add topsite_advertiser_id and topsite_position here and in GROUP BY
  -- once they are derived in categorized_events.
  -- Attributes taken from an arbitrary event within the group.
  ANY_VALUE(experiments) AS experiments,
  ANY_VALUE(default_private_search_engine) AS default_private_search_engine,
  ANY_VALUE(default_search_engine) AS default_search_engine,
  ANY_VALUE(normalized_os) AS os,
  ANY_VALUE(normalized_os_version) AS os_version,
  ANY_VALUE(normalized_country_code) AS country_code,
  ANY_VALUE(normalized_channel) AS channel,
  ANY_VALUE(app_display_version) AS browser_version,
  "Firefox Desktop" AS browser_name,
  ANY_VALUE(newtab_open_source) AS newtab_open_source,
  MIN(newtab_visit_started_at) AS newtab_visit_started_at,
  -- NOTE(review): MIN keeps the earliest "closed" timestamp if a visit reports
  -- more than one; confirm whether the latest (MAX) is intended.
  MIN(newtab_visit_ended_at) AS newtab_visit_ended_at,
  COUNTIF(is_topsite_click) AS topsite_clicks,
  COUNTIF(is_sponsored_topsite_click) AS sponsored_topsite_clicks,
  COUNTIF(is_topsite_impression) AS topsite_impressions,
  COUNTIF(is_sponsored_topsite_impression) AS sponsored_topsite_impressions,
  COUNTIF(is_search_issued) AS searches,
  COUNTIF(is_tagged_search_ad_click) AS tagged_search_ad_clicks,
  COUNTIF(is_tagged_search_ad_impression) AS tagged_search_ad_impressions,
  COUNTIF(is_follow_on_search_ad_click) AS follow_on_search_ad_clicks,
  COUNTIF(is_follow_on_search_ad_impression) AS follow_on_search_ad_impressions,
  COUNTIF(
    is_tagged_search_ad_click
    AND is_follow_on_search_ad_click
  ) AS tagged_follow_on_search_ad_clicks,
  COUNTIF(
    is_tagged_search_ad_impression
    AND is_follow_on_search_ad_impression
  ) AS tagged_follow_on_search_ad_impressions
FROM
  categorized_events
GROUP BY
  newtab_visit_id,
  client_id,
  submission_date,
  search_engine,
  search_access_point
|
|
@ -0,0 +1,103 @@
|
|||
fields:
|
||||
- mode: NULLABLE
|
||||
name: newtab_visit_id
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: client_id
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: submission_date
|
||||
type: DATE
|
||||
- mode: NULLABLE
|
||||
name: search_engine
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: search_access_point
|
||||
type: STRING
|
||||
- fields:
|
||||
- mode: NULLABLE
|
||||
name: key
|
||||
type: STRING
|
||||
- fields:
|
||||
- mode: NULLABLE
|
||||
name: branch
|
||||
type: STRING
|
||||
- fields:
|
||||
- mode: NULLABLE
|
||||
name: type
|
||||
type: STRING
|
||||
mode: NULLABLE
|
||||
name: extra
|
||||
type: RECORD
|
||||
mode: NULLABLE
|
||||
name: value
|
||||
type: RECORD
|
||||
mode: REPEATED
|
||||
name: experiments
|
||||
type: RECORD
|
||||
- mode: NULLABLE
|
||||
name: default_private_search_engine
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: default_search_engine
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: os
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: os_version
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: country_code
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: channel
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: browser_version
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: browser_name
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: newtab_open_source
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: newtab_visit_started_at
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: newtab_visit_ended_at
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: topsite_clicks
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: sponsored_topsite_clicks
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: topsite_impressions
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: sponsored_topsite_impressions
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: searches
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: tagged_search_ad_clicks
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: tagged_search_ad_impressions
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: follow_on_search_ad_clicks
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: follow_on_search_ad_impressions
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: tagged_follow_on_search_ad_clicks
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: tagged_follow_on_search_ad_impressions
|
||||
type: INTEGER
|
Загрузка…
Ссылка в новой задаче