adding script to land files in GCS for merino migration (#6032)
* adding script to land files in GCS for merino migration * formatting yaml * formatting yaml again * updating sql * updating dag name * adding dataset and table for merino exports * updating check to add fail * updating start date * updating dag name
This commit is contained in:
Родитель
c4bdac3f32
Коммит
41e8c9b221
21
dags.yaml
21
dags.yaml
|
@ -1790,3 +1790,24 @@ bqetl_firefox_desktop_ad_click_history:
|
|||
tags:
|
||||
- repo/bigquery-etl
|
||||
- impact/tier_2
|
||||
|
||||
bqetl_merino_newtab_aggregates_to_gcs:
|
||||
default_args:
|
||||
depends_on_past: false
|
||||
email:
|
||||
- cbeck@mozilla.com
|
||||
- gkatre@mozilla.com
|
||||
email_on_failure: true
|
||||
email_on_retry: false
|
||||
end_date: null
|
||||
owner: cbeck@mozilla.com
|
||||
retries: 2
|
||||
retry_delay: 5m
|
||||
start_date: '2024-08-12'
|
||||
description: |
|
||||
Aggregates Newtab engagement data that lands in a GCS bucket for Merino recommendations.
|
||||
repo: bigquery-etl
|
||||
schedule_interval: "*/20 * * * *"
|
||||
tags:
|
||||
- repo/bigquery-etl
|
||||
- impact/tier_1
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
friendly_name: Merino External
|
||||
description: |-
|
||||
Data aggregated from Newtab engagements and exported to GCS for Merino consumption
|
||||
dataset_base_acl: external
|
||||
user_facing: false
|
||||
labels: {}
|
||||
workgroup_access:
|
||||
- role: roles/bigquery.dataViewer
|
||||
members:
|
||||
- workgroup:mozilla-confidential
|
|
@ -0,0 +1,16 @@
|
|||
-- Data checks written with check macros; every check below is tagged #fail.
|
||||
|
||||
#fail
|
||||
{{ not_null(["scheduled_corpus_item_id"]) }}
|
||||
|
||||
#fail
|
||||
{{ is_unique(["scheduled_corpus_item_id"]) }}
|
||||
|
||||
#fail
|
||||
{{ not_null(["impression_count"]) }}
|
||||
|
||||
#fail
|
||||
{{ not_null(["click_count"]) }}
|
||||
|
||||
#fail
|
||||
{{ min_row_count(1) }}
|
|
@ -0,0 +1,15 @@
|
|||
friendly_name: Merino Newtab Aggregates
|
||||
description: |-
|
||||
Aggregated Newtab engagement data for Merino recommendations. Used to export JSON objects to GCS.
|
||||
See https://mozilla-hub.atlassian.net/browse/MC-1256
|
||||
owners:
|
||||
- cbeck@mozilla.com
|
||||
- gkatre@mozilla.com
|
||||
labels:
|
||||
incremental: false
|
||||
owner: cbeck
|
||||
bigquery:
|
||||
time_partitioning: null
|
||||
scheduling:
|
||||
dag_name: bqetl_merino_newtab_aggregates_to_gcs
|
||||
date_partition_parameter: null
|
|
@ -0,0 +1,47 @@
|
|||
-- Impression and click counts per scheduled corpus item for Pocket events,
-- computed over live newtab pings submitted within the last day (rolling
-- window relative to CURRENT_TIMESTAMP(), not a fixed calendar date).
-- Output: one row per scheduled_corpus_item_id.
WITH deduplicated_pings AS (
  -- Keep only the most recently submitted copy of each ping (document_id)
  -- within a submission date.
  -- NOTE(review): the one-day rolling window can span two calendar dates, so a
  -- document_id seen on both dates is kept once per date -- confirm that this
  -- is the intended deduplication scope.
  SELECT
    events
  FROM
    `moz-fx-data-shared-prod.firefox_desktop_live.newtab_v1`
  WHERE
    submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
  QUALIFY
    ROW_NUMBER() OVER (
      PARTITION BY
        DATE(submission_timestamp),
        document_id
      ORDER BY
        submission_timestamp DESC
    ) = 1
),
pocket_item_events AS (
  -- One row per Pocket event that carries a scheduled corpus item ID.
  SELECT
    unnested_events.name AS event_name,
    mozfun.map.get_key(
      unnested_events.extra,
      'scheduled_corpus_item_id'
    ) AS scheduled_corpus_item_id
  FROM
    deduplicated_pings
  CROSS JOIN
    UNNEST(events) AS unnested_events
  WHERE
    -- filter to Pocket events; 'save' and 'dismiss' are kept so that an item
    -- with only those events still yields an output row with zero counts
    unnested_events.category = 'pocket'
    AND unnested_events.name IN ('impression', 'click', 'save', 'dismiss')
    -- keep only data with a non-null scheduled corpus item ID
    AND mozfun.map.get_key(unnested_events.extra, 'scheduled_corpus_item_id') IS NOT NULL
)
SELECT
  scheduled_corpus_item_id,
  COUNTIF(event_name = 'impression') AS impression_count,
  COUNTIF(event_name = 'click') AS click_count
FROM
  pocket_item_events
GROUP BY
  scheduled_corpus_item_id;
|
|
@ -0,0 +1,10 @@
|
|||
fields:
|
||||
- mode: NULLABLE
|
||||
name: scheduled_corpus_item_id
|
||||
type: STRING
|
||||
- mode: NULLABLE
|
||||
name: impression_count
|
||||
type: INTEGER
|
||||
- mode: NULLABLE
|
||||
name: click_count
|
||||
type: INTEGER
|
Загрузка…
Ссылка в новой задаче