adding script to land files in GCS for merino migration (#6032)

* adding script to land files in GCS for merino migration

* formatting yaml

* formatting yaml again

* updating sql

* updating dag name

* adding dataset and table for merino exports

* updating check to add fail

* updating start date

* updating dag name
This commit is contained in:
Chelsey Beck 2024-08-09 16:52:40 -07:00 коммит произвёл GitHub
Родитель c4bdac3f32
Коммит 41e8c9b221
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
6 изменённых файлов: 119 добавлений и 0 удалений

Просмотреть файл

@ -1790,3 +1790,24 @@ bqetl_firefox_desktop_ad_click_history:
tags:
- repo/bigquery-etl
- impact/tier_2
# DAG that aggregates Newtab engagement data for Merino (see description below).
bqetl_merino_newtab_aggregates_to_gcs:
default_args:
depends_on_past: false
email:
- cbeck@mozilla.com
- gkatre@mozilla.com
# Notify the addresses above on failure only, not on each retry.
email_on_failure: true
email_on_retry: false
end_date: null
owner: cbeck@mozilla.com
# A failed task is retried up to 2 times; retry_delay of 5m is presumably 5 minutes.
retries: 2
retry_delay: 5m
start_date: '2024-08-12'
description: |
Aggregates Newtab engagement data that lands in a GCS bucket for Merino recommendations.
repo: bigquery-etl
# Cron expression: run every 20 minutes.
schedule_interval: "*/20 * * * *"
tags:
- repo/bigquery-etl
- impact/tier_1

Просмотреть файл

@ -0,0 +1,10 @@
# Dataset-level metadata for the Merino export dataset.
friendly_name: Merino External
description: |-
Data aggregated from Newtab engagements and exported to GCS for Merino consumption
dataset_base_acl: external
# Not flagged as a user-facing dataset.
user_facing: false
labels: {}
# Read-only (dataViewer) access for the mozilla-confidential workgroup.
workgroup_access:
- role: roles/bigquery.dataViewer
members:
- workgroup:mozilla-confidential

Просмотреть файл

@ -0,0 +1,16 @@
-- macro checks
-- Data-quality checks on the aggregate output; each one carries the #fail marker.
-- NOTE(review): presumably a #fail check fails the run instead of only warning — confirm against the bqetl checks documentation.
-- Every row must carry an item ID.
#fail
{{ not_null(["scheduled_corpus_item_id"]) }}
-- Exactly one row per item ID.
#fail
{{ is_unique(["scheduled_corpus_item_id"]) }}
-- Both counters must be populated.
#fail
{{ not_null(["impression_count"]) }}
#fail
{{ not_null(["click_count"]) }}
-- The output must contain at least one row.
#fail
{{ min_row_count(1) }}

Просмотреть файл

@ -0,0 +1,15 @@
# Table-level metadata for the Merino Newtab aggregates query.
friendly_name: Merino Newtab Aggregates
description: |-
Aggregated Newtab engagement data for Merino recommendations. Used to export JSON objects to GCS.
See https://mozilla-hub.atlassian.net/browse/MC-1256
owners:
- cbeck@mozilla.com
- gkatre@mozilla.com
labels:
incremental: false
owner: cbeck
# The table is not time-partitioned and the query receives no date partition parameter.
# NOTE(review): together with incremental: false this presumably means the table is rebuilt in full on each run — confirm.
bigquery:
time_partitioning: null
scheduling:
dag_name: bqetl_merino_newtab_aggregates_to_gcs
date_partition_parameter: null

Просмотреть файл

@ -0,0 +1,47 @@
-- Counts Pocket impression and click events per scheduled corpus item, using
-- Newtab pings submitted during the trailing 24 hours.
-- Output: one row per scheduled_corpus_item_id with impression_count and click_count.
WITH deduplicated_pings AS (
  -- Keep only the most recent copy of each ping, deduplicating on
  -- (submission date, document_id) within the last day of live data.
  SELECT
    events
  FROM
    `moz-fx-data-shared-prod.firefox_desktop_live.newtab_v1`
  WHERE
    submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
  QUALIFY
    ROW_NUMBER() OVER (
      PARTITION BY
        DATE(submission_timestamp),
        document_id
      ORDER BY
        submission_timestamp DESC
    ) = 1
),
flattened_newtab_events AS (
  -- One row per Pocket event that carries a scheduled corpus item ID.
  -- Only the columns the final aggregation needs are projected.
  SELECT
    unnested_events.name AS event_name,
    mozfun.map.get_key(
      unnested_events.extra,
      'scheduled_corpus_item_id'
    ) AS scheduled_corpus_item_id
  FROM
    deduplicated_pings
  CROSS JOIN
    UNNEST(events) AS unnested_events
  WHERE
    -- filter to Pocket events
    unnested_events.category = 'pocket'
    -- NOTE(review): 'save' and 'dismiss' are not counted below, but keeping them
    -- here means an item seen only through those events still yields a row with
    -- zero impressions and zero clicks. Confirm whether that is intended.
    AND unnested_events.name IN ('impression', 'click', 'save', 'dismiss')
    -- keep only data with a non-null scheduled corpus item ID
    AND mozfun.map.get_key(unnested_events.extra, 'scheduled_corpus_item_id') IS NOT NULL
)
SELECT
  scheduled_corpus_item_id,
  COUNTIF(event_name = 'impression') AS impression_count,
  COUNTIF(event_name = 'click') AS click_count
FROM
  flattened_newtab_events
GROUP BY
  scheduled_corpus_item_id;

Просмотреть файл

@ -0,0 +1,10 @@
# Output schema: one item ID column plus its impression and click counters.
# NOTE(review): all three columns are NULLABLE here; verify whether any data checks on this table require them to be non-null.
fields:
- mode: NULLABLE
name: scheduled_corpus_item_id
type: STRING
- mode: NULLABLE
name: impression_count
type: INTEGER
- mode: NULLABLE
name: click_count
type: INTEGER