Bug 1635906 Add client ID lookup table for AET (#1335)

* Bug 1635906 Add client ID lookup table for AET

Builds on top of the eco_uid lookup table added in #1323

* Exempt from dry run
This commit is contained in:
Jeff Klukas 2020-09-24 08:43:18 -04:00 коммит произвёл GitHub
Родитель 7e4054fb5e
Коммит 504c007d6c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 95 добавлений и 0 удалений

Просмотреть файл

@ -20,6 +20,7 @@ import sys
SKIP = {
# Access Denied
"sql/account_ecosystem_derived/ecosystem_client_id_lookup_v1/query.sql",
"sql/activity_stream/impression_stats_flat/view.sql",
"sql/activity_stream/tile_id_types/view.sql",
"sql/monitoring/deletion_request_volume_v1/query.sql",

Просмотреть файл

@ -20,6 +20,18 @@ with DAG(
"bqetl_account_ecosystem", default_args=default_args, schedule_interval="0 2 * * *"
) as dag:
# Task that builds the ecosystem client ID lookup table: destination is
# account_ecosystem_derived.ecosystem_client_id_lookup_v1 in
# moz-fx-data-shared-prod.
account_ecosystem_derived__ecosystem_client_id_lookup__v1 = bigquery_etl_query(
task_id="account_ecosystem_derived__ecosystem_client_id_lookup__v1",
destination_table="ecosystem_client_id_lookup_v1",
dataset_id="account_ecosystem_derived",
project_id="moz-fx-data-shared-prod",
owner="jklukas@mozilla.com",
email=["jklukas@mozilla.com"],
# The underlying query filters on @submission_date; presumably this names the
# query parameter that receives the run date -- confirm against
# bigquery_etl_query.
date_partition_parameter="submission_date",
# The query excludes IDs already present in its own destination table, so each
# run relies on earlier dates having been processed first.
depends_on_past=True,
dag=dag,
)
account_ecosystem_derived__ecosystem_user_id_lookup__v1 = bigquery_etl_query(
task_id="account_ecosystem_derived__ecosystem_user_id_lookup__v1",
destination_table="ecosystem_user_id_lookup_v1",
@ -34,6 +46,9 @@ with DAG(
dag=dag,
)
# The client ID lookup query joins against ecosystem_user_id_lookup_v1 to
# resolve canonical_id, so the user ID lookup task must run first.
account_ecosystem_derived__ecosystem_client_id_lookup__v1.set_upstream(
account_ecosystem_derived__ecosystem_user_id_lookup__v1
)
wait_for_copy_deduplicate_all = ExternalTaskSensor(
task_id="wait_for_copy_deduplicate_all",
external_dag_id="copy_deduplicate",
@ -44,6 +59,10 @@ with DAG(
pool="DATA_ENG_EXTERNALTASKSENSOR",
)
# Both lookup tasks are gated on the ExternalTaskSensor that watches the
# copy_deduplicate DAG.
# NOTE(review): presumably this is what makes the day's telemetry_stable data
# available before the queries read it -- confirm.
account_ecosystem_derived__ecosystem_client_id_lookup__v1.set_upstream(
wait_for_copy_deduplicate_all
)
account_ecosystem_derived__ecosystem_user_id_lookup__v1.set_upstream(
wait_for_copy_deduplicate_all
)

Просмотреть файл

@ -0,0 +1,8 @@
-- Lookup table from a hashed ecosystem client ID to the canonical ID it was
-- associated with when first observed. Populated incrementally by the
-- companion query, which writes only hashes not already present.
CREATE TABLE IF NOT EXISTS
  `moz-fx-data-shared-prod.account_ecosystem_derived.ecosystem_client_id_lookup_v1`(
    -- Hex-encoded HMAC-SHA256 of the raw ecosystem_client_id.
    ecosystem_client_id_hash STRING NOT NULL,
    -- canonical_id resolved through ecosystem_user_id_lookup_v1.
    canonical_id STRING NOT NULL,
    -- Submission date on which this hash was first recorded.
    first_seen_date DATE NOT NULL
  )
PARTITION BY
  first_seen_date

Просмотреть файл

@ -0,0 +1,17 @@
# Metadata for account_ecosystem_derived.ecosystem_client_id_lookup_v1.
friendly_name: Ecosystem Client ID Lookup
description: >
  Lookup table of ecosystem_client_id_hash to canonical_id.
owners:
  - jklukas@mozilla.com
labels:
  application: aet
  schedule: daily
  incremental: true
scheduling:
  dag_name: bqetl_account_ecosystem
  # Each run anti-joins against rows written by earlier runs, so dates must be
  # processed in order.
  depends_on_past: True
  # We access a restricted table for getting an HMAC key, so cannot dry run
  # and must explicitly list referenced tables.
  referenced_tables:
    - ['telemetry_stable', 'account_ecosystem_v4']
    - ['account_ecosystem_derived', 'ecosystem_user_id_lookup_v1']

Просмотреть файл

@ -0,0 +1,50 @@
-- Emits the ecosystem client IDs first observed on @submission_date, keyed by
-- an HMAC-SHA256 hash of the raw ID and mapped to a canonical_id through
-- ecosystem_user_id_lookup_v1. Only hashes not recorded on an earlier date
-- are returned, so each hash lands in the partition of its first sighting.
WITH hmac_key AS (
  -- Decrypt the HMAC key; the ciphertext lives in a restricted table and the
  -- keyset used to decrypt it in a separate secrets project.
  SELECT
    AEAD.DECRYPT_BYTES(
      (SELECT keyset FROM `moz-fx-dataops-secrets.airflow_query_keys.aet_prod`),
      ciphertext,
      CAST(key_id AS BYTES)
    ) AS value
  FROM
    `moz-fx-data-shared-prod.account_ecosystem_restricted.encrypted_keys_v1`
  WHERE
    key_id = 'aet_hmac_prod'
),
unioned AS (
  SELECT
    submission_timestamp,
    payload.ecosystem_user_id,
    payload.ecosystem_client_id,
  FROM
    telemetry_stable.account_ecosystem_v4
  -- As we add AET to additional applications, they will be added here via UNION ALL
  -- and also added to the list of referenced tables in metadata.yaml
),
hashed AS (
  -- One row per (hashed client ID, user ID) pair seen on the target date.
  SELECT DISTINCT
    TO_HEX(
      udf.hmac_sha256((SELECT value FROM hmac_key), CAST(ecosystem_client_id AS BYTES))
    ) AS ecosystem_client_id_hash,
    ecosystem_user_id,
    DATE(submission_timestamp) AS first_seen_date,
  FROM
    unioned
  WHERE
    DATE(submission_timestamp) = @submission_date
    -- A missing client ID cannot be hashed into a meaningful key, and the
    -- destination column is declared NOT NULL.
    AND ecosystem_client_id IS NOT NULL
),
previously_seen AS (
  -- Only consult partitions strictly before the target date. Reading the
  -- target date's own partition would make a re-run see its previous output
  -- as "existing" and return nothing.
  -- NOTE(review): assumes the scheduler overwrites the @submission_date
  -- partition on each run -- confirm.
  SELECT
    ecosystem_client_id_hash
  FROM
    ecosystem_client_id_lookup_v1
  WHERE
    first_seen_date < @submission_date
)
-- DISTINCT collapses the duplicates that arise when one client ID was seen
-- with several ecosystem_user_id values that resolve to the same canonical_id.
-- NOTE(review): a client ID that resolves to several different canonical_ids
-- on the same day still yields one row per canonical_id.
SELECT DISTINCT
  ecosystem_client_id_hash,
  euil.canonical_id,
  hashed.first_seen_date,
FROM
  hashed
JOIN
  ecosystem_user_id_lookup_v1 AS euil
USING
  (ecosystem_user_id)
LEFT JOIN
  previously_seen AS existing
USING
  (ecosystem_client_id_hash)
WHERE
  -- Anti-join: keep only hashes with no match on an earlier date.
  existing.ecosystem_client_id_hash IS NULL