Bug 1753489 Remove ETL for Firefox Reality (#2767)

* Bug 1753489 Remove ETL for Firefox Reality

This product is no longer maintained by Mozilla, so its ETL can be removed.

See https://bugzilla.mozilla.org/show_bug.cgi?id=1753489

This is only the first cleanup step. Once this PR is merged, we can remove
these datasets and all of their content; the live/stable tables will require
a separate effort.
This commit is contained in:
Jeff Klukas 2022-03-02 10:03:21 -05:00 коммит произвёл GitHub
Родитель db4bd02f61
Коммит 616d2c8e3c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
12 изменённых файлов: 0 добавлений и 481 удалений

Просмотреть файл

@ -45,24 +45,6 @@ bqetl_amo_stats:
tags:
- impact/tier_1
# DAG entry for the Mozilla VR Browser ETL.
bqetl_vrbrowser:
# Cron expression: minute 0 of hour 2, every day.
schedule_interval: 0 2 * * *
description: |
Custom ETL based on Glean pings from Mozilla VR Browser.
default_args:
owner: jklukas@mozilla.com
start_date: "2019-07-25"
email:
[
"telemetry-alerts@mozilla.com",
"jklukas@mozilla.com",
"ascholtz@mozilla.com",
]
# One retry, with a five-minute retry delay.
retries: 1
retry_delay: 5m
tags:
- impact/tier_3
bqetl_core:
schedule_interval: 0 2 * * *
description:

Просмотреть файл

@ -1,141 +0,0 @@
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
from airflow import DAG
from operators.task_sensor import ExternalTaskCompletedSensor
import datetime
from utils.gcp import bigquery_etl_query, gke_command
# Markdown passed to the DAG as `doc_md`.
docs = """
### bqetl_vrbrowser
Built from bigquery-etl repo, [`dags/bqetl_vrbrowser.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_vrbrowser.py)
#### Description
Custom ETL based on Glean pings from Mozilla VR Browser.
#### Owner
jklukas@mozilla.com
"""

# Defaults handed to the DAG constructor as `default_args`.
default_args = dict(
    owner="jklukas@mozilla.com",
    start_date=datetime.datetime(2019, 7, 25),
    end_date=None,
    email=[
        "telemetry-alerts@mozilla.com",
        "jklukas@mozilla.com",
        "ascholtz@mozilla.com",
    ],
    depends_on_past=False,
    # Five minutes between retries.
    retry_delay=datetime.timedelta(minutes=5),
    email_on_failure=True,
    email_on_retry=True,
    retries=1,
)

# Tags passed to the DAG constructor.
tags = ["impact/tier_3", "repo/bigquery-etl"]
# Assemble the bqetl_vrbrowser DAG: four derived-table query tasks plus a
# sensor on the copy_deduplicate DAG's copy_deduplicate_all task.
with DAG(
    "bqetl_vrbrowser",
    default_args=default_args,
    schedule_interval="0 2 * * *",
    doc_md=docs,
    tags=tags,
) as dag:

    def _vrbrowser_query(task_id, destination_table, depends_on_past):
        """Create one query task against org_mozilla_vrbrowser_derived.

        Only the task id, destination table and depends_on_past flag vary
        between the four tasks; every other argument is shared.
        """
        return bigquery_etl_query(
            task_id=task_id,
            destination_table=destination_table,
            dataset_id="org_mozilla_vrbrowser_derived",
            project_id="moz-fx-data-shared-prod",
            owner="jklukas@mozilla.com",
            # A fresh list per task, as in the per-task literals this replaces.
            email=[
                "ascholtz@mozilla.com",
                "jklukas@mozilla.com",
                "telemetry-alerts@mozilla.com",
            ],
            date_partition_parameter="submission_date",
            depends_on_past=depends_on_past,
            dag=dag,
        )

    baseline_daily = _vrbrowser_query(
        "org_mozilla_vrbrowser_derived__baseline_daily__v1",
        "baseline_daily_v1",
        depends_on_past=False,
    )
    clients_daily = _vrbrowser_query(
        "org_mozilla_vrbrowser_derived__clients_daily__v1",
        "clients_daily_v1",
        depends_on_past=False,
    )
    # The only task created with depends_on_past=True.
    clients_last_seen = _vrbrowser_query(
        "org_mozilla_vrbrowser_derived__clients_last_seen__v1",
        "clients_last_seen_v1",
        depends_on_past=True,
    )
    metrics_daily = _vrbrowser_query(
        "org_mozilla_vrbrowser_derived__metrics_daily__v1",
        "metrics_daily_v1",
        depends_on_past=False,
    )

    # Sensor on copy_deduplicate.copy_deduplicate_all, with a one-hour
    # execution_delta relative to this DAG's run.
    wait_for_copy_deduplicate_all = ExternalTaskCompletedSensor(
        task_id="wait_for_copy_deduplicate_all",
        external_dag_id="copy_deduplicate",
        external_task_id="copy_deduplicate_all",
        execution_delta=datetime.timedelta(seconds=3600),
        check_existence=True,
        mode="reschedule",
        pool="DATA_ENG_EXTERNALTASKSENSOR",
    )

    # Dependency graph:
    #   sensor -> baseline_daily, metrics_daily -> clients_daily -> clients_last_seen
    baseline_daily.set_upstream(wait_for_copy_deduplicate_all)
    metrics_daily.set_upstream(wait_for_copy_deduplicate_all)
    clients_daily.set_upstream([baseline_daily, metrics_daily])
    clients_last_seen.set_upstream(clients_daily)

Просмотреть файл

@ -1,18 +0,0 @@
-- View over clients_last_seen_v1 that prepends three derived columns
-- (date_last_seen, days_since_seen, days_since_created_profile) to every
-- column of the underlying table.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.org_mozilla_vrbrowser.clients_last_seen`
AS
SELECT
  -- date_last_seen is retained so that existing queries keep working.
  DATE_SUB(
    submission_date,
    INTERVAL mozfun.bits28.days_since_seen(days_seen_bits) DAY
  ) AS date_last_seen,
  mozfun.bits28.days_since_seen(days_seen_bits) AS days_since_seen,
  mozfun.bits28.days_since_seen(days_created_profile_bits) AS days_since_created_profile,
  *
FROM
  `moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_last_seen_v1`

Просмотреть файл

@ -1,13 +0,0 @@
---
# Metadata for the "VR Browser Baseline Daily" table: owners, labels, scheduling.
friendly_name: VR Browser Baseline Daily
description: >
A daily aggregate of baseline pings from each Firefox Reality client,
partitioned by day
owners:
- jklukas@mozilla.com
- ascholtz@mozilla.com
labels:
application: firefox-reality
schedule: daily
# Names the DAG this table's query is assigned to.
scheduling:
dag_name: bqetl_vrbrowser

Просмотреть файл

@ -1,76 +0,0 @@
-- Produces one row per (client_id, submission_date) from VR Browser baseline
-- pings. Per-day dimensions are taken as the mode of the values seen that day.
WITH baseline_v1 AS (
  -- Project the ping fields used below, normalising client_id to lower case
  -- and dropping pings that carry no client_id.
  SELECT
    DATE(submission_timestamp) AS submission_date,
    LOWER(client_info.client_id) AS client_id,
    submission_timestamp,
    document_id,
    client_info,
    sample_id,
    metadata,
    normalized_channel,
    metrics AS baseline_metrics
  FROM
    org_mozilla_vrbrowser_stable.baseline_v1
  WHERE
    client_info.client_id IS NOT NULL
),
--
windowed AS (
  SELECT
    submission_date,
    client_id,
    sample_id,
    -- Numbers each client's pings within the day by submission_timestamp;
    -- only the first row is kept by the final SELECT.
    ROW_NUMBER() OVER w1_unframed AS _n,
    --
    -- Take the earliest first_run_date if ambiguous.
    MIN(SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10))) OVER w1 AS first_run_date,
    --
    -- Sums over distinct baseline pings.
    SUM(
      udf.glean_timespan_seconds(baseline_metrics.timespan.glean_baseline_duration)
    ) OVER w1 AS durations,
    --
    -- For all other dimensions, we use the mode of observed values in the day.
    udf.mode_last(ARRAY_AGG(client_info.os) OVER w1) AS os,
    udf.mode_last(ARRAY_AGG(client_info.os_version) OVER w1) AS os_version,
    udf.mode_last(ARRAY_AGG(baseline_metrics.string.glean_baseline_locale) OVER w1) AS locale,
    udf.json_mode_last(
      ARRAY_AGG(udf.geo_struct(metadata.geo.country, metadata.geo.city, NULL, NULL)) OVER w1
    ).* EXCEPT (geo_subdivision1, geo_subdivision2),
    udf.mode_last(ARRAY_AGG(client_info.device_manufacturer) OVER w1) AS device_manufacturer,
    udf.mode_last(ARRAY_AGG(client_info.device_model) OVER w1) AS device_model,
    udf.mode_last(ARRAY_AGG(client_info.app_build) OVER w1) AS app_build,
    udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
    udf.mode_last(ARRAY_AGG(client_info.architecture) OVER w1) AS architecture,
    udf.mode_last(ARRAY_AGG(client_info.app_display_version) OVER w1) AS app_display_version
  FROM
    baseline_v1
  WHERE
    -- Reprocess all dates by running this query with --parameter=submission_date:DATE:NULL
    (@submission_date IS NULL OR @submission_date = submission_date)
  WINDOW
    w1 AS (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        -- Order by ping time rather than submission_date: submission_date is
        -- constant within the partition, which left the ARRAY_AGG element
        -- order (and therefore any order-dependent tie-break in the mode
        -- UDFs) arbitrary from run to run.
        submission_timestamp
      ROWS BETWEEN
        UNBOUNDED PRECEDING
        AND UNBOUNDED FOLLOWING
    ),
    -- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
    w1_unframed AS (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        submission_timestamp
    )
)
-- Keep a single row per client per day and drop the helper row number.
SELECT
  * EXCEPT (_n)
FROM
  windowed
WHERE
  _n = 1

Просмотреть файл

@ -1,13 +0,0 @@
---
# Metadata for the "VR Browser Clients Daily" table: owners, labels, scheduling.
friendly_name: VR Browser Clients Daily
description: >
A daily aggregate of baseline and metrics pings from each
Firefox Reality client, partitioned by day
owners:
- jklukas@mozilla.com
- ascholtz@mozilla.com
labels:
application: firefox-reality
schedule: daily
# Names the DAG this table's query is assigned to.
scheduling:
dag_name: bqetl_vrbrowser

Просмотреть файл

@ -1,28 +0,0 @@
-- Combines the per-client daily baseline aggregate with the per-client daily
-- metrics aggregate. Every baseline row is kept; distribution_channel_name is
-- NULL when the client has no metrics row for that day.
SELECT
  bl.submission_date,
  bl.client_id,
  bl.sample_id,
  'Firefox Reality' AS app_name,
  bl.first_run_date,
  bl.durations,
  bl.os,
  bl.os_version,
  bl.locale,
  bl.country,
  bl.city,
  bl.device_manufacturer,
  bl.device_model,
  bl.app_build,
  bl.normalized_channel,
  bl.architecture,
  bl.app_display_version,
  mt.distribution_channel_name
FROM
  baseline_daily_v1 AS bl
LEFT JOIN
  metrics_daily_v1 AS mt
ON
  bl.submission_date = mt.submission_date
  AND bl.client_id = mt.client_id
WHERE
  -- Passing --parameter=submission_date:DATE:NULL reprocesses every date.
  (@submission_date IS NULL OR @submission_date = bl.submission_date)

Просмотреть файл

@ -1,25 +0,0 @@
-- Creates clients_last_seen_v1 as an empty table whose schema is
-- submission_date, the two bit-pattern columns, then every clients_daily_v1
-- column except submission_date.
CREATE TABLE
  `moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_last_seen_v1`
PARTITION BY submission_date
CLUSTER BY app_name, os, sample_id
OPTIONS (require_partition_filter = TRUE)
AS
SELECT
  CAST(NULL AS DATE) AS submission_date,
  0 AS days_seen_bits,
  0 AS days_created_profile_bits,
  -- The star comes last on purpose: columns later added to clients_daily can
  -- then be appended to this schema in the same order, which the daily join
  -- query between the two tables may need in order to validate.
  * EXCEPT (submission_date)
FROM
  `moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_daily_v1`
WHERE
  -- Always false, so the table is created empty and no input rows are read.
  FALSE

Просмотреть файл

@ -1,14 +0,0 @@
---
# Metadata for the "VR Browser Clients Last Seen" table: owners, labels, scheduling.
friendly_name: VR Browser Clients Last Seen
description: >
Captures history of activity of each Firefox Reality client in 28 day
windows for each submission date.
owners:
- jklukas@mozilla.com
- ascholtz@mozilla.com
labels:
application: firefox-reality
schedule: daily
# Names the DAG this table's query is assigned to. depends_on_past is set
# here, presumably because each day's output builds on the previous day's
# output — confirm against the query.
scheduling:
dag_name: bqetl_vrbrowser
depends_on_past: true

Просмотреть файл

@ -1,49 +0,0 @@
-- Rolls the last-seen table forward by one day: merges the clients active on
-- @submission_date with the previous day's last-seen rows.
WITH today AS (
  SELECT
    -- Each usage criterion's activity over the past 28 days is stored as a
    -- single 64-bit integer whose rightmost bit stands for the current day,
    -- so a client present today starts from the value 1.
    CAST(TRUE AS INT64) AS days_seen_bits,
    udf.days_since_created_profile_as_28_bits(
      DATE_DIFF(submission_date, first_run_date, DAY)
    ) AS days_created_profile_bits,
    * EXCEPT (submission_date)
  FROM
    clients_daily_v1
  WHERE
    submission_date = @submission_date
),
--
yesterday AS (
  SELECT
    * EXCEPT (submission_date)
  FROM
    clients_last_seen_v1
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    -- Drop rows whose activity has now fallen outside the 28-day window.
    AND udf.shift_28_bits_one_day(days_seen_bits) > 0
)
--
SELECT
  @submission_date AS submission_date,
  -- Take today's row when the client was seen today, otherwise carry
  -- yesterday's row forward; both bit patterns are recombined from both sides.
  IF(today.client_id IS NOT NULL, today, yesterday).* REPLACE (
    udf.combine_adjacent_days_28_bits(
      yesterday.days_seen_bits,
      today.days_seen_bits
    ) AS days_seen_bits,
    udf.coalesce_adjacent_days_28_bits(
      yesterday.days_created_profile_bits,
      today.days_created_profile_bits
    ) AS days_created_profile_bits
  )
FROM
  today
FULL JOIN
  yesterday
USING
  (client_id)

Просмотреть файл

@ -1,13 +0,0 @@
---
# Metadata for the "VR Browser Metrics Daily" table: owners, labels, scheduling.
friendly_name: VR Browser Metrics Daily
description: >
A daily aggregate of metrics pings from each Firefox Reality client,
partitioned by day
owners:
- jklukas@mozilla.com
- ascholtz@mozilla.com
labels:
application: firefox-reality
schedule: daily
# Names the DAG this table's query is assigned to.
scheduling:
dag_name: bqetl_vrbrowser

Просмотреть файл

@ -1,73 +0,0 @@
-- Produces one row per (client_id, submission_date) from VR Browser metrics
-- pings. Per-day dimensions are taken as the mode of the values seen that day.
WITH metrics_v1 AS (
  -- Project the ping fields used below, normalising client_id to lower case
  -- and dropping pings that carry no client_id.
  SELECT
    DATE(submission_timestamp) AS submission_date,
    LOWER(client_info.client_id) AS client_id,
    submission_timestamp,
    document_id,
    client_info,
    sample_id,
    metadata,
    normalized_channel,
    metrics
  FROM
    org_mozilla_vrbrowser_stable.metrics_v1
  WHERE
    client_info.client_id IS NOT NULL
),
--
windowed AS (
  SELECT
    submission_date,
    client_id,
    sample_id,
    -- Numbers each client's pings within the day by submission_timestamp;
    -- only the first row is kept by the final SELECT.
    ROW_NUMBER() OVER w1_unframed AS _n,
    --
    -- Take the earliest first_run_date if ambiguous.
    MIN(SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10))) OVER w1 AS first_run_date,
    --
    -- Use the mode of observed values in the day.
    udf.mode_last(ARRAY_AGG(client_info.os) OVER w1) AS os,
    udf.mode_last(ARRAY_AGG(client_info.os_version) OVER w1) AS os_version,
    udf.json_mode_last(
      ARRAY_AGG(udf.geo_struct(metadata.geo.country, metadata.geo.city, NULL, NULL)) OVER w1
    ).* EXCEPT (geo_subdivision1, geo_subdivision2),
    udf.mode_last(ARRAY_AGG(client_info.device_manufacturer) OVER w1) AS device_manufacturer,
    udf.mode_last(ARRAY_AGG(client_info.device_model) OVER w1) AS device_model,
    udf.mode_last(ARRAY_AGG(client_info.app_build) OVER w1) AS app_build,
    udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
    udf.mode_last(ARRAY_AGG(client_info.architecture) OVER w1) AS architecture,
    udf.mode_last(ARRAY_AGG(client_info.app_display_version) OVER w1) AS app_display_version,
    udf.mode_last(
      ARRAY_AGG(metrics.string.distribution_channel_name) OVER w1
    ) AS distribution_channel_name
  FROM
    metrics_v1
  WHERE
    -- Reprocess all dates by running this query with --parameter=submission_date:DATE:NULL
    (@submission_date IS NULL OR @submission_date = submission_date)
  WINDOW
    w1 AS (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        -- Order by ping time rather than submission_date: submission_date is
        -- constant within the partition, which left the ARRAY_AGG element
        -- order (and therefore any order-dependent tie-break in the mode
        -- UDFs) arbitrary from run to run.
        submission_timestamp
      ROWS BETWEEN
        UNBOUNDED PRECEDING
        AND UNBOUNDED FOLLOWING
    ),
    -- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
    w1_unframed AS (
      PARTITION BY
        client_id,
        submission_date
      ORDER BY
        submission_timestamp
    )
)
-- Keep a single row per client per day and drop the helper row number.
SELECT
  * EXCEPT (_n)
FROM
  windowed
WHERE
  _n = 1