Bug 1753489 Remove ETL for Firefox Reality (#2767)
* Bug 1753489 Remove ETL for Firefox Reality, since this product is no longer maintained by Mozilla. See https://bugzilla.mozilla.org/show_bug.cgi?id=1753489 for details. This is just the first cleanup step: we can remove these datasets and all their content once this PR is merged, but the live/stable tables will require a separate effort.
This commit is contained in:
Родитель
db4bd02f61
Коммит
616d2c8e3c
18
dags.yaml
18
dags.yaml
|
@ -45,24 +45,6 @@ bqetl_amo_stats:
|
|||
tags:
|
||||
- impact/tier_1
|
||||
|
||||
bqetl_vrbrowser:
|
||||
schedule_interval: 0 2 * * *
|
||||
description: |
|
||||
Custom ETL based on Glean pings from Mozilla VR Browser.
|
||||
default_args:
|
||||
owner: jklukas@mozilla.com
|
||||
start_date: "2019-07-25"
|
||||
email:
|
||||
[
|
||||
"telemetry-alerts@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"ascholtz@mozilla.com",
|
||||
]
|
||||
retries: 1
|
||||
retry_delay: 5m
|
||||
tags:
|
||||
- impact/tier_3
|
||||
|
||||
bqetl_core:
|
||||
schedule_interval: 0 2 * * *
|
||||
description:
|
||||
|
|
|
@ -1,141 +0,0 @@
|
|||
# Generated via https://github.com/mozilla/bigquery-etl/blob/main/bigquery_etl/query_scheduling/generate_airflow_dags.py
|
||||
|
||||
from airflow import DAG
|
||||
from operators.task_sensor import ExternalTaskCompletedSensor
|
||||
import datetime
|
||||
from utils.gcp import bigquery_etl_query, gke_command
|
||||
|
||||
docs = """
|
||||
### bqetl_vrbrowser
|
||||
|
||||
Built from bigquery-etl repo, [`dags/bqetl_vrbrowser.py`](https://github.com/mozilla/bigquery-etl/blob/main/dags/bqetl_vrbrowser.py)
|
||||
|
||||
#### Description
|
||||
|
||||
Custom ETL based on Glean pings from Mozilla VR Browser.
|
||||
|
||||
#### Owner
|
||||
|
||||
jklukas@mozilla.com
|
||||
"""
|
||||
|
||||
|
||||
default_args = {
|
||||
"owner": "jklukas@mozilla.com",
|
||||
"start_date": datetime.datetime(2019, 7, 25, 0, 0),
|
||||
"end_date": None,
|
||||
"email": [
|
||||
"telemetry-alerts@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"ascholtz@mozilla.com",
|
||||
],
|
||||
"depends_on_past": False,
|
||||
"retry_delay": datetime.timedelta(seconds=300),
|
||||
"email_on_failure": True,
|
||||
"email_on_retry": True,
|
||||
"retries": 1,
|
||||
}
|
||||
|
||||
tags = ["impact/tier_3", "repo/bigquery-etl"]
|
||||
|
||||
with DAG(
|
||||
"bqetl_vrbrowser",
|
||||
default_args=default_args,
|
||||
schedule_interval="0 2 * * *",
|
||||
doc_md=docs,
|
||||
tags=tags,
|
||||
) as dag:
|
||||
|
||||
org_mozilla_vrbrowser_derived__baseline_daily__v1 = bigquery_etl_query(
|
||||
task_id="org_mozilla_vrbrowser_derived__baseline_daily__v1",
|
||||
destination_table="baseline_daily_v1",
|
||||
dataset_id="org_mozilla_vrbrowser_derived",
|
||||
project_id="moz-fx-data-shared-prod",
|
||||
owner="jklukas@mozilla.com",
|
||||
email=[
|
||||
"ascholtz@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"telemetry-alerts@mozilla.com",
|
||||
],
|
||||
date_partition_parameter="submission_date",
|
||||
depends_on_past=False,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__clients_daily__v1 = bigquery_etl_query(
|
||||
task_id="org_mozilla_vrbrowser_derived__clients_daily__v1",
|
||||
destination_table="clients_daily_v1",
|
||||
dataset_id="org_mozilla_vrbrowser_derived",
|
||||
project_id="moz-fx-data-shared-prod",
|
||||
owner="jklukas@mozilla.com",
|
||||
email=[
|
||||
"ascholtz@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"telemetry-alerts@mozilla.com",
|
||||
],
|
||||
date_partition_parameter="submission_date",
|
||||
depends_on_past=False,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__clients_last_seen__v1 = bigquery_etl_query(
|
||||
task_id="org_mozilla_vrbrowser_derived__clients_last_seen__v1",
|
||||
destination_table="clients_last_seen_v1",
|
||||
dataset_id="org_mozilla_vrbrowser_derived",
|
||||
project_id="moz-fx-data-shared-prod",
|
||||
owner="jklukas@mozilla.com",
|
||||
email=[
|
||||
"ascholtz@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"telemetry-alerts@mozilla.com",
|
||||
],
|
||||
date_partition_parameter="submission_date",
|
||||
depends_on_past=True,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__metrics_daily__v1 = bigquery_etl_query(
|
||||
task_id="org_mozilla_vrbrowser_derived__metrics_daily__v1",
|
||||
destination_table="metrics_daily_v1",
|
||||
dataset_id="org_mozilla_vrbrowser_derived",
|
||||
project_id="moz-fx-data-shared-prod",
|
||||
owner="jklukas@mozilla.com",
|
||||
email=[
|
||||
"ascholtz@mozilla.com",
|
||||
"jklukas@mozilla.com",
|
||||
"telemetry-alerts@mozilla.com",
|
||||
],
|
||||
date_partition_parameter="submission_date",
|
||||
depends_on_past=False,
|
||||
dag=dag,
|
||||
)
|
||||
|
||||
wait_for_copy_deduplicate_all = ExternalTaskCompletedSensor(
|
||||
task_id="wait_for_copy_deduplicate_all",
|
||||
external_dag_id="copy_deduplicate",
|
||||
external_task_id="copy_deduplicate_all",
|
||||
execution_delta=datetime.timedelta(seconds=3600),
|
||||
check_existence=True,
|
||||
mode="reschedule",
|
||||
pool="DATA_ENG_EXTERNALTASKSENSOR",
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__baseline_daily__v1.set_upstream(
|
||||
wait_for_copy_deduplicate_all
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__clients_daily__v1.set_upstream(
|
||||
org_mozilla_vrbrowser_derived__baseline_daily__v1
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__clients_daily__v1.set_upstream(
|
||||
org_mozilla_vrbrowser_derived__metrics_daily__v1
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__clients_last_seen__v1.set_upstream(
|
||||
org_mozilla_vrbrowser_derived__clients_daily__v1
|
||||
)
|
||||
|
||||
org_mozilla_vrbrowser_derived__metrics_daily__v1.set_upstream(
|
||||
wait_for_copy_deduplicate_all
|
||||
)
|
|
@ -1,18 +0,0 @@
|
|||
CREATE OR REPLACE VIEW
|
||||
`moz-fx-data-shared-prod.org_mozilla_vrbrowser.clients_last_seen`
|
||||
AS
|
||||
WITH with_days_since AS (
|
||||
SELECT
|
||||
mozfun.bits28.days_since_seen(days_seen_bits) AS days_since_seen,
|
||||
mozfun.bits28.days_since_seen(days_created_profile_bits) AS days_since_created_profile,
|
||||
*
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_last_seen_v1`
|
||||
)
|
||||
--
|
||||
SELECT
|
||||
-- Include date_last_seen for compatibility with existing queries.
|
||||
DATE_SUB(submission_date, INTERVAL days_since_seen DAY) AS date_last_seen,
|
||||
*
|
||||
FROM
|
||||
with_days_since
|
|
@ -1,13 +0,0 @@
|
|||
---
|
||||
friendly_name: VR Browser Baseline Daily
|
||||
description: >
|
||||
A daily aggregate of baseline pings from each Firefox Reality client,
|
||||
partitioned by day
|
||||
owners:
|
||||
- jklukas@mozilla.com
|
||||
- ascholtz@mozilla.com
|
||||
labels:
|
||||
application: firefox-reality
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_vrbrowser
|
|
@ -1,76 +0,0 @@
|
|||
WITH baseline_v1 AS (
|
||||
SELECT
|
||||
DATE(submission_timestamp) AS submission_date,
|
||||
LOWER(client_info.client_id) AS client_id,
|
||||
submission_timestamp,
|
||||
document_id,
|
||||
client_info,
|
||||
sample_id,
|
||||
metadata,
|
||||
normalized_channel,
|
||||
metrics AS baseline_metrics
|
||||
FROM
|
||||
org_mozilla_vrbrowser_stable.baseline_v1
|
||||
WHERE
|
||||
client_info.client_id IS NOT NULL
|
||||
),
|
||||
--
|
||||
windowed AS (
|
||||
SELECT
|
||||
submission_date,
|
||||
client_id,
|
||||
sample_id,
|
||||
ROW_NUMBER() OVER w1_unframed AS _n,
|
||||
--
|
||||
-- Take the earliest first_run_date if ambiguous.
|
||||
MIN(SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10))) OVER w1 AS first_run_date,
|
||||
--
|
||||
-- Sums over distinct baseline pings.
|
||||
SUM(
|
||||
udf.glean_timespan_seconds(baseline_metrics.timespan.glean_baseline_duration)
|
||||
) OVER w1 AS durations,
|
||||
--
|
||||
-- For all other dimensions, we use the mode of observed values in the day.
|
||||
udf.mode_last(ARRAY_AGG(client_info.os) OVER w1) AS os,
|
||||
udf.mode_last(ARRAY_AGG(client_info.os_version) OVER w1) AS os_version,
|
||||
udf.mode_last(ARRAY_AGG(baseline_metrics.string.glean_baseline_locale) OVER w1) AS locale,
|
||||
udf.json_mode_last(
|
||||
ARRAY_AGG(udf.geo_struct(metadata.geo.country, metadata.geo.city, NULL, NULL)) OVER w1
|
||||
).* EXCEPT (geo_subdivision1, geo_subdivision2),
|
||||
udf.mode_last(ARRAY_AGG(client_info.device_manufacturer) OVER w1) AS device_manufacturer,
|
||||
udf.mode_last(ARRAY_AGG(client_info.device_model) OVER w1) AS device_model,
|
||||
udf.mode_last(ARRAY_AGG(client_info.app_build) OVER w1) AS app_build,
|
||||
udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
|
||||
udf.mode_last(ARRAY_AGG(client_info.architecture) OVER w1) AS architecture,
|
||||
udf.mode_last(ARRAY_AGG(client_info.app_display_version) OVER w1) AS app_display_version
|
||||
FROM
|
||||
baseline_v1
|
||||
WHERE
|
||||
-- Reprocess all dates by running this query with --parameter=submission_date:DATE:NULL
|
||||
(@submission_date IS NULL OR @submission_date = submission_date)
|
||||
WINDOW
|
||||
w1 AS (
|
||||
PARTITION BY
|
||||
client_id,
|
||||
submission_date
|
||||
ORDER BY
|
||||
submission_date
|
||||
ROWS BETWEEN
|
||||
UNBOUNDED PRECEDING
|
||||
AND UNBOUNDED FOLLOWING
|
||||
),
|
||||
-- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
|
||||
w1_unframed AS (
|
||||
PARTITION BY
|
||||
client_id,
|
||||
submission_date
|
||||
ORDER BY
|
||||
submission_timestamp
|
||||
)
|
||||
)
|
||||
SELECT
|
||||
* EXCEPT (_n)
|
||||
FROM
|
||||
windowed
|
||||
WHERE
|
||||
_n = 1
|
|
@ -1,13 +0,0 @@
|
|||
---
|
||||
friendly_name: VR Browser Clients Daily
|
||||
description: >
|
||||
A daily aggregate of baseline and metrics pings from each
|
||||
Firefox Reality client, partitioned by day
|
||||
owners:
|
||||
- jklukas@mozilla.com
|
||||
- ascholtz@mozilla.com
|
||||
labels:
|
||||
application: firefox-reality
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_vrbrowser
|
|
@ -1,28 +0,0 @@
|
|||
SELECT
|
||||
baseline.submission_date,
|
||||
baseline.client_id,
|
||||
baseline.sample_id,
|
||||
'Firefox Reality' AS app_name,
|
||||
baseline.first_run_date,
|
||||
baseline.durations,
|
||||
baseline.os,
|
||||
baseline.os_version,
|
||||
baseline.locale,
|
||||
baseline.country,
|
||||
baseline.city,
|
||||
baseline.device_manufacturer,
|
||||
baseline.device_model,
|
||||
baseline.app_build,
|
||||
baseline.normalized_channel,
|
||||
baseline.architecture,
|
||||
baseline.app_display_version,
|
||||
metrics.distribution_channel_name
|
||||
FROM
|
||||
baseline_daily_v1 AS baseline
|
||||
LEFT JOIN
|
||||
metrics_daily_v1 AS metrics
|
||||
USING
|
||||
(submission_date, client_id)
|
||||
WHERE
|
||||
-- Reprocess all dates by running this query with --parameter=submission_date:DATE:NULL
|
||||
(@submission_date IS NULL OR @submission_date = submission_date)
|
|
@ -1,25 +0,0 @@
|
|||
CREATE TABLE
|
||||
`moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_last_seen_v1`
|
||||
PARTITION BY
|
||||
submission_date
|
||||
CLUSTER BY
|
||||
app_name,
|
||||
os,
|
||||
sample_id
|
||||
OPTIONS
|
||||
(require_partition_filter = TRUE)
|
||||
AS
|
||||
SELECT
|
||||
CAST(NULL AS DATE) AS submission_date,
|
||||
0 AS days_seen_bits,
|
||||
0 AS days_created_profile_bits,
|
||||
-- We make sure to delay * until the end so that as new columns are added
|
||||
-- to clients_daily, we can add those columns in the same order to the end
|
||||
-- of this schema, which may be necessary for the daily join query between
|
||||
-- the two tables to validate.
|
||||
* EXCEPT (submission_date)
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.org_mozilla_vrbrowser_derived.clients_daily_v1`
|
||||
WHERE
|
||||
-- Output empty table and read no input rows
|
||||
FALSE
|
|
@ -1,14 +0,0 @@
|
|||
---
|
||||
friendly_name: VR Browser Clients Last Seen
|
||||
description: >
|
||||
Captures history of activity of each Firefox Reality client in 28 day
|
||||
windows for each submission date.
|
||||
owners:
|
||||
- jklukas@mozilla.com
|
||||
- ascholtz@mozilla.com
|
||||
labels:
|
||||
application: firefox-reality
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_vrbrowser
|
||||
depends_on_past: true
|
|
@ -1,49 +0,0 @@
|
|||
WITH _current AS (
|
||||
SELECT
|
||||
-- In this raw table, we capture the history of activity over the past
|
||||
-- 28 days for each usage criterion as a single 64-bit integer. The
|
||||
-- rightmost bit represents whether the user was active in the current day.
|
||||
CAST(TRUE AS INT64) AS days_seen_bits,
|
||||
udf.days_since_created_profile_as_28_bits(
|
||||
DATE_DIFF(submission_date, first_run_date, DAY)
|
||||
) AS days_created_profile_bits,
|
||||
* EXCEPT (submission_date)
|
||||
FROM
|
||||
clients_daily_v1
|
||||
WHERE
|
||||
submission_date = @submission_date
|
||||
),
|
||||
--
|
||||
_previous AS (
|
||||
SELECT
|
||||
* EXCEPT (submission_date)
|
||||
FROM
|
||||
clients_last_seen_v1 AS cls
|
||||
WHERE
|
||||
submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
|
||||
-- Filter out rows from yesterday that have now fallen outside the 28-day window.
|
||||
AND udf.shift_28_bits_one_day(days_seen_bits) > 0
|
||||
)
|
||||
--
|
||||
SELECT
|
||||
@submission_date AS submission_date,
|
||||
IF(
|
||||
_current.client_id IS NOT NULL,
|
||||
_current,
|
||||
_previous
|
||||
).* REPLACE ( --
|
||||
udf.combine_adjacent_days_28_bits(
|
||||
_previous.days_seen_bits,
|
||||
_current.days_seen_bits
|
||||
) AS days_seen_bits,
|
||||
udf.coalesce_adjacent_days_28_bits(
|
||||
_previous.days_created_profile_bits,
|
||||
_current.days_created_profile_bits
|
||||
) AS days_created_profile_bits
|
||||
)
|
||||
FROM
|
||||
_current
|
||||
FULL JOIN
|
||||
_previous
|
||||
USING
|
||||
(client_id)
|
|
@ -1,13 +0,0 @@
|
|||
---
|
||||
friendly_name: VR Browser Metrics Daily
|
||||
description: >
|
||||
A daily aggregate of metrics pings from each Firefox Reality client,
|
||||
partitioned by day
|
||||
owners:
|
||||
- jklukas@mozilla.com
|
||||
- ascholtz@mozilla.com
|
||||
labels:
|
||||
application: firefox-reality
|
||||
schedule: daily
|
||||
scheduling:
|
||||
dag_name: bqetl_vrbrowser
|
|
@ -1,73 +0,0 @@
|
|||
WITH metrics_v1 AS (
|
||||
SELECT
|
||||
DATE(submission_timestamp) AS submission_date,
|
||||
LOWER(client_info.client_id) AS client_id,
|
||||
submission_timestamp,
|
||||
document_id,
|
||||
client_info,
|
||||
sample_id,
|
||||
metadata,
|
||||
normalized_channel,
|
||||
metrics
|
||||
FROM
|
||||
org_mozilla_vrbrowser_stable.metrics_v1
|
||||
WHERE
|
||||
client_info.client_id IS NOT NULL
|
||||
),
|
||||
--
|
||||
windowed AS (
|
||||
SELECT
|
||||
submission_date,
|
||||
client_id,
|
||||
sample_id,
|
||||
ROW_NUMBER() OVER w1_unframed AS _n,
|
||||
--
|
||||
-- Take the earliest first_run_date if ambiguous.
|
||||
MIN(SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10))) OVER w1 AS first_run_date,
|
||||
--
|
||||
-- Use the mode of observed values in the day.
|
||||
udf.mode_last(ARRAY_AGG(client_info.os) OVER w1) AS os,
|
||||
udf.mode_last(ARRAY_AGG(client_info.os_version) OVER w1) AS os_version,
|
||||
udf.json_mode_last(
|
||||
ARRAY_AGG(udf.geo_struct(metadata.geo.country, metadata.geo.city, NULL, NULL)) OVER w1
|
||||
).* EXCEPT (geo_subdivision1, geo_subdivision2),
|
||||
udf.mode_last(ARRAY_AGG(client_info.device_manufacturer) OVER w1) AS device_manufacturer,
|
||||
udf.mode_last(ARRAY_AGG(client_info.device_model) OVER w1) AS device_model,
|
||||
udf.mode_last(ARRAY_AGG(client_info.app_build) OVER w1) AS app_build,
|
||||
udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
|
||||
udf.mode_last(ARRAY_AGG(client_info.architecture) OVER w1) AS architecture,
|
||||
udf.mode_last(ARRAY_AGG(client_info.app_display_version) OVER w1) AS app_display_version,
|
||||
udf.mode_last(
|
||||
ARRAY_AGG(metrics.string.distribution_channel_name) OVER w1
|
||||
) AS distribution_channel_name
|
||||
FROM
|
||||
metrics_v1
|
||||
WHERE
|
||||
-- Reprocess all dates by running this query with --parameter=submission_date:DATE:NULL
|
||||
(@submission_date IS NULL OR @submission_date = submission_date)
|
||||
WINDOW
|
||||
w1 AS (
|
||||
PARTITION BY
|
||||
client_id,
|
||||
submission_date
|
||||
ORDER BY
|
||||
submission_date
|
||||
ROWS BETWEEN
|
||||
UNBOUNDED PRECEDING
|
||||
AND UNBOUNDED FOLLOWING
|
||||
),
|
||||
-- We must provide a modified window for ROW_NUMBER which cannot accept a frame clause.
|
||||
w1_unframed AS (
|
||||
PARTITION BY
|
||||
client_id,
|
||||
submission_date
|
||||
ORDER BY
|
||||
submission_timestamp
|
||||
)
|
||||
)
|
||||
SELECT
|
||||
* EXCEPT (_n)
|
||||
FROM
|
||||
windowed
|
||||
WHERE
|
||||
_n = 1
|
Загрузка…
Ссылка в новой задаче