Move experiments_search_aggregates logic to a view (#1239)

This commit is contained in:
Daniel Thorn 2020-08-18 15:17:22 -07:00 коммит произвёл GitHub
Родитель 654401a36d
Коммит e72b2777d2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 81 добавлений и 125 удалений

Просмотреть файл

@ -0,0 +1,61 @@
-- Base view for experiment search aggregates.
--
-- Produces one row per (submission_date, dataset_id, experiment, branch,
-- 5-minute window) with search metrics summed over the main pings submitted in
-- that window. Rows from both telemetry_stable.main_v4 and
-- telemetry_live.main_v4 are included; each row is tagged with the dataset_id
-- it came from so that callers can select a single source.
--
-- Metric totals are computed per ping with correlated scalar subqueries and the
-- ping is fanned out ONLY over its experiments. Cross joining the ping with
-- every metric array at once would repeat each metric value once per
-- combination of elements in the other arrays (inflating every SUM) and would
-- drop any ping in which one of the arrays is empty.
-- NOTE(review): tables previously populated with the cross-join logic will not
-- match these numbers — confirm whether a backfill is needed.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_base`
AS
WITH live_and_stable AS (
  -- Project only the fields used below so the UNION ALL depends on these
  -- columns matching between the two tables rather than on their full schemas.
  SELECT
    submission_timestamp,
    environment.experiments AS experiments,
    payload.processes.parent.keyed_scalars.browser_search_ad_clicks AS ad_clicks,
    payload.processes.parent.keyed_scalars.browser_search_with_ads AS search_with_ads,
    payload.keyed_histograms.search_counts AS search_counts,
    'telemetry_stable' AS dataset_id
  FROM
    `moz-fx-data-shared-prod.telemetry_stable.main_v4`
  UNION ALL
  SELECT
    submission_timestamp,
    environment.experiments AS experiments,
    payload.processes.parent.keyed_scalars.browser_search_ad_clicks AS ad_clicks,
    payload.processes.parent.keyed_scalars.browser_search_with_ads AS search_with_ads,
    payload.keyed_histograms.search_counts AS search_counts,
    'telemetry_live' AS dataset_id
  FROM
    `moz-fx-data-shared-prod.telemetry_live.main_v4`
)
SELECT
  DATE(submission_timestamp) AS submission_date,
  dataset_id,
  unnested_experiments.key AS experiment,
  unnested_experiments.value.branch AS branch,
  TIMESTAMP_ADD(
    TIMESTAMP_TRUNC(submission_timestamp, HOUR),
    -- Aggregates event counts over 5-minute intervals
    INTERVAL(DIV(EXTRACT(MINUTE FROM submission_timestamp), 5) * 5) MINUTE
  ) AS window_start,
  TIMESTAMP_ADD(
    TIMESTAMP_TRUNC(submission_timestamp, HOUR),
    INTERVAL((DIV(EXTRACT(MINUTE FROM submission_timestamp), 5) + 1) * 5) MINUTE
  ) AS window_end,
  -- Each inner SELECT totals one ping's array; a ping with an empty or NULL
  -- array contributes 0 instead of removing the ping from the other metrics.
  SUM(
    COALESCE((SELECT SUM(ad_click.value) FROM UNNEST(ad_clicks) AS ad_click), 0)
  ) AS ad_clicks_count,
  SUM(
    COALESCE((SELECT SUM(with_ads.value) FROM UNNEST(search_with_ads) AS with_ads), 0)
  ) AS search_with_ads_count,
  SUM(
    COALESCE(
      (
        SELECT
          SUM(`moz-fx-data-shared-prod`.udf.extract_histogram_sum(search_histogram.value))
        FROM
          UNNEST(search_counts) AS search_histogram
      ),
      0
    )
  ) AS search_count
FROM
  live_and_stable
CROSS JOIN
  -- Pings enrolled in no experiment produce no rows here.
  UNNEST(experiments) AS unnested_experiments
GROUP BY
  submission_date,
  dataset_id,
  experiment,
  branch,
  window_start,
  window_end

Просмотреть файл

@ -7,17 +7,16 @@ import sys
import textwrap
from time import sleep
p = argparse.ArgumentParser()
p.add_argument(
"--submission-date",
type=str,
help="Cut-off date for using pre-computed vs live tables in view",
required=True
required=True,
)
p.add_argument(
"--json-output",
action='store_true',
action="store_true",
help="Output the result wrapped in json parseable as an XCOM",
)
p.add_argument(
@ -27,6 +26,7 @@ p.add_argument(
help="Add a delay before executing the script to allow time for the xcom sidecar to complete startup",
)
def generate_sql(opts):
"""Create a BigQuery SQL query for the experiment search aggregates view with new date filled in.
Unfortunately, BigQuery does not allow parameters in view definitions, so this script is a very thin
@ -34,82 +34,24 @@ def generate_sql(opts):
live table for the view"""
query = textwrap.dedent(
"""
CREATE
OR REPLACE VIEW `moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_live_v1` AS
WITH all_experiments_searches_live AS (
SELECT
submission_timestamp AS timestamp,
unnested_experiments,
unnested_ad_clicks,
unnested_search_with_ads,
unnested_search_counts
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4`,
UNNEST(
ARRAY(SELECT AS STRUCT key, value.branch AS value FROM UNNEST(environment.experiments))
) AS unnested_experiments,
UNNEST(payload.processes.parent.keyed_scalars.browser_search_ad_clicks) AS unnested_ad_clicks,
UNNEST(
payload.processes.parent.keyed_scalars.browser_search_with_ads
) AS unnested_search_with_ads,
UNNEST(
ARRAY(
SELECT AS STRUCT
SUBSTR(_key, 0, pos - 2) AS engine,
SUBSTR(_key, pos) AS source,
udf.extract_histogram_sum(value) AS `count`
FROM
UNNEST(payload.keyed_histograms.search_counts),
UNNEST([REPLACE(key, 'in-content.', 'in-content:')]) AS _key,
UNNEST([LENGTH(REGEXP_EXTRACT(_key, '.+[.].'))]) AS pos
)
) AS unnested_search_counts
WHERE
date(submission_timestamp) > '{submission_date}'
AND ARRAY_LENGTH(environment.experiments) > 0
),
live AS (
SELECT
unnested_experiments.key AS experiment,
unnested_experiments.value AS branch,
TIMESTAMP_ADD(
TIMESTAMP_TRUNC(`timestamp`, HOUR),
-- Aggregates event counts over 5-minute intervals
INTERVAL(DIV(EXTRACT(MINUTE FROM `timestamp`), 5) * 5) MINUTE
) AS window_start,
TIMESTAMP_ADD(
TIMESTAMP_TRUNC(`timestamp`, HOUR),
INTERVAL((DIV(EXTRACT(MINUTE FROM `timestamp`), 5) + 1) * 5) MINUTE
) AS window_end,
SUM(unnested_ad_clicks.value) AS ad_clicks_count,
SUM(unnested_search_with_ads.value) AS search_with_ads_count,
SUM(unnested_search_counts.count) AS search_count
FROM
all_experiments_searches_live
GROUP BY
experiment,
branch,
window_start,
window_end
),
previous AS (
CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.telemetry.experiment_search_aggregates_live_v1`
AS
WITH all_searches AS (
SELECT
*
FROM
`moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_v1`
WHERE
date(window_start) <= '{submission_date}'
),
all_searches AS (
SELECT
*
FROM
previous
date(window_start) <= DATE '{submission_date}'
UNION ALL
SELECT
*
* EXCEPT (submission_date, dataset_id)
FROM
live
`moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_base`
WHERE
submission_date > DATE '{submission_date}'
AND dataset_id = 'telemetry_live'
)
SELECT
*,
@ -132,7 +74,7 @@ def generate_sql(opts):
**opts
)
)
if opts['json_output']:
if opts["json_output"]:
return json.dumps(query)
else:
return query
@ -141,7 +83,7 @@ def generate_sql(opts):
def main(argv, out=print):
"""Print experiment search aggregates view sql to stdout."""
opts = vars(p.parse_args(argv[1:]))
sleep(opts['wait_seconds'])
sleep(opts["wait_seconds"])
out(generate_sql(opts))

Просмотреть файл

@ -1,54 +1,7 @@
WITH all_experiments_searches AS (
SELECT
submission_timestamp AS timestamp,
unnested_experiments,
unnested_ad_clicks,
unnested_search_with_ads,
unnested_search_counts
FROM
`moz-fx-data-shared-prod.telemetry_stable.main_v4`,
UNNEST(
ARRAY(SELECT AS STRUCT key, value.branch AS value FROM UNNEST(environment.experiments))
) AS unnested_experiments,
UNNEST(payload.processes.parent.keyed_scalars.browser_search_ad_clicks) AS unnested_ad_clicks,
UNNEST(
payload.processes.parent.keyed_scalars.browser_search_with_ads
) AS unnested_search_with_ads,
UNNEST(
ARRAY(
SELECT AS STRUCT
SUBSTR(_key, 0, pos - 2) AS engine,
SUBSTR(_key, pos) AS source,
udf.extract_histogram_sum(value) AS `count`
FROM
UNNEST(payload.keyed_histograms.search_counts),
UNNEST([REPLACE(key, 'in-content.', 'in-content:')]) AS _key,
UNNEST([LENGTH(REGEXP_EXTRACT(_key, '.+[.].'))]) AS pos
)
) AS unnested_search_counts
WHERE
date(submission_timestamp) = @submission_date
AND ARRAY_LENGTH(environment.experiments) > 0
)
SELECT
unnested_experiments.key AS experiment,
unnested_experiments.value AS branch,
TIMESTAMP_ADD(
TIMESTAMP_TRUNC(`timestamp`, HOUR),
-- Aggregates event counts over 5-minute intervals
INTERVAL(DIV(EXTRACT(MINUTE FROM `timestamp`), 5) * 5) MINUTE
) AS window_start,
TIMESTAMP_ADD(
TIMESTAMP_TRUNC(`timestamp`, HOUR),
INTERVAL((DIV(EXTRACT(MINUTE FROM `timestamp`), 5) + 1) * 5) MINUTE
) AS window_end,
SUM(unnested_ad_clicks.value) AS ad_clicks_count,
SUM(unnested_search_with_ads.value) AS search_with_ads_count,
SUM(unnested_search_counts.count) AS search_count
* EXCEPT (submission_date, dataset_id)
FROM
all_experiments_searches
GROUP BY
experiment,
branch,
window_start,
window_end
experiment_search_aggregates_base
WHERE
submission_date = @submission_date
AND dataset_id = 'telemetry_stable'