Move experiments_search_aggregates logic to a view (#1239)
This commit is contained in:
Родитель
654401a36d
Коммит
e72b2777d2
|
@ -0,0 +1,61 @@
|
|||
-- Search metrics from main pings, aggregated per experiment branch over
-- 5-minute windows of submission_timestamp. Rows from the stable and live
-- main_v4 tables are both exposed and distinguished by dataset_id, so callers
-- filter on submission_date and dataset_id to pick the source they need.
CREATE OR REPLACE VIEW
  `moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_base`
AS
WITH live_and_stable AS (
  -- Select the required fields by name: UNION ALL matches columns by position,
  -- so SELECT * would silently depend on both tables sharing a column order.
  SELECT
    submission_timestamp,
    environment.experiments AS experiments,
    payload.processes.parent.keyed_scalars.browser_search_ad_clicks AS ad_clicks,
    payload.processes.parent.keyed_scalars.browser_search_with_ads AS search_with_ads,
    payload.keyed_histograms.search_counts AS search_counts,
    'telemetry_stable' AS dataset_id
  FROM
    `moz-fx-data-shared-prod.telemetry_stable.main_v4`
  UNION ALL
  SELECT
    submission_timestamp,
    environment.experiments AS experiments,
    payload.processes.parent.keyed_scalars.browser_search_ad_clicks AS ad_clicks,
    payload.processes.parent.keyed_scalars.browser_search_with_ads AS search_with_ads,
    payload.keyed_histograms.search_counts AS search_counts,
    'telemetry_live' AS dataset_id
  FROM
    `moz-fx-data-shared-prod.telemetry_live.main_v4`
),
per_ping AS (
  -- Collapse each keyed metric to one total per ping BEFORE joining on
  -- experiments. Cross joining the three arrays against each other would count
  -- every value once per element of the other two arrays, and would drop the
  -- whole ping whenever any one of the arrays is empty.
  SELECT
    submission_timestamp,
    dataset_id,
    experiments,
    (SELECT SUM(value) FROM UNNEST(ad_clicks)) AS ad_clicks_count,
    (SELECT SUM(value) FROM UNNEST(search_with_ads)) AS search_with_ads_count,
    (
      SELECT
        SUM(`moz-fx-data-shared-prod`.udf.extract_histogram_sum(value))
      FROM
        UNNEST(search_counts)
    ) AS search_count
  FROM
    live_and_stable
)
SELECT
  DATE(submission_timestamp) AS submission_date,
  dataset_id,
  unnested_experiments.key AS experiment,
  unnested_experiments.value.branch AS branch,
  TIMESTAMP_ADD(
    TIMESTAMP_TRUNC(submission_timestamp, HOUR),
    -- Aggregates event counts over 5-minute intervals
    INTERVAL(DIV(EXTRACT(MINUTE FROM submission_timestamp), 5) * 5) MINUTE
  ) AS window_start,
  TIMESTAMP_ADD(
    TIMESTAMP_TRUNC(submission_timestamp, HOUR),
    INTERVAL((DIV(EXTRACT(MINUTE FROM submission_timestamp), 5) + 1) * 5) MINUTE
  ) AS window_end,
  -- A window in which enrolled clients reported no search activity yields 0
  -- rather than NULL.
  COALESCE(SUM(ad_clicks_count), 0) AS ad_clicks_count,
  COALESCE(SUM(search_with_ads_count), 0) AS search_with_ads_count,
  COALESCE(SUM(search_count), 0) AS search_count
FROM
  per_ping
CROSS JOIN
  -- One output row per (ping, experiment); pings with no experiments drop out.
  UNNEST(experiments) AS unnested_experiments
GROUP BY
  submission_date,
  dataset_id,
  experiment,
  branch,
  window_start,
  window_end
|
|
@ -7,17 +7,16 @@ import sys
|
|||
import textwrap
|
||||
from time import sleep
|
||||
|
||||
|
||||
# Command-line options for the view generator; main() parses them into a dict
# that is handed to generate_sql().
p = argparse.ArgumentParser()
p.add_argument(
    "--submission-date",
    type=str,
    help="Cut-off date for using pre-computed vs live tables in view",
    required=True,
)
p.add_argument(
    "--json-output",
    action="store_true",
    help="Output the result wrapped in json parseable as an XCOM",
)
|
||||
p.add_argument(
|
||||
|
@ -27,6 +26,7 @@ p.add_argument(
|
|||
help="Add a delay before executing the script to allow time for the xcom sidecar to complete startup",
|
||||
)
|
||||
|
||||
|
||||
def generate_sql(opts):
|
||||
"""Create a BigQuery SQL query for the experiment search aggregates view with new date filled in.
|
||||
Unfortunately, BigQuery does not allow parameters in view definitions, so this script is a very thin
|
||||
|
@ -34,82 +34,24 @@ def generate_sql(opts):
|
|||
live table for the view"""
|
||||
query = textwrap.dedent(
|
||||
"""
|
||||
CREATE
|
||||
OR REPLACE VIEW `moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_live_v1` AS
|
||||
WITH all_experiments_searches_live AS (
|
||||
SELECT
|
||||
submission_timestamp AS timestamp,
|
||||
unnested_experiments,
|
||||
unnested_ad_clicks,
|
||||
unnested_search_with_ads,
|
||||
unnested_search_counts
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.telemetry_live.main_v4`,
|
||||
UNNEST(
|
||||
ARRAY(SELECT AS STRUCT key, value.branch AS value FROM UNNEST(environment.experiments))
|
||||
) AS unnested_experiments,
|
||||
UNNEST(payload.processes.parent.keyed_scalars.browser_search_ad_clicks) AS unnested_ad_clicks,
|
||||
UNNEST(
|
||||
payload.processes.parent.keyed_scalars.browser_search_with_ads
|
||||
) AS unnested_search_with_ads,
|
||||
UNNEST(
|
||||
ARRAY(
|
||||
SELECT AS STRUCT
|
||||
SUBSTR(_key, 0, pos - 2) AS engine,
|
||||
SUBSTR(_key, pos) AS source,
|
||||
udf.extract_histogram_sum(value) AS `count`
|
||||
FROM
|
||||
UNNEST(payload.keyed_histograms.search_counts),
|
||||
UNNEST([REPLACE(key, 'in-content.', 'in-content:')]) AS _key,
|
||||
UNNEST([LENGTH(REGEXP_EXTRACT(_key, '.+[.].'))]) AS pos
|
||||
)
|
||||
) AS unnested_search_counts
|
||||
WHERE
|
||||
date(submission_timestamp) > '{submission_date}'
|
||||
AND ARRAY_LENGTH(environment.experiments) > 0
|
||||
),
|
||||
live AS (
|
||||
SELECT
|
||||
unnested_experiments.key AS experiment,
|
||||
unnested_experiments.value AS branch,
|
||||
TIMESTAMP_ADD(
|
||||
TIMESTAMP_TRUNC(`timestamp`, HOUR),
|
||||
-- Aggregates event counts over 5-minute intervals
|
||||
INTERVAL(DIV(EXTRACT(MINUTE FROM `timestamp`), 5) * 5) MINUTE
|
||||
) AS window_start,
|
||||
TIMESTAMP_ADD(
|
||||
TIMESTAMP_TRUNC(`timestamp`, HOUR),
|
||||
INTERVAL((DIV(EXTRACT(MINUTE FROM `timestamp`), 5) + 1) * 5) MINUTE
|
||||
) AS window_end,
|
||||
SUM(unnested_ad_clicks.value) AS ad_clicks_count,
|
||||
SUM(unnested_search_with_ads.value) AS search_with_ads_count,
|
||||
SUM(unnested_search_counts.count) AS search_count
|
||||
FROM
|
||||
all_experiments_searches_live
|
||||
GROUP BY
|
||||
experiment,
|
||||
branch,
|
||||
window_start,
|
||||
window_end
|
||||
),
|
||||
previous AS (
|
||||
CREATE OR REPLACE VIEW
|
||||
`moz-fx-data-shared-prod.telemetry.experiment_search_aggregates_live_v1`
|
||||
AS
|
||||
WITH all_searches AS (
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
`moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_v1`
|
||||
WHERE
|
||||
date(window_start) <= '{submission_date}'
|
||||
),
|
||||
all_searches AS (
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
previous
|
||||
date(window_start) <= DATE '{submission_date}'
|
||||
UNION ALL
|
||||
SELECT
|
||||
*
|
||||
* EXCEPT (submission_date, dataset_id)
|
||||
FROM
|
||||
live
|
||||
`moz-fx-data-shared-prod.telemetry_derived.experiment_search_aggregates_base`
|
||||
WHERE
|
||||
submission_date > DATE '{submission_date}'
|
||||
AND dataset_id = 'telemetry_live'
|
||||
)
|
||||
SELECT
|
||||
*,
|
||||
|
@ -132,7 +74,7 @@ def generate_sql(opts):
|
|||
**opts
|
||||
)
|
||||
)
|
||||
if opts['json_output']:
|
||||
if opts["json_output"]:
|
||||
return json.dumps(query)
|
||||
else:
|
||||
return query
|
||||
|
@ -141,7 +83,7 @@ def generate_sql(opts):
|
|||
def main(argv, out=print):
    """Print experiment search aggregates view sql to stdout.

    Args:
        argv: Full argument vector; argv[0] (the program name) is skipped and
            the remainder is parsed with the module-level parser ``p``.
        out: Callable that receives the generated SQL string. Defaults to
            ``print``.
    """
    opts = vars(p.parse_args(argv[1:]))
    # Wait exactly once before generating output; the wait_seconds option is
    # declared on the parser outside this function.
    sleep(opts["wait_seconds"])
    out(generate_sql(opts))
|
||||
|
||||
|
||||
|
|
|
@ -1,54 +1,7 @@
|
|||
-- Aggregates for a single submission date, read from the shared base view and
-- restricted to rows derived from the stable main ping table. The columns are
-- listed explicitly (everything except submission_date and dataset_id) so the
-- output schema does not shift if the view gains columns.
SELECT
  experiment,
  branch,
  window_start,
  window_end,
  ad_clicks_count,
  search_with_ads_count,
  search_count
FROM
  experiment_search_aggregates_base
WHERE
  submission_date = @submission_date
  AND dataset_id = 'telemetry_stable'
|
||||
|
|
Загрузка…
Ссылка в новой задаче