Anna Scholtz 2022-09-07 13:19:30 -07:00
Parent 6084fe2731
Commit 13cdab45a6
11 changed files with 447 additions and 587 deletions

View file

@@ -1,8 +1,9 @@
"""OpMon.""" """OpMon."""
import enum import enum
from typing import List, Optional from typing import List, Optional, TYPE_CHECKING
from opmon.config import Summary if TYPE_CHECKING:
from opmon.config import Summary
import attr import attr
@@ -85,7 +86,7 @@ class Alert:
name: str name: str
type: AlertType type: AlertType
probes: List[Summary] probes: List["Summary"]
friendly_name: Optional[str] = None friendly_name: Optional[str] = None
description: Optional[str] = None description: Optional[str] = None
percentiles: List[int] = [] percentiles: List[int] = []
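
The Summary import moves behind TYPE_CHECKING and the annotation becomes the string "Summary", which avoids a runtime import cycle between opmon and opmon.config. A minimal sketch of the pattern, with hypothetical module names:

    # module_a.py -- hypothetical modules illustrating the TYPE_CHECKING pattern
    from typing import TYPE_CHECKING, List

    if TYPE_CHECKING:
        # Imported only by the type checker, so no runtime import cycle.
        from module_b import Summary

    def count_summaries(items: List["Summary"]) -> int:
        # "Summary" is a forward reference, resolved only during type checking.
        return len(items)

    print(count_summaries([]))  # 0
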

View file

@@ -128,7 +128,7 @@ class ProbeDefinition:
description: Optional[str] = None description: Optional[str] = None
category: Optional[str] = None category: Optional[str] = None
type: Optional[str] = None type: Optional[str] = None
statistics: Optional[Dict[str, Dict[str, Any]]] = None statistics: Optional[Dict[str, Dict[str, Any]]] = {"percentile": {}} # todo: remove default?
def resolve(self, spec: "MonitoringSpec") -> List[Summary]: def resolve(self, spec: "MonitoringSpec") -> List[Summary]:
"""Create and return a `Probe` instance from this definition.""" """Create and return a `Probe` instance from this definition."""
@@ -157,12 +157,13 @@ class ProbeDefinition:
stats_params = copy.deepcopy(params) stats_params = copy.deepcopy(params)
summaries.append( for stat in statistic.from_dict(stats_params).computation(probe):
Summary( summaries.append(
metric=probe, Summary(
statistic=statistic.from_dict(stats_params), metric=probe,
statistic=stat,
)
) )
)
return summaries return summaries
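
resolve() now fans a single probe/statistic pair out into one Summary per StatisticComputation returned by computation(probe), for example one entry per configured percentile. A rough, self-contained sketch of that shape, using simplified stand-ins for the opmon classes:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class StatisticComputation:
        point: str
        name: str

    @dataclass
    class Summary:
        metric: str
        statistic: StatisticComputation

    def expand(metric: str, computations: List[StatisticComputation]) -> List[Summary]:
        # Mirrors the new loop above: one Summary per computation of a statistic.
        return [Summary(metric=metric, statistic=c) for c in computations]

    percentiles = [
        StatisticComputation(point=f"APPROX_QUANTILES(x, 100)[OFFSET({p})]", name="percentile")
        for p in (50, 90, 99)
    ]
    print(len(expand("x", percentiles)))  # 3
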

View file

@@ -31,3 +31,11 @@ class ConfigurationException(OpmonException):
def __init__(self, slug, message="Project has been incorrectly configured."): def __init__(self, slug, message="Project has been incorrectly configured."):
"""Initialize exception.""" """Initialize exception."""
super().__init__(f"{slug} -> {message}") super().__init__(f"{slug} -> {message}")
class StatisticNotImplementedForTypeException(OpmonException):
"""Exception thrown when statistic is not implemented for metric type."""
def __init__(self, slug, message="Statistic not implemented for metric type."):
"""Initialize exception."""
super().__init__(f"{slug} -> {message}")
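
A short usage sketch for the new exception; OpmonException is replaced by Exception here so the snippet stands alone:

    class StatisticNotImplementedForTypeException(Exception):
        """Exception thrown when statistic is not implemented for metric type."""

        def __init__(self, slug, message="Statistic not implemented for metric type."):
            super().__init__(f"{slug} -> {message}")

    try:
        raise StatisticNotImplementedForTypeException("my-project")
    except StatisticNotImplementedForTypeException as e:
        print(e)  # my-project -> Statistic not implemented for metric type.
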

View file

@@ -30,7 +30,12 @@ class ExternalConfig:
def validate(self, experiment: Optional[experimenter.Experiment] = None) -> None: def validate(self, experiment: Optional[experimenter.Experiment] = None) -> None:
"""Validate the external config.""" """Validate the external config."""
conf = self.spec.resolve(experiment) conf = self.spec.resolve(experiment)
Monitoring(project="project", dataset="dataset", slug=self.slug, config=conf).validate() Monitoring(
project="moz-fx-data-shared-prod",
dataset="operational_monitoring",
slug=self.slug,
config=conf,
).validate()
def entity_from_path(path: Path) -> ExternalConfig: def entity_from_path(path: Path) -> ExternalConfig:

View file

@@ -18,8 +18,7 @@ from .utils import bq_normalize_name
PATH = Path(os.path.dirname(__file__)) PATH = Path(os.path.dirname(__file__))
QUERY_FILENAME = "{}_query.sql" QUERY_FILENAME = "metric_query.sql"
VIEW_FILENAME = "metric_view.sql"
ALERTS_FILENAME = "alerts_view.sql" ALERTS_FILENAME = "alerts_view.sql"
STATISTICS_FILENAME = "statistics.sql" STATISTICS_FILENAME = "statistics.sql"
TEMPLATE_FOLDER = PATH / "templates" TEMPLATE_FOLDER = PATH / "templates"
@@ -54,12 +53,8 @@ class Monitoring:
def run(self, submission_date): def run(self, submission_date):
"""Execute and generate the operational monitoring ETL for a specific date.""" """Execute and generate the operational monitoring ETL for a specific date."""
for data_type in DATA_TYPES: print(f"Run metrics query for {self.slug}")
# Periodically print so airflow gke operator doesn't think task is dead self.bigquery.execute(self._run_metrics_sql(submission_date))
print(f"Run query for {self.slug} for {data_type} types")
self._run_sql_for_data_type(submission_date, data_type)
print(f"Create view for {self.slug}")
self.bigquery.execute(self._get_view_sql())
print("Calculate statistics") print("Calculate statistics")
self.bigquery.execute(self._get_statistics_sql(submission_date)) self.bigquery.execute(self._get_statistics_sql(submission_date))
@@ -68,7 +63,7 @@ class Monitoring:
self._run_sql_for_alerts(submission_date) self._run_sql_for_alerts(submission_date)
return True return True
def _run_sql_for_data_type(self, submission_date: datetime, data_type: str): def _run_metrics_sql(self, submission_date: datetime):
"""Generate and execute the ETL for a specific data type.""" """Generate and execute the ETL for a specific data type."""
try: try:
self._check_runnable(submission_date) self._check_runnable(submission_date)
@@ -77,10 +72,10 @@ class Monitoring:
return return
date_partition = str(submission_date).replace("-", "").split(" ")[0] date_partition = str(submission_date).replace("-", "").split(" ")[0]
destination_table = f"{self.normalized_slug}_{data_type}${date_partition}" destination_table = f"{self.normalized_slug}${date_partition}"
self.bigquery.execute( self.bigquery.execute(
self._get_data_type_sql(submission_date=submission_date, data_type=data_type), self._get_metrics_sql(submission_date=submission_date),
destination_table, destination_table,
clustering=["build_id"], clustering=["build_id"],
time_partitioning="submission_date", time_partitioning="submission_date",
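
The per-data-type table suffix is gone; the destination is now a single table per slug with a date partition decorator derived from the submission date. A small worked example of that string handling (the slug is illustrative):

    from datetime import datetime

    submission_date = datetime(2022, 9, 7)
    normalized_slug = "my_rollout"  # illustrative

    date_partition = str(submission_date).replace("-", "").split(" ")[0]
    destination_table = f"{normalized_slug}${date_partition}"
    print(destination_table)  # my_rollout$20220907
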
@@ -95,28 +90,19 @@ class Monitoring:
sql = template.render(**render_kwargs) sql = template.render(**render_kwargs)
return sql return sql
def _get_data_type_sql( def _get_metrics_sql(
self, submission_date: datetime, data_type: str, first_run: Optional[bool] = None self, submission_date: datetime, first_run: Optional[bool] = None
) -> str: ) -> str:
"""Return SQL for data_type ETL.""" """Return SQL for data_type ETL."""
probes = self.config.probes probes = self.config.probes
probes = [probe for probe in probes if probe.metric.type == data_type]
if len(probes) == 0: if len(probes) == 0:
# There are no probes for this data source + data type combo # There are no probes for this data source + data type combo
logger.warning( logger.warning(
f"No probes for data type {data_type} configured for {self.slug}.", f"No metrics configured for {self.slug}.",
extra={"experiment": self.slug}, extra={"experiment": self.slug},
) )
# todo:
# xaxis metadata to be used to decide whether the entire table is replaced
# Or just a partition.
#
# Note: there is a subtle design here in which date partitions are replaced
# if the data is for a build over build analysis but the entire table is
# replaced if it's a submission date analysis.
# group probes that are part of the same dataset # group probes that are part of the same dataset
# necessary for creating the SQL template # necessary for creating the SQL template
metrics_per_dataset = {} metrics_per_dataset = {}
@@ -124,13 +110,14 @@ class Monitoring:
if probe.metric.data_source.name not in metrics_per_dataset: if probe.metric.data_source.name not in metrics_per_dataset:
metrics_per_dataset[probe.metric.data_source.name] = [probe.metric] metrics_per_dataset[probe.metric.data_source.name] = [probe.metric]
else: else:
metrics_per_dataset[probe.metric.data_source.name].append(probe.metric) if probe.metric not in metrics_per_dataset[probe.metric.data_source.name]:
metrics_per_dataset[probe.metric.data_source.name].append(probe.metric)
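
Metrics are grouped per data source and now de-duplicated, since the same metric can back several summaries (for example several statistics). A hedged sketch of the grouping, with probes represented as plain (data_source, metric) tuples:

    probes = [("main", "gc_ms"), ("main", "gc_ms"), ("crash", "crash_count")]

    metrics_per_dataset = {}
    for data_source, metric in probes:
        metrics = metrics_per_dataset.setdefault(data_source, [])
        if metric not in metrics:
            metrics.append(metric)

    print(metrics_per_dataset)  # {'main': ['gc_ms'], 'crash': ['crash_count']}
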
# check if this is the first time the queries are executed # check if this is the first time the queries are executed
# the queries are referencing the destination table if build_id is used for the time frame # the queries are referencing the destination table if build_id is used for the time frame
if first_run is None: if first_run is None:
destination_table = ( destination_table = (
f"{self.project}.{self.dataset}_derived.{self.normalized_slug}_{data_type}" f"{self.project}.{self.dataset}_derived.{self.normalized_slug}"
) )
first_run = True first_run = True
try: try:
@@ -147,30 +134,15 @@ class Monitoring:
"dataset": self.dataset, "dataset": self.dataset,
"first_run": first_run, "first_run": first_run,
"dimensions": self.config.dimensions, "dimensions": self.config.dimensions,
# "user_count_threshold": USERS_PER_BUILD_THRESHOLDS[
# self.config.project.population.channel
# ],
"metrics_per_dataset": metrics_per_dataset, "metrics_per_dataset": metrics_per_dataset,
"slug": self.slug, "slug": self.slug,
"normalized_slug": self.normalized_slug, "normalized_slug": self.normalized_slug,
} }
sql_filename = QUERY_FILENAME.format(data_type) sql_filename = QUERY_FILENAME
sql = self._render_sql(sql_filename, render_kwargs) sql = self._render_sql(sql_filename, render_kwargs)
return sql return sql
def _get_view_sql(self) -> str:
"""Return the SQL to create a BigQuery view."""
render_kwargs = {
"gcp_project": self.project,
"dataset": self.dataset,
"config": self.config.project,
"normalized_slug": self.normalized_slug,
"dimensions": self.config.dimensions,
}
sql = self._render_sql(VIEW_FILENAME, render_kwargs)
return sql
def _get_statistics_sql(self, submission_date) -> str: def _get_statistics_sql(self, submission_date) -> str:
"""Return the SQL to run the statistics.""" """Return the SQL to run the statistics."""
render_kwargs = { render_kwargs = {
@@ -179,7 +151,8 @@ class Monitoring:
"config": self.config.project, "config": self.config.project,
"normalized_slug": self.normalized_slug, "normalized_slug": self.normalized_slug,
"dimensions": self.config.dimensions, "dimensions": self.config.dimensions,
"probes": self.config.probes, "summaries": self.config.probes,
"submission_date": submission_date,
} }
sql = self._render_sql(STATISTICS_FILENAME, render_kwargs) sql = self._render_sql(STATISTICS_FILENAME, render_kwargs)
return sql return sql
@@ -237,11 +210,18 @@ class Monitoring:
"""Validate ETL and configs of opmon project.""" """Validate ETL and configs of opmon project."""
self._check_runnable() self._check_runnable()
for data_type in DATA_TYPES: metrics_sql = self._get_metrics_sql(
data_type_sql = self._get_data_type_sql( submission_date=self.config.project.start_date, # type: ignore
submission_date=self.config.project.start_date, # type: ignore first_run=True,
data_type=data_type, )
first_run=True, dry_run_query(metrics_sql)
) # print(data_type_sql)
dry_run_query(data_type_sql)
print(data_type_sql) statistics_sql = self._get_statistics_sql(
submission_date=self.config.project.start_date, # type: ignore
)
# print(statistics_sql)
dry_run_query(statistics_sql)
# todo: validate alerts
# todo: update alerts view/query

View file

@@ -1,9 +1,12 @@
import re import re
from abc import ABC, abstractmethod from abc import ABC
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import attr import attr
from opmon import Probe
from opmon.errors import StatisticNotImplementedForTypeException
@attr.s(auto_attribs=True) @attr.s(auto_attribs=True)
class StatisticComputation: class StatisticComputation:
@@ -31,9 +34,25 @@ class Statistic(ABC):
name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__) name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__)
return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
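
For reference, the name() helper above turns a statistic's class name into snake_case; a worked example with a hypothetical class name:

    import re

    class_name = "TotalSum"  # hypothetical statistic class name
    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", class_name)
    print(re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower())  # total_sum
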
@abstractmethod def computation(self, metric: Probe) -> List[StatisticComputation]:
def computation(self, value: str = "values") -> List[StatisticComputation]: if metric.type == "scalar":
return NotImplemented return self._scalar_computation(metric)
elif metric.type == "histogram":
return self._histogram_computation(metric)
else:
raise StatisticNotImplementedForTypeException(
f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
)
def _scalar_computation(self, metric: Probe) -> List[StatisticComputation]:
raise StatisticNotImplementedForTypeException(
f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
)
def _histogram_computation(self, metric: Probe) -> List[StatisticComputation]:
raise StatisticNotImplementedForTypeException(
f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
)
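
With this change a statistic no longer overrides computation() directly; it implements only the per-type hooks it supports, and unsupported metric types raise. A self-contained sketch of the dispatch, with stand-in classes and plain NotImplementedError instead of the opmon exception:

    from dataclasses import dataclass

    @dataclass
    class Probe:
        name: str
        type: str  # "scalar" or "histogram"

    class Statistic:
        def computation(self, metric: Probe):
            if metric.type == "scalar":
                return self._scalar_computation(metric)
            elif metric.type == "histogram":
                return self._histogram_computation(metric)
            raise NotImplementedError(f"type {metric.type} not supported")

        def _scalar_computation(self, metric: Probe):
            raise NotImplementedError(f"scalar not supported by {type(self).__name__}")

        def _histogram_computation(self, metric: Probe):
            raise NotImplementedError(f"histogram not supported by {type(self).__name__}")

    class Sum(Statistic):
        # Implements only the scalar path; histogram probes raise.
        def _scalar_computation(self, metric: Probe):
            return [f"SUM({metric.name})"]

    print(Sum().computation(Probe("active_hours", "scalar")))  # ['SUM(active_hours)']
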
@classmethod @classmethod
def from_dict(cls, config_dict: Dict[str, Any]): def from_dict(cls, config_dict: Dict[str, Any]):
@@ -42,30 +61,30 @@ class Statistic(ABC):
class Count(Statistic): class Count(Statistic):
def computation(self, value: str = "values"): def _scalar_computation(self, metric: Probe):
return [ return [
StatisticComputation( StatisticComputation(
point=f"COUNT({value})", point=f"COUNT({metric.name})",
name=self.name(), name=self.name(),
) )
] ]
class Sum(Statistic): class Sum(Statistic):
def computation(self, value: str = "values"): def _scalar_computation(self, metric: Probe):
return [ return [
StatisticComputation( StatisticComputation(
point=f"SUM({value})", point=f"SUM({metric.name})",
name=self.name(), name=self.name(),
) )
] ]
class Mean(Statistic): class Mean(Statistic):
def computation(self, value: str = "values"): def _scalar_computation(self, metric: Probe):
return [ return [
StatisticComputation( StatisticComputation(
point=f"AVG({value})", point=f"AVG({metric.name})",
name=self.name(), name=self.name(),
) )
] ]
@@ -75,11 +94,14 @@ class Quantile(Statistic):
number_of_quantiles: int = 100 number_of_quantiles: int = 100
quantile: int = 50 quantile: int = 50
def computation(self, value: str = "values"): def _scalar_computation(self, metric: Probe):
return [ return [
StatisticComputation( StatisticComputation(
point=f""" point=f"""
APPROX_QUANTILES({value}, {self.number_of_quantiles})[OFFSET({self.quantile})] APPROX_QUANTILES(
{metric.name},
{self.number_of_quantiles}
)[OFFSET({self.quantile})]
""", """,
name=self.name(), name=self.name(),
) )
@@ -90,30 +112,252 @@ class Quantile(Statistic):
class Percentile(Statistic): class Percentile(Statistic):
percentiles: List[int] = [50, 90, 99] percentiles: List[int] = [50, 90, 99]
def computation(self, value: str = "values"): def _scalar_computation(self, metric: Probe):
return [ return [
StatisticComputation( StatisticComputation(
point=f""" point=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci( `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile}, {percentile},
STRUCT( STRUCT<
{value} bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key FLOAT64, value FLOAT64>
>>(1,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0) + 0.0001
)
AS FLOAT64)
AS INT64),
0),
1,
[
0,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0
) + 0.0001
)
AS FLOAT64)
AS INT64),
0)
],
[
STRUCT<key FLOAT64, value FLOAT64>(
COALESCE(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
),
0) + 0.0001
) AS FLOAT64
), 0.0
), 1
)
]
) )
).percentile ).percentile
""", """,
lower=f""" lower=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci( `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile}, {percentile},
STRUCT( STRUCT<
{value} bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key FLOAT64, value FLOAT64>
>>(1,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0) + 0.0001
)
AS FLOAT64)
AS INT64),
0),
1,
[
0,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0
) + 0.0001
)
AS FLOAT64)
AS INT64),
0)
],
[
STRUCT<key FLOAT64, value FLOAT64>(
COALESCE(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
),
0) + 0.0001
) AS FLOAT64
), 0.0
), 1
)
]
) )
).low ).low
""", """,
upper=f""" upper=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci( `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile}, {percentile},
STRUCT( STRUCT<
{value} bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key FLOAT64, value FLOAT64>
>>(1,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0) + 0.0001
)
AS FLOAT64)
AS INT64),
0),
1,
[
0,
COALESCE(
SAFE_CAST(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
), 0
) + 0.0001
)
AS FLOAT64)
AS INT64),
0)
],
[
STRUCT<key FLOAT64, value FLOAT64>(
COALESCE(
SAFE_CAST(
FORMAT(
"%.*f",
2,
COALESCE(
mozfun.glam.histogram_bucket_from_value(
{metric.name}_buckets,
SAFE_CAST({metric.name} AS FLOAT64)
),
0) + 0.0001
) AS FLOAT64
), 0.0
), 1
)
]
)
).high
""",
name=self.name(),
parameter=str(percentile),
)
for percentile in self.percentiles
]
def _histogram_computation(self, metric: Probe) -> List[StatisticComputation]:
return [
StatisticComputation(
point=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile},
STRUCT(
histogram_normalized_sum(
mozfun.hist.merge(
ARRAY_AGG({metric.name} IGNORE NULLS)
).values, 1.0
)
)
).percentile
""",
lower=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile},
STRUCT(
histogram_normalized_sum(
mozfun.hist.merge(
ARRAY_AGG({metric.name} IGNORE NULLS)
).values, 1.0
)
)
).low
""",
upper=f"""
`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
{percentile},
STRUCT(
histogram_normalized_sum(
mozfun.hist.merge(
ARRAY_AGG({metric.name} IGNORE NULLS)
).values, 1.0
)
) )
).high ).high
""", """,

View file

@@ -1,229 +0,0 @@
{{ header }}
{% include 'population.sql' %},
-- for each data source that is used
-- select the metric values
{% for data_source, metrics in metrics_per_dataset.items() -%}
merged_metrics_{{ data_source }} AS (
SELECT
DATE({{ metrics[0].data_source.submission_date_column }}) AS submission_date,
{{ config.population.data_source.client_id_column }} AS client_id,
p.population_build_id AS build_id,
ARRAY<
STRUCT<
metric STRING,
histograms ARRAY<
STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
values ARRAY<STRUCT<key INT64, value INT64>>>
>>
>[
{% for metric in metrics %}
(
"{{ metric.name }}",
{{ metric.select_expression }}
)
{{ "," if not loop.last else "" }}
{% endfor %}
] AS metrics,
FROM
{{ metrics[0].data_source.from_expression }}
RIGHT JOIN
(
SELECT
client_id AS population_client_id,
submission_date AS population_submission_date,
build_id AS population_build_id
FROM
population
) AS p
ON
{{ metrics[0].data_source.submission_date_column }} = p.population_submission_date AND
{{ config.population.data_source.client_id_column }} = p.population_client_id
WHERE
{% if config.xaxis.value == "submission_date" %}
DATE({{ metrics[0].data_source.submission_date_column }}) = DATE('{{ submission_date }}')
{% else %}
-- when aggregating by build_id, only use the most recent 14 days of data
DATE({{ metrics[0].data_source.submission_date_column }}) BETWEEN DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY) AND DATE('{{ submission_date }}')
{% endif %}
GROUP BY
submission_date,
client_id,
build_id
),
{% endfor %}
-- combine the metrics from all the data sources
joined_histograms AS (
SELECT
population.submission_date AS submission_date,
population.client_id AS client_id,
population.build_id,
{% for dimension in dimensions %}
population.{{ dimension.name }} AS {{ dimension.name }},
{% endfor %}
population.branch AS branch,
{% if metrics_per_dataset != {} %}
ARRAY_CONCAT(
{% for data_source, metrics in metrics_per_dataset.items() %}
merged_metrics_{{ data_source }}.metrics
{% endfor %}
) AS metrics
{% else %}
[] AS metrics,
{% endif %}
FROM population
{% for data_source, metrics in metrics_per_dataset.items() %}
LEFT JOIN merged_metrics_{{ data_source }}
USING(submission_date, client_id)
{% endfor %}
),
-- merge histograms if client has multiple
merged_histograms AS (
SELECT
submission_date,
client_id,
build_id,
branch,
{% for dimension in dimensions %}
{{ dimension.name }},
{% endfor %}
{% if metrics_per_dataset != {} %}
ARRAY_AGG(
STRUCT<
name STRING,
histogram STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
values ARRAY<STRUCT<key INT64, value INT64>>
>
> (
metric,
CASE
WHEN
histograms IS NULL
THEN
NULL
ELSE
mozfun.hist.merge(histograms)
END
)
) AS metrics
{% else %}
[] AS metrics
{% endif %}
FROM
joined_histograms
CROSS JOIN
UNNEST(metrics)
{% if not config.population.monitor_entire_population %}
WHERE branch IN (
-- If branches are not defined, assume it's a rollout
-- and fall back to branches labeled as enabled/disabled
{% if config.population.branches|length > 0 -%}
{% for branch in config.population.branches -%}
"{{ branch }}"
{{ "," if not loop.last else "" }}
{% endfor -%}
{% else -%}
"enabled", "disabled"
{% endif -%}
)
{% endif %}
GROUP BY
submission_date,
client_id,
build_id,
{% for dimension in dimensions %}
{{ dimension.name }},
{% endfor %}
branch
),
-- Cast histograms to have string keys so we can use the histogram normalization function
normalized_histograms AS (
SELECT
submission_date,
client_id,
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
{% if metrics_per_dataset != {} %}
name AS metric,
{% else %}
NULL AS metric,
{% endif %}
{% if metrics_per_dataset != {} %}
STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key STRING, value INT64>>
>(histogram.bucket_count,
histogram.sum,
histogram.histogram_type,
histogram.range,
ARRAY(SELECT AS STRUCT CAST(keyval.key AS STRING), keyval.value FROM UNNEST(histogram.values) keyval)
) AS value
{% else %}
NULL AS value
{% endif %}
FROM merged_histograms
CROSS JOIN UNNEST(metrics)
)
{% if first_run or config.xaxis.value == "submission_date" -%}
SELECT
submission_date,
client_id,
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
metric AS probe,
value
FROM
normalized_histograms
{% else -%}
SELECT
DATE('{{ submission_date }}') AS submission_date,
client_id,
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
metric AS probe,
value
FROM normalized_histograms _current
WHERE
PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) >= DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
UNION ALL
SELECT
DATE('{{ submission_date }}') AS submission_date,
client_id,
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
metric AS probe,
value
FROM normalized_histograms _prev
WHERE
PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) < DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
AND submission_date = DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 1 DAY)
{% endif -%}

View file

@@ -5,25 +5,14 @@
-- for each data source that is used -- for each data source that is used
-- select the metric values -- select the metric values
{% for data_source, metrics in metrics_per_dataset.items() -%} {% for data_source, metrics in metrics_per_dataset.items() -%}
merged_scalars_{{ data_source }} AS ( merged_metrics_{{ data_source }} AS (
SELECT SELECT
DATE({{ metrics[0].data_source.submission_date_column }}) AS submission_date, DATE({{ metrics[0].data_source.submission_date_column }}) AS submission_date,
{{ config.population.data_source.client_id_column }} AS client_id, {{ config.population.data_source.client_id_column }} AS client_id,
p.population_build_id AS build_id, p.population_build_id AS build_id,
ARRAY< {% for metric in metrics -%}
STRUCT< {{ metric.select_expression }} AS {{ metric.name }},
name STRING, {% endfor -%}
value FLOAT64
>
>[
{% for metric in metrics -%}
(
"{{ metric.name }}",
CAST({{ metric.select_expression }} AS FLOAT64)
)
{{ "," if not loop.last else "" }}
{% endfor -%}
] AS metrics,
FROM FROM
{{ metrics[0].data_source.from_expression }} {{ metrics[0].data_source.from_expression }}
RIGHT JOIN RIGHT JOIN
@@ -53,7 +42,7 @@ merged_scalars_{{ data_source }} AS ( merged_metrics_{{ data_source }} AS (
{% endfor %} {% endfor %}
-- combine the metrics from all the data sources -- combine the metrics from all the data sources
joined_scalars AS ( joined_metrics AS (
SELECT SELECT
population.submission_date AS submission_date, population.submission_date AS submission_date,
population.client_id AS client_id, population.client_id AS client_id,
@@ -62,25 +51,23 @@ joined_scalars AS ( joined_metrics AS (
population.{{ dimension.name }} AS {{ dimension.name }}, population.{{ dimension.name }} AS {{ dimension.name }},
{% endfor %} {% endfor %}
population.branch AS branch, population.branch AS branch,
ARRAY_CONCAT( {% for data_source, metrics in metrics_per_dataset.items() -%}
{% for data_source, metrics in metrics_per_dataset.items() -%} {% for metric in metrics -%}
COALESCE(merged_scalars_{{ data_source }}.metrics, []) {{ metric.name }},
{{ "," if not loop.last else "" }} {% endfor -%}
{% endfor -%} {% endfor -%}
) AS metrics
FROM population FROM population
{% for data_source, metrics in metrics_per_dataset.items() -%} {% for data_source, metrics in metrics_per_dataset.items() -%}
LEFT JOIN merged_scalars_{{ data_source }} LEFT JOIN merged_metrics_{{ data_source }}
USING(submission_date, client_id, build_id) USING(submission_date, client_id, build_id)
{% endfor %} {% endfor %}
), ),
-- unnest the combined metrics so we get -- normalize histograms and apply filters
-- the metric values for each client for each date normalized_metrics AS (
flattened_scalars AS ( SELECT
SELECT * EXCEPT(metrics) *
FROM joined_scalars FROM joined_metrics
CROSS JOIN UNNEST(metrics)
{% if not config.population.monitor_entire_population %} {% if not config.population.monitor_entire_population %}
WHERE branch IN ( WHERE branch IN (
-- If branches are not defined, assume it's a rollout -- If branches are not defined, assume it's a rollout
@@ -98,44 +85,20 @@ flattened_scalars AS ( normalized_metrics AS (
) )
{% if first_run or config.xaxis.value == "submission_date" -%} {% if first_run or config.xaxis.value == "submission_date" -%}
SELECT SELECT
submission_date, *
client_id,
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
name,
value
FROM FROM
flattened_scalars normalized_metrics
{% else -%} {% else -%}
-- if data is aggregated by build ID, then aggregate data with previous runs -- if data is aggregated by build ID, then aggregate data with previous runs
SELECT SELECT
DATE('{{ submission_date }}') AS submission_date, *
client_id, FROM normalized_metrics _current
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
name,
value
FROM flattened_scalars _current
WHERE WHERE
PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) >= DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY) PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) >= DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
UNION ALL UNION ALL
SELECT SELECT
DATE('{{ submission_date }}') AS submission_date, SELECT * REPLACE (DATE('{{ submission_date }}') AS submission_date)
client_id, FROM normalized_metrics _prev
build_id,
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor %}
branch,
name,
value
FROM flattened_scalars _prev
WHERE WHERE
PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) < DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY) PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) < DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
AND submission_date = DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 1 DAY) AND submission_date = DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 1 DAY)

View file

@@ -1,167 +0,0 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ gcp_project }}.{{ dataset }}.{{ normalized_slug }}`
AS
-- Prepare scalar values
WITH filtered_scalars AS (
SELECT *
FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}_scalar`
WHERE {% include 'where_clause.sql' -%}
),
log_min_max AS (
SELECT
name,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) log_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) log_max
FROM
filtered_scalars
GROUP BY name),
buckets_by_metric AS (
SELECT
name,
ARRAY(SELECT FORMAT("%.*f", 2, bucket) FROM UNNEST(
mozfun.glam.histogram_generate_scalar_buckets(log_min, log_max, 100)
) AS bucket ORDER BY bucket) AS buckets
FROM log_min_max
),
aggregated_scalars AS (
SELECT
client_id,
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else %}
build_id,
{% endif %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
name,
value
FROM
filtered_scalars
),
-- Prepare histogram values
filtered_histograms AS (
SELECT *
FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}_histogram`
WHERE {% include 'where_clause.sql' -%}
),
normalized_histograms AS (
SELECT
client_id,
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else -%}
build_id,
{% endif -%}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
probe,
{% if metrics_per_dataset != {} %}
STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key STRING, value FLOAT64>>
>(
ANY_VALUE(value.bucket_count),
ANY_VALUE(value.sum),
ANY_VALUE(value.histogram_type),
ANY_VALUE(value.range),
mozfun.glam.histogram_normalized_sum(
mozfun.hist.merge(ARRAY_AGG(value IGNORE NULLS)).values,
1.0
)
) AS value
{% else %}
NULL AS value
{% endif %}
FROM filtered_histograms
GROUP BY
client_id,
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else -%}
build_id,
{% endif %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
probe)
-- Cast histograms to have FLOAT64 keys
-- so we can use the histogram jackknife percentile function.
SELECT
client_id,
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else -%}
build_id,
{% endif %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
probe AS probe,
{% if metrics_per_dataset != {} %}
STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key FLOAT64, value FLOAT64>
>>(value.bucket_count,
value.sum,
value.histogram_type,
value.range,
ARRAY(SELECT AS STRUCT CAST(keyval.key AS FLOAT64), keyval.value FROM UNNEST(value.values) keyval)
) AS value
{% else %}
NULL AS value
{% endif %}
FROM normalized_histograms
UNION ALL
SELECT
client_id,
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else %}
build_id,
{% endif %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
branch,
name AS probe,
STRUCT<
bucket_count INT64,
sum INT64,
histogram_type INT64,
`range` ARRAY<INT64>,
VALUES
ARRAY<STRUCT<key FLOAT64, value FLOAT64>
>>(1,
COALESCE(SAFE_CAST(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64) AS INT64), 0),
1,
[0, COALESCE(SAFE_CAST(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64) AS INT64), 0)],
[STRUCT<key FLOAT64, value FLOAT64>(
COALESCE(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64), 0.0), 1
)]
) AS value
FROM
aggregated_scalars
LEFT JOIN buckets_by_metric USING(name)

View file

@@ -0,0 +1,40 @@
CREATE TEMPORARY FUNCTION histogram_normalized_sum(
arrs ARRAY<STRUCT<key INT64, value INT64>>,
weight FLOAT64
)
RETURNS ARRAY<STRUCT<key INT64, value FLOAT64>> AS (
-- Input: one histogram for a single client.
-- Returns the normalized sum of the input maps.
-- It returns the total_count[k] / SUM(total_count)
-- for each key k.
(
WITH total_counts AS (
SELECT
sum(a.value) AS total_count
FROM
UNNEST(arrs) AS a
),
summed_counts AS (
SELECT
a.key AS k,
SUM(a.value) AS v
FROM
UNNEST(arrs) AS a
GROUP BY
a.key
)
SELECT
ARRAY_AGG(
STRUCT<key INT64, value FLOAT64>(
k,
COALESCE(SAFE_DIVIDE(1.0 * v, total_count), 0) * weight
)
ORDER BY
SAFE_CAST(k AS INT64)
)
FROM
summed_counts
CROSS JOIN
total_counts
)
);
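
The temporary UDF sums the histogram values per key and divides each by the total count, scaled by the given weight. The same computation in Python, as a reference (purely illustrative):

    def histogram_normalized_sum(arrs, weight=1.0):
        # arrs is a list of (key, value) pairs from one client's histogram.
        total = sum(value for _, value in arrs)
        summed = {}
        for key, value in arrs:
            summed[key] = summed.get(key, 0) + value
        return [
            (key, (value / total if total else 0.0) * weight)
            for key, value in sorted(summed.items())
        ]

    print(histogram_normalized_sum([(0, 2), (1, 6), (1, 2)]))  # [(0, 0.2), (1, 0.8)]
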

View file

@@ -1,4 +1,39 @@
WITH merged AS ( {{ header }}
{% include 'normalized_sum_udf.sql' %}
WITH filtered_metrics AS (
SELECT *
FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}`
WHERE {% include 'where_clause.sql' -%}
),
-- bucket metrics that use percentile
buckets_by_metric AS (
SELECT
[] AS dummy,
{% set seen_metrics = [] %}
{% for summary in summaries %}
{% if summary.statistic.name == "percentile" %}
{% if summary.metric.type == "scalar" -%}
{% if summary.metric.name not in seen_metrics %}
{% if seen_metrics.append(summary.metric.name) %} {% endif %}
ARRAY(SELECT FORMAT("%.*f", 2, bucket) FROM UNNEST(
mozfun.glam.histogram_generate_scalar_buckets(
LOG(IF(MIN(value) <= 0, 1, MIN({{ summary.metric.name }})), 2),
LOG(IF(MAX(value) <= 0, 1, MAX({{ summary.metric.name }})), 2),
100
)
) AS bucket ORDER BY bucket) AS {{ summary.metric.name }}_buckets,
{% endif %}
{% endif %}
{% endif %}
{% endfor %}
FROM filtered_metrics
),
stats AS (
SELECT SELECT
{% if config.xaxis.value == "submission_date" -%} {% if config.xaxis.value == "submission_date" -%}
submission_date, submission_date,
@@ -9,24 +44,37 @@ WITH merged AS (
{{ dimension.name }}, {{ dimension.name }},
{% endfor -%} {% endfor -%}
branch, branch,
probe AS metric, ARRAY<STRUCT<
mozfun.hist.merge(ARRAY_AGG(value IGNORE NULLS)).values AS values metric STRING,
statistic STRING,
point FLOAT64,
lower FLOAT64,
upper FLOAT64,
parameter STRING
>>[
{% for summary in summaries %}
STRUCT(
'{{ summary.metric.name }}' AS metric,
'{{ summary.statistic.name }}' AS statistic,
{{ summary.statistic.point }} AS point
{% if summary.statistic.lower -%}
,{{ summary.statistic.lower }} AS lower
{% endif -%}
{% if summary.statistic.upper -%}
,{{ summary.statistic.upper }} AS upper
{% endif -%}
{% if summary.statistic.parameter -%}
,'{{ summary.statistic.parameter }}' AS parameter
{% endif -%}
)
{{ "," if not loop.last else "" }}
{% endfor %}
] AS statistics
FROM FROM
`{{ gcp_project }}.{{ dataset }}.{{ normalized_slug }}` `{{ gcp_project }}.{{ dataset }}.{{ normalized_slug }}`
CROSS JOIN buckets_by_metric
WHERE submission_date = DATE("{{ submission_date }}") WHERE submission_date = DATE("{{ submission_date }}")
GROUP BY GROUP BY
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
build_id,
{% endif %}
branch,
metric
), stats AS (
SELECT
{% if config.xaxis.value == "submission_date" -%} {% if config.xaxis.value == "submission_date" -%}
submission_date, submission_date,
{% else %} {% else %}
@@ -35,43 +83,9 @@ WITH merged AS (
{% for dimension in dimensions -%} {% for dimension in dimensions -%}
{{ dimension.name }}, {{ dimension.name }},
{% endfor -%} {% endfor -%}
branch, branch
metric,
CASE value
{% for probe in probes %}
WHEN probe = "{{ probe.metric.name }}"
THEN ARRAY<STRUCT<>>[(
{% for stat in probe.statistics %}
{{ stat.name }} AS statistic,
{{ stat.point }} AS point,
{% if stat.lower -%}
stat.lower AS lower,
{% endif -%}
{% if stat.upper -%}
stat.upper AS upper,
{% endif -%}
{% if stat.parameter -%}
stat.parameter AS parameter,
{% endif -%}
{% enfor %}
)]
{% endfor %}
ELSE NULL
END AS values
FROM
merged
GROUP BY
{% if config.xaxis.value == "submission_date" -%}
submission_date,
{% else %}
{% for dimension in dimensions -%}
{{ dimension.name }},
{% endfor -%}
build_id,
{% endif %}
branch,
metric
) )
SELECT SELECT
{% if config.xaxis.value == "submission_date" -%} {% if config.xaxis.value == "submission_date" -%}
submission_date, submission_date,
@@ -82,10 +96,10 @@ SELECT
{{ dimension.name }}, {{ dimension.name }},
{% endfor -%} {% endfor -%}
branch, branch,
metric, statistic.metric AS metric,
statistic.name AS statistic, statistic.name AS statistic,
statistic.point AS point, statistic.point AS point,
statistic.lower AS lower, statistic.lower AS lower,
statistic.upper AS upper, statistic.upper AS upper,
statistic.parameter AS parameter statistic.parameter AS parameter
FROM stats, UNNEST(values) as statistic FROM stats, UNNEST(statistics) as statistic
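
After the final UNNEST, each output row carries one statistic for one metric. A hedged sketch of one resulting row (all values invented):

    example_row = {
        "submission_date": "2022-09-07",
        "branch": "enabled",
        "metric": "gc_ms",
        "statistic": "percentile",
        "point": 123.4,
        "lower": 118.9,
        "upper": 130.2,
        "parameter": "90",
    }
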