Mirror of https://github.com/mozilla/opmon.git

Support custom statistics

Parent: 6084fe2731
Commit: 13cdab45a6
@@ -1,8 +1,9 @@
 """OpMon."""
 import enum
-from typing import List, Optional
+from typing import List, Optional, TYPE_CHECKING

-from opmon.config import Summary
+if TYPE_CHECKING:
+    from opmon.config import Summary

 import attr

@@ -85,7 +86,7 @@ class Alert:
     name: str
     type: AlertType
-    probes: List[Summary]
+    probes: List["Summary"]
     friendly_name: Optional[str] = None
     description: Optional[str] = None
     percentiles: List[int] = []

@@ -128,7 +128,7 @@ class ProbeDefinition:
     description: Optional[str] = None
     category: Optional[str] = None
     type: Optional[str] = None
-    statistics: Optional[Dict[str, Dict[str, Any]]] = None
+    statistics: Optional[Dict[str, Dict[str, Any]]] = {"percentile": {}}  # todo: remove default?

     def resolve(self, spec: "MonitoringSpec") -> List[Summary]:
         """Create and return a `Probe` instance from this definition."""
@@ -157,12 +157,13 @@ class ProbeDefinition:

                 stats_params = copy.deepcopy(params)

-                summaries.append(
-                    Summary(
-                        metric=probe,
-                        statistic=statistic.from_dict(stats_params),
-                    )
-                )
+                for stat in statistic.from_dict(stats_params).computation(probe):
+                    summaries.append(
+                        Summary(
+                            metric=probe,
+                            statistic=stat,
+                        )
+                    )

         return summaries

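The loop above is the core of this change: one configured statistic can now expand into several StatisticComputation rows, and each row becomes its own Summary. A minimal, self-contained sketch of that expansion, using stand-in classes rather than the real opmon types (the metric name and percentiles below are made up for illustration):

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class StatisticComputation:
        name: str
        point: str
        parameter: Optional[str] = None


    @dataclass
    class Summary:
        metric: str
        statistic: StatisticComputation


    class Percentile:
        def __init__(self, percentiles: List[int]):
            self.percentiles = percentiles

        def computation(self, metric: str) -> List[StatisticComputation]:
            # one computation per configured percentile
            return [
                StatisticComputation(name="percentile", point=f"<sql for {metric}>", parameter=str(p))
                for p in self.percentiles
            ]


    summaries = [
        Summary(metric="gc_ms", statistic=stat)
        for stat in Percentile(percentiles=[50, 95]).computation("gc_ms")
    ]
    print(len(summaries))  # 2: one Summary per percentile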
@@ -31,3 +31,11 @@ class ConfigurationException(OpmonException):
     def __init__(self, slug, message="Project has been incorrectly configured."):
         """Initialize exception."""
         super().__init__(f"{slug} -> {message}")
+
+
+class StatisticNotImplementedForTypeException(OpmonException):
+    """Exception thrown when statistic is not implemented for metric type."""
+
+    def __init__(self, slug, message="Statistic not implemented for metric type."):
+        """Initialize exception."""
+        super().__init__(f"{slug} -> {message}")
@@ -30,7 +30,12 @@ class ExternalConfig:
     def validate(self, experiment: Optional[experimenter.Experiment] = None) -> None:
         """Validate the external config."""
         conf = self.spec.resolve(experiment)
-        Monitoring(project="project", dataset="dataset", slug=self.slug, config=conf).validate()
+        Monitoring(
+            project="moz-fx-data-shared-prod",
+            dataset="operational_monitoring",
+            slug=self.slug,
+            config=conf,
+        ).validate()


 def entity_from_path(path: Path) -> ExternalConfig:
@@ -18,8 +18,7 @@ from .utils import bq_normalize_name

 PATH = Path(os.path.dirname(__file__))

-QUERY_FILENAME = "{}_query.sql"
-VIEW_FILENAME = "metric_view.sql"
+QUERY_FILENAME = "metric_query.sql"
 ALERTS_FILENAME = "alerts_view.sql"
 STATISTICS_FILENAME = "statistics.sql"
 TEMPLATE_FOLDER = PATH / "templates"
@@ -54,12 +53,8 @@ class Monitoring:

     def run(self, submission_date):
         """Execute and generate the operational monitoring ETL for a specific date."""
-        for data_type in DATA_TYPES:
-            # Periodically print so airflow gke operator doesn't think task is dead
-            print(f"Run query for {self.slug} for {data_type} types")
-            self._run_sql_for_data_type(submission_date, data_type)
-
-        print(f"Create view for {self.slug}")
-        self.bigquery.execute(self._get_view_sql())
+        print(f"Run metrics query for {self.slug}")
+        self.bigquery.execute(self._run_metrics_sql(submission_date))

         print("Calculate statistics")
         self.bigquery.execute(self._get_statistics_sql(submission_date))
@@ -68,7 +63,7 @@ class Monitoring:
         self._run_sql_for_alerts(submission_date)
         return True

-    def _run_sql_for_data_type(self, submission_date: datetime, data_type: str):
+    def _run_metrics_sql(self, submission_date: datetime):
         """Generate and execute the ETL for a specific data type."""
         try:
             self._check_runnable(submission_date)
@@ -77,10 +72,10 @@ class Monitoring:
             return

         date_partition = str(submission_date).replace("-", "").split(" ")[0]
-        destination_table = f"{self.normalized_slug}_{data_type}${date_partition}"
+        destination_table = f"{self.normalized_slug}${date_partition}"

         self.bigquery.execute(
-            self._get_data_type_sql(submission_date=submission_date, data_type=data_type),
+            self._get_metrics_sql(submission_date=submission_date),
             destination_table,
             clustering=["build_id"],
             time_partitioning="submission_date",
@@ -95,28 +90,19 @@ class Monitoring:
         sql = template.render(**render_kwargs)
         return sql

-    def _get_data_type_sql(
-        self, submission_date: datetime, data_type: str, first_run: Optional[bool] = None
+    def _get_metrics_sql(
+        self, submission_date: datetime, first_run: Optional[bool] = None
     ) -> str:
         """Return SQL for data_type ETL."""
         probes = self.config.probes
-        probes = [probe for probe in probes if probe.metric.type == data_type]

         if len(probes) == 0:
             # There are no probes for this data source + data type combo
             logger.warning(
-                f"No probes for data type {data_type} configured for {self.slug}.",
+                f"No metrics configured for {self.slug}.",
                 extra={"experiment": self.slug},
             )

-        # todo:
-        # xaxis metadata to be used to decide whether the entire table is replaced
-        # Or just a partition.
-        #
-        # Note: there is a subtle design here in which date partitions are replaced
-        # if the data is for a build over build analysis but the entire table is
-        # replaced if it's a submission date analysis.
-
         # group probes that are part of the same dataset
         # necessary for creating the SQL template
         metrics_per_dataset = {}
@@ -124,13 +110,14 @@ class Monitoring:
             if probe.metric.data_source.name not in metrics_per_dataset:
                 metrics_per_dataset[probe.metric.data_source.name] = [probe.metric]
             else:
-                metrics_per_dataset[probe.metric.data_source.name].append(probe.metric)
+                if probe.metric not in metrics_per_dataset[probe.metric.data_source.name]:
+                    metrics_per_dataset[probe.metric.data_source.name].append(probe.metric)

         # check if this is the first time the queries are executed
         # the queries are referencing the destination table if build_id is used for the time frame
         if first_run is None:
             destination_table = (
-                f"{self.project}.{self.dataset}_derived.{self.normalized_slug}_{data_type}"
+                f"{self.project}.{self.dataset}_derived.{self.normalized_slug}"
             )
             first_run = True
         try:
@@ -147,30 +134,15 @@ class Monitoring:
             "dataset": self.dataset,
             "first_run": first_run,
             "dimensions": self.config.dimensions,
-            # "user_count_threshold": USERS_PER_BUILD_THRESHOLDS[
-            #     self.config.project.population.channel
-            # ],
             "metrics_per_dataset": metrics_per_dataset,
             "slug": self.slug,
             "normalized_slug": self.normalized_slug,
         }

-        sql_filename = QUERY_FILENAME.format(data_type)
+        sql_filename = QUERY_FILENAME
         sql = self._render_sql(sql_filename, render_kwargs)
         return sql

-    def _get_view_sql(self) -> str:
-        """Return the SQL to create a BigQuery view."""
-        render_kwargs = {
-            "gcp_project": self.project,
-            "dataset": self.dataset,
-            "config": self.config.project,
-            "normalized_slug": self.normalized_slug,
-            "dimensions": self.config.dimensions,
-        }
-        sql = self._render_sql(VIEW_FILENAME, render_kwargs)
-        return sql
-
     def _get_statistics_sql(self, submission_date) -> str:
         """Return the SQL to run the statistics."""
         render_kwargs = {
@@ -179,7 +151,8 @@ class Monitoring:
             "config": self.config.project,
             "normalized_slug": self.normalized_slug,
             "dimensions": self.config.dimensions,
-            "probes": self.config.probes,
+            "summaries": self.config.probes,
+            "submission_date": submission_date,
         }
         sql = self._render_sql(STATISTICS_FILENAME, render_kwargs)
         return sql
@@ -237,11 +210,18 @@ class Monitoring:
         """Validate ETL and configs of opmon project."""
         self._check_runnable()

-        for data_type in DATA_TYPES:
-            data_type_sql = self._get_data_type_sql(
-                submission_date=self.config.project.start_date,  # type: ignore
-                data_type=data_type,
-                first_run=True,
-            )
-            dry_run_query(data_type_sql)
-            print(data_type_sql)
+        metrics_sql = self._get_metrics_sql(
+            submission_date=self.config.project.start_date,  # type: ignore
+            first_run=True,
+        )
+        dry_run_query(metrics_sql)
+        # print(data_type_sql)
+
+        statistics_sql = self._get_statistics_sql(
+            submission_date=self.config.project.start_date,  # type: ignore
+        )
+        # print(statistics_sql)
+        dry_run_query(statistics_sql)
+
+        # todo: validate alerts
+        # todo: update alerts view/query
@@ -1,9 +1,12 @@
 import re
-from abc import ABC, abstractmethod
+from abc import ABC
 from typing import Any, Dict, List, Optional

 import attr

+from opmon import Probe
+from opmon.errors import StatisticNotImplementedForTypeException
+

 @attr.s(auto_attribs=True)
 class StatisticComputation:
@@ -31,9 +34,25 @@ class Statistic(ABC):
         name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", cls.__name__)
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()

-    @abstractmethod
-    def computation(self, value: str = "values") -> List[StatisticComputation]:
-        return NotImplemented
+    def computation(self, metric: Probe) -> List[StatisticComputation]:
+        if metric.type == "scalar":
+            return self._scalar_computation(metric)
+        elif metric.type == "histogram":
+            return self._histogram_computation(metric)
+        else:
+            raise StatisticNotImplementedForTypeException(
+                f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
+            )
+
+    def _scalar_computation(self, metric: Probe) -> List[StatisticComputation]:
+        raise StatisticNotImplementedForTypeException(
+            f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
+        )
+
+    def _histogram_computation(self, metric: Probe) -> List[StatisticComputation]:
+        raise StatisticNotImplementedForTypeException(
+            f"Statistic {self.name()} not implemented for type {metric.type} ({metric.name})"
+        )

     @classmethod
     def from_dict(cls, config_dict: Dict[str, Any]):
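A sketch of how a custom statistic would plug into the dispatch above, using stand-in classes rather than the real opmon module (the Max statistic is a hypothetical example): subclasses override only the _scalar_computation / _histogram_computation variants they support, and computation() raises for unsupported metric types.

    from collections import namedtuple

    Metric = namedtuple("Metric", ["name", "type"])


    class StatisticNotImplementedForTypeException(Exception):
        pass


    class Statistic:
        def computation(self, metric):
            # dispatch on the metric type, mirroring the diff above
            if metric.type == "scalar":
                return self._scalar_computation(metric)
            elif metric.type == "histogram":
                return self._histogram_computation(metric)
            raise StatisticNotImplementedForTypeException(metric.name)

        def _scalar_computation(self, metric):
            raise StatisticNotImplementedForTypeException(metric.name)

        def _histogram_computation(self, metric):
            raise StatisticNotImplementedForTypeException(metric.name)


    class Max(Statistic):
        # hypothetical custom statistic that only supports scalar metrics
        def _scalar_computation(self, metric):
            return [f"MAX({metric.name})"]


    print(Max().computation(Metric("gc_ms", "scalar")))  # ['MAX(gc_ms)']
    # Max().computation(Metric("gc_ms", "histogram")) would raise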
@@ -42,30 +61,30 @@ class Statistic(ABC):


 class Count(Statistic):
-    def computation(self, value: str = "values"):
+    def _scalar_computation(self, metric: Probe):
         return [
             StatisticComputation(
-                point=f"COUNT({value})",
+                point=f"COUNT({metric.name})",
                 name=self.name(),
             )
         ]


 class Sum(Statistic):
-    def computation(self, value: str = "values"):
+    def _scalar_computation(self, metric: Probe):
         return [
             StatisticComputation(
-                point=f"SUM({value})",
+                point=f"SUM({metric.name})",
                 name=self.name(),
             )
         ]


 class Mean(Statistic):
-    def computation(self, value: str = "values"):
+    def _scalar_computation(self, metric: Probe):
         return [
             StatisticComputation(
-                point=f"AVG({value})",
+                point=f"AVG({metric.name})",
                 name=self.name(),
             )
         ]
@@ -75,11 +94,14 @@ class Quantile(Statistic):
     number_of_quantiles: int = 100
     quantile: int = 50

-    def computation(self, value: str = "values"):
+    def _scalar_computation(self, metric: Probe):
         return [
             StatisticComputation(
                 point=f"""
-                    APPROX_QUANTILES({value}, {self.number_of_quantiles})[OFFSET({self.quantile})]
+                    APPROX_QUANTILES(
+                        {metric.name},
+                        {self.number_of_quantiles}
+                    )[OFFSET({self.quantile})]
                 """,
                 name=self.name(),
             )
@@ -90,30 +112,252 @@ class Quantile(Statistic):
 class Percentile(Statistic):
     percentiles: List[int] = [50, 90, 99]

-    def computation(self, value: str = "values"):
+    def _scalar_computation(self, metric: Probe):
         return [
             StatisticComputation(
                 point=f"""
 `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
 {percentile},
-STRUCT(
-{value}
+STRUCT<
+bucket_count INT64,
+sum INT64,
+histogram_type INT64,
+`range` ARRAY<INT64>,
+VALUES
+ARRAY<STRUCT<key FLOAT64, value FLOAT64>
+>>(1,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0),
+1,
+[
+0,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0
+) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0)
+],
+[
+STRUCT<key FLOAT64, value FLOAT64>(
+COALESCE(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+),
+0) + 0.0001
+) AS FLOAT64
+), 0.0
+), 1
+)
+]
 )
 ).percentile
 """,
                 lower=f"""
 `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
 {percentile},
-STRUCT(
-{value}
+STRUCT<
+bucket_count INT64,
+sum INT64,
+histogram_type INT64,
+`range` ARRAY<INT64>,
+VALUES
+ARRAY<STRUCT<key FLOAT64, value FLOAT64>
+>>(1,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0),
+1,
+[
+0,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0
+) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0)
+],
+[
+STRUCT<key FLOAT64, value FLOAT64>(
+COALESCE(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+),
+0) + 0.0001
+) AS FLOAT64
+), 0.0
+), 1
+)
+]
 )
 ).low
 """,
                 upper=f"""
 `moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
 {percentile},
-STRUCT(
-{value}
+STRUCT<
+bucket_count INT64,
+sum INT64,
+histogram_type INT64,
+`range` ARRAY<INT64>,
+VALUES
+ARRAY<STRUCT<key FLOAT64, value FLOAT64>
+>>(1,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0),
+1,
+[
+0,
+COALESCE(
+SAFE_CAST(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+), 0
+) + 0.0001
+)
+AS FLOAT64)
+AS INT64),
+0)
+],
+[
+STRUCT<key FLOAT64, value FLOAT64>(
+COALESCE(
+SAFE_CAST(
+FORMAT(
+"%.*f",
+2,
+COALESCE(
+mozfun.glam.histogram_bucket_from_value(
+{metric.name}_buckets,
+SAFE_CAST({metric.name} AS FLOAT64)
+),
+0) + 0.0001
+) AS FLOAT64
+), 0.0
+), 1
+)
+]
+)
+).high
+""",
+                name=self.name(),
+                parameter=str(percentile),
+            )
+            for percentile in self.percentiles
+        ]
+
+    def _histogram_computation(self, metric: Probe) -> List[StatisticComputation]:
+        return [
+            StatisticComputation(
+                point=f"""
+`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
+{percentile},
+STRUCT(
+histogram_normalized_sum(
+mozfun.hist.merge(
+ARRAY_AGG({metric.name} IGNORE NULLS)
+).values, 1.0
+)
+)
+).percentile
+""",
+                lower=f"""
+`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
+{percentile},
+STRUCT(
+histogram_normalized_sum(
+mozfun.hist.merge(
+ARRAY_AGG({metric.name} IGNORE NULLS)
+).values, 1.0
+)
+)
+).low
+""",
+                upper=f"""
+`moz-fx-data-shared-prod`.udf_js.jackknife_percentile_ci(
+{percentile},
+STRUCT(
+histogram_normalized_sum(
+mozfun.hist.merge(
+ARRAY_AGG({metric.name} IGNORE NULLS)
+).values, 1.0
+)
 )
 ).high
 """,
@@ -1,229 +0,0 @@
-{{ header }}
-
-{% include 'population.sql' %},
-
--- for each data source that is used
--- select the metric values
-{% for data_source, metrics in metrics_per_dataset.items() -%}
-merged_metrics_{{ data_source }} AS (
-SELECT
-DATE({{ metrics[0].data_source.submission_date_column }}) AS submission_date,
-{{ config.population.data_source.client_id_column }} AS client_id,
-p.population_build_id AS build_id,
-ARRAY<
-STRUCT<
-metric STRING,
-histograms ARRAY<
-STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-values ARRAY<STRUCT<key INT64, value INT64>>>
->>
->[
-{% for metric in metrics %}
-(
-"{{ metric.name }}",
-{{ metric.select_expression }}
-)
-{{ "," if not loop.last else "" }}
-{% endfor %}
-] AS metrics,
-FROM
-{{ metrics[0].data_source.from_expression }}
-RIGHT JOIN
-(
-SELECT
-client_id AS population_client_id,
-submission_date AS population_submission_date,
-build_id AS population_build_id
-FROM
-population
-) AS p
-ON
-{{ metrics[0].data_source.submission_date_column }} = p.population_submission_date AND
-{{ config.population.data_source.client_id_column }} = p.population_client_id
-WHERE
-{% if config.xaxis.value == "submission_date" %}
-DATE({{ metrics[0].data_source.submission_date_column }}) = DATE('{{ submission_date }}')
-{% else %}
--- when aggregating by build_id, only use the most recent 14 days of data
-DATE({{ metrics[0].data_source.submission_date_column }}) BETWEEN DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY) AND DATE('{{ submission_date }}')
-{% endif %}
-GROUP BY
-submission_date,
-client_id,
-build_id
-),
-{% endfor %}
-
--- combine the metrics from all the data sources
-joined_histograms AS (
-SELECT
-population.submission_date AS submission_date,
-population.client_id AS client_id,
-population.build_id,
-{% for dimension in dimensions %}
-population.{{ dimension.name }} AS {{ dimension.name }},
-{% endfor %}
-population.branch AS branch,
-{% if metrics_per_dataset != {} %}
-ARRAY_CONCAT(
-{% for data_source, metrics in metrics_per_dataset.items() %}
-merged_metrics_{{ data_source }}.metrics
-{% endfor %}
-) AS metrics
-{% else %}
-[] AS metrics,
-{% endif %}
-FROM population
-{% for data_source, metrics in metrics_per_dataset.items() %}
-LEFT JOIN merged_metrics_{{ data_source }}
-USING(submission_date, client_id)
-{% endfor %}
-),
-
--- merge histograms if client has multiple
-merged_histograms AS (
-SELECT
-submission_date,
-client_id,
-build_id,
-branch,
-{% for dimension in dimensions %}
-{{ dimension.name }},
-{% endfor %}
-{% if metrics_per_dataset != {} %}
-ARRAY_AGG(
-STRUCT<
-name STRING,
-histogram STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-values ARRAY<STRUCT<key INT64, value INT64>>
->
-> (
-metric,
-CASE
-WHEN
-histograms IS NULL
-THEN
-NULL
-ELSE
-mozfun.hist.merge(histograms)
-END
-)
-) AS metrics
-{% else %}
-[] AS metrics
-{% endif %}
-FROM
-joined_histograms
-CROSS JOIN
-UNNEST(metrics)
-{% if not config.population.monitor_entire_population %}
-WHERE branch IN (
--- If branches are not defined, assume it's a rollout
--- and fall back to branches labeled as enabled/disabled
-{% if config.population.branches|length > 0 -%}
-{% for branch in config.population.branches -%}
-"{{ branch }}"
-{{ "," if not loop.last else "" }}
-{% endfor -%}
-{% else -%}
-"enabled", "disabled"
-{% endif -%}
-)
-{% endif %}
-GROUP BY
-submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions %}
-{{ dimension.name }},
-{% endfor %}
-branch
-),
-
--- Cast histograms to have string keys so we can use the histogram normalization function
-normalized_histograms AS (
-SELECT
-submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-{% if metrics_per_dataset != {} %}
-name AS metric,
-{% else %}
-NULL AS metric,
-{% endif %}
-{% if metrics_per_dataset != {} %}
-STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-VALUES
-ARRAY<STRUCT<key STRING, value INT64>>
->(histogram.bucket_count,
-histogram.sum,
-histogram.histogram_type,
-histogram.range,
-ARRAY(SELECT AS STRUCT CAST(keyval.key AS STRING), keyval.value FROM UNNEST(histogram.values) keyval)
-) AS value
-{% else %}
-NULL AS value
-{% endif %}
-FROM merged_histograms
-CROSS JOIN UNNEST(metrics)
-)
-
-{% if first_run or config.xaxis.value == "submission_date" -%}
-SELECT
-submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-metric AS probe,
-value
-FROM
-normalized_histograms
-{% else -%}
-SELECT
-DATE('{{ submission_date }}') AS submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-metric AS probe,
-value
-FROM normalized_histograms _current
-WHERE
-PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) >= DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
-UNION ALL
-SELECT
-DATE('{{ submission_date }}') AS submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-metric AS probe,
-value
-FROM normalized_histograms _prev
-WHERE
-PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) < DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
-AND submission_date = DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 1 DAY)
-{% endif -%}
@@ -5,25 +5,14 @@
 -- for each data source that is used
 -- select the metric values
 {% for data_source, metrics in metrics_per_dataset.items() -%}
-merged_scalars_{{ data_source }} AS (
+merged_metrics_{{ data_source }} AS (
 SELECT
 DATE({{ metrics[0].data_source.submission_date_column }}) AS submission_date,
 {{ config.population.data_source.client_id_column }} AS client_id,
 p.population_build_id AS build_id,
-ARRAY<
-STRUCT<
-name STRING,
-value FLOAT64
->
->[
-{% for metric in metrics -%}
-(
-"{{ metric.name }}",
-CAST({{ metric.select_expression }} AS FLOAT64)
-)
-{{ "," if not loop.last else "" }}
-{% endfor -%}
-] AS metrics,
+{% for metric in metrics -%}
+{{ metric.select_expression }} AS {{ metric.name }},
+{% endfor -%}
 FROM
 {{ metrics[0].data_source.from_expression }}
 RIGHT JOIN
@@ -53,7 +42,7 @@ merged_scalars_{{ data_source }} AS (
 {% endfor %}

 -- combine the metrics from all the data sources
-joined_scalars AS (
+joined_metrics AS (
 SELECT
 population.submission_date AS submission_date,
 population.client_id AS client_id,
@@ -62,25 +51,23 @@ joined_scalars AS (
 population.{{ dimension.name }} AS {{ dimension.name }},
 {% endfor %}
 population.branch AS branch,
-ARRAY_CONCAT(
-{% for data_source, metrics in metrics_per_dataset.items() -%}
-COALESCE(merged_scalars_{{ data_source }}.metrics, [])
-{{ "," if not loop.last else "" }}
+{% for data_source, metrics in metrics_per_dataset.items() -%}
+{% for metric in metrics -%}
+{{ metric.name }},
+{% endfor -%}
 {% endfor -%}
-) AS metrics
 FROM population
 {% for data_source, metrics in metrics_per_dataset.items() -%}
-LEFT JOIN merged_scalars_{{ data_source }}
+LEFT JOIN merged_metrics_{{ data_source }}
 USING(submission_date, client_id, build_id)
 {% endfor %}
 ),

--- unnest the combined metrics so we get
--- the metric values for each client for each date
-flattened_scalars AS (
-SELECT * EXCEPT(metrics)
-FROM joined_scalars
-CROSS JOIN UNNEST(metrics)
+-- normalize histograms and apply filters
+normalized_metrics AS (
+SELECT
+*
+FROM joined_metrics
 {% if not config.population.monitor_entire_population %}
 WHERE branch IN (
 -- If branches are not defined, assume it's a rollout
@@ -98,44 +85,20 @@ flattened_scalars AS (
 )
 {% if first_run or config.xaxis.value == "submission_date" -%}
 SELECT
-submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-name,
-value
+*
 FROM
-flattened_scalars
+normalized_metrics
 {% else -%}
 -- if data is aggregated by build ID, then aggregate data with previous runs
 SELECT
-DATE('{{ submission_date }}') AS submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-name,
-value
-FROM flattened_scalars _current
+*
+FROM normalized_metrics _current
 WHERE
 PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) >= DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
 UNION ALL
 SELECT
-DATE('{{ submission_date }}') AS submission_date,
-client_id,
-build_id,
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor %}
-branch,
-name,
-value
-FROM flattened_scalars _prev
+SELECT * REPLACE (DATE('{{ submission_date }}') AS submission_date)
+FROM normalized_metrics _prev
 WHERE
 PARSE_DATE('%Y%m%d', CAST(build_id AS STRING)) < DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 14 DAY)
 AND submission_date = DATE_SUB(DATE('{{ submission_date }}'), INTERVAL 1 DAY)
@@ -1,167 +0,0 @@
-{{ header }}
-
-CREATE OR REPLACE VIEW
-`{{ gcp_project }}.{{ dataset }}.{{ normalized_slug }}`
-AS
--- Prepare scalar values
-
-WITH filtered_scalars AS (
-SELECT *
-FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}_scalar`
-WHERE {% include 'where_clause.sql' -%}
-),
-
-log_min_max AS (
-SELECT
-name,
-LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) log_min,
-LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) log_max
-FROM
-filtered_scalars
-GROUP BY name),
-
-buckets_by_metric AS (
-SELECT
-name,
-ARRAY(SELECT FORMAT("%.*f", 2, bucket) FROM UNNEST(
-mozfun.glam.histogram_generate_scalar_buckets(log_min, log_max, 100)
-) AS bucket ORDER BY bucket) AS buckets
-FROM log_min_max
-),
-
-aggregated_scalars AS (
-SELECT
-client_id,
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else %}
-build_id,
-{% endif %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-name,
-value
-FROM
-filtered_scalars
-),
-
--- Prepare histogram values
-filtered_histograms AS (
-SELECT *
-FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}_histogram`
-WHERE {% include 'where_clause.sql' -%}
-),
-
-normalized_histograms AS (
-SELECT
-client_id,
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else -%}
-build_id,
-{% endif -%}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-probe,
-{% if metrics_per_dataset != {} %}
-STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-VALUES
-ARRAY<STRUCT<key STRING, value FLOAT64>>
->(
-ANY_VALUE(value.bucket_count),
-ANY_VALUE(value.sum),
-ANY_VALUE(value.histogram_type),
-ANY_VALUE(value.range),
-mozfun.glam.histogram_normalized_sum(
-mozfun.hist.merge(ARRAY_AGG(value IGNORE NULLS)).values,
-1.0
-)
-) AS value
-{% else %}
-NULL AS value
-{% endif %}
-FROM filtered_histograms
-GROUP BY
-client_id,
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else -%}
-build_id,
-{% endif %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-probe)
-
--- Cast histograms to have FLOAT64 keys
--- so we can use the histogram jackknife percentile function.
-SELECT
-client_id,
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else -%}
-build_id,
-{% endif %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-probe AS probe,
-{% if metrics_per_dataset != {} %}
-STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-VALUES
-ARRAY<STRUCT<key FLOAT64, value FLOAT64>
->>(value.bucket_count,
-value.sum,
-value.histogram_type,
-value.range,
-ARRAY(SELECT AS STRUCT CAST(keyval.key AS FLOAT64), keyval.value FROM UNNEST(value.values) keyval)
-) AS value
-{% else %}
-NULL AS value
-{% endif %}
-FROM normalized_histograms
-UNION ALL
-SELECT
-client_id,
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else %}
-build_id,
-{% endif %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-branch,
-name AS probe,
-STRUCT<
-bucket_count INT64,
-sum INT64,
-histogram_type INT64,
-`range` ARRAY<INT64>,
-VALUES
-ARRAY<STRUCT<key FLOAT64, value FLOAT64>
->>(1,
-COALESCE(SAFE_CAST(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64) AS INT64), 0),
-1,
-[0, COALESCE(SAFE_CAST(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64) AS INT64), 0)],
-[STRUCT<key FLOAT64, value FLOAT64>(
-COALESCE(SAFE_CAST(FORMAT("%.*f", 2, COALESCE(mozfun.glam.histogram_bucket_from_value(buckets, SAFE_CAST(value AS FLOAT64)), 0) + 0.0001) AS FLOAT64), 0.0), 1
-)]
-) AS value
-FROM
-aggregated_scalars
-LEFT JOIN buckets_by_metric USING(name)
@@ -0,0 +1,40 @@
+CREATE TEMPORARY FUNCTION histogram_normalized_sum(
+arrs ARRAY<STRUCT<key INT64, value INT64>>,
+weight FLOAT64
+)
+RETURNS ARRAY<STRUCT<key INT64, value FLOAT64>> AS (
+-- Input: one histogram for a single client.
+-- Returns the normalized sum of the input maps.
+-- It returns the total_count[k] / SUM(total_count)
+-- for each key k.
+(
+WITH total_counts AS (
+SELECT
+sum(a.value) AS total_count
+FROM
+UNNEST(arrs) AS a
+),
+summed_counts AS (
+SELECT
+a.key AS k,
+SUM(a.value) AS v
+FROM
+UNNEST(arrs) AS a
+GROUP BY
+a.key
+)
+SELECT
+ARRAY_AGG(
+STRUCT<key INT64, value FLOAT64>(
+k,
+COALESCE(SAFE_DIVIDE(1.0 * v, total_count), 0) * weight
+)
+ORDER BY
+SAFE_CAST(k AS INT64)
+)
+FROM
+summed_counts
+CROSS JOIN
+total_counts
+)
+);
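For reference, a small Python sketch of what the histogram_normalized_sum UDF added above computes (illustrative only, not part of the commit): per-key counts are summed across the input array, divided by the total count, and scaled by the weight.

    from collections import defaultdict


    def histogram_normalized_sum(arrs, weight=1.0):
        # arrs: list of (key, value) pairs from one client's histogram
        totals = defaultdict(int)
        for key, value in arrs:
            totals[key] += value
        total_count = sum(totals.values())
        return [
            (key, (value / total_count if total_count else 0.0) * weight)
            for key, value in sorted(totals.items())
        ]


    print(histogram_normalized_sum([(0, 2), (1, 6), (2, 2)]))
    # [(0, 0.2), (1, 0.6), (2, 0.2)]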
@@ -1,4 +1,39 @@
-WITH merged AS (
+{{ header }}
+
+{% include 'normalized_sum_udf.sql' %}
+
+WITH filtered_metrics AS (
+SELECT *
+FROM `{{ gcp_project }}.{{ dataset }}_derived.{{ normalized_slug }}`
+WHERE {% include 'where_clause.sql' -%}
+),
+
+-- bucket metrics that use percentile
+buckets_by_metric AS (
+SELECT
+[] AS dummy,
+{% set seen_metrics = [] %}
+{% for summary in summaries %}
+{% if summary.statistic.name == "percentile" %}
+{% if summary.metric.type == "scalar" -%}
+{% if summary.metric.name not in seen_metrics %}
+{% if seen_metrics.append(summary.metric.name) %} {% endif %}
+ARRAY(SELECT FORMAT("%.*f", 2, bucket) FROM UNNEST(
+mozfun.glam.histogram_generate_scalar_buckets(
+LOG(IF(MIN(value) <= 0, 1, MIN({{ summary.metric.name }})), 2),
+LOG(IF(MAX(value) <= 0, 1, MAX({{ summary.metric.name }})), 2),
+100
+)
+) AS bucket ORDER BY bucket) AS {{ summary.metric.name }}_buckets,
+{% endif %}
+{% endif %}
+{% endif %}
+{% endfor %}
+FROM filtered_metrics
+),
+
+
+stats AS (
 SELECT
 {% if config.xaxis.value == "submission_date" -%}
 submission_date,
@@ -9,24 +44,37 @@ WITH merged AS (
 {{ dimension.name }},
 {% endfor -%}
 branch,
-probe AS metric,
-mozfun.hist.merge(ARRAY_AGG(value IGNORE NULLS)).values AS values
+ARRAY<STRUCT<
+metric STRING,
+statistic STRING,
+point FLOAT64,
+lower FLOAT64,
+upper FLOAT64,
+parameter STRING
+>>[
+{% for summary in summaries %}
+STRUCT(
+'{{ summary.metric.name }}' AS metric,
+'{{ summary.statistic.name }}' AS statistic,
+{{ summary.statistic.point }} AS point
+{% if summary.statistic.lower -%}
+,{{ summary.statistic.lower }} AS lower
+{% endif -%}
+{% if summary.statistic.upper -%}
+,{{ summary.statistic.upper }} AS upper
+{% endif -%}
+{% if summary.statistic.parameter -%}
+,'{{ summary.statistic.parameter }}' AS parameter
+{% endif -%}
+)
+{{ "," if not loop.last else "" }}
+{% endfor %}
+] AS statistics
 FROM
 `{{ gcp_project }}.{{ dataset }}.{{ normalized_slug }}`
+CROSS JOIN buckets_by_metric
 WHERE submission_date = DATE("{{ submission_date }}")
 GROUP BY
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-build_id,
-{% endif %}
-branch,
-metric
-), stats AS (
-SELECT
 {% if config.xaxis.value == "submission_date" -%}
 submission_date,
 {% else %}
@@ -35,43 +83,9 @@ WITH merged AS (
 {% for dimension in dimensions -%}
 {{ dimension.name }},
 {% endfor -%}
-branch,
-metric,
-CASE value
-{% for probe in probes %}
-WHEN probe = "{{ probe.metric.name }}"
-THEN ARRAY<STRUCT<>>[(
-{% for stat in probe.statistics %}
-{{ stat.name }} AS statistic,
-{{ stat.point }} AS point,
-{% if stat.lower -%}
-stat.lower AS lower,
-{% endif -%}
-{% if stat.upper -%}
-stat.upper AS upper,
-{% endif -%}
-{% if stat.parameter -%}
-stat.parameter AS parameter,
-{% endif -%}
-{% enfor %}
-)]
-{% endfor %}
-ELSE NULL
-END AS values
-FROM
-merged
-GROUP BY
-{% if config.xaxis.value == "submission_date" -%}
-submission_date,
-{% else %}
-{% for dimension in dimensions -%}
-{{ dimension.name }},
-{% endfor -%}
-build_id,
-{% endif %}
-branch,
-metric
+branch
 )

 SELECT
 {% if config.xaxis.value == "submission_date" -%}
 submission_date,
@@ -82,10 +96,10 @@ SELECT
 {{ dimension.name }},
 {% endfor -%}
 branch,
-metric,
+statistic.metric AS metric,
 statistic.name AS statistic,
 statistic.point AS point,
 statistic.lower AS lower,
 statistic.upper AS upper,
 statistic.parameter AS parameter
-FROM stats, UNNEST(values) as statistic
+FROM stats, UNNEST(statistics) as statistic
|
|
Загрузка…
Ссылка в новой задаче