Metric hub integration - rewrite SSL ratios to use metrics (#3698)

* Add metrics.data_source()

* Rewrite SSL ratios to use metrics

* Fix docs formatting
This commit is contained in:
Anna Scholtz 2023-04-04 15:41:44 -07:00 коммит произвёл GitHub
Родитель e857d3e7e5
Коммит 48d8c7603d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 123 добавлений и 89 удалений

Просмотреть файл

@ -21,6 +21,7 @@ from ..routine.parse_routine import (
read_routine_dir,
)
from ..schema import SCHEMA_FILE, Schema
from ..util.common import render
from ..view import View
VIEW_FILE = "view.sql"
@ -275,7 +276,7 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
for path in Path(sql_dir).rglob("*.sql"):
# apply substitutions
if path.is_file():
sql = path.read_text()
sql = render(path.name, template_folder=path.parent, format=False)
for ref in replace_references:
sql = re.sub(ref[0], ref[1], sql)

Просмотреть файл

@ -1,4 +1,5 @@
"""Generic utility functions."""
import glob
import logging
import os
import random
@ -27,6 +28,11 @@ REV_WORD_BOUND_PAT = re.compile(
)
SQL_DIR = "sql/"
FILE_PATH = Path(os.path.dirname(__file__))
TEST_PROJECT = "bigquery-etl-integration-test"
SKIP_RENDER = {
# contains a literal `{%s}`, which Jinja parses as a statement tag and rejects with an unknown tag exception
"sql/mozfun/hist/string_to_json/udf.sql",
}
def snake_case(line: str) -> str:
@ -61,13 +67,33 @@ def render(
**kwargs,
) -> str:
"""Render a given template query using Jinja."""
file_loader = FileSystemLoader(f"{template_folder}")
env = Environment(loader=file_loader)
main_sql = env.get_template(sql_filename)
if "metrics" not in kwargs:
rendered = main_sql.render(**kwargs, metrics=MetricHub())
path = Path(template_folder) / sql_filename
skip = SKIP_RENDER
if TEST_PROJECT in str(path):
# check if staged file needs to be skipped
skip.update(
[
p
for f in [Path(s) for s in skip]
for p in glob.glob(
f"sql/{TEST_PROJECT}/{f.parent.parent.name}*/{f.parent.name}/{f.name}",
recursive=True,
)
]
)
if any(s in str(path) for s in skip):
rendered = path.read_text()
else:
rendered = main_sql.render(**kwargs)
file_loader = FileSystemLoader(f"{template_folder}")
env = Environment(loader=file_loader)
main_sql = env.get_template(sql_filename)
if "metrics" not in kwargs:
rendered = main_sql.render(**kwargs, metrics=MetricHub())
else:
rendered = main_sql.render(**kwargs)
if format:
rendered = reformat(rendered)
return rendered

Просмотреть файл

@ -42,43 +42,42 @@
- Queries, views and UDFs can reference metrics and data sources that have been defined in [metric-hub](https://mozilla.github.io/metric-hub/)
- To reference metrics use `{{ metrics.calculate() }}`:
```sql
SELECT
*
FROM
{{ metrics.calculate(
metrics=['days_of_use', 'active_hours'],
platform='firefox_desktop',
group_by={'sample_id': 'sample_id', 'channel': 'application.channel'},
where='submission_date = "2023-01-01"'
) }}
-- this translates to
SELECT
*
FROM
(
WITH clients_daily AS (
```sql
SELECT
client_id AS client_id,
submission_date AS submission_date,
COALESCE(SUM(active_hours_sum), 0) AS active_hours,
COUNT(submission_date) AS days_of_use,
*
FROM
mozdata.telemetry.clients_daily
GROUP BY
client_id,
submission_date
)
SELECT
clients_daily.client_id,
clients_daily.submission_date,
active_hours,
days_of_use,
FROM
clients_daily
)
```
{{ metrics.calculate(
metrics=['days_of_use', 'active_hours'],
platform='firefox_desktop',
group_by={'sample_id': 'sample_id', 'channel': 'application.channel'},
where='submission_date = "2023-01-01"'
) }}
-- this translates to
SELECT
*
FROM
(
WITH clients_daily AS (
SELECT
client_id AS client_id,
submission_date AS submission_date,
COALESCE(SUM(active_hours_sum), 0) AS active_hours,
COUNT(submission_date) AS days_of_use,
FROM
mozdata.telemetry.clients_daily
GROUP BY
client_id,
submission_date
)
SELECT
clients_daily.client_id,
clients_daily.submission_date,
active_hours,
days_of_use,
FROM
clients_daily
)
```
- `metrics`: unique reference(s) to metric definitions; all [metric definitions](https://mozilla.github.io/metric-hub/metrics/firefox_desktop/) are aggregations (e.g. SUM, AVG, ...)
- `platform`: platform to compute metrics for (e.g. `firefox_desktop`, `firefox_ios`, `fenix`, ...)
- `group_by`: fields used in the GROUP BY statement; this is a dictionary where the key represents the alias and the value is the field path; by default, `GROUP BY` also includes the configured `client_id` and `submission_date` fields (see `group_by_client_id` and `group_by_submission_date`)
@ -86,27 +85,26 @@ FROM
- `group_by_client_id`: Whether the field configured as `client_id` (defined as part of the data source specification in metric-hub) should be part of the `GROUP BY`. `True` by default
- `group_by_submission_date`: Whether the field configured as `submission_date` (defined as part of the data source specification in metric-hub) should be part of the `GROUP BY`. `True` by default
- To reference data source definitions use `{{ metrics.data_source() }}`:
```sql
SELECT
*
FROM
{{ metrics.data_source(
data_source='main',
platform='firefox_desktop',
where='submission_date = "2023-01-01"'
) }}
-- this translates to
SELECT
*
FROM
(
SELECT * FROM `mozdata.telemetry.main`
WHERE submission_date = "2023-01-01"
)
```
- To render queries that use Jinja expressions or statements use `./bqetl query render path/to/query.py`
```sql
SELECT
*
FROM
{{ metrics.data_source(
data_source='main',
platform='firefox_desktop',
where='submission_date = "2023-01-01"'
) }}
-- this translates to
SELECT
*
FROM
(
SELECT *
FROM `mozdata.telemetry.main`
WHERE submission_date = "2023-01-01"
)
```
- To render queries that use Jinja expressions or statements use `./bqetl query render path/to/query.sql`
- The `generated-sql` branch has rendered queries/views/UDFs
- `./bqetl query run` does support running Jinja queries

Просмотреть файл

@ -1,25 +1,24 @@
SELECT
DATE(submission_timestamp) AS submission_date,
environment.system.os.name AS os,
metadata.geo.country,
SUM(
mozfun.map.get_key(mozfun.hist.extract(payload.histograms.http_pageload_is_ssl).values, 0)
) AS non_ssl_loads,
SUM(
mozfun.map.get_key(mozfun.hist.extract(payload.histograms.http_pageload_is_ssl).values, 1)
) AS ssl_loads,
-- ratio of pings that have the probe
COUNT(payload.histograms.http_pageload_is_ssl) / COUNT(*) AS reporting_ratio
FROM
telemetry.main
WHERE
sample_id = 42
AND normalized_channel = 'release'
AND environment.system.os.name IN ('Windows_NT', 'Darwin', 'Linux')
AND application.name = 'Firefox'
AND DATE(submission_timestamp) > DATE '2016-11-01'
AND (DATE(submission_timestamp) = @submission_date OR @submission_date IS NULL)
GROUP BY
submission_date,
os,
country
country,
non_ssl_loads_v1 AS non_ssl_loads,
ssl_loads_v1 AS ssl_loads,
http_pageload_is_ssl_ratio_v1 AS reporting_ratio
FROM
{{
metrics.calculate(
metrics=["non_ssl_loads_v1", "ssl_loads_v1", "http_pageload_is_ssl_ratio_v1"],
platform="firefox_desktop",
group_by={"os": "environment.system.os.name", "country": "metadata.geo.country"},
where="""
sample_id = 42
AND normalized_channel = 'release'
AND environment.system.os.name IN ('Windows_NT', 'Darwin', 'Linux')
AND application.name = 'Firefox'
AND DATE(submission_timestamp) > DATE '2016-11-01'
AND (DATE(submission_timestamp) = @submission_date OR @submission_date IS NULL)
""",
group_by_client_id=False
)
}}

Просмотреть файл

@ -1,4 +1,3 @@
{% raw %}
CREATE OR REPLACE FUNCTION hist.string_to_json(input STRING) AS (
CASE
WHEN STARTS_WITH(TRIM(input), '{')
@ -46,7 +45,6 @@ CREATE OR REPLACE FUNCTION hist.string_to_json(input STRING) AS (
END
);
{% endraw %}
-- Tests
WITH test_data AS (
SELECT

Просмотреть файл

@ -49,7 +49,7 @@ def generate(target_project, output_dir, use_cloud_function):
env = Environment(loader=FileSystemLoader(str(THIS_PATH / "templates")))
mobile_query_template = env.get_template("mobile_query.sql")
desktop_query_template = env.get_template("desktop_query.sql")
metadata_template = env.get_template("metadata.yaml")
metadata_template = "metadata.yaml"
view_template = env.get_template("view.sql")
focus_android_view_template = env.get_template("focus_android_view.sql")
mobile_view_template = env.get_template("mobile_view.sql")
@ -85,7 +85,7 @@ def generate(target_project, output_dir, use_cloud_function):
basename="metadata.yaml",
sql=render(
metadata_template,
template_folder="templates",
template_folder=THIS_PATH / "templates",
app_value=browser.value,
app_name=browser.name,
format=False,

Просмотреть файл

@ -24,6 +24,18 @@
"type": "RECORD",
"name": "system",
"mode": "NULLABLE"
},
{
"fields": [
{
"type": "STRING",
"name": "name",
"mode": "NULLABLE"
}
],
"type": "RECORD",
"name": "experiments",
"mode": "NULLABLE"
}
],
"type": "RECORD",