Metric hub integration - rewrite SSL ratios to use metrics (#3698)
* Add metrics.data_source()
* Rewrite SSL ratios to use metrics
* Fix docs formatting
This commit is contained in:
Родитель
e857d3e7e5
Коммит
48d8c7603d
|
@ -21,6 +21,7 @@ from ..routine.parse_routine import (
|
|||
read_routine_dir,
|
||||
)
|
||||
from ..schema import SCHEMA_FILE, Schema
|
||||
from ..util.common import render
|
||||
from ..view import View
|
||||
|
||||
VIEW_FILE = "view.sql"
|
||||
|
@ -275,7 +276,7 @@ def _update_references(artifact_files, project_id, dataset_suffix, sql_dir):
|
|||
for path in Path(sql_dir).rglob("*.sql"):
|
||||
# apply substitutions
|
||||
if path.is_file():
|
||||
sql = path.read_text()
|
||||
sql = render(path.name, template_folder=path.parent, format=False)
|
||||
|
||||
for ref in replace_references:
|
||||
sql = re.sub(ref[0], ref[1], sql)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
"""Generic utility functions."""
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
@ -27,6 +28,11 @@ REV_WORD_BOUND_PAT = re.compile(
|
|||
)
|
||||
SQL_DIR = "sql/"
|
||||
FILE_PATH = Path(os.path.dirname(__file__))
|
||||
TEST_PROJECT = "bigquery-etl-integration-test"
|
||||
SKIP_RENDER = {
|
||||
# uses {%s} which results in unknown tag exception
|
||||
"sql/mozfun/hist/string_to_json/udf.sql",
|
||||
}
|
||||
|
||||
|
||||
def snake_case(line: str) -> str:
|
||||
|
@ -61,13 +67,33 @@ def render(
|
|||
**kwargs,
|
||||
) -> str:
|
||||
"""Render a given template query using Jinja."""
|
||||
file_loader = FileSystemLoader(f"{template_folder}")
|
||||
env = Environment(loader=file_loader)
|
||||
main_sql = env.get_template(sql_filename)
|
||||
if "metrics" not in kwargs:
|
||||
rendered = main_sql.render(**kwargs, metrics=MetricHub())
|
||||
path = Path(template_folder) / sql_filename
|
||||
skip = SKIP_RENDER
|
||||
|
||||
if TEST_PROJECT in str(path):
|
||||
# check if staged file needs to be skipped
|
||||
skip.update(
|
||||
[
|
||||
p
|
||||
for f in [Path(s) for s in skip]
|
||||
for p in glob.glob(
|
||||
f"sql/{TEST_PROJECT}/{f.parent.parent.name}*/{f.parent.name}/{f.name}",
|
||||
recursive=True,
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
if any(s in str(path) for s in skip):
|
||||
rendered = path.read_text()
|
||||
else:
|
||||
rendered = main_sql.render(**kwargs)
|
||||
file_loader = FileSystemLoader(f"{template_folder}")
|
||||
env = Environment(loader=file_loader)
|
||||
main_sql = env.get_template(sql_filename)
|
||||
if "metrics" not in kwargs:
|
||||
rendered = main_sql.render(**kwargs, metrics=MetricHub())
|
||||
else:
|
||||
rendered = main_sql.render(**kwargs)
|
||||
|
||||
if format:
|
||||
rendered = reformat(rendered)
|
||||
return rendered
|
||||
|
|
|
@ -42,43 +42,42 @@
|
|||
- Queries, views and UDFs can reference metrics and data sources that have been defined in [metric-hub](https://mozilla.github.io/metric-hub/)
|
||||
- To reference metrics use `{{ metrics.calculate() }}`:
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
{{ metrics.calculate(
|
||||
metrics=['days_of_use', 'active_hours'],
|
||||
platform='firefox_desktop',
|
||||
group_by={'sample_id': 'sample_id', 'channel': 'application.channel'},
|
||||
where='submission_date = "2023-01-01"'
|
||||
) }}
|
||||
|
||||
-- this translates to
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
(
|
||||
WITH clients_daily AS (
|
||||
```sql
|
||||
SELECT
|
||||
client_id AS client_id,
|
||||
submission_date AS submission_date,
|
||||
COALESCE(SUM(active_hours_sum), 0) AS active_hours,
|
||||
COUNT(submission_date) AS days_of_use,
|
||||
*
|
||||
FROM
|
||||
mozdata.telemetry.clients_daily
|
||||
GROUP BY
|
||||
client_id,
|
||||
submission_date
|
||||
)
|
||||
SELECT
|
||||
clients_daily.client_id,
|
||||
clients_daily.submission_date,
|
||||
active_hours,
|
||||
days_of_use,
|
||||
FROM
|
||||
clients_daily
|
||||
)
|
||||
```
|
||||
{{ metrics.calculate(
|
||||
metrics=['days_of_use', 'active_hours'],
|
||||
platform='firefox_desktop',
|
||||
group_by={'sample_id': 'sample_id', 'channel': 'application.channel'},
|
||||
where='submission_date = "2023-01-01"'
|
||||
) }}
|
||||
-- this translates to
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
(
|
||||
WITH clients_daily AS (
|
||||
SELECT
|
||||
client_id AS client_id,
|
||||
submission_date AS submission_date,
|
||||
COALESCE(SUM(active_hours_sum), 0) AS active_hours,
|
||||
COUNT(submission_date) AS days_of_use,
|
||||
FROM
|
||||
mozdata.telemetry.clients_daily
|
||||
GROUP BY
|
||||
client_id,
|
||||
submission_date
|
||||
)
|
||||
SELECT
|
||||
clients_daily.client_id,
|
||||
clients_daily.submission_date,
|
||||
active_hours,
|
||||
days_of_use,
|
||||
FROM
|
||||
clients_daily
|
||||
)
|
||||
```
|
||||
- `metrics`: unique reference(s) to metric definitions; all [metric definitions](https://mozilla.github.io/metric-hub/metrics/firefox_desktop/) are aggregations (e.g. SUM, AVG, ...)
|
||||
- `platform`: platform to compute metrics for (e.g. `firefox_desktop`, `firefox_ios`, `fenix`, ...)
|
||||
- `group_by`: fields used in the GROUP BY statement; this is a dictionary where the key represents the alias, the value is the field path; `GROUP BY` always includes the configured `client_id` and `submission_date` fields
|
||||
|
@ -86,27 +85,26 @@ FROM
|
|||
- `group_by_client_id`: Whether the field configured as `client_id` (defined as part of the data source specification in metric-hub) should be part of the `GROUP BY`. `True` by default
|
||||
- `group_by_submission_date`: Whether the field configured as `submission_date` (defined as part of the data source specification in metric-hub) should be part of the `GROUP BY`. `True` by default
|
||||
- To reference data source definitions use `{{ metrics.data_source() }}`:
|
||||
|
||||
```sql
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
{{ metrics.data_source(
|
||||
data_source='main',
|
||||
platform='firefox_desktop',
|
||||
where='submission_date = "2023-01-01"'
|
||||
) }}
|
||||
|
||||
-- this translates to
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
(
|
||||
SELECT * FROM `mozdata.telemetry.main`
|
||||
WHERE submission_date = "2023-01-01"
|
||||
)
|
||||
```
|
||||
- To render queries that use Jinja expressions or statements use `./bqetl query render path/to/query.py`
|
||||
```sql
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
{{ metrics.data_source(
|
||||
data_source='main',
|
||||
platform='firefox_desktop',
|
||||
where='submission_date = "2023-01-01"'
|
||||
) }}
|
||||
-- this translates to
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
(
|
||||
SELECT *
|
||||
FROM `mozdata.telemetry.main`
|
||||
WHERE submission_date = "2023-01-01"
|
||||
)
|
||||
```
|
||||
- To render queries that use Jinja expressions or statements use `./bqetl query render path/to/query.sql`
|
||||
- The `generated-sql` branch has rendered queries/views/UDFs
|
||||
- `./bqetl query run` supports running Jinja queries
|
||||
|
||||
|
|
|
@ -1,25 +1,24 @@
|
|||
SELECT
|
||||
DATE(submission_timestamp) AS submission_date,
|
||||
environment.system.os.name AS os,
|
||||
metadata.geo.country,
|
||||
SUM(
|
||||
mozfun.map.get_key(mozfun.hist.extract(payload.histograms.http_pageload_is_ssl).values, 0)
|
||||
) AS non_ssl_loads,
|
||||
SUM(
|
||||
mozfun.map.get_key(mozfun.hist.extract(payload.histograms.http_pageload_is_ssl).values, 1)
|
||||
) AS ssl_loads,
|
||||
-- ratio of pings that have the probe
|
||||
COUNT(payload.histograms.http_pageload_is_ssl) / COUNT(*) AS reporting_ratio
|
||||
FROM
|
||||
telemetry.main
|
||||
WHERE
|
||||
sample_id = 42
|
||||
AND normalized_channel = 'release'
|
||||
AND environment.system.os.name IN ('Windows_NT', 'Darwin', 'Linux')
|
||||
AND application.name = 'Firefox'
|
||||
AND DATE(submission_timestamp) > DATE '2016-11-01'
|
||||
AND (DATE(submission_timestamp) = @submission_date OR @submission_date IS NULL)
|
||||
GROUP BY
|
||||
submission_date,
|
||||
os,
|
||||
country
|
||||
country,
|
||||
non_ssl_loads_v1 AS non_ssl_loads,
|
||||
ssl_loads_v1 AS ssl_loads,
|
||||
http_pageload_is_ssl_ratio_v1 AS reporting_ratio
|
||||
FROM
|
||||
{{
|
||||
metrics.calculate(
|
||||
metrics=["non_ssl_loads_v1", "ssl_loads_v1", "http_pageload_is_ssl_ratio_v1"],
|
||||
platform="firefox_desktop",
|
||||
group_by={"os": "environment.system.os.name", "country": "metadata.geo.country"},
|
||||
where="""
|
||||
sample_id = 42
|
||||
AND normalized_channel = 'release'
|
||||
AND environment.system.os.name IN ('Windows_NT', 'Darwin', 'Linux')
|
||||
AND application.name = 'Firefox'
|
||||
AND DATE(submission_timestamp) > DATE '2016-11-01'
|
||||
AND (DATE(submission_timestamp) = @submission_date OR @submission_date IS NULL)
|
||||
""",
|
||||
group_by_client_id=False
|
||||
)
|
||||
}}
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
{% raw %}
|
||||
CREATE OR REPLACE FUNCTION hist.string_to_json(input STRING) AS (
|
||||
CASE
|
||||
WHEN STARTS_WITH(TRIM(input), '{')
|
||||
|
@ -46,7 +45,6 @@ CREATE OR REPLACE FUNCTION hist.string_to_json(input STRING) AS (
|
|||
END
|
||||
);
|
||||
|
||||
{% endraw %}
|
||||
-- Tests
|
||||
WITH test_data AS (
|
||||
SELECT
|
||||
|
|
|
@ -49,7 +49,7 @@ def generate(target_project, output_dir, use_cloud_function):
|
|||
env = Environment(loader=FileSystemLoader(str(THIS_PATH / "templates")))
|
||||
mobile_query_template = env.get_template("mobile_query.sql")
|
||||
desktop_query_template = env.get_template("desktop_query.sql")
|
||||
metadata_template = env.get_template("metadata.yaml")
|
||||
metadata_template = "metadata.yaml"
|
||||
view_template = env.get_template("view.sql")
|
||||
focus_android_view_template = env.get_template("focus_android_view.sql")
|
||||
mobile_view_template = env.get_template("mobile_view.sql")
|
||||
|
@ -85,7 +85,7 @@ def generate(target_project, output_dir, use_cloud_function):
|
|||
basename="metadata.yaml",
|
||||
sql=render(
|
||||
metadata_template,
|
||||
template_folder="templates",
|
||||
template_folder=THIS_PATH / "templates",
|
||||
app_value=browser.value,
|
||||
app_name=browser.name,
|
||||
format=False,
|
||||
|
|
|
@ -24,6 +24,18 @@
|
|||
"type": "RECORD",
|
||||
"name": "system",
|
||||
"mode": "NULLABLE"
|
||||
},
|
||||
{
|
||||
"fields": [
|
||||
{
|
||||
"type": "STRING",
|
||||
"name": "name",
|
||||
"mode": "NULLABLE"
|
||||
}
|
||||
],
|
||||
"type": "RECORD",
|
||||
"name": "experiments",
|
||||
"mode": "NULLABLE"
|
||||
}
|
||||
],
|
||||
"type": "RECORD",
|
Загрузка…
Ссылка в новой задаче