feat: add usage_reporting generator

This commit is contained in:
kik-kik 2025-01-10 18:04:12 +01:00 коммит произвёл kik-kik
Родитель 2d616396d7
Коммит 3c70a12977
14 изменённых файлов: 599 добавлений и 0 удалений

Просмотреть файл

@ -386,6 +386,18 @@ generate:
stable_views:
skip_datasets:
- mlhackweek_search
# Apps the usage_reporting SQL generator produces queries and views for.
# For apps with several channel listings, only listings whose `app_channel`
# appears under `channels` are generated.
usage_reporting:
apps:
fenix:
channels:
- nightly
firefox_ios:
channels:
- nightly
# firefox_desktop:
# # firefox_desktop is a single app containing multiple channels
# channels: null
retention_exclusion_list:
- sql/moz-fx-data-shared-prod/search_derived/acer_cohort_v1

Просмотреть файл

@ -0,0 +1,3 @@
# Usage Reporting
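SQL generator for datasets derived from the `usage_reporting` ping.

For every app and channel enabled under `generate.usage_reporting.apps` in the
bqetl config, the generator renders the `usage_reporting_clients_daily_v1`,
`usage_reporting_clients_first_seen_v1` and `usage_reporting_clients_last_seen_v1`
queries together with their `metadata.yaml` and `schema.yaml`, a view per channel,
and an app-level view that unions the channel views.

TODO: document usage and configuration in more detail.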

Просмотреть файл

@ -0,0 +1,204 @@
"""Usage Reporting ETL."""
from os import path
from pathlib import Path
import click
from jinja2 import Environment, FileSystemLoader
from bigquery_etl.cli.utils import is_valid_project, use_cloud_function_option
from bigquery_etl.config import ConfigLoader
from bigquery_etl.format_sql.formatter import reformat
from bigquery_etl.util.common import write_sql
from sql_generators.glean_usage.common import get_app_info
# Directory containing this generator module.
GENERATOR_ROOT = Path(path.dirname(__file__))
# Header text passed to every template as `header`.
HEADER = f"Generated via `{GENERATOR_ROOT.name}` SQL generator."
# NOTE(review): presumably the version suffix of the generated tables ("_v1");
# confirm before relying on it.
VERSION = "v1"
# Name of the directory holding the Jinja templates, relative to GENERATOR_ROOT.
TEMPLATES_LOCATION = "templates"
# Query templates rendered once per selected channel. The part of the file name
# before the first "." is used as the destination table name.
CHANNEL_TEMPLATES = (
"usage_reporting_clients_daily_v1.query.sql",
"usage_reporting_clients_first_seen_v1.query.sql",
"usage_reporting_clients_last_seen_v1.query.sql",
)
# Template for the per-channel view over a derived table.
CHANNEL_VIEW_TEMPLATE = "channel.view.sql"
# Artifacts rendered next to every channel query; the template file is looked
# up as "<table_name>.<artifact>".
ARTIFACT_TEMPLATES = (
"metadata.yaml",
"schema.yaml",
)
# Template for the app-level view that unions the channel views.
APP_UNION_VIEW_TEMPLATE = "app_union.view.sql"
@click.command()
@click.option(
    "--target-project",
    "--target_project",
    help="GCP project ID",
    default="moz-fx-data-shared-prod",
    callback=is_valid_project,
)
@click.option(
    "--output-dir",
    "--output_dir",
    help="Output directory generated SQL is written to",
    type=click.Path(file_okay=False),
    default="sql",
)
@click.option(
    "--parallelism",
    "-p",
    help="Maximum number of tasks to execute concurrently",
    default=8,
)
@click.option(
    "--except",
    "-x",
    "exclude",
    help="Process all tables except for the given tables",
)
@click.option(
    "--only",
    "-o",
    help="Process only the given tables",
)
@click.option(
    "--app_name",
    "--app-name",
    help="App to generate per-app dataset metadata and union views for.",
)
@use_cloud_function_option
def generate(
    target_project, output_dir, parallelism, exclude, only, app_name, use_cloud_function
):
    """Generate usage_reporting queries and views.

    For every app enabled under `generate.usage_reporting.apps` in the bqetl
    config, render each template in CHANNEL_TEMPLATES per selected channel,
    together with its metadata.yaml and schema.yaml and a channel view, and
    then one app-level view per table that unions the channel views.

    Args:
        target_project: GCP project ID; used in table/view IDs, as the
            `project_id` template argument and as sub-directory of output_dir.
        output_dir: Root directory the rendered files are written to.
        parallelism: Accepted but not used by this generator.
        exclude: Accepted but not used by this generator.
        only: Accepted but not used by this generator.
        app_name: When given, only this app is generated.
        use_cloud_function: Accepted but not used by this generator.
    """
    # The config value is a mapping of app name -> settings, so fall back to an
    # empty mapping (not a list) when the section is missing.
    usage_reporting_apps = ConfigLoader.get(
        "generate", "usage_reporting", "apps", fallback={}
    )

    # app name -> {channel key -> app listing}. The loop variable is
    # deliberately not called `app_name` so the CLI option is not shadowed.
    app_info_filtered = {}
    for name, app_info in get_app_info().items():
        if name not in usage_reporting_apps:
            continue
        if app_name and name != app_name:
            continue

        if len(app_info) == 1:
            # A single listing covers all channels of the app.
            selected = {"multichannel": app_info[0]}
        else:
            # `channels` may be null in the config; treat that as "no channels".
            app_config = usage_reporting_apps[name] or {}
            enabled_channels = app_config.get("channels") or ()
            # The index keeps keys unique if two listings share a channel.
            selected = {
                f"{channel_info.get('app_channel')}__{index}": channel_info
                for index, channel_info in enumerate(app_info)
                if channel_info.get("app_channel") in enabled_channels
            }

        # Skip apps without any selected channel: the union view would
        # otherwise be rendered with an empty body, which is invalid SQL.
        if selected:
            app_info_filtered[name] = selected

    output_dir = Path(output_dir) / target_project
    jinja_env = Environment(
        loader=FileSystemLoader(str(GENERATOR_ROOT / TEMPLATES_LOCATION))
    )
    channel_view_template = jinja_env.get_template(CHANNEL_VIEW_TEMPLATE)
    app_union_view_template = jinja_env.get_template(APP_UNION_VIEW_TEMPLATE)

    default_template_args = {
        "project_id": target_project,
        "usage_reporting_stable_table_name": "usage_reporting_v1",
        "header": HEADER,
    }

    for name, app_channels in app_info_filtered.items():
        app_template_args = {
            "app_name": name,
            **default_template_args,
        }
        # Identical for every table of the app, so build it once.
        channels_info = [
            {
                "channel_dataset": channel_info["bq_dataset_family"],
                "channel_name": channel_info.get("app_channel"),
            }
            for channel_info in app_channels.values()
        ]

        for channel_template in CHANNEL_TEMPLATES:
            table_name = channel_template.split(".")[0]
            # Views carry the table name without the trailing "_v1".
            view_name = table_name[:-3]
            channel_query_template = jinja_env.get_template(channel_template)

            for channel_name, channel_info in app_channels.items():
                channel_dataset = channel_info["bq_dataset_family"]
                channel_args = {
                    "channel_name": channel_name.split("__")[0],
                    "channel_dataset": channel_dataset,
                    "table_name": table_name,
                    "view_name": view_name,
                    **app_template_args,
                }
                channel_table_id = (
                    f"{target_project}.{channel_dataset}_derived.{table_name}"
                )
                channel_view_id = f"{target_project}.{channel_dataset}.{view_name}"

                write_sql(
                    output_dir=output_dir,
                    full_table_id=channel_table_id,
                    basename="query.sql",
                    sql=reformat(channel_query_template.render(**channel_args)),
                    skip_existing=False,
                )

                # metadata.yaml / schema.yaml are written unformatted.
                for artifact in ARTIFACT_TEMPLATES:
                    artifact_template = jinja_env.get_template(
                        f"{table_name}.{artifact}"
                    )
                    write_sql(
                        output_dir=output_dir,
                        full_table_id=channel_table_id,
                        basename=artifact,
                        sql=artifact_template.render(**channel_args, format=False),
                        skip_existing=False,
                    )

                # Do not render channel view if only a single channel exists.
                # NOTE(review): the app union view below still selects from
                # `<channel_dataset>.<view_name>` for such apps - confirm that
                # reference resolves (and is not the union view itself).
                if channel_name != "multichannel":
                    write_sql(
                        output_dir=output_dir,
                        full_table_id=channel_view_id,
                        basename="view.sql",
                        sql=reformat(channel_view_template.render(**channel_args)),
                        skip_existing=False,
                    )

            rendered_app_union_view = app_union_view_template.render(
                channels_info=channels_info,
                **app_template_args,
                table_name=table_name,
                view_name=view_name,
            )
            write_sql(
                output_dir=output_dir,
                full_table_id=f"{target_project}.{name}.{view_name}",
                basename="view.sql",
                sql=reformat(rendered_app_union_view),
                skip_existing=False,
            )

Просмотреть файл

@ -0,0 +1,19 @@
-- {{ header }}
-- App-level view: unions the per-channel views and tags every row with the
-- dataset and channel it came from.
CREATE OR REPLACE VIEW
  `{{ project_id }}.{{ app_name }}.{{ view_name }}`
AS
{% for channel in channels_info -%}
  {% if not loop.first -%}
    UNION ALL
  {% endif -%}
  SELECT
    "{{ channel.channel_dataset }}" AS normalized_app_id,
    {#
      TODO: for fenix usage_reporting_clients_daily, derive the channel via
      mozfun.norm.fenix_app_info(channel_dataset, app_build).channel instead of
      using the configured channel name.
      This note is a Jinja comment on purpose: Jinja still evaluates block tags
      written behind a SQL "--", which previously removed normalized_channel
      from the fenix clients_daily view altogether.
    #}
    "{{ channel.channel_name }}" AS normalized_channel,
    *,
  FROM
    `{{ project_id }}.{{ channel.channel_dataset }}.{{ view_name }}`
{% endfor %}

Просмотреть файл

@ -0,0 +1,8 @@
-- {{ header }}
-- Channel-level view exposing every column of the channel's derived table.
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ channel_dataset }}.{{ view_name }}`
AS
SELECT
*
FROM
`{{ project_id }}.{{ channel_dataset }}_derived.{{ table_name }}`

Просмотреть файл

@ -0,0 +1,23 @@
# {{ header }}
friendly_name: Clients Daily Based on the DAU Reporting Ping.
description: |-
A daily aggregate of usage_reporting pings per `usage_profile_id`.
Cluster by: `normalized_country_code`
owners:
- kik@mozilla.com
labels:
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_glean_usage
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: submission_date
require_partition_filter: true
clustering:
fields:
- normalized_country_code

Просмотреть файл

@ -0,0 +1,56 @@
-- {{ header }}
-- One row per usage_profile_id and submission_date, built from the
-- usage_reporting pings received on that date.
WITH usage_reporting_base AS (
  -- Flatten the ping fields needed downstream.
  SELECT
    submission_timestamp,
    DATE(submission_timestamp) AS submission_date,
    metrics.uuid.usage_profile_id,
    SAFE.PARSE_DATE('%F', SUBSTR(metrics.datetime.usage_first_run_date, 1, 10)) AS first_run_date,
    metrics.string.usage_app_channel AS app_channel,
    normalized_country_code,
    metrics.string.usage_os AS os,
    metrics.string.usage_os_version AS os_version,
    metrics.string.usage_app_display_version AS app_display_version,
    metrics.string.usage_app_build AS app_build,
    -- distribution_id is only read from the ping when the has_distribution_id
    -- template flag is set; otherwise the column is a typed NULL.
    {% if has_distribution_id %}
    metrics.string.metrics_distribution_id AS distribution_id,
    {% else %}
    CAST(NULL AS STRING) AS distribution_id,
    {% endif %}
    COALESCE(metrics.boolean.metrics_default_browser, FALSE) AS is_default_browser,
    metrics.string.usage_reason AS reason,
    {% if "_desktop" in app_name %}
    COALESCE(metrics.counter.browser_engagement_uri_count, 0) AS browser_engagement_uri_count,
    COALESCE(metrics.counter.browser_engagement_active_ticks, 0) AS browser_engagement_active_ticks,
    {% endif %}
    CAST(NULL AS BOOLEAN) AS is_active, -- Eventually is_active value will come from the client.
    metrics.timespan.usage_duration.value AS duration,
  FROM
    `{{ project_id }}.{{ channel_dataset }}_stable.{{ usage_reporting_stable_table_name }}`
  WHERE
    DATE(submission_timestamp) = @submission_date
    AND metrics.uuid.usage_profile_id IS NOT NULL
)
SELECT
  submission_date,
  usage_profile_id,
  -- Dimensions: udf.mode_last over the day's non-NULL values, ordered by
  -- submission time.
  udf.mode_last(ARRAY_AGG(first_run_date IGNORE NULLS ORDER BY submission_timestamp ASC)) AS first_run_date,
  udf.mode_last(ARRAY_AGG(app_channel IGNORE NULLS ORDER BY submission_timestamp ASC)) AS app_channel,
  udf.mode_last(ARRAY_AGG(normalized_country_code IGNORE NULLS ORDER BY submission_timestamp ASC)) AS normalized_country_code,
  udf.mode_last(ARRAY_AGG(os IGNORE NULLS ORDER BY submission_timestamp ASC)) AS os,
  udf.mode_last(ARRAY_AGG(os_version IGNORE NULLS ORDER BY submission_timestamp ASC)) AS os_version,
  udf.mode_last(ARRAY_AGG(app_build IGNORE NULLS ORDER BY submission_timestamp ASC)) AS app_build,
  udf.mode_last(ARRAY_AGG(app_display_version IGNORE NULLS ORDER BY submission_timestamp ASC)) AS app_display_version,
  udf.mode_last(ARRAY_AGG(distribution_id IGNORE NULLS ORDER BY submission_timestamp ASC)) AS distribution_id,
  udf.mode_last(ARRAY_AGG(is_default_browser IGNORE NULLS ORDER BY submission_timestamp ASC)) AS is_default_browser,
  udf.mode_last(ARRAY_AGG(reason IGNORE NULLS ORDER BY submission_timestamp ASC)) AS reason,
  -- is_active: prefer the client-reported flag once it exists, otherwise
  -- derive it from engagement counters (desktop) or ping duration (others).
  {% if "_desktop" in app_name %}
  COALESCE(LOGICAL_OR(is_active), SUM(browser_engagement_uri_count) > 0 AND SUM(browser_engagement_active_ticks) > 0, FALSE) AS is_active,
  {% else %}
  COALESCE(LOGICAL_OR(is_active), SUM(IF(duration BETWEEN 0 AND 100000, duration, 0)) > 0, FALSE) AS is_active,
  {% endif %}
FROM
  usage_reporting_base
GROUP BY
  submission_date,
  usage_profile_id

Просмотреть файл

@ -0,0 +1,83 @@
# {{ header }}
fields:
- mode: NULLABLE
name: submission_date
type: DATE
description: |
Logical date used for processing and partitioning.
- mode: NULLABLE
name: usage_profile_id
type: STRING
description: |
A UUID of the usage_profile.
- mode: NULLABLE
name: first_run_date
type: DATE
description: |
The date of the first run of the application.
- mode: NULLABLE
name: app_channel
type: STRING
description: |
The channel the application is being distributed on.
- mode: NULLABLE
name: normalized_country_code
type: STRING
description: |
Country code
- mode: NULLABLE
name: os
type: STRING
description: |
The name of the operating system.
- mode: NULLABLE
name: os_version
type: STRING
description: |
The user-visible version of the operating system (e.g. "1.2.3").
If the version detection fails, this metric gets set to Unknown.
- mode: NULLABLE
name: app_build
type: STRING
description: |
The build identifier generated by the CI system (e.g. "1234/A").
If the value was not provided through configuration, this metric gets set to Unknown.
- mode: NULLABLE
name: app_display_version
type: STRING
description: |
The user visible version string (e.g. "1.0.3").
If the value was not provided through configuration, this metric gets set to Unknown.
- mode: NULLABLE
name: distribution_id
type: STRING
description: |
A string containing the distribution identifier. This was used to identify installs
from Mozilla Online, but now also identifies partnership deal distributions.
- mode: NULLABLE
name: is_default_browser
type: BOOLEAN
description: |
Is Firefox the default browser.
- mode: NULLABLE
name: reason
type: STRING
description: |
The optional reason the ping was submitted. The specific values for reason are specific to each ping,
and are documented in the ping's pings.yaml file.
- mode: NULLABLE
name: is_active
type: BOOLEAN
description: |
A flag field indicating whether the specific client was active.

Просмотреть файл

@ -0,0 +1,19 @@
# {{ header }}
friendly_name: Clients First Seen Based on the DAU Reporting Ping.
description: |-
A representation of when we saw each `usage_profile_id`
for the first time based on the usage_reporting ping.
owners:
- kik@mozilla.com
labels:
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_glean_usage
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: first_seen_date
require_partition_filter: false

Просмотреть файл

@ -0,0 +1,57 @@
-- {{ header }}
-- Records the date on which each usage_profile_id was first observed.
WITH _current AS (
  -- Profiles seen on the processed date (or over all history on init).
  -- The clients_daily view exposes submission_date, not submission_timestamp,
  -- so all date logic is expressed on submission_date.
  SELECT
    usage_profile_id,
    {% raw %}
    {% if is_init() %}
    MIN(submission_date) AS first_seen_date,
    {% else %}
    @submission_date AS first_seen_date,
    {% endif %}
    {% endraw %}
  FROM
    `{{ project_id }}.{{ channel_dataset }}.usage_reporting_clients_daily`
  WHERE
    usage_profile_id IS NOT NULL
    {% raw %}
    {% if is_init() %}
    AND submission_date > "2014-10-10"
    {% else %}
    AND submission_date = @submission_date
    {% endif %}
    {% endraw %}
  GROUP BY
    usage_profile_id
),
_previous AS (
  -- Profiles already recorded by earlier runs. This reads the derived table
  -- itself; the user-facing dataset only holds the view without the version
  -- suffix.
  SELECT
    usage_profile_id,
  FROM
    `{{ project_id }}.{{ channel_dataset }}_derived.{{ table_name }}`
  WHERE
    {% raw %}
    {% if is_init() %}
    FALSE
    {% else %}
    first_seen_date < @submission_date
    {% endif %}
    {% endraw %}
)
SELECT
  first_seen_date,
  usage_profile_id,
FROM
  _current
LEFT JOIN
  _previous
  USING (usage_profile_id)
WHERE
  -- Keep only profiles that have not been recorded before.
  _previous.usage_profile_id IS NULL
QUALIFY
  -- Fail the query rather than write the same profile twice.
  IF(
    COUNT(*) OVER (PARTITION BY usage_profile_id) > 1,
    ERROR("Duplicate usage_profile_id combination detected."),
    TRUE
  )

Просмотреть файл

@ -0,0 +1,13 @@
# {{ header }}
fields:
- mode: NULLABLE
name: usage_profile_id
type: STRING
description: |
A UUID of the usage_profile.
- mode: NULLABLE
name: first_seen_date
type: DATE
description: |
Logical date of when we observed the client for the first time in our warehouse.

Просмотреть файл

@ -0,0 +1,19 @@
# {{ header }}
friendly_name: Clients Last Seen Based on the DAU Reporting Ping.
description: |-
A daily aggregate of the usage_reporting ping for each `usage_profile_id`
representing their activity.
owners:
- kik@mozilla.com
labels:
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_glean_usage
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: submission_date
require_partition_filter: true

Просмотреть файл

@ -0,0 +1,52 @@
-- {{ header }}
-- In this raw table, we capture the history of activity over the past
-- 28 days for each usage criterion as a single 64-bit integer.
WITH _current AS (
  SELECT
    usage_profile_id,
    -- The rightmost bit in days_seen_bits represents whether the user sent a usage_reporting ping on the submission_date.
    CAST(TRUE AS INT64) AS days_seen_bits,
    -- The rightmost bit in days_active_bits represents whether the user counts as active on the submission_date.
    CAST(TRUE AS INT64) & CAST(is_active AS INT64) AS days_active_bits,
    udf.days_since_created_profile_as_28_bits(
      DATE_DIFF(submission_date, first_run_date, DAY)
    ) AS days_created_profile_bits,
  FROM
    `{{ project_id }}.{{ channel_dataset }}.usage_reporting_clients_daily`
  WHERE
    submission_date = @submission_date
),
_previous AS (
  -- Yesterday's state, read from the derived table itself; the user-facing
  -- dataset only holds the view without the version suffix.
  SELECT
    usage_profile_id,
    days_seen_bits,
    days_active_bits,
    days_created_profile_bits,
  FROM
    `{{ project_id }}.{{ channel_dataset }}_derived.{{ table_name }}`
  WHERE
    submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
    -- Filter out rows from yesterday that have now fallen outside the 28-day window.
    AND udf.shift_28_bits_one_day(days_seen_bits) > 0
)
SELECT
  @submission_date AS submission_date,
  -- Take today's row when the profile was seen today, otherwise carry
  -- yesterday's row forward; the bit fields are always recombined.
  IF(_current.usage_profile_id IS NOT NULL, _current, _previous).* REPLACE (
    udf.combine_adjacent_days_28_bits(
      _previous.days_seen_bits,
      _current.days_seen_bits
    ) AS days_seen_bits,
    udf.combine_adjacent_days_28_bits(
      _previous.days_active_bits,
      _current.days_active_bits
    ) AS days_active_bits,
    udf.combine_adjacent_days_28_bits(
      _previous.days_created_profile_bits,
      _current.days_created_profile_bits
    ) AS days_created_profile_bits
  )
FROM
  _current
FULL JOIN
  _previous
  USING (usage_profile_id)

Просмотреть файл

@ -0,0 +1,31 @@
# {{ header }}
fields:
- mode: NULLABLE
name: submission_date
type: DATE
description: |
Logical date used for processing and partitioning.
- mode: NULLABLE
name: usage_profile_id
type: STRING
description: |
A UUID of the usage_profile.
- mode: NULLABLE
name: days_seen_bits
type: INTEGER
description: |
Bit field shows on which of the last 28 days a client sent us the usage_reporting ping.
- mode: NULLABLE
name: days_active_bits
type: INTEGER
description: |
Bit field shows on which of the last 28 days a client fulfilled the active criteria.
- mode: NULLABLE
name: days_created_profile_bits
type: INTEGER
description: |
Bit field indicating how many days have lapsed since profile creation.