Add unnested labeled counter to glean ping explores (#139)

* Add unnested label view

* Add hidden dimension for labeled counter (probably unnecessary)

* Add measures to separate view (broken because of circular dependencies)

* Move unnested view logic into glean ping directly

* Add explore logic to expose join views in explore

* Add test for labeled counter view

* Add description to view

* Break gigantic test for lookml into smaller tests

* Remove extra semicolons and description from view

* Remove duplicate values from explore/view

* Update _to_lookml for API changes

* Add looker suggest explores

* Update tests for suggest explores and new labels

* Use client id as primary key for joins

* Join using document_id as primary key

* Update tests for document id as primary key

* Fix description from rebase

* Address review: longer time period for labels

* Remove document id primary key

* Revert "Remove document id primary key"

This reverts commit 242913c241.

* Sort views in the correct order

* Reduce suggest time to 30 days
This commit is contained in:
Anthony Miyaguchi 2021-06-04 06:54:18 -07:00 коммит произвёл GitHub
Родитель 4e4e24f4db
Коммит d54af3a23a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 374 добавлений и 22 удалений

Просмотреть файл

@ -26,12 +26,50 @@ class GleanPingExplore(PingExplore):
}
# collapse whitespace in the description so the lookml looks a little better
ping_description = " ".join(ping_descriptions[self.name].split())
# insert the description in
lookml = super()._to_lookml(v1_name)
lookml[0][
"description"
] = f"Explore for the {self.name} ping. {ping_description}"
return lookml
views_lookml = self.get_view_lookml(self.views["base_view"])
# The first view, by convention, is always the base view with the
# majority of the dimensions from the top level.
base = views_lookml["views"][0]
base_name = base["name"]
joins = []
for view in views_lookml["views"][1:]:
if view["name"].startswith("suggest__"):
continue
view_name = view["name"]
metric = "__".join(view["name"].split("__")[1:])
joins.append(
{
"name": view_name,
"relationship": "one_to_many",
"sql": (
f"LEFT JOIN UNNEST(${{{base_name}.{metric}}}) AS {view_name} "
f"ON ${{{base_name}.document_id}} = ${{{view_name}.document_id}}"
),
}
)
base_explore = {
"name": self.name,
# list the base explore first by prefixing with a space
"view_label": f" {self.name.title()}",
"description": f"Explore for the {self.name} ping. {ping_description}",
"view_name": self.views["base_view"],
"always_filter": {
"filters": self.get_required_filters("base_view"),
},
"joins": joins,
}
suggests = []
for view in views_lookml["views"][1:]:
if not view["name"].startswith("suggest__"):
continue
suggests.append({"name": view["name"], "hidden": "yes"})
return [base_explore] + suggests
@staticmethod
def from_views(views: List[View]) -> Iterator[PingExplore]:

Просмотреть файл

@ -1,6 +1,7 @@
"""Class to describe a Glean Ping View."""
import logging
from collections import Counter
from textwrap import dedent
from typing import Any, Dict, Iterable, List, Optional, Union
import click
@ -19,6 +20,7 @@ DISTRIBUTION_TYPES = {
ALLOWED_TYPES = DISTRIBUTION_TYPES | {
"boolean",
"counter",
"labeled_counter",
"datetime",
"jwe",
"quantity",
@ -35,6 +37,104 @@ class GleanPingView(PingView):
type: str = "glean_ping_view"
allow_glean: bool = True
def to_lookml(self, bq_client, v1_name: Optional[str]) -> Dict[str, Any]:
    """Generate the LookML for this view and its labeled-counter views.

    Starts from the parent class's LookML and appends two extra view
    definitions for every ``labeled_counter`` metric:

    * a view over the counter's key/value entries, which needs to be
      joined against this view in the explore, and
    * a ``suggest__``-prefixed derived-table view listing the counter's
      keys, referenced by the ``key`` dimension for suggestions.

    The extra views are de-duplicated by name and appended in name order.
    """
    lookml = super().to_lookml(bq_client, v1_name)

    # Use the release-channel table when one is listed, else the first one.
    default_entry = self.tables[0]
    chosen_entry = next(
        (entry for entry in self.tables if entry.get("channel") == "release"),
        default_entry,
    )
    table_name = chosen_entry["table"]
    client_id_field = self._get_client_id(
        self.get_dimensions(bq_client, table_name, v1_name), table_name
    )

    # Keyed by view name so that a repeated name keeps only its latest
    # definition; a few entries somehow make it in twice, e.g.
    # metrics__metrics__labeled_counter__media_audio_init_failure
    extra_views: Dict[str, Dict[str, Any]] = {}
    for probe in self._get_glean_metrics(v1_name):
        if probe.type != "labeled_counter":
            continue

        looker_name = self._to_looker_name(probe)
        view_name = f"{self.name}__{looker_name}"
        suggest_name = f"suggest__{view_name}"

        # Human-readable label: drop the leading name segment, then turn
        # the remaining underscores into spaces and title-case the result.
        label_source = "_".join(looker_name.split("__")[1:])
        column = probe.id.replace(".", "_")

        # Query behind the suggest view: counter keys seen in the last 30
        # days of sample_id 0, most frequent first.
        suggest_sql = dedent(
            f"""
            select
            m.key,
            count(*) as n
            from {table_name} as t,
            unnest(metrics.{probe.type}.{column}) as m
            where date(submission_timestamp) > date_sub(current_date, interval 30 day)
            and sample_id = 0
            group by key
            order by n desc
            """
        )

        extra_views[view_name] = {
            "name": view_name,
            "label": label_source.replace("_", " ").title(),
            "dimensions": [
                {
                    "name": "document_id",
                    "type": "string",
                    "sql": f"${{{self.name}.document_id}}",
                    "primary_key": "yes",
                    "hidden": "yes",
                },
                {
                    "name": "key",
                    "type": "string",
                    "sql": "${TABLE}.key",
                    "suggest_explore": suggest_name,
                    "suggest_dimension": f"{suggest_name}.key",
                },
                {
                    "name": "value",
                    "type": "number",
                    "sql": "${TABLE}.value",
                    "hidden": "yes",
                },
            ],
            "measures": [
                {
                    "name": "count",
                    "type": "sum",
                    "sql": "${value}",
                },
                {
                    "name": "client_count",
                    "type": "count_distinct",
                    "sql": (
                        "case when ${value} > 0 then "
                        f"${{{self.name}.{client_id_field}}} end"
                    ),
                },
            ],
        }
        extra_views[suggest_name] = {
            "name": suggest_name,
            "derived_table": {"sql": suggest_sql},
            "dimensions": [
                {"name": "key", "type": "string", "sql": "${TABLE}.key"}
            ],
        }

    lookml["views"] += [extra_views[name] for name in sorted(extra_views)]
    return lookml
def _get_links(self, dimension: dict) -> List[Dict[str, str]]:
"""Get a link annotation given a metric name."""
name = self._get_name(dimension)
@ -83,20 +183,30 @@ class GleanPingView(PingView):
return ping_probes
def _to_looker_name(self, metric: GleanProbe, suffix: str = "") -> str:
"""Convert a glean probe into a looker name."""
*category, name = metric.id.split(".")
category = "_".join(category)
sep = "" if not category else "_"
label = name
looker_name = f"metrics__{metric.type}__{category}{sep}{label}"
if suffix:
looker_name = f"{looker_name}__{suffix}"
return looker_name
def _make_dimension(
self, metric: GleanProbe, suffix: str, sql_map: Dict[str, Dict[str, str]]
) -> Optional[Dict[str, Union[str, List[Dict[str, str]]]]]:
*category, name = metric.id.split(".")
category = "_".join(category)
sep = "" if not category else "_"
label = name
sep = "_"
if not category:
sep = ""
looker_name = f"metrics__{metric.type}__{category}{sep}{name}"
if suffix:
label = f"{name}_{suffix}"
looker_name = f"metrics__{metric.type}__{category}{sep}{name}__{suffix}"
looker_name = f"{looker_name}__{suffix}"
if looker_name not in sql_map:
return None
@ -127,6 +237,13 @@ class GleanPingView(PingView):
],
}
# remove some elements from the definition if we're handling a labeled
# counter, as an initial join dimension
if metric.type == "labeled_counter":
# this field is not used since labeled counters are maps
del lookml["type"]
lookml["hidden"] = "yes"
if metric.description:
lookml["description"] = metric.description
@ -174,11 +291,14 @@ class GleanPingView(PingView):
) -> List[Dict[str, Any]]:
"""Get the set of dimensions for this view."""
all_fields = super().get_dimensions(bq_client, table, v1_name)
return self._get_glean_metric_dimensions(all_fields, v1_name) + [
fields = self._get_glean_metric_dimensions(all_fields, v1_name) + [
self._add_link(d)
for d in all_fields
if not d["name"].startswith("metrics__")
]
# later entries will override earlier entries, if there are duplicates
field_dict = {f["name"]: f for f in fields}
return list(field_dict.values())
def get_measures(
self, dimensions: List[dict], table: str, v1_name: Optional[str]

Просмотреть файл

@ -2,7 +2,6 @@
from __future__ import annotations
from collections import defaultdict
from itertools import filterfalse
from typing import Any, Dict, Iterator, List, Optional, Union
import click
@ -69,12 +68,16 @@ class PingView(View):
)["table"]
dimensions = self.get_dimensions(bq_client, table, v1_name)
view_defn["dimensions"] = list(
filterfalse(lookml_utils._is_dimension_group, dimensions)
)
view_defn["dimension_groups"] = list(
filter(lookml_utils._is_dimension_group, dimensions)
)
# set document id field as a primary key for joins
view_defn["dimensions"] = [
d if d["name"] != "document_id" else dict(**d, primary_key="yes")
for d in dimensions
if not lookml_utils._is_dimension_group(d)
]
view_defn["dimension_groups"] = [
d for d in dimensions if lookml_utils._is_dimension_group(d)
]
# add measures
view_defn["measures"] = self.get_measures(dimensions, table, v1_name)

Просмотреть файл

@ -1,3 +1,4 @@
import contextlib
from pathlib import Path
from textwrap import dedent
from unittest.mock import Mock, patch
@ -153,6 +154,15 @@ class MockClient:
SchemaField("no_category_counter", "INTEGER"),
],
),
SchemaField(
"labeled_counter",
"RECORD",
"REPEATED",
fields=[
SchemaField("key", "STRING"),
SchemaField("value", "INTEGER"),
],
),
SchemaField(
"custom_distribution",
"RECORD",
@ -445,6 +455,14 @@ def msg_glean_probes():
"test.counter",
{"type": "counter", "history": history_with_descr, "name": "test.counter"},
),
GleanProbe(
"test.labeled_counter",
{
"type": "labeled_counter",
"history": history_with_descr,
"name": "test.labeled_counter",
},
),
GleanProbe(
"no_category_counter",
{"type": "counter", "history": history, "name": "no_category_counter"},
@ -516,9 +534,8 @@ def msg_glean_probes():
]
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual(
@contextlib.contextmanager
def _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
@ -598,6 +615,27 @@ def test_lookml_actual(
with runner.isolated_filesystem():
with patch("google.cloud.bigquery.Client", MockClient):
_lookml(open(namespaces), glean_apps, "looker-hub/")
yield namespaces_text
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_baseline_view(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
) as namespaces_text:
expected = {
"views": [
{
@ -619,6 +657,7 @@ def test_lookml_actual(
"name": "document_id",
"hidden": "yes",
"sql": "${TABLE}.document_id",
"primary_key": "yes",
},
],
"measures": [
@ -640,6 +679,26 @@ def test_lookml_actual(
lkml.load(Path("looker-hub/custom/views/baseline.view.lkml").read_text()),
)
print_and_test(namespaces_text, open(Path("looker-hub/namespaces.yaml")).read())
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_baseline_view_parameterized(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
expected = {
"views": [
{
@ -802,6 +861,26 @@ def test_lookml_actual(
Path("looker-hub/glean-app/views/baseline.view.lkml").read_text()
),
)
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_metrics_view(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
expected = {
"views": [
{
@ -1129,13 +1208,86 @@ def test_lookml_actual(
"type": "count_distinct",
},
],
}
},
{
"dimensions": [
{
"hidden": "yes",
"name": "document_id",
"primary_key": "yes",
"sql": "${metrics.document_id}",
"type": "string",
},
{
"name": "key",
"sql": "${TABLE}.key",
"suggest_dimension": "suggest__metrics__metrics__labeled_counter__test_labeled_counter.key",
"suggest_explore": "suggest__metrics__metrics__labeled_counter__test_labeled_counter",
"type": "string",
},
{
"hidden": "yes",
"name": "value",
"sql": "${TABLE}.value",
"type": "number",
},
],
"label": "Labeled Counter Test Labeled Counter",
"measures": [
{"name": "count", "sql": "${value}", "type": "sum"},
{
"name": "client_count",
"sql": "case when ${value} > 0 then "
"${metrics.client_info__client_id} end",
"type": "count_distinct",
},
],
"name": "metrics__metrics__labeled_counter__test_labeled_counter",
},
{
"derived_table": {
"sql": "select\n"
" m.key,\n"
" count(*) as n\n"
"from mozdata.glean_app.metrics as "
"t,\n"
"unnest(metrics.labeled_counter.test_labeled_counter) as m\n"
"where date(submission_timestamp) > date_sub(current_date, interval 30 day)\n"
" and sample_id = 0\n"
"group by key\n"
"order by n desc"
},
"dimensions": [
{"name": "key", "sql": "${TABLE}.key", "type": "string"}
],
"name": "suggest__metrics__metrics__labeled_counter__test_labeled_counter",
},
]
}
print_and_test(
expected,
lkml.load(Path("looker-hub/glean-app/views/metrics.view.lkml").read_text()),
)
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_growth_accounting_view(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
expected = {
"views": [
{
@ -1175,6 +1327,25 @@ def test_lookml_actual(
),
)
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_baseline_explore(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
expected = {
"includes": ["/looker-hub/glean-app/views/baseline.view.lkml"],
"explores": [
@ -1182,6 +1353,7 @@ def test_lookml_actual(
"name": "baseline",
"description": "Explore for the baseline ping. The baseline ping is foo.",
"view_name": "baseline",
"view_label": " Baseline",
"always_filter": {
"filters": [
{"channel": "mozdata.glean^_app.baseline"},
@ -1199,6 +1371,25 @@ def test_lookml_actual(
),
)
@patch("generator.views.glean_ping_view.GleanPing")
@patch("generator.explores.glean_ping_explore.GleanPing")
def test_lookml_actual_client_counts(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
with _prepare_lookml_actual_test(
mock_glean_ping_view,
mock_glean_ping_explore,
runner,
glean_apps,
tmp_path,
msg_glean_probes,
):
expected = {
"includes": ["baseline_clients_daily_table.view.lkml"],
"views": [