Refactor client id field and add sum/client counts to measures

This commit is contained in:
Anthony Miyaguchi 2021-04-30 12:25:06 -07:00 коммит произвёл Frank Bertsch
Родитель 4db7a7c295
Коммит 5cd60d4401
3 изменённых файлов: 77 добавлений и 33 удалений

Просмотреть файл

@ -1,6 +1,9 @@
"""Class to describe a Glean Ping View."""
from collections import Counter
from typing import Any, Dict, List
import click
from .ping_view import PingView
@ -35,3 +38,46 @@ class GleanPingView(PingView):
self._annotate_dimension(d)
for d in super().get_dimensions(bq_client, table)
]
def get_measures(self, dimensions: List[dict], table: str) -> List[Dict[str, str]]:
    """Generate measures from a list of dimensions.

    Start from the measures returned by the parent class, then add two
    measures for every dimension whose name contains ``metrics__counter__``:

    * a ``sum`` measure over the counter dimension, and
    * a ``count_distinct`` measure (suffixed ``_client_count``) over the
      client id dimension, restricted to rows where the counter is positive.

    Raise ClickException if dimensions result in duplicate measures.
    """
    measures = super().get_measures(dimensions, table)
    client_id_field = self._get_client_id(dimensions, table)
    metrics_prefix = "metrics__"

    for dimension in dimensions:
        dimension_name = dimension["name"]
        if "metrics__counter__" not in dimension_name:
            continue

        # handle the counters in the metric ping; ``str`` has no ``ltrim``
        # method, so drop the leading "metrics__" prefix explicitly (and only
        # when it really is a prefix).
        name = dimension_name
        if name.startswith(metrics_prefix):
            name = name[len(metrics_prefix):]

        measures += [
            {
                "name": name,
                "type": "sum",
                "sql": f"${{{dimension_name}}}",
            },
            {
                "name": f"{name}_client_count",
                "type": "count_distinct",
                # A SQL CASE expression must be closed with END; without an
                # ELSE branch it yields NULL for non-matching rows, which
                # the distinct count ignores.
                "sql": (
                    f"case when ${{{dimension_name}}} > 0 then "
                    f"${{{client_id_field}}} end"
                ),
            },
        ]

    # check if there are any duplicate values, and report the first one that
    # shows up
    names = [measure["name"] for measure in measures]
    duplicates = [k for k, v in Counter(names).items() if v > 1]
    if duplicates:
        name = duplicates[0]
        raise click.ClickException(
            f"duplicate measure {name!r} for table {table!r}"
        )

    return measures

Просмотреть файл

@ -1,7 +1,7 @@
"""Class to describe a Ping View."""
from __future__ import annotations
from collections import Counter, defaultdict
from collections import defaultdict
from itertools import filterfalse
from typing import Any, Dict, Iterator, List
@ -101,6 +101,19 @@ class PingView(View):
# add dimensions and dimension groups
return lookml_utils._generate_dimensions(bq_client, table)
def _get_client_id(self, dimensions: List[dict], table: str) -> str:
"""Return the first field that looks like a client identifier."""
client_id_fields = [
d["name"]
for d in dimensions
if d["name"] in {"client_id", "client_info__client_id"}
]
if not client_id_fields:
raise click.ClickException(f"Missing client_id dimension in {table!r}")
if len(client_id_fields) > 1:
raise click.ClickException(f"Duplicate client_id dimension in {table!r}")
return client_id_fields[0]
def get_measures(self, dimensions: List[dict], table: str) -> List[Dict[str, str]]:
"""Generate measures from a list of dimensions.
@ -108,36 +121,21 @@ class PingView(View):
Raise ClickException if dimensions result in duplicate measures.
"""
measures = []
# iterate through each of the dimensions and accumulate any measures
# that we want to include in the view
# Iterate through each of the dimensions and accumulate any measures
# that we want to include in the view. We pull out the client id first
# since we'll use it to calculate per-measure client counts.
client_id_field = self._get_client_id(dimensions, table)
measures = [
{
"name": "clients",
"type": "count_distinct",
"sql": f"${{{client_id_field}}}",
}
]
for dimension in dimensions:
dimension_name = dimension["name"]
if dimension_name in {"client_id", "client_info__client_id"}:
measure = {
"name": "clients",
"type": "count_distinct",
"sql": f"${{{dimension_name}}}",
}
elif dimension_name == "document_id":
measure = {"name": "ping_count", "type": "count"}
else:
continue
measures.append(measure)
# check if there are any duplicate values, and report the first one that
# shows up
names = [measure["name"] for measure in measures]
duplicates = [k for k, v in Counter(names).items() if v > 1]
if duplicates:
name = duplicates[0]
raise click.ClickException(
f"duplicate measure {name!r} for table {table!r}"
)
if not measures:
raise click.ClickException(
f"Missing client_id and doc_id dimensions in {table!r}"
)
if dimension_name == "document_id":
measures += [{"name": "ping_count", "type": "count"}]
return measures

Просмотреть файл

@ -100,7 +100,7 @@ class MockClient:
bigquery.schema.SchemaField("parsed_date", "DATE"),
],
)
if table_ref == "mozdata.fail.duplicate_measure":
if table_ref == "mozdata.fail.duplicate_client":
return bigquery.Table(
table_ref,
schema=[
@ -445,7 +445,7 @@ def test_duplicate_dimension(runner, glean_apps, tmp_path):
_lookml(open(namespaces), glean_apps, "looker-hub/")
def test_duplicate_measure(runner, glean_apps, tmp_path):
def test_duplicate_client_id(runner, glean_apps, tmp_path):
namespaces = tmp_path / "namespaces.yaml"
namespaces.write_text(
dedent(
@ -458,7 +458,7 @@ def test_duplicate_measure(runner, glean_apps, tmp_path):
type: ping_view
tables:
- channel: release
table: mozdata.fail.duplicate_measure
table: mozdata.fail.duplicate_client
"""
)
)