Refactor client id field and add sum/client counts to measures
This commit is contained in:
Родитель
4db7a7c295
Коммит
5cd60d4401
|
@ -1,6 +1,9 @@
|
|||
"""Class to describe a Glean Ping View."""
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import click
|
||||
|
||||
from .ping_view import PingView
|
||||
|
||||
|
||||
|
@ -35,3 +38,46 @@ class GleanPingView(PingView):
|
|||
self._annotate_dimension(d)
|
||||
for d in super().get_dimensions(bq_client, table)
|
||||
]
|
||||
|
||||
def get_measures(self, dimensions: List[dict], table: str) -> List[Dict[str, str]]:
    """Generate measures from a list of dimensions.

    Start from the measures produced by the parent class, then, for every
    dimension whose name contains ``metrics__counter__``, add two measures:

    * a ``sum`` over the counter dimension, and
    * a ``count_distinct`` over the client id field, restricted to rows
      where the counter is greater than zero.

    Raise ClickException if dimensions result in duplicate measures.
    """
    measures = super().get_measures(dimensions, table)
    client_id_field = self._get_client_id(dimensions, table)
    prefix = "metrics__"

    for dimension in dimensions:
        dimension_name = dimension["name"]
        if "metrics__counter__" not in dimension_name:
            continue

        # Handle the counters in the metrics ping. The measure name is the
        # dimension name without its leading "metrics__". `str` has no
        # `ltrim` method, and `lstrip` would strip a *set of characters*
        # rather than a prefix, so slice the prefix off explicitly.
        if dimension_name.startswith(prefix):
            name = dimension_name[len(prefix):]
        else:
            name = dimension_name

        measures += [
            {
                "name": name,
                "type": "sum",
                "sql": f"${{{dimension_name}}}",
            },
            {
                "name": f"{name}_client_count",
                "type": "count_distinct",
                # A CASE expression must be closed with END; without it the
                # generated SQL is invalid.
                "sql": (
                    f"case when ${{{dimension_name}}} > 0 then "
                    f"${{{client_id_field}}} end"
                ),
            },
        ]

    # Check if there are any duplicate measure names, and report the first
    # one that shows up.
    names = [measure["name"] for measure in measures]
    duplicates = [k for k, v in Counter(names).items() if v > 1]
    if duplicates:
        name = duplicates[0]
        raise click.ClickException(
            f"duplicate measure {name!r} for table {table!r}"
        )

    return measures
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
"""Class to describe a Ping View."""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter, defaultdict
|
||||
from collections import defaultdict
|
||||
from itertools import filterfalse
|
||||
from typing import Any, Dict, Iterator, List
|
||||
|
||||
|
@ -101,6 +101,19 @@ class PingView(View):
|
|||
# add dimensions and dimension groups
|
||||
return lookml_utils._generate_dimensions(bq_client, table)
|
||||
|
||||
def _get_client_id(self, dimensions: List[dict], table: str) -> str:
    """Return the name of the single client id dimension.

    A dimension counts as a client id when its name is either
    ``client_id`` or ``client_info__client_id``.

    Raise ClickException when no such dimension exists, or when more than
    one is present.
    """
    candidate_names = {"client_id", "client_info__client_id"}
    matches = []
    for dimension in dimensions:
        field_name = dimension["name"]
        if field_name in candidate_names:
            matches.append(field_name)

    match_count = len(matches)
    if match_count == 0:
        raise click.ClickException(f"Missing client_id dimension in {table!r}")
    if match_count > 1:
        raise click.ClickException(f"Duplicate client_id dimension in {table!r}")

    (client_id_field,) = matches
    return client_id_field
|
||||
|
||||
def get_measures(self, dimensions: List[dict], table: str) -> List[Dict[str, str]]:
|
||||
"""Generate measures from a list of dimensions.
|
||||
|
||||
|
@ -108,36 +121,21 @@ class PingView(View):
|
|||
|
||||
Raise ClickException if dimensions result in duplicate measures.
|
||||
"""
|
||||
measures = []
|
||||
# iterate through each of the dimensions and accumulate any measures
|
||||
# that we want to include in the view
|
||||
# Iterate through each of the dimensions and accumulate any measures
|
||||
# that we want to include in the view. We pull out the client id first
|
||||
# since we'll use it to calculate per-measure client counts.
|
||||
client_id_field = self._get_client_id(dimensions, table)
|
||||
measures = [
|
||||
{
|
||||
"name": "clients",
|
||||
"type": "count_distinct",
|
||||
"sql": f"${{{client_id_field}}}",
|
||||
}
|
||||
]
|
||||
|
||||
for dimension in dimensions:
|
||||
dimension_name = dimension["name"]
|
||||
if dimension_name in {"client_id", "client_info__client_id"}:
|
||||
measure = {
|
||||
"name": "clients",
|
||||
"type": "count_distinct",
|
||||
"sql": f"${{{dimension_name}}}",
|
||||
}
|
||||
elif dimension_name == "document_id":
|
||||
measure = {"name": "ping_count", "type": "count"}
|
||||
else:
|
||||
continue
|
||||
measures.append(measure)
|
||||
|
||||
# check if there are any duplicate values, and report the first one that
|
||||
# shows up
|
||||
names = [measure["name"] for measure in measures]
|
||||
duplicates = [k for k, v in Counter(names).items() if v > 1]
|
||||
if duplicates:
|
||||
name = duplicates[0]
|
||||
raise click.ClickException(
|
||||
f"duplicate measure {name!r} for table {table!r}"
|
||||
)
|
||||
|
||||
if not measures:
|
||||
raise click.ClickException(
|
||||
f"Missing client_id and doc_id dimensions in {table!r}"
|
||||
)
|
||||
if dimension_name == "document_id":
|
||||
measures += [{"name": "ping_count", "type": "count"}]
|
||||
|
||||
return measures
|
||||
|
|
|
@ -100,7 +100,7 @@ class MockClient:
|
|||
bigquery.schema.SchemaField("parsed_date", "DATE"),
|
||||
],
|
||||
)
|
||||
if table_ref == "mozdata.fail.duplicate_measure":
|
||||
if table_ref == "mozdata.fail.duplicate_client":
|
||||
return bigquery.Table(
|
||||
table_ref,
|
||||
schema=[
|
||||
|
@ -445,7 +445,7 @@ def test_duplicate_dimension(runner, glean_apps, tmp_path):
|
|||
_lookml(open(namespaces), glean_apps, "looker-hub/")
|
||||
|
||||
|
||||
def test_duplicate_measure(runner, glean_apps, tmp_path):
|
||||
def test_duplicate_client_id(runner, glean_apps, tmp_path):
|
||||
namespaces = tmp_path / "namespaces.yaml"
|
||||
namespaces.write_text(
|
||||
dedent(
|
||||
|
@ -458,7 +458,7 @@ def test_duplicate_measure(runner, glean_apps, tmp_path):
|
|||
type: ping_view
|
||||
tables:
|
||||
- channel: release
|
||||
table: mozdata.fail.duplicate_measure
|
||||
table: mozdata.fail.duplicate_client
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
|
Загрузка…
Ссылка в новой задаче