Fix #1587 - fix inconsistent range_min and range_max in bucket counts (#1591)

* Fix egregious double counting in scalar bucket counts

* Update for newer version of black

* Update scalar bucket count test to account for combinations

* Update minimal test for histogram bucket counts

* Add test for multiple clients in histogram aggregates

* Remove deduplicated cte in histogram bucket counts

* Use count distinct for client counts to be explicit
This commit is contained in:
Anthony Miyaguchi 2020-12-04 14:47:45 -08:00 коммит произвёл GitHub
Родитель 2af5515382
Коммит ce9fe86ed2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
23 изменённых файлов: 950 добавлений и 452 удалений

Просмотреть файл

@ -104,12 +104,7 @@ def scalar_bucket_counts(**kwargs):
def histogram_bucket_counts(**kwargs):
"""Variables for clients histogram bucket counts."""
attributes_list = ["ping_type", "os", "app_version", "app_build_id", "channel"]
metric_attributes_list = [
"metric",
"metric_type",
"key",
"agg_type",
]
metric_attributes_list = ["metric", "metric_type", "key", "agg_type"]
fixed_attributes = ["app_version", "channel"]
cubed_attributes = [x for x in attributes_list if x not in fixed_attributes]
return dict(

Просмотреть файл

@ -1,9 +1,6 @@
{{ header }}
{% from 'macros.sql' import enumerate_table_combinations %}
{# TODO: remove this import by factoring it out as a proper udf #}
{% include "clients_histogram_aggregates_v1.udf.sql" %}
WITH
{{
enumerate_table_combinations(
@ -13,20 +10,6 @@ WITH
attribute_combinations
)
}},
-- Ensure there is a single record per client id
deduplicated_combos AS (
SELECT
client_id,
{{ attributes }},
udf_merged_user_data(
ARRAY_CONCAT_AGG(histogram_aggregates)
) AS histogram_aggregates
FROM
all_combos
GROUP BY
client_id,
{{ attributes }}
),
normalized_histograms AS (
SELECT
{{ attributes }},
@ -37,7 +20,7 @@ normalized_histograms AS (
FROM unnest(histogram_aggregates)
)AS histogram_aggregates
FROM
deduplicated_combos
all_combos
),
unnested AS (
SELECT

Просмотреть файл

@ -2,9 +2,6 @@
{% include "scalar_bucket_counts_v1.udf.sql" %}
{% from 'macros.sql' import enumerate_table_combinations %}
{# TODO: remove this import by factoring it out as a proper udf #}
{% include "clients_scalar_aggregates_v1.udf.sql" %}
WITH
{{
enumerate_table_combinations(
@ -14,20 +11,6 @@ WITH
attribute_combinations
)
}},
-- Ensure there is a single record per client id
deduplicated_combos AS (
SELECT
client_id,
{{ attributes }},
udf_merged_user_data(
ARRAY_CONCAT_AGG(scalar_aggregates)
) AS scalar_aggregates
FROM
all_combos
GROUP BY
client_id,
{{ attributes }}
),
bucketed_booleans AS (
SELECT
client_id,
@ -37,17 +20,17 @@ bucketed_booleans AS (
NULL AS bucket_count,
udf_boolean_buckets(scalar_aggregates) AS scalar_aggregates,
FROM
deduplicated_combos
all_combos
),
log_min_max AS (
SELECT
metric,
key,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) range_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) range_max,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) as range_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) as range_max,
100 as bucket_count
FROM
deduplicated_combos
all_combos
CROSS JOIN UNNEST(scalar_aggregates)
WHERE
metric_type <> "boolean"
@ -76,7 +59,7 @@ bucketed_scalars AS (
FORMAT("%.*f", 2, mozfun.glam.histogram_bucket_from_value(buckets, value) + 0.0001)
AS STRING) AS bucket
FROM
deduplicated_combos
all_combos
CROSS JOIN UNNEST(scalar_aggregates)
LEFT JOIN buckets_by_metric
USING(metric, key)
@ -119,7 +102,8 @@ SELECT
range_max,
bucket_count,
bucket,
COUNT(*) AS count
-- COUNT(*) would give the same result because there is one row per client and
-- bucket; COUNT(DISTINCT client_id) makes the client count explicit
COUNT(DISTINCT client_id) AS count
FROM
booleans_and_scalars
GROUP BY

Просмотреть файл

@ -1,43 +1,4 @@
-- query for org_mozilla_fenix_glam_nightly__histogram_bucket_counts_v1;
-- Collapse the histogram aggregates passed in `aggs` into a single entry per
-- (metric, metric_type, key, agg_type). The bucket arrays of entries that share
-- those four fields are concatenated and handed to mozfun.map.sum (presumably
-- summing the values per bucket key -- confirm against the mozfun definition).
CREATE TEMP FUNCTION udf_merged_user_data(aggs ANY TYPE)
RETURNS ARRAY<
  STRUCT<
    metric STRING,
    metric_type STRING,
    key STRING,
    agg_type STRING,
    value ARRAY<STRUCT<key STRING, value INT64>>
  >
> AS (
  (
    WITH merged_histograms AS (
      -- One output row per distinct metric identity found in the input array
      SELECT
        agg.metric,
        agg.metric_type,
        agg.key,
        agg.agg_type,
        mozfun.map.sum(ARRAY_CONCAT_AGG(agg.value)) AS value
      FROM
        UNNEST(aggs) AS agg
      GROUP BY
        agg.metric,
        agg.metric_type,
        agg.key,
        agg.agg_type
    )
    -- Repack the merged rows into the array-of-struct shape declared in RETURNS
    SELECT
      ARRAY_AGG((metric, metric_type, key, agg_type, value))
    FROM
      merged_histograms
  )
);
WITH
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
@ -72,26 +33,6 @@ all_combos AS (
CROSS JOIN
static_combos combo
),
-- Ensure there is a single record per client id
deduplicated_combos AS (
SELECT
client_id,
ping_type,
os,
app_version,
app_build_id,
channel,
udf_merged_user_data(ARRAY_CONCAT_AGG(histogram_aggregates)) AS histogram_aggregates
FROM
all_combos
GROUP BY
client_id,
ping_type,
os,
app_version,
app_build_id,
channel
),
normalized_histograms AS (
SELECT
ping_type,
@ -110,7 +51,7 @@ normalized_histograms AS (
UNNEST(histogram_aggregates)
) AS histogram_aggregates
FROM
deduplicated_combos
all_combos
),
unnested AS (
SELECT

Просмотреть файл

@ -89,92 +89,6 @@ RETURNS ARRAY<
)
);
-- Merge the scalar aggregates passed in `aggs` into one entry per
-- (metric, metric_type, key, agg_type), then append a recomputed 'avg' entry
-- per (metric, metric_type, key) derived from the merged 'sum' and 'count'.
--
-- Incoming 'avg' entries are discarded and NULL values are ignored. An 'avg'
-- entry is emitted only when the merged count is non-zero; previously a metric
-- with a 'sum' but no (or a zero) 'count' raised a division-by-zero error.
CREATE TEMP FUNCTION udf_merged_user_data(
  aggs ARRAY<STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>>
)
RETURNS ARRAY<
  STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
> AS (
  (
    WITH unnested AS (
      -- Averages are rebuilt from sum/count below, so stale ones are dropped here
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        value
      FROM
        UNNEST(aggs)
      WHERE
        agg_type != "avg"
    ),
    aggregated AS (
      -- agg_type is part of the grouping key, so each group uses exactly one
      -- branch of the CASE; agg_types not listed yield a NULL value
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        --format:off
        CASE agg_type
          WHEN 'max' THEN max(value)
          WHEN 'min' THEN min(value)
          WHEN 'count' THEN sum(value)
          WHEN 'sum' THEN sum(value)
          WHEN 'false' THEN sum(value)
          WHEN 'true' THEN sum(value)
        END AS value
        --format:on
      FROM
        unnested
      WHERE
        value IS NOT NULL
      GROUP BY
        metric,
        metric_type,
        key,
        agg_type
    ),
    scalar_count_and_sum AS (
      -- Spread the merged 'sum' and 'count' rows into two columns, relabelled
      -- as 'avg', so they can be folded together per metric in the next step
      SELECT
        metric,
        metric_type,
        key,
        'avg' AS agg_type,
        --format:off
        CASE WHEN agg_type = 'count' THEN value ELSE 0 END AS count_value,
        CASE WHEN agg_type = 'sum' THEN value ELSE 0 END AS sum_value
        --format:on
      FROM
        aggregated
      WHERE
        agg_type IN ('sum', 'count')
    ),
    scalar_averages AS (
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        SUM(sum_value) / SUM(count_value) AS value
      FROM
        scalar_count_and_sum
      GROUP BY
        metric,
        metric_type,
        key,
        agg_type
      HAVING
        -- No average exists without a count; skipping the group avoids a
        -- division-by-zero error that would abort the whole query
        SUM(count_value) <> 0
    ),
    merged_data AS (
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        value
      FROM
        aggregated
      UNION ALL
      SELECT
        metric,
        metric_type,
        key,
        agg_type,
        value
      FROM
        scalar_averages
    )
    -- Repack the merged rows into the array-of-struct shape declared in RETURNS
    SELECT
      ARRAY_AGG((metric, metric_type, key, agg_type, value))
    FROM
      merged_data
  )
);
WITH
-- Cross join with the attribute combinations to reduce the query complexity
-- with respect to the number of operations. A table with n rows cross joined
@ -209,26 +123,6 @@ all_combos AS (
CROSS JOIN
static_combos combo
),
-- Ensure there is a single record per client id
deduplicated_combos AS (
SELECT
client_id,
ping_type,
os,
app_version,
app_build_id,
channel,
udf_merged_user_data(ARRAY_CONCAT_AGG(scalar_aggregates)) AS scalar_aggregates
FROM
all_combos
GROUP BY
client_id,
ping_type,
os,
app_version,
app_build_id,
channel
),
bucketed_booleans AS (
SELECT
client_id,
@ -242,17 +136,17 @@ bucketed_booleans AS (
NULL AS bucket_count,
udf_boolean_buckets(scalar_aggregates) AS scalar_aggregates,
FROM
deduplicated_combos
all_combos
),
log_min_max AS (
SELECT
metric,
key,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) range_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) range_max,
LOG(IF(MIN(value) <= 0, 1, MIN(value)), 2) AS range_min,
LOG(IF(MAX(value) <= 0, 1, MAX(value)), 2) AS range_max,
100 AS bucket_count
FROM
deduplicated_combos
all_combos
CROSS JOIN
UNNEST(scalar_aggregates)
WHERE
@ -295,7 +189,7 @@ bucketed_scalars AS (
FORMAT("%.*f", 2, mozfun.glam.histogram_bucket_from_value(buckets, value) + 0.0001) AS STRING
) AS bucket
FROM
deduplicated_combos
all_combos
CROSS JOIN
UNNEST(scalar_aggregates)
LEFT JOIN
@ -359,7 +253,8 @@ SELECT
range_max,
bucket_count,
bucket,
COUNT(*) AS count
-- COUNT(*) would give the same result because there is one row per client and
-- bucket; COUNT(DISTINCT client_id) makes the client count explicit
COUNT(DISTINCT client_id) AS count
FROM
booleans_and_scalars
GROUP BY

Просмотреть файл

@ -20,6 +20,13 @@ python org_mozilla_fenix_glam_nightly__extract_user_counts_v1/test_minimal/data.
pytest -k extract_user_counts
```
The easiest way to generate SQL for testing is to run the following script from
the project root.
```bash
GENERATE_ONLY=true script/glam/test/test_glean_org_mozilla_fenix_glam_nightly
```
## Creating a new test
To create a new test, copy the `test_minimal` directory and rename it to reflect

Просмотреть файл

@ -32,7 +32,7 @@ CLIENTS_HISTOGRAM_AGGREGATES = [
{"key": "1", "value": 0},
{"key": "2", "value": 1},
],
},
}
],
}
]
@ -53,11 +53,8 @@ CLIENTS_DAILY_HISTOGRAM_AGGREGATES = [
"metric_type": "timing_distribution",
"key": "",
"agg_type": "summed_histogram",
"value": [
{"key": "0", "value": 1},
{"key": "1", "value": 0},
],
},
"value": [{"key": "0", "value": 1}, {"key": "1", "value": 0}],
}
],
}
]
@ -82,7 +79,7 @@ EXPECT = [
{"key": "1", "value": 0},
{"key": "2", "value": 1},
],
},
}
],
}
]

Просмотреть файл

@ -46,7 +46,7 @@ CLIENTS_DAILY_HISTOGRAM_AGGREGATES = [
{"key": "112863206", "value": 1},
{"key": "123078199", "value": 0},
],
},
}
],
}
]
@ -70,7 +70,7 @@ EXPECT = [
{"key": "112863206", "value": 1},
{"key": "123078199", "value": 0},
],
},
}
],
}
]

Просмотреть файл

@ -18,7 +18,7 @@ VIEW_USER_COUNTS = [
"app_build_id": APP_BUILD_ID,
"channel": "*",
"total_users": 44444,
},
}
]
EXPECT = [

Просмотреть файл

@ -1,6 +1,7 @@
"""Testing data for query."""
from pathlib import Path
from itertools import product
import yaml
@ -9,16 +10,19 @@ ROOT = Path(__file__).parent
UUID = "df735f02-efe5-4b07-b212-583bb99ba241"
SUBMISSION_DATE = "2020-10-01"
APP_BUILD_ID = "2020100100"
OS = "Android"
PING_TYPE = "metrics"
# NOTE: what happens when channel = "*"?
# See the scalar_bucket_counts minimal example for more details on the
# preconditions.
CLIENTS_HISTOGRAM_AGGREGATES = [
{
"sample_id": 1,
"client_id": UUID,
"ping_type": "*",
"os": "*",
"ping_type": PING_TYPE,
"os": OS,
"app_version": 84,
"app_build_id": "*",
"app_build_id": APP_BUILD_ID,
"channel": "*",
"histogram_aggregates": [
{
@ -27,41 +31,34 @@ CLIENTS_HISTOGRAM_AGGREGATES = [
"key": "",
"agg_type": "summed_histogram",
"value": [
{"key": "112863206", "value": 1},
{"key": "123078199", "value": 0},
{"key": "1", "value": 1},
{"key": "2", "value": 0},
],
},
}
],
}
]
BASE_ROW = {
"agg_type": "summed_histogram",
"app_build_id": "*",
"app_version": 84,
"channel": "*",
"key": "",
"metric": "network_tcp_connection",
"metric_type": "timing_distribution",
"os": "*",
"ping_type": "*",
"range_max": 2,
"record": {"key": "1", "value": 1.0},
}
EXPECT = [
{
"agg_type": "summed_histogram",
"app_build_id": "*",
"app_version": 84,
"channel": "*",
"key": "",
"metric": "network_tcp_connection",
"metric_type": "timing_distribution",
"os": "*",
"ping_type": "*",
"range_max": 123078199,
"record": {"key": "112863206", "value": 1.0},
},
{
"agg_type": "summed_histogram",
"app_build_id": "*",
"app_version": 84,
"channel": "*",
"key": "",
"metric": "network_tcp_connection",
"metric_type": "timing_distribution",
"os": "*",
"ping_type": "*",
"range_max": 123078199,
"record": {"key": "123078199", "value": 0.0},
},
{**BASE_ROW, **dict(zip(["record", "ping_type", "os", "app_build_id"], values))}
for values in product(
[{"key": "1", "value": 1.0}, {"key": "2", "value": 0.0}],
*zip([PING_TYPE, OS, APP_BUILD_ID], ["*"] * 3),
)
]
prefix = "glam_etl"

Просмотреть файл

@ -1,15 +1,15 @@
- agg_type: summed_histogram
app_build_id: '*'
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 123078199
record:
key: '112863206'
os: Android
ping_type: metrics
range_max: 2
record: &id001
key: '1'
value: 1.0
- agg_type: summed_histogram
app_build_id: '*'
@ -18,9 +18,163 @@
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 123078199
record:
key: '123078199'
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 2
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 2
record: &id002
key: '2'
value: 0.0
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 2
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 2
record: *id002

Просмотреть файл

@ -1,4 +1,4 @@
- app_build_id: '*'
- app_build_id: '2020100100'
app_version: 84
channel: '*'
client_id: df735f02-efe5-4b07-b212-583bb99ba241
@ -8,10 +8,10 @@
metric: network_tcp_connection
metric_type: timing_distribution
value:
- key: '112863206'
- key: '1'
value: 1
- key: '123078199'
- key: '2'
value: 0
os: '*'
ping_type: '*'
os: Android
ping_type: metrics
sample_id: 1

Просмотреть файл

@ -0,0 +1,99 @@
"""Testing data for query."""
from pathlib import Path
from itertools import product
from uuid import uuid4
import yaml
# Directory holding this script; the generated YAML fixtures are written next to it.
ROOT = Path(__file__).parent

SUBMISSION_DATE = "2020-10-01"
APP_BUILD_ID = "2020100100"
OS = "Android"
PING_TYPE = "metrics"


def _client_row(buckets):
    """Build one input row for a freshly generated client.

    `buckets` is a sequence of (bucket key, count) pairs that become the
    client's single network_tcp_connection summed histogram.
    """
    return {
        "sample_id": 1,
        "client_id": str(uuid4()),
        "ping_type": PING_TYPE,
        "os": OS,
        "app_version": 84,
        "app_build_id": APP_BUILD_ID,
        "channel": "*",
        "histogram_aggregates": [
            {
                "metric": "network_tcp_connection",
                "metric_type": "timing_distribution",
                "key": "",
                "agg_type": "summed_histogram",
                "value": [
                    {"key": bucket_key, "value": count} for bucket_key, count in buckets
                ],
            }
        ],
    }


# NOTE: each client contributes a total of 1 to the final aggregate
CLIENTS_HISTOGRAM_AGGREGATES = [
    _client_row([("1", 1), ("2", 0)]),
    _client_row([("1", 1), ("3", 1)]),
]

# Defaults shared by every expected row; the comprehension below overrides
# "record", "ping_type", "os" and "app_build_id" for each combination.
BASE_ROW = {
    "agg_type": "summed_histogram",
    "app_build_id": "*",
    "app_version": 84,
    "channel": "*",
    "key": "",
    "metric": "network_tcp_connection",
    "metric_type": "timing_distribution",
    "os": "*",
    "ping_type": "*",
    "range_max": 3,
    "record": {"key": "1", "value": 1.0},
}

# Expected bucket records. Each dict is deliberately reused (not copied) across
# rows so yaml.dump emits the same anchors/aliases as before.
_RECORDS = [
    {"key": "1", "value": 1.5},
    {"key": "2", "value": 0.0},
    {"key": "3", "value": 0.5},
]

# Every cubed attribute appears both with its concrete value and as "*".
_ATTRIBUTE_CHOICES = [(PING_TYPE, "*"), (OS, "*"), (APP_BUILD_ID, "*")]

EXPECT = [
    {
        **BASE_ROW,
        "record": record,
        "ping_type": ping_type,
        "os": os_name,
        "app_build_id": build_id,
    }
    for record, ping_type, os_name, build_id in product(_RECORDS, *_ATTRIBUTE_CHOICES)
]

prefix = "glam_etl"

tables = [
    (
        f"{prefix}.org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1.yaml",
        CLIENTS_HISTOGRAM_AGGREGATES,
    ),
    ("expect.yaml", EXPECT),
]

# Write each fixture next to this script.
for name, data in tables:
    with open(ROOT / name, "w") as handle:
        yaml.dump(data, handle)

Просмотреть файл

@ -0,0 +1,270 @@
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: &id001
key: '1'
value: 1.5
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id001
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: &id002
key: '2'
value: 0.0
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id002
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: &id003
key: '3'
value: 0.5
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: metrics
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: metrics
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: Android
ping_type: '*'
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '2020100100'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id003
- agg_type: summed_histogram
app_build_id: '*'
app_version: 84
channel: '*'
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
os: '*'
ping_type: '*'
range_max: 3
record: *id003

Просмотреть файл

@ -0,0 +1,47 @@
- mode: NULLABLE
name: sample_id
type: INTEGER
- mode: NULLABLE
name: client_id
type: STRING
- mode: NULLABLE
name: ping_type
type: STRING
- mode: NULLABLE
name: os
type: STRING
- mode: NULLABLE
name: app_version
type: INTEGER
- mode: NULLABLE
name: app_build_id
type: STRING
- mode: NULLABLE
name: channel
type: STRING
- fields:
- mode: NULLABLE
name: metric
type: STRING
- mode: NULLABLE
name: metric_type
type: STRING
- mode: NULLABLE
name: key
type: STRING
- mode: NULLABLE
name: agg_type
type: STRING
- fields:
- mode: NULLABLE
name: key
type: STRING
- mode: NULLABLE
name: value
type: INTEGER
mode: REPEATED
name: value
type: RECORD
mode: REPEATED
name: histogram_aggregates
type: RECORD

Просмотреть файл

@ -0,0 +1,34 @@
- app_build_id: '2020100100'
app_version: 84
channel: '*'
client_id: 5c2364fc-46d4-4537-9575-a8ddaba2b42f
histogram_aggregates:
- agg_type: summed_histogram
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
value:
- key: '1'
value: 1
- key: '2'
value: 0
os: Android
ping_type: metrics
sample_id: 1
- app_build_id: '2020100100'
app_version: 84
channel: '*'
client_id: 813d66e1-57dc-46aa-84ec-682b48cc2e3c
histogram_aggregates:
- agg_type: summed_histogram
key: ''
metric: network_tcp_connection
metric_type: timing_distribution
value:
- key: '1'
value: 1
- key: '3'
value: 1
os: Android
ping_type: metrics
sample_id: 1

Просмотреть файл

@ -0,0 +1,12 @@
- name: submission_date
type: DATE
value: '2020-10-01'
- name: min_sample_id
type: INT64
value: 0
- name: max_sample_id
type: INT64
value: 99
- name: sample_size
type: INT64
value: 100

Просмотреть файл

@ -1,6 +1,7 @@
"""Testing data for query."""
from pathlib import Path
from uuid import uuid4
from itertools import product
import yaml
@ -8,13 +9,18 @@ ROOT = Path(__file__).parent
SUBMISSION_DATE = "2020-10-01"
APP_BUILD_ID = "2020100100"
OS = "Android"
PING_TYPE = "metrics"
# Other tests: non * fields
# Testing precondition: ping_type, os, and app_build_id must not be "*". See
# the scalar_bucket_counts parameters in models.py to see which fields are
# used in the static combinations. If these are set to "*", then they will
# be double counted...
CLIENTS_SCALAR_AGGREGATES = [
{
"client_id": str(uuid4()),
"ping_type": "*",
"os": "*",
"ping_type": PING_TYPE,
"os": OS,
"app_version": 84,
"app_build_id": APP_BUILD_ID,
"channel": "*",
@ -30,8 +36,8 @@ CLIENTS_SCALAR_AGGREGATES = [
},
{
"client_id": str(uuid4()),
"ping_type": "*",
"os": "*",
"ping_type": PING_TYPE,
"os": OS,
"app_version": 84,
"app_build_id": APP_BUILD_ID,
"channel": "*",
@ -47,108 +53,36 @@ CLIENTS_SCALAR_AGGREGATES = [
},
]
# TODO: why are the range_min and range_max set at these values?
# We must generate the set of combinations; each one of these has the same
# values, though.
BASE_ROW = {
"agg_type": "histogram",
"app_build_id": "*",
"app_version": 84,
"bucket": "4.00",
"bucket_count": 100,
"channel": "*",
"client_agg_type": "count",
"count": 1,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 3.0,
"range_min": 2.0,
}
# Didn't intend to code golf. This enumerates all of the "static combinations"
# by taking the cross product of all values. Each of these can take on a value
# from each of the clients above. Since each attribute combination has a single
# client, we do not have to change the "count" in the base row.
EXPECT = [
{
"agg_type": "histogram",
"app_build_id": "*",
"app_version": 84,
"bucket": "16.00",
"bucket_count": 100,
"channel": "*",
"client_agg_type": "count",
"count": 1,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{
"agg_type": "histogram",
"app_build_id": "*",
"app_version": 84,
"bucket": "32.00",
"bucket_count": 100,
"channel": "*",
"client_agg_type": "count",
"count": 1,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{
"agg_type": "histogram",
"app_build_id": "*",
"app_version": 84,
"bucket_count": 100,
"channel": "*",
"client_agg_type": "avg",
"count": 2,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{
"agg_type": "histogram",
"app_build_id": "2020100100",
"app_version": 84,
"bucket": "16.00",
"bucket_count": 100,
"channel": "*",
"client_agg_type": "count",
"count": 1,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{
"agg_type": "histogram",
"app_build_id": "2020100100",
"app_version": 84,
"bucket": "32.00",
"bucket_count": 100,
"channel": "*",
"client_agg_type": "count",
"count": 1,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{
"agg_type": "histogram",
"app_build_id": "2020100100",
"app_version": 84,
"bucket_count": 100,
"channel": "*",
"client_agg_type": "avg",
"count": 2,
"key": "",
"metric": "places_manager_write_query_count",
"metric_type": "counter",
"os": "*",
"ping_type": "*",
"range_max": 5.0,
"range_min": 0.0,
},
{**BASE_ROW, **dict(zip(["bucket", "ping_type", "os", "app_build_id"], values))}
for values in product(
["4.00", "8.00"], *zip([PING_TYPE, OS, APP_BUILD_ID], ["*"] * 3)
)
]
prefix = "glam_etl"

Просмотреть файл

@ -1,51 +1,97 @@
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '16.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '32.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket_count: 100
channel: '*'
client_agg_type: avg
count: 2
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '16.00'
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: '*'
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: '*'
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
@ -55,12 +101,12 @@
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_build_id: '*'
app_version: 84
bucket: '32.00'
bucket: '4.00'
bucket_count: 100
channel: '*'
client_agg_type: count
@ -70,19 +116,125 @@
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: avg
count: 2
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: metrics
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: '*'
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: Android
ping_type: '*'
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '2020100100'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: '*'
range_max: 5.0
range_min: 0.0
range_max: 3.0
range_min: 2.0
- agg_type: histogram
app_build_id: '*'
app_version: 84
bucket: '8.00'
bucket_count: 100
channel: '*'
client_agg_type: count
count: 1
key: ''
metric: places_manager_write_query_count
metric_type: counter
os: '*'
ping_type: '*'
range_max: 3.0
range_min: 2.0

Просмотреть файл

@ -1,9 +1,9 @@
- app_build_id: '2020100100'
app_version: 84
channel: '*'
client_id: e4b68766-0c07-4896-92d3-0f920dc202f0
os: '*'
ping_type: '*'
client_id: 885422ea-a5fb-489e-b5ac-efa2d57d22f4
os: Android
ping_type: metrics
scalar_aggregates:
- agg_type: count
key: ''
@ -13,9 +13,9 @@
- app_build_id: '2020100100'
app_version: 84
channel: '*'
client_id: 8d20cc29-7bd8-4595-85a0-43aecc9e5432
os: '*'
ping_type: '*'
client_id: 8f206cdb-95d1-46d1-8295-7cc033c76b87
os: Android
ping_type: metrics
scalar_aggregates:
- agg_type: count
key: ''

Просмотреть файл

@ -24,7 +24,7 @@ CLIENTS_SCALAR_AGGREGATES = [
"key": "",
"agg_type": "count",
"value": 4.0,
},
}
],
},
{
@ -41,7 +41,7 @@ CLIENTS_SCALAR_AGGREGATES = [
"key": "",
"agg_type": "count",
"value": 8.0,
},
}
],
},
]

Просмотреть файл

@ -32,9 +32,9 @@ EXPECT = [
{
"agg_type": "histogram",
"aggregates": [
{"key": "1.00", "value": 0.16666666666666666},
{"key": "2.00", "value": 0.6666666666666666},
{"key": "4.00", "value": 0.16666666666666666},
{"key": "1.00", "value": 0.166_666_666_666_666_66},
{"key": "2.00", "value": 0.666_666_666_666_666_6},
{"key": "4.00", "value": 0.166_666_666_666_666_66},
],
"app_build_id": "*",
"app_version": 84,

Просмотреть файл

@ -41,10 +41,7 @@ def main(test_name):
# function. We'll also include dates in the future. There is a new
# version every day.
rows = [input_row(i, i, i) for i in range(-10, HISTORY_DAYS + 2)]
yaml.dump(
sorted(rows, key=lambda x: x["client_info"]["app_build"]) * 6,
fp,
)
yaml.dump(sorted(rows, key=lambda x: x["client_info"]["app_build"]) * 6, fp)
# bad rows, versions less than 100 put before and after the 100 mark. The
# one for fenix will probably get filtered out because of the channel norm
# udf.