* fix(glam_fog): Create function histogram_filter_high_values to prevent INT64 overflow

* fix(glam_fog): Use histogram_filter_high_values to avoid overflow
This commit is contained in:
Eduardo Filho 2024-09-18 12:40:41 -04:00 коммит произвёл GitHub
Родитель 7d613414e8
Коммит 68937b4ad0
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
5 изменённых файлов: 53 добавлений и 18 удалений

Просмотреть файл

@ -71,7 +71,7 @@ aggregated AS (
{{ attributes }}, {{ attributes }},
metric, metric,
metric_type, metric_type,
mozfun.map.sum(ARRAY_CONCAT_AGG(value)) as value mozfun.map.sum(ARRAY_CONCAT_AGG(mozfun.glam.histogram_filter_high_values(value))) as value
FROM FROM
flattened_histograms flattened_histograms
GROUP BY GROUP BY

Просмотреть файл

@ -18,22 +18,6 @@
{% endif %} {% endif %}
{% endset %} {% endset %}
CREATE TEMP FUNCTION filter_values(aggs ARRAY<STRUCT<key STRING, value INT64>>)
RETURNS ARRAY<STRUCT<key STRING, value INT64>>
AS (
ARRAY(
SELECT AS STRUCT agg.key, SUM(agg.value) AS value
FROM UNNEST(aggs) agg
-- Prevent overflows by only keeping buckets where value is less than 2^40
-- allowing 2^24 entries. This value was chosen somewhat abitrarily, typically
-- the max histogram value is somewhere on the order of ~20 bits.
-- Negative values are incorrect and should not happen but were observed,
-- probably due to some bit flips.
WHERE agg.value BETWEEN 0 AND POW(2, 40)
GROUP BY agg.key
)
);
WITH extracted_accumulated AS ( WITH extracted_accumulated AS (
SELECT SELECT
* *
@ -84,7 +68,7 @@ aggregated_daily AS (
SELECT SELECT
{{ attributes }}, {{ attributes }},
{{ metric_attributes }}, {{ metric_attributes }},
mozfun.map.sum(ARRAY_CONCAT_AGG(filter_values(value))) AS value mozfun.map.sum(ARRAY_CONCAT_AGG(mozfun.glam.histogram_filter_high_values(value))) AS value
FROM FROM
filtered_daily filtered_daily
GROUP BY GROUP BY

Просмотреть файл

Просмотреть файл

@ -0,0 +1,7 @@
description: |
Prevent overflows by only keeping buckets where value is less than 2^40
allowing 2^24 entries. This value was chosen somewhat abitrarily, typically
the max histogram value is somewhere on the order of ~20 bits.
Negative values are incorrect and should not happen but were observed,
probably due to some bit flips.
friendly_name: Histogram filter high values

Просмотреть файл

@ -0,0 +1,44 @@
CREATE OR REPLACE FUNCTION glam.histogram_filter_high_values(
aggs ARRAY<STRUCT<key STRING, value INT64>>
)
RETURNS ARRAY<STRUCT<key STRING, value INT64>> AS (
ARRAY(
SELECT AS STRUCT
agg.key,
SUM(agg.value) AS value
FROM
UNNEST(aggs) agg
WHERE
agg.value
BETWEEN 0
AND POW(2, 40)
GROUP BY
agg.key
)
);
SELECT
assert.array_equals(
ARRAY<STRUCT<key STRING, value FLOAT64>>[('key1', 3), ('key2', 1099511627776)],
glam.histogram_filter_high_values(
[
STRUCT(
'key1' AS key,
1 AS value
), -- The first two elements have the same key and should be summed
STRUCT('key1' AS key, 2 AS value),
STRUCT(
'key2' AS key,
1099511627776 AS value
), -- This is exactly 2^40 and should not be excluded
STRUCT('key2' AS key, 1099511627777 AS value), -- This exceeds 2^40 and should be excluded
STRUCT('key1' AS key, -5 AS value) -- Should be excluded due to being negative
]
)
);
SELECT
assert.array_equals(
ARRAY<STRUCT<key STRING, value FLOAT64>>[],
glam.histogram_filter_high_values([])
);