fix(glam_fog) SUM overflow (#6218)
* fix(glam_fog): Create function histogram_filter_high_values to prevent INT64 overflow * fix(glam_fog): Use histogram_filter_high_values to avoid overflow
This commit is contained in:
Родитель
7d613414e8
Коммит
68937b4ad0
|
@ -71,7 +71,7 @@ aggregated AS (
|
|||
{{ attributes }},
|
||||
metric,
|
||||
metric_type,
|
||||
mozfun.map.sum(ARRAY_CONCAT_AGG(value)) as value
|
||||
mozfun.map.sum(ARRAY_CONCAT_AGG(mozfun.glam.histogram_filter_high_values(value))) as value
|
||||
FROM
|
||||
flattened_histograms
|
||||
GROUP BY
|
||||
|
|
|
@ -18,22 +18,6 @@
|
|||
{% endif %}
|
||||
{% endset %}
|
||||
|
||||
CREATE TEMP FUNCTION filter_values(aggs ARRAY<STRUCT<key STRING, value INT64>>)
|
||||
RETURNS ARRAY<STRUCT<key STRING, value INT64>>
|
||||
AS (
|
||||
ARRAY(
|
||||
SELECT AS STRUCT agg.key, SUM(agg.value) AS value
|
||||
FROM UNNEST(aggs) agg
|
||||
-- Prevent overflows by only keeping buckets where value is less than 2^40
|
||||
-- allowing 2^24 entries. This value was chosen somewhat abitrarily, typically
|
||||
-- the max histogram value is somewhere on the order of ~20 bits.
|
||||
-- Negative values are incorrect and should not happen but were observed,
|
||||
-- probably due to some bit flips.
|
||||
WHERE agg.value BETWEEN 0 AND POW(2, 40)
|
||||
GROUP BY agg.key
|
||||
)
|
||||
);
|
||||
|
||||
WITH extracted_accumulated AS (
|
||||
SELECT
|
||||
*
|
||||
|
@ -84,7 +68,7 @@ aggregated_daily AS (
|
|||
SELECT
|
||||
{{ attributes }},
|
||||
{{ metric_attributes }},
|
||||
mozfun.map.sum(ARRAY_CONCAT_AGG(filter_values(value))) AS value
|
||||
mozfun.map.sum(ARRAY_CONCAT_AGG(mozfun.glam.histogram_filter_high_values(value))) AS value
|
||||
FROM
|
||||
filtered_daily
|
||||
GROUP BY
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
description: |
|
||||
Prevent overflows by only keeping buckets where value is less than 2^40
|
||||
allowing 2^24 entries. This value was chosen somewhat abitrarily, typically
|
||||
the max histogram value is somewhere on the order of ~20 bits.
|
||||
Negative values are incorrect and should not happen but were observed,
|
||||
probably due to some bit flips.
|
||||
friendly_name: Histogram filter high values
|
|
@ -0,0 +1,44 @@
|
|||
CREATE OR REPLACE FUNCTION glam.histogram_filter_high_values(
|
||||
aggs ARRAY<STRUCT<key STRING, value INT64>>
|
||||
)
|
||||
RETURNS ARRAY<STRUCT<key STRING, value INT64>> AS (
|
||||
ARRAY(
|
||||
SELECT AS STRUCT
|
||||
agg.key,
|
||||
SUM(agg.value) AS value
|
||||
FROM
|
||||
UNNEST(aggs) agg
|
||||
WHERE
|
||||
agg.value
|
||||
BETWEEN 0
|
||||
AND POW(2, 40)
|
||||
GROUP BY
|
||||
agg.key
|
||||
)
|
||||
);
|
||||
|
||||
SELECT
|
||||
assert.array_equals(
|
||||
ARRAY<STRUCT<key STRING, value FLOAT64>>[('key1', 3), ('key2', 1099511627776)],
|
||||
glam.histogram_filter_high_values(
|
||||
[
|
||||
STRUCT(
|
||||
'key1' AS key,
|
||||
1 AS value
|
||||
), -- The first two elements have the same key and should be summed
|
||||
STRUCT('key1' AS key, 2 AS value),
|
||||
STRUCT(
|
||||
'key2' AS key,
|
||||
1099511627776 AS value
|
||||
), -- This is exactly 2^40 and should not be excluded
|
||||
STRUCT('key2' AS key, 1099511627777 AS value), -- This exceeds 2^40 and should be excluded
|
||||
STRUCT('key1' AS key, -5 AS value) -- Should be excluded due to being negative
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
SELECT
|
||||
assert.array_equals(
|
||||
ARRAY<STRUCT<key STRING, value FLOAT64>>[],
|
||||
glam.histogram_filter_high_values([])
|
||||
);
|
Загрузка…
Ссылка в новой задаче