Migrate mode UDFs
This commit is contained in:
Родитель
e2361ee4b4
Коммит
d6371be8b1
|
@ -0,0 +1,2 @@
|
|||
description: 'Glean Percentile'
|
||||
friendly_name: Glean Percentile
|
|
@ -0,0 +1,51 @@
|
|||
/*
Returns the key of the histogram bucket at which the cumulative distribution
first reaches the requested percentile.

Parameters:
  percentile - a value between 0 and 100 inclusive; anything outside that
               range raises an error.
  histogram  - buckets as (key, value) pairs, scanned in the order given.
  type       - accepted for the caller's convenience; the body does not read it.

Returns the key of the selected bucket through the declared FLOAT64 return
type, or NULL when the histogram is NULL, empty, or its values do not sum to a
positive number. No interpolation is performed, so the result undershoots the
true percentile.
*/
CREATE OR REPLACE FUNCTION glean.percentile(
  percentile FLOAT64,
  histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
  type STRING
)
RETURNS FLOAT64
LANGUAGE js
AS
  '''
  if (percentile < 0 || percentile > 100) {
    throw "percentile must be a value between 0 and 100";
  }

  // An absent or empty histogram has no bucket to select; answer NULL instead
  // of failing with an unrelated TypeError further down.
  if (!histogram || histogram.length === 0) {
    return null;
  }

  let values = histogram.map(bucket => bucket.value);
  // The explicit initial value keeps reduce well-defined for any array length.
  let total = values.reduce((a, b) => a + b, 0);

  // Without a positive total the normalization below would divide by zero and
  // every comparison against the percentile would be made on NaN.
  if (!(total > 0)) {
    return null;
  }

  let normalized = values.map(value => value / total);

  // Find the index into the cumulative distribution function that corresponds
  // to the percentile. This undershoots the true value of the percentile.
  let acc = 0;
  let index = null;
  for (let i = 0; i < normalized.length; i++) {
    acc += normalized[i];
    index = i;
    if (acc >= percentile / 100) {
      break;
    }
  }

  // NOTE: we do not perform geometric or linear interpolation, but this would
  // be the place to implement it.
  return histogram[index].key;
''';
|
||||
|
||||
-- Half of the total weight (4) is first reached inside the "2" bucket.
SELECT
  assert_equals(
    2,
    glean.percentile(
      50.0,
      [
        STRUCT("0" AS key, CAST(1 AS FLOAT64) AS value),
        STRUCT("2" AS key, CAST(2 AS FLOAT64) AS value),
        STRUCT("3" AS key, CAST(1 AS FLOAT64) AS value)
      ],
      "timing_distribution"
    )
  );
|
||||
|
||||
#xfail
-- A percentile above 100 is rejected by the function's range check.
SELECT
  glean.percentile(
    101.0,
    [
      STRUCT("0" AS key, CAST(1 AS FLOAT64) AS value),
      STRUCT("2" AS key, CAST(2 AS FLOAT64) AS value),
      STRUCT("3" AS key, CAST(1 AS FLOAT64) AS value)
    ],
    "timing_distribution"
  );
|
|
@ -6,12 +6,12 @@ Merge an array of histograms into a single histogram.
|
|||
*/
|
||||
CREATE OR REPLACE FUNCTION hist.merge(histogram_list ANY TYPE) AS (
|
||||
STRUCT(
|
||||
mode.last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
|
||||
stats.mode_last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
|
||||
(SELECT SUM(`sum`) FROM UNNEST(histogram_list)) AS `sum`,
|
||||
mode.last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
|
||||
stats.mode_last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
|
||||
[
|
||||
mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
|
||||
mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
|
||||
stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
|
||||
stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
|
||||
] AS `range`,
|
||||
ARRAY(
|
||||
SELECT AS STRUCT
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
description: 'Returns the most frequently occurring element in an array. In the case
|
||||
of multiple values tied for the highest count, it returns the value that appears
|
||||
latest in the array. Nulls are ignored. See also: `stats.mode_last_retain_nulls`,
|
||||
which retains nulls.'
|
||||
friendly_name: Mode Last
|
|
@ -0,0 +1,28 @@
|
|||
/*
Finds the mode of an array: the element that occurs most often.

When several elements share the highest count, the one whose final occurrence
lies closest to the end of the array wins. NULL elements are not counted.

See also: `stats.mode_last_retain_nulls`, which counts NULLs as well.
*/
CREATE OR REPLACE FUNCTION stats.mode_last(list ANY TYPE) AS (
  (
    SELECT
      candidate
    FROM
      (
        SELECT
          item AS candidate,
          -- COUNT(item) skips NULLs, so a NULL group always scores zero.
          COUNT(item) AS occurrences,
          MAX(item_offset) AS last_offset
        FROM
          UNNEST(list) AS item
          WITH OFFSET AS item_offset
        GROUP BY
          item
      )
    ORDER BY
      occurrences DESC,
      last_offset DESC
    LIMIT
      1
  )
);
|
||||
|
||||
-- Tests
SELECT
  -- A single clear winner.
  assert_equals('bar', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
  -- A tie on count is settled by the later last occurrence.
  assert_equals('baz', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
  -- NULLs outnumber 'foo' but are not counted.
  assert_equals('foo', stats.mode_last([NULL, 'foo', NULL]));
|
|
@ -0,0 +1,5 @@
|
|||
description: 'Returns the most frequently occurring element in an array. In the case
|
||||
of multiple values tied for the highest count, it returns the value that appears
|
||||
latest in the array. Nulls are retained. See also: `stats.mode_last`, which ignores
|
||||
nulls.'
|
||||
friendly_name: Mode Last Retain Nulls
|
|
@ -0,0 +1,28 @@
|
|||
/*
Finds the mode of an array: the element that occurs most often.

When several elements share the highest count, the one whose final occurrence
lies closest to the end of the array wins. NULL elements are counted like any
other value, so NULL can be the result.

See also: `stats.mode_last`, which ignores NULLs.
*/
CREATE OR REPLACE FUNCTION stats.mode_last_retain_nulls(list ANY TYPE) AS (
  (
    SELECT
      candidate
    FROM
      (
        SELECT
          item AS candidate,
          -- COUNT(*) counts rows, so the NULL group competes on equal terms.
          COUNT(*) AS occurrences,
          MAX(item_offset) AS last_offset
        FROM
          UNNEST(list) AS item
          WITH OFFSET AS item_offset
        GROUP BY
          item
      )
    ORDER BY
      occurrences DESC,
      last_offset DESC
    LIMIT
      1
  )
);
|
||||
|
||||
-- Tests
SELECT
  -- A single clear winner.
  assert_equals('bar', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
  -- A tie on count is settled by the later last occurrence.
  assert_equals('baz', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
  -- NULL appears twice and therefore wins.
  assert_equals(CAST(NULL AS STRING), stats.mode_last_retain_nulls([NULL, 'foo', NULL]));
|
|
@ -1,31 +1,4 @@
|
|||
/*
|
||||
Returns the most frequently occurring element in an array.
|
||||
In the case of multiple values tied for the highest count, it returns the value
|
||||
that appears latest in the array. Nulls are ignored.
|
||||
See also: `udf.mode_last_retain_nulls`, which retains nulls.
|
||||
*/
|
||||
|
||||
CREATE OR REPLACE FUNCTION
|
||||
udf.mode_last(list ANY TYPE) AS ((
|
||||
SELECT
|
||||
_value
|
||||
FROM
|
||||
UNNEST(list) AS _value
|
||||
WITH
|
||||
OFFSET
|
||||
AS
|
||||
_offset
|
||||
GROUP BY
|
||||
_value
|
||||
ORDER BY
|
||||
COUNT(_value) DESC,
|
||||
MAX(_offset) DESC
|
||||
LIMIT
|
||||
1 ));
|
||||
|
||||
-- Test
|
||||
|
||||
SELECT
|
||||
assert_equals('bar', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
|
||||
assert_equals('baz', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
|
||||
assert_equals('foo', udf.mode_last([null, 'foo', null]));
|
||||
-- Legacy wrapper: the implementation has moved to mozfun as stats.mode_last,
-- and this function only forwards its argument there.
CREATE OR REPLACE FUNCTION udf.mode_last(list ANY TYPE) AS (mozfun.stats.mode_last(list));
|
||||
|
|
|
@ -1,28 +1,4 @@
|
|||
/*
|
||||
Returns the most frequently occurring element in an array.
|
||||
In the case of multiple values tied for the highest count, it returns the value
|
||||
that appears latest in the array. Nulls are retained.
|
||||
See also: `udf.mode_last`, which ignores nulls.
|
||||
*/
|
||||
-- Legacy wrapper around a function moved to mozfun.
|
||||
CREATE OR REPLACE FUNCTION udf.mode_last_retain_nulls(list ANY TYPE) AS (
|
||||
(
|
||||
SELECT
|
||||
_value
|
||||
FROM
|
||||
UNNEST(list) AS _value
|
||||
WITH OFFSET AS _offset
|
||||
GROUP BY
|
||||
_value
|
||||
ORDER BY
|
||||
COUNT(*) DESC,
|
||||
MAX(_offset) DESC
|
||||
LIMIT
|
||||
1
|
||||
)
|
||||
mozfun.stats.mode_last_retain_nulls(list)
|
||||
);
|
||||
|
||||
-- Test
|
||||
SELECT
|
||||
assert_equals('bar', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
|
||||
assert_equals('baz', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
|
||||
assert_equals(CAST(NULL AS STRING), udf.mode_last_retain_nulls([NULL, 'foo', NULL]));
|
||||
|
|
|
@ -1,51 +1,8 @@
|
|||
-- Legacy wrapper around a function moved to mozfun.
|
||||
CREATE OR REPLACE FUNCTION udf_js.glean_percentile(
|
||||
percentile FLOAT64,
|
||||
histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
|
||||
type STRING
|
||||
)
|
||||
RETURNS FLOAT64
|
||||
LANGUAGE js
|
||||
AS
|
||||
'''
|
||||
if (percentile < 0 || percentile > 100) {
|
||||
throw "percentile must be a value between 0 and 100";
|
||||
}
|
||||
|
||||
let values = histogram.map(bucket => bucket.value);
|
||||
let total = values.reduce((a, b) => a + b);
|
||||
let normalized = values.map(value => value / total);
|
||||
|
||||
// Find the index into the cumulative distribution function that corresponds
|
||||
// to the percentile. This undershoots the true value of the percentile.
|
||||
let acc = 0;
|
||||
let index = null;
|
||||
for (let i = 0; i < normalized.length; i++) {
|
||||
acc += normalized[i];
|
||||
index = i;
|
||||
if (acc >= percentile / 100) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: we do not perform geometric or linear interpolation, but this would
|
||||
// be the place to implement it.
|
||||
return histogram[index].key;
|
||||
''';
|
||||
|
||||
SELECT
|
||||
assert_equals(
|
||||
2,
|
||||
udf_js.glean_percentile(
|
||||
50.0,
|
||||
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
|
||||
"timing_distribution"
|
||||
)
|
||||
);
|
||||
|
||||
#xfail
|
||||
SELECT
|
||||
udf_js.glean_percentile(
|
||||
101.0,
|
||||
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
|
||||
"timing_distribution"
|
||||
);
|
||||
) AS (
|
||||
mozfun.glean.percentile(percentile, histogram, type)
|
||||
);
|
||||
|
|
Загрузка…
Ссылка в новой задаче