This commit is contained in:
Anna Scholtz 2020-07-14 13:19:15 -07:00
Родитель e2361ee4b4
Коммит d6371be8b1
10 изменённых файлов: 133 добавлений и 108 удалений

Просмотреть файл

@ -0,0 +1,2 @@
description: 'Glean Percentile'
friendly_name: Glean Percentile

Просмотреть файл

@ -0,0 +1,51 @@
/*
Return the key of the histogram bucket at which the cumulative share of the
bucket values first reaches the requested percentile.

Arguments:
  percentile: value between 0 and 100 inclusive; anything outside raises.
  histogram:  array of (key, value) buckets. `value` is treated as the weight
              of the bucket and `key` is parsed as a number for the result.
              Buckets are walked in array order, so they are presumably
              expected to arrive sorted by key (TODO confirm against callers).
  type:       not used by the implementation.

Returns NULL when the histogram is NULL or empty, or when the bucket values do
not sum to a positive number, because no percentile is defined in those cases.
*/
CREATE OR REPLACE FUNCTION glean.percentile(
  percentile FLOAT64,
  histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
  type STRING
)
RETURNS FLOAT64
LANGUAGE js
AS
'''
  if (percentile < 0 || percentile > 100) {
    throw "percentile must be a value between 0 and 100";
  }
  // Without buckets there is nothing to select.
  if (!histogram || histogram.length === 0) {
    return null;
  }

  const values = histogram.map(bucket => bucket.value);
  // The explicit initial value keeps reduce() well defined for any length.
  const total = values.reduce((a, b) => a + b, 0);
  // Dividing by a zero (or NaN) total would turn every share into NaN and the
  // scan below would silently fall through to the last bucket.
  if (!(total > 0)) {
    return null;
  }
  const normalized = values.map(value => value / total);

  // Find the index into the cumulative distribution function that corresponds
  // to the percentile. This undershoots the true value of the percentile.
  const target = percentile / 100;
  let acc = 0;
  let index = 0;
  for (let i = 0; i < normalized.length; i++) {
    acc += normalized[i];
    index = i;
    if (acc >= target) {
      break;
    }
  }

  // NOTE: we do not perform geometric or linear interpolation, but this would
  // be the place to implement it.
  // Bucket keys are strings; convert explicitly to match RETURNS FLOAT64.
  return parseFloat(histogram[index].key);
''';
-- Test: the bucket values 1, 2, 1 give cumulative shares 0.25, 0.75, 1.0, so
-- the 50th percentile is first reached in the second bucket, whose key is "2".
SELECT
assert_equals(
2,
glean.percentile(
50.0,
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
"timing_distribution"
)
);
#xfail
-- A percentile of 101 is outside the 0-100 range the function accepts, so this
-- call raises. NOTE(review): the #xfail marker above presumably tells the test
-- harness that this statement is expected to fail -- confirm.
SELECT
glean.percentile(
101.0,
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
"timing_distribution"
);

Просмотреть файл

@ -6,12 +6,12 @@ Merge an array of histograms into a single histogram.
*/
CREATE OR REPLACE FUNCTION hist.merge(histogram_list ANY TYPE) AS (
STRUCT(
mode.last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
stats.mode_last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
(SELECT SUM(`sum`) FROM UNNEST(histogram_list)) AS `sum`,
mode.last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
stats.mode_last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
[
mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
] AS `range`,
ARRAY(
SELECT AS STRUCT

Просмотреть файл

@ -0,0 +1,5 @@
description: 'Returns the most frequently occurring element in an array. In the case
of multiple values tied for the highest count, it returns the value that appears
latest in the array. Nulls are ignored. See also: `stats.mode_last_retain_nulls`,
which retains nulls.'
friendly_name: Mode Last

Просмотреть файл

@ -0,0 +1,28 @@
/*
Find the value that occurs most often in `list`.
NULL entries are never counted, so a NULL can only be returned when the array
holds no non-NULL value at all. When several values share the top count, the
one whose final occurrence sits furthest toward the end of the array wins.
See also: `stats.mode_last_retain_nulls`, which retains nulls.
*/
CREATE OR REPLACE FUNCTION stats.mode_last(list ANY TYPE) AS (
  (
    SELECT
      tally.item
    FROM
      (
        -- One row per distinct value: how often it appears (NULLs count as
        -- zero) and the offset of its last appearance.
        SELECT
          item,
          COUNT(item) AS occurrences,
          MAX(item_offset) AS last_offset
        FROM
          UNNEST(list) AS item
          WITH OFFSET AS item_offset
        GROUP BY
          item
      ) AS tally
    ORDER BY
      tally.occurrences DESC,
      tally.last_offset DESC
    LIMIT
      1
  )
);
-- Test
SELECT
-- 'bar' appears twice, every other value once.
assert_equals('bar', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
-- 'bar' and 'baz' both appear twice; the last 'baz' comes after the last 'bar'.
assert_equals('baz', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-- NULL appears twice but is not counted, so the single 'foo' wins.
assert_equals('foo', stats.mode_last([NULL, 'foo', NULL]));

Просмотреть файл

@ -0,0 +1,5 @@
description: 'Returns the most frequently occurring element in an array. In the case
of multiple values tied for the highest count, it returns the value that appears
latest in the array. Nulls are retained. See also: `stats.mode_last`, which ignores
nulls.'
friendly_name: Mode Last Retain Nulls

Просмотреть файл

@ -0,0 +1,28 @@
/*
Find the value that occurs most often in `list`, treating NULL as a value in
its own right, so NULL is returned when it is the most frequent entry.
When several values share the top count, the one whose final occurrence sits
furthest toward the end of the array wins.
See also: `stats.mode_last`, which ignores nulls.
*/
CREATE OR REPLACE FUNCTION stats.mode_last_retain_nulls(list ANY TYPE) AS (
  (
    SELECT
      tally.item
    FROM
      (
        -- One row per distinct value (NULL included): how many array entries
        -- fall in the group and the offset of the group's last entry.
        SELECT
          item,
          COUNT(*) AS occurrences,
          MAX(item_offset) AS last_offset
        FROM
          UNNEST(list) AS item
          WITH OFFSET AS item_offset
        GROUP BY
          item
      ) AS tally
    ORDER BY
      tally.occurrences DESC,
      tally.last_offset DESC
    LIMIT
      1
  )
);
-- Test
SELECT
-- 'bar' appears twice, every other value once.
assert_equals('bar', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
-- 'bar' and 'baz' both appear twice; the last 'baz' comes after the last 'bar'.
assert_equals('baz', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-- NULL appears twice and is counted here, so it beats the single 'foo'.
assert_equals(CAST(NULL AS STRING), stats.mode_last_retain_nulls([NULL, 'foo', NULL]));

Просмотреть файл

@ -1,31 +1,4 @@
/*
Find the value that occurs most often in `list`.
NULL entries are never counted, so a NULL can only be returned when the array
holds no non-NULL value at all. When several values share the top count, the
one whose final occurrence sits furthest toward the end of the array wins.
See also: `udf.mode_last_retain_nulls`, which retains nulls.
*/
CREATE OR REPLACE FUNCTION udf.mode_last(list ANY TYPE) AS (
  (
    SELECT
      tally.item
    FROM
      (
        -- One row per distinct value: how often it appears (NULLs count as
        -- zero) and the offset of its last appearance.
        SELECT
          item,
          COUNT(item) AS occurrences,
          MAX(item_offset) AS last_offset
        FROM
          UNNEST(list) AS item
          WITH OFFSET AS item_offset
        GROUP BY
          item
      ) AS tally
    ORDER BY
      tally.occurrences DESC,
      tally.last_offset DESC
    LIMIT
      1
  )
);
-- Test
SELECT
-- 'bar' appears twice, every other value once.
assert_equals('bar', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
-- 'bar' and 'baz' both appear twice; the last 'baz' comes after the last 'bar'.
assert_equals('baz', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-- The two nulls are expected to lose to the single 'foo'.
assert_equals('foo', udf.mode_last([null, 'foo', null]));
-- Legacy wrapper around a function moved to mozfun.
-- Passes `list` through unchanged to mozfun.stats.mode_last and returns its
-- result, so the udf.mode_last name keeps resolving for existing callers.
CREATE OR REPLACE FUNCTION udf.mode_last(list ANY TYPE) AS (
mozfun.stats.mode_last(list)
);

Просмотреть файл

@ -1,28 +1,4 @@
/*
Returns the most frequently occurring element in an array.
In the case of multiple values tied for the highest count, it returns the value
that appears latest in the array. Nulls are retained.
See also: `udf.mode_last`, which ignores nulls.
*/
-- Legacy wrapper around a function moved to mozfun.
-- NOTE(review): the parenthesized subquery below and the call to
-- mozfun.stats.mode_last_retain_nulls(list) that follows it sit in the same
-- function body with no operator between them, which does not parse as one
-- expression. This looks like the removed inline implementation and its
-- replacement shown together without diff markers -- confirm which body is
-- current before editing.
CREATE OR REPLACE FUNCTION udf.mode_last_retain_nulls(list ANY TYPE) AS (
(
SELECT
_value
FROM
UNNEST(list) AS _value
WITH OFFSET AS _offset
GROUP BY
_value
ORDER BY
COUNT(*) DESC,
MAX(_offset) DESC
LIMIT
1
)
mozfun.stats.mode_last_retain_nulls(list)
);
-- Test
SELECT
-- 'bar' appears twice, every other value once.
assert_equals('bar', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
-- 'bar' and 'baz' both appear twice; the last 'baz' comes after the last 'bar'.
assert_equals('baz', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-- NULL appears twice and is expected to beat the single 'foo'.
assert_equals(CAST(NULL AS STRING), udf.mode_last_retain_nulls([NULL, 'foo', NULL]));

Просмотреть файл

@ -1,51 +1,8 @@
-- Legacy wrapper around a function moved to mozfun.
/*
Return the key of the histogram bucket at which the cumulative share of the
bucket values first reaches the requested percentile.

Arguments:
  percentile: value between 0 and 100 inclusive; anything outside raises.
  histogram:  array of (key, value) buckets. `value` is treated as the weight
              of the bucket and `key` is parsed as a number for the result.
              Buckets are walked in array order, so they are presumably
              expected to arrive sorted by key (TODO confirm against callers).
  type:       not used by the implementation.

Returns NULL when the histogram is NULL or empty, or when the bucket values do
not sum to a positive number, because no percentile is defined in those cases.
*/
CREATE OR REPLACE FUNCTION udf_js.glean_percentile(
  percentile FLOAT64,
  histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
  type STRING
)
RETURNS FLOAT64
LANGUAGE js
AS
'''
  if (percentile < 0 || percentile > 100) {
    throw "percentile must be a value between 0 and 100";
  }
  // Without buckets there is nothing to select.
  if (!histogram || histogram.length === 0) {
    return null;
  }

  const values = histogram.map(bucket => bucket.value);
  // The explicit initial value keeps reduce() well defined for any length.
  const total = values.reduce((a, b) => a + b, 0);
  // Dividing by a zero (or NaN) total would turn every share into NaN and the
  // scan below would silently fall through to the last bucket.
  if (!(total > 0)) {
    return null;
  }
  const normalized = values.map(value => value / total);

  // Find the index into the cumulative distribution function that corresponds
  // to the percentile. This undershoots the true value of the percentile.
  const target = percentile / 100;
  let acc = 0;
  let index = 0;
  for (let i = 0; i < normalized.length; i++) {
    acc += normalized[i];
    index = i;
    if (acc >= target) {
      break;
    }
  }

  // NOTE: we do not perform geometric or linear interpolation, but this would
  // be the place to implement it.
  // Bucket keys are strings; convert explicitly to match RETURNS FLOAT64.
  return parseFloat(histogram[index].key);
''';
-- Test: the bucket values 1, 2, 1 give cumulative shares 0.25, 0.75, 1.0, so
-- the 50th percentile is first reached in the second bucket, whose key is "2".
SELECT
assert_equals(
2,
udf_js.glean_percentile(
50.0,
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
"timing_distribution"
)
);
#xfail
-- A percentile of 101 is outside the 0-100 range the function accepts, so this
-- call raises. NOTE(review): the #xfail marker above presumably tells the test
-- harness that this statement is expected to fail -- confirm.
SELECT
udf_js.glean_percentile(
101.0,
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
"timing_distribution"
);
) AS (
mozfun.glean.percentile(percentile, histogram, type)
);