Migrate mode UDFs

2020-07-14 13:19:15 -07:00 · 2020-07-14 13:19:15 -07:00 · d6371be8b1
--- a/mozfun/glean/percentile/metadata.yaml
+++ b/mozfun/glean/percentile/metadata.yaml
@ -0,0 +1,2 @@
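+description: 'Returns the key of the histogram bucket at which the cumulative distribution reaches the given percentile (0-100). No interpolation is performed.'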
+friendly_name: Glean Percentile
--- a/mozfun/glean/percentile/udf.sql
+++ b/mozfun/glean/percentile/udf.sql
@ -0,0 +1,51 @@
+/*
+Returns the key of the first histogram bucket at which the cumulative share of
+counts reaches `percentile`, as a FLOAT64. No interpolation is performed, so
+the result undershoots the true percentile.
+
+percentile: value between 0 and 100 (inclusive); anything else throws.
+histogram: (key, value) buckets, accumulated in array order; keys are expected
+  to be numeric strings.
+type: accepted as part of the signature; not used by the body.
+
+Returns NULL when the histogram is NULL or empty.
+*/
+CREATE OR REPLACE FUNCTION glean.percentile(
+  percentile FLOAT64,
+  histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
+  type STRING
+)
+RETURNS FLOAT64
+LANGUAGE js
+AS
+  '''
+  if (percentile < 0 || percentile > 100) {
+      throw "percentile must be a value between 0 and 100";
+  }
+
+  // Without buckets there is nothing to select: return NULL rather than
+  // failing in reduce() or indexing the histogram with a null index.
+  if (!histogram || histogram.length === 0) {
+      return null;
+  }
+
+  let values = histogram.map(bucket => bucket.value);
+  let total = values.reduce((a, b) => a + b);
+  let normalized = values.map(value => value / total);
+
+  // Find the index into the cumulative distribution function that corresponds
+  // to the percentile. This undershoots the true value of the percentile.
+  // If the threshold is never reached (e.g. every count is 0, which makes the
+  // shares NaN), the loop runs to the end and the last bucket is selected.
+  let acc = 0;
+  let index = null;
+  for (let i = 0; i < normalized.length; i++) {
+      acc += normalized[i];
+      index = i;
+      if (acc >= percentile / 100) {
+          break;
+      }
+  }
+
+  // NOTE: we do not perform geometric or linear interpolation, but this would
+  // be the place to implement it.
+  // Bucket keys are strings; convert explicitly so the returned value matches
+  // the declared FLOAT64 return type.
+  return parseFloat(histogram[index].key);
+''';
+
+-- The counts 1, 2, 1 give cumulative shares 0.25, 0.75, 1.0, so the 50th
+-- percentile lands in the second bucket, whose key is 2.
+SELECT
+  assert_equals(
+    2,
+    glean.percentile(
+      50.0,
+      [
+        STRUCT("0" AS key, 1.0 AS value),
+        STRUCT("2" AS key, 2.0 AS value),
+        STRUCT("3" AS key, 1.0 AS value)
+      ],
+      "timing_distribution"
+    )
+  );
+
+#xfail
+-- 101 is outside the accepted 0-100 range, so this call is expected to fail.
+SELECT
+  glean.percentile(
+    101.0,
+    [
+      STRUCT("0" AS key, 1.0 AS value),
+      STRUCT("2" AS key, 2.0 AS value),
+      STRUCT("3" AS key, 1.0 AS value)
+    ],
+    "timing_distribution"
+  );
--- a/mozfun/hist/merge/udf.sql
+++ b/mozfun/hist/merge/udf.sql
@ -6,12 +6,12 @@ Merge an array of histograms into a single histogram.
 */
 CREATE OR REPLACE FUNCTION hist.merge(histogram_list ANY TYPE) AS (
  STRUCT(
-    mode.last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
+    stats.mode_last(ARRAY(SELECT bucket_count FROM UNNEST(histogram_list))) AS bucket_count,
    (SELECT SUM(`sum`) FROM UNNEST(histogram_list)) AS `sum`,
-    mode.last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
+    stats.mode_last(ARRAY(SELECT histogram_type FROM UNNEST(histogram_list))) AS histogram_type,
    [
-      mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
-      mode.last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
+      stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(0)] FROM UNNEST(histogram_list))),
+      stats.mode_last(ARRAY(SELECT `range`[SAFE_OFFSET(1)] FROM UNNEST(histogram_list)))
    ] AS `range`,
    ARRAY(
      SELECT AS STRUCT
--- a/mozfun/stats/mode_last/metadata.yaml
+++ b/mozfun/stats/mode_last/metadata.yaml
@ -0,0 +1,5 @@
+description: 'Returns the most frequently occurring element in an array. In the case
+  of multiple values tied for the highest count, it returns the value that appears
+  latest in the array. Nulls are ignored. See also: `stats.mode_last_retain_nulls`,
+  which retains nulls.'
+friendly_name: Mode Last
--- a/mozfun/stats/mode_last/udf.sql
+++ b/mozfun/stats/mode_last/udf.sql
@ -0,0 +1,28 @@
+/*
+Returns the most frequently occurring element in an array.
+When several values are tied for the highest count, the one whose last
+occurrence is latest in the array is returned. Nulls are ignored.
+See also: `stats.mode_last_retain_nulls`, which retains nulls.
+*/
+CREATE OR REPLACE FUNCTION stats.mode_last(list ANY TYPE) AS (
+  (
+    SELECT
+      elem
+    FROM
+      (
+        SELECT
+          elem,
+          -- COUNT(elem) skips NULLs, so the NULL group always scores 0.
+          COUNT(elem) AS occurrences,
+          MAX(elem_offset) AS last_offset
+        FROM
+          UNNEST(list) AS elem
+          WITH OFFSET AS elem_offset
+        GROUP BY
+          elem
+      )
+    ORDER BY
+      occurrences DESC,
+      last_offset DESC
+    LIMIT
+      1
+  )
+);
+
+-- Test: a clear winner, a tie broken by the latest occurrence, and NULLs ignored.
+SELECT
+  assert_equals('bar', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
+  assert_equals('baz', stats.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
+  assert_equals('foo', stats.mode_last([NULL, 'foo', NULL]));
--- a/mozfun/stats/mode_last_retain_nulls/metadata.yaml
+++ b/mozfun/stats/mode_last_retain_nulls/metadata.yaml
@ -0,0 +1,5 @@
+description: 'Returns the most frequently occurring element in an array. In the case
+  of multiple values tied for the highest count, it returns the value that appears
+  latest in the array. Nulls are retained. See also: `stats.mode_last`, which ignores
+  nulls.'
+friendly_name: Mode Last Retain Nulls
--- a/mozfun/stats/mode_last_retain_nulls/udf.sql
+++ b/mozfun/stats/mode_last_retain_nulls/udf.sql
@ -0,0 +1,28 @@
+/*
+Returns the most frequently occurring element in an array.
+When several values are tied for the highest count, the one whose last
+occurrence is latest in the array is returned. Nulls are retained.
+See also: `stats.mode_last`, which ignores nulls.
+*/
+CREATE OR REPLACE FUNCTION stats.mode_last_retain_nulls(list ANY TYPE) AS (
+  (
+    SELECT
+      elem
+    FROM
+      (
+        SELECT
+          elem,
+          -- COUNT(*) counts every row, so NULL elements compete like any value.
+          COUNT(*) AS occurrences,
+          MAX(elem_offset) AS last_offset
+        FROM
+          UNNEST(list) AS elem
+          WITH OFFSET AS elem_offset
+        GROUP BY
+          elem
+      )
+    ORDER BY
+      occurrences DESC,
+      last_offset DESC
+    LIMIT
+      1
+  )
+);
+
+-- Test: a clear winner, a tie broken by the latest occurrence, and NULL returned when it is the most frequent entry.
+SELECT
+  assert_equals('bar', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
+  assert_equals('baz', stats.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
+  assert_equals(CAST(NULL AS STRING), stats.mode_last_retain_nulls([NULL, 'foo', NULL]));
--- a/udf/mode_last.sql
+++ b/udf/mode_last.sql
@ -1,31 +1,4 @@
-/*
-Returns the most frequently occuring element in an array.
-In the case of multiple values tied for the highest count, it returns the value
-that appears latest in the array. Nulls are ignored.
-See also: `udf.mode_last_retain_nulls`, which retains nulls.
-*/
-
-CREATE OR REPLACE FUNCTION
-  udf.mode_last(list ANY TYPE) AS ((
-    SELECT
-      _value
-    FROM
-      UNNEST(list) AS _value
-    WITH
-    OFFSET
-      AS
-    _offset
-    GROUP BY
-      _value
-    ORDER BY
-      COUNT(_value) DESC,
-      MAX(_offset) DESC
-    LIMIT
-      1 ));
-
-- Test
-
-SELECT
-  assert_equals('bar', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'fred'])),
-  assert_equals('baz', udf.mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-  assert_equals('foo', udf.mode_last([null, 'foo', null]));
+-- Legacy wrapper: forwards `list` unchanged to mozfun.stats.mode_last and returns its result.
+CREATE OR REPLACE FUNCTION udf.mode_last(list ANY TYPE) AS (
+  mozfun.stats.mode_last(list)
+);
--- a/udf/mode_last_retain_nulls.sql
+++ b/udf/mode_last_retain_nulls.sql
@ -1,28 +1,4 @@
-/*
-Returns the most frequently occuring element in an array.
-In the case of multiple values tied for the highest count, it returns the value
-that appears latest in the array. Nulls are retained.
-See also: `udf.mode_last`, which ignores nulls.
-*/
+-- Legacy wrapper: forwards `list` unchanged to mozfun.stats.mode_last_retain_nulls and returns its result.
 CREATE OR REPLACE FUNCTION udf.mode_last_retain_nulls(list ANY TYPE) AS (
-  (
-    SELECT
-      _value
-    FROM
-      UNNEST(list) AS _value
-      WITH OFFSET AS _offset
-    GROUP BY
-      _value
-    ORDER BY
-      COUNT(*) DESC,
-      MAX(_offset) DESC
-    LIMIT
-      1
-  )
+  mozfun.stats.mode_last_retain_nulls(list)
 );
-
-- Test
-SELECT
-  assert_equals('bar', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'fred'])),
-  assert_equals('baz', udf.mode_last_retain_nulls(['foo', 'bar', 'baz', 'bar', 'baz', 'fred'])),
-  assert_equals(CAST(NULL AS STRING), udf.mode_last_retain_nulls([NULL, 'foo', NULL]));
--- a/udf_js/glean_percentile.sql
+++ b/udf_js/glean_percentile.sql
@ -1,51 +1,8 @@
+-- Legacy wrapper: forwards all three arguments unchanged to mozfun.glean.percentile and returns its result.
 CREATE OR REPLACE FUNCTION udf_js.glean_percentile(
  percentile FLOAT64,
  histogram ARRAY<STRUCT<key STRING, value FLOAT64>>,
  type STRING
-)
-RETURNS FLOAT64
-LANGUAGE js
-AS
-  '''
-  if (percentile < 0 || percentile > 100) {
-      throw "percentile must be a value between 0 and 100";
-  }
-
-  let values = histogram.map(bucket => bucket.value);
-  let total = values.reduce((a, b) => a + b);
-  let normalized = values.map(value => value / total);
-
-  // Find the index into the cumulative distribution function that corresponds
-  // to the percentile. This undershoots the true value of the percentile.
-  let acc = 0;
-  let index = null;
-  for (let i = 0; i < normalized.length; i++) {
-      acc += normalized[i];
-      index = i;
-      if (acc >= percentile / 100) {
-          break;
-      }
-  }
-
-  // NOTE: we do not perform geometric or linear interpolation, but this would
-  // be the place to implement it.
-  return histogram[index].key;
-''';
-
-SELECT
-  assert_equals(
-    2,
-    udf_js.glean_percentile(
-      50.0,
-      ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
-      "timing_distribution"
-    )
-  );
-
-#xfail
-SELECT
-  udf_js.glean_percentile(
-    101.0,
-    ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1), ("2", 2), ("3", 1)],
-    "timing_distribution"
-  );
+) AS (
+  mozfun.glean.percentile(percentile, histogram, type)
+);