Bug 1630997: Control min and max values in timing distribution (#850)

This commit is contained in:
Michael Droettboom 2020-05-07 11:43:04 -04:00 committed by GitHub
Parent 44bdbd4f7d
Commit 366478c35c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files: 285 additions and 60 deletions

View file

@ -58,6 +58,7 @@ URIs
UTF
UUID
UUIDs
Underflowing
WebRender
Xcode
YAML
@ -129,6 +130,7 @@ mozilla
mozregression
mypy
namespace
ns
ol
pdoc
pidcat

View file

@ -2,6 +2,10 @@
[Full changelog](https://github.com/mozilla/glean/compare/v29.0.0...master)
* General:
* The version of glean_parser has been upgraded to v1.20.3
* The minimum and maximum values of a timing distribution can now be controlled by the `time_unit` parameter. See [bug 1630997](https://bugzilla.mozilla.org/show_bug.cgi?id=1630997) for more details.
# v29.0.0 (2020-05-05)
[Full changelog](https://github.com/mozilla/glean/compare/v28.0.0...v29.0.0)

View file

@ -17,6 +17,17 @@ If the Glean upload is disabled at the time `stopAndAccumulate` is called, nothi
Multiple concurrent timespans in different threads may be measured at the same time.
Timings are always stored and sent in the payload as nanoseconds. However, the `time_unit` parameter
controls the minimum and maximum values that will be recorded:
- `nanosecond`: 1ns <= x <= 10 minutes
- `microsecond`: 1μs <= x <= ~6.94 days
- `millisecond`: 1ms <= x <= ~19 years
Overflowing this range is considered an error and is reported through the error reporting mechanism. Underflowing this range is not an error and the value is silently truncated to the minimum value.
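All three limits derive from a single constant, 10 minutes expressed in nanoseconds, which the SDK reinterprets in the metric's `time_unit` before converting back to nanoseconds. The following is a minimal, standalone sketch of that arithmetic and of the clamping rule described above; `limits_ns` and `clamp_sample` are illustrative helpers, not part of the Glean API:
```python
# Standalone sketch (not part of the Glean SDK) of how the per-unit limits
# are derived and how a measured duration is clamped into range.
# MAX_SAMPLE_TIME matches the internal constant: 10 minutes in nanoseconds,
# reinterpreted in the metric's time_unit before converting back to ns.
MAX_SAMPLE_TIME = 1000 * 1000 * 1000 * 60 * 10

NANOS_PER_UNIT = {
    "nanosecond": 1,
    "microsecond": 1_000,
    "millisecond": 1_000_000,
}


def limits_ns(time_unit):
    """Return the (minimum, maximum) recordable duration in nanoseconds."""
    factor = NANOS_PER_UNIT[time_unit]
    return factor, MAX_SAMPLE_TIME * factor


def clamp_sample(duration_ns, time_unit):
    """Underflow is silently truncated to the minimum; overflow is clamped
    to the maximum (the SDK additionally records an invalid_overflow error)."""
    lo, hi = limits_ns(time_unit)
    return min(max(duration_ns, lo), hi)


for unit in NANOS_PER_UNIT:
    lo, hi = limits_ns(unit)
    print(f"{unit}: min {lo} ns, max {hi} ns ({hi / 1e9:.0f} s)")
# Effective maxima: nanosecond -> 600 s (10 minutes);
# microsecond -> 600,000 s (~6.94 days); millisecond -> 600,000,000 s (~19 years).
```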
Additionally, when a metric comes from GeckoView (the `geckoview_datapoint` parameter is present), the `time_unit` parameter specifies the unit that the samples are in when passed to Glean. Glean will convert all of the incoming samples to nanoseconds internally.
## Configuration
If you want to create a timing distribution to measure page load times, first you need to add an entry for it to the `metrics.yaml` file:
@ -30,8 +41,6 @@ pages:
...
```
> Note: Timing distributions have an optional `time_unit` parameter that is only used when samples are provided directly from an external tool in a unit other than nanoseconds.
## API
Now you can use the timing distribution from the application's code.
@ -226,7 +235,13 @@ assert 1 == metrics.pages.page_load.test_get_num_recorded_errors(
* On Python 3.7 and later, [`time.monotonic_ns()`](https://docs.python.org/3/library/time.html#time.monotonic_ns) is used. On earlier versions of Python, [`time.monotonic()`](https://docs.python.org/3/library/time.html#time.monotonic) is used, which is not guaranteed to have nanosecond resolution.
* The maximum timing value that will be recorded is 10 minutes. Longer times will be truncated to 10 minutes and an error will be recorded.
* The maximum timing value that will be recorded depends on the `time_unit` parameter:
- `nanosecond`: 1ns <= x <= 10 minutes
- `microsecond`: 1μs <= x <= ~6.94 days
- `millisecond`: 1ms <= x <= ~19 years
Longer times will be truncated to the maximum value and an error will be recorded.
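To see the truncation and error reporting together, here is a pytest-style sketch modeled on the tests added in this commit. It is illustrative only: it assumes the SDK's test harness has already reset and initialized Glean, that the import paths shown exist, and that `testing.ErrorType.INVALID_OVERFLOW` is the constant corresponding to the `invalid_overflow` error described below.
```python
# Illustrative sketch, modeled on the tests in this commit. Assumptions
# (not verified here): Glean has been reset/initialized by the test harness,
# these import paths exist, and ErrorType.INVALID_OVERFLOW matches the
# `invalid_overflow` error type.
from glean import _util, metrics, testing
from glean.metrics import Lifetime, TimeUnit


def test_overflow_is_truncated_and_reported(monkeypatch):
    metric = metrics.TimingDistributionMetricType(
        disabled=False,
        category="telemetry",
        lifetime=Lifetime.APPLICATION,
        name="page_load_sketch",
        send_in_pings=["store1"],
        time_unit=TimeUnit.NANOSECOND,  # maximum is 10 minutes in this unit
    )

    ten_minutes_ns = 1000 * 1000 * 1000 * 60 * 10
    override_time = 0

    def override_time_ns():
        return override_time

    # Replace the monotonic clock so the measured duration is deterministic.
    monkeypatch.setattr(_util, "time_ns", override_time_ns)

    timer_id = metric.start()
    override_time = ten_minutes_ns * 1000  # far beyond the 10-minute maximum
    metric.stop_and_accumulate(timer_id)

    # The sample is clamped into the top bucket and an overflow error is
    # recorded against the metric.
    assert metric.test_has_value()
    assert 1 == metric.test_get_num_recorded_errors(
        testing.ErrorType.INVALID_OVERFLOW
    )
```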
## Examples
@ -236,7 +251,7 @@ assert 1 == metrics.pages.page_load.test_get_num_recorded_errors(
* `invalid_value`: If recording a negative timespan.
* `invalid_state`: If a non-existing/stopped timer is stopped again.
* `invalid_overflow`: If recording a time longer than 10 minutes.
* `invalid_overflow`: If recording a time longer than the maximum for the given unit.
## Reference

View file

@ -40,7 +40,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
))
// Accumulate a few values
@ -74,7 +74,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
)
// Attempt to store the timespan using set
@ -95,7 +95,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
)
metric.testGetValue()
}
@ -109,7 +109,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1", "store2", "store3"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
))
// Accumulate a few values
@ -244,7 +244,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution_samples",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Second
timeUnit = TimeUnit.Nanosecond
))
// Create a test function to "measure". This works by mocking the getElapsedNanos return
@ -309,4 +309,41 @@ class TimingDistributionMetricTypeTest {
assertTrue("Metric must not have a value", !metric.testHasValue())
}
}
@Test
fun `ensure that time_unit controls truncation`() {
val maxSampleTime = 1000L * 1000 * 1000 * 60 * 10
for (unit in listOf(
TimeUnit.Nanosecond,
TimeUnit.Microsecond,
TimeUnit.Millisecond
)) {
val metric = spy(TimingDistributionMetricType(
disabled = false,
category = "telemetry",
lifetime = Lifetime.Ping,
name = "test_${unit.name}",
sendInPings = listOf("store1"),
timeUnit = unit
))
for (value in listOf(
1L,
1000L,
100000L,
maxSampleTime,
maxSampleTime * 1000L,
maxSampleTime * 1000000L
)) {
`when`(metric.getElapsedTimeNanos()).thenReturn(0L)
val timerId = metric.start()
`when`(metric.getElapsedTimeNanos()).thenReturn(value)
metric.stopAndAccumulate(timerId)
}
val snapshot = metric.testGetValue()
assertTrue(snapshot.values.size < 318)
}
}
}

View file

@ -19,7 +19,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
// Accumulate a few values
@ -42,7 +42,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: true,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())
@ -103,7 +103,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())
@ -123,7 +123,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
metric.stopAndAccumulate(0)
@ -138,7 +138,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
func testFunc(value: Bool) -> Bool {
@ -170,7 +170,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .application,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())

View file

@ -25,7 +25,7 @@
set -e
GLEAN_PARSER_VERSION=1.20.2
GLEAN_PARSER_VERSION=1.20.3
# CMDNAME is used in the usage text below.
# shellcheck disable=SC2034

View file

@ -48,7 +48,7 @@ with (ROOT.parent / "Cargo.toml").open() as cargo:
requirements = [
"cffi>=1",
"glean_parser==1.20.2",
"glean_parser==1.20.3",
"iso8601>=0.1.10; python_version<='3.6'",
]

View file

@ -12,14 +12,14 @@ from glean import testing
from glean import _util
def test_the_api_saves_to_its_storage_engine():
def test_the_api_saves_to_its_storage_engine(monkeypatch):
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
override_time = 0
@ -27,18 +27,13 @@ def test_the_api_saves_to_its_storage_engine():
def override_time_ns():
return override_time
original_time_ns = _util.time_ns
_util.time_ns = override_time_ns
monkeypatch.setattr(_util, "time_ns", override_time_ns)
try:
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
finally:
_util.time_ns = original_time_ns
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
assert metric.test_has_value()
snapshot = metric.test_get_value()
@ -53,7 +48,7 @@ def test_disabled_timing_distributions_must_not_record_data():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
timer_id = metric.start()
@ -69,21 +64,21 @@ def test_get_value_throws():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
with pytest.raises(ValueError):
metric.test_get_value()
def test_api_saves_to_secondary_pings():
def test_api_saves_to_secondary_pings(monkeypatch):
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1", "store2", "store3"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
override_time = 0
@ -91,18 +86,13 @@ def test_api_saves_to_secondary_pings():
def override_time_ns():
return override_time
original_time_ns = _util.time_ns
_util.time_ns = override_time_ns
monkeypatch.setattr(_util, "time_ns", override_time_ns)
try:
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
finally:
_util.time_ns = original_time_ns
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
for store in ["store1", "store2", "store3"]:
assert metric.test_has_value(store)
@ -118,8 +108,45 @@ def test_stopping_a_non_existent_timer_records_an_error():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1", "store2", "store3"],
time_unit=TimeUnit.SECOND,
time_unit=TimeUnit.NANOSECOND,
)
metric.stop_and_accumulate(-1)
assert 1 == metric.test_get_num_recorded_errors(testing.ErrorType.INVALID_STATE)
def test_time_unit_controls_truncation(monkeypatch):
max_sample_time = 1000 * 1000 * 1000 * 60 * 10
override_time = 0
def override_time_ns():
return override_time
monkeypatch.setattr(_util, "time_ns", override_time_ns)
for unit in [TimeUnit.NANOSECOND, TimeUnit.MICROSECOND, TimeUnit.MILLISECOND]:
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution_{}".format(unit.name),
send_in_pings=["baseline"],
time_unit=unit,
)
for value in [
1,
100,
100000,
max_sample_time,
max_sample_time * 1000,
max_sample_time * 1000000,
]:
override_time = 0
timer_id = metric.start()
override_time = value
metric.stop_and_accumulate(timer_id)
snapshot = metric.test_get_value()
assert len(snapshot.values) < 318

View file

@ -5,9 +5,12 @@
// NOTE: This is a test-only file that contains unit tests for
// the lib.rs file.
use std::collections::HashSet;
use std::iter::FromIterator;
use super::*;
use crate::metrics::RecordedExperimentData;
use crate::metrics::{StringMetric, TimeUnit, TimespanMetric};
use crate::metrics::{StringMetric, TimeUnit, TimespanMetric, TimingDistributionMetric};
const GLOBAL_APPLICATION_ID: &str = "org.mozilla.glean.test.app";
pub fn new_glean(tempdir: Option<tempfile::TempDir>) -> (Glean, tempfile::TempDir) {
@ -644,3 +647,115 @@ fn test_change_metric_type_runtime() {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1621757#c1 for more context.
assert_eq!(None, string_metric.test_get_value(&glean, ping_name));
}
#[test]
fn timing_distribution_truncation() {
let dir = tempfile::tempdir().unwrap();
let (glean, _) = new_glean(Some(dir));
let max_sample_time = 1000 * 1000 * 1000 * 60 * 10;
for (unit, expected_keys) in &[
(
TimeUnit::Nanosecond,
HashSet::<u64>::from_iter(vec![961_548, 939, 599_512_966_122, 1]),
),
(
TimeUnit::Microsecond,
HashSet::<u64>::from_iter(vec![939, 562_949_953_421_318, 599_512_966_122, 961_548]),
),
(
TimeUnit::Millisecond,
HashSet::<u64>::from_iter(vec![
961_548,
576_460_752_303_431_040,
599_512_966_122,
562_949_953_421_318,
]),
),
] {
let mut dist = TimingDistributionMetric::new(
CommonMetricData {
name: format!("local_metric_{:?}", unit),
category: "local".into(),
send_in_pings: vec!["baseline".into()],
..Default::default()
},
*unit,
);
for &value in &[
1,
1_000,
1_000_000,
max_sample_time,
max_sample_time * 1_000,
max_sample_time * 1_000_000,
] {
let timer_id = dist.set_start(0);
dist.set_stop_and_accumulate(&glean, timer_id, value);
}
let hist = dist.test_get_value(&glean, "baseline").unwrap();
assert_eq!(4, hist.values().len());
for &key in hist.values().keys() {
assert!(key < max_sample_time * unit.as_nanos(1))
}
let keys = HashSet::<u64>::from_iter(hist.values().keys().cloned());
assert_eq!(keys, *expected_keys);
let snapshot = hist.snapshot();
// The number of buckets was originally designed around a range of 1ns to
// 10 minutes, with 8 buckets per power of 2, which works out to 316 buckets.
// This assertion ensures that still holds even when the time unit is changed.
assert!(snapshot.len() < 316);
}
}
#[test]
fn timing_distribution_truncation_accumulate() {
let dir = tempfile::tempdir().unwrap();
let (glean, _) = new_glean(Some(dir));
let max_sample_time = 1000 * 1000 * 1000 * 60 * 10;
for &unit in &[
TimeUnit::Nanosecond,
TimeUnit::Microsecond,
TimeUnit::Millisecond,
] {
let mut dist = TimingDistributionMetric::new(
CommonMetricData {
name: format!("local_metric_{:?}", unit),
category: "local".into(),
send_in_pings: vec!["baseline".into()],
..Default::default()
},
unit,
);
dist.accumulate_samples_signed(
&glean,
vec![
1,
1_000,
1_000_000,
max_sample_time,
max_sample_time * 1_000,
max_sample_time * 1_000_000,
],
);
let hist = dist.test_get_value(&glean, "baseline").unwrap();
assert_eq!(4, hist.values().len());
let snapshot = hist.snapshot();
// The number of buckets was originally designed around a range of 1ns to
// 10 minutes, with 8 buckets per power of 2, which works out to 316 buckets.
// This assertion ensures that still holds even when the time unit is changed.
assert!(snapshot.len() < 316);
}
}

View file

@ -171,7 +171,7 @@ impl MemoryDistributionMetric {
if num_too_log_samples > 0 {
let msg = format!(
"Accumulated {} samples longer than 10 minutes",
"Accumulated {} samples larger than 1TB",
num_too_log_samples
);
record_error(

View file

@ -21,8 +21,13 @@ const LOG_BASE: f64 = 2.0;
// The buckets per each order of magnitude of the logarithm.
const BUCKETS_PER_MAGNITUDE: f64 = 8.0;
// Maximum time of 10 minutes in nanoseconds. This maximum means we
// retain a maximum of 313 buckets.
// Maximum time, which means we retain a maximum of 316 buckets.
// It is automatically adjusted based on the `time_unit` parameter
// so that:
//
// - `nanosecond`: 10 minutes
// - `microsecond`: ~6.94 days
// - `millisecond`: ~19 years
const MAX_SAMPLE_TIME: u64 = 1000 * 1000 * 1000 * 60 * 10;
/// Identifier for a running timer.
@ -174,11 +179,23 @@ impl TimingDistributionMetric {
Ok(duration) => duration,
};
if duration > MAX_SAMPLE_TIME {
let msg = "Sample is longer than 10 minutes";
let min_sample_time = self.time_unit.as_nanos(1);
let max_sample_time = self.time_unit.as_nanos(MAX_SAMPLE_TIME);
duration = if duration < min_sample_time {
// If the measurement is less than the minimum, just truncate. This is
// not recorded as an error.
min_sample_time
} else if duration > max_sample_time {
let msg = format!(
"Sample is longer than the max for a time_unit of {:?} ({} ns)",
self.time_unit, max_sample_time
);
record_error(glean, &self.meta, ErrorType::InvalidOverflow, msg, None);
duration = MAX_SAMPLE_TIME;
}
max_sample_time
} else {
duration
};
if !self.should_record(glean) {
return;
@ -236,6 +253,7 @@ impl TimingDistributionMetric {
pub fn accumulate_samples_signed(&mut self, glean: &Glean, samples: Vec<i64>) {
let mut num_negative_samples = 0;
let mut num_too_long_samples = 0;
let max_sample_time = self.time_unit.as_nanos(MAX_SAMPLE_TIME);
glean.storage().record_with(glean, &self.meta, |old_value| {
let mut hist = match old_value {
@ -247,13 +265,20 @@ impl TimingDistributionMetric {
if sample < 0 {
num_negative_samples += 1;
} else {
let sample = sample as u64;
let mut sample = self.time_unit.as_nanos(sample);
if sample > MAX_SAMPLE_TIME {
let mut sample = sample as u64;
// Check the range prior to converting the incoming unit to
// nanoseconds, so we can compare against the constant
// MAX_SAMPLE_TIME.
if sample == 0 {
sample = 1;
} else if sample > MAX_SAMPLE_TIME {
num_too_long_samples += 1;
sample = MAX_SAMPLE_TIME;
}
sample = self.time_unit.as_nanos(sample);
hist.accumulate(sample);
}
}
@ -273,8 +298,8 @@ impl TimingDistributionMetric {
if num_too_long_samples > 0 {
let msg = format!(
"Accumulated {} samples longer than 10 minutes",
num_too_long_samples
"{} samples are longer than the maximum of {}",
num_too_long_samples, max_sample_time
);
record_error(
glean,

View file

@ -34,7 +34,7 @@ class GleanMetricsYamlTransform extends ArtifactTransform {
@SuppressWarnings("GrPackage")
class GleanPlugin implements Plugin<Project> {
// The version of glean_parser to install from PyPI.
private String GLEAN_PARSER_VERSION = "1.20.2"
private String GLEAN_PARSER_VERSION = "1.20.3"
// The version of Miniconda is explicitly specified.
// Miniconda3-4.5.12 is known to not work on Windows.
private String MINICONDA_VERSION = "4.5.11"