Bug 1630997: Control min and max values in timing distribution (#850)

This commit is contained in:
Michael Droettboom 2020-05-07 11:43:04 -04:00 committed by GitHub
Parent 44bdbd4f7d
Commit 366478c35c
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files: 285 additions and 60 deletions

View file

@ -58,6 +58,7 @@ URIs
UTF
UUID
UUIDs
Underflowing
WebRender
Xcode
YAML
@ -129,6 +130,7 @@ mozilla
mozregression
mypy
namespace
ns
ol
pdoc
pidcat

View file

@ -2,6 +2,10 @@
[Full changelog](https://github.com/mozilla/glean/compare/v29.0.0...master)
* General:
* The version of glean_parser has been upgraded to v1.20.3
* The minimum and maximum values of a timing distribution can now be controlled by the `time_unit` parameter. See [bug 1630997](https://bugzilla.mozilla.org/show_bug.cgi?id=1630997) for more details.
# v29.0.0 (2020-05-05)
[Full changelog](https://github.com/mozilla/glean/compare/v28.0.0...v29.0.0)

View file

@ -17,6 +17,17 @@ If the Glean upload is disabled at the time `stopAndAccumulate` is called, nothi
Multiple concurrent timespans in different threads may be measured at the same time.
Timings are always stored and sent in the payload as nanoseconds. However, the `time_unit` parameter
controls the minimum and maximum values that will be recorded:
- `nanosecond`: 1ns <= x <= 10 minutes
- `microsecond`: 1μs <= x <= ~6.94 days
- `millisecond`: 1ms <= x <= ~19 years
Overflowing this range is considered an error and is reported through the error reporting mechanism. Underflowing this range is not an error and the value is silently truncated to the minimum value.
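All three limits derive from a single constant, 10 minutes expressed in nanoseconds, which the SDK reinterprets in the metric's `time_unit` before converting back to nanoseconds. The following is a minimal, standalone sketch of that arithmetic and of the clamping rule described above; `limits_ns` and `clamp_sample` are illustrative helpers, not part of the Glean API:
```python
# Standalone sketch (not part of the Glean SDK) of how the per-unit limits
# are derived and how a measured duration is clamped into range.
# MAX_SAMPLE_TIME matches the internal constant: 10 minutes in nanoseconds,
# reinterpreted in the metric's time_unit before converting back to ns.
MAX_SAMPLE_TIME = 1000 * 1000 * 1000 * 60 * 10

NANOS_PER_UNIT = {
    "nanosecond": 1,
    "microsecond": 1_000,
    "millisecond": 1_000_000,
}


def limits_ns(time_unit):
    """Return the (minimum, maximum) recordable duration in nanoseconds."""
    factor = NANOS_PER_UNIT[time_unit]
    return factor, MAX_SAMPLE_TIME * factor


def clamp_sample(duration_ns, time_unit):
    """Underflow is silently truncated to the minimum; overflow is clamped
    to the maximum (the SDK additionally records an invalid_overflow error)."""
    lo, hi = limits_ns(time_unit)
    return min(max(duration_ns, lo), hi)


for unit in NANOS_PER_UNIT:
    lo, hi = limits_ns(unit)
    print(f"{unit}: min {lo} ns, max {hi} ns ({hi / 1e9:.0f} s)")
# Effective maxima: nanosecond -> 600 s (10 minutes);
# microsecond -> 600,000 s (~6.94 days); millisecond -> 600,000,000 s (~19 years).
```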
Additionally, when a metric comes from GeckoView (the `geckoview_datapoint` parameter is present), the `time_unit` parameter specifies the unit that the samples are in when passed to Glean. Glean will convert all of the incoming samples to nanoseconds internally.
## Configuration
If you want to create a timing distribution to measure page load times, first you need to add an entry for it to the `metrics.yaml` file:
@ -30,8 +41,6 @@ pages:
...
```
> Note: Timing distributions have an optional `time_unit` parameter that is only used when samples are provided directly from an external tool in a unit other than nanoseconds.
## API
Now you can use the timing distribution from the application's code.
@ -226,7 +235,13 @@ assert 1 == metrics.pages.page_load.test_get_num_recorded_errors(
* On Python 3.7 and later, [`time.monotonic_ns()`](https://docs.python.org/3/library/time.html#time.monotonic_ns) is used. On earlier versions of Python, [`time.monotonic()`](https://docs.python.org/3/library/time.html#time.monotonic) is used, which is not guaranteed to have nanosecond resolution.
* The maximum timing value that will be recorded is 10 minutes. Longer times will be truncated to 10 minutes and an error will be recorded.
* The maximum timing value that will be recorded depends on the `time_unit` parameter:
- `nanosecond`: 1ns <= x <= 10 minutes
- `microsecond`: 1μs <= x <= ~6.94 days
- `millisecond`: 1ms <= x <= ~19 years
Longer times will be truncated to the maximum value and an error will be recorded.
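To see the truncation and error reporting together, here is a pytest-style sketch modeled on the tests added in this commit. It is illustrative only: it assumes the SDK's test harness has already reset and initialized Glean, that the import paths shown exist, and that `testing.ErrorType.INVALID_OVERFLOW` is the constant corresponding to the `invalid_overflow` error described below.
```python
# Illustrative sketch, modeled on the tests in this commit. Assumptions
# (not verified here): Glean has been reset/initialized by the test harness,
# these import paths exist, and ErrorType.INVALID_OVERFLOW matches the
# `invalid_overflow` error type.
from glean import _util, metrics, testing
from glean.metrics import Lifetime, TimeUnit


def test_overflow_is_truncated_and_reported(monkeypatch):
    metric = metrics.TimingDistributionMetricType(
        disabled=False,
        category="telemetry",
        lifetime=Lifetime.APPLICATION,
        name="page_load_sketch",
        send_in_pings=["store1"],
        time_unit=TimeUnit.NANOSECOND,  # maximum is 10 minutes in this unit
    )

    ten_minutes_ns = 1000 * 1000 * 1000 * 60 * 10
    override_time = 0

    def override_time_ns():
        return override_time

    # Replace the monotonic clock so the measured duration is deterministic.
    monkeypatch.setattr(_util, "time_ns", override_time_ns)

    timer_id = metric.start()
    override_time = ten_minutes_ns * 1000  # far beyond the 10-minute maximum
    metric.stop_and_accumulate(timer_id)

    # The sample is clamped into the top bucket and an overflow error is
    # recorded against the metric.
    assert metric.test_has_value()
    assert 1 == metric.test_get_num_recorded_errors(
        testing.ErrorType.INVALID_OVERFLOW
    )
```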
## Examples
@ -236,7 +251,7 @@ assert 1 == metrics.pages.page_load.test_get_num_recorded_errors(
* `invalid_value`: If recording a negative timespan.
* `invalid_state`: If a non-existing/stopped timer is stopped again.
* `invalid_overflow`: If recording a time longer than 10 minutes.
* `invalid_overflow`: If recording a time longer than the maximum for the given unit.
## Reference

View file

@ -40,7 +40,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
))
// Accumulate a few values
@ -74,7 +74,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
)
// Attempt to store the timespan using set
@ -95,7 +95,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
)
metric.testGetValue()
}
@ -109,7 +109,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution",
sendInPings = listOf("store1", "store2", "store3"),
timeUnit = TimeUnit.Millisecond
timeUnit = TimeUnit.Nanosecond
))
// Accumulate a few values
@ -244,7 +244,7 @@ class TimingDistributionMetricTypeTest {
lifetime = Lifetime.Ping,
name = "timing_distribution_samples",
sendInPings = listOf("store1"),
timeUnit = TimeUnit.Second
timeUnit = TimeUnit.Nanosecond
))
// Create a test function to "measure". This works by mocking the getElapsedNanos return
@ -309,4 +309,41 @@ class TimingDistributionMetricTypeTest {
assertTrue("Metric must not have a value", !metric.testHasValue())
}
}
@Test
fun `ensure that time_unit controls truncation`() {
val maxSampleTime = 1000L * 1000 * 1000 * 60 * 10
for (unit in listOf(
TimeUnit.Nanosecond,
TimeUnit.Microsecond,
TimeUnit.Millisecond
)) {
val metric = spy(TimingDistributionMetricType(
disabled = false,
category = "telemetry",
lifetime = Lifetime.Ping,
name = "test_${unit.name}",
sendInPings = listOf("store1"),
timeUnit = unit
))
for (value in listOf(
1L,
1000L,
100000L,
maxSampleTime,
maxSampleTime * 1000L,
maxSampleTime * 1000000L
)) {
`when`(metric.getElapsedTimeNanos()).thenReturn(0L)
val timerId = metric.start()
`when`(metric.getElapsedTimeNanos()).thenReturn(value)
metric.stopAndAccumulate(timerId)
}
val snapshot = metric.testGetValue()
assertTrue(snapshot.values.size < 318)
}
}
}

View file

@ -19,7 +19,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
// Accumulate a few values
@ -42,7 +42,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: true,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())
@ -103,7 +103,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())
@ -123,7 +123,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
metric.stopAndAccumulate(0)
@ -138,7 +138,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .ping,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
func testFunc(value: Bool) -> Bool {
@ -170,7 +170,7 @@ class TimingDistributionTypeTests: XCTestCase {
sendInPings: ["store1"],
lifetime: .application,
disabled: false,
timeUnit: .millisecond
timeUnit: .nanosecond
)
XCTAssertFalse(metric.testHasValue())

View file

@ -25,7 +25,7 @@
set -e
GLEAN_PARSER_VERSION=1.20.2
GLEAN_PARSER_VERSION=1.20.3
# CMDNAME is used in the usage text below.
# shellcheck disable=SC2034

View file

@ -48,7 +48,7 @@ with (ROOT.parent / "Cargo.toml").open() as cargo:
requirements = [
"cffi>=1",
"glean_parser==1.20.2",
"glean_parser==1.20.3",
"iso8601>=0.1.10; python_version<='3.6'",
]

View file

@ -12,14 +12,14 @@ from glean import testing
from glean import _util
def test_the_api_saves_to_its_storage_engine():
def test_the_api_saves_to_its_storage_engine(monkeypatch):
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
override_time = 0
@ -27,18 +27,13 @@ def test_the_api_saves_to_its_storage_engine():
def override_time_ns():
return override_time
original_time_ns = _util.time_ns
_util.time_ns = override_time_ns
monkeypatch.setattr(_util, "time_ns", override_time_ns)
try:
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
finally:
_util.time_ns = original_time_ns
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
assert metric.test_has_value()
snapshot = metric.test_get_value()
@ -53,7 +48,7 @@ def test_disabled_timing_distributions_must_not_record_data():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
timer_id = metric.start()
@ -69,21 +64,21 @@ def test_get_value_throws():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
with pytest.raises(ValueError):
metric.test_get_value()
def test_api_saves_to_secondary_pings():
def test_api_saves_to_secondary_pings(monkeypatch):
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1", "store2", "store3"],
time_unit=TimeUnit.MILLISECOND,
time_unit=TimeUnit.NANOSECOND,
)
override_time = 0
@ -91,18 +86,13 @@ def test_api_saves_to_secondary_pings():
def override_time_ns():
return override_time
original_time_ns = _util.time_ns
_util.time_ns = override_time_ns
monkeypatch.setattr(_util, "time_ns", override_time_ns)
try:
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
finally:
_util.time_ns = original_time_ns
for i in range(1, 4):
override_time = 0
timer_id = metric.start()
override_time = i
metric.stop_and_accumulate(timer_id)
for store in ["store1", "store2", "store3"]:
assert metric.test_has_value(store)
@ -118,8 +108,45 @@ def test_stopping_a_non_existent_timer_records_an_error():
lifetime=Lifetime.APPLICATION,
name="timing_distribution",
send_in_pings=["store1", "store2", "store3"],
time_unit=TimeUnit.SECOND,
time_unit=TimeUnit.NANOSECOND,
)
metric.stop_and_accumulate(-1)
assert 1 == metric.test_get_num_recorded_errors(testing.ErrorType.INVALID_STATE)
def test_time_unit_controls_truncation(monkeypatch):
max_sample_time = 1000 * 1000 * 1000 * 60 * 10
override_time = 0
def override_time_ns():
return override_time
monkeypatch.setattr(_util, "time_ns", override_time_ns)
for unit in [TimeUnit.NANOSECOND, TimeUnit.MICROSECOND, TimeUnit.MILLISECOND]:
metric = metrics.TimingDistributionMetricType(
disabled=False,
category="telemetry",
lifetime=Lifetime.APPLICATION,
name="timing_distribution_{}".format(unit.name),
send_in_pings=["baseline"],
time_unit=unit,
)
for value in [
1,
100,
100000,
max_sample_time,
max_sample_time * 1000,
max_sample_time * 1000000,
]:
override_time = 0
timer_id = metric.start()
override_time = value
metric.stop_and_accumulate(timer_id)
snapshot = metric.test_get_value()
assert len(snapshot.values) < 318

View file

@ -5,9 +5,12 @@
// NOTE: This is a test-only file that contains unit tests for
// the lib.rs file.
use std::collections::HashSet;
use std::iter::FromIterator;
use super::*;
use crate::metrics::RecordedExperimentData;
use crate::metrics::{StringMetric, TimeUnit, TimespanMetric};
use crate::metrics::{StringMetric, TimeUnit, TimespanMetric, TimingDistributionMetric};
const GLOBAL_APPLICATION_ID: &str = "org.mozilla.glean.test.app";
pub fn new_glean(tempdir: Option<tempfile::TempDir>) -> (Glean, tempfile::TempDir) {
@ -644,3 +647,115 @@ fn test_change_metric_type_runtime() {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1621757#c1 for more context.
assert_eq!(None, string_metric.test_get_value(&glean, ping_name));
}
#[test]
fn timing_distribution_truncation() {
let dir = tempfile::tempdir().unwrap();
let (glean, _) = new_glean(Some(dir));
let max_sample_time = 1000 * 1000 * 1000 * 60 * 10;
for (unit, expected_keys) in &[
(
TimeUnit::Nanosecond,
HashSet::<u64>::from_iter(vec![961_548, 939, 599_512_966_122, 1]),
),
(
TimeUnit::Microsecond,
HashSet::<u64>::from_iter(vec![939, 562_949_953_421_318, 599_512_966_122, 961_548]),
),
(
TimeUnit::Millisecond,
HashSet::<u64>::from_iter(vec![
961_548,
576_460_752_303_431_040,
599_512_966_122,
562_949_953_421_318,
]),
),
] {
let mut dist = TimingDistributionMetric::new(
CommonMetricData {
name: format!("local_metric_{:?}", unit),
category: "local".into(),
send_in_pings: vec!["baseline".into()],
..Default::default()
},
*unit,
);
for &value in &[
1,
1_000,
1_000_000,
max_sample_time,
max_sample_time * 1_000,
max_sample_time * 1_000_000,
] {
let timer_id = dist.set_start(0);
dist.set_stop_and_accumulate(&glean, timer_id, value);
}
let hist = dist.test_get_value(&glean, "baseline").unwrap();
assert_eq!(4, hist.values().len());
for &key in hist.values().keys() {
assert!(key < max_sample_time * unit.as_nanos(1))
}
let keys = HashSet::<u64>::from_iter(hist.values().keys().cloned());
assert_eq!(keys, *expected_keys);
let snapshot = hist.snapshot();
// The number of buckets was originally designed around a range of 1ns to
// 10 minutes, with 8 buckets per power of 2, which works out to 316 buckets.
// This assertion ensures that still holds even when the time unit is changed.
assert!(snapshot.len() < 316);
}
}
#[test]
fn timing_distribution_truncation_accumulate() {
let dir = tempfile::tempdir().unwrap();
let (glean, _) = new_glean(Some(dir));
let max_sample_time = 1000 * 1000 * 1000 * 60 * 10;
for &unit in &[
TimeUnit::Nanosecond,
TimeUnit::Microsecond,
TimeUnit::Millisecond,
] {
let mut dist = TimingDistributionMetric::new(
CommonMetricData {
name: format!("local_metric_{:?}", unit),
category: "local".into(),
send_in_pings: vec!["baseline".into()],
..Default::default()
},
unit,
);
dist.accumulate_samples_signed(
&glean,
vec![
1,
1_000,
1_000_000,
max_sample_time,
max_sample_time * 1_000,
max_sample_time * 1_000_000,
],
);
let hist = dist.test_get_value(&glean, "baseline").unwrap();
assert_eq!(4, hist.values().len());
let snapshot = hist.snapshot();
// The number of buckets was originally designed around a range of 1ns to
// 10 minutes, with 8 buckets per power of 2, which works out to 316 buckets.
// This assertion ensures that still holds even when the time unit is changed.
assert!(snapshot.len() < 316);
}
}

View file

@ -171,7 +171,7 @@ impl MemoryDistributionMetric {
if num_too_log_samples > 0 {
let msg = format!(
"Accumulated {} samples longer than 10 minutes",
"Accumulated {} samples larger than 1TB",
num_too_log_samples
);
record_error(

View file

@ -21,8 +21,13 @@ const LOG_BASE: f64 = 2.0;
// The buckets per each order of magnitude of the logarithm.
const BUCKETS_PER_MAGNITUDE: f64 = 8.0;
// Maximum time of 10 minutes in nanoseconds. This maximum means we
// retain a maximum of 313 buckets.
// Maximum time, which means we retain a maximum of 316 buckets.
// It is automatically adjusted based on the `time_unit` parameter
// so that:
//
// - `nanosecond`: 10 minutes
// - `microsecond`: ~6.94 days
// - `millisecond`: ~19 years
const MAX_SAMPLE_TIME: u64 = 1000 * 1000 * 1000 * 60 * 10;
/// Identifier for a running timer.
@ -174,11 +179,23 @@ impl TimingDistributionMetric {
Ok(duration) => duration,
};
if duration > MAX_SAMPLE_TIME {
let msg = "Sample is longer than 10 minutes";
let min_sample_time = self.time_unit.as_nanos(1);
let max_sample_time = self.time_unit.as_nanos(MAX_SAMPLE_TIME);
duration = if duration < min_sample_time {
// If the measurement is less than the minimum, just truncate. This is
// not recorded as an error.
min_sample_time
} else if duration > max_sample_time {
let msg = format!(
"Sample is longer than the max for a time_unit of {:?} ({} ns)",
self.time_unit, max_sample_time
);
record_error(glean, &self.meta, ErrorType::InvalidOverflow, msg, None);
duration = MAX_SAMPLE_TIME;
}
max_sample_time
} else {
duration
};
if !self.should_record(glean) {
return;
@ -236,6 +253,7 @@ impl TimingDistributionMetric {
pub fn accumulate_samples_signed(&mut self, glean: &Glean, samples: Vec<i64>) {
let mut num_negative_samples = 0;
let mut num_too_long_samples = 0;
let max_sample_time = self.time_unit.as_nanos(MAX_SAMPLE_TIME);
glean.storage().record_with(glean, &self.meta, |old_value| {
let mut hist = match old_value {
@ -247,13 +265,20 @@ impl TimingDistributionMetric {
if sample < 0 {
num_negative_samples += 1;
} else {
let sample = sample as u64;
let mut sample = self.time_unit.as_nanos(sample);
if sample > MAX_SAMPLE_TIME {
let mut sample = sample as u64;
// Check the range prior to converting the incoming unit to
// nanoseconds, so we can compare against the constant
// MAX_SAMPLE_TIME.
if sample == 0 {
sample = 1;
} else if sample > MAX_SAMPLE_TIME {
num_too_long_samples += 1;
sample = MAX_SAMPLE_TIME;
}
sample = self.time_unit.as_nanos(sample);
hist.accumulate(sample);
}
}
@ -273,8 +298,8 @@ impl TimingDistributionMetric {
if num_too_long_samples > 0 {
let msg = format!(
"Accumulated {} samples longer than 10 minutes",
num_too_long_samples
"{} samples are longer than the maximum of {}",
num_too_long_samples, max_sample_time
);
record_error(
glean,

View file

@ -34,7 +34,7 @@ class GleanMetricsYamlTransform extends ArtifactTransform {
@SuppressWarnings("GrPackage")
class GleanPlugin implements Plugin<Project> {
// The version of glean_parser to install from PyPI.
private String GLEAN_PARSER_VERSION = "1.20.2"
private String GLEAN_PARSER_VERSION = "1.20.3"
// The version of Miniconda is explicitly specified.
// Miniconda3-4.5.12 is known to not work on Windows.
private String MINICONDA_VERSION = "4.5.11"