Bug 1318297 - Support initial implementation for Health ping. r=gfritzsche

The health ping is meant to be a small ping for monitoring Telemetry failures. The current implementation tracks pings discarded for size and send failures.
2017-07-25 16:19:31 +01:00 · 2017-07-25 16:19:31 +01:00 · 13c06e90e4
--- a/toolkit/components/telemetry/TelemetryController.jsm
+++ b/toolkit/components/telemetry/TelemetryController.jsm
@ -76,6 +76,8 @@ XPCOMUtils.defineLazyModuleGetter(this, "TelemetryReportingPolicy",
                                  "resource://gre/modules/TelemetryReportingPolicy.jsm");
 XPCOMUtils.defineLazyModuleGetter(this, "TelemetryModules",
                                  "resource://gre/modules/TelemetryModules.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryHealthPing",
+                                  "resource://gre/modules/TelemetryHealthPing.jsm");

 /**
 * Setup Telemetry logging. This function also gets called when loggin related
@ -788,6 +790,9 @@ var Impl = {
      // Stop any ping sending.
      await TelemetrySend.shutdown();

+      // Send latest data.
+      await TelemetryHealthPing.shutdown();
+
      await TelemetrySession.shutdown();

      // First wait for clients processing shutdown.
--- a/toolkit/components/telemetry/TelemetryHealthPing.jsm
+++ b/toolkit/components/telemetry/TelemetryHealthPing.jsm
@ -0,0 +1,248 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * This module collects data on send failures and other critical issues with Telemetry submissions.
+ */
+
+"use strict";
+
+this.EXPORTED_SYMBOLS = [
+  "TelemetryHealthPing",
+];
+
+const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components;
+
+Cu.import("resource://gre/modules/XPCOMUtils.jsm", this);
+
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryController", "resource://gre/modules/TelemetryController.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "setTimeout", "resource://gre/modules/Timer.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "clearTimeout", "resource://gre/modules/Timer.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryUtils", "resource://gre/modules/TelemetryUtils.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetrySend", "resource://gre/modules/TelemetrySend.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "Services", "resource://gre/modules/Services.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "Log", "resource://gre/modules/Log.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "Preferences", "resource://gre/modules/Preferences.jsm");
+
+const Utils = TelemetryUtils;
+
+const MS_IN_A_MINUTE = 60 * 1000;
+const IS_HEALTH_PING_ENABLED = Preferences.get(TelemetryUtils.Preferences.HealthPingEnabled, true);
+
+// Throttling interval for health pings: one hour.
+const SEND_TICK_DELAY = 60 * MS_IN_A_MINUTE;
+
+// Send only the top 10 discarded ping types to minimize the health ping size.
+const MAX_SEND_DISCARDED_PINGS = 10;
+
+const LOGGER_NAME = "Toolkit.Telemetry";
+const LOGGER_PREFIX = "TelemetryHealthPing::";
+
+var Policy = {
+  setSchedulerTickTimeout: (callback, delayMs) => setTimeout(callback, delayMs),
+  clearSchedulerTickTimeout: (id) => clearTimeout(id)
+};
+
+this.TelemetryHealthPing = {
+  Reason: Object.freeze({
+    IMMEDIATE: "immediate", // Ping was sent immediately after recording with no delay.
+    DELAYED: "delayed",     // Recorded data was sent after a delay.
+    SHUT_DOWN: "shutdown",  // Recorded data was sent on shutdown.
+  }),
+
+  FailureType: Object.freeze({
+    DISCARDED_FOR_SIZE: "pingDiscardedForSize",
+    SEND_FAILURE: "sendFailure",
+  }),
+
+  OsInfo: Object.freeze({
+    "name": Services.appinfo.OS,
+    "version": Services.sysinfo.get("kernel_version") || Services.sysinfo.get("version")
+  }),
+
+  HEALTH_PING_TYPE: "health",
+
+  _logger: null,
+
+  // The health ping is sent every SEND_TICK_DELAY.
+  // Initialize this so that first failures are sent immediately.
+  _lastSendTime: -SEND_TICK_DELAY,
+
+  /**
+   * This stores reported send failures with the following structure:
+   * {
+   *  type1: {
+   *    subtype1: value,
+   *    ...
+   *    subtypeN: value
+   *  },
+   *  ...
+   * }
+   */
+  _failures: {},
+  _timeoutId: null,
+
+  /**
+   * Record a failure to send a ping out. It is counted under FailureType.SEND_FAILURE.
+   * @param {String} failureType The type of failure (e.g. "timeout", ...).
+   * @returns {Promise} Test-only, resolved when the ping is stored or sent, or right away
+   *                    if the submission is deferred to the timer.
+   */
+  recordSendFailure(failureType) {
+    return this._addToFailure(this.FailureType.SEND_FAILURE, failureType);
+  },
+
+  /**
+   * Record that a ping was discarded and its type. It is counted under
+   * FailureType.DISCARDED_FOR_SIZE.
+   * @param {String} pingType The type of discarded ping (e.g. "main", ...).
+   * @returns {Promise} Test-only, resolved when the ping is stored or sent, or right away
+   *                    if the submission is deferred to the timer.
+   */
+  recordDiscardedPing(pingType) {
+    return this._addToFailure(this.FailureType.DISCARDED_FOR_SIZE, pingType);
+  },
+
+  /**
+   * Assemble payload.
+   * @param {String} reason A string indicating the triggering reason (e.g. "immediate", "delayed", "shutdown").
+   * @returns {Object} The assembled payload.
+   */
+  _assemblePayload(reason) {
+    this._log.trace("_assemblePayload()");
+    let payload = {
+      os: this.OsInfo,
+      reason
+    };
+
+    for (let key of Object.keys(this._failures)) {
+      if (key === this.FailureType.DISCARDED_FOR_SIZE) {
+        payload[key] = this._getTopDiscardFailures(this._failures[key]);
+      } else {
+        payload[key] = this._failures[key];
+      }
+    }
+
+    return payload;
+  },
+
+  /**
+   * Sort input dictionary descending by value.
+   * @param {Object} failures A dictionary of failures subtype and count.
+   * @returns {Object} Sorted failures by value.
+   */
+  _getTopDiscardFailures(failures) {
+    this._log.trace("_getTopDiscardFailures()");
+    let sortedItems = Object.entries(failures).sort((first, second) => {
+      return second[1] - first[1];
+    });
+
+    let result = {};
+    sortedItems.slice(0, MAX_SEND_DISCARDED_PINGS).forEach(([key, value]) => {
+      result[key] = value;
+    });
+
+    return result;
+  },
+
+  /**
+   * Assemble the failure information and submit it.
+   * @param {String} reason A string indicating the triggering reason (e.g. "immediate", "delayed", "shutdown").
+   * @returns {Promise} Test-only promise that resolves when the ping was stored or sent (if any).
+   */
+  _submitPing(reason) {
+    if (!IS_HEALTH_PING_ENABLED || !this._hasDataToSend()) {
+      return Promise.resolve();
+    }
+
+    this._log.trace("_submitPing(" + reason + ")");
+    let payload = this._assemblePayload(reason);
+    this._clearData();
+    this._lastSendTime = Utils.monotonicNow();
+
+    return new Promise(r =>
+      // If we submit the health ping immediately, the send task would be triggered again
+      // before discarding oversized pings from the queue.
+      // To work around this, we send the ping on the next tick.
+      Services.tm.dispatchToMainThread(() => r(
+        TelemetryController
+          .submitExternalPing(this.HEALTH_PING_TYPE, payload, {addClientId: true}))));
+  },
+
+  /**
+   * Accumulate failure information and trigger a ping immediately or on timeout.
+   * @param {String} failureType The type of failure (e.g. "timeout", ...).
+   * @param {String} failureSubType The subtype of failure (e.g. ping type, ...).
+   * @returns {Promise} Test-only, resolved when the ping is stored or sent.
+   */
+  _addToFailure(failureType, failureSubType) {
+    this._log.trace("_addToFailure() - with type and subtype: " + failureType + " : " + failureSubType);
+
+    if (!(failureType in this._failures)) {
+      this._failures[failureType] = {};
+    }
+
+    let current = this._failures[failureType][failureSubType] || 0;
+    this._failures[failureType][failureSubType] = current + 1;
+
+    const now = Utils.monotonicNow();
+    if ((now - this._lastSendTime) >= SEND_TICK_DELAY) {
+      return this._submitPing(this.Reason.IMMEDIATE);
+    }
+
+    let submissionDelay = SEND_TICK_DELAY - now - this._lastSendTime;
+    this._timeoutId =
+      Policy.setSchedulerTickTimeout(() => TelemetryHealthPing._submitPing(this.Reason.DELAYED), submissionDelay);
+    return Promise.resolve();
+  },
+
+  /**
+   * @returns {boolean} Check the availability of recorded failures data.
+   */
+  _hasDataToSend() {
+    return Object.keys(this._failures).length !== 0;
+  },
+
+  /**
+   * Clear recorded failures data.
+   */
+  _clearData() {
+    this._log.trace("_clearData()");
+    // A fresh object is assigned rather than emptying the old one: a payload assembled
+    // earlier may still reference the old per-type dictionaries.
+    this._failures = {};
+  },
+
+  /**
+   * Clear and reset timeout.
+   */
+  _resetTimeout() {
+    if (this._timeoutId) {
+      Policy.clearSchedulerTickTimeout(this._timeoutId);
+      this._timeoutId = null;
+    }
+  },
+
+  /**
+   * Submit latest ping on shutdown: cancel any pending delayed submission, then submit
+   * whatever has been recorded so far with the "shutdown" reason.
+   * @returns {Promise} Test-only, resolved when the ping is stored or sent.
+   */
+  shutdown() {
+    this._log.trace("shutdown()");
+    this._resetTimeout();
+    return this._submitPing(this.Reason.SHUT_DOWN);
+  },
+
+  /**
+   * Test-only, restore to initial state: the next recorded failure is sent immediately,
+   * no failure is recorded and no delayed submission is pending.
+   */
+  testReset() {
+    this._lastSendTime = -SEND_TICK_DELAY;
+    this._clearData();
+    this._resetTimeout();
+  },
+
+  get _log() {
+    if (!this._logger) {
+      this._logger = Log.repository.getLoggerWithMessagePrefix(LOGGER_NAME, LOGGER_PREFIX + "::");
+    }
+
+    return this._logger;
+  },
+};
--- a/toolkit/components/telemetry/TelemetrySend.jsm
+++ b/toolkit/components/telemetry/TelemetrySend.jsm
@ -39,6 +39,9 @@ XPCOMUtils.defineLazyModuleGetter(this, "OS",
 XPCOMUtils.defineLazyServiceGetter(this, "Telemetry",
                                   "@mozilla.org/base/telemetry;1",
                                   "nsITelemetry");
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryHealthPing",
+                                  "resource://gre/modules/TelemetryHealthPing.jsm");
+

 const Utils = TelemetryUtils;

@ -95,15 +98,6 @@ const XHR_ERROR_TYPE = [
  "eRedirect",
 ];

-function monotonicNow() {
-  try {
-    return Telemetry.msSinceProcessStart();
-  } catch (ex) {
-    // If this fails fall back to the (non-monotonic) Date value.
-    return Date.now();
-  }
-}
-
 /**
 * This is a policy object used to override behavior within this module.
 * Tests override properties on this object to allow for control of behavior
@ -186,6 +180,14 @@ this.TelemetrySend = {
    return TelemetrySendImpl.pendingPingCount;
  },

+  /**
+   * Test-only, override the timeout applied to ping submission requests.
+   * @param {Number} timeoutInMS The request timeout, in milliseconds.
+   */
+  testSetTimeoutForPingSubmit(timeoutInMS) {
+    TelemetrySendImpl._pingSubmissionTimeout = timeoutInMS;
+  },
+
+  /**
+   * Test-only, restore the ping submission timeout to PING_SUBMIT_TIMEOUT_MS.
+   */
+  testResetTimeOutToDefault() {
+    TelemetrySendImpl._pingSubmissionTimeout = PING_SUBMIT_TIMEOUT_MS;
+  },
+
  /**
   * Partial setup that runs immediately at startup. This currently triggers
   * the crash report annotations.
@ -592,6 +594,8 @@ var TelemetrySendImpl = {
  // Count of pending pings that were overdue.
  _overduePingCount: 0,

+  _pingSubmissionTimeout: PING_SUBMIT_TIMEOUT_MS,
+
  OBSERVER_TOPICS: [
    TOPIC_IDLE_DAILY,
    TOPIC_QUIT_APPLICATION_GRANTED,
@ -1007,7 +1011,7 @@ var TelemetrySendImpl = {
    let hsend = Telemetry.getHistogramById(sendId);
    let hsuccess = Telemetry.getHistogramById("TELEMETRY_SUCCESS");

-    hsend.add(monotonicNow() - startTime);
+    hsend.add(Utils.monotonicNow() - startTime);
    hsuccess.add(success);

    if (!success) {
@ -1076,7 +1080,7 @@ var TelemetrySendImpl = {

    let request = new ServiceRequest();
    request.mozBackgroundRequest = true;
-    request.timeout = PING_SUBMIT_TIMEOUT_MS;
+    request.timeout = this._pingSubmissionTimeout;

    request.open("POST", url, true);
    request.overrideMimeType("text/plain");
@ -1088,7 +1092,7 @@ var TelemetrySendImpl = {
    // Prevent the request channel from running though URLClassifier (bug 1296802)
    request.channel.loadFlags &= ~Ci.nsIChannel.LOAD_CLASSIFY_URI;

-    const monotonicStartTime = monotonicNow();
+    const monotonicStartTime = Utils.monotonicNow();
    let deferred = PromiseUtils.defer();

    let onRequestFinished = (success, event) => {
@ -1118,6 +1122,8 @@ var TelemetrySendImpl = {
      if (failure === "error") {
        failure = XHR_ERROR_TYPE[request.errorCode];
      }
+
+      TelemetryHealthPing.recordSendFailure(failure);
      Telemetry.getHistogramById("TELEMETRY_SEND_FAILURE_TYPE").add(failure);

      this._log.error("_doPing - error making request to " + url + ": " + failure);
@ -1163,10 +1169,10 @@ var TelemetrySendImpl = {
    let converter = Cc["@mozilla.org/intl/scriptableunicodeconverter"]
                    .createInstance(Ci.nsIScriptableUnicodeConverter);
    converter.charset = "UTF-8";
-    let startTime = monotonicNow();
+    let startTime = Utils.monotonicNow();
    let utf8Payload = converter.ConvertFromUnicode(JSON.stringify(networkPayload));
    utf8Payload += converter.Finish();
-    Telemetry.getHistogramById("TELEMETRY_STRINGIFY").add(monotonicNow() - startTime);
+    Telemetry.getHistogramById("TELEMETRY_STRINGIFY").add(Utils.monotonicNow() - startTime);

    // Check the size and drop pings which are too big.
    const pingSizeBytes = utf8Payload.length;
@ -1177,16 +1183,18 @@ var TelemetrySendImpl = {
               .add(Math.floor(pingSizeBytes / 1024 / 1024));
      // We don't need to call |request.abort()| as it was not sent yet.
      this._pendingPingRequests.delete(id);
+
+      TelemetryHealthPing.recordDiscardedPing(ping.type);
      return TelemetryStorage.removePendingPing(id);
    }

    let payloadStream = Cc["@mozilla.org/io/string-input-stream;1"]
                        .createInstance(Ci.nsIStringInputStream);
-    startTime = monotonicNow();
+    startTime = Utils.monotonicNow();
    payloadStream.data = gzipCompressString(utf8Payload);

    const compressedPingSizeKB = Math.floor(payloadStream.data.length / 1024);
-    Telemetry.getHistogramById("TELEMETRY_COMPRESS").add(monotonicNow() - startTime);
+    Telemetry.getHistogramById("TELEMETRY_COMPRESS").add(Utils.monotonicNow() - startTime);
    request.send(payloadStream);

    return deferred.promise;
--- a/toolkit/components/telemetry/TelemetryUtils.jsm
+++ b/toolkit/components/telemetry/TelemetryUtils.jsm
@ -11,6 +11,7 @@ this.EXPORTED_SYMBOLS = [
 const {classes: Cc, interfaces: Ci, results: Cr, utils: Cu} = Components;

 Cu.import("resource://gre/modules/Preferences.jsm", this);
+Cu.import("resource://gre/modules/Services.jsm", this);

 const MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000;

@ -29,6 +30,7 @@ this.TelemetryUtils = {
    ArchiveEnabled: "toolkit.telemetry.archive.enabled",
    CachedClientId: "toolkit.telemetry.cachedClientID",
    FirstRun: "toolkit.telemetry.reportingpolicy.firstRun",
+    HealthPingEnabled: "toolkit.telemetry.healthping.enabled",
    OverrideOfficialCheck: "toolkit.telemetry.send.overrideOfficialCheck",
    Server: "toolkit.telemetry.server",
    ShutdownPingSender: "toolkit.telemetry.shutdownPingSender.enabled",
@ -185,4 +187,16 @@ this.TelemetryUtils = {
      + sign(tzOffset) + padNumber(Math.floor(Math.abs(tzOffset / 60)), 2)
      + ":" + padNumber(Math.abs(tzOffset % 60), 2);
  },
+
+  /**
+   * @returns {number} The monotonic time since the process start, or the
+   * (non-monotonic) Date.now() value if reading it throws.
+   */
+  monotonicNow() {
+    try {
+      return Services.telemetry.msSinceProcessStart();
+    } catch (ex) {
+      // If this fails, fall back to the (non-monotonic) Date value.
+      return Date.now();
+    }
+  }
 };
--- a/toolkit/components/telemetry/docs/internals/preferences.rst
+++ b/toolkit/components/telemetry/docs/internals/preferences.rst
@ -141,3 +141,7 @@ The following prefs are for testing purpose only.
 ``toolkit.telemetry.send.overrideOfficialCheck``

  If true, allows sending pings on unofficial builds. Requires a restart.
+
+``toolkit.telemetry.healthping.enabled``
+
+  If false, sending health pings is disabled. Defaults to true.
--- a/toolkit/components/telemetry/moz.build
+++ b/toolkit/components/telemetry/moz.build
@ -84,6 +84,7 @@ EXTRA_JS_MODULES += [
    'TelemetryArchive.jsm',
    'TelemetryController.jsm',
    'TelemetryEnvironment.jsm',
+    'TelemetryHealthPing.jsm',
    'TelemetryLog.jsm',
    'TelemetryModules.jsm',
    'TelemetryReportingPolicy.jsm',
--- a/toolkit/components/telemetry/tests/unit/head.js
+++ b/toolkit/components/telemetry/tests/unit/head.js
@ -301,6 +301,10 @@ function setEmptyPrefWatchlist() {
  });
 }

+function histogramValueCount(histogramSnapshot) {
+  return histogramSnapshot.counts.reduce((a, b) => a + b);
+}
+
 if (runningInParent) {
  // Set logging preferences for all the tests.
  Services.prefs.setCharPref("toolkit.telemetry.log.level", "Trace");
@ -319,7 +323,8 @@ if (runningInParent) {
  // Ensure browser experiments are also disabled, to avoid network activity
  // when toggling PREF_ENABLED.
  Services.prefs.setBoolPref("experiments.enabled", false);
-
+  // Turn off Health Ping submission.
+  Services.prefs.setBoolPref(TelemetryUtils.Preferences.HealthPingEnabled, false);

  fakePingSendTimer((callback, timeout) => {
    Services.tm.dispatchToMainThread(() => callback());
--- a/toolkit/components/telemetry/tests/unit/test_TelemetryHealthPing.js
+++ b/toolkit/components/telemetry/tests/unit/test_TelemetryHealthPing.js
@ -0,0 +1,184 @@
+/* Any copyright is dedicated to the Public Domain.
+ http://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+// This tests the public Telemetry API for submitting Health pings.
+
+"use strict";
+
+Cu.import("resource://gre/modules/TelemetryController.jsm", this);
+Cu.import("resource://gre/modules/TelemetryStorage.jsm", this);
+Cu.import("resource://gre/modules/TelemetryUtils.jsm", this);
+Cu.import("resource://gre/modules/Preferences.jsm", this);
+Cu.import("resource://gre/modules/XPCOMUtils.jsm", this);
+
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryHealthPing",
+                                  "resource://gre/modules/TelemetryHealthPing.jsm");
+
+function checkHealthPingStructure(ping, expectedFailuresDict) {
+  let payload = ping.payload;
+  Assert.equal(ping.type, TelemetryHealthPing.HEALTH_PING_TYPE, "Should have recorded a health ping.");
+
+  for (let [key, value] of Object.entries(expectedFailuresDict)) {
+    Assert.deepEqual(payload[key], value, "Should have recorded correct entry with key: " + key);
+  }
+}
+
+function fakeHealthSchedulerTimer(set, clear) {
+  let telemetryHealthPing = Cu.import("resource://gre/modules/TelemetryHealthPing.jsm", {});
+  telemetryHealthPing.Policy.setSchedulerTickTimeout = set;
+  telemetryHealthPing.Policy.clearSchedulerTickTimeout = clear;
+}
+
+add_task(async function setup() {
+  // Trigger a proper telemetry init.
+  do_get_profile(true);
+  // Make sure we don't generate unexpected pings due to pref changes.
+  await setEmptyPrefWatchlist();
+  Services.prefs.setBoolPref("toolkit.telemetry.enabled", true);
+  // head.js turns health ping submission off; turn it back on for this test.
+  Preferences.set(TelemetryUtils.Preferences.HealthPingEnabled, true);
+
+  await TelemetryController.testSetup();
+  // Send the pings to the local test server.
+  PingServer.start();
+  TelemetrySend.setServer("http://localhost:" + PingServer.port);
+});
+
+add_task(async function test_sendImmediately() {
+  PingServer.clearRequests();
+  TelemetryHealthPing.testReset();
+
+  // After a reset, the first recorded failure must be submitted without delay.
+  await TelemetryHealthPing.recordSendFailure("testProblem");
+  let ping = await PingServer.promiseNextPing();
+  checkHealthPingStructure(ping, {
+    [TelemetryHealthPing.FailureType.SEND_FAILURE]: {
+      "testProblem": 1
+    },
+    "os": TelemetryHealthPing.OsInfo,
+    "reason": TelemetryHealthPing.Reason.IMMEDIATE
+  });
+});
+
+add_task(async function test_sendOnDelay() {
+  PingServer.clearRequests();
+  TelemetryHealthPing.testReset();
+
+  // This first failure should immediately trigger a ping. After this, subsequent failures should be throttled.
+  await TelemetryHealthPing.recordSendFailure("testFailure");
+  let testPing = await PingServer.promiseNextPing();
+  Assert.equal(testPing.type, TelemetryHealthPing.HEALTH_PING_TYPE, "Should have recorded a health ping.");
+
+  // Capture the delayed callback instead of letting a real timer run.
+  let pingSubmissionCallBack = null;
+  fakeHealthSchedulerTimer((callBack) => pingSubmissionCallBack = callBack, () => {
+  });
+
+  // Record two failures, the health ping must not be sent now.
+  await TelemetryHealthPing.recordSendFailure("testFailure");
+  await TelemetryHealthPing.recordSendFailure("testFailure");
+
+  // Fire the captured callback by hand to send the delayed health ping.
+  await pingSubmissionCallBack();
+
+  let ping = await PingServer.promiseNextPing();
+  checkHealthPingStructure(ping, {
+    [TelemetryHealthPing.FailureType.SEND_FAILURE]: {
+      "testFailure": 2
+    },
+    "os": TelemetryHealthPing.OsInfo,
+    "reason": TelemetryHealthPing.Reason.DELAYED
+  });
+});
+
+add_task(async function test_sendOverSizedPing() {
+  TelemetryHealthPing.testReset();
+  PingServer.clearRequests();
+  let OVER_SIZED_PING_TYPE = "over-sized-ping";
+  // NOTE(review): presumably 2 MiB of data exceeds the ping size limit of TelemetrySend - confirm.
+  let overSizedData = generateRandomString(2 * 1024 * 1024);
+
+  await TelemetryController.submitExternalPing(OVER_SIZED_PING_TYPE, {"data": overSizedData});
+  // The ping arriving at the server is expected to be the health ping reporting the discard.
+  let ping = await PingServer.promiseNextPing();
+
+  checkHealthPingStructure(ping, {
+    [TelemetryHealthPing.FailureType.DISCARDED_FOR_SIZE]: {
+      [OVER_SIZED_PING_TYPE]: 1
+    },
+    "os": TelemetryHealthPing.OsInfo,
+    "reason": TelemetryHealthPing.Reason.IMMEDIATE
+  });
+});
+
+add_task(async function test_sendOnTimeout() {
+  TelemetryHealthPing.testReset();
+  PingServer.clearRequests();
+  let PING_TYPE = "ping-on-timeout";
+
+  // Set up a tiny (2 ms) ping submission timeout so that the request always fails with a timeout error.
+  TelemetrySend.testSetTimeoutForPingSubmit(2);
+
+  // Restore the default timeout after receiving the first ping, so that the health ping can be sent.
+  PingServer.registerPingHandler((request, result) => {
+    PingServer.resetPingHandler();
+    TelemetrySend.testResetTimeOutToDefault();
+  });
+
+  await TelemetryController.submitExternalPing(PING_TYPE, {});
+  let ping = await PingServer.promiseNextPing();
+  checkHealthPingStructure(ping, {
+    [TelemetryHealthPing.FailureType.SEND_FAILURE]: {
+      "timeout": 1
+    },
+    "os": TelemetryHealthPing.OsInfo,
+    "reason": TelemetryHealthPing.Reason.IMMEDIATE
+  });
+
+  // Clear pending pings to avoid resending pings which failed with a timeout error.
+  await TelemetryStorage.testClearPendingPings();
+});
+
+add_task(async function test_sendOnlyTopTenDiscardedPings() {
+  TelemetryHealthPing.testReset();
+  PingServer.clearRequests();
+  let PING_TYPE = "sort-discarded";
+
+  // This first failure should immediately trigger a ping. After this, subsequent failures should be throttled.
+  await TelemetryHealthPing.recordSendFailure("testFailure");
+  let testPing = await PingServer.promiseNextPing();
+  Assert.equal(testPing.type, TelemetryHealthPing.HEALTH_PING_TYPE, "Should have recorded a health ping.");
+
+
+  // Retrieve delayed call back.
+  let pingSubmissionCallBack = null;
+  fakeHealthSchedulerTimer((callBack) => pingSubmissionCallBack = callBack, () => {
+  });
+
+  // Add failures
+  for (let i = 1; i < 12; i++) {
+    for (let j = 1; j < i; j++) {
+      await TelemetryHealthPing.recordDiscardedPing(PING_TYPE + i);
+    }
+  }
+
+  await pingSubmissionCallBack();
+  let ping = await PingServer.promiseNextPing();
+
+  checkHealthPingStructure(ping, {
+    "os": TelemetryHealthPing.OsInfo,
+    "reason": TelemetryHealthPing.Reason.DELAYED,
+    [TelemetryHealthPing.FailureType.DISCARDED_FOR_SIZE]: {
+      [PING_TYPE + 11]: 10,
+      [PING_TYPE + 10]: 9,
+      [PING_TYPE + 9]: 8,
+      [PING_TYPE + 8]: 7,
+      [PING_TYPE + 7]: 6,
+      [PING_TYPE + 6]: 5,
+      [PING_TYPE + 5]: 4,
+      [PING_TYPE + 4]: 3,
+      [PING_TYPE + 3]: 2,
+      [PING_TYPE + 2]: 1
+    }
+  });
+});
+
+add_task(async function cleanup() {
+  // Shut down the local ping server started in setup().
+  await PingServer.stop();
+});
--- a/toolkit/components/telemetry/tests/unit/test_TelemetrySend.js
+++ b/toolkit/components/telemetry/tests/unit/test_TelemetrySend.js
@ -15,6 +15,10 @@ Cu.import("resource://gre/modules/TelemetryUtils.jsm", this);
 Cu.import("resource://gre/modules/Services.jsm", this);
 Cu.import("resource://gre/modules/Preferences.jsm", this);
 Cu.import("resource://gre/modules/osfile.jsm", this);
+Cu.import("resource://gre/modules/XPCOMUtils.jsm", this);
+
+XPCOMUtils.defineLazyModuleGetter(this, "TelemetryHealthPing",
+  "resource://gre/modules/TelemetryHealthPing.jsm");

 const MS_IN_A_MINUTE = 60 * 1000;

@ -79,6 +83,7 @@ add_task(async function test_setup() {
  // Make sure we don't generate unexpected pings due to pref changes.
  await setEmptyPrefWatchlist();
  Services.prefs.setBoolPref(TelemetryUtils.Preferences.TelemetryEnabled, true);
+  Preferences.set(TelemetryUtils.Preferences.HealthPingEnabled, true);
 });

 // Test the ping sending logic.
@ -326,6 +331,7 @@ add_task(async function test_discardBigPings() {
  // Submit a ping of a normal size and check that we don't count it in the histogram.
  await TelemetryController.submitExternalPing(TEST_PING_TYPE, { test: "test" });
  await TelemetrySend.testWaitOnOutgoingPings();
+  await PingServer.promiseNextPing();

  Assert.equal(histSizeExceeded.snapshot().sum, 0, "Telemetry must report no oversized ping submitted.");
  Assert.equal(histDiscardedSize.snapshot().sum, 0, "Telemetry must report no oversized pings.");
@ -335,15 +341,21 @@ add_task(async function test_discardBigPings() {
  Assert.equal(histogramValueCount(histSendTimeFail.snapshot()), 0, "Should not have recorded send failure time.");

  // Submit an oversized ping and check that it gets discarded.
+  TelemetryHealthPing.testReset();
  await TelemetryController.submitExternalPing(TEST_PING_TYPE, OVERSIZED_PAYLOAD);
-  await TelemetrySend.testWaitOnOutgoingPings();
+  let ping = await PingServer.promiseNextPing();

  Assert.equal(histSizeExceeded.snapshot().sum, 1, "Telemetry must report 1 oversized ping submitted.");
  Assert.equal(histDiscardedSize.snapshot().counts[2], 1, "Telemetry must report a 2MB, oversized, ping submitted.");
-  Assert.deepEqual(histSuccess.snapshot().counts, [0, 1, 0], "Should have recorded sending success.");
-  Assert.equal(histogramValueCount(histSendTimeSuccess.snapshot()), 1, "Should have recorded send success time.");
+  Assert.deepEqual(histSuccess.snapshot().counts, [0, 2, 0], "Should have recorded sending success.");
+  Assert.equal(histogramValueCount(histSendTimeSuccess.snapshot()), 2, "Should have recorded send success time.");
  Assert.greater(histSendTimeSuccess.snapshot().sum, 0, "Should have recorded send success time.");
  Assert.equal(histogramValueCount(histSendTimeFail.snapshot()), 0, "Should not have recorded send failure time.");
+
+  Assert.equal(ping.type, TelemetryHealthPing.HEALTH_PING_TYPE, "Should have received a health ping.");
+  Assert.deepEqual(ping.payload[TelemetryHealthPing.FailureType.DISCARDED_FOR_SIZE],
+    {[TEST_PING_TYPE]: 1}, "Should have recorded correct type of oversized ping.");
+  Assert.deepEqual(ping.payload["os"], TelemetryHealthPing.OsInfo, "Should have correct os info.")
 });

 add_task(async function test_evictedOnServerErrors() {
--- a/toolkit/components/telemetry/tests/unit/xpcshell.ini
+++ b/toolkit/components/telemetry/tests/unit/xpcshell.ini
@ -39,6 +39,7 @@ skip-if = os == "android"
 [test_TelemetryLockCount.js]
 [test_TelemetryLog.js]
 [test_TelemetryController.js]
 tags = addons
+[test_TelemetryHealthPing.js]
 [test_TelemetryController_idle.js]
 [test_TelemetryControllerShutdown.js]