gecko-dev/browser/components/doh/TRRPerformance.sys.mjs

396 строки
11 KiB
JavaScript

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* This module tests TRR performance by issuing DNS requests to TRRs and
* recording telemetry for the network time for each request.
*
* We test each TRR with 5 (by default) random subdomains of a canonical domain
* and also a list of "popular" domains (which the TRRs likely have cached).
*
* To ensure data integrity, we run the requests in an aggregator wrapper
* and collect all the results before sending telemetry. If we detect network
* loss, the results are discarded. A new run is triggered upon detection of
* usable network until a full set of results has been captured. We stop retrying
* after 5 attempts.
*/
// Turn on recording for the "security.doh.trrPerformance" event category,
// which is the category LookupAggregator.recordResults() records into.
Services.telemetry.setEventRecordingEnabled(
"security.doh.trrPerformance",
true
);
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
// Holder object for the lazily-resolved services and preference values below.
const lazy = {};
// Network link service; TRRRacer reads `isLinkUp` from it before starting.
XPCOMUtils.defineLazyServiceGetter(
lazy,
"gNetworkLinkService",
"@mozilla.org/network/network-link-service;1",
"nsINetworkLinkService"
);
// Captive portal service; TRRRacer compares its `state` against the
// LOCKED_PORTAL / UNLOCKED_PORTAL constants.
XPCOMUtils.defineLazyServiceGetter(
lazy,
"gCaptivePortalService",
"@mozilla.org/network/captive-portal-service;1",
"nsICaptivePortalService"
);
// The canonical domain whose subdomains we will be resolving.
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"kCanonicalDomain",
"doh-rollout.trrRace.canonicalDomain",
"firefox-dns-perf-test.net."
);
// The number of random subdomains to resolve per TRR.
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"kRepeats",
"doh-rollout.trrRace.randomSubdomainCount",
5
);
// The "popular" domains that we expect the TRRs to have cached. The value is
// parsed as a comma-separated list (entries are trimmed); when the value is
// false-y (e.g. the null default or an empty string) the built-in list below
// is used instead.
XPCOMUtils.defineLazyPreferenceGetter(
lazy,
"kPopularDomains",
"doh-rollout.trrRace.popularDomains",
null,
null,
val =>
val
? val.split(",").map(t => t.trim())
: [
"google.com.",
"youtube.com.",
"amazon.com.",
"facebook.com.",
"yahoo.com.",
]
);
/**
 * Builds a fresh, unique host name under the canonical test domain.
 *
 * @returns {string} `<uuid>.<lazy.kCanonicalDomain>`
 */
function getRandomSubdomain() {
  // The stringified UUID is wrapped in one character on each side (the
  // surrounding braces); drop both.
  const bracedUuid = Services.uuid.generateUUID().toString();
  const label = bracedUuid.slice(1, -1);
  const parentDomain = lazy.kCanonicalDomain;
  return `${label}.${parentDomain}`;
}
// A wrapper around async DNS lookups. The results are passed on to the supplied
// callback. The wrapper attempts the lookup 3 times before passing on a failure.
// If a false-y `domain` is supplied, a random subdomain will be used. Each retry
// will use a different random subdomain to ensure we bypass cached responses.
export class DNSLookup {
  /**
   * @param {?string} domain
   *   Host name to resolve. When false-y, every attempt resolves a newly
   *   generated random subdomain instead.
   * @param {*} trrServer
   *   Passed to Services.dns.newAdditionalInfo() to select the resolver.
   * @param {Function} callback
   *   Called as callback(request, record, status, usedDomain, retryCount)
   *   once the lookup succeeds or the attempts are used up.
   */
  constructor(domain, trrServer, callback) {
    this.retryCount = 0;
    this.callback = callback;
    this.trrServer = trrServer;
    this._domain = domain;
  }

  /**
   * Starts one resolution attempt and counts it in `retryCount`.
   *
   * NOTE: if starting the request throws, the error is only logged; no retry
   * is scheduled and `callback` is not invoked for this lookup.
   */
  doLookup() {
    this.retryCount += 1;
    try {
      // Pick the name per attempt so that a retry of a random lookup gets a
      // brand new subdomain.
      this.usedDomain = this._domain || getRandomSubdomain();
      const additionalInfo = Services.dns.newAdditionalInfo(
        this.trrServer,
        -1
      );
      Services.dns.asyncResolve(
        this.usedDomain,
        Ci.nsIDNSService.RESOLVE_TYPE_DEFAULT,
        Ci.nsIDNSService.RESOLVE_BYPASS_CACHE,
        additionalInfo,
        this,
        Services.tm.currentThread,
        {}
      );
    } catch (error) {
      console.error(error);
    }
  }

  /**
   * Listener entry point, invoked with the outcome of an attempt started by
   * doLookup(). Failed attempts are repeated until three have been made.
   */
  onLookupComplete(request, record, status) {
    const succeeded = Components.isSuccessCode(status);
    const attemptsLeft = this.retryCount < 3;
    if (succeeded || !attemptsLeft) {
      // Either we have an answer, or the third try failed as well: hand the
      // final status to the consumer.
      this.callback(request, record, status, this.usedDomain, this.retryCount);
      return;
    }
    this.doLookup();
  }
}
// DNSLookup instances are passed as the listener argument to
// Services.dns.asyncResolve(), so expose nsIDNSListener via QueryInterface.
DNSLookup.prototype.QueryInterface = ChromeUtils.generateQI(["nsIDNSListener"]);
// A wrapper around a single set of measurements. The required lookups are
// triggered and the results aggregated before telemetry is sent. If aborted,
// any aggregated results are discarded.
export class LookupAggregator {
  /**
   * @param {Function} onCompleteCallback
   *   Called with no arguments at the end of recordResults(), i.e. after all
   *   results have been recorded. Not called when the run was aborted.
   * @param {Array} trrList
   *   The TRRs to measure; each entry is passed to DNSLookup as its server.
   */
  constructor(onCompleteCallback, trrList) {
    this.onCompleteCallback = onCompleteCallback;
    this.trrList = trrList;
    this.aborted = false;
    this.networkUnstable = false;
    this.captivePortal = false;
    this._ran = false;
    this.domains = [];
    for (let i = 0; i < lazy.kRepeats; ++i) {
      // false-y domain will cause DNSLookup to generate a random one.
      this.domains.push(null);
    }
    this.domains.push(...lazy.kPopularDomains);
    this.totalLookups = this.trrList.length * this.domains.length;
    this.completedLookups = 0;
    this.results = [];
  }

  /**
   * Starts one DNSLookup per (TRR, domain) pair. Each lookup appends an entry
   * to `results`; once the last one reports back, recordResults() is called.
   * An aggregator can only be run once, and not after it has been aborted.
   */
  run() {
    // The abort flag is `aborted` (set by the constructor and abort()).
    if (this._ran || this.aborted) {
      console.error("Trying to re-run a LookupAggregator.");
      return;
    }
    this._ran = true;
    for (let trr of this.trrList) {
      for (let domain of this.domains) {
        new DNSLookup(
          domain,
          trr,
          (request, record, status, usedDomain, retryCount) => {
            this.results.push({
              domain: usedDomain,
              trr,
              status,
              // -1 marks a lookup that produced no record.
              time: record
                ? record.QueryInterface(Ci.nsIDNSAddrRecord)
                    .trrFetchDurationNetworkOnly
                : -1,
              retryCount,
            });
            this.completedLookups++;
            if (this.completedLookups == this.totalLookups) {
              this.recordResults();
            }
          }
        ).doLookup();
      }
    }
  }

  /** Marks this run as aborted; its results will not be recorded. */
  abort() {
    this.aborted = true;
  }

  /** Flags the results as having been collected on an unstable network. */
  markUnstableNetwork() {
    this.networkUnstable = true;
  }

  /** Flags the results as having been collected behind a captive portal. */
  markCaptivePortal() {
    this.captivePortal = true;
  }

  /**
   * Records one telemetry event per collected result and then invokes
   * onCompleteCallback. Does nothing if the run was aborted.
   */
  recordResults() {
    if (this.aborted) {
      return;
    }
    for (let { domain, trr, status, time, retryCount } of this.results) {
      // Only ever report domains we chose ourselves. On an unexpected domain
      // we stop here: remaining results are not recorded and
      // onCompleteCallback is not invoked.
      if (
        !(
          lazy.kPopularDomains.includes(domain) ||
          domain.includes(lazy.kCanonicalDomain)
        )
      ) {
        console.error("Expected known domain for reporting, got ", domain);
        return;
      }
      Services.telemetry.recordEvent(
        "security.doh.trrPerformance",
        "resolved",
        "record",
        "success",
        {
          domain,
          trr,
          status: status.toString(),
          time: time.toString(),
          retryCount: retryCount.toString(),
          networkUnstable: this.networkUnstable.toString(),
          captivePortal: this.captivePortal.toString(),
        }
      );
    }
    this.onCompleteCallback();
  }
}
// This class monitors the network and spawns a new LookupAggregator when ready.
// When the network goes down, an ongoing aggregator is aborted and a new one
// spawned next time we get a link, up to 5 times. On the fifth time, we just
// let the aggregator complete and mark it as tainted.
export class TRRRacer {
  /**
   * @param {Function} [onCompleteCallback]
   *   Optional; called with no arguments when a measurement run completes.
   * @param {Array} trrList
   *   The TRRs to race; handed to every LookupAggregator that gets created.
   */
  constructor(onCompleteCallback, trrList) {
    this._trrList = trrList;
    this._onCompleteCallback = onCompleteCallback;
    this._complete = false;
    this._retryCount = 0;
    this._aggregator = null;
  }

  /**
   * Starts a measurement right away if the link is up and we are not behind a
   * locked captive portal, then begins watching for network changes.
   */
  run() {
    if (lazy.gNetworkLinkService.isLinkUp) {
      const portalService = lazy.gCaptivePortalService;
      if (portalService.state != portalService.LOCKED_PORTAL) {
        this._runNewAggregator();
        if (portalService.state == portalService.UNLOCKED_PORTAL) {
          this._aggregator.markCaptivePortal();
        }
      }
    }
    for (const topic of [
      "ipc:network:captive-portal-set-state",
      "network:link-status-changed",
    ]) {
      Services.obs.addObserver(this, topic);
    }
  }

  /**
   * Completion handler given to each aggregator: stops observing the network,
   * marks the race as complete and notifies the owner, if any.
   */
  onComplete() {
    for (const topic of [
      "ipc:network:captive-portal-set-state",
      "network:link-status-changed",
    ]) {
      Services.obs.removeObserver(this, topic);
    }
    this._complete = true;
    if (this._onCompleteCallback) {
      this._onCompleteCallback();
    }
  }

  /**
   * @param {boolean} [returnRandomDefault=false]
   *   Forwarded to _getFastestTRRFromResults.
   * @returns The fastest TRR of the completed run.
   * @throws {Error} If the measurement has not completed yet.
   */
  getFastestTRR(returnRandomDefault = false) {
    if (this._complete) {
      return this._getFastestTRRFromResults(
        this._aggregator.results,
        returnRandomDefault
      );
    }
    throw new Error("getFastestTRR: Measurement still running.");
  }

  /*
   * Given an array of { trr, time }, returns the trr with smallest mean time.
   * Separate from getFastestTRR for easy unit-testing.
   *
   * @returns The TRR with the fastest average time.
   *          If returnRandomDefault is false-y, returns undefined if no valid
   *          times were present in the results. Otherwise, returns one of the
   *          present TRRs at random.
   */
  _getFastestTRRFromResults(results, returnRandomDefault = false) {
    // Bucket the samples per TRR: the successful timings plus a tally of
    // failed lookups (those are reported with a time of -1).
    const statsByTRR = new Map();
    for (const sample of results) {
      const { trr, time } = sample;
      let stats = statsByTRR.get(trr);
      if (!stats) {
        stats = { times: [], errors: 0 };
        statsByTRR.set(trr, stats);
      }
      if (time != -1) {
        stats.times.push(time);
      } else {
        stats.errors += 1;
      }
    }

    // Rank the TRRs by geometric mean, which is more forgiving of the odd
    // anomalously slow sample. Comparing the arithmetic means of the logs is
    // enough; there is no need to exponentiate back.
    const candidates = [...statsByTRR.keys()];
    let winner;
    let winnerScore = -1;
    for (const candidate of candidates) {
      const { times, errors } = statsByTRR.get(candidate);
      if (!times.length) {
        continue;
      }
      // A TRR that failed more than 30% of its lookups is disqualified.
      if (errors / (times.length + errors) > 0.3) {
        continue;
      }
      // log(t + 1) rather than log(t) so a 0ms sample doesn't yield -Infinity.
      let logSum = 0;
      for (const t of times) {
        logSum += Math.log(t + 1);
      }
      const score = logSum / times.length;
      if (winnerScore == -1 || score < winnerScore) {
        winnerScore = score;
        winner = candidate;
      }
    }

    if (returnRandomDefault && !winner) {
      winner = candidates[Math.floor(Math.random() * candidates.length)];
    }
    return winner;
  }

  /** Creates and starts a fresh aggregator, counting it as an attempt. */
  _runNewAggregator() {
    const aggregator = new LookupAggregator(
      () => this.onComplete(),
      this._trrList
    );
    this._aggregator = aggregator;
    aggregator.run();
    this._retryCount += 1;
  }

  // Observer entry point. Losing the link, or finding ourselves behind a
  // locked captive portal, aborts the current run while attempts remain; after
  // the fifth attempt the run is instead left to finish and flagged. Regaining
  // the link, or seeing the portal unlocked, starts a new run unless one is
  // still in progress.
  observe(subject, topic, data) {
    const current = this._aggregator;
    const attemptsRemain = this._retryCount < 5;
    const idle = !current || current.aborted;

    if (topic === "network:link-status-changed") {
      if (current && data == "down") {
        if (attemptsRemain) {
          current.abort();
        } else {
          current.markUnstableNetwork();
        }
        return;
      }
      if (data == "up" && idle) {
        this._runNewAggregator();
      }
      return;
    }

    if (topic === "ipc:network:captive-portal-set-state") {
      const portalService = lazy.gCaptivePortalService;
      if (current && portalService.state == portalService.LOCKED_PORTAL) {
        if (attemptsRemain) {
          current.abort();
        } else {
          current.markCaptivePortal();
        }
        return;
      }
      if (portalService.state == portalService.UNLOCKED_PORTAL && idle) {
        this._runNewAggregator();
      }
    }
  }
}