Bug 1694949 - Add event telemetry containing failure reasons that lead to TRR confirmation failure r=nhnt11,dragana,necko-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D106618
This commit is contained in:
Valentin Gosu 2021-03-03 14:32:24 +00:00
Родитель f3d447ac6a
Коммит a13ad605e4
4 изменённых файлов: 73 добавлений и 36 удалений

Просмотреть файл

@ -930,8 +930,7 @@ void TRR::ReportStatus(nsresult aStatusCode) {
// it as failed; otherwise it can cause the confirmation to fail.
if (UseDefaultServer() && aStatusCode != NS_ERROR_ABORT) {
// Bad content is still considered "okay" if the HTTP response is okay
gTRRService->TRRIsOkay(NS_SUCCEEDED(aStatusCode) ? TRRService::OKAY_NORMAL
: TRRService::OKAY_BAD);
gTRRService->TRRIsOkay(aStatusCode);
}
}

Просмотреть файл

@ -947,26 +947,72 @@ TRRService::Notify(nsITimer* aTimer) {
return NS_OK;
}
void TRRService::TRRIsOkay(enum TrrOkay aReason) {
// Encodes the outcome of a TRR request as a single character for telemetry.
//
// aLookupStatus  - result of interpreting the response; only consulted when
//                  the channel itself succeeded.
// aChannelStatus - result reported for the TRR channel.
//
// Returned codes:
//   '+'  channel OK and lookup OK
//   '-'  channel OK but lookup failed
//   'o'  channel failed and the IO service reports no connectivity
//   't'  channel failed with NS_ERROR_NET_TIMEOUT_EXTERNAL
//   'd'  channel failed with NS_ERROR_UNKNOWN_HOST
//   'n'  any other failure belonging to the network error module
//   '?'  everything else
static char StatusToChar(nsresult aLookupStatus, nsresult aChannelStatus) {
  // The channel succeeded, so the verdict depends solely on whether the
  // response could be turned into a usable answer.
  if (aChannelStatus == NS_OK) {
    if (aLookupStatus == NS_OK) {
      return '+';
    }
    return '-';
  }

  // From here on the channel failed. Check connectivity first so that being
  // offline takes precedence over the specific channel error code.
  nsCOMPtr<nsIIOService> ioService = do_GetIOService();
  if (ioService) {
    // Defaults to true so that an unfilled out-param is not treated as
    // being offline.
    bool hasConnectivity = true;
    ioService->GetConnectivity(&hasConnectivity);
    if (!hasConnectivity) {
      // Browser has no active network interfaces = is offline.
      return 'o';
    }
  }

  // TRR timeout expired.
  if (aChannelStatus == NS_ERROR_NET_TIMEOUT_EXTERNAL) {
    return 't';
  }

  // TRRServiceChannel failed due to an unresolved host.
  if (aChannelStatus == NS_ERROR_UNKNOWN_HOST) {
    return 'd';
  }

  // Remaining network-module errors share one code; any other kind of
  // failure is reported as unknown.
  const bool isNetworkError =
      NS_ERROR_GET_MODULE(aChannelStatus) == NS_ERROR_MODULE_NETWORK;
  return isNetworkError ? 'n' : '?';
}
void TRRService::TRRIsOkay(nsresult aChannelStatus) {
MOZ_ASSERT_IF(XRE_IsParentProcess(), NS_IsMainThread() || IsOnTRRThread());
MOZ_ASSERT_IF(XRE_IsSocketProcess(), NS_IsMainThread());
Telemetry::AccumulateCategoricalKeyed(
ProviderKey(), aReason == OKAY_NORMAL
ProviderKey(), NS_SUCCEEDED(aChannelStatus)
? Telemetry::LABELS_DNS_TRR_SUCCESS3::Fine
: (aReason == OKAY_TIMEOUT
: (aChannelStatus == NS_ERROR_NET_TIMEOUT_EXTERNAL
? Telemetry::LABELS_DNS_TRR_SUCCESS3::Timeout
: Telemetry::LABELS_DNS_TRR_SUCCESS3::Bad));
if (aReason == OKAY_NORMAL) {
if (NS_SUCCEEDED(aChannelStatus)) {
mConfirmation.mTRRFailures = 0;
} else if ((mMode == nsIDNSService::MODE_TRRFIRST) &&
(mConfirmation.mState == CONFIRM_OK)) {
// only count failures while in OK state
mConfirmation.mFailureReasons[mConfirmation.mTRRFailures %
ConfirmationContext::RESULTS_SIZE] =
StatusToChar(NS_OK, aChannelStatus);
uint32_t fails = ++mConfirmation.mTRRFailures;
if (fails >= StaticPrefs::network_trr_max_fails()) {
LOG(("TRRService goes FAILED after %u failures in a row\n", fails));
mConfirmation.mState = CONFIRM_FAILED;
mConfirmation.mTrigger.Assign("failed-lookups");
mConfirmation.mFailedLookups =
nsDependentCSubstring(mConfirmation.mFailureReasons,
fails % ConfirmationContext::RESULTS_SIZE);
// Fire off a timer and start re-trying the NS domain again
NS_NewTimerWithCallback(getter_AddRefs(mConfirmation.mTimer), this,
mConfirmation.mRetryInterval,
@ -985,6 +1031,7 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
mFirstRequestTime = TimeStamp();
mContextChangeReason.Assign(aReason);
mTrigger.Truncate();
mFailedLookups.Truncate();
mRetryInterval = StaticPrefs::network_trr_retry_timeout_ms();
};
@ -1038,6 +1085,11 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
nsPrintfCString("%i", mCaptivePortalStatus)},
});
if (mTrigger.Equals("failed-lookups"_ns)) {
extra.ref().AppendElement(
Telemetry::EventExtraEntry{"failedLookups"_ns, mFailedLookups});
}
ConfirmationState state = mState;
Telemetry::RecordEvent(eventType, mozilla::Some(nsPrintfCString("%u", state)),
extra);
@ -1047,33 +1099,8 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
// Records the outcome of one confirmation request: a one-character status code
// is stored in mResults at slot (mAttemptCount % RESULTS_SIZE), after which
// mAttemptCount is incremented.
void TRRService::ConfirmationContext::RequestCompleted(
nsresult aLookupStatus, nsresult aChannelStatus) {
// Maps the captured (lookup status, channel status) pair to a status character.
auto statusToChar = [aLookupStatus, aChannelStatus]() -> char {
if (aChannelStatus == NS_OK) {
// Return + if confirmation was OK, or - if confirmation failed
return aLookupStatus == NS_OK ? '+' : '-';
}
switch (aChannelStatus) {
case NS_ERROR_NET_TIMEOUT_EXTERNAL:
// TRR timeout expired
return 't';
case NS_ERROR_UNKNOWN_HOST:
// TRRServiceChannel failed due to an unresolved host
return 'd';
default:
break;
}
// The error is a network error
if (NS_ERROR_GET_MODULE(aChannelStatus) == NS_ERROR_MODULE_NETWORK) {
return 'n';
}
// Some other kind of failure.
return '?';
};
// NOTE(review): both assignments below write the same slot, so the value from
// StatusToChar() is the one that remains. Presumably the lambda-based line is
// the pre-change version shown by this diff view - confirm against the real
// file.
mResults[mAttemptCount % RESULTS_SIZE] = statusToChar();
mResults[mAttemptCount % RESULTS_SIZE] =
StatusToChar(aLookupStatus, aChannelStatus);
mAttemptCount++;
}

Просмотреть файл

@ -62,8 +62,7 @@ class TRRService : public TRRServiceBase,
bool IsExcludedFromTRR(const nsACString& aHost);
bool MaybeBootstrap(const nsACString& possible, nsACString& result);
enum TrrOkay { OKAY_NORMAL = 0, OKAY_TIMEOUT = 1, OKAY_BAD = 2 };
void TRRIsOkay(enum TrrOkay aReason);
void TRRIsOkay(nsresult aChannelStatus);
bool ParentalControlEnabled() const { return mParentalControlEnabled; }
nsresult DispatchTRRRequest(TRR* aTrrRequest);
@ -146,9 +145,9 @@ class TRRService : public TRRServiceBase,
};
class ConfirmationContext {
public:
static const size_t RESULTS_SIZE = 32;
public:
Atomic<ConfirmationState, Relaxed> mState;
RefPtr<TRR> mTask;
nsCOMPtr<nsITimer> mTimer;
@ -156,6 +155,10 @@ class TRRService : public TRRServiceBase,
// The number of TRR requests that failed in a row.
Atomic<uint32_t, Relaxed> mTRRFailures;
// This buffer holds consecutive TRR failures reported by calling
// TRRIsOkay(). It is only meant for reporting event telemetry.
char mFailureReasons[RESULTS_SIZE] = {0};
// The number of confirmation retries.
uint32_t mAttemptCount = 0;
@ -177,6 +180,10 @@ class TRRService : public TRRServiceBase,
// What triggered the confirmation
nsCString mTrigger;
// String representation of consecutive failed lookups that triggered
// confirmation.
nsCString mFailedLookups;
// Called when a confirmation completes successfully or when the
// confirmation context changes.
void RecordEvent(const char* aReason);

Просмотреть файл

@ -2111,6 +2111,7 @@ network.dns:
objects: ["context"]
bug_numbers:
- 1691408
- 1694949
description: >
This telemetry records the status of the TRR confirmation.
The value of the event is one of:
@ -2138,6 +2139,9 @@ network.dns:
String representation of the last 32 confirmation results.
Example: nnnnnnttttttttt indicates a number of network (n) failures
followed by timeouts (t).
failedLookups: >
When the trigger is failed-lookups, this contains the string
representation of the failures that triggered the confirmation.
networkID: >
The network ID for the recorded confirmation attempts
captivePortal: >