Bug 1694949 - Add event telemetry containing failure reasons that lead to TRR confirmation failure r=nhnt11,dragana,necko-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D106618
This commit is contained in:
Valentin Gosu 2021-03-03 14:32:24 +00:00
Родитель f3d447ac6a
Коммит a13ad605e4
4 изменённых файлов: 73 добавлений и 36 удалений

Просмотреть файл

@ -930,8 +930,7 @@ void TRR::ReportStatus(nsresult aStatusCode) {
// it as failed; otherwise it can cause the confirmation to fail. // it as failed; otherwise it can cause the confirmation to fail.
if (UseDefaultServer() && aStatusCode != NS_ERROR_ABORT) { if (UseDefaultServer() && aStatusCode != NS_ERROR_ABORT) {
// Bad content is still considered "okay" if the HTTP response is okay // Bad content is still considered "okay" if the HTTP response is okay
gTRRService->TRRIsOkay(NS_SUCCEEDED(aStatusCode) ? TRRService::OKAY_NORMAL gTRRService->TRRIsOkay(aStatusCode);
: TRRService::OKAY_BAD);
} }
} }

Просмотреть файл

@ -947,26 +947,72 @@ TRRService::Notify(nsITimer* aTimer) {
return NS_OK; return NS_OK;
} }
void TRRService::TRRIsOkay(enum TrrOkay aReason) { static char StatusToChar(nsresult aLookupStatus, nsresult aChannelStatus) {
// If the resolution fails in the TRR channel then we'll have a failed
// aChannelStatus. Otherwise, we parse the response - if it's not a valid DNS
// packet or doesn't contain the correct responses aLookupStatus will be a
// failure code.
if (aChannelStatus == NS_OK) {
// Return + if confirmation was OK, or - if confirmation failed
return aLookupStatus == NS_OK ? '+' : '-';
}
if (nsCOMPtr<nsIIOService> ios = do_GetIOService()) {
bool hasConnectiviy = true;
ios->GetConnectivity(&hasConnectiviy);
if (!hasConnectiviy) {
// Browser has no active network interfaces = is offline.
return 'o';
}
}
switch (aChannelStatus) {
case NS_ERROR_NET_TIMEOUT_EXTERNAL:
// TRR timeout expired
return 't';
case NS_ERROR_UNKNOWN_HOST:
// TRRServiceChannel failed due to unresolved host
return 'd';
default:
break;
}
// The error is a network error
if (NS_ERROR_GET_MODULE(aChannelStatus) == NS_ERROR_MODULE_NETWORK) {
return 'n';
}
// Some other kind of failure.
return '?';
}
void TRRService::TRRIsOkay(nsresult aChannelStatus) {
MOZ_ASSERT_IF(XRE_IsParentProcess(), NS_IsMainThread() || IsOnTRRThread()); MOZ_ASSERT_IF(XRE_IsParentProcess(), NS_IsMainThread() || IsOnTRRThread());
MOZ_ASSERT_IF(XRE_IsSocketProcess(), NS_IsMainThread()); MOZ_ASSERT_IF(XRE_IsSocketProcess(), NS_IsMainThread());
Telemetry::AccumulateCategoricalKeyed( Telemetry::AccumulateCategoricalKeyed(
ProviderKey(), aReason == OKAY_NORMAL ProviderKey(), NS_SUCCEEDED(aChannelStatus)
? Telemetry::LABELS_DNS_TRR_SUCCESS3::Fine ? Telemetry::LABELS_DNS_TRR_SUCCESS3::Fine
: (aReason == OKAY_TIMEOUT : (aChannelStatus == NS_ERROR_NET_TIMEOUT_EXTERNAL
? Telemetry::LABELS_DNS_TRR_SUCCESS3::Timeout ? Telemetry::LABELS_DNS_TRR_SUCCESS3::Timeout
: Telemetry::LABELS_DNS_TRR_SUCCESS3::Bad)); : Telemetry::LABELS_DNS_TRR_SUCCESS3::Bad));
if (aReason == OKAY_NORMAL) { if (NS_SUCCEEDED(aChannelStatus)) {
mConfirmation.mTRRFailures = 0; mConfirmation.mTRRFailures = 0;
} else if ((mMode == nsIDNSService::MODE_TRRFIRST) && } else if ((mMode == nsIDNSService::MODE_TRRFIRST) &&
(mConfirmation.mState == CONFIRM_OK)) { (mConfirmation.mState == CONFIRM_OK)) {
// only count failures while in OK state // only count failures while in OK state
mConfirmation.mFailureReasons[mConfirmation.mTRRFailures %
ConfirmationContext::RESULTS_SIZE] =
StatusToChar(NS_OK, aChannelStatus);
uint32_t fails = ++mConfirmation.mTRRFailures; uint32_t fails = ++mConfirmation.mTRRFailures;
if (fails >= StaticPrefs::network_trr_max_fails()) { if (fails >= StaticPrefs::network_trr_max_fails()) {
LOG(("TRRService goes FAILED after %u failures in a row\n", fails)); LOG(("TRRService goes FAILED after %u failures in a row\n", fails));
mConfirmation.mState = CONFIRM_FAILED; mConfirmation.mState = CONFIRM_FAILED;
mConfirmation.mTrigger.Assign("failed-lookups"); mConfirmation.mTrigger.Assign("failed-lookups");
mConfirmation.mFailedLookups =
nsDependentCSubstring(mConfirmation.mFailureReasons,
fails % ConfirmationContext::RESULTS_SIZE);
// Fire off a timer and start re-trying the NS domain again // Fire off a timer and start re-trying the NS domain again
NS_NewTimerWithCallback(getter_AddRefs(mConfirmation.mTimer), this, NS_NewTimerWithCallback(getter_AddRefs(mConfirmation.mTimer), this,
mConfirmation.mRetryInterval, mConfirmation.mRetryInterval,
@ -985,6 +1031,7 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
mFirstRequestTime = TimeStamp(); mFirstRequestTime = TimeStamp();
mContextChangeReason.Assign(aReason); mContextChangeReason.Assign(aReason);
mTrigger.Truncate(); mTrigger.Truncate();
mFailedLookups.Truncate();
mRetryInterval = StaticPrefs::network_trr_retry_timeout_ms(); mRetryInterval = StaticPrefs::network_trr_retry_timeout_ms();
}; };
@ -1038,6 +1085,11 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
nsPrintfCString("%i", mCaptivePortalStatus)}, nsPrintfCString("%i", mCaptivePortalStatus)},
}); });
if (mTrigger.Equals("failed-lookups"_ns)) {
extra.ref().AppendElement(
Telemetry::EventExtraEntry{"failedLookups"_ns, mFailedLookups});
}
ConfirmationState state = mState; ConfirmationState state = mState;
Telemetry::RecordEvent(eventType, mozilla::Some(nsPrintfCString("%u", state)), Telemetry::RecordEvent(eventType, mozilla::Some(nsPrintfCString("%u", state)),
extra); extra);
@ -1047,33 +1099,8 @@ void TRRService::ConfirmationContext::RecordEvent(const char* aReason) {
void TRRService::ConfirmationContext::RequestCompleted( void TRRService::ConfirmationContext::RequestCompleted(
nsresult aLookupStatus, nsresult aChannelStatus) { nsresult aLookupStatus, nsresult aChannelStatus) {
auto statusToChar = [aLookupStatus, aChannelStatus]() -> char { mResults[mAttemptCount % RESULTS_SIZE] =
if (aChannelStatus == NS_OK) { StatusToChar(aLookupStatus, aChannelStatus);
// Return + if confirmation was OK, or - if confirmation failed
return aLookupStatus == NS_OK ? '+' : '-';
}
switch (aChannelStatus) {
case NS_ERROR_NET_TIMEOUT_EXTERNAL:
// TRR timeout expired
return 't';
case NS_ERROR_UNKNOWN_HOST:
// TRRServiceChannel failed due to unresolved host
return 'd';
default:
break;
}
// The error is a network error
if (NS_ERROR_GET_MODULE(aChannelStatus) == NS_ERROR_MODULE_NETWORK) {
return 'n';
}
// Some other kind of failure.
return '?';
};
mResults[mAttemptCount % RESULTS_SIZE] = statusToChar();
mAttemptCount++; mAttemptCount++;
} }

Просмотреть файл

@ -62,8 +62,7 @@ class TRRService : public TRRServiceBase,
bool IsExcludedFromTRR(const nsACString& aHost); bool IsExcludedFromTRR(const nsACString& aHost);
bool MaybeBootstrap(const nsACString& possible, nsACString& result); bool MaybeBootstrap(const nsACString& possible, nsACString& result);
enum TrrOkay { OKAY_NORMAL = 0, OKAY_TIMEOUT = 1, OKAY_BAD = 2 }; void TRRIsOkay(nsresult aChannelStatus);
void TRRIsOkay(enum TrrOkay aReason);
bool ParentalControlEnabled() const { return mParentalControlEnabled; } bool ParentalControlEnabled() const { return mParentalControlEnabled; }
nsresult DispatchTRRRequest(TRR* aTrrRequest); nsresult DispatchTRRRequest(TRR* aTrrRequest);
@ -146,9 +145,9 @@ class TRRService : public TRRServiceBase,
}; };
class ConfirmationContext { class ConfirmationContext {
public:
static const size_t RESULTS_SIZE = 32; static const size_t RESULTS_SIZE = 32;
public:
Atomic<ConfirmationState, Relaxed> mState; Atomic<ConfirmationState, Relaxed> mState;
RefPtr<TRR> mTask; RefPtr<TRR> mTask;
nsCOMPtr<nsITimer> mTimer; nsCOMPtr<nsITimer> mTimer;
@ -156,6 +155,10 @@ class TRRService : public TRRServiceBase,
// The number of TRR requests that failed in a row. // The number of TRR requests that failed in a row.
Atomic<uint32_t, Relaxed> mTRRFailures; Atomic<uint32_t, Relaxed> mTRRFailures;
// This buffer holds consecutive TRR failures reported by calling
// TRRIsOkay(). It is only meant for reporting event telemetry.
char mFailureReasons[RESULTS_SIZE] = {0};
// The number of confirmation retries. // The number of confirmation retries.
uint32_t mAttemptCount = 0; uint32_t mAttemptCount = 0;
@ -177,6 +180,10 @@ class TRRService : public TRRServiceBase,
// What triggered the confirmation // What triggered the confirmation
nsCString mTrigger; nsCString mTrigger;
// String representation of consecutive failed lookups that triggered
// confirmation.
nsCString mFailedLookups;
// Called when a confirmation completes successfully or when the // Called when a confirmation completes successfully or when the
// confirmation context changes. // confirmation context changes.
void RecordEvent(const char* aReason); void RecordEvent(const char* aReason);

Просмотреть файл

@ -2111,6 +2111,7 @@ network.dns:
objects: ["context"] objects: ["context"]
bug_numbers: bug_numbers:
- 1691408 - 1691408
- 1694949
description: > description: >
This telemetry records the status of the TRR confirmation. This telemetry records the status of the TRR confirmation.
The value of the event is one of: The value of the event is one of:
@ -2138,6 +2139,9 @@ network.dns:
String representation of the last 32 confirmation results. String representation of the last 32 confirmation results.
Example: nnnnnnttttttttt indicates a number of network (n) failures Example: nnnnnnttttttttt indicates a number of network (n) failures
followed by timeouts (t). followed by timeouts (t).
failedLookups: >
When the trigger is failed-lookups, this contains the string
representation of the failures that triggered the confirmation.
networkID: > networkID: >
The network ID for the recorded confirmation attempts The network ID for the recorded confirmation attempts
captivePortal: > captivePortal: >