updated local event data cache impl

This commit is contained in:
Charles Torre 2022-11-18 16:58:04 -08:00
Родитель 2ea1a3d998
Коммит 28b145edae
2 изменённых файлов: 287 добавлений и 286 удалений

Просмотреть файл

@ -325,7 +325,7 @@ namespace FabricHealer
}
await MonitorHealthEventsAsync();
// Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in
// understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more
// than 90 days. Please consider enabling this to help the SF team make this technology better.
@ -363,7 +363,7 @@ namespace FabricHealer
await Task.Delay(
TimeSpan.FromSeconds(
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
}
RepairLogger.LogInfo("Shutdown signaled. Stopping.");
@ -482,7 +482,7 @@ namespace FabricHealer
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
RepairConstants.FabricHealer,
Token),
Token),
Token);
if (currentFHRepairTasksInProgress.Count == 0)
@ -533,7 +533,7 @@ namespace FabricHealer
}
string errorCode = repairExecutorData.RepairData.Code;
if (string.IsNullOrWhiteSpace(errorCode))
{
continue;
@ -888,7 +888,7 @@ namespace FabricHealer
// Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster.
var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, Token);
if (fhRepairTasks.Count > 0)
{
foreach (var repair in fhRepairTasks)
@ -1023,8 +1023,8 @@ namespace FabricHealer
null,
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt));
// Update the in-memory HealthEvent data.
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt, DateTime.UtcNow));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1059,7 +1059,7 @@ namespace FabricHealer
{
try
{
ApplicationUpgradeProgress appUpgradeProgress =
ApplicationUpgradeProgress appUpgradeProgress =
await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(appName, ConfigSettings.AsyncTimeout, Token);
if (appUpgradeProgress.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
@ -1346,7 +1346,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt));
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt, DateTime.UtcNow));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1442,7 +1442,8 @@ namespace FabricHealer
EntityType = EntityType.Machine,
Description = evt.HealthInformation.Description,
HealthState = evt.HealthInformation.HealthState,
Source = RepairConstants.FabricHealer
Property = evt.HealthInformation.Property,
Source = evt.HealthInformation.SourceId
};
}
else
@ -1487,7 +1488,7 @@ namespace FabricHealer
null,
ConfigSettings.EnableVerboseLogging);
continue;
continue;
}
// Get repair rules for supplied facts (TelemetryData).
@ -1523,8 +1524,8 @@ namespace FabricHealer
null,
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
// Update the in-memory HealthEvent data.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt, DateTime.UtcNow));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1573,8 +1574,8 @@ namespace FabricHealer
null,
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
// Update the in-memory HealthEvent data.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt, DateTime.UtcNow));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1594,7 +1595,7 @@ namespace FabricHealer
var currentRepairs =
await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, Token);
// Block attempts to reschedule another Fabric node-level repair for the same node if a current repair has not yet completed.
if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
{
@ -1632,8 +1633,8 @@ namespace FabricHealer
null,
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent));
// Update the in-memory HealthEvent data.
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent, DateTime.UtcNow));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1662,9 +1663,9 @@ namespace FabricHealer
foreach (var partitionHealthState in partitionHealthStates)
{
PartitionHealth partitionHealth =
PartitionHealth partitionHealth =
await FabricClientSingleton.HealthManager.GetPartitionHealthAsync(partitionHealthState.PartitionId, ConfigSettings.AsyncTimeout, Token);
List<ReplicaHealthState> replicaHealthStates = partitionHealth.ReplicaHealthStates.Where(
p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error).ToList();
@ -1679,7 +1680,7 @@ namespace FabricHealer
{
healthEvents = replicaHealth.HealthEvents.Where(
h => h.HealthInformation.HealthState == HealthState.Warning || h.HealthInformation.HealthState == HealthState.Error).ToList();
foreach (HealthEvent healthEvent in healthEvents)
{
if (!healthEvent.HealthInformation.SourceId.Contains("System.RAP"))
@ -1782,7 +1783,7 @@ namespace FabricHealer
}
}
}
private List<string> GetRepairRulesFromErrorCode(string errorCode, string app = null)
{
if (!SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(errorCode)
@ -1811,7 +1812,7 @@ namespace FabricHealer
case SupportedErrorCodes.AppWarningTooManyOpenFileHandles:
case SupportedErrorCodes.AppWarningTooManyThreads:
repairPolicySectionName =
repairPolicySectionName =
app == RepairConstants.SystemAppName ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.AppRepairPolicySectionName;
break;
@ -1859,7 +1860,7 @@ namespace FabricHealer
// App repair (user).
case RepairConstants.AppObserver:
repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
break;
// System service repair.
@ -2084,10 +2085,10 @@ namespace FabricHealer
message,
Token,
null,
true,
true,
TimeSpan.FromDays(1),
"NewVersionAvailable",
EntityType.Application);
EntityType.Application);
}
}
catch (Exception e)
@ -2159,4 +2160,4 @@ namespace FabricHealer
return null;
}
}
}
}

Просмотреть файл

@ -24,21 +24,20 @@ namespace FabricHealer.Repair
{
public sealed class RepairTaskManager : IRepairTasks
{
private static readonly TimeSpan MaxWaitTimeForInfrastructureRepairTaskCompleted = TimeSpan.FromHours(8);
private static readonly TimeSpan MaxWaitTimeForFHRepairTaskCompleted = TimeSpan.FromHours(1);
private readonly RepairTaskEngine repairTaskEngine;
private readonly RepairExecutor repairExecutor;
private readonly TimeSpan asyncTimeout = TimeSpan.FromSeconds(60);
private readonly DateTime healthEventsListCreationTime = DateTime.UtcNow;
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromHours(4);
private DateTime lastHealthEventsListClearDateTime;
internal readonly List<(string entityName, HealthEvent healthEvent)> detectedHealthEvents;
internal readonly List<(string entityName, HealthEvent healthEvent, DateTime DateTimeAdded)> detectedHealthEvents;
public RepairTaskManager()
{
repairExecutor = new RepairExecutor();
repairTaskEngine = new RepairTaskEngine();
detectedHealthEvents = new List<(string id, HealthEvent healthEvent)>();
detectedHealthEvents = new List<(string id, HealthEvent healthEvent, DateTime DateTimeAdded)>();
lastHealthEventsListClearDateTime = healthEventsListCreationTime;
}
@ -89,13 +88,13 @@ namespace FabricHealer.Repair
if (node == null)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning,
"RepairTaskManager.StartRepairWorkflowAsync",
"Unable to locate target node. Aborting repair.",
cancellationToken,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning,
"RepairTaskManager.StartRepairWorkflowAsync",
"Unable to locate target node. Aborting repair.",
cancellationToken,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return;
}
@ -111,13 +110,13 @@ namespace FabricHealer.Repair
}
catch (GuanException ge)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning,
"StartRepairWorkflowAsync:GuanException",
$"Failed in Guan: {ge}",
cancellationToken,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning,
"StartRepairWorkflowAsync:GuanException",
$"Failed in Guan: {ge}",
cancellationToken,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
}
}
@ -315,7 +314,7 @@ namespace FabricHealer.Repair
{
return true;
}
}
}
}
else
{
@ -394,13 +393,13 @@ namespace FabricHealer.Repair
string actionMessage =
$"Attempting to restart Service Fabric system process {repairData.ProcessName}.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartSystemServiceProcessAsync::Start",
actionMessage,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartSystemServiceProcessAsync::Start",
actionMessage,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
bool result = await repairExecutor.RestartSystemServiceProcessAsync(repairData, cancellationToken);
@ -411,13 +410,13 @@ namespace FabricHealer.Repair
string statusSuccess = $"Successfully restarted Service Fabric system service process {repairData.ProcessName} on node {repairData.NodeName}.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartSystemServiceProcessAsync::Success",
statusSuccess,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartSystemServiceProcessAsync::Success",
statusSuccess,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return true;
}
@ -482,7 +481,7 @@ namespace FabricHealer.Repair
var executorData = new RepairExecutorData
{
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForFHRepairTaskCompleted.TotalMinutes,
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForFHRepairTaskCompleted.TotalMinutes,
RepairData = repairData
};
@ -506,29 +505,29 @@ namespace FabricHealer.Repair
Stopwatch stopWatch = Stopwatch.StartNew();
bool isApproved = false;
var repairs =
var repairs =
await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, cancellationToken);
if (repairs.All(repair => repair.TaskId != repairTask.TaskId))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"Failed to find scheduled repair task {repairTask.TaskId}.",
FabricHealerManager.Token,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"Failed to find scheduled repair task {repairTask.TaskId}.",
FabricHealerManager.Token,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager::WaitingForApproval",
$"Waiting for RM to Approve repair task {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager::WaitingForApproval",
$"Waiting for RM to Approve repair task {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
while (approvalTimeout >= stopWatch.Elapsed)
{
@ -539,13 +538,13 @@ namespace FabricHealer.Repair
&& (repair.State == RepairTaskState.Completed && repair.ResultStatus == RepairTaskResult.Cancelled
|| repair.Flags == RepairTaskFlags.CancelRequested || repair.Flags == RepairTaskFlags.AbortRequested)))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"Repair Task {repairTask.TaskId} was aborted or cancelled.",
FabricHealerManager.Token,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"Repair Task {repairTask.TaskId} was aborted or cancelled.",
FabricHealerManager.Token,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
@ -565,23 +564,23 @@ namespace FabricHealer.Repair
if (isApproved)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_Approved",
$"RM has Approved repair task {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_Approved",
$"RM has Approved repair task {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
}
else
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_NotApproved",
$"RM did not Approve repair task {repairTask.TaskId}. Cancelling...",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_NotApproved",
$"RM did not Approve repair task {repairTask.TaskId}. Cancelling...",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricRepairTasks.CancelRepairTaskAsync(repairTask);
return false;
@ -593,13 +592,13 @@ namespace FabricHealer.Repair
RepairTaskResult.Pending,
cancellationToken);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_MovedExecuting",
$"Executing repair {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_MovedExecuting",
$"Executing repair {repairTask.TaskId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
bool success;
var repairAction = repairData.RepairPolicy.RepairAction;
@ -609,110 +608,19 @@ namespace FabricHealer.Repair
switch (repairAction)
{
case RepairActionType.DeleteFiles:
success = await DeleteFilesAsyncAsync(repairData, cancellationToken);
break;
// Note: For SF app container services, RestartDeployedCodePackage API does not work.
// Thus, using Restart/Remove(stateful/stateless)Replica API instead, which does restart container instances.
case RepairActionType.RestartCodePackage:
{
if (string.IsNullOrWhiteSpace(repairData.ContainerId))
{
if (string.IsNullOrWhiteSpace(repairData.ContainerId))
{
success = await RestartDeployedCodePackageAsync(repairData, cancellationToken);
}
else
{
if (repairData.PartitionId == null)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
// Need replica or instance details..
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
(Guid)repairData.PartitionId,
repairData.ReplicaId,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (repList.Count == 0)
{
success = false;
break;
}
var rep = repList[0];
// Restarting stateful replica will restart the container instance.
if (rep.ServiceKind == ServiceKind.Stateful)
{
success = await RestartReplicaAsync(repairData, cancellationToken);
}
else
{
// For stateless instances, you need to remove the replica, which will
// restart the container instance.
success = await RemoveReplicaAsync(repairData, cancellationToken);
}
}
break;
success = await RestartDeployedCodePackageAsync(repairData, cancellationToken);
}
case RepairActionType.RemoveReplica:
{
if (repairData.PartitionId == null)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
(Guid)repairData.PartitionId,
repairData.ReplicaId,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (repList.Count == 0)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
success = await RemoveReplicaAsync(repairData, cancellationToken);
break;
}
case RepairActionType.RestartProcess:
success = await RestartSystemServiceProcessAsync(repairData, cancellationToken);
break;
case RepairActionType.RestartReplica:
else
{
if (repairData.PartitionId == null)
{
@ -728,6 +636,7 @@ namespace FabricHealer.Repair
break;
}
// Need replica or instance details..
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
(Guid)repairData.PartitionId,
repairData.ReplicaId,
@ -737,58 +646,148 @@ namespace FabricHealer.Repair
if (repList.Count == 0)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateful replica {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
var replica = repList[0];
var rep = repList[0];
// Restart - stateful replica.
if (replica.ServiceKind == ServiceKind.Stateful)
// Restarting stateful replica will restart the container instance.
if (rep.ServiceKind == ServiceKind.Stateful)
{
success = await RestartReplicaAsync(repairData, cancellationToken);
}
else
{
// For stateless replicas (aka instances), you need to remove the replica. The runtime will create a new one
// and place it.
// For stateless instances, you need to remove the replica, which will
// restart the container instance.
success = await RemoveReplicaAsync(repairData, cancellationToken);
}
break;
}
case RepairActionType.RestartFabricNode:
break;
}
case RepairActionType.RemoveReplica:
{
if (repairData.PartitionId == null)
{
var executorData = repairTask.ExecutorData;
if (string.IsNullOrWhiteSpace(executorData))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.SafeRestartFabricNode",
$"Repair {repairTask.TaskId} is missing ExecutorData.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
success = false;
}
else
{
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
}
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
(Guid)repairData.PartitionId,
repairData.ReplicaId,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (repList.Count == 0)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
success = await RemoveReplicaAsync(repairData, cancellationToken);
break;
}
case RepairActionType.RestartProcess:
success = await RestartSystemServiceProcessAsync(repairData, cancellationToken);
break;
case RepairActionType.RestartReplica:
{
if (repairData.PartitionId == null)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
(Guid)repairData.PartitionId,
repairData.ReplicaId,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (repList.Count == 0)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateful replica {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
var replica = repList[0];
// Restart - stateful replica.
if (replica.ServiceKind == ServiceKind.Stateful)
{
success = await RestartReplicaAsync(repairData, cancellationToken);
}
else
{
// For stateless replicas (aka instances), you need to remove the replica. The runtime will create a new one
// and place it.
success = await RemoveReplicaAsync(repairData, cancellationToken);
}
break;
}
case RepairActionType.RestartFabricNode:
{
var executorData = repairTask.ExecutorData;
if (string.IsNullOrWhiteSpace(executorData))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.SafeRestartFabricNode",
$"Repair {repairTask.TaskId} is missing ExecutorData.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
success = false;
}
else
{
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
}
break;
}
default:
return false;
}
@ -941,23 +940,23 @@ namespace FabricHealer.Repair
if (isHealthy)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has succeeded.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has succeeded.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
}
else
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has failed. {repairTarget} is still in an unhealthy state.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has failed. {repairTarget} is still in an unhealthy state.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
}
// Tell RM we are ready to move to Completed state as our custom code has completed its repair execution successfully.
@ -975,13 +974,13 @@ namespace FabricHealer.Repair
}
// Executor failure. Cancel repair task.
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_ExecuteFailed",
$"Executor failed for repair {repairTask.TaskId}. See logs for details. Cancelling repair task.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_ExecuteFailed",
$"Executor failed for repair {repairTask.TaskId}. See logs for details. Cancelling repair task.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricRepairTasks.CancelRepairTaskAsync(repairTask);
return false;
@ -991,6 +990,7 @@ namespace FabricHealer.Repair
internal int GetEntityHealthEventCountWithinTimeRange(TelemetryData repairData, TimeSpan timeWindow)
{
int count = 0;
if (repairData == null || detectedHealthEvents == null || !detectedHealthEvents.Any())
{
return count;
@ -1015,17 +1015,11 @@ namespace FabricHealer.Repair
break;
}
var entityHealthEvents = detectedHealthEvents.Where(
evt => evt.entityName == id && evt.healthEvent.HealthInformation.Property == repairData.Property);
foreach (var (_, healthEvent) in entityHealthEvents)
{
if (DateTime.UtcNow.Subtract(healthEvent.SourceUtcTimestamp) > timeWindow)
{
continue;
}
count++;
}
count = detectedHealthEvents.Count(
evt => evt.entityName == id
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
&& evt.healthEvent.HealthInformation.Property == repairData.Property
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow);
// Lifetime management of Health Events list data. Data is kept in-memory only for the duration set by maxLifeTimeHealthEventsData (currently 4 hours). If FH process restarts, data is not preserved.
if (DateTime.UtcNow.Subtract(lastHealthEventsListClearDateTime) >= maxLifeTimeHealthEventsData)
@ -1087,20 +1081,21 @@ namespace FabricHealer.Repair
if (appHealth == null || appHealth.HealthEvents.Count == 0)
{
return TimeSpan.MinValue;
}
}
// How many times has the entity been put into Error health state in the last 2 hours?
// How many times has the entity been put into Error health within the specified time window?
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
{
var orderedEvents = detectedHealthEvents.Where(
evt => evt.entityName == repairData.ApplicationName &&
evt.healthEvent.HealthInformation.Property == repairData.Property)
evt => evt.entityName == repairData.ApplicationName
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
&& evt.healthEvent.HealthInformation.Property == repairData.Property
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(
orderedEvents.Last().healthEvent.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
}
}
@ -1139,8 +1134,10 @@ namespace FabricHealer.Repair
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
{
var orderedEvents = detectedHealthEvents.Where(
evt => evt.entityName == repairData.ServiceName &&
evt.healthEvent.HealthInformation.Property == repairData.Property)
evt => evt.entityName == repairData.ServiceName
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
&& evt.healthEvent.HealthInformation.Property == repairData.Property
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
@ -1175,14 +1172,17 @@ namespace FabricHealer.Repair
return TimeSpan.MinValue;
}
// How many times has the entity been put into Error health state in the last 2 hours?
// How many times has the entity been put into Error health state within the specified time window?
// Look into LastTransition to Error.
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
{
var orderedEvents = detectedHealthEvents.Where(
evt => evt.entityName == repairData.NodeName &&
evt.healthEvent.HealthInformation.Property == repairData.Property)
evt => evt.entityName == repairData.NodeName
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
&& evt.healthEvent.HealthInformation.Property == repairData.Property
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
@ -1252,7 +1252,7 @@ namespace FabricHealer.Repair
switch (repairData.EntityType)
{
case EntityType.Application:
var appHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
new Uri(repairData.ApplicationName),
@ -1285,7 +1285,7 @@ namespace FabricHealer.Repair
}
return isTargetAppHealedOnTargetNode ? HealthState.Ok : appHealth.AggregatedHealthState;
case EntityType.Service:
var serviceHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
@ -1306,7 +1306,7 @@ namespace FabricHealer.Repair
case EntityType.Node:
case EntityType.Machine:
var nodeHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
repairData.NodeName,
@ -1322,9 +1322,9 @@ namespace FabricHealer.Repair
&& repairData.HealthState == HealthState.Ok);
return isTargetNodeHealed ? HealthState.Ok : nodeHealth.AggregatedHealthState;
case EntityType.Replica:
// Make sure the Partition where the restarted replica was located is now healthy.
var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetPartitionHealthAsync(
@ -1334,10 +1334,10 @@ namespace FabricHealer.Repair
token);
return partitionHealth.AggregatedHealthState;
default:
return HealthState.Unknown;
}
}
}
}
}