updated local event data cache impl
This commit is contained in:
Родитель
2ea1a3d998
Коммит
28b145edae
|
@ -325,7 +325,7 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
await MonitorHealthEventsAsync();
|
||||
|
||||
|
||||
// Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in
|
||||
// understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more
|
||||
// than 90 days. Please consider enabling this to help the SF team make this technology better.
|
||||
|
@ -363,7 +363,7 @@ namespace FabricHealer
|
|||
|
||||
await Task.Delay(
|
||||
TimeSpan.FromSeconds(
|
||||
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
|
||||
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
|
||||
}
|
||||
|
||||
RepairLogger.LogInfo("Shutdown signaled. Stopping.");
|
||||
|
@ -482,7 +482,7 @@ namespace FabricHealer
|
|||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
|
||||
RepairConstants.FabricHealer,
|
||||
Token),
|
||||
Token),
|
||||
Token);
|
||||
|
||||
if (currentFHRepairTasksInProgress.Count == 0)
|
||||
|
@ -533,7 +533,7 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
string errorCode = repairExecutorData.RepairData.Code;
|
||||
|
||||
|
||||
if (string.IsNullOrWhiteSpace(errorCode))
|
||||
{
|
||||
continue;
|
||||
|
@ -888,7 +888,7 @@ namespace FabricHealer
|
|||
|
||||
// Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster.
|
||||
var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, Token);
|
||||
|
||||
|
||||
if (fhRepairTasks.Count > 0)
|
||||
{
|
||||
foreach (var repair in fhRepairTasks)
|
||||
|
@ -1023,8 +1023,8 @@ namespace FabricHealer
|
|||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt));
|
||||
// Update the in-memory HealthEvent data.
|
||||
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt, DateTime.UtcNow));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1059,7 +1059,7 @@ namespace FabricHealer
|
|||
{
|
||||
try
|
||||
{
|
||||
ApplicationUpgradeProgress appUpgradeProgress =
|
||||
ApplicationUpgradeProgress appUpgradeProgress =
|
||||
await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(appName, ConfigSettings.AsyncTimeout, Token);
|
||||
|
||||
if (appUpgradeProgress.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
|
||||
|
@ -1346,7 +1346,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt));
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt, DateTime.UtcNow));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1442,7 +1442,8 @@ namespace FabricHealer
|
|||
EntityType = EntityType.Machine,
|
||||
Description = evt.HealthInformation.Description,
|
||||
HealthState = evt.HealthInformation.HealthState,
|
||||
Source = RepairConstants.FabricHealer
|
||||
Property = evt.HealthInformation.Property,
|
||||
Source = evt.HealthInformation.SourceId
|
||||
};
|
||||
}
|
||||
else
|
||||
|
@ -1487,7 +1488,7 @@ namespace FabricHealer
|
|||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Get repair rules for supplied facts (TelemetryData).
|
||||
|
@ -1523,8 +1524,8 @@ namespace FabricHealer
|
|||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
|
||||
// Update the in-memory HealthEvent data.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt, DateTime.UtcNow));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1573,8 +1574,8 @@ namespace FabricHealer
|
|||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
|
||||
// Update the in-memory HealthEvent data.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt, DateTime.UtcNow));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1594,7 +1595,7 @@ namespace FabricHealer
|
|||
|
||||
var currentRepairs =
|
||||
await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, Token);
|
||||
|
||||
|
||||
// Block attempts to reschedule another Fabric node-level repair for the same node if a current repair has not yet completed.
|
||||
if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
|
||||
{
|
||||
|
@ -1632,8 +1633,8 @@ namespace FabricHealer
|
|||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent));
|
||||
// Update the in-memory HealthEvent data.
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent, DateTime.UtcNow));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1662,9 +1663,9 @@ namespace FabricHealer
|
|||
|
||||
foreach (var partitionHealthState in partitionHealthStates)
|
||||
{
|
||||
PartitionHealth partitionHealth =
|
||||
PartitionHealth partitionHealth =
|
||||
await FabricClientSingleton.HealthManager.GetPartitionHealthAsync(partitionHealthState.PartitionId, ConfigSettings.AsyncTimeout, Token);
|
||||
|
||||
|
||||
List<ReplicaHealthState> replicaHealthStates = partitionHealth.ReplicaHealthStates.Where(
|
||||
p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error).ToList();
|
||||
|
||||
|
@ -1679,7 +1680,7 @@ namespace FabricHealer
|
|||
{
|
||||
healthEvents = replicaHealth.HealthEvents.Where(
|
||||
h => h.HealthInformation.HealthState == HealthState.Warning || h.HealthInformation.HealthState == HealthState.Error).ToList();
|
||||
|
||||
|
||||
foreach (HealthEvent healthEvent in healthEvents)
|
||||
{
|
||||
if (!healthEvent.HealthInformation.SourceId.Contains("System.RAP"))
|
||||
|
@ -1782,7 +1783,7 @@ namespace FabricHealer
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<string> GetRepairRulesFromErrorCode(string errorCode, string app = null)
|
||||
{
|
||||
if (!SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(errorCode)
|
||||
|
@ -1811,7 +1812,7 @@ namespace FabricHealer
|
|||
case SupportedErrorCodes.AppWarningTooManyOpenFileHandles:
|
||||
case SupportedErrorCodes.AppWarningTooManyThreads:
|
||||
|
||||
repairPolicySectionName =
|
||||
repairPolicySectionName =
|
||||
app == RepairConstants.SystemAppName ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.AppRepairPolicySectionName;
|
||||
break;
|
||||
|
||||
|
@ -1859,7 +1860,7 @@ namespace FabricHealer
|
|||
// App repair (user).
|
||||
case RepairConstants.AppObserver:
|
||||
|
||||
repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
|
||||
repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
|
||||
break;
|
||||
|
||||
// System service repair.
|
||||
|
@ -2084,10 +2085,10 @@ namespace FabricHealer
|
|||
message,
|
||||
Token,
|
||||
null,
|
||||
true,
|
||||
true,
|
||||
TimeSpan.FromDays(1),
|
||||
"NewVersionAvailable",
|
||||
EntityType.Application);
|
||||
EntityType.Application);
|
||||
}
|
||||
}
|
||||
catch (Exception e)
|
||||
|
@ -2159,4 +2160,4 @@ namespace FabricHealer
|
|||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -24,21 +24,20 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
public sealed class RepairTaskManager : IRepairTasks
|
||||
{
|
||||
private static readonly TimeSpan MaxWaitTimeForInfrastructureRepairTaskCompleted = TimeSpan.FromHours(8);
|
||||
private static readonly TimeSpan MaxWaitTimeForFHRepairTaskCompleted = TimeSpan.FromHours(1);
|
||||
private readonly RepairTaskEngine repairTaskEngine;
|
||||
private readonly RepairExecutor repairExecutor;
|
||||
private readonly TimeSpan asyncTimeout = TimeSpan.FromSeconds(60);
|
||||
private readonly DateTime healthEventsListCreationTime = DateTime.UtcNow;
|
||||
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
|
||||
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromHours(4);
|
||||
private DateTime lastHealthEventsListClearDateTime;
|
||||
internal readonly List<(string entityName, HealthEvent healthEvent)> detectedHealthEvents;
|
||||
internal readonly List<(string entityName, HealthEvent healthEvent, DateTime DateTimeAdded)> detectedHealthEvents;
|
||||
|
||||
public RepairTaskManager()
|
||||
{
|
||||
repairExecutor = new RepairExecutor();
|
||||
repairTaskEngine = new RepairTaskEngine();
|
||||
detectedHealthEvents = new List<(string id, HealthEvent healthEvent)>();
|
||||
detectedHealthEvents = new List<(string id, HealthEvent healthEvent, DateTime DateTimeAdded)>();
|
||||
lastHealthEventsListClearDateTime = healthEventsListCreationTime;
|
||||
}
|
||||
|
||||
|
@ -89,13 +88,13 @@ namespace FabricHealer.Repair
|
|||
|
||||
if (node == null)
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"RepairTaskManager.StartRepairWorkflowAsync",
|
||||
"Unable to locate target node. Aborting repair.",
|
||||
cancellationToken,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"RepairTaskManager.StartRepairWorkflowAsync",
|
||||
"Unable to locate target node. Aborting repair.",
|
||||
cancellationToken,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -111,13 +110,13 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
catch (GuanException ge)
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"StartRepairWorkflowAsync:GuanException",
|
||||
$"Failed in Guan: {ge}",
|
||||
cancellationToken,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"StartRepairWorkflowAsync:GuanException",
|
||||
$"Failed in Guan: {ge}",
|
||||
cancellationToken,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -315,7 +314,7 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -394,13 +393,13 @@ namespace FabricHealer.Repair
|
|||
string actionMessage =
|
||||
$"Attempting to restart Service Fabric system process {repairData.ProcessName}.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.RestartSystemServiceProcessAsync::Start",
|
||||
actionMessage,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.RestartSystemServiceProcessAsync::Start",
|
||||
actionMessage,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
bool result = await repairExecutor.RestartSystemServiceProcessAsync(repairData, cancellationToken);
|
||||
|
||||
|
@ -411,13 +410,13 @@ namespace FabricHealer.Repair
|
|||
|
||||
string statusSuccess = $"Successfully restarted Service Fabric system service process {repairData.ProcessName} on node {repairData.NodeName}.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.RestartSystemServiceProcessAsync::Success",
|
||||
statusSuccess,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.RestartSystemServiceProcessAsync::Success",
|
||||
statusSuccess,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -482,7 +481,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
var executorData = new RepairExecutorData
|
||||
{
|
||||
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForFHRepairTaskCompleted.TotalMinutes,
|
||||
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForFHRepairTaskCompleted.TotalMinutes,
|
||||
RepairData = repairData
|
||||
};
|
||||
|
||||
|
@ -506,29 +505,29 @@ namespace FabricHealer.Repair
|
|||
Stopwatch stopWatch = Stopwatch.StartNew();
|
||||
bool isApproved = false;
|
||||
|
||||
var repairs =
|
||||
var repairs =
|
||||
await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FabricHealer, cancellationToken);
|
||||
|
||||
if (repairs.All(repair => repair.TaskId != repairTask.TaskId))
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Failed to find scheduled repair task {repairTask.TaskId}.",
|
||||
FabricHealerManager.Token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Failed to find scheduled repair task {repairTask.TaskId}.",
|
||||
FabricHealerManager.Token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager::WaitingForApproval",
|
||||
$"Waiting for RM to Approve repair task {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager::WaitingForApproval",
|
||||
$"Waiting for RM to Approve repair task {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
while (approvalTimeout >= stopWatch.Elapsed)
|
||||
{
|
||||
|
@ -539,13 +538,13 @@ namespace FabricHealer.Repair
|
|||
&& (repair.State == RepairTaskState.Completed && repair.ResultStatus == RepairTaskResult.Cancelled
|
||||
|| repair.Flags == RepairTaskFlags.CancelRequested || repair.Flags == RepairTaskFlags.AbortRequested)))
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Repair Task {repairTask.TaskId} was aborted or cancelled.",
|
||||
FabricHealerManager.Token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Repair Task {repairTask.TaskId} was aborted or cancelled.",
|
||||
FabricHealerManager.Token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -565,23 +564,23 @@ namespace FabricHealer.Repair
|
|||
|
||||
if (isApproved)
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_Approved",
|
||||
$"RM has Approved repair task {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_Approved",
|
||||
$"RM has Approved repair task {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
else
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_NotApproved",
|
||||
$"RM did not Approve repair task {repairTask.TaskId}. Cancelling...",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_NotApproved",
|
||||
$"RM did not Approve repair task {repairTask.TaskId}. Cancelling...",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
await FabricRepairTasks.CancelRepairTaskAsync(repairTask);
|
||||
return false;
|
||||
|
@ -593,13 +592,13 @@ namespace FabricHealer.Repair
|
|||
RepairTaskResult.Pending,
|
||||
cancellationToken);
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_MovedExecuting",
|
||||
$"Executing repair {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_MovedExecuting",
|
||||
$"Executing repair {repairTask.TaskId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
bool success;
|
||||
var repairAction = repairData.RepairPolicy.RepairAction;
|
||||
|
@ -609,110 +608,19 @@ namespace FabricHealer.Repair
|
|||
switch (repairAction)
|
||||
{
|
||||
case RepairActionType.DeleteFiles:
|
||||
|
||||
|
||||
success = await DeleteFilesAsyncAsync(repairData, cancellationToken);
|
||||
break;
|
||||
|
||||
// Note: For SF app container services, RestartDeployedCodePackage API does not work.
|
||||
// Thus, using Restart/Remove(stateful/stateless)Replica API instead, which does restart container instances.
|
||||
case RepairActionType.RestartCodePackage:
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(repairData.ContainerId))
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(repairData.ContainerId))
|
||||
{
|
||||
success = await RestartDeployedCodePackageAsync(repairData, cancellationToken);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (repairData.PartitionId == null)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
|
||||
$"No partition specified.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// Need replica or instance details..
|
||||
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
|
||||
(Guid)repairData.PartitionId,
|
||||
repairData.ReplicaId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
|
||||
var rep = repList[0];
|
||||
|
||||
// Restarting stateful replica will restart the container instance.
|
||||
if (rep.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless instances, you need to remove the replica, which will
|
||||
// restart the container instance.
|
||||
success = await RemoveReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
success = await RestartDeployedCodePackageAsync(repairData, cancellationToken);
|
||||
}
|
||||
case RepairActionType.RemoveReplica:
|
||||
{
|
||||
if (repairData.PartitionId == null)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
|
||||
$"No partition specified.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
|
||||
(Guid)repairData.PartitionId,
|
||||
repairData.ReplicaId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
success = await RemoveReplicaAsync(repairData, cancellationToken);
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartProcess:
|
||||
|
||||
success = await RestartSystemServiceProcessAsync(repairData, cancellationToken);
|
||||
break;
|
||||
|
||||
case RepairActionType.RestartReplica:
|
||||
else
|
||||
{
|
||||
if (repairData.PartitionId == null)
|
||||
{
|
||||
|
@ -728,6 +636,7 @@ namespace FabricHealer.Repair
|
|||
break;
|
||||
}
|
||||
|
||||
// Need replica or instance details..
|
||||
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
|
||||
(Guid)repairData.PartitionId,
|
||||
repairData.ReplicaId,
|
||||
|
@ -737,58 +646,148 @@ namespace FabricHealer.Repair
|
|||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateful replica {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var replica = repList[0];
|
||||
var rep = repList[0];
|
||||
|
||||
// Restart - stateful replica.
|
||||
if (replica.ServiceKind == ServiceKind.Stateful)
|
||||
// Restarting stateful replica will restart the container instance.
|
||||
if (rep.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless replicas (aka instances), you need to remove the replica. The runtime will create a new one
|
||||
// and place it.
|
||||
// For stateless instances, you need to remove the replica, which will
|
||||
// restart the container instance.
|
||||
success = await RemoveReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartFabricNode:
|
||||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RemoveReplica:
|
||||
{
|
||||
if (repairData.PartitionId == null)
|
||||
{
|
||||
var executorData = repairTask.ExecutorData;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(executorData))
|
||||
{
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.SafeRestartFabricNode",
|
||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
|
||||
}
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
|
||||
$"No partition specified.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
|
||||
(Guid)repairData.PartitionId,
|
||||
repairData.ReplicaId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
success = await RemoveReplicaAsync(repairData, cancellationToken);
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartProcess:
|
||||
|
||||
success = await RestartSystemServiceProcessAsync(repairData, cancellationToken);
|
||||
break;
|
||||
|
||||
case RepairActionType.RestartReplica:
|
||||
{
|
||||
if (repairData.PartitionId == null)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
|
||||
$"No partition specified.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var repList = await FabricHealerManager.FabricClientSingleton.QueryManager.GetReplicaListAsync(
|
||||
(Guid)repairData.PartitionId,
|
||||
repairData.ReplicaId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateful replica {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var replica = repList[0];
|
||||
|
||||
// Restart - stateful replica.
|
||||
if (replica.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless replicas (aka instances), you need to remove the replica. The runtime will create a new one
|
||||
// and place it.
|
||||
success = await RemoveReplicaAsync(repairData, cancellationToken);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartFabricNode:
|
||||
{
|
||||
var executorData = repairTask.ExecutorData;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(executorData))
|
||||
{
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.SafeRestartFabricNode",
|
||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
@ -941,23 +940,23 @@ namespace FabricHealer.Repair
|
|||
|
||||
if (isHealthy)
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has succeeded.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has succeeded.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
else
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has failed. {repairTarget} is still in an unhealthy state.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"{repairData.RepairPolicy.RepairAction} repair for {repairTarget} has failed. {repairTarget} is still in an unhealthy state.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
|
||||
// Tell RM we are ready to move to Completed state as our custom code has completed its repair execution successfully.
|
||||
|
@ -975,13 +974,13 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
// Executor failure. Cancel repair task.
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_ExecuteFailed",
|
||||
$"Executor failed for repair {repairTask.TaskId}. See logs for details. Cancelling repair task.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_ExecuteFailed",
|
||||
$"Executor failed for repair {repairTask.TaskId}. See logs for details. Cancelling repair task.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
await FabricRepairTasks.CancelRepairTaskAsync(repairTask);
|
||||
return false;
|
||||
|
@ -991,6 +990,7 @@ namespace FabricHealer.Repair
|
|||
internal int GetEntityHealthEventCountWithinTimeRange(TelemetryData repairData, TimeSpan timeWindow)
|
||||
{
|
||||
int count = 0;
|
||||
|
||||
if (repairData == null || detectedHealthEvents == null || !detectedHealthEvents.Any())
|
||||
{
|
||||
return count;
|
||||
|
@ -1015,17 +1015,11 @@ namespace FabricHealer.Repair
|
|||
break;
|
||||
}
|
||||
|
||||
var entityHealthEvents = detectedHealthEvents.Where(
|
||||
evt => evt.entityName == id && evt.healthEvent.HealthInformation.Property == repairData.Property);
|
||||
|
||||
foreach (var (_, healthEvent) in entityHealthEvents)
|
||||
{
|
||||
if (DateTime.UtcNow.Subtract(healthEvent.SourceUtcTimestamp) > timeWindow)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
count = detectedHealthEvents.Count(
|
||||
evt => evt.entityName == id
|
||||
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
|
||||
&& evt.healthEvent.HealthInformation.Property == repairData.Property
|
||||
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow);
|
||||
|
||||
// Lifetime management of Health Events list data. Data is kept in-memory only for 2 days. If FH process restarts, data is not preserved.
|
||||
if (DateTime.UtcNow.Subtract(lastHealthEventsListClearDateTime) >= maxLifeTimeHealthEventsData)
|
||||
|
@ -1087,20 +1081,21 @@ namespace FabricHealer.Repair
|
|||
if (appHealth == null || appHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
}
|
||||
|
||||
// How many times has the entity been put into Error health state in the last 2 hours?
|
||||
// How many times has the entity been put into Error health within the specified time window?
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
|
||||
{
|
||||
var orderedEvents = detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.ApplicationName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property)
|
||||
evt => evt.entityName == repairData.ApplicationName
|
||||
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
|
||||
&& evt.healthEvent.HealthInformation.Property == repairData.Property
|
||||
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
|
||||
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
|
||||
|
||||
return DateTime.UtcNow.Subtract(
|
||||
orderedEvents.Last().healthEvent.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1139,8 +1134,10 @@ namespace FabricHealer.Repair
|
|||
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
|
||||
{
|
||||
var orderedEvents = detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.ServiceName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property)
|
||||
evt => evt.entityName == repairData.ServiceName
|
||||
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
|
||||
&& evt.healthEvent.HealthInformation.Property == repairData.Property
|
||||
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
|
||||
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
|
||||
|
||||
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
|
||||
|
@ -1175,14 +1172,17 @@ namespace FabricHealer.Repair
|
|||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity been put into Error health state in the last 2 hours?
|
||||
// How many times has the entity been put into Error health state.
|
||||
// Look into LastTransition to Error.
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
if (GetEntityHealthEventCountWithinTimeRange(repairData, timeWindow) > 1)
|
||||
{
|
||||
var orderedEvents = detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.NodeName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property)
|
||||
evt => evt.entityName == repairData.NodeName
|
||||
&& evt.healthEvent.HealthInformation.SourceId == repairData.Source
|
||||
&& evt.healthEvent.HealthInformation.Property == repairData.Property
|
||||
&& DateTime.UtcNow.Subtract(evt.healthEvent.SourceUtcTimestamp) <= timeWindow)
|
||||
.OrderByDescending(o => o.healthEvent.SourceUtcTimestamp);
|
||||
|
||||
return DateTime.UtcNow.Subtract(orderedEvents.Last().healthEvent.SourceUtcTimestamp);
|
||||
|
@ -1252,7 +1252,7 @@ namespace FabricHealer.Repair
|
|||
switch (repairData.EntityType)
|
||||
{
|
||||
case EntityType.Application:
|
||||
|
||||
|
||||
var appHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
|
||||
new Uri(repairData.ApplicationName),
|
||||
|
@ -1285,7 +1285,7 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
return isTargetAppHealedOnTargetNode ? HealthState.Ok : appHealth.AggregatedHealthState;
|
||||
|
||||
|
||||
case EntityType.Service:
|
||||
|
||||
var serviceHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
|
@ -1306,7 +1306,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
case EntityType.Node:
|
||||
case EntityType.Machine:
|
||||
|
||||
|
||||
var nodeHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
|
||||
repairData.NodeName,
|
||||
|
@ -1322,9 +1322,9 @@ namespace FabricHealer.Repair
|
|||
&& repairData.HealthState == HealthState.Ok);
|
||||
|
||||
return isTargetNodeHealed ? HealthState.Ok : nodeHealth.AggregatedHealthState;
|
||||
|
||||
|
||||
case EntityType.Replica:
|
||||
|
||||
|
||||
// Make sure the Partition where the restarted replica was located is now healthy.
|
||||
var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.HealthManager.GetPartitionHealthAsync(
|
||||
|
@ -1334,10 +1334,10 @@ namespace FabricHealer.Repair
|
|||
token);
|
||||
|
||||
return partitionHealth.AggregatedHealthState;
|
||||
|
||||
|
||||
default:
|
||||
return HealthState.Unknown;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче