1.1.1.960 (DEV) - Bug fixes/refactorings.
This commit is contained in:
Родитель
e78820983c
Коммит
34c6e73123
|
@ -4,7 +4,7 @@
|
|||
<id>%PACKAGE_ID%</id>
|
||||
<version>1.1.1.960</version>
|
||||
<releaseNotes>
|
||||
- This version is built for .NET 6 runtime and requires Microsoft.ServiceFabric.Services Versions 6.0.1017 and higher (SF Runtime Version 9.0 and above). Lesser .NET and SF runtime versions are not supported by this release. Use version 1.1.0 for SF Runtime 8.x and .NET Core 3.1.
|
||||
|
||||
</releaseNotes>
|
||||
<authors>Microsoft</authors>
|
||||
<license type="expression">MIT</license>
|
||||
|
|
|
@ -155,6 +155,23 @@ namespace FabricHealer
|
|||
return singleton ??= new FabricHealerManager(context ?? throw new ArgumentException("ServiceContext can't be null..", nameof(context)), token);
|
||||
}
|
||||
|
||||
/// <summary>
/// Determines whether the cluster consists of exactly one Fabric node.
/// </summary>
/// <returns>
/// true when the paged node query yields exactly one node; false otherwise,
/// including when the query yields no list at all.
/// </returns>
public static async Task<bool> IsOneNodeClusterAsync()
{
    // Only the node count matters here, so the page size is capped rather than listing every node.
    var queryDescription = new NodeQueryDescription { MaxResults = 3 };

    NodeList nodeList =
        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
                        queryDescription,
                        FabricHealerManager.ConfigSettings.AsyncTimeout,
                        FabricHealerManager.Token),
                FabricHealerManager.Token);

    if (nodeList == null)
    {
        return false;
    }

    return nodeList.Count == 1;
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if repair manager is enabled in the cluster or not
|
||||
/// </summary>
|
||||
|
@ -775,13 +792,12 @@ namespace FabricHealer
|
|||
|| appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress
|
||||
|| appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending)
|
||||
{
|
||||
List<int> udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(appName, Token);
|
||||
var udInAppUpgrade = await UpgradeChecker.GetUDWhereApplicationUpgradeInProgressAsync(appName, Token);
|
||||
string udText = string.Empty;
|
||||
|
||||
// -1 means no upgrade in progress for application.
|
||||
if (udInAppUpgrade.Any(ud => ud > -1))
|
||||
if (udInAppUpgrade != null)
|
||||
{
|
||||
udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}";
|
||||
udText = $"in UD {udInAppUpgrade.First()}";
|
||||
}
|
||||
|
||||
string telemetryDescription = $"{appName} is upgrading {udText}. Will not attempt application repair at this time.";
|
||||
|
@ -991,7 +1007,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
this.repairTaskManager.DetectedHealthEvents.Add(evt);
|
||||
this.repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1000,19 +1016,7 @@ namespace FabricHealer
|
|||
|
||||
private async Task ProcessServiceHealthAsync(ServiceHealthState serviceHealthState)
|
||||
{
|
||||
// This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
|
||||
var nodeQueryDesc = new NodeQueryDescription
|
||||
{
|
||||
MaxResults = 3,
|
||||
};
|
||||
|
||||
NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientSingleton.QueryManager.GetNodePagedListAsync(
|
||||
nodeQueryDesc,
|
||||
ConfigSettings.AsyncTimeout,
|
||||
Token),
|
||||
Token);
|
||||
|
||||
bool isOneNodeCluster = await IsOneNodeClusterAsync();
|
||||
ServiceHealth serviceHealth;
|
||||
Uri appName;
|
||||
Uri serviceName = serviceHealthState.ServiceName;
|
||||
|
@ -1033,28 +1037,27 @@ namespace FabricHealer
|
|||
var name = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token);
|
||||
appName = name.ApplicationName;
|
||||
|
||||
// user service - upgrade check.
|
||||
// User Application upgrade check.
|
||||
if (!appName.OriginalString.Contains(RepairConstants.SystemAppName) && !serviceName.OriginalString.Contains(RepairConstants.SystemAppName))
|
||||
{
|
||||
try
|
||||
{
|
||||
var app = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token);
|
||||
var appUpgradeStatus = await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(app.ApplicationName);
|
||||
ApplicationUpgradeProgress appUpgradeProgress =
|
||||
await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(appName, ConfigSettings.AsyncTimeout, Token);
|
||||
|
||||
if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
|
||||
|| appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress
|
||||
|| appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending)
|
||||
if (appUpgradeProgress.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
|
||||
|| appUpgradeProgress.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress
|
||||
|| appUpgradeProgress.UpgradeState == ApplicationUpgradeState.RollingForwardPending)
|
||||
{
|
||||
List<int> udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(serviceName, Token);
|
||||
string udInAppUpgrade = await UpgradeChecker.GetUDWhereApplicationUpgradeInProgressAsync(serviceName, Token);
|
||||
string udText = string.Empty;
|
||||
|
||||
// -1 means no upgrade in progress for application.
|
||||
if (udInAppUpgrade.Any(ud => ud > -1))
|
||||
if (udInAppUpgrade != null)
|
||||
{
|
||||
udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}";
|
||||
udText = $"in UD {udInAppUpgrade}";
|
||||
}
|
||||
|
||||
string telemetryDescription = $"{app.ApplicationName} is upgrading {udText}. Will not attempt service repair at this time.";
|
||||
string telemetryDescription = $"{appName.OriginalString} is upgrading {udText}. Will not attempt service repair at this time.";
|
||||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
|
@ -1256,10 +1259,10 @@ namespace FabricHealer
|
|||
// All FH repairs have serialized instances of RepairExecutorData set as the value for a RepairTask's ExecutorData property.
|
||||
if (currentFHRepairs?.Count > 0)
|
||||
{
|
||||
// This prevents starting creating a new repair if another service running on a different node needs to be resarted, for example.
|
||||
// Thing of this as a UD Walk across nodes of service instances in need of repair.
|
||||
// This prevents starting creating a new repair if another service running on a different node needs to be restarted, for example.
|
||||
// Think of this as a UD Walk across nodes of service instances in need of repair.
|
||||
if (ConfigSettings.EnableRollingServiceRestarts
|
||||
&& nodes?.Count > 1
|
||||
&& !isOneNodeCluster
|
||||
&& currentFHRepairs.Any(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData)
|
||||
&& execData?.RepairData?.ServiceName?.ToLower() == repairData.ServiceName.ToLower()))
|
||||
{
|
||||
|
@ -1325,7 +1328,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.DetectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1334,31 +1337,17 @@ namespace FabricHealer
|
|||
|
||||
private async Task ProcessNodeHealthAsync(IEnumerable<NodeHealthState> nodeHealthStates)
|
||||
{
|
||||
// This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
|
||||
var nodeQueryDesc = new NodeQueryDescription
|
||||
{
|
||||
MaxResults = 3,
|
||||
};
|
||||
|
||||
NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientSingleton.QueryManager.GetNodePagedListAsync(
|
||||
nodeQueryDesc,
|
||||
ConfigSettings.AsyncTimeout,
|
||||
Token),
|
||||
Token);
|
||||
|
||||
// Node/Machine repair not supported in onebox clusters..
|
||||
if (nodes?.Count == 1)
|
||||
if (await IsOneNodeClusterAsync())
|
||||
{
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"ProcessMachineHealth::NotSupported",
|
||||
"Machine/Fabric Node repair is not supported in clusters with 1 Fabric node.",
|
||||
Token,
|
||||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
LogLevel.Info,
|
||||
$"ProcessNodeHealthAsync::ClusterNotSupported",
|
||||
"Machine/Fabric Node repair is not supported in clusters with 1 Fabric node.",
|
||||
Token,
|
||||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return;
|
||||
return;
|
||||
}
|
||||
|
||||
var supportedNodeHealthStates =
|
||||
|
@ -1468,6 +1457,7 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// TOTHINK...
|
||||
// If there are multiple instances of FH deployed to the cluster (like -1 InstanceCount), then don't do machine repairs if this instance of FH
|
||||
// detects a need to do so. Another instance on a different node will take the job. Only DiskObserver-generated repair data has to be done on the node
|
||||
// where FO's DiskObserver emitted the related information, for example (like Disk space issues and the need to clean specified (in logic rules) folders).
|
||||
|
@ -1531,7 +1521,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.DetectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1580,7 +1570,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.DetectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1638,7 +1628,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.DetectedHealthEvents.Add(healthEvent);
|
||||
repairTaskManager.detectedHealthEvents.Add(healthEvent);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
|
|
@ -17,7 +17,7 @@ namespace FabricHealer.Interfaces
|
|||
|
||||
Task<bool> DeleteFilesAsyncAsync(TelemetryData repairData, CancellationToken cancellationToken);
|
||||
|
||||
Task<bool> RemoveServiceFabricNodeStateAsync(string nodeName, CancellationToken cancellationToken);
|
||||
Task RemoveServiceFabricNodeStateAsync(string nodeName, CancellationToken cancellationToken);
|
||||
|
||||
Task<bool> RestartDeployedCodePackageAsync(TelemetryData repairData, CancellationToken cancellationToken);
|
||||
|
||||
|
|
|
@ -73,7 +73,7 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
|
|||
## ProbationWaitDurationPre (minimum time-in-error) rule. If this rule succeeds, Guan will immediately stop processing rules.
|
||||
## The HealthState constraint in the head of the rule is for efficiency purposes: don't process sub rules if the supplied entity's health state (fact) is not Error.
|
||||
## GetEntityHealthStateDuration takes a HealthState argument because it can be used to get the active duration of any supported entity health state.
|
||||
Mitigate(HealthState=Error) :- GetEntityHealthStateDuration(?duration, Entity=Machine, HealthState=Error), ?duration <= 00:02:00, !.
|
||||
Mitigate(HealthState=Error) :- GetEntityHealthStateDuration(?duration, Entity=Machine, HealthState=Error), ?duration <= 02:00:00, !.
|
||||
|
||||
## Outside of Probationary period for Error health state.
|
||||
## Try to schedule a machine repair if there are currently less than 2 outstanding repairs for machines in the cluster (MaxOutstandingRepairs).
|
||||
|
|
|
@ -11,7 +11,6 @@ using System.Fabric.Health;
|
|||
using System.Fabric.Query;
|
||||
using System.Fabric.Repair;
|
||||
using System.Linq;
|
||||
using System.Security.Policy;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using FabricHealer.Utilities;
|
||||
|
@ -24,10 +23,9 @@ namespace FabricHealer.Repair
|
|||
public static async Task<bool> IsRepairTaskInDesiredStateAsync(
    string taskId,
    string executorName,
    IList<RepairTaskState> desiredStates)
{
    // Reports whether at least one repair task returned by the Repair Manager query for
    // (taskId, executorName) is currently in any of the states listed in desiredStates.
    IList<RepairTask> repairTaskList =
        await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
                taskId,
                RepairTaskStateFilter.All,
                executorName);

    // Without tasks or without states to compare against, nothing can match.
    if (repairTaskList == null || desiredStates == null)
    {
        return false;
    }

    // Any(...) stops at the first match; the previous Count(...) > 0 form scanned the whole task list
    // once per desired state.
    return repairTaskList.Any(repairTask => desiredStates.Contains(repairTask.State));
}
|
||||
|
||||
|
@ -35,7 +33,6 @@ namespace FabricHealer.Repair
|
|||
/// Cancels a repair task based on its current state.
|
||||
/// </summary>
|
||||
/// <param name="repairTask"><see cref="RepairTask"/> to be cancelled</param>
|
||||
/// <param name="fabricClient">FabricClient instance.</param>
|
||||
/// <returns></returns>
|
||||
public static async Task CancelRepairTaskAsync(RepairTask repairTask)
|
||||
{
|
||||
|
|
|
@ -7,7 +7,6 @@ using System;
|
|||
using System.Threading.Tasks;
|
||||
using Guan.Logic;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using FabricHealer.Utilities;
|
||||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
|
|
|
@ -40,6 +40,7 @@ namespace FabricHealer.Repair.Guan
|
|||
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
|
||||
var repairTaskEngine = new RepairTaskEngine();
|
||||
int count = Input.Arguments.Count;
|
||||
string repairAction = null;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
|
@ -48,7 +49,7 @@ namespace FabricHealer.Repair.Guan
|
|||
switch (typeString)
|
||||
{
|
||||
case "String":
|
||||
string repairAction = (string)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
|
||||
repairAction = (string)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
|
||||
SetPolicyRepairAction(repairAction);
|
||||
break;
|
||||
|
||||
|
@ -100,7 +101,7 @@ namespace FabricHealer.Repair.Guan
|
|||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"ScheduleMachineRepairPredicateType::MaxOustandingRepairs",
|
||||
"ScheduleMachineRepairPredicateType::MaxOustandingRepairs",
|
||||
$"The number of outstanding machine repairs is currently at the maximum specified threshold ({RepairData.RepairPolicy.MaxConcurrentRepairs}). " +
|
||||
$"Will not schedule any other machine repairs at this time.",
|
||||
FabricHealerManager.Token);
|
||||
|
|
|
@ -26,16 +26,11 @@ using System.Fabric.Description;
|
|||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
public class RepairExecutor
|
||||
public sealed class RepairExecutor
|
||||
{
|
||||
private const double MaxWaitTimeMinutesForNodeOperation = 60.0;
|
||||
private readonly StatelessServiceContext serviceContext;
|
||||
|
||||
private bool IsOneNodeCluster
|
||||
{
|
||||
get;
|
||||
}
|
||||
|
||||
public RepairExecutor(StatelessServiceContext context, CancellationToken token)
|
||||
{
|
||||
serviceContext = context;
|
||||
|
@ -46,9 +41,6 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
return;
|
||||
}
|
||||
|
||||
IsOneNodeCluster =
|
||||
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(null, FabricHealerManager.ConfigSettings.AsyncTimeout, token).GetAwaiter().GetResult().Count == 1;
|
||||
}
|
||||
catch (FabricException fe)
|
||||
{
|
||||
|
@ -212,7 +204,7 @@ namespace FabricHealer.Repair
|
|||
RepairTask repairTask,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
if (IsOneNodeCluster)
|
||||
if (await FabricHealerManager.IsOneNodeClusterAsync())
|
||||
{
|
||||
string info = "One node cluster detected. Aborting node restart operation.";
|
||||
|
||||
|
|
|
@ -4,9 +4,7 @@
|
|||
// ------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.Fabric.Description;
|
||||
using System.Fabric.Health;
|
||||
using System.Fabric.Query;
|
||||
using System.Fabric.Repair;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
|
@ -118,20 +116,7 @@ namespace FabricHealer.Repair
|
|||
/// <returns></returns>
|
||||
public async Task<RepairTask> ScheduleInfrastructureRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
|
||||
{
|
||||
// This constraint (MaxResults) is used just to make sure there is more 1 node in the cluster. We don't need a list of all nodes.
|
||||
var nodeQueryDesc = new NodeQueryDescription
|
||||
{
|
||||
MaxResults = 3,
|
||||
};
|
||||
|
||||
NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
|
||||
nodeQueryDesc,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
if (nodes?.Count == 1)
|
||||
if (await FabricHealerManager.IsOneNodeClusterAsync())
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -21,40 +21,40 @@ using FabricHealer.Utilities;
|
|||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
public class RepairTaskManager : IRepairTasks
|
||||
public sealed class RepairTaskManager : IRepairTasks
|
||||
{
|
||||
private static readonly TimeSpan MaxWaitTimeForInfraRepairTaskCompleted = TimeSpan.FromHours(2);
|
||||
private static readonly TimeSpan MaxWaitTimeForInfrastructureRepairTaskCompleted = TimeSpan.FromHours(2);
|
||||
private static readonly TimeSpan MaxWaitTimeForFHRepairTaskCompleted = TimeSpan.FromHours(1);
|
||||
private readonly RepairTaskEngine repairTaskEngine;
|
||||
private readonly RepairExecutor RepairExec;
|
||||
private readonly TimeSpan AsyncTimeout = TimeSpan.FromSeconds(60);
|
||||
private readonly DateTime HealthEventsListCreationTime = DateTime.UtcNow;
|
||||
private readonly TimeSpan MaxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
|
||||
private DateTime LastHealthEventsListClearDateTime;
|
||||
internal readonly List<HealthEvent> DetectedHealthEvents = new List<HealthEvent>();
|
||||
private readonly RepairExecutor repairExecutor;
|
||||
private readonly TimeSpan asyncTimeout = TimeSpan.FromSeconds(60);
|
||||
private readonly DateTime healthEventsListCreationTime = DateTime.UtcNow;
|
||||
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
|
||||
private DateTime lastHealthEventsListClearDateTime;
|
||||
internal readonly List<HealthEvent> detectedHealthEvents = new List<HealthEvent>();
|
||||
internal readonly StatelessServiceContext Context;
|
||||
|
||||
/// <summary>
/// Creates a RepairTaskManager for the supplied service context.
/// </summary>
/// <param name="context">Service context; stored in <see cref="Context"/> and handed to the RepairExecutor.</param>
/// <param name="token">Cancellation token handed to the RepairExecutor constructor.</param>
public RepairTaskManager(StatelessServiceContext context, CancellationToken token)
{
    this.Context = context;
    this.repairExecutor = new RepairExecutor(context, token);
    this.repairTaskEngine = new RepairTaskEngine();

    // The health events list has never been cleared at this point, so the "last cleared" timestamp
    // starts out equal to the list's creation time.
    this.lastHealthEventsListClearDateTime = this.healthEventsListCreationTime;
}
|
||||
|
||||
// TODO.
|
||||
/// <summary>
/// Issues a RemoveNodeState request to the cluster manager for the named Fabric node.
/// </summary>
/// <param name="nodeName">Name of the target Fabric node.</param>
/// <param name="cancellationToken">Token passed through to the cluster manager call.</param>
public async Task RemoveServiceFabricNodeStateAsync(string nodeName, CancellationToken cancellationToken)
{
    var clusterManager = FabricHealerManager.FabricClientSingleton.ClusterManager;

    await clusterManager.RemoveNodeStateAsync(nodeName, asyncTimeout, cancellationToken);
}
|
||||
|
||||
/// <summary>
/// Issues an ActivateNode request to the cluster manager for the named Fabric node.
/// </summary>
/// <param name="nodeName">Name of the target Fabric node.</param>
/// <param name="cancellationToken">Token passed through to the cluster manager call.</param>
public async Task ActivateServiceFabricNodeAsync(string nodeName, CancellationToken cancellationToken)
{
    var clusterManager = FabricHealerManager.FabricClientSingleton.ClusterManager;

    await clusterManager.ActivateNodeAsync(nodeName, asyncTimeout, cancellationToken);
}
|
||||
|
||||
public async Task<bool> SafeRestartServiceFabricNodeAsync(TelemetryData repairData, RepairTask repairTask, CancellationToken cancellationToken)
|
||||
{
|
||||
if (!await RepairExec.SafeRestartFabricNodeAsync(repairData, repairTask, cancellationToken))
|
||||
if (!await repairExecutor.SafeRestartFabricNodeAsync(repairData, repairTask, cancellationToken))
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
|
@ -295,16 +295,14 @@ namespace FabricHealer.Repair
|
|||
|
||||
return true;
|
||||
|
||||
// TODO: ensure the target node is healthy after successful completion of the Repair Job (which includes MaxTimePostRepairHealthCheck (probation)).
|
||||
|
||||
/*var timer = Stopwatch.StartNew();
|
||||
|
||||
// It can take a while to get from a VM reboot/reimage to a healthy Fabric node, so block here until repair completes.
|
||||
// Note that, by design, this will block any other FabricHealer-initiated repair from taking place in the cluster.
|
||||
// FabricHealer is designed to be very conservative with respect to node level repairs.
|
||||
// It is a good idea to not change this default behavior.
|
||||
TimeSpan maxWaitTime =
|
||||
repairData.RepairPolicy.MaxTimePostRepairHealthCheck > TimeSpan.MinValue ? repairData.RepairPolicy.MaxTimePostRepairHealthCheck : MaxWaitTimeForInfraRepairTaskCompleted;
|
||||
|
||||
while (timer.Elapsed < maxWaitTime)
|
||||
while (timer.Elapsed < MaxWaitTimeForInfrastructureRepairTaskCompleted)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
|
@ -317,7 +315,31 @@ namespace FabricHealer.Repair
|
|||
continue;
|
||||
}
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
// TODO: Post-repair HealthState (Ok) probation requires that the target node be healthy for 30 minutes after successful repair...
|
||||
// Check to see if this repair has already completed, but is still in the post-repair probationary period specified in the logic rule.
|
||||
if (repairData.RepairPolicy.MaxTimePostRepairHealthCheck > TimeSpan.MinValue)
|
||||
{
|
||||
int completedRepairInProbation =
|
||||
await FabricRepairTasks.GetCompletedRepairCountWithinTimeRangeAsync(
|
||||
repairData.RepairPolicy.MaxTimePostRepairHealthCheck,
|
||||
repairData,
|
||||
FabricHealerManager.Token,
|
||||
repairData.RepairPolicy.InfrastructureRepairName);
|
||||
|
||||
if (completedRepairInProbation > 0)
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ScheduleMachineRepairPredicateType::PostRepairProbation",
|
||||
$"Repair {repairData.RepairPolicy.RepairId} is not yet complete: The target node is in post-repair health check probation: " +
|
||||
$"Probation period = {repairData.RepairPolicy.MaxTimePostRepairHealthCheck}.",
|
||||
FabricHealerManager.Token);
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"InfrastructureRepairTask::Completed",
|
||||
$"Successfully completed repair {repairData.RepairPolicy.RepairId}",
|
||||
|
@ -332,39 +354,39 @@ namespace FabricHealer.Repair
|
|||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ScheduleInfrastructureRepairTask::Timeout",
|
||||
$"Max wait time of {MaxWaitTimeForInfraRepairTaskCompleted} has elapsed for repair " +
|
||||
$"Max wait time of {MaxWaitTimeForInfrastructureRepairTaskCompleted} has elapsed for repair " +
|
||||
$"{repairData.RepairPolicy.RepairId}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;*/
|
||||
return false; */
|
||||
}
|
||||
|
||||
/// <summary>
/// Forwards a file deletion repair to the repair executor.
/// </summary>
/// <param name="repairData">Repair data describing the deletion; must not be null.</param>
/// <param name="cancellationToken">Token passed through to the executor.</param>
/// <returns>The boolean result produced by the executor's DeleteFilesAsync call.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="repairData"/> is null.</exception>
public async Task<bool> DeleteFilesAsyncAsync(TelemetryData repairData, CancellationToken cancellationToken)
{
    if (repairData is null)
    {
        // The single-argument ArgumentException constructor takes a message, so passing only
        // nameof(repairData) produced the message "repairData". Supply a real message and put
        // the parameter name where it belongs.
        throw new ArgumentException("repairData can't be null.", nameof(repairData));
    }

    return await repairExecutor.DeleteFilesAsync(repairData, cancellationToken);
}
|
||||
|
||||
/// <summary>
/// Forwards a replica restart repair to the repair executor.
/// </summary>
/// <param name="repairData">Repair data identifying the replica; must not be null.</param>
/// <param name="cancellationToken">Token passed through to the executor.</param>
/// <returns>The boolean result produced by the executor's RestartReplicaAsync call.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="repairData"/> is null.</exception>
public async Task<bool> RestartReplicaAsync(TelemetryData repairData, CancellationToken cancellationToken)
{
    if (repairData is null)
    {
        throw new ArgumentException("repairData can't be null.");
    }

    bool restarted = await repairExecutor.RestartReplicaAsync(repairData, cancellationToken);

    return restarted;
}
|
||||
|
||||
/// <summary>
/// Forwards a replica removal repair to the repair executor.
/// </summary>
/// <param name="repairData">Repair data identifying the replica; must not be null.</param>
/// <param name="cancellationToken">Token passed through to the executor.</param>
/// <returns>The boolean result produced by the executor's RemoveReplicaAsync call.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="repairData"/> is null.</exception>
public async Task<bool> RemoveReplicaAsync(TelemetryData repairData, CancellationToken cancellationToken)
{
    if (repairData is null)
    {
        throw new ArgumentException("repairData can't be null.");
    }

    bool removed = await repairExecutor.RemoveReplicaAsync(repairData, cancellationToken);

    return removed;
}
|
||||
|
||||
public async Task<bool> RestartDeployedCodePackageAsync(TelemetryData repairData, CancellationToken cancellationToken)
|
||||
{
|
||||
var result = await RepairExec.RestartDeployedCodePackageAsync(
|
||||
var result = await repairExecutor.RestartDeployedCodePackageAsync(
|
||||
repairData ?? throw new ArgumentException("repairData can't be null."),
|
||||
cancellationToken);
|
||||
|
||||
|
@ -401,7 +423,7 @@ namespace FabricHealer.Repair
|
|||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
bool result = await RepairExec.RestartSystemServiceProcessAsync(repairData, cancellationToken);
|
||||
bool result = await repairExecutor.RestartSystemServiceProcessAsync(repairData, cancellationToken);
|
||||
|
||||
if (!result)
|
||||
{
|
||||
|
@ -425,7 +447,7 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
try
|
||||
{
|
||||
var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(nodeName, AsyncTimeout, cancellationToken);
|
||||
var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(nodeName, asyncTimeout, cancellationToken);
|
||||
return nodes.Count > 0 ? nodes[0] : null;
|
||||
}
|
||||
catch (FabricException fe)
|
||||
|
@ -499,7 +521,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
var executorData = new RepairExecutorData
|
||||
{
|
||||
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForInfraRepairTaskCompleted.TotalMinutes,
|
||||
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForFHRepairTaskCompleted.TotalMinutes,
|
||||
RepairData = repairData
|
||||
};
|
||||
|
||||
|
@ -1008,7 +1030,7 @@ namespace FabricHealer.Repair
|
|||
internal int GetEntityHealthEventCountWithinTimeRange(string property, TimeSpan timeWindow)
|
||||
{
|
||||
int count = 0;
|
||||
var healthEvents = DetectedHealthEvents.Where(evt => evt.HealthInformation.Property == property);
|
||||
var healthEvents = detectedHealthEvents.Where(evt => evt.HealthInformation.Property == property);
|
||||
|
||||
if (healthEvents == null || !healthEvents.Any())
|
||||
{
|
||||
|
@ -1025,10 +1047,10 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
// Lifetime management of Health Events list data. Data is kept in-memory only for 2 days. If FH process restarts, data is not preserved.
|
||||
if (DateTime.UtcNow.Subtract(LastHealthEventsListClearDateTime) >= MaxLifeTimeHealthEventsData)
|
||||
if (DateTime.UtcNow.Subtract(lastHealthEventsListClearDateTime) >= maxLifeTimeHealthEventsData)
|
||||
{
|
||||
DetectedHealthEvents.Clear();
|
||||
LastHealthEventsListClearDateTime = DateTime.UtcNow;
|
||||
detectedHealthEvents.Clear();
|
||||
lastHealthEventsListClearDateTime = DateTime.UtcNow;
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
// ------------------------------------------------------------
|
||||
|
||||
using Newtonsoft.Json;
|
||||
using System;
|
||||
using System.IO;
|
||||
|
||||
namespace FabricHealer.Utilities
|
||||
|
|
|
@ -4,8 +4,9 @@
|
|||
// ------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Fabric;
|
||||
using System.Fabric.Description;
|
||||
using System.Fabric.Query;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
@ -18,70 +19,46 @@ namespace FabricHealer.Repair
|
|||
private static readonly Logger Logger = new Logger("UpgradeLogger");
|
||||
|
||||
/// <summary>
/// Gets the name of the upgrade domain (UD) in which the specified application's upgrade is currently executing.
/// </summary>
/// <param name="appName">Application Name (Fabric Uri format)</param>
/// <param name="token">CancellationToken</param>
/// <returns>
/// UD where application upgrade is currently executing, or null if there is no upgrade in progress,
/// no application name was supplied, cancellation was already requested, or the upgrade progress query failed.
/// </returns>
internal static async Task<string> GetUDWhereApplicationUpgradeInProgressAsync(Uri appName, CancellationToken token)
{
    // Nothing to query without an application name, and no reason to query once cancellation is requested.
    if (appName == null || token.IsCancellationRequested)
    {
        return null;
    }

    try
    {
        ApplicationUpgradeProgress upgradeProgress =
            await FabricHealerManager.FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(
                    appName,
                    FabricHealerManager.ConfigSettings.AsyncTimeout,
                    token);

        if (upgradeProgress == null)
        {
            return null;
        }

        // Only these three states are treated as "an upgrade is in progress".
        bool isUpgradeInProgress =
            upgradeProgress.UpgradeState == ApplicationUpgradeState.RollingBackInProgress ||
            upgradeProgress.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress ||
            upgradeProgress.UpgradeState == ApplicationUpgradeState.RollingForwardPending;

        if (!isUpgradeInProgress)
        {
            return null;
        }

        // Null-conditional access: a missing CurrentUpgradeDomainProgress would otherwise raise a
        // NullReferenceException, which the exception filter below does not handle.
        return upgradeProgress.CurrentUpgradeDomainProgress?.UpgradeDomainName;
    }
    catch (Exception e) when (e is ArgumentException || e is FabricException || e is TimeoutException)
    {
        Logger.LogError($"Exception getting UDs for application upgrade in progress for {appName.OriginalString}:{Environment.NewLine}{e.Message}");
        return null;
    }
}
|
||||
|
||||
|
@ -91,7 +68,7 @@ namespace FabricHealer.Repair
|
|||
/// <param name="fabricClient">FabricClient</param>
|
||||
/// <param name="token"></param>
|
||||
/// <returns>UD in progress</returns>
|
||||
public static async Task<string> GetCurrentUDWhereFabricUpgradeInProgressAsync(CancellationToken token)
|
||||
internal static async Task<string> GetCurrentUDWhereFabricUpgradeInProgressAsync(CancellationToken token)
|
||||
{
|
||||
try
|
||||
{
|
||||
|
@ -122,7 +99,7 @@ namespace FabricHealer.Repair
|
|||
/// <param name="nodeType">NodeType string</param>
|
||||
/// <param name="token">CancellationToken instance</param>
|
||||
/// <returns>true if tenant update is in progress, false otherwise</returns>
|
||||
public static async Task<bool> IsAzureUpdateInProgress(string nodeType, string nodeName, CancellationToken token)
|
||||
internal static async Task<bool> IsAzureUpdateInProgress(string nodeType, string nodeName, CancellationToken token)
|
||||
{
|
||||
var repairTasks = await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
|
||||
null,
|
||||
|
|
|
@ -15,7 +15,6 @@ using Polly;
|
|||
using System.Collections.Generic;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Runtime.InteropServices;
|
||||
using static System.Fabric.Query.ReplicaStatus;
|
||||
|
||||
namespace FabricHealer
|
||||
{
|
||||
|
|
Загрузка…
Ссылка в новой задаче