Re-impl (DEV): MachineRepair.
This commit is contained in:
Родитель
3caffa95d2
Коммит
698bc2b3cc
|
@ -285,7 +285,7 @@ namespace FHTest
|
|||
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairTaskManager, executorData, repairTaskEngine, repairData));
|
||||
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairTaskManager, repairData));
|
||||
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairTaskManager, repairData));
|
||||
functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, repairTaskManager, repairData));
|
||||
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairTaskManager, repairData));
|
||||
|
||||
// Parse rules
|
||||
Module module = Module.Parse("external", repairRules, functorTable);
|
||||
|
|
|
@ -356,7 +356,7 @@ namespace FabricHealer
|
|||
|
||||
await Task.Delay(
|
||||
TimeSpan.FromSeconds(
|
||||
ConfigSettings.ExecutionLoopSleepSeconds > 0 ? ConfigSettings.ExecutionLoopSleepSeconds : 10), Token);
|
||||
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
|
||||
}
|
||||
|
||||
RepairLogger.LogInfo("Shutdown signaled. Stopping.");
|
||||
|
@ -1330,7 +1330,7 @@ namespace FabricHealer
|
|||
repairTaskManager.DetectedHealthEvents.Add(evt);
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync((TelemetryData)repairData, repairRules, Token);
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1357,11 +1357,11 @@ namespace FabricHealer
|
|||
Token.ThrowIfCancellationRequested();
|
||||
|
||||
var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(node.NodeName);
|
||||
var observerHealthEvents =
|
||||
var nodeHealthEvents =
|
||||
nodeHealth.HealthEvents.Where(
|
||||
s => (s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error));
|
||||
|
||||
foreach (var evt in observerHealthEvents)
|
||||
foreach (var evt in nodeHealthEvents)
|
||||
{
|
||||
Token.ThrowIfCancellationRequested();
|
||||
|
||||
|
@ -1373,7 +1373,11 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// TODO: Remove this hard requirement (TelemetryData-only). FH can just read health event data and learn what the problem is if FO/FH Proxy did not generate the health event. This is important
|
||||
// for cases where customers are not using FO or FHProxy, but want to use FH.
|
||||
// Check to see if the event Description is a serialized instance of TelemetryData, which would mean the health report was generated in a supported way.
|
||||
// In the case where there is no TelemetryData involved, create a new TelemetryData and set it with the minimum number of facts required to accomplish the goal.
|
||||
// This is trivial for the Machine repair case, but will get more complicated for other entities. That said, it is very doable.
|
||||
if (!JsonSerializationUtility.TryDeserialize(evt.HealthInformation.Description, out TelemetryData repairData))
|
||||
{
|
||||
continue;
|
||||
|
@ -1423,7 +1427,7 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// Get repair rules for supported source Observer.
|
||||
// Get repair rules for supplied facts (TelemetryData).
|
||||
var repairRules = GetRepairRulesForTelemetryData(repairData);
|
||||
|
||||
if (repairRules == null || repairRules.Count == 0)
|
||||
|
|
|
@ -79,8 +79,10 @@ Mitigate(AppName="fabric:/PortEater42", MetricName="EphemeralPorts", MetricValue
|
|||
## and where at least 3 health events identifying this problem were produced in the last 15 minutes. This is useful to ensure you don't mitigate a transient (short-lived)
|
||||
## problem as they will self-correct.
|
||||
|
||||
Mitigate(AppName="fabric:/CpuStress", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 15,
|
||||
GetHealthEventHistory(?HealthEventCount, 00:15:00),
|
||||
## How long has it been in unhealthy state and how long has it not been in unhealthy state (healthy state).
|
||||
|
||||
Mitigate(ServiceName="fabric:/CpuStress/CpuStressor", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 85,
|
||||
GetHealthEventHistory(?HealthEventCount, 00:30:00),
|
||||
?HealthEventCount >= 3,
|
||||
TimeScopedRestartCodePackage(1, 00:15:00).
|
||||
|
||||
|
@ -162,7 +164,7 @@ Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPortsPercent", MetricVa
|
|||
Mitigate(AppName=?AppName, MetricName="Threads", MetricValue=?MetricValue) :- ?AppName != "fabric:/FabricObserver", ?MetricValue >= 400, TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
|
||||
## Threads - Any app service. 5 repairs within 5 hour window. This means if FO warns on Thread count, then heal. There are no conditional checks (on MetricValue) to take place.
|
||||
## Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
|
||||
## Generic rule for restarting any service in Warning or Error. This means any service that is in Error or Warning state and
|
||||
## also specified in the serialized TelemetryData instance that forms the Description of the related Service level Health Event will be restarted.
|
||||
|
@ -186,5 +188,5 @@ TimeScopedRestartReplica(?count, ?time) :- GetRepairHistory(?repairCount, ?time)
|
|||
## See below for an example using both optional arguments. Named arguments are just used for clarity below; you could also just specify RestartCodePackage(true, 00:10:00), for example.
|
||||
## Note: It's up to you to decide if you want RepairManager to conduct pre and post health checks.
|
||||
|
||||
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00).
|
||||
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:10:00).
|
||||
TimeScopedRestartReplica() :- RestartReplica(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:01:00).
|
|
@ -1,6 +1,6 @@
|
|||
## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today.
|
||||
## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today for VMSS-managed clusters.
|
||||
|
||||
## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, Renamed for brevity by FH.
|
||||
## Applicable Named Arguments for Mitigate. Corresponding facts are supplied by FabricObserver or FabricHealerProxy, renamed for brevity by FH.
|
||||
## | Argument Name | Definition |
|
||||
## |---------------------------|----------------------------------------------------------------------------------------------|
|
||||
## | NodeName | Name of the node |
|
||||
|
@ -22,37 +22,75 @@
|
|||
## | FileHandles (Linux) |
|
||||
## | FileHandlesPercent (Linux)|
|
||||
|
||||
## Supported repair action names.
|
||||
## | Name |
|
||||
## |------------------------------|
|
||||
## | System.Reboot |
|
||||
## | System.ReimageOS |
|
||||
## | System.FullReimage |
|
||||
## | System.Azure.HostReboot |
|
||||
## | System.Azure.HostRepaveData |
|
||||
|
||||
## First, check if we are inside run interval. If so, then cut (!).
|
||||
## This is commented out by default. Just uncomment and set the global run interval for VM level repairs to suit your needs.
|
||||
|
||||
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
|
||||
|
||||
## TimeScopedRestartVM is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window.
|
||||
## TimeScopedRebootMachine is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window.
|
||||
## If Completed VM Repair count is less than supplied value, then run RestartVM mitigation.
|
||||
|
||||
TimeScopedRestartVM(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
|
||||
TimeScopedRebootMachine(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
|
||||
?repairCount < ?count,
|
||||
RestartVM().
|
||||
ScheduleMachineRepair("System.Reboot").
|
||||
|
||||
## Percent Memory in Use (of total physical).
|
||||
|
||||
Mitigate(MetricName="MemoryPercent", MetricValue=?MetricValue) :- ?MetricValue >= 95,
|
||||
Mitigate(MetricName=MemoryPercent, MetricValue=?MetricValue) :- ?MetricValue >= 95,
|
||||
GetHealthEventHistory(?HealthEventCount, 00:15:00),
|
||||
?HealthEventCount >= 3,
|
||||
TimeScopedRestartVM(4, 08:00:00).
|
||||
|
||||
TimeScopedRebootMachine(4, 08:00:00).
|
||||
|
||||
## File Handles/FDs. Linux-only.
|
||||
## Percent Allocated, System-wide.
|
||||
|
||||
Mitigate(MetricName="FileHandlesPercent", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 95,
|
||||
Mitigate(MetricName=FileHandlesPercent, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 95,
|
||||
GetHealthEventHistory(?HealthEventCount, 00:15:00),
|
||||
?HealthEventCount >= 3,
|
||||
TimeScopedRestartVM(2, 08:00:00).
|
||||
TimeScopedRebootMachine(2, 08:00:00).
|
||||
|
||||
## Total Allocated, System-wide.
|
||||
|
||||
Mitigate(MetricName="FileHandles", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 1000000,
|
||||
Mitigate(MetricName=FileHandles, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 1000000,
|
||||
GetHealthEventHistory(?HealthEventCount, 00:15:00),
|
||||
?HealthEventCount >= 3,
|
||||
TimeScopedRestartVM(2, 08:00:00).
|
||||
TimeScopedRebootMachine(2, 08:00:00).
|
||||
|
||||
## Reboot/Reimage. The rules below satisfy the following requirements:
|
||||
## Repair throttling (MaxOutstandingRepairTasks), time-in-Error checking (pre-probation - GetCurrentEntityHealthStateDuration),
|
||||
## cap repair job attempts within time intervals (how many times to try a repair before trying something else, for example, escalation..),
|
||||
## time-to-wait after repair completes (post-probation - ProbationToHealthyWaitDurationPostRepair),
|
||||
## attempt scheduling disk reimage if rebooting didn't work (user-configured repair escalation).
|
||||
|
||||
## Logic workflow. Note: Since we are in this rule file, the context is already known to be Machine. FH has already determined what the facts are.
|
||||
## If Health State is Error, then proceed. This constraint is in the head of the rule for convenience and readability.
|
||||
## How long has the node been in Error state (probation)?
|
||||
## If at least 2 hours, then continue. Else, stop processing rules (end cut (!)).
|
||||
## How many times has this machine repair been run in the last 4 hours? If less than twice, continue. This is added logic, does not exist in RPE. Like all of this, customer owns the configuration.
|
||||
## Only schedule the repair if there are fewer than 2 machine-level repairs currently in flight in the cluster.
|
||||
## "Currently in flight" means RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing.
|
||||
## Wait 30 mins for green node after IS completes execution and RM has restored node (no health checks for Error state, by default. Only safety checks.)
|
||||
## If any of the sub-rules do not succeed, then Guan will go to the next rule, which in this case will attempt to schedule a Disk reimage repair job with the same logical constraints/workflow.
|
||||
|
||||
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetCurrentEntityHealthStateDuration(?HealthStateDuration, Entity=Machine, Target=?NodeName, State=Error),
|
||||
?HealthStateDuration <= 02:00:00, !.
|
||||
|
||||
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
|
||||
?repairCount < 2,
|
||||
ScheduleMachineRepair("System.Azure.HostReboot", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
|
||||
|
||||
## Data disk reimage. Not supported on VMSS-managed virtual machines.
|
||||
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
|
||||
?repairCount < 2,
|
||||
ScheduleMachineRepair("System.Azure.HostRepaveData", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
|
||||
|
||||
## Human intervention is required (Triage)
|
|
@ -1,57 +1,110 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<Settings xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
|
||||
<Section Name="RepairManagerConfiguration">
|
||||
<!-- ***Overridable Parameters***
|
||||
These must be set in ApplicationManifest.xml -->
|
||||
<Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableETW" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
|
||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
|
||||
<Section Name="RepairManagerConfiguration">
|
||||
<!-- ***Overridable Parameters*** These must be set in ApplicationManifest.xml -->
|
||||
<!-- Interval in seconds for how often FabricHealer wakes up and scans health states to schedule repairs. -->
|
||||
<Parameter Name="HealthCheckIntervalInSeconds" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableETW" Value="" MustOverride="true" />
|
||||
<!-- Big Red Button: You can turn FabricHealer on and off with a versionless parameter-only application upgrade. -->
|
||||
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
|
||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
|
||||
|
||||
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
|
||||
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
|
||||
|
||||
<!-- Default timeout for async SF API calls. -->
|
||||
<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
|
||||
<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
|
||||
<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
|
||||
<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
|
||||
<Parameter Name="AppInsightsInstrumentationKey" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsSharedKey" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
|
||||
</Section>
|
||||
<!-- Default timeout for async SF API calls. -->
|
||||
<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
|
||||
<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
|
||||
<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
|
||||
<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
|
||||
<Parameter Name="AppInsightsInstrumentationKey" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsSharedKey" Value="" />
|
||||
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
|
||||
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
|
||||
</Section>
|
||||
|
||||
<!-- Repair Policies - Overridable Parameters. Must be set in ApplicationManifest.xml. -->
|
||||
<Section Name="AppRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" />
|
||||
</Section>
|
||||
<Section Name="DiskRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="FabricNodeRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="ReplicaRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="SystemServiceRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
|
||||
<!-- Repair Policies - Overridable Parameters. Must be set in ApplicationManifest.xml. -->
|
||||
<Section Name="AppRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" />
|
||||
</Section>
|
||||
<Section Name="DiskRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="FabricNodeRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="ReplicaRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="SystemServiceRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<Section Name="MachineRepairPolicy">
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<!-- Machine Repair. -->
|
||||
<Section Name="MachineRepairPolicy">
|
||||
<!-- FabricHealer is not allowed to schedule a machine repair more often than this interval.
|
||||
This is to prevent the FabricHealer from scheduling too many machine repairs in a short time. -->
|
||||
<Parameter Name="ActionSchedulingIntervalInSeconds" Value="600" />
|
||||
<Parameter Name="ProbationToFailingWaitDurationInSeconds" Value="7200" />
|
||||
<Parameter Name="ProbationToHealthyWaitDurationInSeconds" Value="1800" />
|
||||
<Parameter Name="MinimumHealthyDurationInSeconds" Value="300" />
|
||||
<Parameter Name="Enabled" Value="" MustOverride="true" />
|
||||
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<!-- Supported Machine level repair actions. -->
|
||||
<!-- RM Repair action strings (built into RM).
|
||||
========================================================================
|
||||
Alias VM (default) Host
|
||||
========================================================================
|
||||
System.Reboot System.Azure.Reboot System.Azure.HostReboot
|
||||
System.ReimageOS System.Azure.ReimageOS
|
||||
System.FullReimage System.Azure.RepaveData System.Azure.HostRepaveData
|
||||
========================================================================-->
|
||||
<Section Name="NodeRepairActionList">
|
||||
<!-- This is the name of the repair that will be provided to RM by FH when scheduling the repair. -->
|
||||
<Parameter Name="System.Azure.HostReboot" Value="RepairPolicyHostReboot" />
|
||||
<!-- This is not supported for VMSS-based machine clusters. -->
|
||||
<!--<Parameter Name="System.Azure.HostRepaveData" Value="RepairPolicyHostRepaveData" />-->
|
||||
<Parameter Name="ManualTriageNeeded" Value="RepairPolicyTriage" />
|
||||
</Section>
|
||||
<Section Name="RepairPolicyHostReboot">
|
||||
<!-- Required - How many repair tasks of this kind can be scheduled concurrently. -->
|
||||
<Parameter Name="MaxOutstandingRepairTasks" Value="2" />
|
||||
<!-- Required - Probationary period to failing after this repair -->
|
||||
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
|
||||
<!-- Required - Probationary period to healthy after this repair -->
|
||||
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
|
||||
<!-- Required - Minimum time of error state before this repair is applied -->
|
||||
<Parameter Name="PolicyActionTimeInSeconds" Value="7200" />
|
||||
<!-- This is optional and defaults to true -->
|
||||
<Parameter Name="IsEnabled" Value="true" />
|
||||
</Section>
|
||||
<!-- This is not supported for VMSS-based machine clusters. -->
|
||||
<Section Name="RepairPolicyHostRepaveData">
|
||||
<Parameter Name="MaxOutstandingRepairTasks" Value="2" />
|
||||
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
|
||||
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
|
||||
<Parameter Name="PolicyActionTimeInSeconds" Value="14400" />
|
||||
<Parameter Name="IsEnabled" Value="false" />
|
||||
</Section>
|
||||
<Section Name="RepairPolicyTriage">
|
||||
<Parameter Name="MaxOutstandingRepairTasks" Value="2" MustOverride="false" />
|
||||
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
|
||||
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
|
||||
<Parameter Name="PolicyActionTimeInSeconds" Value="9000" />
|
||||
<Parameter Name="IsEnabled" Value="true" />
|
||||
</Section>
|
||||
<!-- End Machine Repair -->
|
||||
</Settings>
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Fabric;
|
||||
using System.Fabric.Description;
|
||||
using System.Fabric.Health;
|
||||
using System.Fabric.Query;
|
||||
using System.Fabric.Repair;
|
||||
using System.Linq;
|
||||
|
@ -154,19 +156,20 @@ namespace FabricHealer.Repair
|
|||
|
||||
await Task.Delay(new Random().Next(100, 1500));
|
||||
|
||||
var isRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsFHRepairTaskRunningAsync(executorName, repairData, token);
|
||||
bool isRepairInProgress = await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, token);
|
||||
|
||||
if (isRepairAlreadyInProgress)
|
||||
if (isRepairInProgress)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
switch (repairAction)
|
||||
{
|
||||
case RepairActionType.RestartVM:
|
||||
case RepairActionType.RebootMachine:
|
||||
case RepairActionType.ReimageDisk:
|
||||
case RepairActionType.ReimageOS:
|
||||
|
||||
repairTask = await repairTaskEngine.CreateVmRebootISRepairTaskAsync(repairData, executorName, token);
|
||||
repairTask = await repairTaskEngine.CreateMachineRepairTaskAsync(repairData, executorName, token);
|
||||
break;
|
||||
|
||||
case RepairActionType.DeleteFiles:
|
||||
|
@ -209,7 +212,7 @@ namespace FabricHealer.Repair
|
|||
try
|
||||
{
|
||||
var isRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsFHRepairTaskRunningAsync(repairTask.Executor, repairData, token);
|
||||
await repairTaskEngine.IsRepairInProgressAsync(repairTask.Executor, repairData, token);
|
||||
|
||||
if (!isRepairAlreadyInProgress)
|
||||
{
|
||||
|
@ -395,5 +398,69 @@ namespace FabricHealer.Repair
|
|||
|
||||
return count;
|
||||
}
|
||||
|
||||
/// <summary>
/// Reports how long an entity has been in the supplied health state, measured as the age of the
/// first non-expired health event returned by a state-filtered health query.
/// Only Machine and Node entities are queried; every other entity type yields TimeSpan.MinValue.
/// </summary>
/// <param name="entityType">Kind of entity to inspect.</param>
/// <param name="entityFilter">Value handed to NodeHealthQueryDescription for Machine/Node entities (the node name at the visible call site).</param>
/// <param name="state">Health state used to build the events filter.</param>
/// <param name="token">Cancellation token forwarded to the health query.</param>
/// <returns>
/// DateTime.UtcNow minus the matching event's SourceUtcTimestamp, or TimeSpan.MinValue when the
/// entity type is not handled, the query returns nothing, or every returned event is expired.
/// </returns>
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(EntityType entityType, string entityFilter, HealthState state, CancellationToken token)
{
    // Application and Service are recognized but have no duration logic yet; like any
    // unrecognized entity type they fall through to the MinValue sentinel.
    if (entityType != EntityType.Machine && entityType != EntityType.Node)
    {
        return TimeSpan.MinValue;
    }

    // Translate the requested state into the matching events filter; anything other than
    // Warning/Error/Ok maps to None.
    var eventsFilter = new HealthEventsFilter
    {
        HealthStateFilterValue = state switch
        {
            HealthState.Warning => HealthStateFilter.Warning,
            HealthState.Error => HealthStateFilter.Error,
            HealthState.Ok => HealthStateFilter.Ok,
            _ => HealthStateFilter.None
        }
    };

    var nodeQuery = new NodeHealthQueryDescription(entityFilter)
    {
        EventsFilter = eventsFilter
    };

    var nodeHealth =
        await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
                nodeQuery, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

    if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
    {
        return TimeSpan.MinValue;
    }

    // The first event that has not expired determines the duration.
    var activeEvent = nodeHealth.HealthEvents.FirstOrDefault(healthEvent => !healthEvent.IsExpired);

    return activeEvent == null
        ? TimeSpan.MinValue
        : DateTime.UtcNow.Subtract(activeEvent.SourceUtcTimestamp);
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.Threading.Tasks;
|
||||
using Guan.Logic;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using System.Fabric.Health;
|
||||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
/// <summary>
/// Guan external predicate that produces the length of time an entity has been in a given health state.
/// Rule usage: GetCurrentEntityHealthStateDuration(?HealthStateDuration, Machine, ?NodeName, State=Error)
/// </summary>
public class GetCurrentEntityHealthStateDurationPredicateType : PredicateType
{
    // Static state: reassigned on every Singleton() call.
    private static RepairTaskManager RepairTaskManager;
    private static GetCurrentEntityHealthStateDurationPredicateType Instance;

    /// <summary>
    /// Resolver that evaluates the predicate and returns a term carrying the computed duration.
    /// </summary>
    private class Resolver : GroundPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context, 1)
        {

        }

        // GetCurrentEntityHealthStateDuration(?HealthStateDuration, Machine, ?NodeName, State=Error)
        /// <summary>
        /// Reads the entity type (argument 2), target name (argument 3) and health state (argument 4),
        /// queries the current health state duration and returns it as argument "0" of the result term.
        /// </summary>
        /// <exception cref="GuanException">
        /// Thrown when the predicate is not given exactly 4 arguments, or when the entity type or
        /// health state argument is not a valid enum name.
        /// </exception>
        protected override async Task<Term> GetNextTermAsync()
        {
            if (Input.Arguments.Count != 4)
            {
                throw new GuanException("GetCurrentEntityHealthStateDuration predicate requires 4 arguments.");
            }

            if (!Enum.TryParse((string)Input.Arguments[1].Value.GetEffectiveTerm().GetObjectValue(), out EntityType entityType))
            {
                throw new GuanException("The second argument of GetCurrentEntityHealthStateDuration must be a valid EntityType value (Application, Service, Node, Machine, etc..)");
            }

            // Index 3 is the fourth argument; the message must say so, or rule authors are pointed at the wrong argument.
            if (!Enum.TryParse((string)Input.Arguments[3].Value.GetEffectiveTerm().GetObjectValue(), out HealthState state))
            {
                throw new GuanException("The fourth argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
            }

            string nodeName = (string)Input.Arguments[2].Value.GetEffectiveTerm().GetObjectValue();

            TimeSpan duration =
                await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, nodeName, state, RepairTaskManager.Token);

            // The result term reuses the input functor and carries the duration as its single argument, named "0".
            var result = new CompoundTerm(this.Input.Functor);
            result.AddArgument(new Constant(duration), "0");
            return result;
        }
    }

    /// <summary>
    /// Returns the shared predicate type instance, creating it on first use. The supplied
    /// RepairTaskManager replaces the previously stored one on every call.
    /// </summary>
    /// <param name="name">Predicate name; only used when the instance is first created.</param>
    /// <param name="repairTaskManager">Manager whose Token is passed to the health state duration query.</param>
    public static GetCurrentEntityHealthStateDurationPredicateType Singleton(string name, RepairTaskManager repairTaskManager)
    {
        RepairTaskManager = repairTaskManager;
        return Instance ??= new GetCurrentEntityHealthStateDurationPredicateType(name);
    }

    // NOTE(review): base arguments (true, 1) are presumably "public predicate" and "minimum positional arguments" - confirm against Guan's PredicateType.
    private GetCurrentEntityHealthStateDurationPredicateType(string name)
             : base(name, true, 1)
    {

    }

    /// <summary>
    /// Creates the resolver that evaluates this predicate for the supplied input term.
    /// </summary>
    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }

    /// <summary>
    /// Rejects terms whose first argument is already ground: it must be a variable.
    /// </summary>
    /// <exception cref="GuanException">Thrown when the first argument is ground.</exception>
    public override void AdjustTerm(CompoundTerm term, Rule rule)
    {
        if (term.Arguments[0].Value.IsGround())
        {
            throw new GuanException("The first argument of GetCurrentEntityHealthStateDuration must be a variable: {0}", term);
        }
    }
}
|
||||
}
|
|
@ -70,7 +70,7 @@ namespace FabricHealer.Repair.Guan
|
|||
// Block attempts to create node-level repair tasks if one is already running in the cluster.
|
||||
var repairTaskEngine = new RepairTaskEngine();
|
||||
var isNodeRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsFHRepairTaskRunningAsync(
|
||||
await repairTaskEngine.IsRepairInProgressAsync(
|
||||
RepairTaskEngine.FabricHealerExecutorName,
|
||||
RepairData,
|
||||
RepairTaskManager.Token);
|
||||
|
|
|
@ -11,11 +11,11 @@ using System.Threading.Tasks;
|
|||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
public class RestartMachinePredicateType : PredicateType
|
||||
public class ScheduleMachineRepairPredicateType : PredicateType
|
||||
{
|
||||
private static RepairTaskManager RepairTaskManager;
|
||||
private static TelemetryData RepairData;
|
||||
private static RestartMachinePredicateType Instance;
|
||||
private static ScheduleMachineRepairPredicateType Instance;
|
||||
|
||||
private class Resolver : BooleanPredicateResolver
|
||||
{
|
||||
|
@ -27,36 +27,50 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
protected override async Task<bool> CheckAsync()
|
||||
{
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartVM;
|
||||
if (Input.Arguments.Count == 0)
|
||||
{
|
||||
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
|
||||
}
|
||||
|
||||
// Repair action name is required.
|
||||
string repairAction = (string)Input.Arguments[0].Value.GetObjectValue();
|
||||
|
||||
/*
|
||||
public const string SystemReboot = "System.Reboot";
|
||||
public const string SystemReimageOS = "System.ReimageOS ";
|
||||
public const string SystemFullReimage = "System.FullReimage";
|
||||
public const string SystemHostReboot = "System.Azure.HostReboot";
|
||||
public const string SystemHostRepaveData = "System.Azure.HostRepaveData";
|
||||
*/
|
||||
|
||||
switch (repairAction)
|
||||
{
|
||||
case RepairConstants.SystemReboot:
|
||||
case RepairConstants.SystemHostReboot:
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RebootMachine;
|
||||
break;
|
||||
|
||||
case RepairConstants.SystemReimageOS:
|
||||
case RepairConstants.SystemFullReimage:
|
||||
case RepairConstants.SystemHostRepaveData:
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.ReimageOS;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unrecognized repair action name: {repairAction}. Repair actions are case sensitive.");
|
||||
}
|
||||
|
||||
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
|
||||
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
|
||||
// Block attempts to create duplicate repair tasks.
|
||||
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
|
||||
var repairTaskEngine = new RepairTaskEngine();
|
||||
var isRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsFHRepairTaskRunningAsync(
|
||||
$"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
|
||||
RepairData,
|
||||
RepairTaskManager.Token);
|
||||
|
||||
if (isRepairAlreadyInProgress)
|
||||
{
|
||||
string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress. Will not attempt repair at this time.";
|
||||
|
||||
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"RestartVMPredicateType::{RepairData.RepairPolicy.RepairId}",
|
||||
message,
|
||||
RepairTaskManager.Token);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
long maxConcurrentRepairs = 0;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
var typeString = Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue().GetType().Name;
|
||||
|
||||
switch (typeString)
|
||||
{
|
||||
case "TimeSpan":
|
||||
|
@ -67,13 +81,39 @@ namespace FabricHealer.Repair.Guan
|
|||
RepairData.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue();
|
||||
break;
|
||||
|
||||
case "Int64":
|
||||
maxConcurrentRepairs = (long)Input.Arguments[i].Value.GetObjectValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetObjectValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
var isRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsRepairInProgressAsync(
|
||||
$"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
|
||||
RepairData,
|
||||
RepairTaskManager.Token,
|
||||
maxConcurrentRepairs);
|
||||
|
||||
if (isRepairAlreadyInProgress)
|
||||
{
|
||||
string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress" +
|
||||
$"{(maxConcurrentRepairs > 0 ? " or max number of concurrent machine repairs has been reached" : "")}. " +
|
||||
$"Will not attempt repair at this time.";
|
||||
|
||||
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"RestartMachinePredicateType::{RepairData.RepairPolicy.RepairId}",
|
||||
message,
|
||||
RepairTaskManager.Token);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskManager.ExecuteRMInfrastructureRepairTask(
|
||||
() => RepairTaskManager.ScheduleInfrastructureRepairTask(
|
||||
RepairData,
|
||||
RepairTaskManager.Token),
|
||||
RepairTaskManager.Token);
|
||||
|
@ -81,15 +121,15 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
}
|
||||
|
||||
public static RestartMachinePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
|
||||
public static ScheduleMachineRepairPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
|
||||
{
|
||||
RepairTaskManager = repairTaskManager;
|
||||
RepairData = repairData;
|
||||
|
||||
return Instance ??= new RestartMachinePredicateType(name);
|
||||
return Instance ??= new ScheduleMachineRepairPredicateType(name);
|
||||
}
|
||||
|
||||
private RestartMachinePredicateType(string name)
|
||||
private ScheduleMachineRepairPredicateType(string name)
|
||||
: base(name, true, 0)
|
||||
{
|
||||
|
|
@ -22,6 +22,8 @@ namespace FabricHealer.Repair
|
|||
RestartFabricNode,
|
||||
RestartProcess,
|
||||
RestartReplica,
|
||||
RestartVM,
|
||||
RebootMachine,
|
||||
ReimageDisk,
|
||||
ReimageOS
|
||||
}
|
||||
}
|
|
@ -29,7 +29,7 @@ namespace FabricHealer.Repair
|
|||
public const string EnableRollingServiceRestartsParameter = "EnableRollingServiceRestarts";
|
||||
public const string AppInsightsInstrumentationKeyParameter = "AppInsightsInstrumentationKey";
|
||||
public const string EnableETW = "EnableETW";
|
||||
public const string HealthCheckLoopSleepTimeSeconds = "HealthCheckLoopSleepTimeSeconds";
|
||||
public const string HealthCheckIntervalInSeconds = "HealthCheckIntervalInSeconds";
|
||||
public const string LocalLogPathParameter = "LocalLogPath";
|
||||
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
|
||||
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
|
||||
|
@ -72,7 +72,15 @@ namespace FabricHealer.Repair
|
|||
public const string RestartFabricNode = "RestartFabricNode";
|
||||
public const string RestartFabricSystemProcess = "RestartFabricSystemProcess";
|
||||
public const string RestartReplica = "RestartReplica";
|
||||
public const string RestartVM = "RestartVM";
|
||||
public const string ScheduleMachineRepair = "ScheduleMachineRepair";
|
||||
public const string ScheduleDiskReimage = "ScheduleDiskReimage";
|
||||
|
||||
// Infra repair names (RM "commands").
|
||||
public const string SystemReboot = "System.Reboot";
|
||||
// RM repair action name for an OS reimage. Rule-supplied action names are matched against this value exactly (case sensitive), so it must not carry stray whitespace.
public const string SystemReimageOS = "System.ReimageOS";
|
||||
public const string SystemFullReimage = "System.FullReimage";
|
||||
public const string SystemHostReboot = "System.Azure.HostReboot";
|
||||
public const string SystemHostRepaveData = "System.Azure.HostRepaveData";
|
||||
|
||||
// Helper Predicates.
|
||||
public const string CheckInsideRunInterval = "CheckInsideRunInterval";
|
||||
|
|
|
@ -956,42 +956,6 @@ namespace FabricHealer.Repair
|
|||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a machine name string, given a fabric node name.
|
||||
/// </summary>
|
||||
/// <param name="nodeName">Fabric node name</param>
|
||||
/// <param name="cancellationToken"></param>
|
||||
internal async Task<string> GetMachineHostNameFromFabricNodeNameAsync(string nodeName, CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
|
||||
nodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
Node targetNode = nodes.Count > 0 ? nodes[0] : null;
|
||||
|
||||
if (targetNode == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
string ipOrDnsName = targetNode.IpAddressOrFQDN;
|
||||
var hostEntry = await Dns.GetHostEntryAsync(ipOrDnsName);
|
||||
var machineName = hostEntry.HostName;
|
||||
|
||||
return machineName;
|
||||
}
|
||||
catch (Exception e) when (e is ArgumentException|| e is SocketException|| e is OperationCanceledException || e is TimeoutException)
|
||||
{
|
||||
FabricHealerManager.RepairLogger.LogWarning(
|
||||
$"Unable to determine machine host name from Fabric node name {nodeName}:{Environment.NewLine}{e}");
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Clears existing health warnings for target repair entity. This should only be called after a repair operation succeeds.
|
||||
/// </summary>
|
||||
|
|
|
@ -32,6 +32,14 @@ namespace FabricHealer.Repair
|
|||
get; set;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The name of the infrastructure repair to provide to RM that IS will execute.
|
||||
/// </summary>
|
||||
public string InfrastructureRepairName
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maximum amount of time to check if health state of repaired target entity is Ok.
|
||||
/// </summary>
|
||||
|
|
|
@ -4,7 +4,9 @@
|
|||
// ------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Fabric.Description;
|
||||
using System.Fabric.Health;
|
||||
using System.Fabric.Query;
|
||||
using System.Fabric.Repair;
|
||||
using System.Linq;
|
||||
|
@ -12,13 +14,12 @@ using System.Threading;
|
|||
using System.Threading.Tasks;
|
||||
using FabricHealer.Utilities;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using Newtonsoft.Json.Linq;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
public sealed class RepairTaskEngine
|
||||
{
|
||||
public const string HostVMReboot = "System.Reboot";
|
||||
public const string HostMachineReboot = "System.Reboot";
|
||||
public const string FHTaskIdPrefix = "FH";
|
||||
public const string AzureTaskIdPrefix = "Azure";
|
||||
public const string FabricHealerExecutorName = "FabricHealer";
|
||||
|
@ -105,8 +106,14 @@ namespace FabricHealer.Repair
|
|||
return repairTasks;
|
||||
}
|
||||
|
||||
// This allows InfrastructureService to schedule and run reboot im concert with VMSS over MR.
|
||||
public async Task<RepairTask> CreateVmRebootISRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
|
||||
/// <summary>
|
||||
/// This schedules a repair task where SF's InfrastructureService will reboot the target machine safely.
|
||||
/// </summary>
|
||||
/// <param name="repairData"></param>
|
||||
/// <param name="executorName"></param>
|
||||
/// <param name="cancellationToken"></param>
|
||||
/// <returns></returns>
|
||||
public async Task<RepairTask> CreateMachineRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
|
||||
{
|
||||
// This constraint (MaxResults) is used just to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
|
||||
var nodeQueryDesc = new NodeQueryDescription
|
||||
|
@ -114,6 +121,13 @@ namespace FabricHealer.Repair
|
|||
MaxResults = 3,
|
||||
};
|
||||
|
||||
string repairActionName = HostMachineReboot;
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(repairData.RepairPolicy.InfrastructureRepairName))
|
||||
{
|
||||
repairActionName = repairData.RepairPolicy.InfrastructureRepairName;
|
||||
}
|
||||
|
||||
NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
|
||||
nodeQueryDesc,
|
||||
|
@ -126,14 +140,14 @@ namespace FabricHealer.Repair
|
|||
return null;
|
||||
}
|
||||
|
||||
string taskId = $"{FHTaskIdPrefix}/{HostVMReboot}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
|
||||
bool doHealthChecks = !SupportedErrorCodes.GetCodeNameFromErrorCode(repairData.Code).Contains("Error");
|
||||
string taskId = $"{FHTaskIdPrefix}/{repairActionName}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
|
||||
bool doHealthChecks = repairData.HealthState != HealthState.Error;
|
||||
|
||||
// Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
|
||||
// So, do not do health checks if customer configures FO to emit Error health reports.
|
||||
// In general, FO should *not* be configured to emit Error events. See FO documentation.
|
||||
|
||||
var repairTask = new ClusterRepairTask(taskId, HostVMReboot)
|
||||
var repairTask = new ClusterRepairTask(taskId, repairActionName)
|
||||
{
|
||||
Target = new NodeRepairTargetDescription(repairData.NodeName),
|
||||
Description = $"{repairData.RepairPolicy.RepairId}",
|
||||
|
@ -146,25 +160,40 @@ namespace FabricHealer.Repair
|
|||
return repairTask;
|
||||
}
|
||||
|
||||
public async Task<bool> IsFHRepairTaskRunningAsync(string executorName, TelemetryData repairdData, CancellationToken token)
|
||||
/// <summary>
|
||||
/// Determines if a repair task is already in flight or if the max number of concurrent repairs has been reached for the target using the information specified in repairData instance.
|
||||
/// </summary>
|
||||
/// <param name="executorName">Name of the repair executor.</param>
|
||||
/// <param name="repairData">TelemetryData instance.</param>
|
||||
/// <param name="token">CancellationToken.</param>
|
||||
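/// <param name="maxConcurrentRepairs">Optional: Maximum number of concurrent machine-level repairs allowed for the InfrastructureService executor of the node type specified in repairData. Default is 0, which disables this limit check; a duplicate repair for the same target is still reported as in progress.</param>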
|
||||
/// <returns></returns>
|
||||
public async Task<bool> IsRepairInProgressAsync(string executorName, TelemetryData repairData, CancellationToken token, long maxConcurrentRepairs = 0)
|
||||
{
|
||||
// All RepairTasks are prefixed with FH, regardless of repair target type (VM, fabric node, system service process, codepackage, replica).
|
||||
// For VM-level repair, RM will create a new task for IS that replaces FH executor data with IS job info, but the original FH repair task will
|
||||
// remain in an active state which will block any duplicate scheduling by another FH instance.
|
||||
var currentFHRepairTasksInProgress =
|
||||
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
|
||||
FHTaskIdPrefix,
|
||||
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
|
||||
executorName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token);
|
||||
// All RepairTasks are prefixed with FH, regardless of repair target type (VM/Machine, Fabric node, system service process, code package, replica).
|
||||
// For VM-level repairs, RM will create a new task for IS that replaces FH executor data with IS job info.
|
||||
RepairTaskList repairTasksInProgress =
|
||||
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
|
||||
FHTaskIdPrefix,
|
||||
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
|
||||
executorName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token);
|
||||
|
||||
if (currentFHRepairTasksInProgress == null || currentFHRepairTasksInProgress.Count == 0)
|
||||
if (repairTasksInProgress == null || repairTasksInProgress.Count == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
foreach (var repair in currentFHRepairTasksInProgress)
|
||||
// Throttling machine level repairs.
|
||||
if (executorName == $"{InfrastructureServiceName}/{repairData.NodeType}" &&
|
||||
maxConcurrentRepairs > 0 &&
|
||||
repairTasksInProgress.Count(r => r.Executor == executorName) >= maxConcurrentRepairs)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
foreach (var repair in repairTasksInProgress)
|
||||
{
|
||||
// This check is to see if there are any FH-as-executor repairs in flight.
|
||||
if (executorName == FabricHealerExecutorName)
|
||||
|
@ -179,19 +208,17 @@ namespace FabricHealer.Repair
|
|||
return false;
|
||||
}
|
||||
|
||||
// The node repair check ensures that only one node-level repair can take place in a cluster (no concurrent node restarts), by default.
|
||||
// FH is conservative, by design.
|
||||
if (repairdData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId ||
|
||||
executorData.RepairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
|
||||
// This check ensures that only one repair can be scheduled at a time for the same target.
|
||||
if (repairData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else if (repair.Executor == $"{InfrastructureServiceName}/{repairdData.NodeType}")
|
||||
else if (repair.Executor == $"{InfrastructureServiceName}/{repairData.NodeType}")
|
||||
{
|
||||
// This would block scheduling any VM level operation (reboot) already in flight.
|
||||
// For IS repairs, unique id is stored in the repair task's Description property.
|
||||
if (repair.Description == repairdData.RepairPolicy.RepairId)
|
||||
// This would block rescheduling any VM level operation (reboot) that is already in flight.
|
||||
// NOTE: For Infrastructure-level repairs (IS is executor), unique id is stored in the repair task's Description property.
|
||||
if (repair.Description == repairData.RepairPolicy.RepairId)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -151,7 +151,7 @@ namespace FabricHealer.Repair
|
|||
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, this, repairExecutorData, repairTaskEngine, repairData));
|
||||
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, this, repairData));
|
||||
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, this, repairData));
|
||||
functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, this, repairData));
|
||||
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, this, repairData));
|
||||
|
||||
// Parse rules.
|
||||
Module module = Module.Parse("external", repairRules, functorTable);
|
||||
|
@ -181,7 +181,7 @@ namespace FabricHealer.Repair
|
|||
compoundTerm.AddArgument(new Constant(repairData.NodeType), RepairConstants.NodeType);
|
||||
compoundTerm.AddArgument(new Constant(repairData.ObserverName), RepairConstants.ObserverName);
|
||||
compoundTerm.AddArgument(new Constant(repairData.OS), RepairConstants.OS);
|
||||
compoundTerm.AddArgument(new Constant(Enum.GetName(typeof(ServiceKind), repairData.ServiceKind)), RepairConstants.ServiceKind);
|
||||
compoundTerm.AddArgument(new Constant(repairData.ServiceKind), RepairConstants.ServiceKind);
|
||||
compoundTerm.AddArgument(new Constant(repairData.ServiceName), RepairConstants.ServiceName);
|
||||
compoundTerm.AddArgument(new Constant(repairData.ProcessId), RepairConstants.ProcessId);
|
||||
compoundTerm.AddArgument(new Constant(repairData.ProcessName), RepairConstants.ProcessName);
|
||||
|
@ -198,7 +198,7 @@ namespace FabricHealer.Repair
|
|||
// The repair will be executed by SF Infrastructure service, not FH. This is the case for all
|
||||
// VM-level repairs. IS will communicate with VMSS (for example) to guarantee safe repairs in MR-enabled
|
||||
// clusters. RM, as usual, will orchestrate the repair cycle.
|
||||
public async Task<bool> ExecuteRMInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
|
||||
public async Task<bool> ScheduleInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
|
||||
{
|
||||
var infraServices = await FabricRepairTasks.GetInfrastructureServiceInstancesAsync(cancellationToken);
|
||||
var arrServices = infraServices as Service[] ?? infraServices.ToArray();
|
||||
|
@ -207,7 +207,7 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask",
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
"Infrastructure Service not found. Will not attemp VM repair.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
|
@ -229,8 +229,8 @@ namespace FabricHealer.Repair
|
|||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteRMInfrastructureRepairTask",
|
||||
$"IS RepairTask {RepairTaskEngine.HostVMReboot} " +
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
$"IS RepairTask {RepairTaskEngine.HostMachineReboot} " +
|
||||
$"Executor set to {executorName}.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
|
@ -243,7 +243,7 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask",
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
"Unable to find InfrastructureService service instance." +
|
||||
"Exiting RepairTaskManager.ScheduleFHRepairTaskAsync.",
|
||||
cancellationToken,
|
||||
|
@ -255,19 +255,26 @@ namespace FabricHealer.Repair
|
|||
|
||||
// Make sure there is not already a repair job executing reboot repair for target node.
|
||||
var isRepairAlreadyInProgress =
|
||||
await repairTaskEngine.IsFHRepairTaskRunningAsync(
|
||||
executorName,
|
||||
repairData,
|
||||
cancellationToken);
|
||||
await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, cancellationToken);
|
||||
|
||||
if (isRepairAlreadyInProgress)
|
||||
{
|
||||
string machineName = repairData.NodeName;
|
||||
|
||||
try
|
||||
{
|
||||
machineName = Environment.MachineName;
|
||||
}
|
||||
catch (InvalidOperationException)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask",
|
||||
"Virtual machine repair task for VM " +
|
||||
$"{await RepairExec.GetMachineHostNameFromFabricNodeNameAsync(repairData.NodeName, cancellationToken)} " +
|
||||
"is already in progress. Will not schedule another VM repair at this time.",
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
$"Virtual machine repair task for {machineName} is already in progress " +
|
||||
"or max number of concurrent machine repairs has been reached. Will not schedule another machine repair at this time.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
@ -282,7 +289,7 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask",
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
"Unable to create Repair Task.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
|
@ -293,7 +300,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask",
|
||||
"ScheduleInfrastructureRepairTask",
|
||||
$"Successfully created Repair Task {repairTask.TaskId}",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
|
@ -320,7 +327,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask::Completed",
|
||||
"InfrastructureRepairTask::Completed",
|
||||
$"Successfully completed repair {repairData.RepairPolicy.RepairId}",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
|
@ -332,7 +339,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"ExecuteRMInfrastructureRepairTask::Timeout",
|
||||
"ScheduleInfrastructureRepairTask::Timeout",
|
||||
$"Max wait time of {MaxWaitTimeForInfraRepairTaskCompleted} has elapsed for repair " +
|
||||
$"{repairData.RepairPolicy.RepairId}.",
|
||||
cancellationToken,
|
||||
|
@ -441,7 +448,7 @@ namespace FabricHealer.Repair
|
|||
await Task.Delay(new Random().Next(100, 1500));
|
||||
|
||||
// Has the repair already been scheduled by a different FH instance?
|
||||
if (await repairTaskEngine.IsFHRepairTaskRunningAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
|
||||
if (await repairTaskEngine.IsRepairInProgressAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
@ -465,7 +472,7 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
if (repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode &&
|
||||
repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartVM)
|
||||
repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RebootMachine)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -700,7 +707,7 @@ namespace FabricHealer.Repair
|
|||
success = false;
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
|
@ -745,7 +752,7 @@ namespace FabricHealer.Repair
|
|||
success = false;
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
|
||||
$"Stateful replica {repairData.ReplicaId} not found on partition " +
|
||||
$"{repairData.PartitionId}.",
|
||||
cancellationToken,
|
||||
|
@ -941,11 +948,11 @@ namespace FabricHealer.Repair
|
|||
// This is done by setting the repair task to Restoring State with ResultStatus Succeeded. RM will then move forward to Restoring
|
||||
// (and do any restoring health checks if specified), then Complete the repair job.
|
||||
_ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
|
||||
repairTask,
|
||||
Context,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
() => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
|
||||
repairTask,
|
||||
Context,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
// Let RM catch up.
|
||||
await Task.Delay(TimeSpan.FromSeconds(3), cancellationToken);
|
||||
|
|
|
@ -23,7 +23,7 @@ namespace FabricHealer.Utilities
|
|||
private set;
|
||||
}
|
||||
|
||||
public int ExecutionLoopSleepSeconds
|
||||
public int HealthCheckIntervalInSeconds
|
||||
{
|
||||
get;
|
||||
private set;
|
||||
|
@ -169,9 +169,9 @@ namespace FabricHealer.Utilities
|
|||
|
||||
LocalLogPathParameter = GetConfigSettingValue( RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.LocalLogPathParameter);
|
||||
|
||||
if (int.TryParse( GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckLoopSleepTimeSeconds), out int execFrequency))
|
||||
if (int.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckIntervalInSeconds), out int execFrequency))
|
||||
{
|
||||
ExecutionLoopSleepSeconds = execFrequency;
|
||||
HealthCheckIntervalInSeconds = execFrequency;
|
||||
}
|
||||
|
||||
// Rolling service restarts.
|
||||
|
|
|
@ -10,7 +10,6 @@ using System.Fabric.Health;
|
|||
using System;
|
||||
using FabricHealer.Repair;
|
||||
using System.Diagnostics.Tracing;
|
||||
using System.Fabric.Query;
|
||||
|
||||
namespace FabricHealer.Utilities.Telemetry
|
||||
{
|
||||
|
@ -96,8 +95,7 @@ namespace FabricHealer.Utilities.Telemetry
|
|||
get; set;
|
||||
}
|
||||
|
||||
[EventField]
|
||||
public ServiceKind ServiceKind
|
||||
public string ServiceKind
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ using System.Fabric.Health;
|
|||
using System.Runtime.InteropServices;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml.Linq;
|
||||
|
||||
namespace FabricHealer.Utilities.Telemetry
|
||||
{
|
||||
|
@ -146,17 +147,22 @@ namespace FabricHealer.Utilities.Telemetry
|
|||
// ETW.
|
||||
if (FabricHealerManager.ConfigSettings.EtwEnabled)
|
||||
{
|
||||
if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
|
||||
if (JsonSerializationUtility.TrySerialize(telemData, out string tData))
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, telemData);
|
||||
}
|
||||
else if (healthState == HealthState.Warning)
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, telemData);
|
||||
}
|
||||
else
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, telemData);
|
||||
var data = new { tData };
|
||||
|
||||
if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, data);
|
||||
}
|
||||
else if (healthState == HealthState.Warning)
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, data);
|
||||
}
|
||||
else
|
||||
{
|
||||
ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, data);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<!-- FabricHealerManager Settings -->
|
||||
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
|
||||
<Parameter Name="EnableETW" DefaultValue="true" />
|
||||
<Parameter Name="MonitorLoopSleepSeconds" DefaultValue="5" />
|
||||
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
|
||||
<Parameter Name="EnableTelemetry" DefaultValue="true" />
|
||||
<!-- Set VerboseLoggingEnabled to true if you want detailed local logging and telemetry/ETW with repair data.
|
||||
This data will live in a folder named RepairData, which will be created in your LocalLogPath directory.
|
||||
|
@ -39,7 +39,7 @@
|
|||
<Settings>
|
||||
<!-- FabricHealerManager -->
|
||||
<Section Name="RepairManagerConfiguration">
|
||||
<Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="[MonitorLoopSleepSeconds]" />
|
||||
<Parameter Name="HealthCheckIntervalInSeconds" Value="[HealthCheckIntervalInSeconds]" />
|
||||
<Parameter Name="EnableAutoMitigation" Value="[AutoMitigationEnabled]" />
|
||||
<Parameter Name="EnableETW" Value="[EnableETW]" />
|
||||
<Parameter Name="EnableTelemetry" Value="[EnableTelemetry]" />
|
||||
|
|
Загрузка…
Ссылка в новой задаче