This commit is contained in:
Charles Torre 2022-08-30 18:55:44 -07:00
Родитель 3caffa95d2
Коммит 698bc2b3cc
19 изменённых файлов: 527 добавлений и 220 удалений

Просмотреть файл

@ -285,7 +285,7 @@ namespace FHTest
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairTaskManager, executorData, repairTaskEngine, repairData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairTaskManager, repairData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairTaskManager, repairData));
functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, repairTaskManager, repairData));
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairTaskManager, repairData));
// Parse rules
Module module = Module.Parse("external", repairRules, functorTable);

Просмотреть файл

@ -356,7 +356,7 @@ namespace FabricHealer
await Task.Delay(
TimeSpan.FromSeconds(
ConfigSettings.ExecutionLoopSleepSeconds > 0 ? ConfigSettings.ExecutionLoopSleepSeconds : 10), Token);
ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
}
RepairLogger.LogInfo("Shutdown signaled. Stopping.");
@ -1330,7 +1330,7 @@ namespace FabricHealer
repairTaskManager.DetectedHealthEvents.Add(evt);
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync((TelemetryData)repairData, repairRules, Token);
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
}
}
@ -1357,11 +1357,11 @@ namespace FabricHealer
Token.ThrowIfCancellationRequested();
var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(node.NodeName);
var observerHealthEvents =
var nodeHealthEvents =
nodeHealth.HealthEvents.Where(
s => (s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error));
foreach (var evt in observerHealthEvents)
foreach (var evt in nodeHealthEvents)
{
Token.ThrowIfCancellationRequested();
@ -1373,7 +1373,11 @@ namespace FabricHealer
continue;
}
// TODO: Remove this hard requirement (TelemetryData-only). FH can just read health event data and learn what the problem is if FO/FH Proxy did not generate the health event. This is important
// for cases where customers are not using FO or FHProxy, but want to use FH.
// Check to see if the event Description is a serialized instance of TelemetryData, which would mean the health report was generated in a supported way.
// In the case where there is no TelemetryData involved, create a new TelemetryData and set it with the minimum number of facts required to accomplish the goal.
// This is trivial for the Machine repair case, but will get more complicated for other entities. That said, it is very doable.
if (!JsonSerializationUtility.TryDeserialize(evt.HealthInformation.Description, out TelemetryData repairData))
{
continue;
@ -1423,7 +1427,7 @@ namespace FabricHealer
continue;
}
// Get repair rules for supported source Observer.
// Get repair rules for supplied facts (TelemetryData).
var repairRules = GetRepairRulesForTelemetryData(repairData);
if (repairRules == null || repairRules.Count == 0)

Просмотреть файл

@ -79,8 +79,10 @@ Mitigate(AppName="fabric:/PortEater42", MetricName="EphemeralPorts", MetricValue
## and where at least 3 health events identifying this problem were produced in the last 15 minutes. This is useful to ensure you don't mitigate a transient (short-lived)
## problem, since transient problems will self-correct.
Mitigate(AppName="fabric:/CpuStress", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 15,
GetHealthEventHistory(?HealthEventCount, 00:15:00),
## How long has it been in unhealthy state and how long has it not been in unhealthy state (healthy state).
Mitigate(ServiceName="fabric:/CpuStress/CpuStressor", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 85,
GetHealthEventHistory(?HealthEventCount, 00:30:00),
?HealthEventCount >= 3,
TimeScopedRestartCodePackage(1, 00:15:00).
@ -162,7 +164,7 @@ Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPortsPercent", MetricVa
Mitigate(AppName=?AppName, MetricName="Threads", MetricValue=?MetricValue) :- ?AppName != "fabric:/FabricObserver", ?MetricValue >= 400, TimeScopedRestartCodePackage(5, 05:00:00).
## Threads - Any app service. 5 repairs within 5 hour window. This means if FO warns on Thread count, then heal. There are no conditional checks (on MetricValue) to take place.
## Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
## Generic rule for restarting any service in Warning or Error. This means any service that is in Error or Warning state and
## also specified in the serialized TelemetryData instance that forms the Description of the related Service level Health Event will be restarted.
@ -186,5 +188,5 @@ TimeScopedRestartReplica(?count, ?time) :- GetRepairHistory(?repairCount, ?time)
## See below for an example using both optional arguments. Named arguments are just used for clarity below; you could also just specify RestartCodePackage(true, 00:10:00), for example.
## Note: It's up to you to decide if you want RepairManager to conduct pre and post health checks.
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00).
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:10:00).
TimeScopedRestartReplica() :- RestartReplica(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:01:00).

Просмотреть файл

@ -1,6 +1,6 @@
## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today.
## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today for VMSS-managed clusters.
## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, Renamed for brevity by FH.
## Applicable Named Arguments for Mitigate. Corresponding facts are supplied by FabricObserver or FabricHealerProxy, Renamed for brevity by FH.
## | Argument Name | Definition |
## |---------------------------|----------------------------------------------------------------------------------------------|
## | NodeName | Name of the node |
@ -22,37 +22,75 @@
## | FileHandles (Linux) |
## | FileHandlesPercent (Linux)|
## Supported repair action names.
## | Name |
## |------------------------------|
## | System.Reboot |
## | System.ReimageOS |
## | System.FullReimage |
## | System.Azure.HostReboot |
## | System.Azure.HostRepaveData |
## First, check if we are inside run interval. If so, then cut (!).
## This is commented out by default. Just uncomment and set the global run interval for VM level repairs to suit your needs.
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
## TimeScopedRestartVM is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window.
## TimeScopedRebootMachine is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window.
## If Completed VM Repair count is less than supplied value, then run RestartVM mitigation.
TimeScopedRestartVM(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
TimeScopedRebootMachine(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
?repairCount < ?count,
RestartVM().
ScheduleMachineRepair("System.Reboot").
## Percent Memory in Use (of total physical).
Mitigate(MetricName="MemoryPercent", MetricValue=?MetricValue) :- ?MetricValue >= 95,
Mitigate(MetricName=MemoryPercent, MetricValue=?MetricValue) :- ?MetricValue >= 95,
GetHealthEventHistory(?HealthEventCount, 00:15:00),
?HealthEventCount >= 3,
TimeScopedRestartVM(4, 08:00:00).
TimeScopedRebootMachine(4, 08:00:00).
## File Handles/FDs. Linux-only.
## Percent Allocated, System-wide.
Mitigate(MetricName="FileHandlesPercent", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 95,
Mitigate(MetricName=FileHandlesPercent, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 95,
GetHealthEventHistory(?HealthEventCount, 00:15:00),
?HealthEventCount >= 3,
TimeScopedRestartVM(2, 08:00:00).
TimeScopedRebootMachine(2, 08:00:00).
## Total Allocated, System-wide.
Mitigate(MetricName="FileHandles", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 1000000,
Mitigate(MetricName=FileHandles, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 1000000,
GetHealthEventHistory(?HealthEventCount, 00:15:00),
?HealthEventCount >= 3,
TimeScopedRestartVM(2, 08:00:00).
TimeScopedRebootMachine(2, 08:00:00).
## Reboot/Reimage. The rules below satisfy the following requirements:
## Repair throttling (MaxOutstandingRepairTasks), time-in-Error checking (pre-probation - GetCurrentEntityHealthStateDuration),
## cap repair job attempts within time intervals (how many times to try a repair before trying something else, for example, escalation..),
## time-to-wait after repair completes (post-probation - ProbationToHealthyWaitDurationPostRepair),
## attempt scheduling disk reimage if rebooting didn't work (user-configured repair escalation).
## Logic workflow. Note: Since we are in this rule file, the context is already known to be Machine. FH has already determined what the facts are.
## If Health State is Error, then proceed. This constraint is in the head of the rule for convenience and readability.
## How long has the node been in Error state (probation)?
## If at least 2 hours, then continue. Else, stop processing rules (end cut (!)).
## How many times has this machine repair been run in the last 4 hours? If less than twice, continue. This is added logic, does not exist in RPE. Like all of this, customer owns the configuration.
## Only schedule the repair if there are less than 2 machine-level repairs currently in flight in the cluster.
## "Currently in flight" means RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing.
## Wait 30 mins for green node after IS completes execution and RM has restored node (no health checks for Error state, by default. Only safety checks.)
## If any of the sub-rules do not succeed, then Guan will go to the next rule, which in this case will attempt to schedule a Disk reimage repair job with the same logical constraints/workflow.
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetCurrentEntityHealthStateDuration(?HealthStateDuration, Entity=Machine, Target=?NodeName, State=Error),
?HealthStateDuration <= 02:00:00, !.
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
?repairCount < 2,
ScheduleMachineRepair("System.Azure.HostReboot", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
## Data disk reimage. Not supported on VMSS-managed virtual machines.
Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
?repairCount < 2,
ScheduleMachineRepair("System.Azure.HostRepaveData", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
## Human intervention is required (Triage)

Просмотреть файл

@ -1,57 +1,110 @@
<?xml version="1.0" encoding="utf-8" ?>
<Settings xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Section Name="RepairManagerConfiguration">
<!-- ***Overridable Parameters***
These must be set in ApplicationManifest.xml -->
<Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="" MustOverride="true" />
<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableETW" Value="" MustOverride="true" />
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
<Section Name="RepairManagerConfiguration">
<!-- ***Overridable Parameters*** These must be set in ApplicationManifest.xml -->
<!-- Interval in seconds for how often FabricHealer wakes up and scans health states to schedule repairs. -->
<Parameter Name="HealthCheckIntervalInSeconds" Value="" MustOverride="true" />
<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableETW" Value="" MustOverride="true" />
<!-- Big Red Button: You can turn FabricHealer on and off with a versionless parameter-only application upgrade. -->
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
<!-- Default timeout for async SF API calls. -->
<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
<Parameter Name="AppInsightsInstrumentationKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsSharedKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
</Section>
<!-- Default timeout for async SF API calls. -->
<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
<Parameter Name="AppInsightsInstrumentationKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsSharedKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
</Section>
<!-- Repair Policies - Overridable Parameters. Must be set in ApplicationManifest.xml. -->
<Section Name="AppRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" />
</Section>
<Section Name="DiskRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="FabricNodeRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="ReplicaRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="SystemServiceRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<!-- Repair Policies.Overridable Parameters. Must be set in ApplicationManifest.xml. -->
<Section Name="AppRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" />
</Section>
<Section Name="DiskRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="FabricNodeRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="ReplicaRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="SystemServiceRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<Section Name="MachineRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<!-- Machine Repair. -->
<Section Name="MachineRepairPolicy">
<!-- FabricHealer is not allowed to schedule a machine repair more often than this interval.
This is to prevent the FabricHealer from scheduling too many machine repairs in a short time. -->
<Parameter Name="ActionSchedulingIntervalInSeconds" Value="600" />
<Parameter Name="ProbationToFailingWaitDurationInSeconds" Value="7200" />
<Parameter Name="ProbationToHealthyWaitDurationInSeconds" Value="1800" />
<Parameter Name="MinimumHealthyDurationInSeconds" Value="300" />
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
</Section>
<!-- Supported Machine level repair actions. -->
<!-- RM Repair action strings (built into RM).
========================================================================
Alias VM (default) Host
========================================================================
System.Reboot System.Azure.Reboot System.Azure.HostReboot
System.ReimageOS System.Azure.ReimageOS
System.FullReimage System.Azure.RepaveData System.Azure.HostRepaveData
========================================================================-->
<Section Name="NodeRepairActionList">
<!-- This is the name of the repair that will be provided to RM by FH when scheduling the repair. -->
<Parameter Name="System.Azure.HostReboot" Value="RepairPolicyHostReboot" />
<!-- This is not supported for VMSS-based machine clusters. -->
<!--<Parameter Name="System.Azure.HostRepaveData" Value="RepairPolicyHostRepaveData" />-->
<Parameter Name="ManualTriageNeeded" Value="RepairPolicyTriage" />
</Section>
<Section Name="RepairPolicyHostReboot">
<!-- Required - How many repair tasks of this kind can be scheduled concurrently. -->
<Parameter Name="MaxOutstandingRepairTasks" Value="2" />
<!-- Required - Probationary period to failing after this repair -->
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
<!-- Required - Probationary period to healthy after this repair -->
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
<!-- Required - Minimum time of error state before this repair is applied -->
<Parameter Name="PolicyActionTimeInSeconds" Value="7200" />
<!-- This is optional and defaults to true -->
<Parameter Name="IsEnabled" Value="true" />
</Section>
<!-- This is not supported for VMSS-based machine clusters. -->
<Section Name="RepairPolicyHostRepaveData">
<Parameter Name="MaxOutstandingRepairTasks" Value="2" />
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
<Parameter Name="PolicyActionTimeInSeconds" Value="14400" />
<Parameter Name="IsEnabled" Value="false" />
</Section>
<Section Name="RepairPolicyTriage">
<Parameter Name="MaxOutstandingRepairTasks" Value="2" MustOverride="false" />
<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
<Parameter Name="PolicyActionTimeInSeconds" Value="9000" />
<Parameter Name="IsEnabled" Value="true" />
</Section>
<!-- End Machine Repair -->
</Settings>

Просмотреть файл

@ -6,6 +6,8 @@
using System;
using System.Collections.Generic;
using System.Fabric;
using System.Fabric.Description;
using System.Fabric.Health;
using System.Fabric.Query;
using System.Fabric.Repair;
using System.Linq;
@ -154,19 +156,20 @@ namespace FabricHealer.Repair
await Task.Delay(new Random().Next(100, 1500));
var isRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(executorName, repairData, token);
bool isRepairInProgress = await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, token);
if (isRepairAlreadyInProgress)
if (isRepairInProgress)
{
return null;
}
switch (repairAction)
{
case RepairActionType.RestartVM:
case RepairActionType.RebootMachine:
case RepairActionType.ReimageDisk:
case RepairActionType.ReimageOS:
repairTask = await repairTaskEngine.CreateVmRebootISRepairTaskAsync(repairData, executorName, token);
repairTask = await repairTaskEngine.CreateMachineRepairTaskAsync(repairData, executorName, token);
break;
case RepairActionType.DeleteFiles:
@ -209,7 +212,7 @@ namespace FabricHealer.Repair
try
{
var isRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(repairTask.Executor, repairData, token);
await repairTaskEngine.IsRepairInProgressAsync(repairTask.Executor, repairData, token);
if (!isRepairAlreadyInProgress)
{
@ -395,5 +398,69 @@ namespace FabricHealer.Repair
return count;
}
/// <summary>
/// Computes the time elapsed since the first non-expired health event, filtered to the requested
/// health state, was generated by its source for the given entity.
/// Only <c>EntityType.Machine</c> and <c>EntityType.Node</c> are evaluated; every other entity type
/// yields <see cref="TimeSpan.MinValue"/>.
/// </summary>
/// <param name="entityType">Kind of entity to inspect.</param>
/// <param name="entityFilter">Node name handed to the node health query.</param>
/// <param name="state">Health state used to filter the entity's health events.</param>
/// <param name="token">Cancellation token passed to the health query.</param>
/// <returns>
/// UtcNow minus the SourceUtcTimestamp of the first non-expired matching event, or
/// <see cref="TimeSpan.MinValue"/> when the entity type is not handled, the query returns nothing,
/// or every returned event is expired.
/// </returns>
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(EntityType entityType, string entityFilter, HealthState state, CancellationToken token)
{
    // Application and Service are recognized entity types but have no implementation here;
    // like any unknown type they fall through to the "no duration" sentinel.
    if (entityType != EntityType.Machine && entityType != EntityType.Node)
    {
        return TimeSpan.MinValue;
    }

    // Translate the requested state into the matching events filter; anything else maps to None.
    var stateFilter = state switch
    {
        HealthState.Warning => HealthStateFilter.Warning,
        HealthState.Error => HealthStateFilter.Error,
        HealthState.Ok => HealthStateFilter.Ok,
        _ => HealthStateFilter.None
    };

    var nodeQuery = new NodeHealthQueryDescription(entityFilter)
    {
        EventsFilter = new HealthEventsFilter { HealthStateFilterValue = stateFilter }
    };

    var nodeHealth =
        await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
                nodeQuery, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

    if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
    {
        return TimeSpan.MinValue;
    }

    // NOTE(review): the first live event in the returned order wins; confirm that order is the intended one.
    foreach (var healthEvent in nodeHealth.HealthEvents)
    {
        if (!healthEvent.IsExpired)
        {
            return DateTime.UtcNow.Subtract(healthEvent.SourceUtcTimestamp);
        }
    }

    return TimeSpan.MinValue;
}
}
}

Просмотреть файл

@ -0,0 +1,83 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System;
using System.Threading.Tasks;
using Guan.Logic;
using FabricHealer.Utilities.Telemetry;
using System.Fabric.Health;
namespace FabricHealer.Repair.Guan
{
/// <summary>
/// Guan predicate type that binds its first argument to the value returned by
/// FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync for the entity type, target name
/// and health state supplied as the remaining three arguments.
/// Example rule usage:
/// GetCurrentEntityHealthStateDuration(?HealthStateDuration, Entity=Machine, Target=?NodeName, State=Error)
/// </summary>
public class GetCurrentEntityHealthStateDurationPredicateType : PredicateType
{
    // Static state shared by all resolvers; RepairTaskManager is overwritten on every Singleton() call.
    private static RepairTaskManager RepairTaskManager;
    private static GetCurrentEntityHealthStateDurationPredicateType Instance;

    private class Resolver : GroundPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
            : base(input, constraint, context, 1)
        {
        }

        /// <summary>
        /// Validates the four predicate arguments, queries the health state duration and returns a
        /// compound term whose argument "0" carries the resulting TimeSpan.
        /// </summary>
        /// <exception cref="GuanException">
        /// Thrown when the argument count is not 4, or when the entity type, target name or health
        /// state argument is missing or not a valid value.
        /// </exception>
        protected override async Task<Term> GetNextTermAsync()
        {
            if (Input.Arguments.Count != 4)
            {
                throw new GuanException("GetCurrentEntityHealthStateDuration predicate requires 4 arguments.");
            }

            // A non-string (or null) value makes Enum.TryParse return false, so malformed rules
            // surface as GuanException rather than InvalidCastException.
            if (!Enum.TryParse(GetStringArgument(1), out EntityType entityType))
            {
                throw new GuanException("The second argument of GetCurrentEntityHealthStateDuration must be a valid EntityType value (Application, Service, Node, Machine, etc..)");
            }

            string nodeName = GetStringArgument(2);

            if (string.IsNullOrEmpty(nodeName))
            {
                throw new GuanException("The third argument of GetCurrentEntityHealthStateDuration must be a non-empty target name (for example, a node name).");
            }

            // Arguments[3] is the fourth argument; the message previously said "third".
            if (!Enum.TryParse(GetStringArgument(3), out HealthState state))
            {
                throw new GuanException("The fourth argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
            }

            TimeSpan duration =
                await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, nodeName, state, RepairTaskManager.Token);

            var result = new CompoundTerm(this.Input.Functor);
            result.AddArgument(new Constant(duration), "0");

            return result;
        }

        // Returns the effective value of the argument at the given position as a string, or null if it is not a string.
        private string GetStringArgument(int index)
        {
            return Input.Arguments[index].Value.GetEffectiveTerm().GetObjectValue() as string;
        }
    }

    /// <summary>
    /// Stores the supplied RepairTaskManager for use by resolvers and returns the shared predicate
    /// type instance, creating it with the given name on first use.
    /// </summary>
    public static GetCurrentEntityHealthStateDurationPredicateType Singleton(string name, RepairTaskManager repairTaskManager)
    {
        RepairTaskManager = repairTaskManager;
        return Instance ??= new GetCurrentEntityHealthStateDurationPredicateType(name);
    }

    private GetCurrentEntityHealthStateDurationPredicateType(string name)
        : base(name, true, 1)
    {
    }

    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }

    /// <summary>
    /// Rejects rules whose first argument is already ground: it is the output slot and must be a variable.
    /// </summary>
    public override void AdjustTerm(CompoundTerm term, Rule rule)
    {
        if (term.Arguments[0].Value.IsGround())
        {
            throw new GuanException("The first argument of GetCurrentEntityHealthStateDuration must be a variable: {0}", term);
        }
    }
}
}

Просмотреть файл

@ -70,7 +70,7 @@ namespace FabricHealer.Repair.Guan
// Block attempts to create node-level repair tasks if one is already running in the cluster.
var repairTaskEngine = new RepairTaskEngine();
var isNodeRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(
await repairTaskEngine.IsRepairInProgressAsync(
RepairTaskEngine.FabricHealerExecutorName,
RepairData,
RepairTaskManager.Token);

Просмотреть файл

@ -11,11 +11,11 @@ using System.Threading.Tasks;
namespace FabricHealer.Repair.Guan
{
public class RestartMachinePredicateType : PredicateType
public class ScheduleMachineRepairPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData RepairData;
private static RestartMachinePredicateType Instance;
private static ScheduleMachineRepairPredicateType Instance;
private class Resolver : BooleanPredicateResolver
{
@ -27,36 +27,50 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync()
{
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartVM;
if (Input.Arguments.Count == 0)
{
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
}
// Repair action name is required.
string repairAction = (string)Input.Arguments[0].Value.GetObjectValue();
/*
public const string SystemReboot = "System.Reboot";
public const string SystemReimageOS = "System.ReimageOS ";
public const string SystemFullReimage = "System.FullReimage";
public const string SystemHostReboot = "System.Azure.HostReboot";
public const string SystemHostRepaveData = "System.Azure.HostRepaveData";
*/
switch (repairAction)
{
case RepairConstants.SystemReboot:
case RepairConstants.SystemHostReboot:
RepairData.RepairPolicy.RepairAction = RepairActionType.RebootMachine;
break;
case RepairConstants.SystemReimageOS:
case RepairConstants.SystemFullReimage:
case RepairConstants.SystemHostRepaveData:
RepairData.RepairPolicy.RepairAction = RepairActionType.ReimageOS;
break;
default:
throw new GuanException($"Unrecognized repair action name: {repairAction}. Repair actions are case sensitive.");
}
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
// Block attempts to create duplicate repair tasks.
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
var repairTaskEngine = new RepairTaskEngine();
var isRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(
$"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
RepairData,
RepairTaskManager.Token);
if (isRepairAlreadyInProgress)
{
string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress. Will not attempt repair at this time.";
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"RestartVMPredicateType::{RepairData.RepairPolicy.RepairId}",
message,
RepairTaskManager.Token);
return false;
}
int count = Input.Arguments.Count;
long maxConcurrentRepairs = 0;
for (int i = 0; i < count; i++)
{
var typeString = Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue().GetType().Name;
switch (typeString)
{
case "TimeSpan":
@ -67,13 +81,39 @@ namespace FabricHealer.Repair.Guan
RepairData.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue();
break;
case "Int64":
maxConcurrentRepairs = (long)Input.Arguments[i].Value.GetObjectValue();
break;
default:
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetObjectValue().GetType()}");
}
}
var isRepairAlreadyInProgress =
await repairTaskEngine.IsRepairInProgressAsync(
$"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
RepairData,
RepairTaskManager.Token,
maxConcurrentRepairs);
if (isRepairAlreadyInProgress)
{
string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress" +
$"{(maxConcurrentRepairs > 0 ? " or max number of concurrent machine repairs has been reached" : "")}. " +
$"Will not attempt repair at this time.";
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"RestartMachinePredicateType::{RepairData.RepairPolicy.RepairId}",
message,
RepairTaskManager.Token);
return false;
}
bool success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskManager.ExecuteRMInfrastructureRepairTask(
() => RepairTaskManager.ScheduleInfrastructureRepairTask(
RepairData,
RepairTaskManager.Token),
RepairTaskManager.Token);
@ -81,15 +121,15 @@ namespace FabricHealer.Repair.Guan
}
}
public static RestartMachinePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
public static ScheduleMachineRepairPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
RepairData = repairData;
return Instance ??= new RestartMachinePredicateType(name);
return Instance ??= new ScheduleMachineRepairPredicateType(name);
}
private RestartMachinePredicateType(string name)
private ScheduleMachineRepairPredicateType(string name)
: base(name, true, 0)
{

Просмотреть файл

@ -22,6 +22,8 @@ namespace FabricHealer.Repair
RestartFabricNode,
RestartProcess,
RestartReplica,
RestartVM,
RebootMachine,
ReimageDisk,
ReimageOS
}
}

Просмотреть файл

@ -29,7 +29,7 @@ namespace FabricHealer.Repair
public const string EnableRollingServiceRestartsParameter = "EnableRollingServiceRestarts";
public const string AppInsightsInstrumentationKeyParameter = "AppInsightsInstrumentationKey";
public const string EnableETW = "EnableETW";
public const string HealthCheckLoopSleepTimeSeconds = "HealthCheckLoopSleepTimeSeconds";
public const string HealthCheckIntervalInSeconds = "HealthCheckIntervalInSeconds";
public const string LocalLogPathParameter = "LocalLogPath";
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
@ -72,7 +72,15 @@ namespace FabricHealer.Repair
public const string RestartFabricNode = "RestartFabricNode";
public const string RestartFabricSystemProcess = "RestartFabricSystemProcess";
public const string RestartReplica = "RestartReplica";
public const string RestartVM = "RestartVM";
public const string ScheduleMachineRepair = "ScheduleMachineRepair";
public const string ScheduleDiskReimage = "ScheduleDiskReimage";
// Infrastructure repair action names (RM "commands").
// NOTE(review): presumably these are the values expected in RepairPolicy.InfrastructureRepairName,
// which is handed to RM as the repair action name - confirm against the callers.
// Keep these literals free of leading/trailing whitespace so they match the action names exactly.
public const string SystemReboot = "System.Reboot";
public const string SystemReimageOS = "System.ReimageOS";
public const string SystemFullReimage = "System.FullReimage";
public const string SystemHostReboot = "System.Azure.HostReboot";
public const string SystemHostRepaveData = "System.Azure.HostRepaveData";
// Helper Predicates.
public const string CheckInsideRunInterval = "CheckInsideRunInterval";

Просмотреть файл

@ -956,42 +956,6 @@ namespace FabricHealer.Repair
return true;
}
/// <summary>
/// Returns a machine name string, given a fabric node name.
/// The node is looked up through the Fabric query manager and its IpAddressOrFQDN value is resolved via DNS.
/// </summary>
/// <param name="nodeName">Fabric node name</param>
/// <param name="cancellationToken">Token passed to the node list query. It is not passed to the DNS lookup.</param>
/// <returns>
/// The host name from the DNS host entry, or null when the node query returns no nodes or when one of the
/// filtered exceptions (ArgumentException, SocketException, OperationCanceledException, TimeoutException) is caught.
/// </returns>
internal async Task<string> GetMachineHostNameFromFabricNodeNameAsync(string nodeName, CancellationToken cancellationToken)
{
try
{
// Query by node name; only the first returned node (if any) is used.
var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
nodeName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
Node targetNode = nodes.Count > 0 ? nodes[0] : null;
if (targetNode == null)
{
return null;
}
// Resolve the node's IP address or FQDN to a host name.
string ipOrDnsName = targetNode.IpAddressOrFQDN;
var hostEntry = await Dns.GetHostEntryAsync(ipOrDnsName);
var machineName = hostEntry.HostName;
return machineName;
}
catch (Exception e) when (e is ArgumentException|| e is SocketException|| e is OperationCanceledException || e is TimeoutException)
{
// Best effort: log the failure as a warning and fall through to return null.
FabricHealerManager.RepairLogger.LogWarning(
$"Unable to determine machine host name from Fabric node name {nodeName}:{Environment.NewLine}{e}");
}
return null;
}
/// <summary>
/// Clears existing health warnings for target repair entity. This should only be called after a repair operation succeeds.
/// </summary>

Просмотреть файл

@ -32,6 +32,14 @@ namespace FabricHealer.Repair
get; set;
}
/// <summary>
/// The name of the infrastructure repair to provide to RepairManager (RM) that InfrastructureService (IS) will execute.
/// </summary>
public string InfrastructureRepairName
{
get; set;
}
/// <summary>
/// Maximum amount of time to check if health state of repaired target entity is Ok.
/// </summary>

Просмотреть файл

@ -4,7 +4,9 @@
// ------------------------------------------------------------
using System;
using System.ComponentModel.DataAnnotations;
using System.Fabric.Description;
using System.Fabric.Health;
using System.Fabric.Query;
using System.Fabric.Repair;
using System.Linq;
@ -12,13 +14,12 @@ using System.Threading;
using System.Threading.Tasks;
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
using Newtonsoft.Json.Linq;
namespace FabricHealer.Repair
{
public sealed class RepairTaskEngine
{
public const string HostVMReboot = "System.Reboot";
public const string HostMachineReboot = "System.Reboot";
public const string FHTaskIdPrefix = "FH";
public const string AzureTaskIdPrefix = "Azure";
public const string FabricHealerExecutorName = "FabricHealer";
@ -105,8 +106,14 @@ namespace FabricHealer.Repair
return repairTasks;
}
// This allows InfrastructureService to schedule and run reboot im concert with VMSS over MR.
public async Task<RepairTask> CreateVmRebootISRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
/// <summary>
/// This schedules a repair task where SF's InfrastructureService will reboot the target machine safely.
/// </summary>
/// <param name="repairData"></param>
/// <param name="executorName"></param>
/// <param name="cancellationToken"></param>
/// <returns></returns>
public async Task<RepairTask> CreateMachineRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
{
// This constraint (MaxResults) is used just to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
var nodeQueryDesc = new NodeQueryDescription
@ -114,6 +121,13 @@ namespace FabricHealer.Repair
MaxResults = 3,
};
string repairActionName = HostMachineReboot;
if (!string.IsNullOrWhiteSpace(repairData.RepairPolicy.InfrastructureRepairName))
{
repairActionName = repairData.RepairPolicy.InfrastructureRepairName;
}
NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
nodeQueryDesc,
@ -126,14 +140,14 @@ namespace FabricHealer.Repair
return null;
}
string taskId = $"{FHTaskIdPrefix}/{HostVMReboot}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
bool doHealthChecks = !SupportedErrorCodes.GetCodeNameFromErrorCode(repairData.Code).Contains("Error");
string taskId = $"{FHTaskIdPrefix}/{repairActionName}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
bool doHealthChecks = repairData.HealthState != HealthState.Error;
// Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
// So, do not do health checks if customer configures FO to emit Error health reports.
// In general, FO should *not* be configured to emit Error events. See FO documentation.
var repairTask = new ClusterRepairTask(taskId, HostVMReboot)
var repairTask = new ClusterRepairTask(taskId, repairActionName)
{
Target = new NodeRepairTargetDescription(repairData.NodeName),
Description = $"{repairData.RepairPolicy.RepairId}",
@ -146,25 +160,40 @@ namespace FabricHealer.Repair
return repairTask;
}
public async Task<bool> IsFHRepairTaskRunningAsync(string executorName, TelemetryData repairdData, CancellationToken token)
/// <summary>
/// Determines if a repair task is already in flight or if the max number of concurrent repairs has been reached for the target using the information specified in repairData instance.
/// </summary>
/// <param name="executorName">Name of the repair executor.</param>
/// <param name="repairData">TelemetryData instance.</param>
/// <param name="token">CancellationToken.</param>
/// <param name="maxConcurrentRepairs">Optional: Number of max concurrent repairs for the entity type specified in repairData. Default is 0 which means no concurrent repairs.</param>
/// <returns></returns>
public async Task<bool> IsRepairInProgressAsync(string executorName, TelemetryData repairData, CancellationToken token, long maxConcurrentRepairs = 0)
{
// All RepairTasks are prefixed with FH, regardless of repair target type (VM, fabric node, system service process, codepackage, replica).
// For VM-level repair, RM will create a new task for IS that replaces FH executor data with IS job info, but the original FH repair task will
// remain in an active state which will block any duplicate scheduling by another FH instance.
var currentFHRepairTasksInProgress =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
FHTaskIdPrefix,
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
executorName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
token);
// All RepairTasks are prefixed with FH, regardless of repair target type (VM/Machine, Fabric node, system service process, code package, replica).
// For VM-level repairs, RM will create a new task for IS that replaces FH executor data with IS job info.
RepairTaskList repairTasksInProgress =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
FHTaskIdPrefix,
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
executorName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
token);
if (currentFHRepairTasksInProgress == null || currentFHRepairTasksInProgress.Count == 0)
if (repairTasksInProgress == null || repairTasksInProgress.Count == 0)
{
return false;
}
foreach (var repair in currentFHRepairTasksInProgress)
// Throttling machine level repairs.
if (executorName == $"{InfrastructureServiceName}/{repairData.NodeType}" &&
maxConcurrentRepairs > 0 &&
repairTasksInProgress.Count(r => r.Executor == executorName) >= maxConcurrentRepairs)
{
return true;
}
foreach (var repair in repairTasksInProgress)
{
// This check is to see if there are any FH-as-executor repairs in flight.
if (executorName == FabricHealerExecutorName)
@ -179,19 +208,17 @@ namespace FabricHealer.Repair
return false;
}
// The node repair check ensures that only one node-level repair can take place in a cluster (no concurrent node restarts), by default.
// FH is conservative, by design.
if (repairdData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId ||
executorData.RepairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
// This check ensures that only one repair can be scheduled at a time for the same target.
if (repairData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId)
{
return true;
}
}
else if (repair.Executor == $"{InfrastructureServiceName}/{repairdData.NodeType}")
else if (repair.Executor == $"{InfrastructureServiceName}/{repairData.NodeType}")
{
// This would block scheduling any VM level operation (reboot) already in flight.
// For IS repairs, unique id is stored in the repair task's Description property.
if (repair.Description == repairdData.RepairPolicy.RepairId)
// This would block rescheduling any VM level operation (reboot) that is already in flight.
// NOTE: For Infrastructure-level repairs (IS is executor), unique id is stored in the repair task's Description property.
if (repair.Description == repairData.RepairPolicy.RepairId)
{
return true;
}

Просмотреть файл

@ -151,7 +151,7 @@ namespace FabricHealer.Repair
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, this, repairExecutorData, repairTaskEngine, repairData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, this, repairData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, this, repairData));
functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, this, repairData));
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, this, repairData));
// Parse rules.
Module module = Module.Parse("external", repairRules, functorTable);
@ -181,7 +181,7 @@ namespace FabricHealer.Repair
compoundTerm.AddArgument(new Constant(repairData.NodeType), RepairConstants.NodeType);
compoundTerm.AddArgument(new Constant(repairData.ObserverName), RepairConstants.ObserverName);
compoundTerm.AddArgument(new Constant(repairData.OS), RepairConstants.OS);
compoundTerm.AddArgument(new Constant(Enum.GetName(typeof(ServiceKind), repairData.ServiceKind)), RepairConstants.ServiceKind);
compoundTerm.AddArgument(new Constant(repairData.ServiceKind), RepairConstants.ServiceKind);
compoundTerm.AddArgument(new Constant(repairData.ServiceName), RepairConstants.ServiceName);
compoundTerm.AddArgument(new Constant(repairData.ProcessId), RepairConstants.ProcessId);
compoundTerm.AddArgument(new Constant(repairData.ProcessName), RepairConstants.ProcessName);
@ -198,7 +198,7 @@ namespace FabricHealer.Repair
// The repair will be executed by SF Infrastructure service, not FH. This is the case for all
// VM-level repairs. IS will communicate with VMSS (for example) to guarantee safe repairs in MR-enabled
// clusters. RM, as usual, will orchestrate the repair cycle.
public async Task<bool> ExecuteRMInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
public async Task<bool> ScheduleInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
{
var infraServices = await FabricRepairTasks.GetInfrastructureServiceInstancesAsync(cancellationToken);
var arrServices = infraServices as Service[] ?? infraServices.ToArray();
@ -207,7 +207,7 @@ namespace FabricHealer.Repair
{
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask",
"ScheduleInfrastructureRepairTask",
"Infrastructure Service not found. Will not attemp VM repair.",
cancellationToken,
repairData,
@ -229,8 +229,8 @@ namespace FabricHealer.Repair
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteRMInfrastructureRepairTask",
$"IS RepairTask {RepairTaskEngine.HostVMReboot} " +
"ScheduleInfrastructureRepairTask",
$"IS RepairTask {RepairTaskEngine.HostMachineReboot} " +
$"Executor set to {executorName}.",
cancellationToken,
repairData,
@ -243,7 +243,7 @@ namespace FabricHealer.Repair
{
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask",
"ScheduleInfrastructureRepairTask",
"Unable to find InfrastructureService service instance." +
"Exiting RepairTaskManager.ScheduleFHRepairTaskAsync.",
cancellationToken,
@ -255,19 +255,26 @@ namespace FabricHealer.Repair
// Make sure there is not already a repair job executing reboot repair for target node.
var isRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(
executorName,
repairData,
cancellationToken);
await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, cancellationToken);
if (isRepairAlreadyInProgress)
{
string machineName = repairData.NodeName;
try
{
machineName = Environment.MachineName;
}
catch (InvalidOperationException)
{
}
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask",
"Virtual machine repair task for VM " +
$"{await RepairExec.GetMachineHostNameFromFabricNodeNameAsync(repairData.NodeName, cancellationToken)} " +
"is already in progress. Will not schedule another VM repair at this time.",
"ScheduleInfrastructureRepairTask",
$"Virtual machine repair task for {machineName} is already in progress " +
"or max number of concurrent machine repairs has been reached. Will not schedule another machine repair at this time.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
@ -282,7 +289,7 @@ namespace FabricHealer.Repair
{
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask",
"ScheduleInfrastructureRepairTask",
"Unable to create Repair Task.",
cancellationToken,
repairData,
@ -293,7 +300,7 @@ namespace FabricHealer.Repair
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask",
"ScheduleInfrastructureRepairTask",
$"Successfully created Repair Task {repairTask.TaskId}",
cancellationToken,
repairData,
@ -320,7 +327,7 @@ namespace FabricHealer.Repair
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask::Completed",
"InfrastructureRepairTask::Completed",
$"Successfully completed repair {repairData.RepairPolicy.RepairId}",
cancellationToken,
repairData,
@ -332,7 +339,7 @@ namespace FabricHealer.Repair
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ExecuteRMInfrastructureRepairTask::Timeout",
"ScheduleInfrastructureRepairTask::Timeout",
$"Max wait time of {MaxWaitTimeForInfraRepairTaskCompleted} has elapsed for repair " +
$"{repairData.RepairPolicy.RepairId}.",
cancellationToken,
@ -441,7 +448,7 @@ namespace FabricHealer.Repair
await Task.Delay(new Random().Next(100, 1500));
// Has the repair already been scheduled by a different FH instance?
if (await repairTaskEngine.IsFHRepairTaskRunningAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
if (await repairTaskEngine.IsRepairInProgressAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
{
return null;
}
@ -465,7 +472,7 @@ namespace FabricHealer.Repair
}
if (repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode &&
repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartVM)
repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RebootMachine)
{
continue;
}
@ -700,7 +707,7 @@ namespace FabricHealer.Repair
success = false;
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
@ -745,7 +752,7 @@ namespace FabricHealer.Repair
success = false;
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateful replica {repairData.ReplicaId} not found on partition " +
$"{repairData.PartitionId}.",
cancellationToken,
@ -941,11 +948,11 @@ namespace FabricHealer.Repair
// This is done by setting the repair task to Restoring State with ResultStatus Succeeded. RM will then move forward to Restoring
// (and do any restoring health checks if specified), then Complete the repair job.
_ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
repairTask,
Context,
cancellationToken),
cancellationToken);
() => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
repairTask,
Context,
cancellationToken),
cancellationToken);
// Let RM catch up.
await Task.Delay(TimeSpan.FromSeconds(3), cancellationToken);

Просмотреть файл

@ -23,7 +23,7 @@ namespace FabricHealer.Utilities
private set;
}
public int ExecutionLoopSleepSeconds
public int HealthCheckIntervalInSeconds
{
get;
private set;
@ -169,9 +169,9 @@ namespace FabricHealer.Utilities
LocalLogPathParameter = GetConfigSettingValue( RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.LocalLogPathParameter);
if (int.TryParse( GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckLoopSleepTimeSeconds), out int execFrequency))
if (int.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckIntervalInSeconds), out int execFrequency))
{
ExecutionLoopSleepSeconds = execFrequency;
HealthCheckIntervalInSeconds = execFrequency;
}
// Rolling service restarts.

Просмотреть файл

@ -10,7 +10,6 @@ using System.Fabric.Health;
using System;
using FabricHealer.Repair;
using System.Diagnostics.Tracing;
using System.Fabric.Query;
namespace FabricHealer.Utilities.Telemetry
{
@ -96,8 +95,7 @@ namespace FabricHealer.Utilities.Telemetry
get; set;
}
[EventField]
public ServiceKind ServiceKind
public string ServiceKind
{
get; set;
}

Просмотреть файл

@ -12,6 +12,7 @@ using System.Fabric.Health;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;
namespace FabricHealer.Utilities.Telemetry
{
@ -146,17 +147,22 @@ namespace FabricHealer.Utilities.Telemetry
// ETW.
if (FabricHealerManager.ConfigSettings.EtwEnabled)
{
if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
if (JsonSerializationUtility.TrySerialize(telemData, out string tData))
{
ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, telemData);
}
else if (healthState == HealthState.Warning)
{
ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, telemData);
}
else
{
ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, telemData);
var data = new { tData };
if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
{
ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, data);
}
else if (healthState == HealthState.Warning)
{
ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, data);
}
else
{
ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, data);
}
}
}
}

Просмотреть файл

@ -4,7 +4,7 @@
<!-- FabricHealerManager Settings -->
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
<Parameter Name="EnableETW" DefaultValue="true" />
<Parameter Name="MonitorLoopSleepSeconds" DefaultValue="5" />
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
<Parameter Name="EnableTelemetry" DefaultValue="true" />
<!-- Set VerboseLoggingEnabled to true if you want detailed local logging and telemetry/ETW with repair data.
This data will live in a folder named RepairData, which will be created in your LocalLogPath directory.
@ -39,7 +39,7 @@
<Settings>
<!-- FabricHealerManager -->
<Section Name="RepairManagerConfiguration">
<Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="[MonitorLoopSleepSeconds]" />
<Parameter Name="HealthCheckIntervalInSeconds" Value="[HealthCheckIntervalInSeconds]" />
<Parameter Name="EnableAutoMitigation" Value="[AutoMitigationEnabled]" />
<Parameter Name="EnableETW" Value="[EnableETW]" />
<Parameter Name="EnableTelemetry" Value="[EnableTelemetry]" />