This commit is contained in:
Charles Torre 2022-10-19 18:48:23 -07:00
Родитель 72dd7c2c4e
Коммит a762bf5cc0
24 изменённых файлов: 1267 добавлений и 621 удалений

Просмотреть файл

@ -32,6 +32,7 @@
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="ServiceFabric.Mocks" Version="6.0.3" />
</ItemGroup>
<ItemGroup>
@ -40,6 +41,9 @@
</ItemGroup>
<ItemGroup>
<None Update="PackageRoot\Config\Settings.xml">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="testrules_wellformed">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>

Просмотреть файл

@ -20,7 +20,12 @@ using System.Fabric.Repair;
using System.Diagnostics;
using System.Fabric.Health;
using SupportedErrorCodes = FabricHealer.Utilities.SupportedErrorCodes;
using EntityType = FabricHealer.EntityType;
using HealthReport = FabricHealer.Utilities.HealthReport;
using EntityType = FabricHealer.Utilities.Telemetry.EntityType;
using System.Xml;
using ServiceFabric.Mocks;
using static ServiceFabric.Mocks.MockConfigurationPackage;
using System.Fabric.Description;
namespace FHTest
{
@ -32,31 +37,10 @@ namespace FHTest
[TestClass]
public class FHUnitTests
{
private static readonly Uri ServiceName = new Uri("fabric:/app/service");
private static readonly Uri TestServiceName = new Uri("fabric:/app/service");
private static readonly FabricClient fabricClient = new FabricClient();
private static readonly ICodePackageActivationContext CodePackageContext
= new MockCodePackageActivationContext(
ServiceName.AbsoluteUri,
"applicationType",
"Code",
"1.0.0.0",
Guid.NewGuid().ToString(),
@"C:\Log",
@"C:\Temp",
@"C:\Work",
"ServiceManifest",
"1.0.0.0");
private readonly StatelessServiceContext context
= new StatelessServiceContext(
new NodeContext("Node0", new NodeId(0, 1), 0, "NodeType1", "TEST.MACHINE"),
CodePackageContext,
"FabricHealer.FabricHealerType",
ServiceName,
null,
Guid.NewGuid(),
long.MaxValue);
private static readonly ICodePackageActivationContext CodePackageContext = null;
private static readonly StatelessServiceContext TestServiceContext = null;
private readonly CancellationToken token = new CancellationToken();
// This is the name of the node used on your local dev machine's SF cluster. If you customize this, then change it.
@ -67,6 +51,102 @@ namespace FHTest
// e.g., if developing on Windows, then something like @"C:\Users\[me]\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\LogicRules\";
private const string FHRulesDirectory = @"C:\Users\ctorre\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\LogicRules\";
/// <summary>
/// Type initializer: builds the mocked Service Fabric activation context and stateless
/// service context that the tests in this class share. The configuration package is
/// populated from the Settings.xml file that is copied to the test output directory.
/// </summary>
static FHUnitTests()
{
    /* SF runtime mocking care of ServiceFabric.Mocks by loekd.
       https://github.com/loekd/ServiceFabric.Mocks */

    // NOTE: Make changes in Settings.xml located in this test project's PackageRoot/Config directory to configure FabricHealer settings.
    string configPath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "Settings.xml");
    ConfigurationPackage configPackage = BuildConfigurationPackageFromSettingsFile(configPath);

    CodePackageContext =
        new MockCodePackageActivationContext(
                TestServiceName.AbsoluteUri,
                "applicationType",
                "Code",
                "1.0.0.0",
                Guid.NewGuid().ToString(),
                @"C:\Log",
                @"C:\Temp",
                @"C:\Work",
                "ServiceManifest",
                "1.0.0.0")
        {
            ConfigurationPackage = configPackage
        };

    TestServiceContext =
        new StatelessServiceContext(
                new NodeContext(NodeName, new NodeId(0, 1), 0, "NodeType0", "TEST.MACHINE"),
                CodePackageContext,
                // The service under test is FabricHealer, so use its service type name.
                "FabricHealer.FabricHealerType",
                TestServiceName,
                null,
                Guid.NewGuid(),
                long.MaxValue);
}
/* Helpers */
/// <summary>
/// Reads a Service Fabric Settings.xml file and turns its Section/Parameter elements into a
/// mocked <see cref="ConfigurationPackage"/> using the ServiceFabric.Mocks factory helpers.
/// </summary>
/// <param name="configPath">Path to the Settings.xml file to load.</param>
/// <returns>
/// The populated configuration package, or null when <paramref name="configPath"/> is null,
/// empty or whitespace, or when the Section query yields no node list.
/// </returns>
private static ConfigurationPackage BuildConfigurationPackageFromSettingsFile(string configPath)
{
    // Guard first: nothing has been opened yet, so there is nothing to clean up on this path.
    if (string.IsNullOrWhiteSpace(configPath))
    {
        return null;
    }

    string configXml = File.ReadAllText(configPath);

    // Safe XML pattern - *Do not use LoadXml*.
    XmlDocument xdoc = new XmlDocument { XmlResolver = null };

    // The readers are only needed while loading; using blocks dispose them even if Load throws.
    using (var sreader = new StringReader(configXml))
    using (var xreader = XmlReader.Create(sreader, new XmlReaderSettings { XmlResolver = null }))
    {
        xdoc.Load(xreader);
    }

    var nsmgr = new XmlNamespaceManager(xdoc.NameTable);
    nsmgr.AddNamespace("sf", "http://schemas.microsoft.com/2011/01/fabric");

    var sectionNodes = xdoc.SelectNodes("//sf:Section", nsmgr);

    if (sectionNodes == null)
    {
        return null;
    }

    var configSections = new ConfigurationSectionCollection();

    foreach (XmlNode sectionNode in sectionNodes)
    {
        // Look attributes up by name so the result does not depend on attribute order in the file.
        ConfigurationSection configSection = CreateConfigurationSection(sectionNode.Attributes?["Name"]?.Value);

        // Query relative to the current section node instead of rebuilding an XPath from its name.
        var sectionParams = sectionNode.SelectNodes(".//sf:Parameter", nsmgr);

        if (sectionParams != null)
        {
            foreach (XmlNode paramNode in sectionParams)
            {
                ConfigurationProperty parameter =
                    CreateConfigurationSectionParameters(
                        paramNode.Attributes?["Name"]?.Value,
                        paramNode.Attributes?["Value"]?.Value);

                configSection.Parameters.Add(parameter);
            }
        }

        configSections.Add(configSection);
    }

    var configSettings = CreateConfigurationSettings(configSections);

    // The package path is the folder that contains Settings.xml, whatever the directory separator is.
    return CreateConfigurationPackage(configSettings, Path.GetDirectoryName(configPath));
}
private static bool IsLocalSFRuntimePresent()
{
try
@ -90,7 +170,8 @@ namespace FHTest
try
{
var repairTasks = await fabricClient.RepairManager.GetRepairTaskListAsync();
var testRepairTasks = repairTasks.Where(r => r.TaskId.EndsWith("TEST_0"));
var testRepairTasks =
repairTasks.Where(r => r.TaskId.EndsWith(NodeName) || r.TaskId.EndsWith("_Node_0") || r.TaskId.EndsWith("TEST_0"));
foreach (var repairTask in testRepairTasks)
{
@ -106,29 +187,122 @@ namespace FHTest
}
}
/// <summary>
/// Clears Warning and Error health events that are present after a test run by emitting an Ok
/// health report with the same Property and SourceId for each one. Covers, in order: services
/// with replicas deployed on <c>NodeName</c>, the System application, and the node itself.
/// </summary>
private static async Task CleanupTestHealthReportsAsync()
{
    Logger logger = new Logger("TestLogger");

    // A single reporter is shared by all of the Ok reports emitted below.
    var healthReporter = new FabricHealthReporter(logger);

    // Emits one Ok report (built by createOkReport) for every Warning/Error event in healthEvents.
    // A null event list is treated as "nothing to clear".
    async Task ClearUnhealthyEventsAsync(IEnumerable<HealthEvent> healthEvents, Func<HealthEvent, HealthReport> createOkReport)
    {
        if (healthEvents == null)
        {
            return;
        }

        foreach (HealthEvent evt in healthEvents)
        {
            HealthState state = evt.HealthInformation.HealthState;

            if (state != HealthState.Error && state != HealthState.Warning)
            {
                continue;
            }

            healthReporter.ReportHealthToServiceFabric(createOkReport(evt));
            await Task.Delay(250);
        }
    }

    // This client is created only for the cleanup pass, so dispose it when the pass is done.
    using (var client = new FabricClient())
    {
        // Service reports.
        var apps = await client.QueryManager.GetApplicationListAsync();

        foreach (var app in apps)
        {
            var replicas = await client.QueryManager.GetDeployedReplicaListAsync(NodeName, app.ApplicationName);

            foreach (var replica in replicas)
            {
                var serviceHealth = await client.HealthManager.GetServiceHealthAsync(replica.ServiceName);

                await ClearUnhealthyEventsAsync(
                    serviceHealth?.HealthEvents,
                    evt => new HealthReport
                    {
                        EntityType = EntityType.Service,
                        HealthMessage = $"Clearing existing AppObserver Test health reports.",
                        State = HealthState.Ok,
                        NodeName = NodeName,
                        EmitLogEvent = false,
                        ServiceName = replica.ServiceName,
                        Property = evt.HealthInformation.Property,
                        SourceId = evt.HealthInformation.SourceId
                    });
            }
        }

        // System app reports.
        var sysAppHealth = await client.HealthManager.GetApplicationHealthAsync(new Uri(RepairConstants.SystemAppName));

        await ClearUnhealthyEventsAsync(
            sysAppHealth?.HealthEvents,
            evt => new HealthReport
            {
                EntityType = EntityType.Application,
                HealthMessage = $"Clearing existing FSO Test health reports.",
                State = HealthState.Ok,
                NodeName = NodeName,
                EmitLogEvent = false,
                AppName = new Uri(RepairConstants.SystemAppName),
                Property = evt.HealthInformation.Property,
                SourceId = evt.HealthInformation.SourceId
            });

        // Node reports.
        var nodeHealth = await client.HealthManager.GetNodeHealthAsync(NodeName);

        await ClearUnhealthyEventsAsync(
            nodeHealth?.HealthEvents,
            evt => new HealthReport
            {
                EntityType = EntityType.Machine,
                HealthMessage = $"Clearing existing FSO Test health reports.",
                State = HealthState.Ok,
                NodeName = NodeName,
                EmitLogEvent = false,
                Property = evt.HealthInformation.Property,
                SourceId = evt.HealthInformation.SourceId
            });
    }
}
/// <summary>
/// Class-level teardown: removes the repair jobs and health reports created by the tests,
/// then closes the FabricHealerProxy singleton.
/// </summary>
[ClassCleanup]
public static async Task TestClassCleanupAsync()
{
    try
    {
        await CleanupTestRepairJobsAsync();
        await CleanupTestHealthReportsAsync();
    }
    finally
    {
        // Ensure FHProxy cleans up its health reports, even when one of the cleanup steps above throws.
        FabricHealerProxy.Instance.Close();
    }
}
/* GuanLogic Tests */
// Currently, the tests below validate logic rules and the successful scheduling of related local repair jobs.
/* GuanLogic Tests
The tests below validate entity-specific logic rules and the successful scheduling of related local repair jobs. */
// This test ensures your shipping rule files (the guan files located in Config/LogicRules folder)
// contain correctly written rules and that the related local repair job is successfully created.
[TestMethod]
public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_QueryInitialized()
public async Task AllAppRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
if (!IsLocalSFRuntimePresent())
{
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(context)
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
@ -137,14 +311,16 @@ namespace FHTest
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test",
NodeName = "TEST_0",
EntityType = FabricHealer.Utilities.Telemetry.EntityType.Service,
NodeName = NodeName,
Code = SupportedErrorCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
ServiceName = "fabric:/test0/service0",
Value = 1024.0,
RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}"
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}",
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
}
};
@ -153,20 +329,198 @@ namespace FHTest
RepairData = repairData
};
var files = Directory.GetFiles(FHRulesDirectory);
var file = Path.Combine(FHRulesDirectory, "AppRules.guan");
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
foreach (var file in files)
try
{
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
throw new AssertFailedException(ge.Message, ge);
}
}
try
[TestMethod]
public async Task AllMachineRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
if (!IsLocalSFRuntimePresent())
{
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
EntityType = EntityType.Machine,
NodeName = NodeName,
HealthState = HealthState.Error,
RepairPolicy = new RepairPolicy
{
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
RepairId = $"Test42_MachineRepair{NodeName}",
RepairIdPrefix = RepairTaskEngine.InfraTaskIdPrefix
}
catch (GuanException ge)
};
var executorData = new RepairExecutorData
{
RepairData = repairData
};
var file = Path.Combine(FHRulesDirectory, "MachineRules.guan");
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
try
{
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
throw new AssertFailedException(ge.Message, ge);
}
}
[TestMethod]
public async Task AllDiskRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
if (!IsLocalSFRuntimePresent())
{
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
EntityType = FabricHealer.Utilities.Telemetry.EntityType.Disk,
NodeName = NodeName,
HealthState = HealthState.Warning,
RepairPolicy = new RepairPolicy
{
throw new AssertFailedException(ge.Message, ge);
RepairId = $"Test42_DiskRepair{NodeName}",
RepairIdPrefix = RepairTaskEngine.InfraTaskIdPrefix
}
};
var executorData = new RepairExecutorData
{
RepairData = repairData
};
var file = Path.Combine(FHRulesDirectory, "DiskRules.guan");
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
try
{
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
throw new AssertFailedException(ge.Message, ge);
}
}
[TestMethod]
public async Task AllReplicaRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
if (!IsLocalSFRuntimePresent())
{
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test",
EntityType = FabricHealer.Utilities.Telemetry.EntityType.Partition,
PartitionId = Guid.NewGuid(),
NodeName = NodeName,
HealthState = HealthState.Warning,
ServiceName = "fabric:/test0/service0",
RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_ReplicaRepair{NodeName}",
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
}
};
var executorData = new RepairExecutorData
{
RepairData = repairData
};
var file = Path.Combine(FHRulesDirectory, "ReplicaRules.guan");
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
try
{
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
throw new AssertFailedException(ge.Message, ge);
}
}
[TestMethod]
public async Task AllSystemServiceRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
if (!IsLocalSFRuntimePresent())
{
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/System",
EntityType = FabricHealer.Utilities.Telemetry.EntityType.Partition,
PartitionId = Guid.NewGuid(),
NodeName = NodeName,
HealthState = HealthState.Warning,
RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_SystemServiceRepair{NodeName}",
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
}
};
var executorData = new RepairExecutorData
{
RepairData = repairData
};
var file = Path.Combine(FHRulesDirectory, "SystemServiceRules.guan");
List<string> repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token));
try
{
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
throw new AssertFailedException(ge.Message, ge);
}
}
@ -180,7 +534,7 @@ namespace FHTest
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(context)
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
@ -191,7 +545,7 @@ namespace FHTest
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test0",
NodeName = "TEST_0",
NodeName = NodeName,
Metric = "Memory",
HealthState = HealthState.Warning,
Code = SupportedErrorCodes.AppErrorMemoryMB,
@ -201,7 +555,7 @@ namespace FHTest
PartitionId = default,
RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}"
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}"
}
};
@ -229,7 +583,7 @@ namespace FHTest
throw new InternalTestFailureException("You must run this test with an active local (dev) SF cluster.");
}
FabricHealerManager.ConfigSettings = new ConfigSettings(context)
FabricHealerManager.ConfigSettings = new ConfigSettings(TestServiceContext)
{
TelemetryEnabled = false
};
@ -241,7 +595,7 @@ namespace FHTest
{
ApplicationName = "fabric:/test0",
EntityType = FabricHealer.Utilities.Telemetry.EntityType.Service,
NodeName = "TEST_0",
NodeName = NodeName,
Metric = "Memory",
HealthState = HealthState.Warning,
Code = SupportedErrorCodes.AppErrorMemoryMB,
@ -251,7 +605,7 @@ namespace FHTest
PartitionId = default,
RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}"
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}"
}
};
@ -267,8 +621,8 @@ namespace FHTest
private async Task TestInitializeGuanAndRunQuery(TelemetryData repairData, List<string> repairRules, RepairExecutorData executorData)
{
_ = FabricHealerManager.Instance(context, token);
var repairTaskManager = new RepairTaskManager(context, token);
_ = FabricHealerManager.Instance(TestServiceContext, token);
var repairTaskManager = new RepairTaskManager();
await repairTaskManager.RunGuanQueryAsync(repairData, repairRules, executorData);
}
@ -393,7 +747,7 @@ namespace FHTest
static readonly RepairFacts RepairFactsMachineTarget = new RepairFacts
{
NodeName = NodeName,
EntityType = EntityType.Machine,
EntityType = FabricHealer.EntityType.Machine,
// Specifying Source is Required for unit tests.
// For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here.
Source = "fabric:/Test"
@ -417,7 +771,7 @@ namespace FHTest
static readonly RepairFacts DiskRepairFacts = new RepairFacts
{
NodeName = NodeName,
EntityType = EntityType.Disk,
EntityType = FabricHealer.EntityType.Disk,
Metric = SupportedMetricNames.DiskSpaceUsageMb,
Code = SupportedErrorCodes.NodeWarningDiskSpaceMB,
// Specifying Source is Required for unit tests.
@ -567,7 +921,7 @@ namespace FHTest
Assert.IsTrue(generatedWarningService);
Assert.IsTrue(sdata != null);
}
else if (repair.EntityType == EntityType.Disk || repair.EntityType == EntityType.Machine || repair.EntityType == EntityType.Node)
else if (repair.EntityType == FabricHealer.EntityType.Disk || repair.EntityType == FabricHealer.EntityType.Machine || repair.EntityType == FabricHealer.EntityType.Node)
{
var (generatedWarningNode, ndata) = await IsEntityInWarningStateAsync(null, null, NodeName);
Assert.IsTrue(generatedWarningNode);

Просмотреть файл

@ -1,200 +0,0 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Fabric;
using System.Fabric.Description;
using System.Fabric.Health;
namespace FHTest
{
/// <summary>
/// Bare-bones <see cref="ICodePackageActivationContext"/> stand-in for unit tests.
/// It stores the values handed to the constructor and returns them from the matching
/// properties/getters; package, endpoint and type lookups return null, name lists hold a
/// single entry, and the health-reporting methods do nothing.
/// </summary>
public class MockCodePackageActivationContext : ICodePackageActivationContext
{
    private readonly string manifestName;
    private readonly string manifestVersion;

    // Set once Dispose(bool) has run, so repeated calls return immediately.
    private bool isDisposed;

    /// <summary>
    /// Initializes a new instance of the <see cref="MockCodePackageActivationContext"/> class.
    /// </summary>
    /// <param name="applicationName">Value exposed through <see cref="ApplicationName"/>.</param>
    /// <param name="applicationTypeName">Value exposed through <see cref="ApplicationTypeName"/>.</param>
    /// <param name="codePackageName">Value exposed through <see cref="CodePackageName"/>.</param>
    /// <param name="codePackageVersion">Value exposed through <see cref="CodePackageVersion"/>.</param>
    /// <param name="context">Value exposed through <see cref="ContextId"/>.</param>
    /// <param name="logDirectory">Value exposed through <see cref="LogDirectory"/>.</param>
    /// <param name="tempDirectory">Value exposed through <see cref="TempDirectory"/>.</param>
    /// <param name="workDirectory">Value exposed through <see cref="WorkDirectory"/>.</param>
    /// <param name="serviceManifestName">Value returned by <see cref="GetServiceManifestName"/>.</param>
    /// <param name="serviceManifestVersion">Value returned by <see cref="GetServiceManifestVersion"/>.</param>
    public MockCodePackageActivationContext(
        string applicationName,
        string applicationTypeName,
        string codePackageName,
        string codePackageVersion,
        string context,
        string logDirectory,
        string tempDirectory,
        string workDirectory,
        string serviceManifestName,
        string serviceManifestVersion)
    {
        manifestName = serviceManifestName;
        manifestVersion = serviceManifestVersion;

        ApplicationName = applicationName;
        ApplicationTypeName = applicationTypeName;
        CodePackageName = codePackageName;
        CodePackageVersion = codePackageVersion;
        ContextId = context;

        LogDirectory = logDirectory;
        TempDirectory = tempDirectory;
        WorkDirectory = workDirectory;
    }

    public string ApplicationName { get; }

    public string ApplicationTypeName { get; }

    public string CodePackageName { get; }

    public string CodePackageVersion { get; }

    public string ContextId { get; }

    public string LogDirectory { get; }

    public string TempDirectory { get; }

    public string WorkDirectory { get; }

    // The interface demands these events, but this mock never raises them.
    // CS0067 ("event is never used") is therefore suppressed for the declarations below.
#pragma warning disable CS0067
    public event EventHandler<PackageAddedEventArgs<CodePackage>> CodePackageAddedEvent;

    public event EventHandler<PackageModifiedEventArgs<CodePackage>> CodePackageModifiedEvent;

    public event EventHandler<PackageRemovedEventArgs<CodePackage>> CodePackageRemovedEvent;

    public event EventHandler<PackageAddedEventArgs<ConfigurationPackage>> ConfigurationPackageAddedEvent;

    public event EventHandler<PackageModifiedEventArgs<ConfigurationPackage>> ConfigurationPackageModifiedEvent;

    public event EventHandler<PackageRemovedEventArgs<ConfigurationPackage>> ConfigurationPackageRemovedEvent;

    public event EventHandler<PackageAddedEventArgs<DataPackage>> DataPackageAddedEvent;

    public event EventHandler<PackageModifiedEventArgs<DataPackage>> DataPackageModifiedEvent;

    public event EventHandler<PackageRemovedEventArgs<DataPackage>> DataPackageRemovedEvent;
#pragma warning restore

    /// <summary>Always returns null.</summary>
    public ApplicationPrincipalsDescription GetApplicationPrincipals() => null;

    /// <summary>Returns a new list whose only entry is <see cref="CodePackageName"/>.</summary>
    public IList<string> GetCodePackageNames() => new List<string> { CodePackageName };

    /// <summary>Always returns null, whatever <paramref name="packageName"/> is.</summary>
    public CodePackage GetCodePackageObject(string packageName) => null;

    /// <summary>Returns a new list holding a single empty string.</summary>
    public IList<string> GetConfigurationPackageNames() => new List<string> { string.Empty };

    /// <summary>Always returns null, whatever <paramref name="packageName"/> is.</summary>
    public ConfigurationPackage GetConfigurationPackageObject(string packageName) => null;

    /// <summary>Returns a new list holding a single empty string.</summary>
    public IList<string> GetDataPackageNames() => new List<string> { string.Empty };

    /// <summary>Always returns null, whatever <paramref name="packageName"/> is.</summary>
    public DataPackage GetDataPackageObject(string packageName) => null;

    /// <summary>Always returns null, whatever <paramref name="endpointName"/> is.</summary>
    public EndpointResourceDescription GetEndpoint(string endpointName) => null;

    /// <summary>Always returns null.</summary>
    public KeyedCollection<string, EndpointResourceDescription> GetEndpoints() => null;

    /// <summary>Always returns null.</summary>
    public KeyedCollection<string, ServiceGroupTypeDescription> GetServiceGroupTypes() => null;

    /// <summary>Returns the service manifest name passed to the constructor.</summary>
    public string GetServiceManifestName() => manifestName;

    /// <summary>Returns the service manifest version passed to the constructor.</summary>
    public string GetServiceManifestVersion() => manifestVersion;

    /// <summary>Always returns null.</summary>
    public KeyedCollection<string, ServiceTypeDescription> GetServiceTypes() => null;

    /// <summary>No-op.</summary>
    public void ReportApplicationHealth(HealthInformation healthInformation)
    {
    }

    /// <summary>No-op.</summary>
    public void ReportApplicationHealth(HealthInformation healthInfo, HealthReportSendOptions sendOptions)
    {
    }

    /// <summary>No-op.</summary>
    public void ReportDeployedApplicationHealth(HealthInformation healthInformation)
    {
    }

    /// <summary>No-op.</summary>
    public void ReportDeployedApplicationHealth(HealthInformation healthInfo, HealthReportSendOptions sendOptions)
    {
    }

    /// <summary>No-op.</summary>
    public void ReportDeployedServicePackageHealth(HealthInformation healthInformation)
    {
    }

    /// <summary>No-op.</summary>
    public void ReportDeployedServicePackageHealth(HealthInformation healthInfo, HealthReportSendOptions sendOptions)
    {
    }

    /// <summary>
    /// Marks the instance as disposed. The mock holds no resources, so there is nothing to
    /// release regardless of <paramref name="disposing"/>.
    /// </summary>
    protected virtual void Dispose(bool disposing)
    {
        if (!isDisposed)
        {
            // Nothing managed or unmanaged to release; just remember that Dispose has run.
            isDisposed = true;
        }
    }

    /// <summary>Disposes the instance by delegating to <see cref="Dispose(bool)"/>.</summary>
    public void Dispose() => Dispose(true);
}
}

Просмотреть файл

@ -0,0 +1,70 @@
<?xml version="1.0" encoding="utf-8" ?>
<Settings xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Section Name="RepairManagerConfiguration">
<!-- ***Overridable Parameters*** These must be set in ApplicationManifest.xml -->
<!-- Interval in seconds for how often FabricHealer wakes up and scans health states to schedule repairs. -->
<Parameter Name="HealthCheckIntervalInSeconds" Value="" MustOverride="true" />
<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableETW" Value="" MustOverride="true" />
<!-- Big Red Button: You can turn FabricHealer on and off with a versionless parameter-only application upgrade. -->
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
<!-- Default timeout for async SF API calls. -->
<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
<Parameter Name="AppInsightsInstrumentationKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsSharedKey" Value="" />
<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
</Section>
<!-- Repair Policies -->
<Section Name="AppRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="10, 01:00:00" />
</Section>
<Section Name="DiskRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<Section Name="FabricNodeRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<Section Name="ReplicaRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="10, 01:00:00" />
</Section>
<Section Name="SystemServiceRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<!-- Machine Repair. -->
<Section Name="MachineRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<!-- This prevents rules from over-scheduling repairs, which can be detrimental to cluster health.
Think of this as a guardrail that overrides faulty logic with respect to repair count.
The first entry of the setting below translates to: if 5 machine repairs have been scheduled in the last 2 hours,
do not schedule another one. -->
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00; 6, 24:00:00; 7, 48:00:00;" />
</Section>
</Settings>

Просмотреть файл

@ -28,13 +28,13 @@ namespace FabricHealer
{
internal static TelemetryUtilities TelemetryUtilities;
internal static RepairData RepairHistory;
internal static StatelessServiceContext ServiceContext;
// Folks often use their own version numbers. This is for internal diagnostic telemetry.
private const string InternalVersionNumber = "1.1.1.831";
private static FabricHealerManager singleton;
private static FabricClient _fabricClient;
private bool disposedValue;
private readonly StatelessServiceContext serviceContext;
private readonly RepairTaskManager repairTaskManager;
private readonly RepairTaskEngine repairTaskEngine;
private readonly Uri systemAppUri = new Uri(RepairConstants.SystemAppName);
@ -125,13 +125,13 @@ namespace FabricHealer
private FabricHealerManager(StatelessServiceContext context, CancellationToken token)
{
serviceContext = context;
ServiceContext = context;
Token = token;
serviceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent;
ServiceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent;
ConfigSettings = new ConfigSettings(context);
TelemetryUtilities = new TelemetryUtilities(context);
repairTaskEngine = new RepairTaskEngine();
repairTaskManager = new RepairTaskManager(serviceContext, Token);
repairTaskManager = new RepairTaskManager();
RepairLogger = new Logger(RepairConstants.FabricHealer, ConfigSettings.LocalLogPathParameter)
{
EnableVerboseLogging = ConfigSettings.EnableVerboseLogging
@ -184,7 +184,7 @@ namespace FabricHealer
bool isRmDeployed = true;
var healthReport = new HealthReport
{
NodeName = serviceContext.NodeContext.NodeName,
NodeName = ServiceContext.NodeContext.NodeName,
AppName = new Uri(RepairConstants.FabricHealerAppName),
EntityType = EntityType.Application,
HealthMessage = okMessage,
@ -225,7 +225,7 @@ namespace FabricHealer
private async Task<long> GetServiceInstanceCountAsync()
{
ServiceDescription serviceDesc =
await FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(serviceContext.ServiceName, ConfigSettings.AsyncTimeout, Token);
await FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(ServiceContext.ServiceName, ConfigSettings.AsyncTimeout, Token);
return (serviceDesc as StatelessServiceDescription).InstanceCount;
}
@ -239,8 +239,7 @@ namespace FabricHealer
/// <param name="parameterName">Name of the parameter.</param>
/// <param name="defaultValue">Default value.</param>
/// <returns>parameter value.</returns>
private static string GetSettingParameterValue(
StatelessServiceContext context,
internal static string GetSettingParameterValue(
string sectionName,
string parameterName,
string defaultValue = null)
@ -250,14 +249,9 @@ namespace FabricHealer
return null;
}
if (context == null)
{
return null;
}
try
{
var serviceConfiguration = context.CodePackageActivationContext.GetConfigurationPackageObject("Config");
var serviceConfiguration = ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
if (serviceConfiguration.Settings.Sections.All(sec => sec.Name != sectionName))
{
@ -339,7 +333,7 @@ namespace FabricHealer
{
try
{
using var telemetryEvents = new TelemetryEvents(serviceContext);
using var telemetryEvents = new TelemetryEvents(ServiceContext);
var fhData = GetFabricHealerInternalTelemetryData();
if (fhData != null)
@ -416,7 +410,7 @@ namespace FabricHealer
{
try
{
using var telemetryEvents = new TelemetryEvents(serviceContext);
using var telemetryEvents = new TelemetryEvents(ServiceContext);
var fhData = new FabricHealerCriticalErrorEventData
{
Source = nameof(FabricHealerManager),
@ -517,7 +511,7 @@ namespace FabricHealer
}
// Don't do anything if the orphaned repair was for a different node than this one:
if (_instanceCount == -1 && repairExecutorData.RepairData.NodeName != serviceContext.NodeContext.NodeName)
if (_instanceCount == -1 && repairExecutorData.RepairData.NodeName != ServiceContext.NodeContext.NodeName)
{
continue;
}
@ -584,6 +578,28 @@ namespace FabricHealer
{
await ClearExistingHealthReportsAsync();
ConfigSettings.UpdateConfigSettings(e.NewPackage.Settings);
ConfigurationSettings newSettings = e.NewPackage.Settings;
foreach (var section in newSettings.Sections)
{
RepairLogger.LogInfo($"NewPackage.Settings:{Environment.NewLine}{section.Name}{Environment.NewLine}");
foreach (var param in section.Parameters)
{
RepairLogger.LogInfo($"{param.Name}: {param.Value}{Environment.NewLine}");
}
}
ConfigurationSettings oldSettings = e.OldPackage.Settings;
foreach (var section in oldSettings.Sections)
{
RepairLogger.LogInfo($"OldPackage.Settings:{Environment.NewLine}{section.Name}{Environment.NewLine}");
foreach (var param in section.Parameters)
{
RepairLogger.LogInfo($"{param.Name}: {param.Value}{Environment.NewLine}");
}
}
}
/* Potential TODOs. This list should grow and external predicates should be written to support related workflow composition in logic rule file(s).
@ -615,11 +631,11 @@ namespace FabricHealer
// TOTHINK..
// Don't schedule/execute repairs if this node is in Error state. Error health state should mean that this node is not working properly or put into
// Error by some watchdog (most likely, if this code is even running...).
var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(serviceContext.NodeContext.NodeName);
/*var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(ServiceContext.NodeContext.NodeName);
if (nodeHealth.AggregatedHealthState == HealthState.Error)
{
return;
}
}*/
// Check cluster upgrade status. If the cluster is upgrading to a new version (or rolling back)
// then do not attempt any repairs.
@ -840,7 +856,7 @@ namespace FabricHealer
// Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node.
// This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case).
if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName)
if (_instanceCount == -1 && repairData.NodeName != ServiceContext.NodeContext.NodeName)
{
continue;
}
@ -943,7 +959,7 @@ namespace FabricHealer
}
// Don't restart thyself.
if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName)
if (repairData.ServiceName == ServiceContext.ServiceName.OriginalString && repairData.NodeName == ServiceContext.NodeContext.NodeName)
{
continue;
}
@ -984,7 +1000,8 @@ namespace FabricHealer
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
};
repairData.Property = evt.HealthInformation.Property;
string errOrWarn = "Error";
@ -1131,7 +1148,7 @@ namespace FabricHealer
// Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node.
// This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case).
if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName)
if (_instanceCount == -1 && repairData.NodeName != ServiceContext.NodeContext.NodeName)
{
continue;
}
@ -1236,7 +1253,7 @@ namespace FabricHealer
}
// Don't restart thyself.
if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName)
if (repairData.ServiceName == ServiceContext.ServiceName.OriginalString && repairData.NodeName == ServiceContext.NodeContext.NodeName)
{
continue;
}
@ -1305,7 +1322,8 @@ namespace FabricHealer
/* Start repair workflow */
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
};
repairData.Property = evt.HealthInformation.Property;
string errOrWarn = "Error";
@ -1341,7 +1359,7 @@ namespace FabricHealer
{
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"ProcessNodeHealthAsync::ClusterNotSupported",
$"ProcessNodeHealthAsync::ClusterSizeNotSupported",
"Machine/Fabric Node repair is not supported in clusters with 1 Fabric node.",
Token,
null,
@ -1390,7 +1408,7 @@ namespace FabricHealer
}
// Azure tenant/platform update in progress for the target node?
if (await UpgradeChecker.IsAzureUpdateInProgress(nodeType, node.NodeName, Token))
if (await UpgradeChecker.IsAzureUpdateInProgress(node.NodeName, Token))
{
string telemetryDescription = $"{node.NodeName} is down due to Infra repair job (UD = {nodeUD}). Will not schedule another machine repair at this time.";
@ -1458,38 +1476,18 @@ namespace FabricHealer
continue;
}
// TOTHINK...
// If there are multiple instances of FH deployed to the cluster (like -1 InstanceCount), then don't do machine repairs if this instance of FH
// detects a need to do so. Another instance on a different node will take the job. Only DiskObserver-generated repair data has to be done on the node
// where FO's DiskObserver emitted the related information, for example (like Disk space issues and the need to clean specified (in logic rules) folders).
if ((_instanceCount == -1 || _instanceCount > 2) && node.NodeName == serviceContext.NodeContext.NodeName)
{
continue;
}
// Make sure that there is not already an Infra repair in progress for the target node.
var currentISRepairs =
await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.InfraTaskIdPrefix, Token);
if (currentISRepairs?.Count > 0)
if (await repairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token))
{
// Description for Infra repairs created by FH will always have this form: Machine_Repair_[node type]_[node name] (e.g., Machine_Repair_SFRole0__SFRole0_3).
// TaskId would also work here as FH creates the repair task with an ID that will always have the structure FH/[guid]/[repair action]/[node name]
// (e.g., FH/d0d25d5b-7f36-49cf-aac4-b9d6bfd07e12/System.Reboot/_SFRole0_3)
if (currentISRepairs.Any(r => r.TaskId.EndsWith(node.NodeName) || r.Description.EndsWith(node.NodeName)))
{
var repair = currentISRepairs.FirstOrDefault(r => r.TaskId.EndsWith(node.NodeName) || r.Description.EndsWith(node.NodeName));
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"{node.NodeName}_MachineRepairAlreadyInProgress",
$"There is currently a Machine repair in progress for node {node.NodeName}.",
Token,
null,
ConfigSettings.EnableVerboseLogging);
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"{node.NodeName}_MachineRepairAlreadyInProgress",
$"There is currently a Machine repair in progress for node {node.NodeName}. Repair State: {repair.State}.",
Token,
null,
ConfigSettings.EnableVerboseLogging);
continue;
}
continue;
}
// Get repair rules for supplied facts (TelemetryData).
@ -1502,10 +1500,11 @@ namespace FabricHealer
/* Start repair workflow */
string repairId = $"Machine_Repair_{nodeType}_{repairData.NodeName}";
string repairId = $"MachineRepair_{nodeType}_{repairData.NodeName}";
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.InfraTaskIdPrefix
};
repairData.Property = evt.HealthInformation.Property;
string errOrWarn = "Error";
@ -1536,7 +1535,7 @@ namespace FabricHealer
private async Task ProcessDiskHealthAsync(HealthEvent evt, TelemetryData repairData)
{
// Can only repair local disks.
if (repairData.NodeName != serviceContext.NodeContext.NodeName)
if (repairData.NodeName != ServiceContext.NodeContext.NodeName)
{
return;
}
@ -1554,7 +1553,8 @@ namespace FabricHealer
string repairId = $"Disk_Repair_{repairData.Code}{repairData.NodeName}_DeleteFiles";
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
};
repairData.Property = evt.HealthInformation.Property;
string errOrWarn = "Error";
@ -1612,7 +1612,8 @@ namespace FabricHealer
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
};
repairData.Property = repairData.Property;
string errOrWarn = "Error";
@ -1747,7 +1748,8 @@ namespace FabricHealer
string repairId = $"{nodeName}_{serviceHealth.ServiceName.OriginalString.Remove(0, appName.Length + 1)}_{repairData.PartitionId}";
repairData.RepairPolicy = new RepairPolicy
{
RepairId = repairId
RepairId = repairId,
RepairIdPrefix = RepairTaskEngine.FHTaskIdPrefix
};
repairData.Property = healthEvent.HealthInformation.Property;
@ -1909,7 +1911,7 @@ namespace FabricHealer
break;
// Disk repair.
case EntityType.Disk when serviceContext.NodeContext.NodeName == repairData.NodeName:
case EntityType.Disk when ServiceContext.NodeContext.NodeName == repairData.NodeName:
repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
break;
@ -1918,7 +1920,7 @@ namespace FabricHealer
repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
break;
// Fabric Node repair (from FabricHealerProxy, for example, where there is no concept of Observer).
// Fabric Node repair.
case EntityType.Node:
repairPolicySectionName = RepairConstants.FabricNodeRepairPolicySectionName;
break;
@ -1934,11 +1936,10 @@ namespace FabricHealer
{
try
{
string logicRulesConfigFileName =
GetSettingParameterValue(serviceContext, repairPolicySectionName, RepairConstants.LogicRulesConfigurationFile);
var configPath = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path;
var rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName);
var rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName);
string logicRulesConfigFileName = GetSettingParameterValue(repairPolicySectionName, RepairConstants.LogicRulesConfigurationFile);
string configPath = ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path;
string rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName);
string rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName);
if (!File.Exists(rulesFilePath))
{
@ -1963,7 +1964,7 @@ namespace FabricHealer
private int GetEnabledRepairRuleCount()
{
var config = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
var config = ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
int count = 0;
foreach (var section in config.Settings.Sections)
@ -2104,7 +2105,7 @@ namespace FabricHealer
var healthReport = new HealthReport
{
HealthMessage = "Clearing existing health reports as FabricHealer is stopping or updating.",
NodeName = serviceContext.NodeContext.NodeName,
NodeName = ServiceContext.NodeContext.NodeName,
State = HealthState.Ok,
HealthReportTimeToLive = TimeSpan.FromMinutes(5),
};
@ -2124,7 +2125,7 @@ namespace FabricHealer
Thread.Sleep(50);
}
var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(serviceContext.NodeContext.NodeName);
var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(ServiceContext.NodeContext.NodeName);
var FHNodeEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(RepairConstants.FabricHealer));
foreach (HealthEvent evt in FHNodeEvents)

Просмотреть файл

@ -1,4 +1,4 @@
## Logic rules for scheduling Machine-level repair jobs in the cluster. Only OS reboot is supported today for VMSS-managed clusters.
## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is Machine.
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for these types of Repair Jobs.
## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself.
@ -10,7 +10,7 @@
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS from which the data was collected (Linux or Windows)|
## | OS | The name of the OS where FabricHealer is running (Linux or Windows) |
## | HealthState | The HealthState of the target entity: Error or Warning |
## Metric Names, from FO or FHProxy.
@ -25,69 +25,63 @@
## | FileHandles (Linux-only) |
## | FileHandlesPercent (Linux-only)|
## Supported infrastructure repair action names.
## These are supplied to RM with IS as executor.
## | Name |
## |-------------------------------------------|
## | System.Reboot |
## | System.ReimageOS |
## | System.FullReimage |
## If this is what you need, then first check if we are inside the specified run interval for scheduling *any* machine-level repair for any reason.
## Ending with a cut (!) means the goal (Mitigate) has been satisfied and Guan will immediately stop processing rules.
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
## TimeScopedScheduleReboot is an internal predicate to check for the number of times a Machine reboot repair has run to completion within a supplied time window.
## If Completed Machine Repair count is less than the supplied value, then schedule an infrastructure repair via ScheduleMachineRepair predicate.
TimeScopedScheduleReboot(?count, ?time) :- GetRepairHistory(?repairCount, ?time), ?repairCount < ?count,
ScheduleMachineRepair("System.Reboot", MaxOutstandingRepairs=2, ProbationWaitDurationPost=00:30:00).
## TimeScopedScheduleRepair is an internal predicate to check for the number of times the specified machine repair action has run to completion within a supplied time window.
## If the completed machine repair count is less than the supplied value, then schedule an infrastructure repair via ScheduleMachineRepair predicate.
TimeScopedScheduleRepair(?count, ?time, ?repairAction) :- GetRepairHistory(?repairCount, ?time), ?repairCount < ?count, ScheduleMachineRepair(?repairAction).
## Metric-defined machine repair scheduling.
## Percent Memory in Use (of total physical).
Mitigate(MetricName=MemoryPercent, MetricValue=?MetricValue) :- ?MetricValue >= 95,
GetHealthEventHistory(?HealthEventCount, 00:15:00), ?HealthEventCount >= 3,
TimeScopedScheduleReboot(4, 08:00:00).
TimeScopedScheduleRepair(4, 08:00:00, System.Reboot).
## File Handles/FDs. Linux-only.
## Percent Allocated, System-wide.
Mitigate(MetricName=FileHandlesPercent, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 95,
GetHealthEventHistory(?HealthEventCount, 00:15:00), ?HealthEventCount >= 3,
TimeScopedScheduleReboot(2, 08:00:00).
TimeScopedScheduleRepair(2, 08:00:00, System.Reboot).
## Total Allocated, System-wide.
Mitigate(MetricName=FileHandles, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 1000000,
GetHealthEventHistory(?HealthEventCount, 00:15:00), ?HealthEventCount >= 3,
TimeScopedScheduleReboot(2, 08:00:00).
TimeScopedScheduleRepair(2, 08:00:00, System.Reboot).
## Non-Metric-defined machine repair scheduling. This means FH will have done the work to figure out if some node is in Error (doesn't matter if FO or FHProxy generated the health event).
## So, in the below set of logic rules, the idea is if some node is in Error, try and repair it via 1. Reboot or 2. Reimage (if Reboot doesn't work) or 3. Heal (if Reimage doesn't work).
## Unstructured machine repair scheduling.
## This means FabricHealerManager provides the facts used here based on HealthEvent store data (not by deserializing a data type provided by FO or FHProxy).
## Don't schedule any machine repair if one was scheduled less than 10 minutes ago. If this rule succeeds, Guan will immediately stop processing rules (! to make it explicit).
Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
## Top level rule for post-probation check. FH knows the target node and other repair facts before this rule is executed. So, if any successful machine repair for the target node
## is still in post-repair probation (specified as 30 minutes below, which means 30 minutes after the related SF repair job completed, according to RM), then stop processing rules.
Mitigate() :- CheckInsideProbationPeriod(00:30:00), !.
## If target node is not in Error, then cut.
## Don't proceed if the target entity is not in Error.
Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
## TODO: Work on this... Have to account for up/down node states. This will require state.
## ProbationWaitDurationPre (minimum time-in-error) rule. If this rule succeeds, Guan will immediately stop processing rules.
Mitigate() :- GetEntityHealthStateDuration(?duration, Entity=Machine, HealthState=Error), ?duration <= 02:00:00, !.
## Don't proceed if there are 2 machine repairs currently active in the cluster.
Mitigate() :- CheckOutstandingRepairs(2), !.
## TODO: MaxOutstandingRepairs should be pulled out and made a predicate, CheckOutstandingRepairs(MaxRepairs=2), !...
## Reboot. (Supported in VMSS-managed clusters.)
Mitigate() :- GetRepairHistory(?repairCount, 04:00:00, "System.Reboot"), ?repairCount < 2, ScheduleMachineRepair("System.Reboot", MaxOutstandingRepairs=2).
## Don't schedule a repair if one was scheduled less than 10 minutes ago. Do we want this to account for all repairs (not just FH-scheduled)?
Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
## Don't proceed if a node in the cluster is still in post-repair probation (so, a Completed repair, node is recovering).
Mitigate() :- CheckInsideProbationPeriod(00:30:00), !.
## TOTHINK/TODO: Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours. Note: This is not implemented correctly in FH.
## Mitigate() :- GetEntityHealthStateDuration(?duration, Entity=Machine, HealthState=Error), ?duration <= 02:00:00, !.
## Reboot. Supported in VMSS-managed clusters. Schedule a reboot if the target node hasn't already been rebooted twice in the last 4 hours.
## Don't process any other rules if scheduling succeeds or fails (note the !) and there are less than 2 of these repairs that have completed in the last 4 hours.
Mitigate() :- GetRepairHistory(?repairCount, 04:00:00, System.Reboot), ?repairCount < 2, !, ScheduleMachineRepair(System.Reboot).
## Escalations.
## Reimage. (Not supported in VMSS-managed clusters.)
Mitigate(HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00, "System.Reboot"), ?repairCount >= 2, ScheduleMachineRepair("System.ReimageOS", MaxOutstandingRepairs=2).
## Reimage. Not supported in VMSS-managed clusters. Schedule a reimage if the target node has already been rebooted twice in the last 4 hours.
## Don't process any other rules if scheduling succeeds or fails (note the !) and there are less than 2 of these repairs that have completed in the last 4 hours.
Mitigate() :- GetRepairHistory(?repairCount, 04:00:00, System.ReimageOS), ?repairCount < 2, !, ScheduleMachineRepair(System.ReimageOS).
## Heal. (Supported in VMSS-managed clusters.)
Mitigate() :- GetRepairHistory(?repairCount, 04:00:00, "System.ReimageOS"), ?repairCount >= 2, ScheduleMachineRepair("System.Azure.Heal", MaxOutstandingRepairs=2).
## Heal. Supported in VMSS-managed clusters. Schedule a heal if the target node has already been reimaged twice in the last 4 hours.
## Don't process any other rules if scheduling succeeds or fails (note the !) and there are less than 2 of these repairs that have completed in the last 4 hours.
Mitigate() :- GetRepairHistory(?repairCount, 04:00:00, System.Azure.Heal), ?repairCount < 2, !, ScheduleMachineRepair(System.Azure.Heal).
## If we end up here, then human intervention is required (Triage).

Просмотреть файл

@ -30,32 +30,41 @@
<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
</Section>
<!-- Repair Policies - Overridable Parameters. Must be set in ApplicationManifest.xml. -->
<!-- Repair Policies -->
<Section Name="AppRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="10, 01:00:00" />
</Section>
<Section Name="DiskRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<Section Name="FabricNodeRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<Section Name="ReplicaRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="10, 01:00:00" />
</Section>
<Section Name="SystemServiceRepairPolicy">
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
</Section>
<!-- Machine Repair. -->
<Section Name="MachineRepairPolicy">
<Parameter Name="MinimumHealthyDurationInSeconds" Value="300" />
<Parameter Name="Enabled" Value="" MustOverride="true" />
<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
<!-- This prevents rules from over-scheduling repairs, which can be detrimental to cluster health.
Think of this as a guardrail that overrides faulty logic with respect to repair count.
The below setting translates to: if 5 machine repairs have been scheduled in the last 2 hours, do not schedule
another one. -->
<Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00; 6, 24:00:00; 7, 48:00:00;" />
</Section>
</Settings>

Просмотреть файл

@ -22,9 +22,17 @@ namespace FabricHealer.Repair
{
public static async Task<bool> IsRepairTaskInDesiredStateAsync(
string taskId,
IList<RepairTaskState> desiredStates)
IList<RepairTaskState> desiredStates,
CancellationToken cancellationToken)
{
IList<RepairTask> repairTaskList = await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(taskId, RepairTaskStateFilter.All, null);
IList<RepairTask> repairTaskList =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
taskId,
RepairTaskStateFilter.All,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
return desiredStates.Any(desiredState => repairTaskList.Count(rt => rt.State == desiredState) > 0);
}
@ -259,14 +267,13 @@ namespace FabricHealer.Repair
public static async Task<bool> IsLastCompletedFHRepairTaskWithinTimeRangeAsync(
TimeSpan interval,
TelemetryData repairData,
string taskIdPrefix,
CancellationToken cancellationToken)
{
var allRecentFHRepairTasksCompleted =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
RepairTaskEngine.FHTaskIdPrefix,
repairData.RepairPolicy.RepairIdPrefix,
RepairTaskStateFilter.Completed,
taskIdPrefix,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
@ -277,7 +284,7 @@ namespace FabricHealer.Repair
var orderedRepairList = allRecentFHRepairTasksCompleted.OrderByDescending(o => o.CompletedTimestamp).ToList();
if (taskIdPrefix == RepairTaskEngine.FHTaskIdPrefix)
if (repairData.RepairPolicy.RepairIdPrefix == RepairTaskEngine.FHTaskIdPrefix)
{
var completedFHRepairs = orderedRepairList.Where(
r => r.ResultStatus == RepairTaskResult.Succeeded && r.ExecutorData.Contains(repairData.RepairPolicy.RepairId));
@ -290,7 +297,7 @@ namespace FabricHealer.Repair
}
}
}
else if (taskIdPrefix == RepairTaskEngine.InfraTaskIdPrefix)
else if (repairData.RepairPolicy.RepairIdPrefix == RepairTaskEngine.InfraTaskIdPrefix)
{
var completedInfraRepairs = orderedRepairList.Where(r => r.ResultStatus == RepairTaskResult.Succeeded && r.Description == repairData.RepairPolicy.RepairId);
@ -308,12 +315,12 @@ namespace FabricHealer.Repair
public static async Task<bool> IsLastScheduledRepairJobWithinTimeRangeAsync(
TimeSpan interval,
string TaskIdPrefix,
TelemetryData repairData,
CancellationToken cancellationToken)
{
var allCurrentFHRepairTasks =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
TaskIdPrefix,
repairData.RepairPolicy.RepairIdPrefix,
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
@ -345,12 +352,11 @@ namespace FabricHealer.Repair
public static async Task<int> GetCompletedRepairCountWithinTimeRangeAsync(
TimeSpan timeWindow,
TelemetryData repairData,
CancellationToken cancellationToken,
string repairAction = null)
CancellationToken cancellationToken)
{
var allRecentFHRepairTasksCompleted =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
RepairTaskEngine.FHTaskIdPrefix,
repairData.RepairPolicy.RepairIdPrefix,
RepairTaskStateFilter.Completed,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
@ -372,6 +378,68 @@ namespace FabricHealer.Repair
continue;
}
// Non-Machine repairs scheduled and executed by FH.
if (repair.Executor == RepairConstants.FabricHealer)
{
var fhExecutorData = JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData) ? exData : null;
if (fhExecutorData == null || fhExecutorData.RepairData?.RepairPolicy == null)
{
continue;
}
if (repairData.RepairPolicy.RepairId != fhExecutorData.RepairData.RepairPolicy.RepairId)
{
continue;
}
if (DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= timeWindow)
{
count++;
}
}
// Machine repairs scheduled by FH.
else if (repairData.RepairPolicy.InfrastructureRepairName == repair.Action)
{
if (DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= timeWindow)
{
count++;
}
}
}
return count;
}
public static async Task<int> GetCreatedRepairCountWithinTimeRangeAsync(
TimeSpan timeWindow,
TelemetryData repairData,
CancellationToken cancellationToken)
{
var allRecentFHRepairTasksCreated =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
null,
RepairTaskStateFilter.Created,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (allRecentFHRepairTasksCreated?.Count == 0)
{
return 0;
}
int count = 0;
foreach (RepairTask repair in allRecentFHRepairTasksCreated)
{
cancellationToken.ThrowIfCancellationRequested();
if (repair.CreatedTimestamp == null || !repair.CreatedTimestamp.HasValue)
{
continue;
}
// Non-Machine repairs (FH is executor, custom repair ExecutorData supplied by FH.)
if (repair.Executor == RepairConstants.FabricHealer)
{
@ -387,44 +455,46 @@ namespace FabricHealer.Repair
continue;
}
// Note: Completed aborted/cancelled repair tasks should not block repairs if they are inside run interval.
if (DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= timeWindow
&& repair.Flags != RepairTaskFlags.CancelRequested && repair.Flags != RepairTaskFlags.AbortRequested)
if (DateTime.UtcNow.Subtract(repair.CreatedTimestamp.Value) <= timeWindow)
{
count++;
}
}
// Machine repairs (IS is executor, ExecutorData supplied by IS. Custom FH repair id supplied as repair Description.)
else if (repairData.RepairPolicy.RepairId != repair.Description)
// Machine/other source repairs.
else if (DateTime.UtcNow.Subtract(repair.CreatedTimestamp.Value) <= timeWindow)
{
// Repair action string supplied.
if (!string.IsNullOrWhiteSpace(repairAction) && repairData.RepairPolicy.InfrastructureRepairName == repairAction)
{
if (DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= timeWindow
&& repair.Flags != RepairTaskFlags.CancelRequested && repair.Flags != RepairTaskFlags.AbortRequested)
{
count++;
}
}
count++;
}
}
return count;
}
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(EntityType entityType, string entityFilter, HealthState state, CancellationToken token)
/// <summary>
/// Returns the amount of time the target entity (application, node, etc) has been in the specified health state.
/// </summary>
/// <param name="entityType">EntityType</param>
/// <param name="nameOrIdFilter">String representation of the target entity's name or ID (e.g., application name or node name or partition id)</param>
/// <param name="healthState">Target HealthState to match.</param>
/// <param name="token">CancellationToken</param>
/// <returns></returns>
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(
EntityType entityType,
string nameOrIdFilter,
HealthState healthState,
CancellationToken token)
{
HealthEventsFilter healthEventsFilter = new HealthEventsFilter();
if (state == HealthState.Warning)
if (healthState == HealthState.Warning)
{
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Warning;
}
else if (state == HealthState.Error)
else if (healthState == HealthState.Error)
{
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Error;
}
else if (state == HealthState.Ok)
else if (healthState == HealthState.Ok)
{
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Ok;
}
@ -436,81 +506,221 @@ namespace FabricHealer.Repair
switch (entityType)
{
case EntityType.Application:
break;
case EntityType.Service:
break;
case EntityType.Machine:
case EntityType.Node:
var queryDesc = new NodeHealthQueryDescription(entityFilter)
var appqueryDesc = new ApplicationHealthQueryDescription(new Uri(nameOrIdFilter))
{
EventsFilter = healthEventsFilter
};
var nodeHealthList =
await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
queryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
if (nodeHealthList == null || nodeHealthList.HealthEvents.Count == 0)
try
{
var appHealth =
await FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
appqueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
if (appHealth == null || appHealth.HealthEvents.Count == 0)
{
return TimeSpan.MinValue;
}
// How many times has the entity transitioned to Error health state in the last hour?
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
var appHealthErrorEvents =
appHealth.HealthEvents.Where(
evt => DateTime.UtcNow.Subtract(
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
o => o.LastErrorTransitionAt);
int errorCount = appHealthErrorEvents.Count();
if (errorCount > 1)
{
return DateTime.UtcNow.Subtract(appHealthErrorEvents.First().LastErrorTransitionAt);
}
}
var appHealthEvents = appHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
// return the time since the last health event was issued, as a TimeSpan.
return DateTime.UtcNow.Subtract(appHealthEvents.First().SourceUtcTimestamp);
}
catch (FabricException)
{
return TimeSpan.MinValue;
}
foreach (var nodeHealthEvent in nodeHealthList.HealthEvents)
case EntityType.Partition:
var partitionqueryDesc = new PartitionHealthQueryDescription(Guid.Parse(nameOrIdFilter))
{
if (nodeHealthEvent.IsExpired)
EventsFilter = healthEventsFilter
};
try
{
var partitionHealth =
await FabricHealerManager.FabricClientSingleton.HealthManager.GetPartitionHealthAsync(
partitionqueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
if (partitionHealth == null || partitionHealth.HealthEvents.Count == 0)
{
continue;
return TimeSpan.MinValue;
}
return DateTime.UtcNow.Subtract(nodeHealthEvent.SourceUtcTimestamp);
// How many times has the entity transitioned to Error health state in the last hour?
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
var partitionHealthErrorEvents =
partitionHealth.HealthEvents.Where(
evt => DateTime.UtcNow.Subtract(
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
o => o.LastErrorTransitionAt);
int errorCount = partitionHealthErrorEvents.Count();
if (errorCount > 1)
{
return DateTime.UtcNow.Subtract(partitionHealthErrorEvents.First().LastErrorTransitionAt);
}
}
var partitionHealthEvents = partitionHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(partitionHealthEvents.First().SourceUtcTimestamp);
}
catch (FabricException)
{
return TimeSpan.MinValue;
}
break;
case EntityType.Service:
var servicequeryDesc = new ServiceHealthQueryDescription(new Uri(nameOrIdFilter))
{
EventsFilter = healthEventsFilter
};
try
{
var serviceHealth =
await FabricHealerManager.FabricClientSingleton.HealthManager.GetServiceHealthAsync(
servicequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
if (serviceHealth == null || serviceHealth.HealthEvents.Count == 0)
{
return TimeSpan.MinValue;
}
// How many times has the entity transitioned to Error health state in the last hour?
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
var serviceHealthErrorEvents =
serviceHealth.HealthEvents.Where(
evt => DateTime.UtcNow.Subtract(
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
o => o.LastErrorTransitionAt);
int errorCount = serviceHealthErrorEvents.Count();
if (errorCount > 1)
{
return DateTime.UtcNow.Subtract(serviceHealthErrorEvents.First().LastErrorTransitionAt);
}
}
var serviceHealthEvents = serviceHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(serviceHealthEvents.First().SourceUtcTimestamp);
}
catch (FabricException)
{
return TimeSpan.MinValue;
}
case EntityType.Disk:
case EntityType.Machine:
case EntityType.Node:
var nodequeryDesc = new NodeHealthQueryDescription(nameOrIdFilter)
{
EventsFilter = healthEventsFilter
};
try
{
var nodeHealth =
await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
nodequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
{
return TimeSpan.MinValue;
}
// How many times has the entity transitioned to Error health state in the last hour?
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
{
var nodeHealthErrorEvents =
nodeHealth.HealthEvents.Where(
evt => DateTime.UtcNow.Subtract(
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(2)).OrderByDescending(
o => o.LastErrorTransitionAt);
int errorCount = nodeHealthErrorEvents.Count();
if (errorCount > 1)
{
return DateTime.UtcNow.Subtract(nodeHealthErrorEvents.First().LastErrorTransitionAt);
}
}
var nodeHealthEvents = nodeHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
return DateTime.UtcNow.Subtract(nodeHealthEvents.First().SourceUtcTimestamp);
}
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TaskCanceledException || e is TimeoutException)
{
string message = $"Unable to get {healthState} health state duration for {entityType}: {e.Message}";
FabricHealerManager.RepairLogger.LogWarning(message);
return TimeSpan.MinValue;
}
default:
return TimeSpan.MinValue;
}
return TimeSpan.MinValue;
}
internal static async Task<bool> IsRepairInPostProbationAsync(
TimeSpan probationPeriod,
string taskPrefixId,
TelemetryData repairData,
CancellationToken cancellationToken)
// TOTHINK: Should this look at any repair and apply a probation to it (so, not just FH-scheduled/executed repairs).
// This mainly makes sense for node-level repairs (machine).
internal static async Task<bool> IsRepairInPostProbationAsync(TimeSpan probationPeriod, TelemetryData repairData, CancellationToken cancellationToken)
{
var allCurrentFHRepairTasks =
var allCompletedFHRepairTasks =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
taskPrefixId,
null, // no prefix filter..
RepairTaskStateFilter.Completed,
null,
null, // no executor name filter..
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (allCurrentFHRepairTasks == null || allCurrentFHRepairTasks.Count == 0)
if (allCompletedFHRepairTasks == null || allCompletedFHRepairTasks.Count == 0)
{
return false;
}
var orderedRepairList = allCurrentFHRepairTasks.OrderByDescending(o => o.CompletedTimestamp).ToList();
var orderedRepairList = allCompletedFHRepairTasks.OrderByDescending(o => o.CompletedTimestamp).ToList();
foreach (var repair in orderedRepairList)
{
if (repair.Description != repairData.RepairPolicy.RepairId)
{
continue;
}
if (repair.CompletedTimestamp == null)
{
continue;
}
if (repair.CompletedTimestamp.HasValue &&
DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= probationPeriod)
DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) < probationPeriod)
{
return true;
}

Просмотреть файл

@ -1,4 +1,9 @@
using System;
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System;
using Guan.Logic;
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
@ -37,12 +42,7 @@ namespace FabricHealer.Repair.Guan
return false;
}
bool insideProbationPeriod =
await FabricRepairTasks.IsRepairInPostProbationAsync(
interval,
RepairData.EntityType == EntityType.Machine ? RepairTaskEngine.InfraTaskIdPrefix : RepairTaskEngine.FHTaskIdPrefix,
RepairData,
FabricHealerManager.Token);
bool insideProbationPeriod = await FabricRepairTasks.IsRepairInPostProbationAsync(interval, RepairData, FabricHealerManager.Token);
if (!insideProbationPeriod)
{
@ -50,7 +50,7 @@ namespace FabricHealer.Repair.Guan
}
string message = $"FH repair job {RepairData.RepairPolicy.RepairId} is currently in post-repair health probation ({interval}). " +
$"Will not schedule another repair for the target {RepairData.RepairPolicy} at this time.";
$"Will not schedule another machine repair for {RepairData.NodeName} at this time.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,

Просмотреть файл

@ -47,7 +47,6 @@ namespace FabricHealer.Repair.Guan
await FabricRepairTasks.IsLastCompletedFHRepairTaskWithinTimeRangeAsync(
interval,
RepairData,
RepairData.EntityType == EntityType.Machine ? RepairTaskEngine.InfraTaskIdPrefix : RepairTaskEngine.FHTaskIdPrefix,
FabricHealerManager.Token);
if (!insideRunInterval)

Просмотреть файл

@ -45,7 +45,7 @@ namespace FabricHealer.Repair.Guan
bool insideScheduleInterval =
await FabricRepairTasks.IsLastScheduledRepairJobWithinTimeRangeAsync(
interval,
RepairData.EntityType == EntityType.Machine ? RepairTaskEngine.InfraTaskIdPrefix : RepairTaskEngine.FHTaskIdPrefix,
RepairData,
FabricHealerManager.Token);
if (!insideScheduleInterval)

Просмотреть файл

@ -0,0 +1,80 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using Guan.Logic;
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
using System.Threading.Tasks;
namespace FabricHealer.Repair.Guan
{
    /// <summary>
    /// Guan boolean predicate that reports whether the number of outstanding repairs carrying the
    /// current repair's task id prefix has reached a caller-supplied maximum.
    /// Rule usage takes exactly one numeric argument: the maximum number of outstanding repairs.
    /// </summary>
    public class CheckOutstandingRepairsPredicateType : PredicateType
    {
        private static CheckOutstandingRepairsPredicateType Instance;

        // Repair context for the rule currently being evaluated; replaced on every Singleton() call.
        private static TelemetryData RepairData;

        private class Resolver : BooleanPredicateResolver
        {
            public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                    : base(input, constraint, context)
            {
            }

            /// <summary>
            /// Evaluates the predicate.
            /// </summary>
            /// <returns>
            /// true when the outstanding repair count is at or above the supplied maximum (an Info
            /// telemetry event is emitted in that case); false when the maximum is 0 or the count is below it.
            /// </returns>
            /// <exception cref="GuanException">No argument was supplied or the argument is not a long.</exception>
            protected override async Task<bool> CheckAsync()
            {
                // Pattern match instead of GetType() comparison: the term is evaluated once and a null
                // object value results in the GuanException below rather than a NullReferenceException.
                if (Input.Arguments.Count == 0
                    || !(Input.Arguments[0].Value.GetEffectiveTerm().GetObjectValue() is long maxRepairs))
                {
                    throw new GuanException("CheckOutstandingRepairs: One argument is required and it must be a number (int).");
                }

                // A maximum of 0 never reports the limit as reached.
                if (maxRepairs == 0)
                {
                    return false;
                }

                RepairData.RepairPolicy.MaxConcurrentRepairs = maxRepairs;

                // Only create the engine once the rule arguments have been validated.
                var repairTaskEngine = new RepairTaskEngine();
                int outstandingRepairCount =
                    await repairTaskEngine.GetOutstandingFHRepairCount(taskIdPrefix: RepairData.RepairPolicy.RepairIdPrefix, FabricHealerManager.Token);

                if (outstandingRepairCount >= maxRepairs)
                {
                    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                            LogLevel.Info,
                            "CheckOutstandingRepairs::MaxOustanding",
                            $"The number of outstanding machine repairs is currently at the specified limit ({maxRepairs}). " +
                            $"Will not schedule any other machine repairs at this time.",
                            FabricHealerManager.Token);

                    return true;
                }

                return false;
            }
        }

        /// <summary>
        /// Returns the shared predicate instance, creating it on first use, and stores the supplied
        /// repair data as the context for subsequent evaluations.
        /// </summary>
        /// <param name="name">Predicate name passed to the base PredicateType on first creation.</param>
        /// <param name="repairData">Repair context read by the resolver.</param>
        public static CheckOutstandingRepairsPredicateType Singleton(string name, TelemetryData repairData)
        {
            RepairData = repairData;
            return Instance ??= new CheckOutstandingRepairsPredicateType(name);
        }

        private CheckOutstandingRepairsPredicateType(string name)
                 : base(name, true, 1)
        {
        }

        public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
        {
            return new Resolver(input, constraint, context);
        }
    }
}

Просмотреть файл

@ -30,7 +30,7 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync()
{
// Can only delete files on the same VM where the FH instance that took the job is running.
if (RepairData.NodeName != RepairTaskManager.Context.NodeContext.NodeName)
if (RepairData.NodeName != FabricHealerManager.ServiceContext.NodeContext.NodeName)
{
return false;
}

Просмотреть файл

@ -43,8 +43,31 @@ namespace FabricHealer.Repair.Guan
{
throw new GuanException("The third argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
}
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.NodeName, state, FabricHealerManager.Token);
switch (entityType)
{
case EntityType.Application:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ApplicationName, state, FabricHealerManager.Token);
break;
case EntityType.Disk:
case EntityType.Machine:
case EntityType.Node:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.NodeName, state, FabricHealerManager.Token);
break;
case EntityType.Partition:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.PartitionId.ToString(), state, FabricHealerManager.Token);
break;
case EntityType.Service:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ServiceName, state, FabricHealerManager.Token);
break;
default:
throw new GuanException($"Unsupported entity type: {entityType}");
}
var result = new CompoundTerm(this.Input.Functor);
result.AddArgument(new Constant(duration), "0");

Просмотреть файл

@ -27,7 +27,6 @@ namespace FabricHealer.Repair.Guan
{
long repairCount;
TimeSpan timeWindow = TimeSpan.MinValue;
string repairAction = null;
long args = Input.Arguments.Count;
for (int i = 1; i < args; i++)
@ -37,11 +36,18 @@ namespace FabricHealer.Repair.Guan
switch (typeString)
{
case "TimeSpan":
timeWindow = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
break;
// This only makes sense for Machine-level repair rules, where you can specify any string for machine repair action that is supported in your SF configuration.
// Otherwise, FH already knows what you mean with GetRepairHistory([TimeSpan value]), given the repair context (which FH creates).
case "String":
repairAction = Input.Arguments[i].Value.GetEffectiveTerm().GetStringValue();
if (RepairData.EntityType == EntityType.Machine)
{
RepairData.RepairPolicy.InfrastructureRepairName = Input.Arguments[i].Value.GetEffectiveTerm().GetStringValue();
}
break;
default:
@ -53,7 +59,7 @@ namespace FabricHealer.Repair.Guan
if (timeWindow > TimeSpan.MinValue)
{
repairCount =
await FabricRepairTasks.GetCompletedRepairCountWithinTimeRangeAsync(timeWindow, RepairData, FabricHealerManager.Token, repairAction);
await FabricRepairTasks.GetCompletedRepairCountWithinTimeRangeAsync(timeWindow, RepairData, FabricHealerManager.Token);
}
else
{

Просмотреть файл

@ -28,7 +28,7 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync()
{
// Can only kill processes on the same node where the FH instance that took the job is running.
if (RepairData.NodeName != RepairTaskManager.Context.NodeContext.NodeName)
if (RepairData.NodeName != FabricHealerManager.ServiceContext.NodeContext.NodeName)
{
return false;
}

Просмотреть файл

@ -53,31 +53,20 @@ namespace FabricHealer.Repair.Guan
RepairData.RepairPolicy.InfrastructureRepairName = repairAction;
break;
case "TimeSpan":
RepairData.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
break;
case "Boolean":
RepairData.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
break;
// Guan logic defaults to long for numeric types.
case "Int64":
RepairData.RepairPolicy.MaxConcurrentRepairs = (long)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
break;
default:
throw new GuanException(
"Failure in ScheduleMachineRepairPredicateType. Unsupported argument type specified: " +
"Failure in ScheduleMachineRepair. Unsupported argument type specified: " +
$"{Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue().GetType().Name}{Environment.NewLine}" +
$"Only String, TimeSpan, Boolean and Int32/64 argument types are supported by this predicate.");
$"Only String and Boolean argument types are supported by this predicate.");
}
}
bool isRepairAlreadyInProgress =
await repairTaskEngine.IsRepairInProgressAsync(
RepairData.EntityType == EntityType.Machine ? RepairTaskEngine.InfraTaskIdPrefix : RepairTaskEngine.FHTaskIdPrefix,
RepairData,
FabricHealerManager.Token);
await repairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(RepairData, FabricHealerManager.Token);
if (isRepairAlreadyInProgress)
{
@ -86,32 +75,15 @@ namespace FabricHealer.Repair.Guan
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"ScheduleMachineRepairPredicateType::{RepairData.RepairPolicy.RepairId}",
$"ScheduleMachineRepair::{RepairData.RepairPolicy.InfrastructureRepairName}",
message,
FabricHealerManager.Token);
return false;
}
int outstandingRepairCount =
await repairTaskEngine.GetOutstandingRepairCount(taskIdPrefix: RepairTaskEngine.InfraTaskIdPrefix, FabricHealerManager.Token);
if (RepairData.RepairPolicy.MaxConcurrentRepairs > 0 && outstandingRepairCount >= RepairData.RepairPolicy.MaxConcurrentRepairs)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleMachineRepairPredicateType::MaxOustandingRepairs",
$"The number of outstanding machine repairs is currently at the maximum specified threshold ({RepairData.RepairPolicy.MaxConcurrentRepairs}). " +
$"Will not schedule any other machine repairs at this time.",
FabricHealerManager.Token);
return false;
}
// TODO: Experiment with Guan context information for rules (what rule executed for repair).
// Attempt to schedule an Infrastructure Repair Job (where IS is the executor).
bool success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
bool success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskManager.ScheduleInfrastructureRepairTask(
RepairData,
FabricHealerManager.Token),
@ -128,7 +100,7 @@ namespace FabricHealer.Repair.Guan
}
private ScheduleMachineRepairPredicateType(string name)
: base(name, true, 0)
: base(name, true, 1, 2)
{
}

Просмотреть файл

@ -36,6 +36,7 @@ namespace FabricHealer.Repair
// General Repair Settings Parameters.
public const string EnableAutoMitigation = "EnableAutoMitigation";
public const string MaxRepairsInTimeRange = "MaxRepairsInTimeRange";
// RepairPolicy Settings Sections.
public const string FabricNodeRepairPolicySectionName = "FabricNodeRepairPolicy";
@ -88,6 +89,7 @@ namespace FabricHealer.Repair
public const string CheckInsideRunInterval = "CheckInsideRunInterval";
public const string CheckInsideScheduleInterval = "CheckInsideScheduleInterval";
public const string CheckInsideProbationPeriod = "CheckInsideProbationPeriod";
public const string CheckOutstandingRepairs = "CheckOutstandingRepairs";
public const string CheckFolderSize = "CheckFolderSize";
public const string GetEntityHealthStateDuration = "GetEntityHealthStateDuration";
public const string GetHealthEventHistory = "GetHealthEventHistory";

Просмотреть файл

@ -29,12 +29,9 @@ namespace FabricHealer.Repair
public sealed class RepairExecutor
{
private const double MaxWaitTimeMinutesForNodeOperation = 60.0;
private readonly StatelessServiceContext serviceContext;
public RepairExecutor(StatelessServiceContext context, CancellationToken token)
public RepairExecutor()
{
serviceContext = context;
try
{
if (FabricHealerManager.ConfigSettings == null)
@ -248,13 +245,13 @@ namespace FabricHealer.Repair
}
ServiceDescription serviceDesc =
await FabricHealerManager.FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(serviceContext.ServiceName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
await FabricHealerManager.FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(FabricHealerManager.ServiceContext.ServiceName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
int instanceCount = (serviceDesc as StatelessServiceDescription).InstanceCount;
if (instanceCount == -1)
{
bool isTargetNodeHostingFH = repairData.NodeName == serviceContext.NodeContext.NodeName;
bool isTargetNodeHostingFH = repairData.NodeName == FabricHealerManager.ServiceContext.NodeContext.NodeName;
if (isTargetNodeHostingFH)
{

Просмотреть файл

@ -68,5 +68,14 @@ namespace FabricHealer.Repair
{
get; set;
}
/// <summary>
/// Prefix of the repair ID. It ties an FH repair to the executor that runs it (FH or FH_Infra, for example).
/// </summary>
[EventField]
public string RepairIdPrefix { get; set; }
}
}

Просмотреть файл

@ -4,11 +4,13 @@
// ------------------------------------------------------------
using System;
using System.Fabric;
using System.Fabric.Health;
using System.Fabric.Repair;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using FabricHealer.TelemetryLib;
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
@ -207,7 +209,50 @@ namespace FabricHealer.Repair
return false;
}
public async Task<int> GetOutstandingRepairCount(string taskIdPrefix, CancellationToken token)
/// <summary>
/// Determines if a node-impactful repair has already been scheduled/claimed for a target node.
/// </summary>
/// <param name="repairData">TelemetryData instance; its NodeName identifies the target node.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>
/// true when an active repair task declares a node impact of level Restart or RemoveData on the target node;
/// false otherwise, including when the repair task query fails with one of the handled exceptions.
/// </returns>
public async Task<bool> IsNodeLevelRepairCurrentlyInFlightAsync(TelemetryData repairData, CancellationToken cancellationToken)
{
    try
    {
        var activeRepairs =
                await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
                        null,
                        RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
                        null,
                        FabricHealerManager.ConfigSettings.AsyncTimeout,
                        cancellationToken);

        foreach (var repair in activeRepairs)
        {
            // Only a repair that explicitly declares a Restart/RemoveData impact on the target node counts.
            // A repair with no node impact description must not be treated as a node-level repair in flight.
            if (repair.Impact is NodeRepairImpactDescription impact
                && impact.ImpactedNodes.Any(
                        n => n.NodeName == repairData.NodeName
                             && (n.ImpactLevel == NodeImpactLevel.Restart || n.ImpactLevel == NodeImpactLevel.RemoveData)))
            {
                return true;
            }
        }
    }
    catch (Exception e) when (e is FabricException || e is TaskCanceledException || e is TimeoutException)
    {
        // Best effort: a failed query is reported as "no repair in flight", but leave a trace of the failure.
        FabricHealerManager.RepairLogger.LogWarning(
            $"Unable to determine if a node-level repair is in flight for {repairData.NodeName}: {e.Message}");
    }

    return false;
}
public async Task<int> GetOutstandingFHRepairCount(string taskIdPrefix, CancellationToken token)
{
RepairTaskList repairTasksInProgress =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
@ -224,5 +269,49 @@ namespace FabricHealer.Repair
return repairTasksInProgress.Count(r => r.TaskId.StartsWith(taskIdPrefix));
}
/// <summary>
/// Counts the active repair tasks in the cluster whose node impact description contains at least one
/// node with an impact level of Restart or RemoveData.
/// </summary>
/// <param name="token">Cancellation token passed to the repair task query.</param>
/// <returns>The number of matching repair tasks; 0 when the query fails with one of the handled exceptions.</returns>
public async Task<int> GetOutstandingMachineRepairCount(CancellationToken token)
{
    try
    {
        var activeRepairs =
                await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
                        null,
                        RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
                        null,
                        FabricHealerManager.ConfigSettings.AsyncTimeout,
                        token);

        // Tasks without a node impact description are ignored.
        return activeRepairs.Count(
                    task => task.Impact is NodeRepairImpactDescription nodeImpact
                            && nodeImpact.ImpactedNodes.Any(
                                    node => node.ImpactLevel == NodeImpactLevel.Restart
                                            || node.ImpactLevel == NodeImpactLevel.RemoveData));
    }
    catch (Exception e) when (e is FabricException || e is TaskCanceledException || e is TimeoutException)
    {
    }

    return 0;
}
}
}

Просмотреть файл

@ -18,6 +18,7 @@ using FabricHealer.Interfaces;
using Guan.Logic;
using FabricHealer.Repair.Guan;
using FabricHealer.Utilities;
using System.Transactions;
namespace FabricHealer.Repair
{
@ -32,12 +33,10 @@ namespace FabricHealer.Repair
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
private DateTime lastHealthEventsListClearDateTime;
internal readonly List<HealthEvent> detectedHealthEvents = new List<HealthEvent>();
internal readonly StatelessServiceContext Context;
public RepairTaskManager(StatelessServiceContext context, CancellationToken token)
public RepairTaskManager()
{
Context = context;
repairExecutor = new RepairExecutor(context, token);
repairExecutor = new RepairExecutor();
repairTaskEngine = new RepairTaskEngine();
lastHealthEventsListClearDateTime = healthEventsListCreationTime;
}
@ -136,13 +135,14 @@ namespace FabricHealer.Repair
// Add external helper predicates.
functorTable.Add(CheckFolderSizePredicateType.Singleton(RepairConstants.CheckFolderSize, this, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, repairData));
functorTable.Add(CheckInsideRunIntervalPredicateType.Singleton(RepairConstants.CheckInsideRunInterval, repairData));
functorTable.Add(CheckInsideProbationPeriodType.Singleton(RepairConstants.CheckInsideProbationPeriod, repairData));
functorTable.Add(CheckInsideScheduleIntervalPredicateType.Singleton(RepairConstants.CheckInsideScheduleInterval, repairData));
functorTable.Add(CheckOutstandingRepairsPredicateType.Singleton(RepairConstants.CheckOutstandingRepairs, repairData));
functorTable.Add(EmitMessagePredicateType.Singleton(RepairConstants.EmitMessage));
functorTable.Add(GetEntityHealthStateDurationPredicateType.Singleton(RepairConstants.GetEntityHealthStateDuration, repairData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
// Add external repair predicates.
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, this, repairData));
@ -197,71 +197,19 @@ namespace FabricHealer.Repair
}
// The repair will be executed by SF Infrastructure service, not FH. This is the case for all
// Machine-level repairs. IS will communicate with VMSS (for example) to guarantee safe repairs in MR-enabled
// clusters. RM, as usual, will orchestrate the repair cycle.
// Machine-level repairs.
public async Task<bool> ScheduleInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
{
/*var infraServices = await FabricRepairTasks.GetInfrastructureServiceInstancesAsync(cancellationToken);
var arrServices = infraServices as Service[] ?? infraServices.ToArray();
if (arrServices.Length == 0)
// Internal throttling to protect against bad rules (over scheduling of repair tasks within a fixed time range).
if (await RepairCountThrottleMaxCheck(repairData, cancellationToken))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleInfrastructureRepairTask::ISNotFound",
"Infrastructure Service not deployed. Will not attemp Machine repair.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
string executorName = null;
foreach (var service in arrServices)
{
if (!service.ServiceName.OriginalString.Contains(repairData.NodeType))
{
continue;
}
executorName = service.ServiceName.OriginalString;
string message = $"Too many repairs of this type have been scheduled in the last 1 hour: " +
$"{repairData.RepairPolicy.InfrastructureRepairName}. Will not schedule another repair at this time.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleInfrastructureRepairTask::ExecutorFound",
$"Executor set to {executorName} for Machine Repair Action {repairData.RepairPolicy.InfrastructureRepairName}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
break;
}
if (executorName == null)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"ScheduleInfrastructureRepairTask::ISNotFound({repairData.NodeType})",
$"Unable to find InfrastructureService instance for node type {repairData.NodeType}.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}*/
// Make sure there is not already a repair job executing machine-level repair for target node.
bool isRepairAlreadyInProgress =
await repairTaskEngine.IsRepairInProgressAsync(RepairTaskEngine.InfraTaskIdPrefix, repairData, cancellationToken);
if (isRepairAlreadyInProgress)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleInfrastructureRepairTask::RepairAlreadyInProgress",
$"A Machine repair for {repairData.NodeName} is already in progress.",
$"InternalThrottling({repairData.NodeName}::{repairData.RepairPolicy.InfrastructureRepairName})",
message,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
@ -274,13 +222,13 @@ namespace FabricHealer.Repair
if (repairTask == null)
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleInfrastructureRepairTask::Failure",
"Unable to schedule Infrastructure Repair Task.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleInfrastructureRepairTask::Failure",
"Unable to schedule Infrastructure Repair Task.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
@ -296,6 +244,103 @@ namespace FabricHealer.Repair
return true;
}
/// <summary>
/// Internal throttle check: determines whether the number of repairs created for the supplied repair
/// within any configured time range has reached the configured maximum for that range.
/// The limits are read from the MaxRepairsInTimeRange parameter of the repair policy section that
/// matches the repair's entity type.
/// </summary>
/// <param name="repairData">Repair context; its EntityType selects the repair policy section.</param>
/// <param name="cancellationToken">Cancellation token passed to the repair count query.</param>
/// <returns>
/// true when at least one configured (maxCount, timeRange) limit has been reached; false when no limit
/// has been reached, no throttle setting is configured, or the entity type has no repair policy section.
/// </returns>
/// <exception cref="ArgumentException">A configured entry is malformed (missing or unparsable maxCount/timeRange).</exception>
private static async Task<bool> RepairCountThrottleMaxCheck(TelemetryData repairData, CancellationToken cancellationToken)
{
    string repairPolicySectionName;

    switch (repairData.EntityType)
    {
        // App/Service repair (user).
        case EntityType.Application when !string.Equals(repairData.ApplicationName, RepairConstants.SystemAppName, StringComparison.OrdinalIgnoreCase):
        case EntityType.Service:
        case EntityType.StatefulService:
        case EntityType.StatelessService:
            repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
            break;

        // System service process repair.
        case EntityType.Application when repairData.ProcessName != null:
        case EntityType.Process:
            repairPolicySectionName = RepairConstants.SystemServiceRepairPolicySectionName;
            break;

        // Disk repair.
        case EntityType.Disk when FabricHealerManager.ServiceContext.NodeContext.NodeName == repairData.NodeName:
            repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
            break;

        // Machine repair.
        case EntityType.Machine:
            repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
            break;

        // Fabric Node repair.
        case EntityType.Node:
            repairPolicySectionName = RepairConstants.FabricNodeRepairPolicySectionName;
            break;

        default:
            return false;
    }

    string throttleSetting = FabricHealerManager.GetSettingParameterValue(repairPolicySectionName, RepairConstants.MaxRepairsInTimeRange);

    // No throttle configured for this repair policy section: nothing to enforce.
    if (string.IsNullOrWhiteSpace(throttleSetting))
    {
        return false;
    }

    // Supported forms (a single entry is just a list of one):
    // <Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00" />
    // <Parameter Name="MaxRepairsInTimeRange" Value="5, 02:00:00; 6, 1.00:00:00; 7, 2.00:00:00;" />
    // NOTE(review): TimeSpan.TryParse is expected to reject an hours component above 23 (e.g. "24:00:00");
    // ranges of a day or more should use the d.hh:mm:ss form - confirm against existing configurations.
    foreach (string entry in throttleSetting.Split(';', StringSplitOptions.RemoveEmptyEntries))
    {
        // Tolerate whitespace left around separators, e.g. a trailing "; ".
        if (string.IsNullOrWhiteSpace(entry))
        {
            continue;
        }

        string[] settings = entry.Split(',', StringSplitOptions.RemoveEmptyEntries);

        if (settings.Length == 0)
        {
            continue;
        }

        // An entry needs both a count and a time range; fail with a configuration error rather than an index error.
        if (settings.Length < 2)
        {
            throw new ArgumentException($"Missing timeRange value in {repairPolicySectionName} setting. Please check your configuration.");
        }

        if (!int.TryParse(settings[0].Trim(), out int maxCount))
        {
            throw new ArgumentException($"Unsupported value for maxCount specified in {repairPolicySectionName} setting. Please check your configuration.");
        }

        if (!TimeSpan.TryParse(settings[1].Trim(), out TimeSpan timeRange))
        {
            throw new ArgumentException($"Unsupported value timeRange in {repairPolicySectionName} setting. Please check your configuration.");
        }

        // The first limit that has been reached is enough to throttle.
        if (await FabricRepairTasks.GetCreatedRepairCountWithinTimeRangeAsync(timeRange, repairData, cancellationToken) >= maxCount)
        {
            return true;
        }
    }

    return false;
}
public async Task<bool> DeleteFilesAsyncAsync(TelemetryData repairData, CancellationToken cancellationToken)
{
return await repairExecutor.DeleteFilesAsync(
@ -340,7 +385,7 @@ namespace FabricHealer.Repair
}
// Can only kill processes on the same node where the FH instance that took the job is running.
if (repairData.NodeName != Context.NodeContext.NodeName)
if (repairData.NodeName != FabricHealerManager.ServiceContext.NodeContext.NodeName)
{
return false;
}
@ -383,9 +428,9 @@ namespace FabricHealer.Repair
var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(nodeName, asyncTimeout, cancellationToken);
return nodes.Count > 0 ? nodes[0] : null;
}
catch (FabricException fe)
catch (Exception e) when (e is FabricException || e is TaskCanceledException || e is TimeoutException)
{
FabricHealerManager.RepairLogger.LogError($"Error getting node {nodeName}:{Environment.NewLine}{fe}");
FabricHealerManager.RepairLogger.LogError($"Error getting node {nodeName}:{Environment.NewLine}{e}");
return null;
}
}
@ -394,62 +439,44 @@ namespace FabricHealer.Repair
{
await Task.Delay(new Random().Next(500, 1500), cancellationToken);
// Has the repair already been scheduled by a different FH instance?
// Internal throttling to protect against bad rules (over-scheduling of repair tasks within a fixed time range).
if (await RepairCountThrottleMaxCheck(repairData, cancellationToken))
{
string message = $"Too many repairs of this type have been scheduled in the last 15 minutes: " +
$"{repairData.RepairPolicy.RepairId}. Will not schedule another repair at this time.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"InternalThrottling({repairData.RepairPolicy.RepairId})",
message,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return null;
}
// Has the repair already been scheduled?
if (await repairTaskEngine.IsRepairInProgressAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
{
return null;
}
// Don't attempt a node-level repair on a node where there is already an active node-level repair. \\
var currentlyExecutingRepairs =
await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
null,
RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
if (currentlyExecutingRepairs.Count > 0)
// Don't attempt a node-level repair on a node where there is already an active node-level repair.
if (await repairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken))
{
foreach (var repair in currentlyExecutingRepairs)
{
if (repair.Impact is NodeRepairImpactDescription impact)
{
if (!impact.ImpactedNodes.Any(
n => n.NodeName == repairData.NodeName && (n.ImpactLevel == NodeImpactLevel.Restart || n.ImpactLevel == NodeImpactLevel.RemoveData)))
{
continue;
}
}
string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
$"{repairData.RepairPolicy.RepairAction}";
// FH-generated repair jobs.
if (JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData data))
{
if (data.RepairData.RepairPolicy.RepairAction == RepairActionType.RestartReplica ||
data.RepairData.RepairPolicy.RepairAction == RepairActionType.RemoveReplica ||
data.RepairData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles ||
data.RepairData.RepairPolicy.RepairAction == RepairActionType.RestartCodePackage)
{
continue;
}
}
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"NodeRepairAlreadyInProgress::{repairData.NodeName}",
message,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
string message =
$"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
$"{Enum.GetName(typeof(RepairActionType), repairData.RepairPolicy.RepairAction)}: {repair.TaskId}" +
"Exiting RepairTaskManager.ScheduleFabricHealerRmRepairTaskAsync.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"ScheduleRepairTask::NodeRepairAlreadyInProgress",
message,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return null;
}
return null;
}
var executorData = new RepairExecutorData
@ -598,7 +625,7 @@ namespace FabricHealer.Repair
if (repairData.PartitionId == null)
{
success = false;
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",

Просмотреть файл

@ -94,19 +94,19 @@ namespace FabricHealer.Repair
}
/// <summary>
/// Determines if an Azure tenant update is in progress for cluster VMs.
/// Determines if an Azure tenant/platform update is in progress in the cluster.
/// </summary>
/// <param name="nodeName">Name of the node to check for an in-progress Azure update</param>
/// <param name="token">CancellationToken instance</param>
/// <returns>true if a tenant/platform update is in progress, false otherwise</returns>
internal static async Task<bool> IsAzureUpdateInProgress(string nodeType, string nodeName, CancellationToken token)
internal static async Task<bool> IsAzureUpdateInProgress(string nodeName, CancellationToken token)
{
var repairTasks = await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
null,
"Azure",
System.Fabric.Repair.RepairTaskStateFilter.Approved |
System.Fabric.Repair.RepairTaskStateFilter.Active |
System.Fabric.Repair.RepairTaskStateFilter.Executing,
$"fabric:/System/InfrastructureService/{nodeType}",
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
token);
@ -120,11 +120,11 @@ namespace FabricHealer.Repair
if (repairTasks.ToList().Any(
n => JsonSerializationUtility.TryDeserializeObject(n.ExecutorData, out ISExecutorData data) && data.JobId == nodeName))
{
string message = $"Azure Platform or Tenant Update in progress for {nodeType}. Will not attempt repairs at this time.";
string message = $"Azure Platform or Tenant Update in progress for {nodeName}. Will not attempt repairs at this time.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"AzurePlatformOrTenantUpdateInProgress",
$"AzurePlatformOrTenantUpdateInProgress_{nodeName}",
message,
token);

Просмотреть файл

@ -16,11 +16,11 @@
<Parameter Name="EnableAppRepair" DefaultValue="true" />
<Parameter Name="EnableDiskRepair" DefaultValue="false" />
<Parameter Name="EnableFabricNodeRepair" DefaultValue="false" />
<Parameter Name="EnableMachineRepair" DefaultValue="false" />
<Parameter Name="EnableMachineRepair" DefaultValue="true" />
<Parameter Name="EnableReplicaRepair" DefaultValue="false" />
<!-- This means FH will apply some mitigation (restarts) to all instances of a target service on a node-by-node basis only (rolling). -->
<Parameter Name="EnableRollingServiceRestarts" DefaultValue="true" />
<Parameter Name="EnableSystemServiceRepair" DefaultValue="false" />
<Parameter Name="EnableSystemServiceRepair" DefaultValue="true" />
<!-- Logic rule files -->
<Parameter Name="AppRulesConfigurationFile" DefaultValue="AppRules.guan" />
<Parameter Name="DiskRulesConfigurationFile" DefaultValue="DiskRules.guan" />