1.1.18: + rule tracing, upgrade mod, RepairPolicy mod, DeactivateFabricNode predicate.

This commit is contained in:
Charles Torre 2023-02-26 12:12:52 -08:00
Родитель c648b6c98a
Коммит 0ab8864831
37 изменённых файлов: 685 добавлений и 612 удалений

Просмотреть файл

@ -23,11 +23,11 @@ function Build-SFPkg {
try { try {
Push-Location $scriptPath Push-Location $scriptPath
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType" Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType" Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType" Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType" Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
} }
finally { finally {
Pop-Location Pop-Location

Просмотреть файл

@ -11,7 +11,7 @@
}, },
"applicationTypeVersionFabricHealer": { "applicationTypeVersionFabricHealer": {
"type": "string", "type": "string",
"defaultValue": "1.1.17", "defaultValue": "1.1.18",
"metadata": { "metadata": {
"description": "Provide the app version number of FabricHealer. This must be identical to the version specified in the sfpkg." "description": "Provide the app version number of FabricHealer. This must be identical to the version specified in the sfpkg."
} }

Просмотреть файл

@ -6,7 +6,7 @@
"value": "<YOUR-CLUSTER-RESOURCE-NAME>" "value": "<YOUR-CLUSTER-RESOURCE-NAME>"
}, },
"applicationTypeVersionFabricHealer": { "applicationTypeVersionFabricHealer": {
"value": "1.1.17" "value": "1.1.18"
}, },
"packageUrlFabricHealer": { "packageUrlFabricHealer": {
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>" "value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>"

Просмотреть файл

@ -44,7 +44,7 @@ Here is a full example of exactly what is sent in one of these telemetry events,
"ClusterId": "00000000-1111-1111-0000-00f00d000d", "ClusterId": "00000000-1111-1111-0000-00f00d000d",
"ClusterType": "SFRP", "ClusterType": "SFRP",
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad", "NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
"FHVersion": "1.1.17", "FHVersion": "1.1.18",
"UpTime": "00:00:00.2164523", "UpTime": "00:00:00.2164523",
"Timestamp": "2023-02-07T21:45:25.2443014Z", "Timestamp": "2023-02-07T21:45:25.2443014Z",
"OS": "Windows", "OS": "Windows",

Просмотреть файл

@ -52,6 +52,9 @@
<None Update="PackageRoot\Config\LogicRules\MachineRules.guan"> <None Update="PackageRoot\Config\LogicRules\MachineRules.guan">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None> </None>
<None Update="TestApp42.zip">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="testrules_wellformed.guan"> <None Update="testrules_wellformed.guan">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory> <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None> </None>

Просмотреть файл

@ -47,7 +47,7 @@ namespace FHTest
private const string FHProxyId = "FabricHealerProxy"; private const string FHProxyId = "FabricHealerProxy";
[ClassInitialize] [ClassInitialize]
public static void TestClassStartUp(TestContext testContext) public static async Task TestClassStartUp(TestContext testContext)
{ {
if (!IsLocalSFRuntimePresent()) if (!IsLocalSFRuntimePresent())
{ {
@ -93,6 +93,8 @@ namespace FHTest
{ {
TelemetryEnabled = false TelemetryEnabled = false
}; };
await DeployTestApp42Async();
} }
/* Helpers */ /* Helpers */
@ -168,6 +170,116 @@ namespace FHTest
} }
} }
/// <summary>
/// Deploys the fabric:/TestApp42 test application unless it is already deployed on the test node.
/// The steps are: unzip TestApp42.zip (found in the current directory), copy the unzipped package to the
/// image store, provision the application type, then create the application instance.
/// When the application or its type already exists, the existing instance/type is removed and the
/// deployment is attempted again by calling this method recursively.
/// </summary>
/// <returns>A Task.</returns>
private static async Task DeployTestApp42Async()
{
    const string appName = "fabric:/TestApp42";
    const string appType = "TestApp42Type";
    const string appVersion = "1.0.0";
    const string imageStorePackagePath = "TestApp42";

    // Change this to suit your configuration (so, if you are on Windows and you installed SF on a different drive, for example).
    const string imageStoreConnectionString = @"file:C:\SfDevCluster\Data\ImageStoreShare";

    var appUri = new Uri(appName);

    // Nothing to do when fabric:/TestApp42 is already deployed on the test node.
    var deployedApps =
        await fabricClient.QueryManager.GetDeployedApplicationListAsync(
                NodeName,
                appUri,
                TimeSpan.FromSeconds(30),
                token);

    if (deployedApps is { Count: > 0 })
    {
        return;
    }

    string currentDirectory = Environment.CurrentDirectory;
    string zipFilePath = Path.Combine(currentDirectory, "TestApp42.zip");
    string unzippedPackagePath = Path.Combine(currentDirectory, "TestApp42", "Release");

    try
    {
        // Unzip the compressed TestApp42 app package (overwriting any previously extracted files).
        System.IO.Compression.ZipFile.ExtractToDirectory(zipFilePath, "TestApp42", true);

        // Copy the TestApp42 app package to a location in the image store.
        fabricClient.ApplicationManager.CopyApplicationPackage(imageStoreConnectionString, unzippedPackagePath, imageStorePackagePath);

        // Provision the TestApp42 application type.
        await fabricClient.ApplicationManager.ProvisionApplicationAsync(imageStorePackagePath);

        // Create the TestApp42 app instance.
        await fabricClient.ApplicationManager.CreateApplicationAsync(new ApplicationDescription(appUri, appType, appVersion));

        // This is a hack. Without this wait, the deployed test services may not yet be visible to the tests.
        // You may need to increase this value depending upon your dev machine.
        await Task.Delay(TimeSpan.FromSeconds(15));
    }
    catch (FabricException fe)
    {
        // Only the two "already exists" codes are handled; any other FabricException error code is ignored here.
        switch (fe.ErrorCode)
        {
            case FabricErrorCode.ApplicationAlreadyExists:

                // Force-delete the existing instance, then try the deployment again.
                await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(appUri) { ForceDelete = true });
                await DeployTestApp42Async();
                break;

            case FabricErrorCode.ApplicationTypeAlreadyExists:

                // Remove any existing instance first, then unprovision the type and try the deployment again.
                var existingApps = await fabricClient.QueryManager.GetApplicationListAsync(appUri);

                if (existingApps.Count > 0)
                {
                    await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(appUri) { ForceDelete = true });
                }

                await fabricClient.ApplicationManager.UnprovisionApplicationAsync(appType, appVersion);
                await DeployTestApp42Async();
                break;
        }
    }
}
/// <summary>
/// Queries the cluster for the services of the specified application and reports whether any were returned.
/// </summary>
/// <param name="appName">Application name in URI form (e.g., fabric:/TestApp42).</param>
/// <returns>
/// true if the service list query returns at least one service; false if the list is null or empty,
/// or if the query throws FabricElementNotFoundException.
/// </returns>
private static async Task<bool> EnsureTestServicesExistAsync(string appName)
{
    try
    {
        var serviceList = await fabricClient.QueryManager.GetServiceListAsync(new Uri(appName));

        return serviceList is { Count: > 0 };
    }
    catch (FabricElementNotFoundException)
    {
        // The queried element was not found, so there are no services to report.
        return false;
    }
}
/// <summary>
/// Removes the fabric:/TestApp42 test application, but only when the service query for it returns services.
/// In order: removes the app package from the image store, deletes the ChildProcessCreator service,
/// deletes the application instance, and unprovisions the application type.
/// </summary>
/// <returns>A Task.</returns>
private static async Task RemoveTestApplicationsAsync()
{
    // TestApp42 \\
    const string appName = "fabric:/TestApp42";
    const string appType = "TestApp42Type";
    const string appVersion = "1.0.0";
    const string childServiceName = "fabric:/TestApp42/ChildProcessCreator";
    const string imageStorePackagePath = "TestApp42";
    const string imageStoreConnectionString = @"file:C:\SfDevCluster\Data\ImageStoreShare";

    // Skip the whole teardown when no TestApp42 services are found.
    if (!await EnsureTestServicesExistAsync(appName))
    {
        return;
    }

    // Remove the copied application package from the image store.
    fabricClient.ApplicationManager.RemoveApplicationPackage(imageStoreConnectionString, imageStorePackagePath);

    // Delete the test service.
    await fabricClient.ServiceManager.DeleteServiceAsync(new DeleteServiceDescription(new Uri(childServiceName)));

    // Delete the application instance created from the application type.
    await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(new Uri(appName)));

    // Un-provision the application type.
    await fabricClient.ApplicationManager.UnprovisionApplicationAsync(appType, appVersion);
}
[ClassCleanup] [ClassCleanup]
public static async Task TestClassCleanupAsync() public static async Task TestClassCleanupAsync()
{ {
@ -176,6 +288,7 @@ namespace FHTest
// Ensure FHProxy cleans up its health reports. // Ensure FHProxy cleans up its health reports.
FabricHealerProxy.Instance.Close(); FabricHealerProxy.Instance.Close();
await RemoveTestApplicationsAsync();
} }
/* GuanLogic Tests /* GuanLogic Tests
@ -336,6 +449,7 @@ namespace FHTest
}; };
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "ReplicaRules.guan"); var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "ReplicaRules.guan");
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "ReplicaRules.guan";
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token)); List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
try try
@ -379,6 +493,7 @@ namespace FHTest
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "SystemServiceRules.guan"); var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "SystemServiceRules.guan");
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token)); List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "SystemServiceRules.guan";
try try
{ {
@ -490,6 +605,7 @@ namespace FHTest
{ {
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "MachineRules.guan"); string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "MachineRules.guan");
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token); string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "MachineRules.guan";
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules); List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
int escalationCount = 4; // reboot, reimage, heal, triage. int escalationCount = 4; // reboot, reimage, heal, triage.
RepairTaskList repairTasks = null; RepairTaskList repairTasks = null;
@ -771,7 +887,7 @@ namespace FHTest
{ {
// The service here must be one that is running in your test cluster. // The service here must be one that is running in your test cluster.
// TODO: install a local test app as part of tests. // TODO: install a local test app as part of tests.
ServiceName = "fabric:/BadApp/BadService", ServiceName = "fabric:/TestApp42/ChildProcessCreator",
NodeName = NodeName, NodeName = NodeName,
// Specifying Source is Required for unit tests. // Specifying Source is Required for unit tests.
// For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here. // For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here.

Просмотреть файл

@ -10,6 +10,7 @@
<Parameter Name="EnableAutoMitigation" Value="true" /> <Parameter Name="EnableAutoMitigation" Value="true" />
<Parameter Name="EnableOperationalTelemetry" Value="false" /> <Parameter Name="EnableOperationalTelemetry" Value="false" />
<Parameter Name="EnableRollingServiceRestarts" Value="true" /> <Parameter Name="EnableRollingServiceRestarts" Value="true" />
<Parameter Name="EnableLogicRuleTracing" Value="true" />
<!-- Folder name for local log output. You can use a full path or just a folder name. --> <!-- Folder name for local log output. You can use a full path or just a folder name. -->
<Parameter Name="LocalLogPath" Value="fabric_healer_testlogs" /> <Parameter Name="LocalLogPath" Value="fabric_healer_testlogs" />

Двоичные данные
FHTest/TestApp42.zip Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -2,7 +2,7 @@
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd"> <package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
<metadata minClientVersion="3.3.0"> <metadata minClientVersion="3.3.0">
<id>%PACKAGE_ID%</id> <id>%PACKAGE_ID%</id>
<version>1.1.17</version> <version>1.1.18</version>
<releaseNotes> <releaseNotes>
This release requires Service Fabric runtime version 9 and higher and at least Service Fabric SDK version 6.0.1017. There are several changes and improvements in this This release requires Service Fabric runtime version 9 and higher and at least Service Fabric SDK version 6.0.1017. There are several changes and improvements in this
release including a new machine repair model, updated logic rules, bug fixes, and many code improvements. release including a new machine repair model, updated logic rules, bug fixes, and many code improvements.

Просмотреть файл

@ -25,8 +25,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
Documentation\OperationalTelemetry.md = Documentation\OperationalTelemetry.md Documentation\OperationalTelemetry.md = Documentation\OperationalTelemetry.md
README.md = README.md README.md = README.md
Documentation\Deployment\service-fabric-healer.json = Documentation\Deployment\service-fabric-healer.json Documentation\Deployment\service-fabric-healer.json = Documentation\Deployment\service-fabric-healer.json
Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json
Documentation\Using.md = Documentation\Using.md Documentation\Using.md = Documentation\Using.md
Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json
EndProjectSection EndProjectSection
EndProject EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csproj", "{8D9712BF-C026-4A36-B6D1-6345137D3B6F}" Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csproj", "{8D9712BF-C026-4A36-B6D1-6345137D3B6F}"

Просмотреть файл

@ -12,8 +12,8 @@
<RuntimeIdentifier>win-x64</RuntimeIdentifier>--> <RuntimeIdentifier>win-x64</RuntimeIdentifier>-->
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers> <RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>
<Product>FabricHealer</Product> <Product>FabricHealer</Product>
<Version>1.1.17</Version> <Version>1.1.18</Version>
<FileVersion>1.1.17</FileVersion> <FileVersion>1.1.18</FileVersion>
<StartupObject>FabricHealer.Program</StartupObject> <StartupObject>FabricHealer.Program</StartupObject>
<Platforms>x64</Platforms> <Platforms>x64</Platforms>
</PropertyGroup> </PropertyGroup>

Просмотреть файл

@ -29,10 +29,15 @@ namespace FabricHealer
{ {
internal static TelemetryUtilities TelemetryUtilities; internal static TelemetryUtilities TelemetryUtilities;
internal static RepairData RepairHistory; internal static RepairData RepairHistory;
public static StatelessServiceContext ServiceContext;
internal static bool EnableRuleTracing { get; set; } = true;
public static StatelessServiceContext ServiceContext { get; private set; }
public static string CurrentlyExecutingLogicRulesFileName { get; set; }
// Folks often use their own version numbers. This is for internal diagnostic telemetry. // Folks often use their own version numbers. This is for internal diagnostic telemetry.
private const string InternalVersionNumber = "1.1.17"; private const string InternalVersionNumber = "1.1.18";
private static FabricHealerManager singleton; private static FabricHealerManager singleton;
private static FabricClient _fabricClient; private static FabricClient _fabricClient;
private bool disposedValue; private bool disposedValue;
@ -305,7 +310,7 @@ namespace FabricHealer
// First, let's clean up any orphaned non-node level FabricHealer repair tasks left pending. This will also resume Fabric Node repairs that // First, let's clean up any orphaned non-node level FabricHealer repair tasks left pending. This will also resume Fabric Node repairs that
// FH owns and was executing at the time FH exited. Only FH-owned repairs will be canceled, not repairs conducted by other executors. // FH owns and was executing at the time FH exited. Only FH-owned repairs will be canceled, not repairs conducted by other executors.
await CancelOrResumeAllRunningFHRepairsAsync(); await CancelAbandonedFHRepairsAsync();
// Run until RunAsync token is cancelled. // Run until RunAsync token is cancelled.
while (!Token.IsCancellationRequested) while (!Token.IsCancellationRequested)
@ -467,17 +472,26 @@ namespace FabricHealer
{ {
// FH looks for and resumes FabricNode restart repair jobs when it starts up (so, it will pick up where it left off in the safe restart sequence // FH looks for and resumes FabricNode restart repair jobs when it starts up (so, it will pick up where it left off in the safe restart sequence
// when the Fabric node hosting FH is the one FH restarted). // when the Fabric node hosting FH is the one FH restarted).
if (JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true) if (!JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true))
&& exData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
{ {
if (isClosing) continue;
}
// This would mean that the job has node-level Impact and its state is at least Approved.
if (repair.Impact is NodeRepairImpactDescription impact)
{
if (impact.ImpactedNodes.Any(
n => n.NodeName == exData.RepairPolicy.NodeName
&& (n.ImpactLevel == NodeImpactLevel.Restart ||
n.ImpactLevel == NodeImpactLevel.RemoveData ||
n.ImpactLevel == NodeImpactLevel.RemoveNode)))
{ {
continue; continue;
} }
} }
// Was max execution time configured by user? // Was max execution time configured by user?
if (exData != null && exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero) if (exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
{ {
maxFHExecutorTime = exData.RepairPolicy.MaxExecutionTime; maxFHExecutorTime = exData.RepairPolicy.MaxExecutionTime;
} }
@ -492,7 +506,7 @@ namespace FabricHealer
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TimeoutException) catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TimeoutException)
{ {
#if DEBUG #if DEBUG
RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobsAsync Failure:{Environment.NewLine}{e}"); RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobs Failure:{Environment.NewLine}{e}");
#endif #endif
} }
} }
@ -531,15 +545,13 @@ namespace FabricHealer
} }
/// <summary> /// <summary>
/// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state). /// Cancels all FabricHealer repair tasks currently in flight.
/// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing.
/// </summary> /// </summary>
/// <returns>A Task.</returns> /// <returns>A Task.</returns>
private static async Task CancelOrResumeAllRunningFHRepairsAsync() private static async Task CancelAbandonedFHRepairsAsync()
{ {
try try
{ {
var currentFHRepairTasksInProgress = var currentFHRepairTasksInProgress =
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync( () => RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
@ -582,49 +594,11 @@ namespace FabricHealer
continue; continue;
} }
// Try and cancel existing repair. We may need to create a new one for abandoned repairs where FH goes down for some reason.
// Note: CancelRepairTaskAsync handles exceptions (IOE) that may be thrown by RM due to state change policy.
// The repair state could change to Completed after this call is made, for example, and before RM API call.
if (repair.State != RepairTaskState.Completed) if (repair.State != RepairTaskState.Completed)
{ {
await FabricRepairTasks.CancelRepairTaskAsync(repair); await FabricRepairTasks.CancelRepairTaskAsync(repair);
} }
/* Resume interrupted Fabric Node restart repairs */
// There is no need to resume simple repairs that do not require multiple repair steps (e.g., codepackage/process/replica restarts).
if (repairExecutorData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode)
{
continue;
}
string errorCode = repairExecutorData.RepairPolicy.Code;
if (string.IsNullOrWhiteSpace(errorCode))
{
continue;
}
// File Deletion repair is a node-level (VM) repair, but is not multi-step. Ignore.
if (repairExecutorData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles)
{
continue;
}
// Fabric System service warnings/errors from FO can be Node level repair targets (e.g., Fabric binary needs to be restarted).
// FH will restart the node hosting the troubled SF system service if specified in related logic rules.
var repairRules =
GetRepairRulesFromConfiguration(
!string.IsNullOrWhiteSpace(
repairExecutorData.RepairPolicy.ProcessName) ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.FabricNodeRepairPolicySectionName);
var repairData = new TelemetryData
{
NodeName = repairExecutorData.RepairPolicy.NodeName,
Code = errorCode,
};
await RunGuanQueryAsync(repairData, repairRules, Token, repairExecutorData);
RepairLogger.LogInfo("Exiting CancelOrResumeAllRunningFHRepairsAsync: Completed."); RepairLogger.LogInfo("Exiting CancelOrResumeAllRunningFHRepairsAsync: Completed.");
} }
} }
@ -632,13 +606,13 @@ namespace FabricHealer
{ {
if (e is FabricException) if (e is FabricException)
{ {
RepairLogger.LogWarning($"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}"); RepairLogger.LogWarning($"Could not cancel FH repair tasks. Failed with:{Environment.NewLine}{e}");
} }
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info, LogLevel.Info,
"CancelOrResumeAllRunningFHRepairsAsync", "CancelOrResumeAllRunningFHRepairsAsync",
$"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}", $"Could not cancel abandoned FH repair tasks. Failed with:{Environment.NewLine}{e}",
Token, Token,
null, null,
ConfigSettings.EnableVerboseLogging); ConfigSettings.EnableVerboseLogging);
@ -1561,7 +1535,7 @@ namespace FabricHealer
continue; continue;
} }
// Disk? // Disk repair?
if (repairData.EntityType == EntityType.Disk) if (repairData.EntityType == EntityType.Disk)
{ {
if (!ConfigSettings.EnableDiskRepair) if (!ConfigSettings.EnableDiskRepair)
@ -1573,7 +1547,7 @@ namespace FabricHealer
continue; continue;
} }
// Fabric node? // Fabric Node repair?
if (repairData.EntityType == EntityType.Node) if (repairData.EntityType == EntityType.Node)
{ {
if (!ConfigSettings.EnableFabricNodeRepair) if (!ConfigSettings.EnableFabricNodeRepair)
@ -1581,17 +1555,13 @@ namespace FabricHealer
continue; continue;
} }
// FabricHealerProxy-generated report, so a restart fabric node request, for example. // FabricObserver/FabricHealerProxy-generated health report.
await ProcessFabricNodeHealthAsync(evt, repairData); await ProcessFabricNodeHealthAsync(evt, repairData);
continue; continue;
} }
} }
// Machine repair \\ // Machine-level repair \\
if (!ConfigSettings.EnableMachineRepair)
{
continue;
}
// Make sure that there is not already an Infra repair in progress for the target node. // Make sure that there is not already an Infra repair in progress for the target node.
if (await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token)) if (await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token))
@ -1617,7 +1587,7 @@ namespace FabricHealer
/* Start repair workflow */ /* Start repair workflow */
string repairId = $"MachineRepair_{nodeType}_{repairData.NodeName}"; string repairId = $"MachineRepair_{repairData.NodeName}";
repairData.RepairPolicy = new RepairPolicy repairData.RepairPolicy = new RepairPolicy
{ {
RepairId = repairId, RepairId = repairId,
@ -1752,8 +1722,8 @@ namespace FabricHealer
return; return;
} }
// There is only one supported repair for a FabricNode: Restart. string action = repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode ? "Deactivate" : "Restart";
string repairId = $"{repairData.NodeName}_{repairData.NodeType}_Restart"; string repairId = $"{repairData.NodeName}_{repairData.NodeType}_{action}";
var currentRepairs = var currentRepairs =
await RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FHTaskIdPrefix, Token); await RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FHTaskIdPrefix, Token);
@ -2173,6 +2143,7 @@ namespace FabricHealer
return null; return null;
} }
CurrentlyExecutingLogicRulesFileName = logicRulesConfigFileName;
List<string> repairRules = ParseRulesFile(rules); List<string> repairRules = ParseRulesFile(rules);
return repairRules; return repairRules;
} }

Просмотреть файл

@ -1,11 +1,6 @@
## Logic rules for Service Fabric Node repairs. ## Logic rule examples for Service Fabric Node repairs.
## These repairs are not executed by FabricHealer. FH creates repair tasks with the correct node impact specified and RM takes it from there.
## First check if we are inside the run interval. If so, cut (!). This means that no other rules will be processed (no back-tracking). ## Restart/Deactivate. Try Restart twice in 8 hour window. Else, deactivate (Pause) the Fabric node.
## This is commented out by default. Just uncomment and set the global run interval for app Fabric node level repairs to suit your needs. Mitigate(HealthState=Error) :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 2, !, RestartFabricNode().
Mitigate(HealthState=Error) :- DeactivateFabricNode().
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
## This rule means that whatever the Fabric node-level warning data from the issuing service happens to be, restart the target Fabric node if
## the repair hasn't run 4 times in the last 8 hours.
Mitigate() :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, RestartFabricNode(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:45:00, MaxExecutionTime=02:00:00).

Просмотреть файл

@ -1,31 +1,31 @@
## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is always Machine. ## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is always Machine.
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for these types of Repair Jobs. ## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for machine-level Repair Jobs.
## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself. ## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself.
## Any argument below with (FO/FHProxy) means that only FO or FHProxy will present the fact. ## Any argument below with (FO/FHProxy) means that only FO or FHProxy will present the fact.
## | Argument Name | Definition | ## | Argument Name | Definition |
## |---------------------------|------------------------------------------------------------------------| ## |---------------------------|------------------------------------------------------------------------|
## | NodeName | Name of the node | ## | NodeName | Name of the node |
## | NodeType | Type of node | ## | NodeType | Type of node |
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") | ## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) | ## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) | ## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS where FabricHealer is running (Linux or Windows) | ## | OS | The name of the OS where FabricHealer is running (Linux or Windows) |
## | HealthState | The HealthState of the target entity: Error or Warning | ## | HealthState | The HealthState of the target entity: Error or Warning |
## | Source | The Source ID of the related SF Health Event | ## | Source | The Source ID of the related SF Health Event |
## | Property | The Property of the related SF Health Event | ## | Property | The Property of the related SF Health Event |
## Metric Names, from FO or FHProxy. ## Metric Names, from FO or FHProxy.
## | Name | ## | Name |
## |--------------------------------| ## |--------------------------------|
## | ActiveTcpPorts | ## | ActiveTcpPorts |
## | CpuPercent | ## | CpuPercent |
## | EphemeralPorts | ## | EphemeralPorts |
## | EphemeralPortsPercent | ## | EphemeralPortsPercent |
## | MemoryMB | ## | MemoryMB |
## | MemoryPercent | ## | MemoryPercent |
## | Handles (Linux-only) | ## | Handles (Linux-only) |
## | HandlesPercent (Linux-only) | ## | HandlesPercent (Linux-only) |
## The logic program below is a repair specification (policy) that does not require facts from FabricObserver (FO) or FHProxy. ## The logic program below is a repair specification (policy) that does not require facts from FabricObserver (FO) or FHProxy.
@ -34,8 +34,8 @@
## Don't proceed if the target entity is not in Error. ## Don't proceed if the target entity is not in Error.
Mitigate(HealthState=?healthState) :- not(?healthState == Error), !. Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
## Don't proceed is FabricObserver's NodeObserver is not the source of the Error event. ## Don't proceed unless the specified watchdog created the Error health event.
##Mitigate(Source=?source) :- not(match(?source, "NodeObserver")), !. Mitigate(Source=?source) :- not(match(?source, "EventLogWatchdog")), !.
## Don't proceed if there are already 2 or more machine repairs currently active in the cluster. ## Don't proceed if there are already 2 or more machine repairs currently active in the cluster.
Mitigate() :- CheckOutstandingRepairs(2), !. Mitigate() :- CheckOutstandingRepairs(2), !.
@ -47,9 +47,15 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !. Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours. ## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours.
Mitigate() :- CheckInsideHealthStateMinDuration(02:00:00), !. Mitigate() :- CheckInsideHealthStateMinDuration(00:01:00), !.
## Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH). ## For certain environments, the correct mitigation is to deactivate the target node. The below rule schedules a node deactivation (intent is Pause) repair.
Mitigate(Source=?source, Property=?property) :- match(?source, "EventLogWatchdog"),
match(?property, "CriticalMachineFailure"), !,
DeactivateFabricNode(ImpactLevel=RemoveData).
## Infra Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH).
## The logic below demonstrates how to specify a repair escalation path: Reboot -> Reimage -> Heal -> Triage (human intervention required). ## The logic below demonstrates how to specify a repair escalation path: Reboot -> Reimage -> Heal -> Triage (human intervention required).
## ScheduleMachineRepair predicate takes any repair action string. There are a handful that are supported by RepairManager/InfrastructureService, like below. ## ScheduleMachineRepair predicate takes any repair action string. There are a handful that are supported by RepairManager/InfrastructureService, like below.
@ -69,4 +75,4 @@ Mitigate() :- GetRepairHistory(?repairCount, 08:00:00, System.Azure.Heal), ?repa
## from scheduling any other machine repairs for the target node until canceled. It also counts against the number of concurrent Active repairs you specified ## from scheduling any other machine repairs for the target node until canceled. It also counts against the number of concurrent Active repairs you specified
## above in the CheckOutstandingRepairs predicate. ## above in the CheckOutstandingRepairs predicate.
Mitigate(NodeName=?nodeName) :- LogInfo("0042_{0}: Specified Machine repair escalations have been exhausted for node {0}. Human intervention is required.", ?nodeName), Mitigate(NodeName=?nodeName) :- LogInfo("0042_{0}: Specified Machine repair escalations have been exhausted for node {0}. Human intervention is required.", ?nodeName),
ScheduleMachineRepair(ManualTriageNeeded). ScheduleMachineRepair(ManualTriageNeeded).

Просмотреть файл

@ -13,6 +13,8 @@
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" /> <Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
<!-- Folder name for local log output. You can use a full path or just a folder name. --> <!-- Folder name for local log output. You can use a full path or just a folder name. -->
<Parameter Name="LocalLogPath" Value="" MustOverride="true" /> <Parameter Name="LocalLogPath" Value="" MustOverride="true" />
<!-- When enabled, FabricHealer will attempt to trace executed logic rules that employ repair action predicates. -->
<Parameter Name="EnableLogicRuleTracing" Value="" MustOverride="true" />
<!-- ***Non-Overridable Parameters*** These must be set in this file. --> <!-- ***Non-Overridable Parameters*** These must be set in this file. -->

Просмотреть файл

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<ServiceManifest Name="FabricHealerPkg" <ServiceManifest Name="FabricHealerPkg"
Version="1.1.17" Version="1.1.18"
xmlns="http://schemas.microsoft.com/2011/01/fabric" xmlns="http://schemas.microsoft.com/2011/01/fabric"
xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
@ -11,7 +11,7 @@
</ServiceTypes> </ServiceTypes>
<!-- Code package is your service executable. --> <!-- Code package is your service executable. -->
<CodePackage Name="Code" Version="1.1.17"> <CodePackage Name="Code" Version="1.1.18">
<EntryPoint> <EntryPoint>
<ExeHost> <ExeHost>
<Program>FabricHealer</Program> <Program>FabricHealer</Program>
@ -21,5 +21,5 @@
<!-- Config package is the contents of the Config directory under PackageRoot that contains an <!-- Config package is the contents of the Config directory under PackageRoot that contains an
independently-updateable and versioned set of custom configuration settings for your service. --> independently-updateable and versioned set of custom configuration settings for your service. -->
<ConfigPackage Name="Config" Version="1.1.17" /> <ConfigPackage Name="Config" Version="1.1.18" />
</ServiceManifest> </ServiceManifest>

Просмотреть файл

@ -167,8 +167,9 @@ namespace FabricHealer.Repair
repairTask = await RepairTaskEngine.CreateInfrastructureRepairTaskAsync(repairData, token); repairTask = await RepairTaskEngine.CreateInfrastructureRepairTaskAsync(repairData, token);
break; break;
// FH // FH
case RepairActionType.DeactivateNode:
case RepairActionType.DeleteFiles: case RepairActionType.DeleteFiles:
case RepairActionType.RestartCodePackage: case RepairActionType.RestartCodePackage:
case RepairActionType.RestartFabricNode: case RepairActionType.RestartFabricNode:
@ -184,20 +185,16 @@ namespace FabricHealer.Repair
return null; return null;
} }
bool success = await CreateRepairTaskAsync( bool success = await CreateClusterRepairTaskAsync(repairTask, repairData, token);
repairTask,
repairData,
token);
return success ? repairTask : null; return success ? repairTask : null;
} }
private static async Task<bool> CreateRepairTaskAsync( private static async Task<bool> CreateClusterRepairTaskAsync(
RepairTask repairTask, RepairTask repairTask,
TelemetryData repairData, TelemetryData repairData,
CancellationToken token) CancellationToken token)
{ {
if (repairTask == null) if (repairTask == null || repairData?.RepairPolicy == null)
{ {
return false; return false;
} }
@ -214,29 +211,45 @@ namespace FabricHealer.Repair
FabricHealerManager.ConfigSettings.AsyncTimeout, FabricHealerManager.ConfigSettings.AsyncTimeout,
token); token);
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}",
$"Successfully created repair task {repairTask.TaskId}.",
token,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return true; return true;
} }
else
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}_AlreadyExists",
$"A repair already exists with internal repair Id {repairData.RepairPolicy.RepairId}. Will not schedule another repair.",
token,
null,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
} }
catch (ArgumentException ae) catch (ArgumentException ae)
{ {
string message = $"Unable to create repairtask:{Environment.NewLine}{ae}";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning, LogLevel.Info,
"FabricRepairTasks::TryCreateRepairTaskAsync", "CreateClusterRepairTaskAsync",
message, $"Unable to create repairtask:{Environment.NewLine}{ae}",
token, token,
repairData, repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging); FabricHealerManager.ConfigSettings.EnableVerboseLogging);
} }
catch (FabricException fe) catch (FabricException fe)
{ {
string message = $"Unable to create repairtask:{Environment.NewLine}{fe}";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Warning, LogLevel.Info,
"FabricRepairTasks::TryCreateRepairTaskAsync", $"CreateClusterRepairTaskAsync::Failure({repairData.RepairPolicy.RepairId})",
message, $"Unable to create repair task:{Environment.NewLine}{fe}",
token, token,
repairData, repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging); FabricHealerManager.ConfigSettings.EnableVerboseLogging);
@ -264,13 +277,13 @@ namespace FabricHealer.Repair
{ {
var allSystemServices = var allSystemServices =
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync( () => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync(
new Uri(RepairConstants.SystemAppName), new Uri(RepairConstants.SystemAppName),
null, null,
FabricHealerManager.ConfigSettings.AsyncTimeout, FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken), cancellationToken),
cancellationToken); cancellationToken);
var infraInstances = var infraInstances =
allSystemServices.Where(i => i.ServiceTypeName.Equals(RepairConstants.InfrastructureServiceType, StringComparison.InvariantCultureIgnoreCase)); allSystemServices.Where(i => i.ServiceTypeName.Equals(RepairConstants.InfrastructureServiceType, StringComparison.InvariantCultureIgnoreCase));

Просмотреть файл

@ -0,0 +1,139 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Utilities.Telemetry;
using FabricHealer.Utilities;
using Guan.Logic;
using System;
using System.Fabric.Repair;
using System.Threading.Tasks;
namespace FabricHealer.Repair.Guan
{
    /// <summary>
    /// Guan predicate type for the DeactivateFabricNode logic-rule predicate. When the predicate is
    /// evaluated, its resolver fills in the shared repair policy from the rule's arguments and tries
    /// to schedule a node deactivation repair task for the target node.
    /// Recognized arguments (by name, case-insensitive, or by position when exactly four are supplied):
    ///   0: DoHealthChecks (Boolean)
    ///   1: MaxWaitTimeForHealthStateOk (TimeSpan)
    ///   2: MaxExecutionTime (TimeSpan)
    ///   3: ImpactLevel (String: RemoveData, RemoveNode or Restart; anything else maps to None)
    /// </summary>
    internal class DeactivateFabricNodePredicateType : PredicateType
    {
        // Shared across all uses of the singleton; re-assigned on every call to Singleton(...).
        private static TelemetryData RepairData;
        private static DeactivateFabricNodePredicateType Instance;

        private class Resolver : BooleanPredicateResolver
        {
            public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
            {
            }

            /// <summary>
            /// Applies the predicate arguments to the repair policy and schedules the deactivation repair.
            /// </summary>
            /// <returns>
            /// true when a repair task was scheduled; false when a repair for the node is already
            /// in progress or when scheduling did not produce a repair task.
            /// </returns>
            /// <exception cref="GuanException">An argument has an unsupported type, name or position.</exception>
            protected override async Task<bool> CheckAsync()
            {
                RepairData.RepairPolicy.RepairAction = RepairActionType.DeactivateNode;
                RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
                RepairData.RepairPolicy.RepairId = $"DeactivateNode::{RepairData.NodeName}";

                if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
                {
                    _ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
                }

                int count = Input.Arguments.Count;

                for (int i = 0; i < count; i++)
                {
                    var argument = Input.Arguments[i];
                    string argName = argument.Name;
                    object argValue = argument.Value.GetEffectiveTerm().GetObjectValue();

                    // A null value yields a null type name, which falls through to the default case
                    // and surfaces as a GuanException rather than a NullReferenceException.
                    string typeString = argValue?.GetType().Name;

                    switch (typeString)
                    {
                        case "Boolean" when (i == 0 && count == 4) || IsArgNamed(argName, "dohealthchecks"):

                            RepairData.RepairPolicy.DoHealthChecks = (bool)argValue;
                            break;

                        case "TimeSpan" when (i == 1 && count == 4) || IsArgNamed(argName, "maxwaittimeforhealthstateok"):

                            RepairData.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)argValue;
                            break;

                        case "TimeSpan" when (i == 2 && count == 4) || IsArgNamed(argName, "maxexecutiontime"):

                            RepairData.RepairPolicy.MaxExecutionTime = (TimeSpan)argValue;
                            break;

                        case "String" when (i == 3 && count == 4) || IsArgNamed(argName, "impactlevel"):

                            RepairData.RepairPolicy.NodeImpactLevel =
                                ParseImpactLevel(argument.Value.GetEffectiveTerm().GetStringValue());
                            break;

                        default:

                            throw new GuanException($"Unsupported argument type for DeactivateFabricNode: {typeString}");
                    }
                }

                bool isNodeRepairAlreadyInProgress =
                    await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);

                if (isNodeRepairAlreadyInProgress)
                {
                    string message =
                        $"A repair for Fabric node {RepairData.NodeName} is already in progress in the cluster.";

                    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                            LogLevel.Info,
                            $"DeactivateFabricNode::{RepairData.RepairPolicy.RepairId}",
                            message,
                            FabricHealerManager.Token,
                            RepairData,
                            FabricHealerManager.ConfigSettings.EnableVerboseLogging);

                    return false;
                }

                // Try to schedule the Deactivate repair with RM (RM will deactivate the node, not FH).
                RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                                () => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
                                                        RepairData,
                                                        FabricHealerManager.Token),
                                                FabricHealerManager.Token);

                return repairTask != null;
            }

            // Culture-independent, null-safe comparison of a rule argument name against an expected name.
            // (Culture-sensitive lower-casing breaks names containing 'I' under cultures such as tr-TR.)
            private static bool IsArgNamed(string actual, string expected)
            {
                return string.Equals(actual, expected, StringComparison.OrdinalIgnoreCase);
            }

            // Maps the ImpactLevel argument text to a NodeImpactLevel; unrecognized text maps to None.
            private static NodeImpactLevel ParseImpactLevel(string value)
            {
                if (string.Equals(value, "removedata", StringComparison.OrdinalIgnoreCase))
                {
                    return NodeImpactLevel.RemoveData;
                }

                if (string.Equals(value, "removenode", StringComparison.OrdinalIgnoreCase))
                {
                    return NodeImpactLevel.RemoveNode;
                }

                if (string.Equals(value, "restart", StringComparison.OrdinalIgnoreCase))
                {
                    return NodeImpactLevel.Restart;
                }

                return NodeImpactLevel.None;
            }
        }

        /// <summary>
        /// Stores the supplied repair data for use by the resolver and returns the single shared
        /// instance, creating it with the given name on first use.
        /// </summary>
        /// <param name="name">Predicate name passed to the base type when the instance is first created.</param>
        /// <param name="repairData">Repair data the resolver reads and updates.</param>
        public static DeactivateFabricNodePredicateType Singleton(string name, TelemetryData repairData)
        {
            RepairData = repairData;
            return Instance ??= new DeactivateFabricNodePredicateType(name);
        }

        private DeactivateFabricNodePredicateType(string name)
            : base(name, true, 0)
        {
        }

        public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
        {
            return new Resolver(input, constraint, context);
        }
    }
}

Просмотреть файл

@ -34,6 +34,11 @@ namespace FabricHealer.Repair.Guan
return false; return false;
} }
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
RepairData.RepairPolicy.RepairAction = RepairActionType.DeleteFiles; RepairData.RepairPolicy.RepairAction = RepairActionType.DeleteFiles;
bool recurseSubDirectories = false; bool recurseSubDirectories = false;
string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue(); string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue();

Просмотреть файл

@ -4,6 +4,7 @@
// ------------------------------------------------------------ // ------------------------------------------------------------
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics;
using System.Threading; using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using Guan.Logic; using Guan.Logic;
@ -27,7 +28,7 @@ namespace FabricHealer.Repair.Guan
QueryContext queryContext = new(moduleProvider); QueryContext queryContext = new(moduleProvider);
queryContext.SetDirection(null, order); queryContext.SetDirection(null, order);
Query query = Query.Create(queryExpression, queryContext); Query query = Query.Create(queryExpression, queryContext);
await query.GetNextAsync(); _ = await query.GetNextAsync();
} }
public async Task RunQueryAsync(List<CompoundTerm> queryExpressions, CancellationToken cancellationToken) public async Task RunQueryAsync(List<CompoundTerm> queryExpressions, CancellationToken cancellationToken)
@ -43,7 +44,7 @@ namespace FabricHealer.Repair.Guan
QueryContext queryContext = new(moduleProvider); QueryContext queryContext = new(moduleProvider);
queryContext.SetDirection(null, order); queryContext.SetDirection(null, order);
Query query = Query.Create(queryExpressions, queryContext, moduleProvider); Query query = Query.Create(queryExpressions, queryContext, moduleProvider);
await query.GetNextAsync(); _ = await query.GetNextAsync();
} }
} }
} }

Просмотреть файл

@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync() protected override async Task<bool> CheckAsync()
{ {
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage; RepairData.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
int count = Input.Arguments.Count; int count = Input.Arguments.Count;
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)

Просмотреть файл

@ -9,13 +9,11 @@ using Guan.Logic;
using FabricHealer.Utilities; using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry; using FabricHealer.Utilities.Telemetry;
using System.Threading.Tasks; using System.Threading.Tasks;
using System.Threading;
namespace FabricHealer.Repair.Guan namespace FabricHealer.Repair.Guan
{ {
public class RestartFabricNodePredicateType : PredicateType public class RestartFabricNodePredicateType : PredicateType
{ {
private static RepairExecutorData RepairExecutorData;
private static TelemetryData RepairData; private static TelemetryData RepairData;
private static RestartFabricNodePredicateType Instance; private static RestartFabricNodePredicateType Instance;
@ -30,6 +28,13 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync() protected override async Task<bool> CheckAsync()
{ {
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode; RepairData.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
int count = Input.Arguments.Count; int count = Input.Arguments.Count;
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)
@ -55,59 +60,6 @@ namespace FabricHealer.Repair.Guan
} }
} }
RepairTask repairTask;
bool success;
// This means it's a resumed repair.
if (RepairExecutorData != null)
{
// Historical info, like what step the healer was in when the node went down, is contained in the
// executordata instance.
repairTask = await RepairTaskEngine.CreateFabricHealerRepairTask(RepairExecutorData, FabricHealerManager.Token);
if (repairTask == null)
{
return false;
}
// MaxExecutionTime impl.
using (CancellationTokenSource tokenSource = new())
{
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
tokenSource.Token,
FabricHealerManager.Token))
{
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
{
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
}
tokenSource.CancelAfter(maxExecutionTime);
tokenSource.Token.Register(() =>
{
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
});
// Try to execute repair (FH executor does this work and manages repair state).
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
repairTask,
RepairData,
linkedCTS.Token),
linkedCTS.Token);
if (!success && linkedCTS.IsCancellationRequested)
{
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
}
return success;
}
}
}
// Block attempts to create node-level repair tasks if one is already running in the cluster. // Block attempts to create node-level repair tasks if one is already running in the cluster.
var isNodeRepairAlreadyInProgress = var isNodeRepairAlreadyInProgress =
await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token); await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);
@ -128,8 +80,8 @@ namespace FabricHealer.Repair.Guan
return false; return false;
} }
// Try to schedule repair with RM. // Try to schedule repair with RM for Fabric Node Restart (FH will not be the executor).
repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync( () => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
RepairData, RepairData,
FabricHealerManager.Token), FabricHealerManager.Token),
@ -139,52 +91,13 @@ namespace FabricHealer.Repair.Guan
return false; return false;
} }
using (CancellationTokenSource tokenSource = new()) return true;
{
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
tokenSource.Token,
FabricHealerManager.Token))
{
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
{
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
}
tokenSource.CancelAfter(maxExecutionTime);
tokenSource.Token.Register(() =>
{
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
});
// Try to execute repair (FH executor does this work and manages repair state).
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
repairTask,
RepairData,
linkedCTS.Token),
linkedCTS.Token);
if (!success && linkedCTS.IsCancellationRequested)
{
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
}
return success;
}
}
} }
} }
public static RestartFabricNodePredicateType Singleton( public static RestartFabricNodePredicateType Singleton(string name, TelemetryData repairData)
string name,
RepairExecutorData repairExecutorData,
TelemetryData repairData)
{ {
RepairExecutorData = repairExecutorData;
RepairData = repairData; RepairData = repairData;
return Instance ??= new RestartFabricNodePredicateType(name); return Instance ??= new RestartFabricNodePredicateType(name);
} }

Просмотреть файл

@ -34,6 +34,12 @@ namespace FabricHealer.Repair.Guan
} }
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartProcess; RepairData.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
int count = Input.Arguments.Count; int count = Input.Arguments.Count;
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)

Просмотреть файл

@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
protected override async Task<bool> CheckAsync() protected override async Task<bool> CheckAsync()
{ {
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartReplica; RepairData.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
int count = Input.Arguments.Count; int count = Input.Arguments.Count;
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)

Просмотреть файл

@ -34,6 +34,13 @@ namespace FabricHealer.Repair.Guan
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument."); throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
} }
RepairData.RepairPolicy.RepairAction = RepairActionType.Infra;
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
{
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
}
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does, // FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor). // so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs. // Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.

Просмотреть файл

@ -0,0 +1,119 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
using Guan.Logic;
using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
namespace FabricHealer.Repair.Guan
{
    /// <summary>
    /// Guan predicate type for the TraceNextRule logic-rule predicate. When evaluated, its resolver
    /// re-reads the currently executing logic rules file, locates the first rule that follows the
    /// first line containing ":- TraceNextRule", and emits that rule's text as an Info telemetry event.
    /// </summary>
    internal class TraceNextRulePredicateType : PredicateType
    {
        private static TraceNextRulePredicateType Instance;

        // Shared across all uses of the singleton; re-assigned on every call to Singleton(...).
        private static TelemetryData RepairData;

        private class Resolver : BooleanPredicateResolver
        {
            public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
            {
            }

            /// <summary>
            /// Finds and traces the rule that follows TraceNextRule in the current rules file.
            /// </summary>
            /// <returns>Always false (see the note at the return statement).</returns>
            /// <exception cref="GuanException">The rules file does not exist.</exception>
            protected override async Task<bool> CheckAsync()
            {
                string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName;
                string ruleFilePath =
                    Path.Combine(
                        FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                        "LogicRules",
                        ruleFileName);

                if (!File.Exists(ruleFilePath))
                {
                    throw new GuanException($"Specified rule file path does not exist: {ruleFilePath}");
                }

                try
                {
                    string[] lines = File.ReadAllLines(ruleFilePath);
                    string marker = $":- {RepairConstants.TraceNextRule}";
                    string rule = string.Empty;
                    int lineNumber = 0;

                    for (int i = 0; i < lines.Length; i++)
                    {
                        if (!lines[i].Contains(marker, StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        // Skip blank lines and ## comments between TraceNextRule and the rule it refers to,
                        // without running past the end of the file.
                        int ruleStart = i + 1;

                        while (ruleStart < lines.Length && IsBlankOrComment(lines[ruleStart]))
                        {
                            ruleStart++;
                        }

                        if (ruleStart < lines.Length)
                        {
                            string text = lines[ruleStart];

                            // custom rule formatting support: a rule may span several lines, each
                            // non-final line ending with a comma. Stop at end of file on a dangling comma.
                            for (int j = ruleStart; j + 1 < lines.Length && lines[j].TrimEnd().EndsWith(','); j++)
                            {
                                text += " " + lines[j + 1].Replace('\t', ' ').Trim();
                            }

                            rule = text;

                            // 1-based number of the rule's first line, independent of how the rule is wrapped.
                            lineNumber = ruleStart + 1;
                        }

                        // Only the first TraceNextRule occurrence in the file is considered.
                        break;
                    }

                    if (string.IsNullOrWhiteSpace(rule))
                    {
                        await ReportFailureAsync(
                                ruleFileName,
                                $"TraceNextRule failure => No logic rule found after {RepairConstants.TraceNextRule} in {ruleFileName}.");
                    }
                    else
                    {
                        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                                LogLevel.Info,
                                $"{ruleFileName}#{lineNumber}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
                                $"Executing logic rule \'{rule}\'",
                                FabricHealerManager.Token);
                    }
                }
                catch (Exception e) when (e is ArgumentException || e is IOException || e is SystemException)
                {
                    await ReportFailureAsync(ruleFileName, $"TraceNextRule failure => Unable to read {ruleFileName}: {e.Message}");
                }

                // Guarantees the next rule runs. This is critical given TraceNextRule is designed to log the full text of whatever logic rule comes after it in a rule file.
                return false;
            }

            // True for lines that cannot be the start of a rule: empty/whitespace lines and ## comment lines.
            private static bool IsBlankOrComment(string line)
            {
                return string.IsNullOrWhiteSpace(line) || line.TrimStart().StartsWith("##", StringComparison.Ordinal);
            }

            // Writes the failure to the local repair log (as a warning) and emits it as an Info telemetry event.
            private static async Task ReportFailureAsync(string ruleFileName, string message)
            {
                FabricHealerManager.RepairLogger.LogWarning(message);

                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        $"TraceNextRule::{ruleFileName}::Failure",
                        message,
                        FabricHealerManager.Token);
            }
        }

        /// <summary>
        /// Stores the supplied repair data for use by the resolver and returns the single shared
        /// instance, creating it with the given name on first use.
        /// </summary>
        /// <param name="name">Predicate name passed to the base type when the instance is first created.</param>
        /// <param name="repairData">Repair data the resolver reads when building the telemetry property.</param>
        public static TraceNextRulePredicateType Singleton(string name, TelemetryData repairData)
        {
            RepairData = repairData;
            return Instance ??= new TraceNextRulePredicateType(name);
        }

        private TraceNextRulePredicateType(string name) : base(name, true, 0)
        {
        }

        public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
        {
            return new Resolver(input, constraint, context);
        }
    }
}

Просмотреть файл

@ -15,6 +15,7 @@ namespace FabricHealer.Repair
public enum RepairActionType public enum RepairActionType
{ {
Infra, Infra,
DeactivateNode,
DeleteFiles, DeleteFiles,
RemoveFabricNodeState, RemoveFabricNodeState,
RemoveReplica, RemoveReplica,

Просмотреть файл

@ -34,6 +34,7 @@ namespace FabricHealer.Repair
public const string LocalLogPathParameter = "LocalLogPath"; public const string LocalLogPathParameter = "LocalLogPath";
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds"; public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry"; public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
public const string EnableLogicRuleTracing = "EnableLogicRuleTracing";
// General Repair Settings Parameters. // General Repair Settings Parameters.
public const string EnableAutoMitigation = "EnableAutoMitigation"; public const string EnableAutoMitigation = "EnableAutoMitigation";
@ -73,6 +74,7 @@ namespace FabricHealer.Repair
public const string Source = "Source"; public const string Source = "Source";
// Repair Actions. // Repair Actions.
public const string DeactivateFabricNode = "DeactivateFabricNode";
public const string DeleteFiles = "DeleteFiles"; public const string DeleteFiles = "DeleteFiles";
public const string RestartCodePackage = "RestartCodePackage"; public const string RestartCodePackage = "RestartCodePackage";
public const string RestartFabricNode = "RestartFabricNode"; public const string RestartFabricNode = "RestartFabricNode";
@ -100,6 +102,7 @@ namespace FabricHealer.Repair
public const string LogInfo = "LogInfo"; public const string LogInfo = "LogInfo";
public const string LogWarning = "LogWarning"; public const string LogWarning = "LogWarning";
public const string LogError = "LogError"; public const string LogError = "LogError";
public const string TraceNextRule = "TraceNextRule";
// Metric names. // Metric names.
public const string ActiveTcpPorts = "ActiveTcpPorts"; public const string ActiveTcpPorts = "ActiveTcpPorts";

Просмотреть файл

@ -13,7 +13,6 @@ using System.Fabric.Query;
using System.Fabric.Health; using System.Fabric.Health;
using FabricHealer.Utilities; using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry; using FabricHealer.Utilities.Telemetry;
using System.Fabric.Repair;
using System.Net; using System.Net;
using System.Net.Sockets; using System.Net.Sockets;
using System.IO; using System.IO;
@ -22,7 +21,6 @@ using System.Collections.Generic;
using System.ComponentModel; using System.ComponentModel;
using Newtonsoft.Json; using Newtonsoft.Json;
using System.Fabric.Description; using System.Fabric.Description;
using System.Security.Cryptography.X509Certificates;
namespace FabricHealer.Repair namespace FabricHealer.Repair
{ {
@ -187,310 +185,6 @@ namespace FabricHealer.Repair
FabricHealerManager.RepairHistory.SuccessfulRepairs++; FabricHealerManager.RepairHistory.SuccessfulRepairs++;
} }
/// <summary>
/// Safely restarts a Service Fabric Node instance.
/// Algorithm:
/// 1 Deactivate target node.
/// 2 Wait for node to get into Disabled/Ok.
/// 3 Restart node (which is the Fabric.exe kill API in FaultManager)
/// 4 Wait for node to go Down.
/// 5 Wait for node to get to Disabled/Ok.
/// 5 Activate node.
/// 6 Wait for node to get to Up/Ok.
/// </summary>
/// <param name="repairData">Repair configuration</param>
/// <param name="repairTask">The scheduled Repair Task</param>
/// <param name="cancellationToken">Task cancellation token</param>
/// <returns></returns>
public static async Task<bool> SafeRestartFabricNodeAsync(
TelemetryData repairData,
RepairTask repairTask,
CancellationToken cancellationToken)
{
if (await FabricHealerManager.IsOneNodeClusterAsync())
{
string info = "One node cluster detected. Aborting node restart operation.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsync::NodeCount_1",
info,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
var nodeQueryDesc = new NodeQueryDescription
{
MaxResults = 5,
};
NodeList nodeList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
nodeQueryDesc,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken),
cancellationToken);
if (nodeList.Count < 3)
{
string info = $"Unsupported repair for a {nodeList.Count}-node cluster. Aborting fabric node restart operation.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsync::NodeCount",
info,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
FabricHealerManager.RepairLogger.LogInfo(info);
return false;
}
ServiceDescription serviceDesc =
await FabricHealerManager.FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(
FabricHealerManager.ServiceContext.ServiceName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
int instanceCount = (serviceDesc as StatelessServiceDescription).InstanceCount;
if (instanceCount == -1)
{
bool isTargetNodeHostingFH = repairData.NodeName == FabricHealerManager.ServiceContext.NodeContext.NodeName;
if (isTargetNodeHostingFH)
{
return false;
}
}
if (!nodeList.Any(n => n.NodeName == repairData.NodeName))
{
string info = $"Fabric node {repairData.NodeName} does not exist.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsync::MissingNode",
info,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
}
var nodeInstanceId = nodeList.First(n => n.NodeName == repairData.NodeName).NodeInstanceId;
var stopwatch = new Stopwatch();
var maxWaitTimeout = TimeSpan.FromMinutes(MaxWaitTimeMinutesForNodeOperation);
string actionMessage = $"Attempting to safely restart Fabric node {repairData.NodeName} with InstanceId {nodeInstanceId}.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart",
actionMessage,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
try
{
if (!JsonSerializationUtility.TryDeserializeObject(repairTask.ExecutorData, out RepairExecutorData executorData))
{
return false;
}
if (executorData.LatestRepairStep == FabricNodeRepairStep.Scheduled)
{
executorData.LatestRepairStep = FabricNodeRepairStep.Deactivate;
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
{
repairTask.ExecutorData = exData;
}
else
{
actionMessage = "Step = Deactivate => Did not successfully serialize executordata.";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::Deactivate",
actionMessage,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
return false;
}
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
// Deactivate the node with intent to restart. Several health checks will
// take place to ensure safe deactivation, which includes giving services a
// chance to gracefully shut down, should they override OnAbort/OnClose.
await FabricHealerManager.FabricClientSingleton.ClusterManager.DeactivateNodeAsync(
repairData.NodeName,
NodeDeactivationIntent.Restart,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken);
stopwatch.Start();
// Wait for node to get into Disabled state.
while (stopwatch.Elapsed <= maxWaitTimeout)
{
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
repairData.NodeName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken),
cancellationToken);
if (nodes == null || nodes.Count == 0)
{
break;
}
Node targetNode = nodes[0];
// exit loop, this is the state we're looking for.
if (targetNode.NodeStatus == NodeStatus.Disabled)
{
break;
}
await Task.Delay(1000, cancellationToken);
}
stopwatch.Stop();
stopwatch.Reset();
}
if (executorData.LatestRepairStep == FabricNodeRepairStep.Deactivate)
{
executorData.LatestRepairStep = FabricNodeRepairStep.Restart;
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
{
repairTask.ExecutorData = exData;
}
else
{
return false;
}
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
actionMessage = $"In Step Restart Node.{Environment.NewLine}{repairTask.ExecutorData}";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::RestartStep",
actionMessage,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
// Now, restart node.
_ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
FabricHealerManager.FabricClientSingleton.FaultManager.RestartNodeAsync(
repairData.NodeName,
nodeInstanceId,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken),
cancellationToken);
stopwatch.Start();
// Wait for Disabled/OK
while (stopwatch.Elapsed <= maxWaitTimeout)
{
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
repairData.NodeName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken),
cancellationToken);
Node targetNode = nodes[0];
// Node is ready to be enabled.
if (targetNode.NodeStatus == NodeStatus.Disabled && targetNode.HealthState == HealthState.Ok)
{
break;
}
await Task.Delay(1000, cancellationToken);
}
stopwatch.Stop();
stopwatch.Reset();
}
if (executorData.LatestRepairStep == FabricNodeRepairStep.Restart)
{
executorData.LatestRepairStep = FabricNodeRepairStep.Activate;
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
{
repairTask.ExecutorData = exData;
}
else
{
return false;
}
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
// Now, enable the node.
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
repairData.NodeName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken),
cancellationToken);
Node targetNode = nodes[0];
// Make sure activation request went through.
if (targetNode.NodeStatus == NodeStatus.Disabled && targetNode.HealthState == HealthState.Ok)
{
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
}
await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
UpdateRepairHistory(repairData);
return true;
}
FabricHealerManager.RepairHistory.FailedRepairs++;
return false;
}
catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TimeoutException)
{
string err = $"Handled Exception restarting Fabric node {repairData.NodeName}, NodeInstanceId {nodeInstanceId}:{e.GetType().Name}";
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.SafeRestartFabricNodeAsync::HandledException",
err,
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
FabricHealerManager.RepairLogger.LogInfo(err);
FabricHealerManager.RepairHistory.FailedRepairs++;
return false;
}
}
/// <summary> /// <summary>
/// Restarts a stateful replica. /// Restarts a stateful replica.
/// </summary> /// </summary>

Просмотреть файл

@ -6,6 +6,7 @@
using System; using System;
using System.Diagnostics.Tracing; using System.Diagnostics.Tracing;
using System.Fabric.Health; using System.Fabric.Health;
using System.Fabric.Repair;
namespace FabricHealer.Repair namespace FabricHealer.Repair
{ {
@ -120,5 +121,6 @@ namespace FabricHealer.Repair
{ {
get; set; get; set;
} }
/// <summary>
/// Gets the <see cref="System.Fabric.Repair.NodeImpactLevel"/> value carried by this instance.
/// The setter is internal, so the value can only be assigned from within this assembly.
/// </summary>
public NodeImpactLevel NodeImpactLevel
{
    get;
    internal set;
}
} }
} }

Просмотреть файл

@ -7,11 +7,14 @@ using System;
using System.Fabric; using System.Fabric;
using System.Fabric.Health; using System.Fabric.Health;
using System.Fabric.Repair; using System.Fabric.Repair;
using System.IO;
using System.Linq; using System.Linq;
using System.Threading; using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using FabricHealer.TelemetryLib;
using FabricHealer.Utilities; using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry; using FabricHealer.Utilities.Telemetry;
using Guan.Logic;
namespace FabricHealer.Repair namespace FabricHealer.Repair
{ {
@ -48,15 +51,10 @@ namespace FabricHealer.Repair
} }
} }
NodeImpactLevel impact = executorData.RepairPolicy.RepairAction switch NodeImpactLevel impact =
{ executorData.RepairPolicy.NodeImpactLevel != NodeImpactLevel.Invalid ? executorData.RepairPolicy.NodeImpactLevel : NodeImpactLevel.None;
RepairActionType.RestartFabricNode => NodeImpactLevel.Restart, NodeRepairImpactDescription nodeRepairImpact = new();
RepairActionType.RemoveFabricNodeState => NodeImpactLevel.RemoveData, NodeImpact impactedNode = new(executorData.RepairPolicy.NodeName, impact);
_ => NodeImpactLevel.None
};
var nodeRepairImpact = new NodeRepairImpactDescription();
var impactedNode = new NodeImpact(executorData.RepairPolicy.NodeName, impact);
nodeRepairImpact.ImpactedNodes.Add(impactedNode); nodeRepairImpact.ImpactedNodes.Add(impactedNode);
RepairActionType repairAction = executorData.RepairPolicy.RepairAction; RepairActionType repairAction = executorData.RepairPolicy.RepairAction;
string repair = repairAction.ToString(); string repair = repairAction.ToString();
@ -80,11 +78,18 @@ namespace FabricHealer.Repair
doHealthChecks = false; doHealthChecks = false;
} }
string description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}";
if (impact == NodeImpactLevel.Restart || impact == NodeImpactLevel.RemoveData)
{
description = executorData.RepairPolicy.RepairId;
}
var repairTask = new ClusterRepairTask(taskId, repair) var repairTask = new ClusterRepairTask(taskId, repair)
{ {
Target = new NodeRepairTargetDescription(executorData.RepairPolicy.NodeName), Target = new NodeRepairTargetDescription(executorData.RepairPolicy.NodeName),
Impact = nodeRepairImpact, Impact = nodeRepairImpact,
Description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}", Description = description,
State = RepairTaskState.Preparing, State = RepairTaskState.Preparing,
Executor = RepairConstants.FabricHealer, Executor = RepairConstants.FabricHealer,
ExecutorData = JsonSerializationUtility.TrySerializeObject(executorData, out string exData) ? exData : null, ExecutorData = JsonSerializationUtility.TrySerializeObject(executorData, out string exData) ? exData : null,
@ -461,5 +466,99 @@ namespace FabricHealer.Repair
return false; return false;
} }
/// <summary>
/// Searches the currently executing logic rules file for the first non-comment line that contains
/// the supplied predicate, reconstructs the rule text around it, and emits that text as an
/// informational telemetry/ETW/health event.
/// </summary>
/// <param name="predicate">
/// Predicate text to look for. Single quotes, double quotes and spaces are stripped from both the
/// predicate and each candidate line before the case-insensitive comparison.
/// </param>
/// <param name="repairData">
/// Repair data for the current repair. Its RepairPolicy.ProcessName (or NodeName when ProcessName is null)
/// becomes part of the emitted event's property name.
/// </param>
/// <returns>
/// true when the rule file was read and the trace event was emitted (the rule text is empty if no
/// matching line was found); false when the rule file does not exist or a handled exception occurred.
/// </returns>
internal static async Task<bool> TryTraceCurrentlyExecutingRule(string predicate, TelemetryData repairData)
{
    string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName;
    string rule = string.Empty;
    int lineNumber = 0;

    try
    {
        string ruleFilePath =
            Path.Combine(
                FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                "LogicRules",
                ruleFileName);

        if (!File.Exists(ruleFilePath))
        {
            FabricHealerManager.RepairLogger.LogWarning($"TryTraceCurrentlyExecutingRule: Specified rule file path does not exist: {ruleFilePath}.");
            return false;
        }

        string[] lines = File.ReadAllLines(ruleFilePath);
        int length = lines.Length;

        // Compare on a normalized form so quoting and spacing differences do not prevent a match.
        predicate = predicate.Replace("'", "").Replace("\"", "").Replace(" ", "");

        for (int i = 0; i < length; i++)
        {
            string normalizedLine = lines[i].Replace("'", "").Replace("\"", "").Replace(" ", "");

            // Skip comment lines (marked with ##) and blank lines.
            if (normalizedLine.Contains("##") || string.IsNullOrWhiteSpace(normalizedLine))
            {
                continue;
            }

            if (!normalizedLine.Contains(predicate, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            lineNumber = i;
            string line = lines[lineNumber];

            // final (repair) predicate ends with a . in FH.
            if (line.TrimEnd().EndsWith('.'))
            {
                rule = line.Replace('\t', ' ');

                // A line that also contains the ":-" separator is the whole rule; otherwise the rule
                // spans several lines and the preceding ones are gathered below.
                if (!line.Contains(":-"))
                {
                    // Walk upward, prepending each line that ends with a comma, until the rule head
                    // (a line starting with "Mitigate") is reached. The lower bound must be index 0:
                    // the previous "j < length" condition never stopped a decrementing index, so the
                    // scan ran off the start of the array whenever no head line was found.
                    for (int j = lineNumber - 1; j >= 0; j--)
                    {
                        if (!lines[j].TrimEnd().EndsWith(','))
                        {
                            continue;
                        }

                        rule = lines[j].Replace('\t', ' ').Trim() + ' ' + rule;
                        lineNumber = j;

                        if (lines[j].StartsWith("Mitigate", StringComparison.Ordinal))
                        {
                            break;
                        }
                    }
                }
            }

            // Only the first matching line is traced.
            break;
        }

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"{ruleFileName}#{lineNumber}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
                $"Executing logic rule \'{rule}\'",
                FabricHealerManager.Token);

        return true;
    }
    catch (Exception e) when (e is ArgumentException || e is IOException || e is SystemException)
    {
        string message = $"TraceCurrentlyExecutingRule failure => Unable to read {ruleFileName}: {e.Message}";
        FabricHealerManager.RepairLogger.LogWarning(message);

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"TraceCurrentlyExecutingRule::{ruleFileName}::Failure",
                message,
                FabricHealerManager.Token);
    }

    return false;
}
} }
} }

Просмотреть файл

@ -17,7 +17,6 @@ using FabricHealer.Utilities.Telemetry;
using Guan.Logic; using Guan.Logic;
using FabricHealer.Repair.Guan; using FabricHealer.Repair.Guan;
using FabricHealer.Utilities; using FabricHealer.Utilities;
using System.Fabric.Description;
namespace FabricHealer.Repair namespace FabricHealer.Repair
{ {
@ -37,32 +36,6 @@ namespace FabricHealer.Repair
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(nodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken); await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(nodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
} }
/// <summary>
/// Delegates to RepairExecutor.SafeRestartFabricNodeAsync and emits an informational
/// telemetry/ETW/health event describing whether the node restart succeeded.
/// </summary>
/// <param name="repairData">Repair data identifying the target node (NodeName is used in the event message).</param>
/// <param name="repairTask">Repair task passed through to the executor.</param>
/// <param name="cancellationToken">Token passed to the executor and to the telemetry call.</param>
/// <returns>The result returned by RepairExecutor.SafeRestartFabricNodeAsync.</returns>
public static async Task<bool> SafeRestartServiceFabricNodeAsync(TelemetryData repairData, RepairTask repairTask, CancellationToken cancellationToken)
{
    bool restarted = await RepairExecutor.SafeRestartFabricNodeAsync(repairData, repairTask, cancellationToken);

    string outcome = restarted
        ? $"Successfully restarted Fabric node {repairData.NodeName}"
        : $"Did not restart Fabric node {repairData.NodeName}";

    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
            LogLevel.Info,
            "SafeRestartFabricNodeAsync",
            outcome,
            cancellationToken,
            repairData,
            FabricHealerManager.ConfigSettings.EnableVerboseLogging);

    return restarted;
}
public static async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken) public static async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken)
{ {
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken)) if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
@ -120,10 +93,10 @@ namespace FabricHealer.Repair
/// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param> /// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param>
/// <returns></returns> /// <returns></returns>
public static async Task RunGuanQueryAsync( public static async Task RunGuanQueryAsync(
TelemetryData repairData, TelemetryData repairData,
List<string> repairRules, List<string> repairRules,
CancellationToken cancellationToken, CancellationToken cancellationToken,
RepairExecutorData repairExecutorData = null) RepairExecutorData repairExecutorData = null)
{ {
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken)) if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
{ {
@ -142,14 +115,16 @@ namespace FabricHealer.Repair
functorTable.Add(LogInfoPredicateType.Singleton(RepairConstants.LogInfo)); functorTable.Add(LogInfoPredicateType.Singleton(RepairConstants.LogInfo));
functorTable.Add(LogErrorPredicateType.Singleton(RepairConstants.LogError)); functorTable.Add(LogErrorPredicateType.Singleton(RepairConstants.LogError));
functorTable.Add(LogWarningPredicateType.Singleton(RepairConstants.LogWarning)); functorTable.Add(LogWarningPredicateType.Singleton(RepairConstants.LogWarning));
functorTable.Add(TraceNextRulePredicateType.Singleton(RepairConstants.TraceNextRule, repairData));
functorTable.Add(CheckInsideHealthStateMinDurationPredicateType.Singleton(RepairConstants.CheckInsideHealthStateMinDuration, repairData)); functorTable.Add(CheckInsideHealthStateMinDurationPredicateType.Singleton(RepairConstants.CheckInsideHealthStateMinDuration, repairData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairData)); functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData)); functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
// Add external repair predicates. // Add external repair predicates.
functorTable.Add(DeactivateFabricNodePredicateType.Singleton(RepairConstants.DeactivateFabricNode, repairData));
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairData)); functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairData));
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairData)); functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairData));
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairExecutorData, repairData)); functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairData)); functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairData)); functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairData));
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairData)); functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairData));
@ -514,6 +489,7 @@ namespace FabricHealer.Repair
// Don't attempt a node-level repair on a node where there is already an active node-level repair. // Don't attempt a node-level repair on a node where there is already an active node-level repair.
if (repairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode if (repairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode
|| repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode
&& await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken)) && await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken))
{ {
string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " + string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
@ -831,30 +807,7 @@ namespace FabricHealer.Repair
break; break;
} }
case RepairActionType.RestartFabricNode:
{
var executorData = repairTask.ExecutorData;
if (string.IsNullOrWhiteSpace(executorData))
{
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"RestartFabricNode::{repairData.NodeName}",
$"Repair {repairTask.TaskId} is missing ExecutorData.",
cancellationToken,
repairData,
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
success = false;
}
else
{
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
}
break;
}
default: default:
return false; return false;
} }

Просмотреть файл

@ -29,6 +29,11 @@ namespace FabricHealer.Utilities
private set; private set;
} = 30; } = 30;
/// <summary>
/// Gets a value indicating whether logic rule tracing is enabled.
/// The setter is private; the value stays false unless it is assigned inside this class.
/// </summary>
public bool EnableLogicRuleTracing { get; private set; }
public bool EnableVerboseLogging public bool EnableVerboseLogging
{ {
get; get;
@ -199,6 +204,12 @@ namespace FabricHealer.Utilities
OperationalTelemetryEnabled = fhOpTelemEnabled; OperationalTelemetryEnabled = fhOpTelemEnabled;
} }
// Logic rule predicate tracing.
if (bool.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.EnableLogicRuleTracing), out bool traceRules))
{
EnableLogicRuleTracing = traceRules;
}
// Repair Policies // Repair Policies
if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled)) if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled))
{ {

Просмотреть файл

@ -9,7 +9,6 @@ using System.Fabric.Repair;
using System.Linq; using System.Linq;
using System.Threading; using System.Threading;
using System.Threading.Tasks; using System.Threading.Tasks;
using FabricHealer.TelemetryLib;
using FabricHealer.Utilities; using FabricHealer.Utilities;
namespace FabricHealer.Repair namespace FabricHealer.Repair

Просмотреть файл

@ -1,10 +1,11 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.17" xmlns="http://schemas.microsoft.com/2011/01/fabric"> <ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.18" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters> <Parameters>
<!-- FabricHealerManager Settings --> <!-- FabricHealerManager Settings -->
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" /> <Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
<Parameter Name="EnableETW" DefaultValue="false" /> <Parameter Name="EnableETW" DefaultValue="false" />
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" /> <Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
<Parameter Name="EnableLogicRuleTracing" DefaultValue="true" />
<Parameter Name="EnableTelemetry" DefaultValue="false" /> <Parameter Name="EnableTelemetry" DefaultValue="false" />
<Parameter Name="EnableVerboseLogging" DefaultValue="true" /> <Parameter Name="EnableVerboseLogging" DefaultValue="true" />
<Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" /> <Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" />
@ -30,7 +31,7 @@
should match the Name and Version attributes of the ServiceManifest element defined in the should match the Name and Version attributes of the ServiceManifest element defined in the
ServiceManifest.xml file. --> ServiceManifest.xml file. -->
<ServiceManifestImport> <ServiceManifestImport>
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.17" /> <ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.18" />
<ConfigOverrides> <ConfigOverrides>
<ConfigOverride Name="Config"> <ConfigOverride Name="Config">
<Settings> <Settings>
@ -44,6 +45,7 @@
<Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" /> <Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" />
<Parameter Name="EnableRollingServiceRestarts" Value="[EnableRollingServiceRestarts]" /> <Parameter Name="EnableRollingServiceRestarts" Value="[EnableRollingServiceRestarts]" />
<Parameter Name="LocalLogPath" Value="[LocalLogPath]" /> <Parameter Name="LocalLogPath" Value="[LocalLogPath]" />
<Parameter Name="EnableLogicRuleTracing" Value="[EnableLogicRuleTracing]" />
</Section> </Section>
<!-- Repair policies --> <!-- Repair policies -->
<Section Name="AppRepairPolicy"> <Section Name="AppRepairPolicy">

Просмотреть файл

@ -1,4 +1,4 @@
## FabricHealer 1.1.17 ## FabricHealer 1.1.18
### Configuration as Logic and auto-mitigation in Service Fabric clusters ### Configuration as Logic and auto-mitigation in Service Fabric clusters
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
@ -78,7 +78,7 @@ Register-ServiceFabricApplicationType -ApplicationPathInImageStore FH1110
#Create FH application (if not already deployed at lesser version):
New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.17 New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.18
#Create the Service instance: #Create the Service instance:
@ -87,7 +87,7 @@ New-ServiceFabricService -Stateless -PartitionSchemeSingleton -ApplicationName f
#OR if updating existing version: #OR if updating existing version:
Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.17 -Monitored -FailureAction rollback Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.18 -Monitored -FailureAction rollback
``` ```
## Using FabricHealer ## Using FabricHealer

Просмотреть файл

@ -1,4 +1,4 @@
## FabricHealer 1.1.17 ## FabricHealer 1.1.18
### Configuration as Logic and auto-mitigation in Service Fabric clusters ### Configuration as Logic and auto-mitigation in Service Fabric clusters
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric