1.1.18: + rule tracing, upgrade mod, RepairPolicy mod, DeactivateFabricNode predicate.
This commit is contained in:
Родитель
c648b6c98a
Коммит
0ab8864831
|
@ -23,11 +23,11 @@ function Build-SFPkg {
|
|||
try {
|
||||
Push-Location $scriptPath
|
||||
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
|
||||
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
|
||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
|
||||
}
|
||||
finally {
|
||||
Pop-Location
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
},
|
||||
"applicationTypeVersionFabricHealer": {
|
||||
"type": "string",
|
||||
"defaultValue": "1.1.17",
|
||||
"defaultValue": "1.1.18",
|
||||
"metadata": {
|
||||
"description": "Provide the app version number of FabricHealer. This must be identical to the version specified in the sfpkg."
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"value": "<YOUR-CLUSTER-RESOURCE-NAME>"
|
||||
},
|
||||
"applicationTypeVersionFabricHealer": {
|
||||
"value": "1.1.17"
|
||||
"value": "1.1.18"
|
||||
},
|
||||
"packageUrlFabricHealer": {
|
||||
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>"
|
|
@ -44,7 +44,7 @@ Here is a full example of exactly what is sent in one of these telemetry events,
|
|||
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
|
||||
"ClusterType": "SFRP",
|
||||
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
|
||||
"FHVersion": "1.1.17",
|
||||
"FHVersion": "1.1.18",
|
||||
"UpTime": "00:00:00.2164523",
|
||||
"Timestamp": "2023-02-07T21:45:25.2443014Z",
|
||||
"OS": "Windows",
|
||||
|
|
|
@ -52,6 +52,9 @@
|
|||
<None Update="PackageRoot\Config\LogicRules\MachineRules.guan">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="TestApp42.zip">
|
||||
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||
</None>
|
||||
<None Update="testrules_wellformed.guan">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</None>
|
||||
|
|
|
@ -47,7 +47,7 @@ namespace FHTest
|
|||
private const string FHProxyId = "FabricHealerProxy";
|
||||
|
||||
[ClassInitialize]
|
||||
public static void TestClassStartUp(TestContext testContext)
|
||||
public static async Task TestClassStartUp(TestContext testContext)
|
||||
{
|
||||
if (!IsLocalSFRuntimePresent())
|
||||
{
|
||||
|
@ -93,6 +93,8 @@ namespace FHTest
|
|||
{
|
||||
TelemetryEnabled = false
|
||||
};
|
||||
|
||||
await DeployTestApp42Async();
|
||||
}
|
||||
|
||||
/* Helpers */
|
||||
|
@ -168,6 +170,116 @@ namespace FHTest
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Deploys the fabric:/TestApp42 test application unless it is already deployed on the test node.
/// When the application instance or its application type is already registered, the stale
/// registration is removed and the deployment is attempted again, a bounded number of times.
/// </summary>
/// <param name="retriesRemaining">
/// Number of clean-up-and-redeploy attempts still permitted. When it reaches zero, an
/// "already exists" conflict is rethrown instead of recursing again.
/// </param>
/// <returns>A Task.</returns>
private static async Task DeployTestApp42Async(int retriesRemaining = 3)
{
    string appName = "fabric:/TestApp42";

    // If fabric:/TestApp42 is already installed, exit.
    var deployedTestApp =
        await fabricClient.QueryManager.GetDeployedApplicationListAsync(
                NodeName,
                new Uri(appName),
                TimeSpan.FromSeconds(30),
                token);

    if (deployedTestApp?.Count > 0)
    {
        return;
    }

    string appType = "TestApp42Type";
    string appVersion = "1.0.0";

    // Change this to suit your configuration (so, if you are on Windows and you installed SF on a different drive, for example).
    string imageStoreConnectionString = @"file:C:\SfDevCluster\Data\ImageStoreShare";
    string packagePathInImageStore = "TestApp42";
    string packagePathZip = Path.Combine(Environment.CurrentDirectory, "TestApp42.zip");

    // The zip is extracted under the current directory; the app package itself lives in the Release sub-folder.
    string extractionRoot = Path.Combine(Environment.CurrentDirectory, "TestApp42");
    string packagePath = Path.Combine(extractionRoot, "Release");

    try
    {
        // Unzip the compressed TestApp42 app package (overwriting any previous extraction).
        System.IO.Compression.ZipFile.ExtractToDirectory(packagePathZip, extractionRoot, true);

        // Copy the TestApp42 app package to a location in the image store.
        fabricClient.ApplicationManager.CopyApplicationPackage(imageStoreConnectionString, packagePath, packagePathInImageStore);

        // Provision the TestApp42 application type.
        await fabricClient.ApplicationManager.ProvisionApplicationAsync(packagePathInImageStore);

        // Create the TestApp42 app instance.
        ApplicationDescription appDesc = new(new Uri(appName), appType, appVersion);
        await fabricClient.ApplicationManager.CreateApplicationAsync(appDesc);

        // This is a hack. Without this delay, the deployed test services may not have populated the FC cache?
        // You may need to increase this value depending upon your dev machine? You'll find out..
        await Task.Delay(TimeSpan.FromSeconds(15));
    }
    catch (FabricException fe)
    {
        bool appAlreadyExists = fe.ErrorCode == FabricErrorCode.ApplicationAlreadyExists;
        bool appTypeAlreadyExists = fe.ErrorCode == FabricErrorCode.ApplicationTypeAlreadyExists;

        // Any other Fabric error is ignored here, exactly as before (best-effort test setup).
        // NOTE(review): consider surfacing these so a failed deployment does not show up later as unrelated test failures.
        if (!appAlreadyExists && !appTypeAlreadyExists)
        {
            return;
        }

        // Guard against endless recursion when the conflict cannot be cleared.
        if (retriesRemaining <= 0)
        {
            throw;
        }

        if (appAlreadyExists)
        {
            await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(new Uri(appName)) { ForceDelete = true });
        }
        else
        {
            // The type is provisioned: remove any existing instance first, then unprovision the type.
            var appList = await fabricClient.QueryManager.GetApplicationListAsync(new Uri(appName));

            if (appList.Count > 0)
            {
                await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(new Uri(appName)) { ForceDelete = true });
            }

            await fabricClient.ApplicationManager.UnprovisionApplicationAsync(appType, appVersion);
        }

        await DeployTestApp42Async(retriesRemaining - 1);
    }
}
|
||||
|
||||
/// <summary>
/// Reports whether the named application currently has at least one service.
/// </summary>
/// <param name="appName">Application name URI string, e.g. fabric:/TestApp42.</param>
/// <returns>
/// true when the service query returns one or more services; false when it returns null or an empty list,
/// or when the query throws FabricElementNotFoundException.
/// </returns>
private static async Task<bool> EnsureTestServicesExistAsync(string appName)
{
    try
    {
        var serviceList = await fabricClient.QueryManager.GetServiceListAsync(new Uri(appName));

        if (serviceList == null)
        {
            return false;
        }

        return serviceList.Count > 0;
    }
    catch (FabricElementNotFoundException)
    {
        // The queried element was not found, so there are no services to report.
        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Removes the fabric:/TestApp42 test application: its package in the image store, its
/// ChildProcessCreator service, the application instance, and finally the application type.
/// Does nothing when the application has no services.
/// </summary>
/// <returns>A Task.</returns>
private static async Task RemoveTestApplicationsAsync()
{
    // TestApp42 \\

    // Nothing to tear down unless the test app's services are present.
    if (!await EnsureTestServicesExistAsync("fabric:/TestApp42"))
    {
        return;
    }

    string imageStorePath = @"file:C:\SfDevCluster\Data\ImageStoreShare";
    string imageStorePackageFolder = "TestApp42";
    var testAppUri = new Uri("fabric:/TestApp42");
    var childProcessCreatorUri = new Uri("fabric:/TestApp42/ChildProcessCreator");

    // Remove the application package that was copied to the image store.
    fabricClient.ApplicationManager.RemoveApplicationPackage(imageStorePath, imageStorePackageFolder);

    // Delete the test service.
    await fabricClient.ServiceManager.DeleteServiceAsync(new DeleteServiceDescription(childProcessCreatorUri));

    // Delete the application instance.
    await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(testAppUri));

    // Un-provision the application type.
    await fabricClient.ApplicationManager.UnprovisionApplicationAsync("TestApp42Type", "1.0.0");
}
|
||||
|
||||
[ClassCleanup]
|
||||
public static async Task TestClassCleanupAsync()
|
||||
{
|
||||
|
@ -176,6 +288,7 @@ namespace FHTest
|
|||
|
||||
// Ensure FHProxy cleans up its health reports.
|
||||
FabricHealerProxy.Instance.Close();
|
||||
await RemoveTestApplicationsAsync();
|
||||
}
|
||||
|
||||
/* GuanLogic Tests
|
||||
|
@ -336,6 +449,7 @@ namespace FHTest
|
|||
};
|
||||
|
||||
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "ReplicaRules.guan");
|
||||
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "ReplicaRules.guan";
|
||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
||||
|
||||
try
|
||||
|
@ -379,6 +493,7 @@ namespace FHTest
|
|||
|
||||
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "SystemServiceRules.guan");
|
||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
||||
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "SystemServiceRules.guan";
|
||||
|
||||
try
|
||||
{
|
||||
|
@ -490,6 +605,7 @@ namespace FHTest
|
|||
{
|
||||
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "MachineRules.guan");
|
||||
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
|
||||
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "MachineRules.guan";
|
||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
|
||||
int escalationCount = 4; // reboot, reimage, heal, triage.
|
||||
RepairTaskList repairTasks = null;
|
||||
|
@ -771,7 +887,7 @@ namespace FHTest
|
|||
{
|
||||
// The service here must be one that is running in your test cluster.
|
||||
// TODO: install a local test app as part of tests.
|
||||
ServiceName = "fabric:/BadApp/BadService",
|
||||
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
|
||||
NodeName = NodeName,
|
||||
// Specifying Source is Required for unit tests.
|
||||
// For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here.
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
<Parameter Name="EnableAutoMitigation" Value="true" />
|
||||
<Parameter Name="EnableOperationalTelemetry" Value="false" />
|
||||
<Parameter Name="EnableRollingServiceRestarts" Value="true" />
|
||||
<Parameter Name="EnableLogicRuleTracing" Value="true" />
|
||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||
<Parameter Name="LocalLogPath" Value="fabric_healer_testlogs" />
|
||||
|
||||
|
|
Двоичный файл не отображается.
|
@ -2,7 +2,7 @@
|
|||
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
|
||||
<metadata minClientVersion="3.3.0">
|
||||
<id>%PACKAGE_ID%</id>
|
||||
<version>1.1.17</version>
|
||||
<version>1.1.18</version>
|
||||
<releaseNotes>
|
||||
This release requires Service Fabric runtime version 9 and higher and at least Service Fabric SDK version 6.0.1017. There are several changes and improvements in this
|
||||
release including a new machine repair model, updated logic rules, bug fixes, and many code improvements.
|
||||
|
|
|
@ -25,8 +25,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
|
|||
Documentation\OperationalTelemetry.md = Documentation\OperationalTelemetry.md
|
||||
README.md = README.md
|
||||
Documentation\Deployment\service-fabric-healer.json = Documentation\Deployment\service-fabric-healer.json
|
||||
Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json
|
||||
Documentation\Using.md = Documentation\Using.md
|
||||
Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csproj", "{8D9712BF-C026-4A36-B6D1-6345137D3B6F}"
|
||||
|
|
|
@ -12,8 +12,8 @@
|
|||
<RuntimeIdentifier>win-x64</RuntimeIdentifier>-->
|
||||
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>
|
||||
<Product>FabricHealer</Product>
|
||||
<Version>1.1.17</Version>
|
||||
<FileVersion>1.1.17</FileVersion>
|
||||
<Version>1.1.18</Version>
|
||||
<FileVersion>1.1.18</FileVersion>
|
||||
<StartupObject>FabricHealer.Program</StartupObject>
|
||||
<Platforms>x64</Platforms>
|
||||
</PropertyGroup>
|
||||
|
|
|
@ -29,10 +29,15 @@ namespace FabricHealer
|
|||
{
|
||||
internal static TelemetryUtilities TelemetryUtilities;
|
||||
internal static RepairData RepairHistory;
|
||||
public static StatelessServiceContext ServiceContext;
|
||||
|
||||
internal static bool EnableRuleTracing { get; set; } = true;
|
||||
|
||||
public static StatelessServiceContext ServiceContext { get; private set; }
|
||||
|
||||
public static string CurrentlyExecutingLogicRulesFileName { get; set; }
|
||||
|
||||
// Folks often use their own version numbers. This is for internal diagnostic telemetry.
|
||||
private const string InternalVersionNumber = "1.1.17";
|
||||
private const string InternalVersionNumber = "1.1.18";
|
||||
private static FabricHealerManager singleton;
|
||||
private static FabricClient _fabricClient;
|
||||
private bool disposedValue;
|
||||
|
@ -305,7 +310,7 @@ namespace FabricHealer
|
|||
|
||||
// First, let's clean up any orphaned non-node level FabricHealer repair tasks left pending. This will also resume Fabric Node repairs that
|
||||
// FH owns and was executing at the time FH exited. Only FH-owned repairs will be canceled, not repairs conducted by other executors.
|
||||
await CancelOrResumeAllRunningFHRepairsAsync();
|
||||
await CancelAbandonedFHRepairsAsync();
|
||||
|
||||
// Run until RunAsync token is cancelled.
|
||||
while (!Token.IsCancellationRequested)
|
||||
|
@ -467,17 +472,26 @@ namespace FabricHealer
|
|||
{
|
||||
// FH looks for and resumes FabricNode restart repair jobs when it starts up (so, it will pick up where it left off in the safe restart sequence
|
||||
// when the Fabric node hosting FH is the one FH restarted).
|
||||
if (JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true)
|
||||
&& exData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
|
||||
if (!JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true))
|
||||
{
|
||||
if (isClosing)
|
||||
continue;
|
||||
}
|
||||
|
||||
// This would mean that the job has node-level Impact and its state is at least Approved.
|
||||
if (repair.Impact is NodeRepairImpactDescription impact)
|
||||
{
|
||||
if (impact.ImpactedNodes.Any(
|
||||
n => n.NodeName == exData.RepairPolicy.NodeName
|
||||
&& (n.ImpactLevel == NodeImpactLevel.Restart ||
|
||||
n.ImpactLevel == NodeImpactLevel.RemoveData ||
|
||||
n.ImpactLevel == NodeImpactLevel.RemoveNode)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Was max execution time configured by user?
|
||||
if (exData != null && exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
||||
if (exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
||||
{
|
||||
maxFHExecutorTime = exData.RepairPolicy.MaxExecutionTime;
|
||||
}
|
||||
|
@ -492,7 +506,7 @@ namespace FabricHealer
|
|||
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TimeoutException)
|
||||
{
|
||||
#if DEBUG
|
||||
RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobsAsync Failure:{Environment.NewLine}{e}");
|
||||
RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobs Failure:{Environment.NewLine}{e}");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -531,15 +545,13 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state).
|
||||
/// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing.
|
||||
/// Cancels all FabricHealer repair tasks currently in flight.
|
||||
/// </summary>
|
||||
/// <returns>A Task.</returns>
|
||||
private static async Task CancelOrResumeAllRunningFHRepairsAsync()
|
||||
private static async Task CancelAbandonedFHRepairsAsync()
|
||||
{
|
||||
try
|
||||
{
|
||||
|
||||
var currentFHRepairTasksInProgress =
|
||||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
|
||||
|
@ -582,49 +594,11 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// Try and cancel existing repair. We may need to create a new one for abandoned repairs where FH goes down for some reason.
|
||||
// Note: CancelRepairTaskAsync handles exceptions (IOE) that may be thrown by RM due to state change policy.
|
||||
// The repair state could change to Completed after this call is made, for example, and before RM API call.
|
||||
if (repair.State != RepairTaskState.Completed)
|
||||
{
|
||||
await FabricRepairTasks.CancelRepairTaskAsync(repair);
|
||||
}
|
||||
|
||||
/* Resume interrupted Fabric Node restart repairs */
|
||||
|
||||
// There is no need to resume simple repairs that do not require multiple repair steps (e.g., codepackage/process/replica restarts).
|
||||
if (repairExecutorData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
string errorCode = repairExecutorData.RepairPolicy.Code;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(errorCode))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// File Deletion repair is a node-level (VM) repair, but is not multi-step. Ignore.
|
||||
if (repairExecutorData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fabric System service warnings/errors from FO can be Node level repair targets (e.g., Fabric binary needs to be restarted).
|
||||
// FH will restart the node hosting the troubled SF system service if specified in related logic rules.
|
||||
var repairRules =
|
||||
GetRepairRulesFromConfiguration(
|
||||
!string.IsNullOrWhiteSpace(
|
||||
repairExecutorData.RepairPolicy.ProcessName) ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.FabricNodeRepairPolicySectionName);
|
||||
|
||||
var repairData = new TelemetryData
|
||||
{
|
||||
NodeName = repairExecutorData.RepairPolicy.NodeName,
|
||||
Code = errorCode,
|
||||
};
|
||||
|
||||
await RunGuanQueryAsync(repairData, repairRules, Token, repairExecutorData);
|
||||
RepairLogger.LogInfo("Exiting CancelOrResumeAllRunningFHRepairsAsync: Completed.");
|
||||
}
|
||||
}
|
||||
|
@ -632,13 +606,13 @@ namespace FabricHealer
|
|||
{
|
||||
if (e is FabricException)
|
||||
{
|
||||
RepairLogger.LogWarning($"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}");
|
||||
RepairLogger.LogWarning($"Could not cancel FH repair tasks. Failed with:{Environment.NewLine}{e}");
|
||||
}
|
||||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"CancelOrResumeAllRunningFHRepairsAsync",
|
||||
$"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}",
|
||||
$"Could not cancel abandoned FH repair tasks. Failed with:{Environment.NewLine}{e}",
|
||||
Token,
|
||||
null,
|
||||
ConfigSettings.EnableVerboseLogging);
|
||||
|
@ -1561,7 +1535,7 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// Disk?
|
||||
// Disk repair?
|
||||
if (repairData.EntityType == EntityType.Disk)
|
||||
{
|
||||
if (!ConfigSettings.EnableDiskRepair)
|
||||
|
@ -1573,7 +1547,7 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// Fabric node?
|
||||
// Fabric Node repair?
|
||||
if (repairData.EntityType == EntityType.Node)
|
||||
{
|
||||
if (!ConfigSettings.EnableFabricNodeRepair)
|
||||
|
@ -1581,17 +1555,13 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
// FabricHealerProxy-generated report, so a restart fabric node request, for example.
|
||||
// FabricObserver/FabricHealerProxy-generated health report.
|
||||
await ProcessFabricNodeHealthAsync(evt, repairData);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Machine repair \\
|
||||
if (!ConfigSettings.EnableMachineRepair)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// Machine-level repair \\
|
||||
|
||||
// Make sure that there is not already an Infra repair in progress for the target node.
|
||||
if (await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token))
|
||||
|
@ -1617,7 +1587,7 @@ namespace FabricHealer
|
|||
|
||||
/* Start repair workflow */
|
||||
|
||||
string repairId = $"MachineRepair_{nodeType}_{repairData.NodeName}";
|
||||
string repairId = $"MachineRepair_{repairData.NodeName}";
|
||||
repairData.RepairPolicy = new RepairPolicy
|
||||
{
|
||||
RepairId = repairId,
|
||||
|
@ -1752,8 +1722,8 @@ namespace FabricHealer
|
|||
return;
|
||||
}
|
||||
|
||||
// There is only one supported repair for a FabricNode: Restart.
|
||||
string repairId = $"{repairData.NodeName}_{repairData.NodeType}_Restart";
|
||||
string action = repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode ? "Deactivate" : "Restart";
|
||||
string repairId = $"{repairData.NodeName}_{repairData.NodeType}_{action}";
|
||||
|
||||
var currentRepairs =
|
||||
await RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FHTaskIdPrefix, Token);
|
||||
|
@ -2173,6 +2143,7 @@ namespace FabricHealer
|
|||
return null;
|
||||
}
|
||||
|
||||
CurrentlyExecutingLogicRulesFileName = logicRulesConfigFileName;
|
||||
List<string> repairRules = ParseRulesFile(rules);
|
||||
return repairRules;
|
||||
}
|
||||
|
|
|
@ -1,11 +1,6 @@
|
|||
## Logic rules for Service Fabric Node repairs.
|
||||
## Logic rule examples for Service Fabric Node repairs.
|
||||
## These repairs are not executed by FabricHealer. FH creates repair tasks with the correct node impact specified and RM takes it from there.
|
||||
|
||||
## First check if we are inside the run interval. If so, cut (!). This means that no other rules will be processed (no back-tracking).
|
||||
## This is commented out by default. Just uncomment and set the global run interval for app Fabric node level repairs to suit your needs.
|
||||
|
||||
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
|
||||
|
||||
## This rule means that whatever the Fabric node-level warning data from the issuing service happens to be, restart the target Fabric node if
|
||||
## the repair hasn't run 4 times in the last 8 hours.
|
||||
|
||||
Mitigate() :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, RestartFabricNode(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:45:00, MaxExecutionTime=02:00:00).
|
||||
## Restart/Deactivate. Try Restart twice in 8 hour window. Else, deactivate (Pause) the Fabric node.
|
||||
Mitigate(HealthState=Error) :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 2, !, RestartFabricNode().
|
||||
Mitigate(HealthState=Error) :- DeactivateFabricNode().
|
|
@ -1,31 +1,31 @@
|
|||
## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is always Machine.
|
||||
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for these types of Repair Jobs.
|
||||
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for machine-level Repair Jobs.
|
||||
|
||||
## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself.
|
||||
## Any argument below with (FO/FHProxy) means that only FO or FHProxy will present the fact.
|
||||
## | Argument Name | Definition |
|
||||
## |---------------------------|------------------------------------------------------------------------|
|
||||
## | NodeName | Name of the node |
|
||||
## | NodeType | Type of node |
|
||||
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
|
||||
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
|
||||
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
|
||||
## | NodeName | Name of the node |
|
||||
## | NodeType | Type of node |
|
||||
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
|
||||
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
|
||||
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
|
||||
## | OS | The name of the OS where FabricHealer is running (Linux or Windows) |
|
||||
## | HealthState | The HealthState of the target entity: Error or Warning |
|
||||
## | Source | The Source ID of the related SF Health Event |
|
||||
## | Source | The Source ID of the related SF Health Event |
|
||||
## | Property | The Property of the related SF Health Event |
|
||||
|
||||
## Metric Names, from FO or FHProxy.
|
||||
## | Name |
|
||||
## | Name |
|
||||
## |--------------------------------|
|
||||
## | ActiveTcpPorts |
|
||||
## | CpuPercent |
|
||||
## | EphemeralPorts |
|
||||
## | EphemeralPortsPercent |
|
||||
## | MemoryMB |
|
||||
## | MemoryPercent |
|
||||
## | Handles (Linux-only) |
|
||||
## | HandlesPercent (Linux-only) |
|
||||
## | ActiveTcpPorts |
|
||||
## | CpuPercent |
|
||||
## | EphemeralPorts |
|
||||
## | EphemeralPortsPercent |
|
||||
## | MemoryMB |
|
||||
## | MemoryPercent |
|
||||
## | Handles (Linux-only) |
|
||||
## | HandlesPercent (Linux-only) |
|
||||
|
||||
|
||||
## The logic program below is a repair specification (policy) that does not require facts from FabricObserver (FO) or FHProxy.
|
||||
|
@ -34,8 +34,8 @@
|
|||
## Don't proceed if the target entity is not in Error.
|
||||
Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
|
||||
|
||||
## Don't proceed if FabricObserver's NodeObserver is not the source of the Error event.
|
||||
##Mitigate(Source=?source) :- not(match(?source, "NodeObserver")), !.
|
||||
## Don't proceed unless the specified watchdog created the Error health event.
|
||||
Mitigate(Source=?source) :- not(match(?source, "EventLogWatchdog")), !.
|
||||
|
||||
## Don't proceed if there are already 2 or more machine repairs currently active in the cluster.
|
||||
Mitigate() :- CheckOutstandingRepairs(2), !.
|
||||
|
@ -47,9 +47,15 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
|
|||
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
|
||||
|
||||
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours.
|
||||
Mitigate() :- CheckInsideHealthStateMinDuration(02:00:00), !.
|
||||
Mitigate() :- CheckInsideHealthStateMinDuration(00:01:00), !.
|
||||
|
||||
## Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH).
|
||||
## For certain environments, the correct mitigation is to deactivate the target node. The below rule schedules a node deactivation (intent is Pause) repair.
|
||||
|
||||
Mitigate(Source=?source, Property=?property) :- match(?source, "EventLogWatchdog"),
|
||||
match(?property, "CriticalMachineFailure"), !,
|
||||
DeactivateFabricNode(ImpactLevel=RemoveData).
|
||||
|
||||
## Infra Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH).
|
||||
## The logic below demonstrates how to specify a repair escalation path: Reboot -> Reimage -> Heal -> Triage (human intervention required).
|
||||
## ScheduleMachineRepair predicate takes any repair action string. There are a handful that are supported by RepairManager/InfrastructureService, like below.
|
||||
|
||||
|
@ -69,4 +75,4 @@ Mitigate() :- GetRepairHistory(?repairCount, 08:00:00, System.Azure.Heal), ?repa
|
|||
## from scheduling any other machine repairs for the target node until canceled. It also counts against the number of concurrent Active repairs you specified
|
||||
## above in the CheckOutstandingRepairs predicate.
|
||||
Mitigate(NodeName=?nodeName) :- LogInfo("0042_{0}: Specified Machine repair escalations have been exhausted for node {0}. Human intervention is required.", ?nodeName),
|
||||
ScheduleMachineRepair(ManualTriageNeeded).
|
||||
ScheduleMachineRepair(ManualTriageNeeded).
|
|
@ -13,6 +13,8 @@
|
|||
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
|
||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
|
||||
<!-- This will enable FabricHealer to try and trace executed logic rules that employ repair action predicates. -->
|
||||
<Parameter Name="EnableLogicRuleTracing" Value="" MustOverride="true" />
|
||||
|
||||
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<ServiceManifest Name="FabricHealerPkg"
|
||||
Version="1.1.17"
|
||||
Version="1.1.18"
|
||||
xmlns="http://schemas.microsoft.com/2011/01/fabric"
|
||||
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
|
@ -11,7 +11,7 @@
|
|||
</ServiceTypes>
|
||||
|
||||
<!-- Code package is your service executable. -->
|
||||
<CodePackage Name="Code" Version="1.1.17">
|
||||
<CodePackage Name="Code" Version="1.1.18">
|
||||
<EntryPoint>
|
||||
<ExeHost>
|
||||
<Program>FabricHealer</Program>
|
||||
|
@ -21,5 +21,5 @@
|
|||
|
||||
<!-- Config package is the contents of the Config directory under PackageRoot that contains an
|
||||
independently-updateable and versioned set of custom configuration settings for your service. -->
|
||||
<ConfigPackage Name="Config" Version="1.1.17" />
|
||||
<ConfigPackage Name="Config" Version="1.1.18" />
|
||||
</ServiceManifest>
|
|
@ -167,8 +167,9 @@ namespace FabricHealer.Repair
|
|||
|
||||
repairTask = await RepairTaskEngine.CreateInfrastructureRepairTaskAsync(repairData, token);
|
||||
break;
|
||||
|
||||
|
||||
// FH
|
||||
case RepairActionType.DeactivateNode:
|
||||
case RepairActionType.DeleteFiles:
|
||||
case RepairActionType.RestartCodePackage:
|
||||
case RepairActionType.RestartFabricNode:
|
||||
|
@ -184,20 +185,16 @@ namespace FabricHealer.Repair
|
|||
return null;
|
||||
}
|
||||
|
||||
bool success = await CreateRepairTaskAsync(
|
||||
repairTask,
|
||||
repairData,
|
||||
token);
|
||||
|
||||
bool success = await CreateClusterRepairTaskAsync(repairTask, repairData, token);
|
||||
return success ? repairTask : null;
|
||||
}
|
||||
|
||||
private static async Task<bool> CreateRepairTaskAsync(
|
||||
private static async Task<bool> CreateClusterRepairTaskAsync(
|
||||
RepairTask repairTask,
|
||||
TelemetryData repairData,
|
||||
CancellationToken token)
|
||||
{
|
||||
if (repairTask == null)
|
||||
if (repairTask == null || repairData?.RepairPolicy == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
@ -214,29 +211,45 @@ namespace FabricHealer.Repair
|
|||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token);
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}",
|
||||
$"Successfully created repair task {repairTask.TaskId}.",
|
||||
token,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}_AlreadyExists",
|
||||
$"A repair already exists with internal repair Id {repairData.RepairPolicy.RepairId}. Will not schedule another repair.",
|
||||
token,
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
catch (ArgumentException ae)
|
||||
{
|
||||
string message = $"Unable to create repairtask:{Environment.NewLine}{ae}";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"FabricRepairTasks::TryCreateRepairTaskAsync",
|
||||
message,
|
||||
LogLevel.Info,
|
||||
"CreateClusterRepairTaskAsync",
|
||||
$"Unable to create repairtask:{Environment.NewLine}{ae}",
|
||||
token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
catch (FabricException fe)
|
||||
{
|
||||
string message = $"Unable to create repairtask:{Environment.NewLine}{fe}";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Warning,
|
||||
"FabricRepairTasks::TryCreateRepairTaskAsync",
|
||||
message,
|
||||
LogLevel.Info,
|
||||
$"CreateClusterRepairTaskAsync::Failure({repairData.RepairPolicy.RepairId})",
|
||||
$"Unable to create repair task:{Environment.NewLine}{fe}",
|
||||
token,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
@ -264,13 +277,13 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
var allSystemServices =
|
||||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync(
|
||||
new Uri(RepairConstants.SystemAppName),
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync(
|
||||
new Uri(RepairConstants.SystemAppName),
|
||||
null,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
|
||||
cancellationToken);
|
||||
cancellationToken);
|
||||
|
||||
var infraInstances =
|
||||
allSystemServices.Where(i => i.ServiceTypeName.Equals(RepairConstants.InfrastructureServiceType, StringComparison.InvariantCultureIgnoreCase));
|
||||
|
|
|
@ -0,0 +1,139 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using FabricHealer.Utilities;
|
||||
using Guan.Logic;
|
||||
using System;
|
||||
using System.Fabric.Repair;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
/// <summary>
/// Guan predicate type for the DeactivateFabricNode logic rule predicate.
/// When resolved, it fills in the repair policy on the shared repair data from the rule's
/// arguments and asks RepairTaskManager to schedule a node deactivation repair task.
/// </summary>
/// <remarks>
/// NOTE(review): RepairData is a static field that is overwritten on every call to
/// <see cref="Singleton"/>; this looks like it assumes one rule evaluation at a time - confirm.
/// </remarks>
internal class DeactivateFabricNodePredicateType : PredicateType
{
    private static TelemetryData RepairData;
    private static DeactivateFabricNodePredicateType Instance;

    private class Resolver : BooleanPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
        {

        }

        /// <summary>
        /// Parses the predicate arguments into the repair policy, then tries to schedule the repair.
        /// </summary>
        /// <returns>
        /// true when a repair task was scheduled; false when a repair for the node is already
        /// in progress or when scheduling produced no repair task.
        /// </returns>
        /// <exception cref="GuanException">
        /// An argument's type/position/name combination is not one supported by this predicate.
        /// </exception>
        protected override async Task<bool> CheckAsync()
        {
            RepairData.RepairPolicy.RepairAction = RepairActionType.DeactivateNode;
            RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
            RepairData.RepairPolicy.RepairId = $"DeactivateNode::{RepairData.NodeName}";

            if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
            {
                _ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
            }

            int count = Input.Arguments.Count;

            for (int i = 0; i < count; i++)
            {
                // Resolve the argument once; the type name and the value are both derived from it.
                var term = Input.Arguments[i].Value.GetEffectiveTerm();
                object argValue = term.GetObjectValue();
                string typeString = argValue.GetType().Name;
                string argName = Input.Arguments[i].Name;

                // An argument is accepted either by position (only when all 4 arguments are supplied)
                // or by its name, compared ordinally and case-insensitively.
                switch (typeString)
                {
                    case "Boolean" when (i == 0 && count == 4) || IsNamed(argName, "dohealthchecks"):
                        RepairData.RepairPolicy.DoHealthChecks = (bool)argValue;
                        break;

                    case "TimeSpan" when (i == 1 && count == 4) || IsNamed(argName, "maxwaittimeforhealthstateok"):
                        RepairData.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)argValue;
                        break;

                    case "TimeSpan" when (i == 2 && count == 4) || IsNamed(argName, "maxexecutiontime"):
                        RepairData.RepairPolicy.MaxExecutionTime = (TimeSpan)argValue;
                        break;

                    case "String" when (i == 3 && count == 4) || IsNamed(argName, "impactlevel"):
                        RepairData.RepairPolicy.NodeImpactLevel = ParseImpactLevel(term.GetStringValue());
                        break;

                    default:
                        throw new GuanException($"Unsupported argument type for DeactivateFabricNode: {typeString}");
                }
            }

            bool isNodeRepairAlreadyInProgress =
                await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);

            if (isNodeRepairAlreadyInProgress)
            {
                string message =
                    $"A repair for Fabric node {RepairData.NodeName} is already in progress in the cluster.";

                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        $"DeactivateFabricNode::{RepairData.RepairPolicy.RepairId}",
                        message,
                        FabricHealerManager.Token,
                        RepairData,
                        FabricHealerManager.ConfigSettings.EnableVerboseLogging);

                return false;
            }

            // Try to schedule the Deactivate repair with RM (RM will deactivate the node, not FH).
            RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                            () => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
                                                    RepairData,
                                                    FabricHealerManager.Token),
                                            FabricHealerManager.Token);

            return repairTask != null;
        }

        // True when the supplied argument name matches the expected name (ordinal, case-insensitive, null-safe).
        private static bool IsNamed(string argumentName, string expectedName)
        {
            return string.Equals(argumentName, expectedName, StringComparison.OrdinalIgnoreCase);
        }

        // Maps the rule's impact level text to a NodeImpactLevel; any unrecognized text maps to None.
        private static NodeImpactLevel ParseImpactLevel(string value)
        {
            if (string.Equals(value, "removedata", StringComparison.OrdinalIgnoreCase))
            {
                return NodeImpactLevel.RemoveData;
            }

            if (string.Equals(value, "removenode", StringComparison.OrdinalIgnoreCase))
            {
                return NodeImpactLevel.RemoveNode;
            }

            if (string.Equals(value, "restart", StringComparison.OrdinalIgnoreCase))
            {
                return NodeImpactLevel.Restart;
            }

            return NodeImpactLevel.None;
        }
    }

    /// <summary>
    /// Returns the single shared predicate type instance, creating it on first use, and replaces
    /// the repair data that subsequent resolvers will operate on.
    /// </summary>
    /// <param name="name">Predicate name passed to the base type when the instance is first created.</param>
    /// <param name="repairData">Repair data the resolver reads and updates.</param>
    public static DeactivateFabricNodePredicateType Singleton(string name, TelemetryData repairData)
    {
        RepairData = repairData;
        return Instance ??= new DeactivateFabricNodePredicateType(name);
    }

    private DeactivateFabricNodePredicateType(string name)
             : base(name, true, 0)
    {

    }

    /// <summary>
    /// Creates the resolver that evaluates this predicate for the given rule term.
    /// </summary>
    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }
}
|
||||
}
|
||||
|
|
@ -34,6 +34,11 @@ namespace FabricHealer.Repair.Guan
|
|||
return false;
|
||||
}
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.DeleteFiles;
|
||||
bool recurseSubDirectories = false;
|
||||
string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue();
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
// ------------------------------------------------------------
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Guan.Logic;
|
||||
|
@ -27,7 +28,7 @@ namespace FabricHealer.Repair.Guan
|
|||
QueryContext queryContext = new(moduleProvider);
|
||||
queryContext.SetDirection(null, order);
|
||||
Query query = Query.Create(queryExpression, queryContext);
|
||||
await query.GetNextAsync();
|
||||
_ = await query.GetNextAsync();
|
||||
}
|
||||
|
||||
public async Task RunQueryAsync(List<CompoundTerm> queryExpressions, CancellationToken cancellationToken)
|
||||
|
@ -43,7 +44,7 @@ namespace FabricHealer.Repair.Guan
|
|||
QueryContext queryContext = new(moduleProvider);
|
||||
queryContext.SetDirection(null, order);
|
||||
Query query = Query.Create(queryExpressions, queryContext, moduleProvider);
|
||||
await query.GetNextAsync();
|
||||
_ = await query.GetNextAsync();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
|
|||
protected override async Task<bool> CheckAsync()
|
||||
{
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
|
|
|
@ -9,13 +9,11 @@ using Guan.Logic;
|
|||
using FabricHealer.Utilities;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using System.Threading.Tasks;
|
||||
using System.Threading;
|
||||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
public class RestartFabricNodePredicateType : PredicateType
|
||||
{
|
||||
private static RepairExecutorData RepairExecutorData;
|
||||
private static TelemetryData RepairData;
|
||||
private static RestartFabricNodePredicateType Instance;
|
||||
|
||||
|
@ -30,6 +28,13 @@ namespace FabricHealer.Repair.Guan
|
|||
protected override async Task<bool> CheckAsync()
|
||||
{
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
|
||||
RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
|
@ -55,59 +60,6 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
}
|
||||
|
||||
RepairTask repairTask;
|
||||
bool success;
|
||||
|
||||
// This means it's a resumed repair.
|
||||
if (RepairExecutorData != null)
|
||||
{
|
||||
// Historical info, like what step the healer was in when the node went down, is contained in the
|
||||
// executordata instance.
|
||||
repairTask = await RepairTaskEngine.CreateFabricHealerRepairTask(RepairExecutorData, FabricHealerManager.Token);
|
||||
|
||||
if (repairTask == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// MaxExecutionTime impl.
|
||||
using (CancellationTokenSource tokenSource = new())
|
||||
{
|
||||
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
|
||||
tokenSource.Token,
|
||||
FabricHealerManager.Token))
|
||||
{
|
||||
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
|
||||
|
||||
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
||||
{
|
||||
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
|
||||
}
|
||||
|
||||
tokenSource.CancelAfter(maxExecutionTime);
|
||||
tokenSource.Token.Register(() =>
|
||||
{
|
||||
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
||||
});
|
||||
|
||||
// Try to execute repair (FH executor does this work and manages repair state).
|
||||
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
|
||||
repairTask,
|
||||
RepairData,
|
||||
linkedCTS.Token),
|
||||
linkedCTS.Token);
|
||||
|
||||
if (!success && linkedCTS.IsCancellationRequested)
|
||||
{
|
||||
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Block attempts to create node-level repair tasks if one is already running in the cluster.
|
||||
var isNodeRepairAlreadyInProgress =
|
||||
await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);
|
||||
|
@ -128,8 +80,8 @@ namespace FabricHealer.Repair.Guan
|
|||
return false;
|
||||
}
|
||||
|
||||
// Try to schedule repair with RM.
|
||||
repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
// Try to schedule repair with RM for Fabric Node Restart (FH will not be the executor).
|
||||
RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
|
||||
RepairData,
|
||||
FabricHealerManager.Token),
|
||||
|
@ -139,52 +91,13 @@ namespace FabricHealer.Repair.Guan
|
|||
return false;
|
||||
}
|
||||
|
||||
using (CancellationTokenSource tokenSource = new())
|
||||
{
|
||||
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
|
||||
tokenSource.Token,
|
||||
FabricHealerManager.Token))
|
||||
{
|
||||
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
|
||||
|
||||
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
||||
{
|
||||
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
|
||||
}
|
||||
|
||||
tokenSource.CancelAfter(maxExecutionTime);
|
||||
tokenSource.Token.Register(() =>
|
||||
{
|
||||
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
||||
});
|
||||
|
||||
// Try to execute repair (FH executor does this work and manages repair state).
|
||||
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
|
||||
repairTask,
|
||||
RepairData,
|
||||
linkedCTS.Token),
|
||||
linkedCTS.Token);
|
||||
|
||||
if (!success && linkedCTS.IsCancellationRequested)
|
||||
{
|
||||
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public static RestartFabricNodePredicateType Singleton(
|
||||
string name,
|
||||
RepairExecutorData repairExecutorData,
|
||||
TelemetryData repairData)
|
||||
public static RestartFabricNodePredicateType Singleton(string name, TelemetryData repairData)
|
||||
{
|
||||
RepairExecutorData = repairExecutorData;
|
||||
RepairData = repairData;
|
||||
|
||||
return Instance ??= new RestartFabricNodePredicateType(name);
|
||||
}
|
||||
|
||||
|
|
|
@ -34,6 +34,12 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
|
|
|
@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
|
|||
protected override async Task<bool> CheckAsync()
|
||||
{
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
|
|
|
@ -34,6 +34,13 @@ namespace FabricHealer.Repair.Guan
|
|||
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
|
||||
}
|
||||
|
||||
RepairData.RepairPolicy.RepairAction = RepairActionType.Infra;
|
||||
|
||||
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||
{
|
||||
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||
}
|
||||
|
||||
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
|
||||
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
|
||||
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using FabricHealer.Utilities;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using Guan.Logic;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace FabricHealer.Repair.Guan
|
||||
{
|
||||
/// <summary>
/// Guan predicate type for the TraceNextRule logic rule predicate. When resolved, it reads the
/// currently executing rule file, finds the rule text that follows the first TraceNextRule line,
/// and emits that text as an informational telemetry/health event.
/// </summary>
/// <remarks>
/// NOTE(review): RepairData is a static field that is overwritten on every call to
/// <see cref="Singleton"/>; this looks like it assumes one rule evaluation at a time - confirm.
/// NOTE(review): only the FIRST line containing ":- TraceNextRule" in the file is considered,
/// so a file with several TraceNextRule rules will presumably always trace the same one - confirm.
/// </remarks>
internal class TraceNextRulePredicateType : PredicateType
{
    private static TraceNextRulePredicateType Instance;
    private static TelemetryData RepairData;

    private class Resolver : BooleanPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
        {

        }

        /// <summary>
        /// Locates and logs the rule that follows TraceNextRule in the current rule file.
        /// </summary>
        /// <returns>Always false, so that the rule following this one is evaluated.</returns>
        /// <exception cref="GuanException">The rule file does not exist at the computed path.</exception>
        protected override async Task<bool> CheckAsync()
        {
            string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName;

            string ruleFilePath =
                Path.Combine(
                    FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                    "LogicRules",
                    ruleFileName);

            if (!File.Exists(ruleFilePath))
            {
                throw new GuanException($"Specified rule file path does not exist: {ruleFilePath}");
            }

            try
            {
                string[] lines = File.ReadAllLines(ruleFilePath);

                if (TryFindTracedRule(lines, out string rule, out int lineNumber))
                {
                    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                            LogLevel.Info,
                            $"{ruleFileName}#{lineNumber}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
                            $"Executing logic rule \'{rule}\'",
                            FabricHealerManager.Token);
                }
                else
                {
                    // Previously this case surfaced as an index-out-of-range error reported as a file read failure.
                    string message =
                        $"TraceNextRule failure => Unable to locate a logic rule following {RepairConstants.TraceNextRule} in {ruleFileName}.";

                    FabricHealerManager.RepairLogger.LogWarning(message);
                    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                            LogLevel.Info,
                            $"TraceNextRule::{ruleFileName}::Failure",
                            message,
                            FabricHealerManager.Token);
                }
            }
            catch (SystemException e) // ArgumentException and IOException both derive from SystemException.
            {
                string message = $"TraceNextRule failure => Unable to read {ruleFileName}: {e.Message}";
                FabricHealerManager.RepairLogger.LogWarning(message);
                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        $"TraceNextRule::{ruleFileName}::Failure",
                        message,
                        FabricHealerManager.Token);
            }

            // Guarantees the next rule runs. This is critical given TraceNextRule is designed to log the full text of whatever logic rule comes after it in a rule file.
            return false;
        }

        /// <summary>
        /// Finds the first line containing ":- TraceNextRule" and builds the text of the rule after it,
        /// skipping blank lines and lines starting with "##", and joining continuation lines
        /// (lines whose trimmed end is a comma) into a single string.
        /// </summary>
        /// <param name="lines">All lines of the rule file.</param>
        /// <param name="rule">The joined rule text, or an empty string when nothing was found.</param>
        /// <param name="lineNumber">
        /// Zero-based index of the rule's first line, or, for a multi-line rule, of the last line
        /// that ended with a comma. This mirrors the value the event property has always carried.
        /// </param>
        /// <returns>true when a rule was found after the marker; otherwise false.</returns>
        private static bool TryFindTracedRule(string[] lines, out string rule, out int lineNumber)
        {
            rule = string.Empty;
            lineNumber = 0;
            string marker = $":- {RepairConstants.TraceNextRule}";

            for (int i = 0; i < lines.Length; i++)
            {
                if (!lines[i].Contains(marker, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Skip blank lines and comment lines between the marker and the rule, without running off the end.
                int next = i + 1;

                while (next < lines.Length
                       && (string.IsNullOrWhiteSpace(lines[next]) || lines[next].TrimStart().StartsWith("##", StringComparison.Ordinal)))
                {
                    next++;
                }

                if (next >= lines.Length)
                {
                    return false;
                }

                string text = lines[next];
                lineNumber = next;

                // custom rule formatting support: a trailing comma means the rule continues on the following line.
                for (int j = next; j + 1 < lines.Length && lines[j].TrimEnd().EndsWith(','); j++)
                {
                    text += " " + lines[j + 1].Replace('\t', ' ').Trim();
                    lineNumber = j;
                }

                rule = text;
                return true;
            }

            return false;
        }
    }

    /// <summary>
    /// Returns the single shared predicate type instance, creating it on first use, and replaces
    /// the repair data used to label the emitted trace event.
    /// </summary>
    /// <param name="name">Predicate name passed to the base type when the instance is first created.</param>
    /// <param name="repairData">Repair data whose process name or node name is included in the event property.</param>
    public static TraceNextRulePredicateType Singleton(string name, TelemetryData repairData)
    {
        RepairData = repairData;
        return Instance ??= new TraceNextRulePredicateType(name);
    }

    private TraceNextRulePredicateType(string name) : base(name, true, 0)
    {

    }

    /// <summary>
    /// Creates the resolver that evaluates this predicate for the given rule term.
    /// </summary>
    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }
}
|
||||
}
|
||||
|
|
@ -15,6 +15,7 @@ namespace FabricHealer.Repair
|
|||
public enum RepairActionType
|
||||
{
|
||||
Infra,
|
||||
DeactivateNode,
|
||||
DeleteFiles,
|
||||
RemoveFabricNodeState,
|
||||
RemoveReplica,
|
||||
|
|
|
@ -34,6 +34,7 @@ namespace FabricHealer.Repair
|
|||
public const string LocalLogPathParameter = "LocalLogPath";
|
||||
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
|
||||
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
|
||||
public const string EnableLogicRuleTracing = "EnableLogicRuleTracing";
|
||||
|
||||
// General Repair Settings Parameters.
|
||||
public const string EnableAutoMitigation = "EnableAutoMitigation";
|
||||
|
@ -73,6 +74,7 @@ namespace FabricHealer.Repair
|
|||
public const string Source = "Source";
|
||||
|
||||
// Repair Actions.
|
||||
public const string DeactivateFabricNode = "DeactivateFabricNode";
|
||||
public const string DeleteFiles = "DeleteFiles";
|
||||
public const string RestartCodePackage = "RestartCodePackage";
|
||||
public const string RestartFabricNode = "RestartFabricNode";
|
||||
|
@ -100,6 +102,7 @@ namespace FabricHealer.Repair
|
|||
public const string LogInfo = "LogInfo";
|
||||
public const string LogWarning = "LogWarning";
|
||||
public const string LogError = "LogError";
|
||||
public const string TraceNextRule = "TraceNextRule";
|
||||
|
||||
// Metric names.
|
||||
public const string ActiveTcpPorts = "ActiveTcpPorts";
|
||||
|
|
|
@ -13,7 +13,6 @@ using System.Fabric.Query;
|
|||
using System.Fabric.Health;
|
||||
using FabricHealer.Utilities;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using System.Fabric.Repair;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using System.IO;
|
||||
|
@ -22,7 +21,6 @@ using System.Collections.Generic;
|
|||
using System.ComponentModel;
|
||||
using Newtonsoft.Json;
|
||||
using System.Fabric.Description;
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
|
@ -187,310 +185,6 @@ namespace FabricHealer.Repair
|
|||
FabricHealerManager.RepairHistory.SuccessfulRepairs++;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Safely restarts a Service Fabric Node instance.
|
||||
/// Algorithm:
|
||||
/// 1 Deactivate target node.
|
||||
/// 2 Wait for node to get into Disabled/Ok.
|
||||
/// 3 Restart node (which is the Fabric.exe kill API in FaultManager)
|
||||
/// 4 Wait for node to go Down.
|
||||
/// 5 Wait for node to get to Disabled/Ok.
|
||||
/// 5 Activate node.
|
||||
/// 6 Wait for node to get to Up/Ok.
|
||||
/// </summary>
|
||||
/// <param name="repairData">Repair configuration</param>
|
||||
/// <param name="repairTask">The scheduled Repair Task</param>
|
||||
/// <param name="cancellationToken">Task cancellation token</param>
|
||||
/// <returns></returns>
|
||||
public static async Task<bool> SafeRestartFabricNodeAsync(
|
||||
TelemetryData repairData,
|
||||
RepairTask repairTask,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
if (await FabricHealerManager.IsOneNodeClusterAsync())
|
||||
{
|
||||
string info = "One node cluster detected. Aborting node restart operation.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsync::NodeCount_1",
|
||||
info,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
var nodeQueryDesc = new NodeQueryDescription
|
||||
{
|
||||
MaxResults = 5,
|
||||
};
|
||||
|
||||
NodeList nodeList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
|
||||
nodeQueryDesc,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
if (nodeList.Count < 3)
|
||||
{
|
||||
string info = $"Unsupported repair for a {nodeList.Count}-node cluster. Aborting fabric node restart operation.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsync::NodeCount",
|
||||
info,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
FabricHealerManager.RepairLogger.LogInfo(info);
|
||||
return false;
|
||||
}
|
||||
|
||||
ServiceDescription serviceDesc =
|
||||
await FabricHealerManager.FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(
|
||||
FabricHealerManager.ServiceContext.ServiceName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
int instanceCount = (serviceDesc as StatelessServiceDescription).InstanceCount;
|
||||
|
||||
if (instanceCount == -1)
|
||||
{
|
||||
bool isTargetNodeHostingFH = repairData.NodeName == FabricHealerManager.ServiceContext.NodeContext.NodeName;
|
||||
|
||||
if (isTargetNodeHostingFH)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!nodeList.Any(n => n.NodeName == repairData.NodeName))
|
||||
{
|
||||
string info = $"Fabric node {repairData.NodeName} does not exist.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsync::MissingNode",
|
||||
info,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
}
|
||||
|
||||
var nodeInstanceId = nodeList.First(n => n.NodeName == repairData.NodeName).NodeInstanceId;
|
||||
var stopwatch = new Stopwatch();
|
||||
var maxWaitTimeout = TimeSpan.FromMinutes(MaxWaitTimeMinutesForNodeOperation);
|
||||
string actionMessage = $"Attempting to safely restart Fabric node {repairData.NodeName} with InstanceId {nodeInstanceId}.";
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart",
|
||||
actionMessage,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
try
|
||||
{
|
||||
if (!JsonSerializationUtility.TryDeserializeObject(repairTask.ExecutorData, out RepairExecutorData executorData))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (executorData.LatestRepairStep == FabricNodeRepairStep.Scheduled)
|
||||
{
|
||||
executorData.LatestRepairStep = FabricNodeRepairStep.Deactivate;
|
||||
|
||||
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
|
||||
{
|
||||
repairTask.ExecutorData = exData;
|
||||
}
|
||||
else
|
||||
{
|
||||
actionMessage = "Step = Deactivate => Did not successfully serialize executordata.";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::Deactivate",
|
||||
actionMessage,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
|
||||
// Deactivate the node with intent to restart. Several health checks will
|
||||
// take place to ensure safe deactivation, which includes giving services a
|
||||
// chance to gracefully shut down, should they override OnAbort/OnClose.
|
||||
await FabricHealerManager.FabricClientSingleton.ClusterManager.DeactivateNodeAsync(
|
||||
repairData.NodeName,
|
||||
NodeDeactivationIntent.Restart,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken);
|
||||
|
||||
stopwatch.Start();
|
||||
|
||||
// Wait for node to get into Disabled state.
|
||||
while (stopwatch.Elapsed <= maxWaitTimeout)
|
||||
{
|
||||
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() =>
|
||||
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
|
||||
repairData.NodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
if (nodes == null || nodes.Count == 0)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Node targetNode = nodes[0];
|
||||
|
||||
// exit loop, this is the state we're looking for.
|
||||
if (targetNode.NodeStatus == NodeStatus.Disabled)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
await Task.Delay(1000, cancellationToken);
|
||||
}
|
||||
|
||||
stopwatch.Stop();
|
||||
stopwatch.Reset();
|
||||
}
|
||||
|
||||
if (executorData.LatestRepairStep == FabricNodeRepairStep.Deactivate)
|
||||
{
|
||||
executorData.LatestRepairStep = FabricNodeRepairStep.Restart;
|
||||
|
||||
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
|
||||
{
|
||||
repairTask.ExecutorData = exData;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
|
||||
actionMessage = $"In Step Restart Node.{Environment.NewLine}{repairTask.ExecutorData}";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::RestartStep",
|
||||
actionMessage,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Now, restart node.
|
||||
_ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() =>
|
||||
FabricHealerManager.FabricClientSingleton.FaultManager.RestartNodeAsync(
|
||||
repairData.NodeName,
|
||||
nodeInstanceId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
stopwatch.Start();
|
||||
|
||||
// Wait for Disabled/OK
|
||||
while (stopwatch.Elapsed <= maxWaitTimeout)
|
||||
{
|
||||
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() =>
|
||||
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
|
||||
repairData.NodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
Node targetNode = nodes[0];
|
||||
|
||||
// Node is ready to be enabled.
|
||||
if (targetNode.NodeStatus == NodeStatus.Disabled && targetNode.HealthState == HealthState.Ok)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
await Task.Delay(1000, cancellationToken);
|
||||
}
|
||||
|
||||
stopwatch.Stop();
|
||||
stopwatch.Reset();
|
||||
}
|
||||
|
||||
if (executorData.LatestRepairStep == FabricNodeRepairStep.Restart)
|
||||
{
|
||||
executorData.LatestRepairStep = FabricNodeRepairStep.Activate;
|
||||
|
||||
if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
|
||||
{
|
||||
repairTask.ExecutorData = exData;
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
|
||||
// Now, enable the node.
|
||||
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
|
||||
await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
|
||||
|
||||
var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() =>
|
||||
FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
|
||||
repairData.NodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken),
|
||||
cancellationToken);
|
||||
|
||||
Node targetNode = nodes[0];
|
||||
|
||||
// Make sure activation request went through.
|
||||
if (targetNode.NodeStatus == NodeStatus.Disabled && targetNode.HealthState == HealthState.Ok)
|
||||
{
|
||||
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
|
||||
UpdateRepairHistory(repairData);
|
||||
return true;
|
||||
}
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return false;
|
||||
}
|
||||
catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TimeoutException)
|
||||
{
|
||||
string err = $"Handled Exception restarting Fabric node {repairData.NodeName}, NodeInstanceId {nodeInstanceId}:{e.GetType().Name}";
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairExecutor.SafeRestartFabricNodeAsync::HandledException",
|
||||
err,
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
FabricHealerManager.RepairLogger.LogInfo(err);
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Restarts a stateful replica.
|
||||
/// </summary>
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
using System;
|
||||
using System.Diagnostics.Tracing;
|
||||
using System.Fabric.Health;
|
||||
using System.Fabric.Repair;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
|
@ -120,5 +121,6 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
get; set;
|
||||
}
|
||||
public NodeImpactLevel NodeImpactLevel { get; internal set; }
|
||||
}
|
||||
}
|
|
@ -7,11 +7,14 @@ using System;
|
|||
using System.Fabric;
|
||||
using System.Fabric.Health;
|
||||
using System.Fabric.Repair;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using FabricHealer.TelemetryLib;
|
||||
using FabricHealer.Utilities;
|
||||
using FabricHealer.Utilities.Telemetry;
|
||||
using Guan.Logic;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
|
@ -48,15 +51,10 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
}
|
||||
|
||||
NodeImpactLevel impact = executorData.RepairPolicy.RepairAction switch
|
||||
{
|
||||
RepairActionType.RestartFabricNode => NodeImpactLevel.Restart,
|
||||
RepairActionType.RemoveFabricNodeState => NodeImpactLevel.RemoveData,
|
||||
_ => NodeImpactLevel.None
|
||||
};
|
||||
|
||||
var nodeRepairImpact = new NodeRepairImpactDescription();
|
||||
var impactedNode = new NodeImpact(executorData.RepairPolicy.NodeName, impact);
|
||||
NodeImpactLevel impact =
|
||||
executorData.RepairPolicy.NodeImpactLevel != NodeImpactLevel.Invalid ? executorData.RepairPolicy.NodeImpactLevel : NodeImpactLevel.None;
|
||||
NodeRepairImpactDescription nodeRepairImpact = new();
|
||||
NodeImpact impactedNode = new(executorData.RepairPolicy.NodeName, impact);
|
||||
nodeRepairImpact.ImpactedNodes.Add(impactedNode);
|
||||
RepairActionType repairAction = executorData.RepairPolicy.RepairAction;
|
||||
string repair = repairAction.ToString();
|
||||
|
@ -80,11 +78,18 @@ namespace FabricHealer.Repair
|
|||
doHealthChecks = false;
|
||||
}
|
||||
|
||||
string description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}";
|
||||
|
||||
if (impact == NodeImpactLevel.Restart || impact == NodeImpactLevel.RemoveData)
|
||||
{
|
||||
description = executorData.RepairPolicy.RepairId;
|
||||
}
|
||||
|
||||
var repairTask = new ClusterRepairTask(taskId, repair)
|
||||
{
|
||||
Target = new NodeRepairTargetDescription(executorData.RepairPolicy.NodeName),
|
||||
Impact = nodeRepairImpact,
|
||||
Description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}",
|
||||
Description = description,
|
||||
State = RepairTaskState.Preparing,
|
||||
Executor = RepairConstants.FabricHealer,
|
||||
ExecutorData = JsonSerializationUtility.TrySerializeObject(executorData, out string exData) ? exData : null,
|
||||
|
@ -461,5 +466,99 @@ namespace FabricHealer.Repair
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
/// Finds the logic rule in the currently executing rules file that contains the supplied predicate and
/// emits it as an Info telemetry event. The event source is built from the rule file name, the zero-based
/// index of the rule's first line, and the target process (or node) name.
/// </summary>
/// <param name="predicate">Predicate text to look for. Single quotes, double quotes and spaces are ignored and the match is case-insensitive.</param>
/// <param name="repairData">Repair data; RepairPolicy.ProcessName (or NodeName when ProcessName is null) is appended to the event source.</param>
/// <returns>
/// true if a rule was located and traced; false if the predicate is empty, the rule file does not exist or
/// cannot be read, or no rule text could be assembled for the predicate.
/// </returns>
internal static async Task<bool> TryTraceCurrentlyExecutingRule(string predicate, TelemetryData repairData)
{
    string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName, rule = string.Empty;
    int lineNumber = 0;

    // Normalize up front: an empty search term would "match" the first non-comment line of the file.
    predicate = predicate?.Replace("'", "").Replace("\"", "").Replace(" ", "");

    if (string.IsNullOrWhiteSpace(predicate))
    {
        FabricHealerManager.RepairLogger.LogWarning("TryTraceCurrentlyExecutingRule: No predicate was supplied.");
        return false;
    }

    try
    {
        string ruleFilePath =
            Path.Combine(
                FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                "LogicRules",
                ruleFileName);

        if (!File.Exists(ruleFilePath))
        {
            FabricHealerManager.RepairLogger.LogWarning($"TryTraceCurrentlyExecutingRule: Specified rule file path does not exist: {ruleFilePath}.");
            return false;
        }

        string[] lines = File.ReadAllLines(ruleFilePath);
        int length = lines.Length;

        for (int i = 0; i < length; i++)
        {
            // Compare against the same normalized form used for the predicate.
            string line = lines[i].Replace("'", "").Replace("\"", "").Replace(" ", "");

            // Skip comment lines (##) and blank lines.
            if (line.Contains("##") || string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            if (!line.Contains(predicate, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // First match wins. From here on, work with the raw (un-normalized) text.
            lineNumber = i;
            line = lines[i];

            // final (repair) predicate ends with a . in FH.
            if (line.TrimEnd().EndsWith('.'))
            {
                rule = line.Replace('\t', ' ');

                // If the line does not hold the rule head (:-), the rule spans several lines:
                // walk upwards and prepend the earlier lines of the same rule.
                if (!line.Contains(":-"))
                {
                    for (int j = lineNumber - 1; j >= 0; j--)
                    {
                        string previous = lines[j].Trim();

                        // Same skip criteria as the forward scan: blank and comment lines carry no rule text.
                        if (previous.Length == 0 || previous.Contains("##"))
                        {
                            continue;
                        }

                        // A terminated line is the end of the preceding rule; nothing above it belongs to this one.
                        if (previous.EndsWith('.'))
                        {
                            break;
                        }

                        rule = previous.Replace('\t', ' ') + ' ' + rule;
                        lineNumber = j;

                        // Reached the head of the rule.
                        if (previous.Contains(":-") || previous.StartsWith("Mitigate", StringComparison.Ordinal))
                        {
                            break;
                        }
                    }
                }
            }

            break;
        }

        // Nothing to trace: the predicate was not found, or the matching line is not the end of a rule.
        if (string.IsNullOrWhiteSpace(rule))
        {
            FabricHealerManager.RepairLogger.LogWarning($"TryTraceCurrentlyExecutingRule: Unable to locate a rule for the specified predicate in {ruleFileName}.");
            return false;
        }

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"{ruleFileName}#{lineNumber}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
                $"Executing logic rule \'{rule}\'",
                FabricHealerManager.Token);

        return true;
    }
    catch (Exception e) when (e is ArgumentException || e is IOException || e is SystemException)
    {
        string message = $"TraceCurrentlyExecutingRule failure => Unable to read {ruleFileName}: {e.Message}";
        FabricHealerManager.RepairLogger.LogWarning(message);

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"TraceCurrentlyExecutingRule::{ruleFileName}::Failure",
                message,
                FabricHealerManager.Token);
    }

    return false;
}
|
||||
}
|
||||
}
|
|
@ -17,7 +17,6 @@ using FabricHealer.Utilities.Telemetry;
|
|||
using Guan.Logic;
|
||||
using FabricHealer.Repair.Guan;
|
||||
using FabricHealer.Utilities;
|
||||
using System.Fabric.Description;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
|
@ -37,32 +36,6 @@ namespace FabricHealer.Repair
|
|||
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(nodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Delegates to RepairExecutor.SafeRestartFabricNodeAsync and then emits an Info event
/// (via TelemetryUtilities.EmitTelemetryEtwHealthEventAsync) that states whether the node was restarted.
/// </summary>
/// <param name="repairData">Repair data identifying the target node (NodeName is used in the event message).</param>
/// <param name="repairTask">Repair task passed through to the executor.</param>
/// <param name="cancellationToken">Token passed to the executor and to the telemetry call.</param>
/// <returns>The result of RepairExecutor.SafeRestartFabricNodeAsync: true on success, otherwise false.</returns>
public static async Task<bool> SafeRestartServiceFabricNodeAsync(TelemetryData repairData, RepairTask repairTask, CancellationToken cancellationToken)
{
    bool restarted = await RepairExecutor.SafeRestartFabricNodeAsync(repairData, repairTask, cancellationToken);

    // Same event either way; only the message reflects the outcome.
    string outcome = restarted
        ? $"Successfully restarted Fabric node {repairData.NodeName}"
        : $"Did not restart Fabric node {repairData.NodeName}";

    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
            LogLevel.Info,
            "SafeRestartFabricNodeAsync",
            outcome,
            cancellationToken,
            repairData,
            FabricHealerManager.ConfigSettings.EnableVerboseLogging);

    return restarted;
}
|
||||
|
||||
public static async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken)
|
||||
{
|
||||
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
||||
|
@ -120,10 +93,10 @@ namespace FabricHealer.Repair
|
|||
/// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param>
|
||||
/// <returns></returns>
|
||||
public static async Task RunGuanQueryAsync(
|
||||
TelemetryData repairData,
|
||||
List<string> repairRules,
|
||||
CancellationToken cancellationToken,
|
||||
RepairExecutorData repairExecutorData = null)
|
||||
TelemetryData repairData,
|
||||
List<string> repairRules,
|
||||
CancellationToken cancellationToken,
|
||||
RepairExecutorData repairExecutorData = null)
|
||||
{
|
||||
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
||||
{
|
||||
|
@ -142,14 +115,16 @@ namespace FabricHealer.Repair
|
|||
functorTable.Add(LogInfoPredicateType.Singleton(RepairConstants.LogInfo));
|
||||
functorTable.Add(LogErrorPredicateType.Singleton(RepairConstants.LogError));
|
||||
functorTable.Add(LogWarningPredicateType.Singleton(RepairConstants.LogWarning));
|
||||
functorTable.Add(TraceNextRulePredicateType.Singleton(RepairConstants.TraceNextRule, repairData));
|
||||
functorTable.Add(CheckInsideHealthStateMinDurationPredicateType.Singleton(RepairConstants.CheckInsideHealthStateMinDuration, repairData));
|
||||
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairData));
|
||||
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
|
||||
|
||||
// Add external repair predicates.
|
||||
functorTable.Add(DeactivateFabricNodePredicateType.Singleton(RepairConstants.DeactivateFabricNode, repairData));
|
||||
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairData));
|
||||
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairData));
|
||||
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairExecutorData, repairData));
|
||||
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairData));
|
||||
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairData));
|
||||
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairData));
|
||||
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairData));
|
||||
|
@ -514,6 +489,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
// Don't attempt a node-level repair on a node where there is already an active node-level repair.
|
||||
if (repairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode
|
||||
|| repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode
|
||||
&& await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken))
|
||||
{
|
||||
string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
|
||||
|
@ -831,30 +807,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartFabricNode:
|
||||
{
|
||||
var executorData = repairTask.ExecutorData;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(executorData))
|
||||
{
|
||||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"RestartFabricNode::{repairData.NodeName}",
|
||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
||||
cancellationToken,
|
||||
repairData,
|
||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -29,6 +29,11 @@ namespace FabricHealer.Utilities
|
|||
private set;
|
||||
} = 30;
|
||||
|
||||
// Flag for logic rule tracing; readable publicly, assignable only from within this class.
public bool EnableLogicRuleTracing { get; private set; }
|
||||
|
||||
public bool EnableVerboseLogging
|
||||
{
|
||||
get;
|
||||
|
@ -199,6 +204,12 @@ namespace FabricHealer.Utilities
|
|||
OperationalTelemetryEnabled = fhOpTelemEnabled;
|
||||
}
|
||||
|
||||
// Logic rule predicate tracing.
|
||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.EnableLogicRuleTracing), out bool traceRules))
|
||||
{
|
||||
EnableLogicRuleTracing = traceRules;
|
||||
}
|
||||
|
||||
// Repair Policies
|
||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled))
|
||||
{
|
||||
|
|
|
@ -9,7 +9,6 @@ using System.Fabric.Repair;
|
|||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using FabricHealer.TelemetryLib;
|
||||
using FabricHealer.Utilities;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.17" xmlns="http://schemas.microsoft.com/2011/01/fabric">
|
||||
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.18" xmlns="http://schemas.microsoft.com/2011/01/fabric">
|
||||
<Parameters>
|
||||
<!-- FabricHealerManager Settings -->
|
||||
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
|
||||
<Parameter Name="EnableETW" DefaultValue="false" />
|
||||
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
|
||||
<Parameter Name="EnableLogicRuleTracing" DefaultValue="true" />
|
||||
<Parameter Name="EnableTelemetry" DefaultValue="false" />
|
||||
<Parameter Name="EnableVerboseLogging" DefaultValue="true" />
|
||||
<Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" />
|
||||
|
@ -30,7 +31,7 @@
|
|||
should match the Name and Version attributes of the ServiceManifest element defined in the
|
||||
ServiceManifest.xml file. -->
|
||||
<ServiceManifestImport>
|
||||
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.17" />
|
||||
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.18" />
|
||||
<ConfigOverrides>
|
||||
<ConfigOverride Name="Config">
|
||||
<Settings>
|
||||
|
@ -44,6 +45,7 @@
|
|||
<Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" />
|
||||
<Parameter Name="EnableRollingServiceRestarts" Value="[EnableRollingServiceRestarts]" />
|
||||
<Parameter Name="LocalLogPath" Value="[LocalLogPath]" />
|
||||
<Parameter Name="EnableLogicRuleTracing" Value="[EnableLogicRuleTracing]" />
|
||||
</Section>
|
||||
<!-- Repair policies -->
|
||||
<Section Name="AppRepairPolicy">
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
## FabricHealer 1.1.17
|
||||
## FabricHealer 1.1.18
|
||||
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
||||
|
||||
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
||||
|
@ -78,7 +78,7 @@ Register-ServiceFabricApplicationType -ApplicationPathInImageStore FH1110
|
|||
|
||||
#Create FH application (if not already deployed at lesser version):
|
||||
|
||||
New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.17
|
||||
New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.18
|
||||
|
||||
#Create the Service instance:
|
||||
|
||||
|
@ -87,7 +87,7 @@ New-ServiceFabricService -Stateless -PartitionSchemeSingleton -ApplicationName f
|
|||
|
||||
#OR if updating existing version:
|
||||
|
||||
Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.17 -Monitored -FailureAction rollback
|
||||
Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.18 -Monitored -FailureAction rollback
|
||||
```
|
||||
|
||||
## Using FabricHealer
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
## FabricHealer 1.1.17
|
||||
## FabricHealer 1.1.18
|
||||
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
||||
|
||||
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
||||
|
|
Загрузка…
Ссылка в новой задаче