1.1.18: + rule tracing, upgrade mod, RepairPolicy mod, DeactivateFabricNode predicate.
This commit is contained in:
Родитель
c648b6c98a
Коммит
0ab8864831
|
@ -23,11 +23,11 @@ function Build-SFPkg {
|
||||||
try {
|
try {
|
||||||
Push-Location $scriptPath
|
Push-Location $scriptPath
|
||||||
|
|
||||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
|
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
|
||||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
|
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
|
||||||
|
|
||||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
|
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
|
||||||
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.17" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
|
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.18" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
Pop-Location
|
Pop-Location
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
},
|
},
|
||||||
"applicationTypeVersionFabricHealer": {
|
"applicationTypeVersionFabricHealer": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"defaultValue": "1.1.17",
|
"defaultValue": "1.1.18",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"description": "Provide the app version number of FabricHealer. This must be identical to the version specified in the sfpkg."
|
"description": "Provide the app version number of FabricHealer. This must be identical to the version specified in the sfpkg."
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
"value": "<YOUR-CLUSTER-RESOURCE-NAME>"
|
"value": "<YOUR-CLUSTER-RESOURCE-NAME>"
|
||||||
},
|
},
|
||||||
"applicationTypeVersionFabricHealer": {
|
"applicationTypeVersionFabricHealer": {
|
||||||
"value": "1.1.17"
|
"value": "1.1.18"
|
||||||
},
|
},
|
||||||
"packageUrlFabricHealer": {
|
"packageUrlFabricHealer": {
|
||||||
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>"
|
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>"
|
|
@ -44,7 +44,7 @@ Here is a full example of exactly what is sent in one of these telemetry events,
|
||||||
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
|
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
|
||||||
"ClusterType": "SFRP",
|
"ClusterType": "SFRP",
|
||||||
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
|
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
|
||||||
"FHVersion": "1.1.17",
|
"FHVersion": "1.1.18",
|
||||||
"UpTime": "00:00:00.2164523",
|
"UpTime": "00:00:00.2164523",
|
||||||
"Timestamp": "2023-02-07T21:45:25.2443014Z",
|
"Timestamp": "2023-02-07T21:45:25.2443014Z",
|
||||||
"OS": "Windows",
|
"OS": "Windows",
|
||||||
|
|
|
@ -52,6 +52,9 @@
|
||||||
<None Update="PackageRoot\Config\LogicRules\MachineRules.guan">
|
<None Update="PackageRoot\Config\LogicRules\MachineRules.guan">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</None>
|
</None>
|
||||||
|
<None Update="TestApp42.zip">
|
||||||
|
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
|
||||||
|
</None>
|
||||||
<None Update="testrules_wellformed.guan">
|
<None Update="testrules_wellformed.guan">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</None>
|
</None>
|
||||||
|
|
|
@ -47,7 +47,7 @@ namespace FHTest
|
||||||
private const string FHProxyId = "FabricHealerProxy";
|
private const string FHProxyId = "FabricHealerProxy";
|
||||||
|
|
||||||
[ClassInitialize]
|
[ClassInitialize]
|
||||||
public static void TestClassStartUp(TestContext testContext)
|
public static async Task TestClassStartUp(TestContext testContext)
|
||||||
{
|
{
|
||||||
if (!IsLocalSFRuntimePresent())
|
if (!IsLocalSFRuntimePresent())
|
||||||
{
|
{
|
||||||
|
@ -93,6 +93,8 @@ namespace FHTest
|
||||||
{
|
{
|
||||||
TelemetryEnabled = false
|
TelemetryEnabled = false
|
||||||
};
|
};
|
||||||
|
|
||||||
|
await DeployTestApp42Async();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Helpers */
|
/* Helpers */
|
||||||
|
@ -168,6 +170,116 @@ namespace FHTest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Deploys the TestApp42 test application (fabric:/TestApp42) to the local dev cluster,
/// unless it is already deployed on the test node.
/// </summary>
/// <remarks>
/// The package is unzipped from TestApp42.zip in the current directory, copied to the image store,
/// provisioned, and then an application instance is created. If a stale application instance or
/// application type blocks the deployment, it is removed and the deployment is retried a bounded
/// number of times.
/// </remarks>
/// <returns>A Task.</returns>
private static Task DeployTestApp42Async()
{
    return DeployTestApp42Async(remainingRetries: 3);
}

/// <summary>
/// Deployment worker for <see cref="DeployTestApp42Async()"/>.
/// </summary>
/// <param name="remainingRetries">
/// How many more times the deployment may be retried after cleaning up a conflicting application
/// or application type. When this reaches zero, the conflict exception is rethrown instead of retrying.
/// </param>
/// <returns>A Task.</returns>
private static async Task DeployTestApp42Async(int remainingRetries)
{
    const string appName = "fabric:/TestApp42";
    const string appType = "TestApp42Type";
    const string appVersion = "1.0.0";

    // If fabric:/TestApp42 is already deployed on the test node, there is nothing to do.
    var deployedTestApp =
        await fabricClient.QueryManager.GetDeployedApplicationListAsync(
                NodeName,
                new Uri(appName),
                TimeSpan.FromSeconds(30),
                token);

    if (deployedTestApp?.Count > 0)
    {
        return;
    }

    // Change this to suit your configuration (so, if you are on Windows and you installed SF on a different drive, for example).
    string imageStoreConnectionString = @"file:C:\SfDevCluster\Data\ImageStoreShare";
    string packagePathInImageStore = "TestApp42";
    string packagePathZip = Path.Combine(Environment.CurrentDirectory, "TestApp42.zip");
    string packagePath = Path.Combine(Environment.CurrentDirectory, "TestApp42", "Release");

    try
    {
        // Unzip the compressed TestApp42 app package (overwriting any previous extraction).
        System.IO.Compression.ZipFile.ExtractToDirectory(packagePathZip, "TestApp42", true);

        // Copy the TestApp42 app package to a location in the image store.
        fabricClient.ApplicationManager.CopyApplicationPackage(imageStoreConnectionString, packagePath, packagePathInImageStore);

        // Provision the TestApp42 application type.
        await fabricClient.ApplicationManager.ProvisionApplicationAsync(packagePathInImageStore);

        // Create the TestApp42 app instance.
        ApplicationDescription appDesc = new(new Uri(appName), appType, appVersion);
        await fabricClient.ApplicationManager.CreateApplicationAsync(appDesc);

        // This is a hack. Without this delay, the deployed test services may not have populated the FC cache?
        // You may need to increase this value depending upon your dev machine? You'll find out..
        await Task.Delay(TimeSpan.FromSeconds(15));
    }
    catch (FabricException fe)
    {
        bool appAlreadyExists = fe.ErrorCode == FabricErrorCode.ApplicationAlreadyExists;
        bool appTypeAlreadyExists = fe.ErrorCode == FabricErrorCode.ApplicationTypeAlreadyExists;

        if (!appAlreadyExists && !appTypeAlreadyExists)
        {
            // Unchanged from the original: any other FabricException is not surfaced from here.
            return;
        }

        if (remainingRetries <= 0)
        {
            // The conflict survived every cleanup attempt; fail instead of recursing without bound.
            throw;
        }

        if (appAlreadyExists)
        {
            // A stale application instance is in the way: force-delete it before redeploying.
            await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(new Uri(appName)) { ForceDelete = true });
        }
        else
        {
            // The application type is already provisioned: remove any instance of it, then unprovision the type.
            var appList = await fabricClient.QueryManager.GetApplicationListAsync(new Uri(appName));

            if (appList.Count > 0)
            {
                await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(new Uri(appName)) { ForceDelete = true });
            }

            await fabricClient.ApplicationManager.UnprovisionApplicationAsync(appType, appVersion);
        }

        await DeployTestApp42Async(remainingRetries - 1);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Checks whether the specified application currently has at least one service.
/// </summary>
/// <param name="appName">Application name URI string, e.g. "fabric:/TestApp42".</param>
/// <returns>
/// true if the service list query returns one or more services; false if the list is null or empty,
/// or if the query throws FabricElementNotFoundException.
/// </returns>
private static async Task<bool> EnsureTestServicesExistAsync(string appName)
{
    try
    {
        var serviceList = await fabricClient.QueryManager.GetServiceListAsync(new Uri(appName));

        if (serviceList == null)
        {
            return false;
        }

        return serviceList.Count > 0;
    }
    catch (FabricElementNotFoundException)
    {
        // The queried element does not exist, so there are no services to report.
        return false;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Removes the TestApp42 test application from the cluster, if its services are present.
/// </summary>
/// <remarks>
/// In order: removes the application package from the image store, deletes the ChildProcessCreator service,
/// deletes the application instance, and unprovisions the application type.
/// </remarks>
/// <returns>A Task.</returns>
private static async Task RemoveTestApplicationsAsync()
{
    // Nothing to remove when fabric:/TestApp42 has no services.
    if (!await EnsureTestServicesExistAsync("fabric:/TestApp42"))
    {
        return;
    }

    string imageStoreConnectionString = @"file:C:\SfDevCluster\Data\ImageStoreShare";
    string packagePathInImageStore = "TestApp42";
    var applicationUri = new Uri("fabric:/TestApp42");
    var childProcessCreatorUri = new Uri("fabric:/TestApp42/ChildProcessCreator");

    // Remove the application package from the image store.
    fabricClient.ApplicationManager.RemoveApplicationPackage(imageStoreConnectionString, packagePathInImageStore);

    // Delete the test service.
    await fabricClient.ServiceManager.DeleteServiceAsync(new DeleteServiceDescription(childProcessCreatorUri));

    // Delete the application instance.
    await fabricClient.ApplicationManager.DeleteApplicationAsync(new DeleteApplicationDescription(applicationUri));

    // Un-provision the application type.
    await fabricClient.ApplicationManager.UnprovisionApplicationAsync("TestApp42Type", "1.0.0");
}
|
||||||
|
|
||||||
[ClassCleanup]
|
[ClassCleanup]
|
||||||
public static async Task TestClassCleanupAsync()
|
public static async Task TestClassCleanupAsync()
|
||||||
{
|
{
|
||||||
|
@ -176,6 +288,7 @@ namespace FHTest
|
||||||
|
|
||||||
// Ensure FHProxy cleans up its health reports.
|
// Ensure FHProxy cleans up its health reports.
|
||||||
FabricHealerProxy.Instance.Close();
|
FabricHealerProxy.Instance.Close();
|
||||||
|
await RemoveTestApplicationsAsync();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* GuanLogic Tests
|
/* GuanLogic Tests
|
||||||
|
@ -336,6 +449,7 @@ namespace FHTest
|
||||||
};
|
};
|
||||||
|
|
||||||
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "ReplicaRules.guan");
|
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "ReplicaRules.guan");
|
||||||
|
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "ReplicaRules.guan";
|
||||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
||||||
|
|
||||||
try
|
try
|
||||||
|
@ -379,6 +493,7 @@ namespace FHTest
|
||||||
|
|
||||||
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "SystemServiceRules.guan");
|
var file = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "SystemServiceRules.guan");
|
||||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
List<string> repairRules = FabricHealerManager.ParseRulesFile(await File.ReadAllLinesAsync(file, token));
|
||||||
|
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "SystemServiceRules.guan";
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
@ -490,6 +605,7 @@ namespace FHTest
|
||||||
{
|
{
|
||||||
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "MachineRules.guan");
|
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "LogicRules", "MachineRules.guan");
|
||||||
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
|
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
|
||||||
|
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "MachineRules.guan";
|
||||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
|
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
|
||||||
int escalationCount = 4; // reboot, reimage, heal, triage.
|
int escalationCount = 4; // reboot, reimage, heal, triage.
|
||||||
RepairTaskList repairTasks = null;
|
RepairTaskList repairTasks = null;
|
||||||
|
@ -771,7 +887,7 @@ namespace FHTest
|
||||||
{
|
{
|
||||||
// The service here must be one that is running in your test cluster.
|
// The service here must be one that is running in your test cluster.
|
||||||
// TODO: install a local test app as part of tests.
|
// TODO: install a local test app as part of tests.
|
||||||
ServiceName = "fabric:/BadApp/BadService",
|
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
|
||||||
NodeName = NodeName,
|
NodeName = NodeName,
|
||||||
// Specifying Source is Required for unit tests.
|
// Specifying Source is Required for unit tests.
|
||||||
// For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here.
|
// For unit tests, there is no FabricRuntime static, so FHProxy, which utilizes this type, will fail unless Source is provided here.
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
<Parameter Name="EnableAutoMitigation" Value="true" />
|
<Parameter Name="EnableAutoMitigation" Value="true" />
|
||||||
<Parameter Name="EnableOperationalTelemetry" Value="false" />
|
<Parameter Name="EnableOperationalTelemetry" Value="false" />
|
||||||
<Parameter Name="EnableRollingServiceRestarts" Value="true" />
|
<Parameter Name="EnableRollingServiceRestarts" Value="true" />
|
||||||
|
<Parameter Name="EnableLogicRuleTracing" Value="true" />
|
||||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||||
<Parameter Name="LocalLogPath" Value="fabric_healer_testlogs" />
|
<Parameter Name="LocalLogPath" Value="fabric_healer_testlogs" />
|
||||||
|
|
||||||
|
|
Двоичный файл не отображается.
|
@ -2,7 +2,7 @@
|
||||||
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
|
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
|
||||||
<metadata minClientVersion="3.3.0">
|
<metadata minClientVersion="3.3.0">
|
||||||
<id>%PACKAGE_ID%</id>
|
<id>%PACKAGE_ID%</id>
|
||||||
<version>1.1.17</version>
|
<version>1.1.18</version>
|
||||||
<releaseNotes>
|
<releaseNotes>
|
||||||
This release requires Service Fabric runtime version 9 and higher and at least Service Fabric SDK version 6.0.1017. There are several changes and improvements in this
|
This release requires Service Fabric runtime version 9 and higher and at least Service Fabric SDK version 6.0.1017. There are several changes and improvements in this
|
||||||
release including a new machine repair model, updated logic rules, bug fixes, and many code improvements.
|
release including a new machine repair model, updated logic rules, bug fixes, and many code improvements.
|
||||||
|
|
|
@ -25,8 +25,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
|
||||||
Documentation\OperationalTelemetry.md = Documentation\OperationalTelemetry.md
|
Documentation\OperationalTelemetry.md = Documentation\OperationalTelemetry.md
|
||||||
README.md = README.md
|
README.md = README.md
|
||||||
Documentation\Deployment\service-fabric-healer.json = Documentation\Deployment\service-fabric-healer.json
|
Documentation\Deployment\service-fabric-healer.json = Documentation\Deployment\service-fabric-healer.json
|
||||||
|
Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.18.parameters.json
|
||||||
Documentation\Using.md = Documentation\Using.md
|
Documentation\Using.md = Documentation\Using.md
|
||||||
Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json = Documentation\Deployment\service-fabric-healer.v1.1.17.parameters.json
|
|
||||||
EndProjectSection
|
EndProjectSection
|
||||||
EndProject
|
EndProject
|
||||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csproj", "{8D9712BF-C026-4A36-B6D1-6345137D3B6F}"
|
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csproj", "{8D9712BF-C026-4A36-B6D1-6345137D3B6F}"
|
||||||
|
|
|
@ -12,8 +12,8 @@
|
||||||
<RuntimeIdentifier>win-x64</RuntimeIdentifier>-->
|
<RuntimeIdentifier>win-x64</RuntimeIdentifier>-->
|
||||||
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>
|
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>
|
||||||
<Product>FabricHealer</Product>
|
<Product>FabricHealer</Product>
|
||||||
<Version>1.1.17</Version>
|
<Version>1.1.18</Version>
|
||||||
<FileVersion>1.1.17</FileVersion>
|
<FileVersion>1.1.18</FileVersion>
|
||||||
<StartupObject>FabricHealer.Program</StartupObject>
|
<StartupObject>FabricHealer.Program</StartupObject>
|
||||||
<Platforms>x64</Platforms>
|
<Platforms>x64</Platforms>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
|
@ -29,10 +29,15 @@ namespace FabricHealer
|
||||||
{
|
{
|
||||||
internal static TelemetryUtilities TelemetryUtilities;
|
internal static TelemetryUtilities TelemetryUtilities;
|
||||||
internal static RepairData RepairHistory;
|
internal static RepairData RepairHistory;
|
||||||
public static StatelessServiceContext ServiceContext;
|
|
||||||
|
internal static bool EnableRuleTracing { get; set; } = true;
|
||||||
|
|
||||||
|
public static StatelessServiceContext ServiceContext { get; private set; }
|
||||||
|
|
||||||
|
public static string CurrentlyExecutingLogicRulesFileName { get; set; }
|
||||||
|
|
||||||
// Folks often use their own version numbers. This is for internal diagnostic telemetry.
|
// Folks often use their own version numbers. This is for internal diagnostic telemetry.
|
||||||
private const string InternalVersionNumber = "1.1.17";
|
private const string InternalVersionNumber = "1.1.18";
|
||||||
private static FabricHealerManager singleton;
|
private static FabricHealerManager singleton;
|
||||||
private static FabricClient _fabricClient;
|
private static FabricClient _fabricClient;
|
||||||
private bool disposedValue;
|
private bool disposedValue;
|
||||||
|
@ -305,7 +310,7 @@ namespace FabricHealer
|
||||||
|
|
||||||
// First, let's clean up any orphaned non-node level FabricHealer repair tasks left pending. This will also resume Fabric Node repairs that
|
// First, let's clean up any orphaned non-node level FabricHealer repair tasks left pending. This will also resume Fabric Node repairs that
|
||||||
// FH owns and was executing at the time FH exited. Only FH-owned repairs will be canceled, not repairs conducted by other executors.
|
// FH owns and was executing at the time FH exited. Only FH-owned repairs will be canceled, not repairs conducted by other executors.
|
||||||
await CancelOrResumeAllRunningFHRepairsAsync();
|
await CancelAbandonedFHRepairsAsync();
|
||||||
|
|
||||||
// Run until RunAsync token is cancelled.
|
// Run until RunAsync token is cancelled.
|
||||||
while (!Token.IsCancellationRequested)
|
while (!Token.IsCancellationRequested)
|
||||||
|
@ -467,17 +472,26 @@ namespace FabricHealer
|
||||||
{
|
{
|
||||||
// FH looks for and resumes FabricNode restart repair jobs when it starts up (so, it will pick up where it left off in the safe restart sequence
|
// FH looks for and resumes FabricNode restart repair jobs when it starts up (so, it will pick up where it left off in the safe restart sequence
|
||||||
// when the Fabric node hosting FH is the one FH restarted).
|
// when the Fabric node hosting FH is the one FH restarted).
|
||||||
if (JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true)
|
if (!JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData, true))
|
||||||
&& exData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
|
|
||||||
{
|
{
|
||||||
if (isClosing)
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This would mean that the job has node-level Impact and its state is at least Approved.
|
||||||
|
if (repair.Impact is NodeRepairImpactDescription impact)
|
||||||
|
{
|
||||||
|
if (impact.ImpactedNodes.Any(
|
||||||
|
n => n.NodeName == exData.RepairPolicy.NodeName
|
||||||
|
&& (n.ImpactLevel == NodeImpactLevel.Restart ||
|
||||||
|
n.ImpactLevel == NodeImpactLevel.RemoveData ||
|
||||||
|
n.ImpactLevel == NodeImpactLevel.RemoveNode)))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Was max execution time configured by user?
|
// Was max execution time configured by user?
|
||||||
if (exData != null && exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
if (exData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
||||||
{
|
{
|
||||||
maxFHExecutorTime = exData.RepairPolicy.MaxExecutionTime;
|
maxFHExecutorTime = exData.RepairPolicy.MaxExecutionTime;
|
||||||
}
|
}
|
||||||
|
@ -492,7 +506,7 @@ namespace FabricHealer
|
||||||
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TimeoutException)
|
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TimeoutException)
|
||||||
{
|
{
|
||||||
#if DEBUG
|
#if DEBUG
|
||||||
RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobsAsync Failure:{Environment.NewLine}{e}");
|
RepairLogger.LogWarning($"TryCleanUpOrphanedFabricHealerRepairJobs Failure:{Environment.NewLine}{e}");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -531,15 +545,13 @@ namespace FabricHealer
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state).
|
/// Cancels all FabricHealer repair tasks currently in flight.
|
||||||
/// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing.
|
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <returns>A Task.</returns>
|
/// <returns>A Task.</returns>
|
||||||
private static async Task CancelOrResumeAllRunningFHRepairsAsync()
|
private static async Task CancelAbandonedFHRepairsAsync()
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
|
||||||
var currentFHRepairTasksInProgress =
|
var currentFHRepairTasksInProgress =
|
||||||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||||
() => RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
|
() => RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
|
||||||
|
@ -582,49 +594,11 @@ namespace FabricHealer
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try and cancel existing repair. We may need to create a new one for abandoned repairs where FH goes down for some reason.
|
|
||||||
// Note: CancelRepairTaskAsync handles exceptions (IOE) that may be thrown by RM due to state change policy.
|
|
||||||
// The repair state could change to Completed after this call is made, for example, and before RM API call.
|
|
||||||
if (repair.State != RepairTaskState.Completed)
|
if (repair.State != RepairTaskState.Completed)
|
||||||
{
|
{
|
||||||
await FabricRepairTasks.CancelRepairTaskAsync(repair);
|
await FabricRepairTasks.CancelRepairTaskAsync(repair);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Resume interrupted Fabric Node restart repairs */
|
|
||||||
|
|
||||||
// There is no need to resume simple repairs that do not require multiple repair steps (e.g., codepackage/process/replica restarts).
|
|
||||||
if (repairExecutorData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
string errorCode = repairExecutorData.RepairPolicy.Code;
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(errorCode))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// File Deletion repair is a node-level (VM) repair, but is not multi-step. Ignore.
|
|
||||||
if (repairExecutorData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fabric System service warnings/errors from FO can be Node level repair targets (e.g., Fabric binary needs to be restarted).
|
|
||||||
// FH will restart the node hosting the troubled SF system service if specified in related logic rules.
|
|
||||||
var repairRules =
|
|
||||||
GetRepairRulesFromConfiguration(
|
|
||||||
!string.IsNullOrWhiteSpace(
|
|
||||||
repairExecutorData.RepairPolicy.ProcessName) ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.FabricNodeRepairPolicySectionName);
|
|
||||||
|
|
||||||
var repairData = new TelemetryData
|
|
||||||
{
|
|
||||||
NodeName = repairExecutorData.RepairPolicy.NodeName,
|
|
||||||
Code = errorCode,
|
|
||||||
};
|
|
||||||
|
|
||||||
await RunGuanQueryAsync(repairData, repairRules, Token, repairExecutorData);
|
|
||||||
RepairLogger.LogInfo("Exiting CancelOrResumeAllRunningFHRepairsAsync: Completed.");
|
RepairLogger.LogInfo("Exiting CancelOrResumeAllRunningFHRepairsAsync: Completed.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -632,13 +606,13 @@ namespace FabricHealer
|
||||||
{
|
{
|
||||||
if (e is FabricException)
|
if (e is FabricException)
|
||||||
{
|
{
|
||||||
RepairLogger.LogWarning($"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}");
|
RepairLogger.LogWarning($"Could not cancel FH repair tasks. Failed with:{Environment.NewLine}{e}");
|
||||||
}
|
}
|
||||||
|
|
||||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||||
LogLevel.Info,
|
LogLevel.Info,
|
||||||
"CancelOrResumeAllRunningFHRepairsAsync",
|
"CancelOrResumeAllRunningFHRepairsAsync",
|
||||||
$"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}",
|
$"Could not cancel abandoned FH repair tasks. Failed with:{Environment.NewLine}{e}",
|
||||||
Token,
|
Token,
|
||||||
null,
|
null,
|
||||||
ConfigSettings.EnableVerboseLogging);
|
ConfigSettings.EnableVerboseLogging);
|
||||||
|
@ -1561,7 +1535,7 @@ namespace FabricHealer
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Disk?
|
// Disk repair?
|
||||||
if (repairData.EntityType == EntityType.Disk)
|
if (repairData.EntityType == EntityType.Disk)
|
||||||
{
|
{
|
||||||
if (!ConfigSettings.EnableDiskRepair)
|
if (!ConfigSettings.EnableDiskRepair)
|
||||||
|
@ -1573,7 +1547,7 @@ namespace FabricHealer
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fabric node?
|
// Fabric Node repair?
|
||||||
if (repairData.EntityType == EntityType.Node)
|
if (repairData.EntityType == EntityType.Node)
|
||||||
{
|
{
|
||||||
if (!ConfigSettings.EnableFabricNodeRepair)
|
if (!ConfigSettings.EnableFabricNodeRepair)
|
||||||
|
@ -1581,17 +1555,13 @@ namespace FabricHealer
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// FabricHealerProxy-generated report, so a restart fabric node request, for example.
|
// FabricObserver/FabricHealerProxy-generated health report.
|
||||||
await ProcessFabricNodeHealthAsync(evt, repairData);
|
await ProcessFabricNodeHealthAsync(evt, repairData);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Machine repair \\
|
// Machine-level repair \\
|
||||||
if (!ConfigSettings.EnableMachineRepair)
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure that there is not already an Infra repair in progress for the target node.
|
// Make sure that there is not already an Infra repair in progress for the target node.
|
||||||
if (await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token))
|
if (await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, Token))
|
||||||
|
@ -1617,7 +1587,7 @@ namespace FabricHealer
|
||||||
|
|
||||||
/* Start repair workflow */
|
/* Start repair workflow */
|
||||||
|
|
||||||
string repairId = $"MachineRepair_{nodeType}_{repairData.NodeName}";
|
string repairId = $"MachineRepair_{repairData.NodeName}";
|
||||||
repairData.RepairPolicy = new RepairPolicy
|
repairData.RepairPolicy = new RepairPolicy
|
||||||
{
|
{
|
||||||
RepairId = repairId,
|
RepairId = repairId,
|
||||||
|
@ -1752,8 +1722,8 @@ namespace FabricHealer
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// There is only one supported repair for a FabricNode: Restart.
|
string action = repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode ? "Deactivate" : "Restart";
|
||||||
string repairId = $"{repairData.NodeName}_{repairData.NodeType}_Restart";
|
string repairId = $"{repairData.NodeName}_{repairData.NodeType}_{action}";
|
||||||
|
|
||||||
var currentRepairs =
|
var currentRepairs =
|
||||||
await RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FHTaskIdPrefix, Token);
|
await RepairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairConstants.FHTaskIdPrefix, Token);
|
||||||
|
@ -2173,6 +2143,7 @@ namespace FabricHealer
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CurrentlyExecutingLogicRulesFileName = logicRulesConfigFileName;
|
||||||
List<string> repairRules = ParseRulesFile(rules);
|
List<string> repairRules = ParseRulesFile(rules);
|
||||||
return repairRules;
|
return repairRules;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,6 @@
|
||||||
## Logic rules for Service Fabric Node repairs.
|
## Logic rule examples for Service Fabric Node repairs.
|
||||||
|
## These repairs are not executed by FabricHealer. FH creates repair tasks with the correct node impact specified and RM takes it from there.
|
||||||
|
|
||||||
## First check if we are inside the run interval. If so, cut (!). This means that no other rules will be processed (no back-tracking).
|
## Restart/Deactivate. Try Restart twice in 8 hour window. Else, deactivate (Pause) the Fabric node.
|
||||||
## This is commented out by default. Just uncomment and set the global run interval for app Fabric node level repairs to suit your needs.
|
Mitigate(HealthState=Error) :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 2, !, RestartFabricNode().
|
||||||
|
Mitigate(HealthState=Error) :- DeactivateFabricNode().
|
||||||
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
|
|
||||||
|
|
||||||
## This rule means that whatever the Fabric node-level warning data from the issuing service happens to be, restart the target Fabric node if
|
|
||||||
## the repair hasn't run 4 times in the last 8 hours.
|
|
||||||
|
|
||||||
Mitigate() :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, RestartFabricNode(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:45:00, MaxExecutionTime=02:00:00).
|
|
|
@ -1,31 +1,31 @@
|
||||||
## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is always Machine.
|
## Logic rules for scheduling Machine-level repair jobs in the cluster. EntityType fact is always Machine.
|
||||||
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for these types of Repair Jobs.
|
## FH does not conduct (execute) these repairs. It simply schedules them. InfrastructureService is always the Executor for machine-level Repair Jobs.
|
||||||
|
|
||||||
## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself.
|
## Applicable Named Arguments for Mitigate. Facts are supplied by FabricObserver, FHProxy or FH itself.
|
||||||
## Any argument below with (FO/FHProxy) means that only FO or FHProxy will present the fact.
|
## Any argument below with (FO/FHProxy) means that only FO or FHProxy will present the fact.
|
||||||
## | Argument Name | Definition |
|
## | Argument Name | Definition |
|
||||||
## |---------------------------|------------------------------------------------------------------------|
|
## |---------------------------|------------------------------------------------------------------------|
|
||||||
## | NodeName | Name of the node |
|
## | NodeName | Name of the node |
|
||||||
## | NodeType | Type of node |
|
## | NodeType | Type of node |
|
||||||
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
|
## | ErrorCode (FO/FHProxy) | Supported Error Code emitted by caller (e.g. "FO002") |
|
||||||
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
|
## | MetricName (FO/FHProxy) | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
|
||||||
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
|
## | MetricValue (FO/FHProxy) | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
|
||||||
## | OS | The name of the OS where FabricHealer is running (Linux or Windows) |
|
## | OS | The name of the OS where FabricHealer is running (Linux or Windows) |
|
||||||
## | HealthState | The HealthState of the target entity: Error or Warning |
|
## | HealthState | The HealthState of the target entity: Error or Warning |
|
||||||
## | Source | The Source ID of the related SF Health Event |
|
## | Source | The Source ID of the related SF Health Event |
|
||||||
## | Property | The Property of the related SF Health Event |
|
## | Property | The Property of the related SF Health Event |
|
||||||
|
|
||||||
## Metric Names, from FO or FHProxy.
|
## Metric Names, from FO or FHProxy.
|
||||||
## | Name |
|
## | Name |
|
||||||
## |--------------------------------|
|
## |--------------------------------|
|
||||||
## | ActiveTcpPorts |
|
## | ActiveTcpPorts |
|
||||||
## | CpuPercent |
|
## | CpuPercent |
|
||||||
## | EphemeralPorts |
|
## | EphemeralPorts |
|
||||||
## | EphemeralPortsPercent |
|
## | EphemeralPortsPercent |
|
||||||
## | MemoryMB |
|
## | MemoryMB |
|
||||||
## | MemoryPercent |
|
## | MemoryPercent |
|
||||||
## | Handles (Linux-only) |
|
## | Handles (Linux-only) |
|
||||||
## | HandlesPercent (Linux-only) |
|
## | HandlesPercent (Linux-only) |
|
||||||
|
|
||||||
|
|
||||||
## The logic program below is a repair specification (policy) that does not require facts from FabricObserver (FO) or FHProxy.
|
## The logic program below is a repair specification (policy) that does not require facts from FabricObserver (FO) or FHProxy.
|
||||||
|
@ -34,8 +34,8 @@
|
||||||
## Don't proceed if the target entity is not in Error.
|
## Don't proceed if the target entity is not in Error.
|
||||||
Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
|
Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
|
||||||
|
|
||||||
## Don't proceed if FabricObserver's NodeObserver is not the source of the Error event.
|
## Don't proceed unless the specified watchdog created the Error health event.
|
||||||
##Mitigate(Source=?source) :- not(match(?source, "NodeObserver")), !.
|
Mitigate(Source=?source) :- not(match(?source, "EventLogWatchdog")), !.
|
||||||
|
|
||||||
## Don't proceed if there are already 2 or more machine repairs currently active in the cluster.
|
## Don't proceed if there are already 2 or more machine repairs currently active in the cluster.
|
||||||
Mitigate() :- CheckOutstandingRepairs(2), !.
|
Mitigate() :- CheckOutstandingRepairs(2), !.
|
||||||
|
@ -47,9 +47,15 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
|
||||||
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
|
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
|
||||||
|
|
||||||
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours.
|
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least the minimum duration specified below (currently one minute).
|
||||||
Mitigate() :- CheckInsideHealthStateMinDuration(02:00:00), !.
|
Mitigate() :- CheckInsideHealthStateMinDuration(00:01:00), !.
|
||||||
|
|
||||||
## Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH).
|
## For certain environments, the correct mitigation is to deactivate the target node. The below rule schedules a node deactivation (intent is Pause) repair.
|
||||||
|
|
||||||
|
Mitigate(Source=?source, Property=?property) :- match(?source, "EventLogWatchdog"),
|
||||||
|
match(?property, "CriticalMachineFailure"), !,
|
||||||
|
DeactivateFabricNode(ImpactLevel=RemoveData).
|
||||||
|
|
||||||
|
## Infra Mitigations (RM repair scheduling logic - InfrastructureService for the target node type will be the repair Executor, not FH).
|
||||||
## The logic below demonstrates how to specify a repair escalation path: Reboot -> Reimage -> Heal -> Triage (human intervention required).
|
## The logic below demonstrates how to specify a repair escalation path: Reboot -> Reimage -> Heal -> Triage (human intervention required).
|
||||||
## ScheduleMachineRepair predicate takes any repair action string. There are a handful that are supported by RepairManager/InfrastructureService, like below.
|
## ScheduleMachineRepair predicate takes any repair action string. There are a handful that are supported by RepairManager/InfrastructureService, like below.
|
||||||
|
|
||||||
|
@ -69,4 +75,4 @@ Mitigate() :- GetRepairHistory(?repairCount, 08:00:00, System.Azure.Heal), ?repa
|
||||||
## from scheduling any other machine repairs for the target node until canceled. It also counts against the number of concurrent Active repairs you specified
|
## from scheduling any other machine repairs for the target node until canceled. It also counts against the number of concurrent Active repairs you specified
|
||||||
## above in the CheckOutstandingRepairs predicate.
|
## above in the CheckOutstandingRepairs predicate.
|
||||||
Mitigate(NodeName=?nodeName) :- LogInfo("0042_{0}: Specified Machine repair escalations have been exhausted for node {0}. Human intervention is required.", ?nodeName),
|
Mitigate(NodeName=?nodeName) :- LogInfo("0042_{0}: Specified Machine repair escalations have been exhausted for node {0}. Human intervention is required.", ?nodeName),
|
||||||
ScheduleMachineRepair(ManualTriageNeeded).
|
ScheduleMachineRepair(ManualTriageNeeded).
|
|
@ -13,6 +13,8 @@
|
||||||
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
|
<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
|
||||||
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
<!-- Folder name for local log output. You can use a full path or just a folder name. -->
|
||||||
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
|
<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
|
||||||
|
<!-- This will enable FabricHealer to try and trace executed logic rules that employ repair action predicates. -->
|
||||||
|
<Parameter Name="EnableLogicRuleTracing" Value="" MustOverride="true" />
|
||||||
|
|
||||||
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
|
<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<ServiceManifest Name="FabricHealerPkg"
|
<ServiceManifest Name="FabricHealerPkg"
|
||||||
Version="1.1.17"
|
Version="1.1.18"
|
||||||
xmlns="http://schemas.microsoft.com/2011/01/fabric"
|
xmlns="http://schemas.microsoft.com/2011/01/fabric"
|
||||||
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
@ -11,7 +11,7 @@
|
||||||
</ServiceTypes>
|
</ServiceTypes>
|
||||||
|
|
||||||
<!-- Code package is your service executable. -->
|
<!-- Code package is your service executable. -->
|
||||||
<CodePackage Name="Code" Version="1.1.17">
|
<CodePackage Name="Code" Version="1.1.18">
|
||||||
<EntryPoint>
|
<EntryPoint>
|
||||||
<ExeHost>
|
<ExeHost>
|
||||||
<Program>FabricHealer</Program>
|
<Program>FabricHealer</Program>
|
||||||
|
@ -21,5 +21,5 @@
|
||||||
|
|
||||||
<!-- Config package is the contents of the Config directory under PackageRoot that contains an
|
<!-- Config package is the contents of the Config directory under PackageRoot that contains an
|
||||||
independently-updateable and versioned set of custom configuration settings for your service. -->
|
independently-updateable and versioned set of custom configuration settings for your service. -->
|
||||||
<ConfigPackage Name="Config" Version="1.1.17" />
|
<ConfigPackage Name="Config" Version="1.1.18" />
|
||||||
</ServiceManifest>
|
</ServiceManifest>
|
|
@ -167,8 +167,9 @@ namespace FabricHealer.Repair
|
||||||
|
|
||||||
repairTask = await RepairTaskEngine.CreateInfrastructureRepairTaskAsync(repairData, token);
|
repairTask = await RepairTaskEngine.CreateInfrastructureRepairTaskAsync(repairData, token);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// FH
|
// FH
|
||||||
|
case RepairActionType.DeactivateNode:
|
||||||
case RepairActionType.DeleteFiles:
|
case RepairActionType.DeleteFiles:
|
||||||
case RepairActionType.RestartCodePackage:
|
case RepairActionType.RestartCodePackage:
|
||||||
case RepairActionType.RestartFabricNode:
|
case RepairActionType.RestartFabricNode:
|
||||||
|
@ -184,20 +185,16 @@ namespace FabricHealer.Repair
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool success = await CreateRepairTaskAsync(
|
bool success = await CreateClusterRepairTaskAsync(repairTask, repairData, token);
|
||||||
repairTask,
|
|
||||||
repairData,
|
|
||||||
token);
|
|
||||||
|
|
||||||
return success ? repairTask : null;
|
return success ? repairTask : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static async Task<bool> CreateRepairTaskAsync(
|
private static async Task<bool> CreateClusterRepairTaskAsync(
|
||||||
RepairTask repairTask,
|
RepairTask repairTask,
|
||||||
TelemetryData repairData,
|
TelemetryData repairData,
|
||||||
CancellationToken token)
|
CancellationToken token)
|
||||||
{
|
{
|
||||||
if (repairTask == null)
|
if (repairTask == null || repairData?.RepairPolicy == null)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -214,29 +211,45 @@ namespace FabricHealer.Repair
|
||||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||||
token);
|
token);
|
||||||
|
|
||||||
|
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||||
|
LogLevel.Info,
|
||||||
|
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}",
|
||||||
|
$"Successfully created repair task {repairTask.TaskId}.",
|
||||||
|
token,
|
||||||
|
null,
|
||||||
|
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||||
|
LogLevel.Info,
|
||||||
|
$"CreateClusterRepairTaskAsync::{repairData.RepairPolicy.RepairId}_AlreadyExists",
|
||||||
|
$"A repair already exists with internal repair Id {repairData.RepairPolicy.RepairId}. Will not schedule another repair.",
|
||||||
|
token,
|
||||||
|
null,
|
||||||
|
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (ArgumentException ae)
|
catch (ArgumentException ae)
|
||||||
{
|
{
|
||||||
string message = $"Unable to create repairtask:{Environment.NewLine}{ae}";
|
|
||||||
|
|
||||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||||
LogLevel.Warning,
|
LogLevel.Info,
|
||||||
"FabricRepairTasks::TryCreateRepairTaskAsync",
|
"CreateClusterRepairTaskAsync",
|
||||||
message,
|
$"Unable to create repairtask:{Environment.NewLine}{ae}",
|
||||||
token,
|
token,
|
||||||
repairData,
|
repairData,
|
||||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||||
}
|
}
|
||||||
catch (FabricException fe)
|
catch (FabricException fe)
|
||||||
{
|
{
|
||||||
string message = $"Unable to create repairtask:{Environment.NewLine}{fe}";
|
|
||||||
|
|
||||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||||
LogLevel.Warning,
|
LogLevel.Info,
|
||||||
"FabricRepairTasks::TryCreateRepairTaskAsync",
|
$"CreateClusterRepairTaskAsync::Failure({repairData.RepairPolicy.RepairId})",
|
||||||
message,
|
$"Unable to create repair task:{Environment.NewLine}{fe}",
|
||||||
token,
|
token,
|
||||||
repairData,
|
repairData,
|
||||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
||||||
|
@ -264,13 +277,13 @@ namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
var allSystemServices =
|
var allSystemServices =
|
||||||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||||
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync(
|
() => FabricHealerManager.FabricClientSingleton.QueryManager.GetServiceListAsync(
|
||||||
new Uri(RepairConstants.SystemAppName),
|
new Uri(RepairConstants.SystemAppName),
|
||||||
null,
|
null,
|
||||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||||
cancellationToken),
|
cancellationToken),
|
||||||
|
|
||||||
cancellationToken);
|
cancellationToken);
|
||||||
|
|
||||||
var infraInstances =
|
var infraInstances =
|
||||||
allSystemServices.Where(i => i.ServiceTypeName.Equals(RepairConstants.InfrastructureServiceType, StringComparison.InvariantCultureIgnoreCase));
|
allSystemServices.Where(i => i.ServiceTypeName.Equals(RepairConstants.InfrastructureServiceType, StringComparison.InvariantCultureIgnoreCase));
|
||||||
|
|
|
@ -0,0 +1,139 @@
|
||||||
|
// ------------------------------------------------------------
|
||||||
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||||
|
// ------------------------------------------------------------
|
||||||
|
|
||||||
|
using FabricHealer.Utilities.Telemetry;
|
||||||
|
using FabricHealer.Utilities;
|
||||||
|
using Guan.Logic;
|
||||||
|
using System;
|
||||||
|
using System.Fabric.Repair;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace FabricHealer.Repair.Guan
|
||||||
|
{
|
||||||
|
/// <summary>
/// Guan predicate type backing the DeactivateFabricNode() logic rule predicate.
/// When resolved, it configures the shared repair data for a node deactivation and
/// schedules the corresponding repair task (RM will deactivate the node, not FH).
/// </summary>
internal class DeactivateFabricNodePredicateType : PredicateType
{
    // Shared (static) state: overwritten on every call to Singleton().
    private static TelemetryData RepairData;
    private static DeactivateFabricNodePredicateType Instance;

    private class Resolver : BooleanPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
        {

        }

        /// <summary>
        /// Applies the predicate arguments to the repair policy, then tries to schedule the deactivation repair.
        /// </summary>
        /// <returns>
        /// true if a repair task was scheduled; false if a repair for the node is already in progress
        /// or no repair task could be created.
        /// </returns>
        /// <exception cref="GuanException">An argument has a type/position/name combination that is not supported.</exception>
        protected override async Task<bool> CheckAsync()
        {
            RepairData.RepairPolicy.RepairAction = RepairActionType.DeactivateNode;
            RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
            RepairData.RepairPolicy.RepairId = $"DeactivateNode::{RepairData.NodeName}";

            if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
            {
                _ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
            }

            int count = Input.Arguments.Count;

            for (int i = 0; i < count; i++)
            {
                var typeString = Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue().GetType().Name;

                // Argument names are matched with an ordinal, case-insensitive comparison so that the result
                // does not depend on the current culture (e.g., Turkish casing rules for 'I').
                string argName = Input.Arguments[i].Name;

                // An argument is accepted either by position (only when all 4 arguments are supplied) or by name.
                switch (typeString)
                {
                    case "Boolean" when i == 0 && count == 4 || string.Equals(argName, "DoHealthChecks", StringComparison.OrdinalIgnoreCase):
                        RepairData.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
                        break;

                    case "TimeSpan" when i == 1 && count == 4 || string.Equals(argName, "MaxWaitTimeForHealthStateOk", StringComparison.OrdinalIgnoreCase):
                        RepairData.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
                        break;

                    case "TimeSpan" when i == 2 && count == 4 || string.Equals(argName, "MaxExecutionTime", StringComparison.OrdinalIgnoreCase):
                        RepairData.RepairPolicy.MaxExecutionTime = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue();
                        break;

                    case "String" when i == 3 && count == 4 || string.Equals(argName, "ImpactLevel", StringComparison.OrdinalIgnoreCase):

                        string value = Input.Arguments[i].Value.GetEffectiveTerm().GetStringValue();

                        // Any value other than the three recognized levels maps to NodeImpactLevel.None.
                        if (string.Equals(value, "RemoveData", StringComparison.OrdinalIgnoreCase))
                        {
                            RepairData.RepairPolicy.NodeImpactLevel = NodeImpactLevel.RemoveData;
                        }
                        else if (string.Equals(value, "RemoveNode", StringComparison.OrdinalIgnoreCase))
                        {
                            RepairData.RepairPolicy.NodeImpactLevel = NodeImpactLevel.RemoveNode;
                        }
                        else if (string.Equals(value, "Restart", StringComparison.OrdinalIgnoreCase))
                        {
                            RepairData.RepairPolicy.NodeImpactLevel = NodeImpactLevel.Restart;
                        }
                        else
                        {
                            RepairData.RepairPolicy.NodeImpactLevel = NodeImpactLevel.None;
                        }

                        break;

                    default:
                        throw new GuanException($"Unsupported argument type for DeactivateFabricNode: {typeString}");
                }
            }

            // Block attempts to schedule the repair if one is already in progress for the target.
            var isNodeRepairAlreadyInProgress =
                await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);

            if (isNodeRepairAlreadyInProgress)
            {
                string message =
                    $"A repair for Fabric node {RepairData.NodeName} is already in progress in the cluster.";

                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        $"DeactivateFabricNode::{RepairData.RepairPolicy.RepairId}",
                        message,
                        FabricHealerManager.Token,
                        RepairData,
                        FabricHealerManager.ConfigSettings.EnableVerboseLogging);

                return false;
            }

            // Try to schedule the Deactivate repair with RM (RM will deactivate the node, not FH).
            RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                            () => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
                                                    RepairData,
                                                    FabricHealerManager.Token),
                                            FabricHealerManager.Token);
            if (repairTask == null)
            {
                return false;
            }

            return true;
        }
    }

    /// <summary>
    /// Stores the supplied repair data in the shared static field and returns the single
    /// instance of this predicate type, creating it on first use.
    /// </summary>
    /// <param name="name">Predicate name passed to the constructor when the instance is first created.</param>
    /// <param name="repairData">Repair data that subsequent resolver checks read and update.</param>
    public static DeactivateFabricNodePredicateType Singleton(string name, TelemetryData repairData)
    {
        RepairData = repairData;
        return Instance ??= new DeactivateFabricNodePredicateType(name);
    }

    private DeactivateFabricNodePredicateType(string name)
             : base(name, true, 0)
    {

    }

    /// <summary>
    /// Creates the resolver that performs the deactivation scheduling for a given predicate invocation.
    /// </summary>
    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }
}
|
||||||
|
}
|
||||||
|
|
|
@ -34,6 +34,11 @@ namespace FabricHealer.Repair.Guan
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
RepairData.RepairPolicy.RepairAction = RepairActionType.DeleteFiles;
|
RepairData.RepairPolicy.RepairAction = RepairActionType.DeleteFiles;
|
||||||
bool recurseSubDirectories = false;
|
bool recurseSubDirectories = false;
|
||||||
string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue();
|
string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue();
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
// ------------------------------------------------------------
|
// ------------------------------------------------------------
|
||||||
|
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using Guan.Logic;
|
using Guan.Logic;
|
||||||
|
@ -27,7 +28,7 @@ namespace FabricHealer.Repair.Guan
|
||||||
QueryContext queryContext = new(moduleProvider);
|
QueryContext queryContext = new(moduleProvider);
|
||||||
queryContext.SetDirection(null, order);
|
queryContext.SetDirection(null, order);
|
||||||
Query query = Query.Create(queryExpression, queryContext);
|
Query query = Query.Create(queryExpression, queryContext);
|
||||||
await query.GetNextAsync();
|
_ = await query.GetNextAsync();
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task RunQueryAsync(List<CompoundTerm> queryExpressions, CancellationToken cancellationToken)
|
public async Task RunQueryAsync(List<CompoundTerm> queryExpressions, CancellationToken cancellationToken)
|
||||||
|
@ -43,7 +44,7 @@ namespace FabricHealer.Repair.Guan
|
||||||
QueryContext queryContext = new(moduleProvider);
|
QueryContext queryContext = new(moduleProvider);
|
||||||
queryContext.SetDirection(null, order);
|
queryContext.SetDirection(null, order);
|
||||||
Query query = Query.Create(queryExpressions, queryContext, moduleProvider);
|
Query query = Query.Create(queryExpressions, queryContext, moduleProvider);
|
||||||
await query.GetNextAsync();
|
_ = await query.GetNextAsync();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
|
||||||
protected override async Task<bool> CheckAsync()
|
protected override async Task<bool> CheckAsync()
|
||||||
{
|
{
|
||||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
|
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
int count = Input.Arguments.Count;
|
int count = Input.Arguments.Count;
|
||||||
|
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
|
|
|
@ -9,13 +9,11 @@ using Guan.Logic;
|
||||||
using FabricHealer.Utilities;
|
using FabricHealer.Utilities;
|
||||||
using FabricHealer.Utilities.Telemetry;
|
using FabricHealer.Utilities.Telemetry;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using System.Threading;
|
|
||||||
|
|
||||||
namespace FabricHealer.Repair.Guan
|
namespace FabricHealer.Repair.Guan
|
||||||
{
|
{
|
||||||
public class RestartFabricNodePredicateType : PredicateType
|
public class RestartFabricNodePredicateType : PredicateType
|
||||||
{
|
{
|
||||||
private static RepairExecutorData RepairExecutorData;
|
|
||||||
private static TelemetryData RepairData;
|
private static TelemetryData RepairData;
|
||||||
private static RestartFabricNodePredicateType Instance;
|
private static RestartFabricNodePredicateType Instance;
|
||||||
|
|
||||||
|
@ -30,6 +28,13 @@ namespace FabricHealer.Repair.Guan
|
||||||
protected override async Task<bool> CheckAsync()
|
protected override async Task<bool> CheckAsync()
|
||||||
{
|
{
|
||||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
|
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
|
||||||
|
RepairData.RepairPolicy.RepairIdPrefix = RepairConstants.FHTaskIdPrefix;
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
int count = Input.Arguments.Count;
|
int count = Input.Arguments.Count;
|
||||||
|
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
|
@ -55,59 +60,6 @@ namespace FabricHealer.Repair.Guan
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
RepairTask repairTask;
|
|
||||||
bool success;
|
|
||||||
|
|
||||||
// This means it's a resumed repair.
|
|
||||||
if (RepairExecutorData != null)
|
|
||||||
{
|
|
||||||
// Historical info, like what step the healer was in when the node went down, is contained in the
|
|
||||||
// executordata instance.
|
|
||||||
repairTask = await RepairTaskEngine.CreateFabricHealerRepairTask(RepairExecutorData, FabricHealerManager.Token);
|
|
||||||
|
|
||||||
if (repairTask == null)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// MaxExecutionTime impl.
|
|
||||||
using (CancellationTokenSource tokenSource = new())
|
|
||||||
{
|
|
||||||
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
|
|
||||||
tokenSource.Token,
|
|
||||||
FabricHealerManager.Token))
|
|
||||||
{
|
|
||||||
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
|
|
||||||
|
|
||||||
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
|
||||||
{
|
|
||||||
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenSource.CancelAfter(maxExecutionTime);
|
|
||||||
tokenSource.Token.Register(() =>
|
|
||||||
{
|
|
||||||
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
|
||||||
});
|
|
||||||
|
|
||||||
// Try to execute repair (FH executor does this work and manages repair state).
|
|
||||||
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
|
||||||
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
|
|
||||||
repairTask,
|
|
||||||
RepairData,
|
|
||||||
linkedCTS.Token),
|
|
||||||
linkedCTS.Token);
|
|
||||||
|
|
||||||
if (!success && linkedCTS.IsCancellationRequested)
|
|
||||||
{
|
|
||||||
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
|
||||||
}
|
|
||||||
|
|
||||||
return success;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Block attempts to create node-level repair tasks if one is already running in the cluster.
|
// Block attempts to create node-level repair tasks if one is already running in the cluster.
|
||||||
var isNodeRepairAlreadyInProgress =
|
var isNodeRepairAlreadyInProgress =
|
||||||
await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);
|
await RepairTaskEngine.IsRepairInProgressAsync(RepairData, FabricHealerManager.Token);
|
||||||
|
@ -128,8 +80,8 @@ namespace FabricHealer.Repair.Guan
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to schedule repair with RM.
|
// Try to schedule repair with RM for Fabric Node Restart (FH will not be the executor).
|
||||||
repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
RepairTask repairTask = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||||
() => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
|
() => RepairTaskManager.ScheduleFabricHealerRepairTaskAsync(
|
||||||
RepairData,
|
RepairData,
|
||||||
FabricHealerManager.Token),
|
FabricHealerManager.Token),
|
||||||
|
@ -139,52 +91,13 @@ namespace FabricHealer.Repair.Guan
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
using (CancellationTokenSource tokenSource = new())
|
return true;
|
||||||
{
|
|
||||||
using (var linkedCTS = CancellationTokenSource.CreateLinkedTokenSource(
|
|
||||||
tokenSource.Token,
|
|
||||||
FabricHealerManager.Token))
|
|
||||||
{
|
|
||||||
TimeSpan maxExecutionTime = TimeSpan.FromMinutes(60);
|
|
||||||
|
|
||||||
if (RepairData.RepairPolicy.MaxExecutionTime > TimeSpan.Zero)
|
|
||||||
{
|
|
||||||
maxExecutionTime = RepairData.RepairPolicy.MaxExecutionTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenSource.CancelAfter(maxExecutionTime);
|
|
||||||
tokenSource.Token.Register(() =>
|
|
||||||
{
|
|
||||||
_ = FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
|
||||||
});
|
|
||||||
|
|
||||||
// Try to execute repair (FH executor does this work and manages repair state).
|
|
||||||
success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
|
||||||
() => RepairTaskManager.ExecuteFabricHealerRepairTaskAsync(
|
|
||||||
repairTask,
|
|
||||||
RepairData,
|
|
||||||
linkedCTS.Token),
|
|
||||||
linkedCTS.Token);
|
|
||||||
|
|
||||||
if (!success && linkedCTS.IsCancellationRequested)
|
|
||||||
{
|
|
||||||
await FabricHealerManager.TryCleanUpOrphanedFabricHealerRepairJobsAsync();
|
|
||||||
}
|
|
||||||
|
|
||||||
return success;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static RestartFabricNodePredicateType Singleton(
|
public static RestartFabricNodePredicateType Singleton(string name, TelemetryData repairData)
|
||||||
string name,
|
|
||||||
RepairExecutorData repairExecutorData,
|
|
||||||
TelemetryData repairData)
|
|
||||||
{
|
{
|
||||||
RepairExecutorData = repairExecutorData;
|
|
||||||
RepairData = repairData;
|
RepairData = repairData;
|
||||||
|
|
||||||
return Instance ??= new RestartFabricNodePredicateType(name);
|
return Instance ??= new RestartFabricNodePredicateType(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,12 @@ namespace FabricHealer.Repair.Guan
|
||||||
}
|
}
|
||||||
|
|
||||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
|
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
int count = Input.Arguments.Count;
|
int count = Input.Arguments.Count;
|
||||||
|
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
|
|
|
@ -28,6 +28,12 @@ namespace FabricHealer.Repair.Guan
|
||||||
protected override async Task<bool> CheckAsync()
|
protected override async Task<bool> CheckAsync()
|
||||||
{
|
{
|
||||||
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
|
RepairData.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
int count = Input.Arguments.Count;
|
int count = Input.Arguments.Count;
|
||||||
|
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
|
|
|
@ -34,6 +34,13 @@ namespace FabricHealer.Repair.Guan
|
||||||
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
|
throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
RepairData.RepairPolicy.RepairAction = RepairActionType.Infra;
|
||||||
|
|
||||||
|
if (FabricHealerManager.ConfigSettings.EnableLogicRuleTracing)
|
||||||
|
{
|
||||||
|
_ = await RepairTaskEngine.TryTraceCurrentlyExecutingRule(Input.ToString(), RepairData);
|
||||||
|
}
|
||||||
|
|
||||||
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
|
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
|
||||||
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
|
// so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
|
||||||
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
|
// Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
|
||||||
|
|
|
@ -0,0 +1,119 @@
|
||||||
|
// ------------------------------------------------------------
|
||||||
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||||
|
// ------------------------------------------------------------
|
||||||
|
|
||||||
|
using FabricHealer.Utilities;
|
||||||
|
using FabricHealer.Utilities.Telemetry;
|
||||||
|
using Guan.Logic;
|
||||||
|
using System;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
|
||||||
|
namespace FabricHealer.Repair.Guan
|
||||||
|
{
|
||||||
|
/// <summary>
/// Guan predicate (arity 0) that, when placed in a logic rule file as ":- TraceNextRule", emits a telemetry/ETW/health
/// event containing the full text of the rule that follows it in the currently executing rule file.
/// The predicate always evaluates to false so that Guan moves on to the rule that was just traced.
/// </summary>
internal class TraceNextRulePredicateType : PredicateType
{
    private static TraceNextRulePredicateType Instance;
    private static TelemetryData RepairData;

    private class Resolver : BooleanPredicateResolver
    {
        public Resolver(CompoundTerm input, Constraint constraint, QueryContext context)
                : base(input, constraint, context)
        {

        }

        /// <summary>
        /// Reads the currently executing rule file, locates the rule that follows the TraceNextRule directive and emits it.
        /// </summary>
        /// <returns>Always false (see comment at the end of the method).</returns>
        /// <exception cref="GuanException">The rule file does not exist in the Config package's LogicRules folder.</exception>
        protected override async Task<bool> CheckAsync()
        {
            string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName;

            string ruleFilePath =
                Path.Combine(
                    FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                    "LogicRules",
                    ruleFileName);

            if (!File.Exists(ruleFilePath))
            {
                throw new GuanException($"Specified rule file path does not exist: {ruleFilePath}");
            }

            try
            {
                string[] lines = File.ReadAllLines(ruleFilePath);

                if (TryFindTracedRule(lines, out string rule, out int lineNumber))
                {
                    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                            LogLevel.Info,
                            $"{ruleFileName}#{lineNumber}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
                            $"Executing logic rule \'{rule}\'",
                            FabricHealerManager.Token);
                }
                else
                {
                    // Nothing to trace: either the directive is missing from the file or no rule follows it.
                    // Say so explicitly instead of emitting an empty rule or failing with an index error.
                    FabricHealerManager.RepairLogger.LogWarning(
                        $"TraceNextRule: no logic rule found after {RepairConstants.TraceNextRule} in {ruleFileName}.");
                }
            }
            catch (Exception e) when (e is ArgumentException || e is IOException || e is SystemException)
            {
                string message = $"TraceNextRule failure => Unable to read {ruleFileName}: {e.Message}";
                FabricHealerManager.RepairLogger.LogWarning(message);
                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        $"TraceNextRule::{ruleFileName}::Failure",
                        message,
                        FabricHealerManager.Token);
            }

            // Guarantees the next rule runs. This is critical given TraceNextRule is designed to log the full text of whatever logic rule comes after it in a rule file.
            return false;
        }

        /// <summary>
        /// Finds the first line containing ":- TraceNextRule" and assembles the rule that follows it.
        /// Blank lines and lines starting with "##" directly after the directive are skipped. A rule spread over several
        /// lines (each line ending with ',') is joined into one line, with tabs replaced by spaces.
        /// </summary>
        /// <param name="lines">All lines of the rule file.</param>
        /// <param name="rule">The assembled rule text, or an empty string when nothing was found.</param>
        /// <param name="lineNumber">
        /// Zero-based index into <paramref name="lines"/>: the rule's line for a single-line rule, or the last line
        /// ending with ',' for a multi-line rule (same value the telemetry property has always carried).
        /// </param>
        /// <returns>true when a rule was found; otherwise false.</returns>
        private static bool TryFindTracedRule(string[] lines, out string rule, out int lineNumber)
        {
            rule = string.Empty;
            lineNumber = 0;

            for (int i = 0; i < lines.Length; i++)
            {
                if (!lines[i].Contains($":- {RepairConstants.TraceNextRule}", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Only the first directive in the file is considered.
                int next = i + 1;

                // Skip blank and comment lines, never walking past the end of the file.
                while (next < lines.Length
                       && (string.IsNullOrWhiteSpace(lines[next]) || lines[next].TrimStart().StartsWith("##", StringComparison.Ordinal)))
                {
                    next++;
                }

                if (next >= lines.Length)
                {
                    // The directive is the last meaningful line of the file.
                    return false;
                }

                string line = lines[next];
                lineNumber = next;

                // custom rule formatting support: join continuation lines while the current one ends with ','.
                for (int j = next; j + 1 < lines.Length && lines[j].TrimEnd().EndsWith(','); j++)
                {
                    line += " " + lines[j + 1].Replace('\t', ' ').Trim();
                    lineNumber = j;
                }

                rule = line;
                return true;
            }

            return false;
        }
    }

    /// <summary>
    /// Returns the shared predicate instance, creating it on first use, and records the repair data used for tracing.
    /// </summary>
    /// <param name="name">Predicate name to register with Guan.</param>
    /// <param name="repairData">Repair data whose process/node name is used in the emitted event's property name.</param>
    public static TraceNextRulePredicateType Singleton(string name, TelemetryData repairData)
    {
        RepairData = repairData;
        return Instance ??= new TraceNextRulePredicateType(name);
    }

    private TraceNextRulePredicateType(string name) : base(name, true, 0)
    {

    }

    public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
    {
        return new Resolver(input, constraint, context);
    }
}
|
||||||
|
}
|
||||||
|
|
|
@ -15,6 +15,7 @@ namespace FabricHealer.Repair
|
||||||
public enum RepairActionType
|
public enum RepairActionType
|
||||||
{
|
{
|
||||||
Infra,
|
Infra,
|
||||||
|
DeactivateNode,
|
||||||
DeleteFiles,
|
DeleteFiles,
|
||||||
RemoveFabricNodeState,
|
RemoveFabricNodeState,
|
||||||
RemoveReplica,
|
RemoveReplica,
|
||||||
|
|
|
@ -34,6 +34,7 @@ namespace FabricHealer.Repair
|
||||||
public const string LocalLogPathParameter = "LocalLogPath";
|
public const string LocalLogPathParameter = "LocalLogPath";
|
||||||
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
|
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
|
||||||
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
|
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
|
||||||
|
public const string EnableLogicRuleTracing = "EnableLogicRuleTracing";
|
||||||
|
|
||||||
// General Repair Settings Parameters.
|
// General Repair Settings Parameters.
|
||||||
public const string EnableAutoMitigation = "EnableAutoMitigation";
|
public const string EnableAutoMitigation = "EnableAutoMitigation";
|
||||||
|
@ -73,6 +74,7 @@ namespace FabricHealer.Repair
|
||||||
public const string Source = "Source";
|
public const string Source = "Source";
|
||||||
|
|
||||||
// Repair Actions.
|
// Repair Actions.
|
||||||
|
public const string DeactivateFabricNode = "DeactivateFabricNode";
|
||||||
public const string DeleteFiles = "DeleteFiles";
|
public const string DeleteFiles = "DeleteFiles";
|
||||||
public const string RestartCodePackage = "RestartCodePackage";
|
public const string RestartCodePackage = "RestartCodePackage";
|
||||||
public const string RestartFabricNode = "RestartFabricNode";
|
public const string RestartFabricNode = "RestartFabricNode";
|
||||||
|
@ -100,6 +102,7 @@ namespace FabricHealer.Repair
|
||||||
public const string LogInfo = "LogInfo";
|
public const string LogInfo = "LogInfo";
|
||||||
public const string LogWarning = "LogWarning";
|
public const string LogWarning = "LogWarning";
|
||||||
public const string LogError = "LogError";
|
public const string LogError = "LogError";
|
||||||
|
public const string TraceNextRule = "TraceNextRule";
|
||||||
|
|
||||||
// Metric names.
|
// Metric names.
|
||||||
public const string ActiveTcpPorts = "ActiveTcpPorts";
|
public const string ActiveTcpPorts = "ActiveTcpPorts";
|
||||||
|
|
|
@ -13,7 +13,6 @@ using System.Fabric.Query;
|
||||||
using System.Fabric.Health;
|
using System.Fabric.Health;
|
||||||
using FabricHealer.Utilities;
|
using FabricHealer.Utilities;
|
||||||
using FabricHealer.Utilities.Telemetry;
|
using FabricHealer.Utilities.Telemetry;
|
||||||
using System.Fabric.Repair;
|
|
||||||
using System.Net;
|
using System.Net;
|
||||||
using System.Net.Sockets;
|
using System.Net.Sockets;
|
||||||
using System.IO;
|
using System.IO;
|
||||||
|
@ -22,7 +21,6 @@ using System.Collections.Generic;
|
||||||
using System.ComponentModel;
|
using System.ComponentModel;
|
||||||
using Newtonsoft.Json;
|
using Newtonsoft.Json;
|
||||||
using System.Fabric.Description;
|
using System.Fabric.Description;
|
||||||
using System.Security.Cryptography.X509Certificates;
|
|
||||||
|
|
||||||
namespace FabricHealer.Repair
|
namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
|
@ -187,310 +185,6 @@ namespace FabricHealer.Repair
|
||||||
FabricHealerManager.RepairHistory.SuccessfulRepairs++;
|
FabricHealerManager.RepairHistory.SuccessfulRepairs++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Safely restarts a Service Fabric Node instance.
/// Algorithm (each step is persisted in the repair task's executor data so a repair can resume at the step it reached):
/// 1 Deactivate target node with intent Restart.
/// 2 Wait (bounded by MaxWaitTimeMinutesForNodeOperation) for the node to get into Disabled.
/// 3 Restart node (FaultManager.RestartNodeAsync).
/// 4 Wait (same bound) for the node to get to Disabled/Ok.
/// 5 Activate node.
/// 6 Wait 15 seconds, re-issue the activation if the node is still Disabled/Ok, then wait another 15 seconds.
/// The repair is refused for clusters with fewer than 3 nodes, when the target node cannot be found, and when
/// FabricHealer runs on every node (InstanceCount -1) and the target is the node hosting this instance.
/// </summary>
/// <param name="repairData">Repair configuration</param>
/// <param name="repairTask">The scheduled Repair Task</param>
/// <param name="cancellationToken">Task cancellation token</param>
/// <returns>true if the full sequence through activation completed; otherwise false.</returns>
public static async Task<bool> SafeRestartFabricNodeAsync(
                                TelemetryData repairData,
                                RepairTask repairTask,
                                CancellationToken cancellationToken)
{
    // Queries the cluster for the target node only. Returns null when the query yields no node.
    async Task<Node> GetTargetNodeAsync()
    {
        var nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () =>
                            FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
                                repairData.NodeName,
                                FabricHealerManager.ConfigSettings.AsyncTimeout,
                                cancellationToken),
                            cancellationToken);

        return nodes != null && nodes.Count > 0 ? nodes[0] : null;
    }

    if (await FabricHealerManager.IsOneNodeClusterAsync())
    {
        string info = "One node cluster detected. Aborting node restart operation.";

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                "RepairExecutor.SafeRestartFabricNodeAsync::NodeCount_1",
                info,
                cancellationToken,
                repairData,
                FabricHealerManager.ConfigSettings.EnableVerboseLogging);

        return false;
    }

    // This paged query is only used to establish that the cluster has at least 3 nodes.
    // It must not be used to look up the target node: with MaxResults = 5 the target may not be on the page.
    var nodeQueryDesc = new NodeQueryDescription
    {
        MaxResults = 5,
    };

    NodeList nodeList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
                                        nodeQueryDesc,
                                        FabricHealerManager.ConfigSettings.AsyncTimeout,
                                        cancellationToken),
                                cancellationToken);

    if (nodeList.Count < 3)
    {
        string info = $"Unsupported repair for a {nodeList.Count}-node cluster. Aborting fabric node restart operation.";

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                "RepairExecutor.SafeRestartFabricNodeAsync::NodeCount",
                info,
                cancellationToken,
                repairData,
                FabricHealerManager.ConfigSettings.EnableVerboseLogging);

        FabricHealerManager.RepairLogger.LogInfo(info);
        return false;
    }

    ServiceDescription serviceDesc =
        await FabricHealerManager.FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(
                FabricHealerManager.ServiceContext.ServiceName,
                FabricHealerManager.ConfigSettings.AsyncTimeout,
                cancellationToken);

    // InstanceCount -1 means FH runs on every node. In that case FH must not restart the node it is running on.
    if (serviceDesc is StatelessServiceDescription statelessDesc
        && statelessDesc.InstanceCount == -1
        && repairData.NodeName == FabricHealerManager.ServiceContext.NodeContext.NodeName)
    {
        return false;
    }

    Node nodeToRestart = await GetTargetNodeAsync();

    if (nodeToRestart == null)
    {
        string info = $"Fabric node {repairData.NodeName} does not exist.";

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                "RepairExecutor.SafeRestartFabricNodeAsync::MissingNode",
                info,
                cancellationToken,
                repairData,
                FabricHealerManager.ConfigSettings.EnableVerboseLogging);

        // Nothing to restart. Continuing would fail on the instance id lookup below.
        return false;
    }

    var nodeInstanceId = nodeToRestart.NodeInstanceId;
    var stopwatch = new Stopwatch();
    var maxWaitTimeout = TimeSpan.FromMinutes(MaxWaitTimeMinutesForNodeOperation);
    string actionMessage = $"Attempting to safely restart Fabric node {repairData.NodeName} with InstanceId {nodeInstanceId}.";

    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
            LogLevel.Info,
            "RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart",
            actionMessage,
            cancellationToken,
            repairData,
            FabricHealerManager.ConfigSettings.EnableVerboseLogging);

    try
    {
        if (!JsonSerializationUtility.TryDeserializeObject(repairTask.ExecutorData, out RepairExecutorData executorData))
        {
            return false;
        }

        // The three step blocks below intentionally fall through: each one advances LatestRepairStep,
        // which satisfies the condition of the next block.
        if (executorData.LatestRepairStep == FabricNodeRepairStep.Scheduled)
        {
            executorData.LatestRepairStep = FabricNodeRepairStep.Deactivate;

            if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
            {
                repairTask.ExecutorData = exData;
            }
            else
            {
                actionMessage = "Step = Deactivate => Did not successfully serialize executordata.";

                await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
                        "RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::Deactivate",
                        actionMessage,
                        cancellationToken,
                        repairData,
                        FabricHealerManager.ConfigSettings.EnableVerboseLogging);

                return false;
            }

            await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);

            // Deactivate the node with intent to restart. Several health checks will
            // take place to ensure safe deactivation, which includes giving services a
            // chance to gracefully shut down, should they override OnAbort/OnClose.
            await FabricHealerManager.FabricClientSingleton.ClusterManager.DeactivateNodeAsync(
                    repairData.NodeName,
                    NodeDeactivationIntent.Restart,
                    FabricHealerManager.ConfigSettings.AsyncTimeout,
                    cancellationToken);

            stopwatch.Start();

            // Wait for node to get into Disabled state.
            while (stopwatch.Elapsed <= maxWaitTimeout)
            {
                Node deactivatingNode = await GetTargetNodeAsync();

                // Stop waiting when the node is no longer reported, or when it reached the state we're looking for.
                if (deactivatingNode == null || deactivatingNode.NodeStatus == NodeStatus.Disabled)
                {
                    break;
                }

                await Task.Delay(1000, cancellationToken);
            }

            stopwatch.Stop();
            stopwatch.Reset();
        }

        if (executorData.LatestRepairStep == FabricNodeRepairStep.Deactivate)
        {
            executorData.LatestRepairStep = FabricNodeRepairStep.Restart;

            if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
            {
                repairTask.ExecutorData = exData;
            }
            else
            {
                return false;
            }

            await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);

            actionMessage = $"In Step Restart Node.{Environment.NewLine}{repairTask.ExecutorData}";

            await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                    LogLevel.Info,
                    "RepairExecutor.SafeRestartFabricNodeAsyncAttemptingRestart::RestartStep",
                    actionMessage,
                    cancellationToken,
                    repairData,
                    FabricHealerManager.ConfigSettings.EnableVerboseLogging);

            // Now, restart node.
            _ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () =>
                        FabricHealerManager.FabricClientSingleton.FaultManager.RestartNodeAsync(
                            repairData.NodeName,
                            nodeInstanceId,
                            FabricHealerManager.ConfigSettings.AsyncTimeout,
                            cancellationToken),
                        cancellationToken);

            stopwatch.Start();

            // Wait for Disabled/OK
            while (stopwatch.Elapsed <= maxWaitTimeout)
            {
                Node restartingNode = await GetTargetNodeAsync();

                // Node is ready to be enabled. An empty query result is treated as "not ready yet" rather than an error.
                if (restartingNode != null
                    && restartingNode.NodeStatus == NodeStatus.Disabled
                    && restartingNode.HealthState == HealthState.Ok)
                {
                    break;
                }

                await Task.Delay(1000, cancellationToken);
            }

            stopwatch.Stop();
            stopwatch.Reset();
        }

        if (executorData.LatestRepairStep == FabricNodeRepairStep.Restart)
        {
            executorData.LatestRepairStep = FabricNodeRepairStep.Activate;

            if (JsonSerializationUtility.TrySerializeObject(executorData, out string exData))
            {
                repairTask.ExecutorData = exData;
            }
            else
            {
                return false;
            }

            await FabricHealerManager.FabricClientSingleton.RepairManager.UpdateRepairExecutionStateAsync(repairTask, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);

            // Now, enable the node.
            await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);

            await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);

            Node activatingNode = await GetTargetNodeAsync();

            // Make sure activation request went through.
            if (activatingNode != null
                && activatingNode.NodeStatus == NodeStatus.Disabled
                && activatingNode.HealthState == HealthState.Ok)
            {
                await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(repairData.NodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
            }

            await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
            UpdateRepairHistory(repairData);
            return true;
        }

        // The persisted step was not one this method can resume from.
        FabricHealerManager.RepairHistory.FailedRepairs++;
        return false;
    }
    catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TimeoutException)
    {
        string err = $"Handled Exception restarting Fabric node {repairData.NodeName}, NodeInstanceId {nodeInstanceId}:{e.GetType().Name}";

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                "RepairExecutor.SafeRestartFabricNodeAsync::HandledException",
                err,
                cancellationToken,
                repairData,
                FabricHealerManager.ConfigSettings.EnableVerboseLogging);

        FabricHealerManager.RepairLogger.LogInfo(err);
        FabricHealerManager.RepairHistory.FailedRepairs++;
        return false;
    }
}
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Restarts a stateful replica.
|
/// Restarts a stateful replica.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
using System;
|
using System;
|
||||||
using System.Diagnostics.Tracing;
|
using System.Diagnostics.Tracing;
|
||||||
using System.Fabric.Health;
|
using System.Fabric.Health;
|
||||||
|
using System.Fabric.Repair;
|
||||||
|
|
||||||
namespace FabricHealer.Repair
|
namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
|
@ -120,5 +121,6 @@ namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
get; set;
|
get; set;
|
||||||
}
|
}
|
||||||
|
public NodeImpactLevel NodeImpactLevel { get; internal set; }
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -7,11 +7,14 @@ using System;
|
||||||
using System.Fabric;
|
using System.Fabric;
|
||||||
using System.Fabric.Health;
|
using System.Fabric.Health;
|
||||||
using System.Fabric.Repair;
|
using System.Fabric.Repair;
|
||||||
|
using System.IO;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
|
using FabricHealer.TelemetryLib;
|
||||||
using FabricHealer.Utilities;
|
using FabricHealer.Utilities;
|
||||||
using FabricHealer.Utilities.Telemetry;
|
using FabricHealer.Utilities.Telemetry;
|
||||||
|
using Guan.Logic;
|
||||||
|
|
||||||
namespace FabricHealer.Repair
|
namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
|
@ -48,15 +51,10 @@ namespace FabricHealer.Repair
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeImpactLevel impact = executorData.RepairPolicy.RepairAction switch
|
NodeImpactLevel impact =
|
||||||
{
|
executorData.RepairPolicy.NodeImpactLevel != NodeImpactLevel.Invalid ? executorData.RepairPolicy.NodeImpactLevel : NodeImpactLevel.None;
|
||||||
RepairActionType.RestartFabricNode => NodeImpactLevel.Restart,
|
NodeRepairImpactDescription nodeRepairImpact = new();
|
||||||
RepairActionType.RemoveFabricNodeState => NodeImpactLevel.RemoveData,
|
NodeImpact impactedNode = new(executorData.RepairPolicy.NodeName, impact);
|
||||||
_ => NodeImpactLevel.None
|
|
||||||
};
|
|
||||||
|
|
||||||
var nodeRepairImpact = new NodeRepairImpactDescription();
|
|
||||||
var impactedNode = new NodeImpact(executorData.RepairPolicy.NodeName, impact);
|
|
||||||
nodeRepairImpact.ImpactedNodes.Add(impactedNode);
|
nodeRepairImpact.ImpactedNodes.Add(impactedNode);
|
||||||
RepairActionType repairAction = executorData.RepairPolicy.RepairAction;
|
RepairActionType repairAction = executorData.RepairPolicy.RepairAction;
|
||||||
string repair = repairAction.ToString();
|
string repair = repairAction.ToString();
|
||||||
|
@ -80,11 +78,18 @@ namespace FabricHealer.Repair
|
||||||
doHealthChecks = false;
|
doHealthChecks = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}";
|
||||||
|
|
||||||
|
if (impact == NodeImpactLevel.Restart || impact == NodeImpactLevel.RemoveData)
|
||||||
|
{
|
||||||
|
description = executorData.RepairPolicy.RepairId;
|
||||||
|
}
|
||||||
|
|
||||||
var repairTask = new ClusterRepairTask(taskId, repair)
|
var repairTask = new ClusterRepairTask(taskId, repair)
|
||||||
{
|
{
|
||||||
Target = new NodeRepairTargetDescription(executorData.RepairPolicy.NodeName),
|
Target = new NodeRepairTargetDescription(executorData.RepairPolicy.NodeName),
|
||||||
Impact = nodeRepairImpact,
|
Impact = nodeRepairImpact,
|
||||||
Description = $"FabricHealer executing repair {repair} on node {executorData.RepairPolicy.NodeName}",
|
Description = description,
|
||||||
State = RepairTaskState.Preparing,
|
State = RepairTaskState.Preparing,
|
||||||
Executor = RepairConstants.FabricHealer,
|
Executor = RepairConstants.FabricHealer,
|
||||||
ExecutorData = JsonSerializationUtility.TrySerializeObject(executorData, out string exData) ? exData : null,
|
ExecutorData = JsonSerializationUtility.TrySerializeObject(executorData, out string exData) ? exData : null,
|
||||||
|
@ -461,5 +466,99 @@ namespace FabricHealer.Repair
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Looks up the logic rule that contains the supplied predicate in the currently executing rule file and emits it
/// as a telemetry/ETW/health event. Matching ignores single quotes, double quotes and spaces, and is case-insensitive.
/// Only the first matching line is considered. If that line ends with '.', the rule text is that line; when the line
/// does not also contain ":-", the preceding lines ending with ',' are prepended to rebuild a multi-line rule.
/// </summary>
/// <param name="predicate">Text of the predicate being executed (callers in this file pass Input.ToString()).</param>
/// <param name="repairData">Repair data whose process/node name is used in the emitted event's property name.</param>
/// <returns>
/// true when the trace event was emitted (even if no matching rule text was found);
/// false when the rule file is missing or could not be processed.
/// </returns>
internal static async Task<bool> TryTraceCurrentlyExecutingRule(string predicate, TelemetryData repairData)
{
    string ruleFileName = FabricHealerManager.CurrentlyExecutingLogicRulesFileName, rule = string.Empty;
    int lineNumber = 0;

    try
    {
        string ruleFilePath =
            Path.Combine(
                FabricHealerManager.ServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path,
                "LogicRules",
                ruleFileName);

        if (!File.Exists(ruleFilePath))
        {
            FabricHealerManager.RepairLogger.LogWarning($"TryTraceCurrentlyExecutingRule: Specified rule file path does not exist: {ruleFilePath}.");
            return false;
        }

        string[] lines = File.ReadAllLines(ruleFilePath);
        int length = lines.Length;
        predicate = predicate.Replace("'", "").Replace("\"", "").Replace(" ", "");

        for (int i = 0; i < length; i++)
        {
            string line = lines[i].Replace("'", "").Replace("\"", "").Replace(" ", "");

            // Lines containing a comment marker, and blank lines, are never candidates.
            if (line.Contains("##") || string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            if (!line.Contains(predicate, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            lineNumber = i;
            line = lines[lineNumber];

            // final (repair) predicate ends with a . in FH.
            if (line.TrimEnd().EndsWith('.'))
            {
                rule = line.Replace('\t', ' ');

                // When the line also holds ":-" it is the whole rule. Otherwise walk backwards to collect its earlier lines.
                if (!line.Contains(":-"))
                {
                    for (int j = lineNumber - 1; j >= 0; j--)
                    {
                        string previous = lines[j];

                        // Blank and comment lines inside a formatted rule are skipped, not treated as a boundary.
                        if (string.IsNullOrWhiteSpace(previous) || previous.TrimStart().StartsWith("##", StringComparison.Ordinal))
                        {
                            continue;
                        }

                        // A line that does not end with ',' belongs to an earlier rule: stop here.
                        if (!previous.TrimEnd().EndsWith(','))
                        {
                            break;
                        }

                        rule = previous.Replace('\t', ' ').Trim() + ' ' + rule;
                        lineNumber = j;

                        // Reached the head of the rule.
                        if (previous.StartsWith("Mitigate"))
                        {
                            break;
                        }
                    }
                }
            }

            break;
        }

        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"{ruleFileName}#{lineNumber}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
                $"Executing logic rule \'{rule}\'",
                FabricHealerManager.Token);

        return true;
    }
    catch (Exception e) when (e is ArgumentException || e is IOException || e is SystemException)
    {
        string message = $"TraceCurrentlyExecutingRule failure => Unable to read {ruleFileName}: {e.Message}";
        FabricHealerManager.RepairLogger.LogWarning(message);
        await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                LogLevel.Info,
                $"TraceCurrentlyExecutingRule::{ruleFileName}::Failure",
                message,
                FabricHealerManager.Token);
    }

    return false;
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -17,7 +17,6 @@ using FabricHealer.Utilities.Telemetry;
|
||||||
using Guan.Logic;
|
using Guan.Logic;
|
||||||
using FabricHealer.Repair.Guan;
|
using FabricHealer.Repair.Guan;
|
||||||
using FabricHealer.Utilities;
|
using FabricHealer.Utilities;
|
||||||
using System.Fabric.Description;
|
|
||||||
|
|
||||||
namespace FabricHealer.Repair
|
namespace FabricHealer.Repair
|
||||||
{
|
{
|
||||||
|
@ -37,32 +36,6 @@ namespace FabricHealer.Repair
|
||||||
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(nodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
await FabricHealerManager.FabricClientSingleton.ClusterManager.ActivateNodeAsync(nodeName, FabricHealerManager.ConfigSettings.AsyncTimeout, cancellationToken);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Runs RepairExecutor.SafeRestartFabricNodeAsync for the node named in the repair data and reports the outcome
/// through a telemetry/ETW/health event.
/// </summary>
/// <param name="repairData">Repair data identifying the target node.</param>
/// <param name="repairTask">The repair task passed through to the executor.</param>
/// <param name="cancellationToken">Token passed to the executor and to the telemetry call.</param>
/// <returns>The executor's result: true if the node restart succeeded; otherwise false.</returns>
public static async Task<bool> SafeRestartServiceFabricNodeAsync(TelemetryData repairData, RepairTask repairTask, CancellationToken cancellationToken)
{
    bool restarted = await RepairExecutor.SafeRestartFabricNodeAsync(repairData, repairTask, cancellationToken);

    // Same event in both cases; only the message reflects the outcome.
    string outcome;

    if (restarted)
    {
        outcome = $"Successfully restarted Fabric node {repairData.NodeName}";
    }
    else
    {
        outcome = $"Did not restart Fabric node {repairData.NodeName}";
    }

    await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
            LogLevel.Info,
            "SafeRestartFabricNodeAsync",
            outcome,
            cancellationToken,
            repairData,
            FabricHealerManager.ConfigSettings.EnableVerboseLogging);

    return restarted;
}
|
|
||||||
|
|
||||||
public static async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken)
|
public static async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken)
|
||||||
{
|
{
|
||||||
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
||||||
|
@ -120,10 +93,10 @@ namespace FabricHealer.Repair
|
||||||
/// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param>
|
/// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param>
|
||||||
/// <returns></returns>
|
/// <returns></returns>
|
||||||
public static async Task RunGuanQueryAsync(
|
public static async Task RunGuanQueryAsync(
|
||||||
TelemetryData repairData,
|
TelemetryData repairData,
|
||||||
List<string> repairRules,
|
List<string> repairRules,
|
||||||
CancellationToken cancellationToken,
|
CancellationToken cancellationToken,
|
||||||
RepairExecutorData repairExecutorData = null)
|
RepairExecutorData repairExecutorData = null)
|
||||||
{
|
{
|
||||||
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
if (await RepairTaskEngine.CheckForActiveStopFHRepairJob(cancellationToken))
|
||||||
{
|
{
|
||||||
|
@ -142,14 +115,16 @@ namespace FabricHealer.Repair
|
||||||
functorTable.Add(LogInfoPredicateType.Singleton(RepairConstants.LogInfo));
|
functorTable.Add(LogInfoPredicateType.Singleton(RepairConstants.LogInfo));
|
||||||
functorTable.Add(LogErrorPredicateType.Singleton(RepairConstants.LogError));
|
functorTable.Add(LogErrorPredicateType.Singleton(RepairConstants.LogError));
|
||||||
functorTable.Add(LogWarningPredicateType.Singleton(RepairConstants.LogWarning));
|
functorTable.Add(LogWarningPredicateType.Singleton(RepairConstants.LogWarning));
|
||||||
|
functorTable.Add(TraceNextRulePredicateType.Singleton(RepairConstants.TraceNextRule, repairData));
|
||||||
functorTable.Add(CheckInsideHealthStateMinDurationPredicateType.Singleton(RepairConstants.CheckInsideHealthStateMinDuration, repairData));
|
functorTable.Add(CheckInsideHealthStateMinDurationPredicateType.Singleton(RepairConstants.CheckInsideHealthStateMinDuration, repairData));
|
||||||
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairData));
|
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairData));
|
||||||
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
|
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
|
||||||
|
|
||||||
// Add external repair predicates.
|
// Add external repair predicates.
|
||||||
|
functorTable.Add(DeactivateFabricNodePredicateType.Singleton(RepairConstants.DeactivateFabricNode, repairData));
|
||||||
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairData));
|
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairData));
|
||||||
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairData));
|
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairData));
|
||||||
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairExecutorData, repairData));
|
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairData));
|
||||||
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairData));
|
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairData));
|
||||||
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairData));
|
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairData));
|
||||||
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairData));
|
functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairData));
|
||||||
|
@ -514,6 +489,7 @@ namespace FabricHealer.Repair
|
||||||
|
|
||||||
// Don't attempt a node-level repair on a node where there is already an active node-level repair.
|
// Don't attempt a node-level repair on a node where there is already an active node-level repair.
|
||||||
if (repairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode
|
if (repairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode
|
||||||
|
|| repairData.RepairPolicy.RepairAction == RepairActionType.DeactivateNode
|
||||||
&& await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken))
|
&& await RepairTaskEngine.IsNodeLevelRepairCurrentlyInFlightAsync(repairData, cancellationToken))
|
||||||
{
|
{
|
||||||
string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
|
string message = $"Node {repairData.NodeName} already has a node-impactful repair in progress: " +
|
||||||
|
@ -831,30 +807,7 @@ namespace FabricHealer.Repair
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case RepairActionType.RestartFabricNode:
|
|
||||||
{
|
|
||||||
var executorData = repairTask.ExecutorData;
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(executorData))
|
|
||||||
{
|
|
||||||
|
|
||||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
|
||||||
LogLevel.Info,
|
|
||||||
$"RestartFabricNode::{repairData.NodeName}",
|
|
||||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
|
||||||
cancellationToken,
|
|
||||||
repairData,
|
|
||||||
FabricHealerManager.ConfigSettings.EnableVerboseLogging);
|
|
||||||
|
|
||||||
success = false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
success = await SafeRestartServiceFabricNodeAsync(repairData, repairTask, cancellationToken);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,11 @@ namespace FabricHealer.Utilities
|
||||||
private set;
|
private set;
|
||||||
} = 30;
|
} = 30;
|
||||||
|
|
||||||
|
public bool EnableLogicRuleTracing
|
||||||
|
{
|
||||||
|
get; private set;
|
||||||
|
}
|
||||||
|
|
||||||
public bool EnableVerboseLogging
|
public bool EnableVerboseLogging
|
||||||
{
|
{
|
||||||
get;
|
get;
|
||||||
|
@ -199,6 +204,12 @@ namespace FabricHealer.Utilities
|
||||||
OperationalTelemetryEnabled = fhOpTelemEnabled;
|
OperationalTelemetryEnabled = fhOpTelemEnabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Logic rule predicate tracing.
|
||||||
|
if (bool.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.EnableLogicRuleTracing), out bool traceRules))
|
||||||
|
{
|
||||||
|
EnableLogicRuleTracing = traceRules;
|
||||||
|
}
|
||||||
|
|
||||||
// Repair Policies
|
// Repair Policies
|
||||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled))
|
if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled))
|
||||||
{
|
{
|
||||||
|
|
|
@ -9,7 +9,6 @@ using System.Fabric.Repair;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
using System.Threading.Tasks;
|
using System.Threading.Tasks;
|
||||||
using FabricHealer.TelemetryLib;
|
|
||||||
using FabricHealer.Utilities;
|
using FabricHealer.Utilities;
|
||||||
|
|
||||||
namespace FabricHealer.Repair
|
namespace FabricHealer.Repair
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.17" xmlns="http://schemas.microsoft.com/2011/01/fabric">
|
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.18" xmlns="http://schemas.microsoft.com/2011/01/fabric">
|
||||||
<Parameters>
|
<Parameters>
|
||||||
<!-- FabricHealerManager Settings -->
|
<!-- FabricHealerManager Settings -->
|
||||||
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
|
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
|
||||||
<Parameter Name="EnableETW" DefaultValue="false" />
|
<Parameter Name="EnableETW" DefaultValue="false" />
|
||||||
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
|
<Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
|
||||||
|
<Parameter Name="EnableLogicRuleTracing" DefaultValue="true" />
|
||||||
<Parameter Name="EnableTelemetry" DefaultValue="false" />
|
<Parameter Name="EnableTelemetry" DefaultValue="false" />
|
||||||
<Parameter Name="EnableVerboseLogging" DefaultValue="true" />
|
<Parameter Name="EnableVerboseLogging" DefaultValue="true" />
|
||||||
<Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" />
|
<Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" />
|
||||||
|
@ -30,7 +31,7 @@
|
||||||
should match the Name and Version attributes of the ServiceManifest element defined in the
|
should match the Name and Version attributes of the ServiceManifest element defined in the
|
||||||
ServiceManifest.xml file. -->
|
ServiceManifest.xml file. -->
|
||||||
<ServiceManifestImport>
|
<ServiceManifestImport>
|
||||||
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.17" />
|
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.18" />
|
||||||
<ConfigOverrides>
|
<ConfigOverrides>
|
||||||
<ConfigOverride Name="Config">
|
<ConfigOverride Name="Config">
|
||||||
<Settings>
|
<Settings>
|
||||||
|
@ -44,6 +45,7 @@
|
||||||
<Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" />
|
<Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" />
|
||||||
<Parameter Name="EnableRollingServiceRestarts" Value="[EnableRollingServiceRestarts]" />
|
<Parameter Name="EnableRollingServiceRestarts" Value="[EnableRollingServiceRestarts]" />
|
||||||
<Parameter Name="LocalLogPath" Value="[LocalLogPath]" />
|
<Parameter Name="LocalLogPath" Value="[LocalLogPath]" />
|
||||||
|
<Parameter Name="EnableLogicRuleTracing" Value="[EnableLogicRuleTracing]" />
|
||||||
</Section>
|
</Section>
|
||||||
<!-- Repair policies -->
|
<!-- Repair policies -->
|
||||||
<Section Name="AppRepairPolicy">
|
<Section Name="AppRepairPolicy">
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
## FabricHealer 1.1.17
|
## FabricHealer 1.1.18
|
||||||
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
||||||
|
|
||||||
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
||||||
|
@ -78,7 +78,7 @@ Register-ServiceFabricApplicationType -ApplicationPathInImageStore FH1110
|
||||||
|
|
||||||
#Create FH application (if not already deployed at lesser version):
|
#Create FH application (if not already deployed at lesser version):
|
||||||
|
|
||||||
New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.17
|
New-ServiceFabricApplication -ApplicationName fabric:/FabricHealer -ApplicationTypeName FabricHealerType -ApplicationTypeVersion 1.1.18
|
||||||
|
|
||||||
#Create the Service instance:
|
#Create the Service instance:
|
||||||
|
|
||||||
|
@ -87,7 +87,7 @@ New-ServiceFabricService -Stateless -PartitionSchemeSingleton -ApplicationName f
|
||||||
|
|
||||||
#OR if updating existing version:
|
#OR if updating existing version:
|
||||||
|
|
||||||
Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.17 -Monitored -FailureAction rollback
|
Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricHealer -ApplicationTypeVersion 1.1.18 -Monitored -FailureAction rollback
|
||||||
```
|
```
|
||||||
|
|
||||||
## Using FabricHealer
|
## Using FabricHealer
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
## FabricHealer 1.1.17
|
## FabricHealer 1.1.18
|
||||||
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
### Configuration as Logic and auto-mitigation in Service Fabric clusters
|
||||||
|
|
||||||
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
FabricHealer (FH) is a .NET 6 Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
|
||||||
|
|
Загрузка…
Ссылка в новой задаче