FH 1.1.0. FHLib 1.0.0. SF 8.x Requirement. ARM Support.

This commit is contained in:
Charles Torre 2022-04-14 13:11:41 -07:00
Родитель cb83fc4c8d
Коммит e6aca7e477
67 изменённых файлов: 3222 добавлений и 1112 удалений

15
Build-FHLib.ps1 Normal file
Просмотреть файл

@ -0,0 +1,15 @@
# Publishes the FabricHealerLib project in Release configuration.
# The script locates itself on disk, so it can be invoked from any working directory.
$ErrorActionPreference = "Stop"
$Configuration="Release"
[string] $scriptPath = Split-Path -Parent $MyInvocation.MyCommand.Definition

try {
    Push-Location $scriptPath

    # Remove any previous library build output; a missing folder is not an error.
    Remove-Item $scriptPath\FabricHealerLib\bin\release\netstandard2.0\ -Recurse -Force -EA SilentlyContinue

    # NOTE(review): -o is a relative path, so it is resolved against the current location
    # ($scriptPath), not against the FabricHealerLib project folder that is cleaned above.
    # Confirm which output folder is intended.
    dotnet publish $scriptPath\FabricHealerLib\FabricHealerLib.csproj -o bin\release\netstandard2.0 -c $Configuration

    # $ErrorActionPreference = "Stop" does not turn a non-zero exit code from a native
    # executable into a terminating error, so a failed publish must be surfaced explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "dotnet publish failed with exit code $LASTEXITCODE."
    }
}
finally {
    # Always restore the caller's working directory.
    Pop-Location
}

45
Build-FHLibNupkg.ps1 Normal file
Просмотреть файл

@ -0,0 +1,45 @@
function Install-Nuget {
    # Ensures nuget.exe is present next to this script (top level of the repo).
    # Reads $scriptPath from the calling script's scope.
    $nugetExePath = "$scriptPath\nuget.exe"

    # Nothing to do when a copy has already been downloaded.
    if ([System.IO.File]::Exists($nugetExePath)) {
        return
    }

    # Latest nuget.exe published on nuget.org.
    $downloadUrl = "https://dist.nuget.org/win-x86-commandline/latest/nuget.exe"
    Invoke-WebRequest -Uri $downloadUrl -OutFile $nugetExePath
}
function Build-Nuget {
    # Generates "<packageId>.nuspec" from FabricHealerLib.nuspec.template and packs it with nuget.exe.
    #   packageId - NuGet package id; substituted for %PACKAGE_ID% in the template and used as the nuspec file name.
    #   basePath  - folder passed to "nuget.exe pack -basepath".
    # Reads $scriptPath from the calling script's scope and expects nuget.exe in the current directory.
    # Throws when nuget.exe exits with a non-zero code.
    param (
        [string]
        $packageId,
        [string]
        $basePath
    )

    [string] $nugetSpecTemplate = [System.IO.File]::ReadAllText([System.IO.Path]::Combine($scriptPath, "FabricHealerLib.nuspec.template"))
    [string] $nugetSpecPath = "$scriptPath\FabricHealerLib\bin\release\netstandard2.0\$($packageId).nuspec"

    # Fill in the template placeholders and write the concrete nuspec beside the library build output.
    [System.IO.File]::WriteAllText($nugetSpecPath, $nugetSpecTemplate.Replace("%PACKAGE_ID%", $packageId).Replace("%ROOT_PATH%", $scriptPath))

    .\nuget.exe pack $nugetSpecPath -basepath $basePath -OutputDirectory bin\release\FabricHealerLib\Nugets -properties NoWarn=NU5100,NU5128

    # A native executable's failure is only visible through its exit code; without this check
    # a failed pack would let the script finish as if the package had been produced.
    if ($LASTEXITCODE -ne 0) {
        throw "nuget.exe pack failed with exit code $LASTEXITCODE."
    }
}
# Entry point: make sure nuget.exe is available, then pack the FabricHealerLib NuGet package.
# $scriptPath keeps its name because Install-Nuget and Build-Nuget read it from this scope.
[string] $scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent

try {
    Push-Location -Path $scriptPath

    Install-Nuget

    $libPackageId = "Microsoft.ServiceFabricApps.FabricHealerLib"
    $libOutputPath = "$scriptPath\FabricHealerLib\bin\release\netstandard2.0"
    Build-Nuget -packageId $libPackageId -basePath $libOutputPath
}
finally {
    # Always restore the caller's working directory.
    Pop-Location
}

Просмотреть файл

@ -23,11 +23,11 @@ function Build-SFPkg {
try {
Push-Location $scriptPath
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.0.15" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.0.15" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.0" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.0" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.0.15" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.0.15" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.0" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.0" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
}
finally {
Pop-Location

Просмотреть файл

@ -19,6 +19,7 @@ using FabricHealer.Utilities;
using FabricHealer;
using System.Fabric.Repair;
using System.Diagnostics;
using System.Fabric.Health;
namespace FHTest
{
@ -124,13 +125,14 @@ namespace FHTest
TelemetryEnabled = false
};
// This will be the mock data used to create a repair task.
var foHealthData = new TelemetryData
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test",
NodeName = "TEST_0",
RepairId = "Test42",
Code = FOErrorWarningCodes.AppErrorMemoryMB,
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}",
Code = SupportedErrorCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
ServiceName = "fabric:/test0/service0",
Value = 1024.0
};
@ -146,7 +148,7 @@ namespace FHTest
try
{
await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData);
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
@ -172,17 +174,18 @@ namespace FHTest
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "testrules_wellformed");
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token).ConfigureAwait(true);
List<string> repairRules = ParseRulesFile(rules);
var foHealthData = new TelemetryData
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test0",
NodeName = "TEST_0",
Metric = "Memory",
RepairId = "Test42",
Code = FOErrorWarningCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}",
Code = SupportedErrorCodes.AppErrorMemoryMB,
ServiceName = "fabric:/test0/service0",
Value = 42,
ReplicaId = default,
PartitionId = default(Guid).ToString(),
PartitionId = default,
};
var executorData = new RepairExecutorData
@ -192,7 +195,7 @@ namespace FHTest
try
{
await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData);
await TestInitializeGuanAndRunQuery(repairData, repairRules, executorData);
}
catch (GuanException ge)
{
@ -217,17 +220,18 @@ namespace FHTest
string[] rules = await File.ReadAllLinesAsync(Path.Combine(Environment.CurrentDirectory, "testrules_malformed"), token).ConfigureAwait(true);
List<string> repairAction = ParseRulesFile(rules);
var foHealthData = new TelemetryData
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test0",
NodeName = "TEST_0",
Metric = "Memory",
RepairId = "Test42",
Code = FOErrorWarningCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}",
Code = SupportedErrorCodes.AppErrorMemoryMB,
ServiceName = "fabric:/test0/service0",
Value = 42,
ReplicaId = default,
PartitionId = default(Guid).ToString(),
PartitionId = default,
};
var executorData = new RepairExecutorData
@ -235,12 +239,12 @@ namespace FHTest
RepairPolicy = new RepairPolicy { RepairAction = RepairActionType.RestartCodePackage },
};
await Assert.ThrowsExceptionAsync<GuanException>(async () => { await TestInitializeGuanAndRunQuery(foHealthData, repairAction, executorData); });
await Assert.ThrowsExceptionAsync<GuanException>(async () => { await TestInitializeGuanAndRunQuery(repairData, repairAction, executorData); });
}
/* private Helpers */
private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthData, List<string> repairRules, RepairExecutorData executorData)
private async Task TestInitializeGuanAndRunQuery(TelemetryData repairData, List<string> repairRules, RepairExecutorData executorData)
{
var fabricClient = new FabricClient();
var repairTaskManager = new RepairTaskManager(fabricClient, context, token);
@ -251,19 +255,19 @@ namespace FHTest
FunctorTable functorTable = new FunctorTable();
// Add external helper predicates.
functorTable.Add(CheckFolderSizePredicateType.Singleton(RepairConstants.CheckFolderSize, repairTaskManager, foHealthData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairTaskManager, foHealthData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairTaskManager, foHealthData));
functorTable.Add(CheckInsideRunIntervalPredicateType.Singleton(RepairConstants.CheckInsideRunInterval, repairTaskManager, foHealthData));
functorTable.Add(CheckFolderSizePredicateType.Singleton(RepairConstants.CheckFolderSize, repairTaskManager, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairTaskManager, repairData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, repairTaskManager, repairData));
functorTable.Add(CheckInsideRunIntervalPredicateType.Singleton(RepairConstants.CheckInsideRunInterval, repairTaskManager, repairData));
functorTable.Add(EmitMessagePredicateType.Singleton(RepairConstants.EmitMessage, repairTaskManager));
// Add external repair predicates.
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairTaskManager, foHealthData));
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairTaskManager, foHealthData));
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairTaskManager, executorData, repairTaskEngine, foHealthData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairTaskManager, foHealthData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairTaskManager, foHealthData));
functorTable.Add(RestartVMPredicateType.Singleton(RepairConstants.RestartVM, repairTaskManager, foHealthData));
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, repairTaskManager, repairData));
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, repairTaskManager, repairData));
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairTaskManager, executorData, repairTaskEngine, repairData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairTaskManager, repairData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairTaskManager, repairData));
functorTable.Add(RestartVMPredicateType.Singleton(RepairConstants.RestartVM, repairTaskManager, repairData));
// Parse rules
Module module = Module.Parse("external", repairRules, functorTable);
@ -280,20 +284,22 @@ namespace FHTest
// The type of metric that led FO to generate the unhealthy evaluation for the entity (App, Node, VM, Replica, etc).
// We rename these for brevity for simplified use in logic rule composition (e.g., MetricName="Threads" instead of MetricName="Total Thread Count").
foHealthData.Metric = FOErrorWarningCodes.GetMetricNameFromCode(foHealthData.Code);
repairData.Metric = SupportedErrorCodes.GetMetricNameFromErrorCode(repairData.Code);
// These args hold the related values supplied by FO and are available anywhere Mitigate is used as a rule head.
compoundTerm.AddArgument(new Constant(foHealthData.ApplicationName), RepairConstants.AppName);
compoundTerm.AddArgument(new Constant(foHealthData.Code), RepairConstants.FOErrorCode);
compoundTerm.AddArgument(new Constant(foHealthData.Metric), RepairConstants.MetricName);
compoundTerm.AddArgument(new Constant(foHealthData.NodeName), RepairConstants.NodeName);
compoundTerm.AddArgument(new Constant(foHealthData.NodeType), RepairConstants.NodeType);
compoundTerm.AddArgument(new Constant(foHealthData.OS), RepairConstants.OS);
compoundTerm.AddArgument(new Constant(foHealthData.ServiceName), RepairConstants.ServiceName);
compoundTerm.AddArgument(new Constant(foHealthData.SystemServiceProcessName), RepairConstants.SystemServiceProcessName);
compoundTerm.AddArgument(new Constant(foHealthData.PartitionId), RepairConstants.PartitionId);
compoundTerm.AddArgument(new Constant(foHealthData.ReplicaId), RepairConstants.ReplicaOrInstanceId);
compoundTerm.AddArgument(new Constant(Convert.ToInt64(foHealthData.Value)), RepairConstants.MetricValue);
compoundTerm.AddArgument(new Constant(repairData.ApplicationName), RepairConstants.AppName);
compoundTerm.AddArgument(new Constant(repairData.Code), RepairConstants.ErrorCode);
compoundTerm.AddArgument(new Constant(Enum.GetName(typeof(HealthState), repairData.HealthState)), RepairConstants.HealthState);
compoundTerm.AddArgument(new Constant(repairData.Metric), RepairConstants.MetricName);
compoundTerm.AddArgument(new Constant(repairData.NodeName), RepairConstants.NodeName);
compoundTerm.AddArgument(new Constant(repairData.NodeType), RepairConstants.NodeType);
compoundTerm.AddArgument(new Constant(repairData.ObserverName), RepairConstants.ObserverName);
compoundTerm.AddArgument(new Constant(repairData.OS), RepairConstants.OS);
compoundTerm.AddArgument(new Constant(repairData.ServiceName), RepairConstants.ServiceName);
compoundTerm.AddArgument(new Constant(repairData.SystemServiceProcessName), RepairConstants.SystemServiceProcessName);
compoundTerm.AddArgument(new Constant(repairData.PartitionId), RepairConstants.PartitionId);
compoundTerm.AddArgument(new Constant(repairData.ReplicaId), RepairConstants.ReplicaOrInstanceId);
compoundTerm.AddArgument(new Constant(Convert.ToInt64(repairData.Value)), RepairConstants.MetricValue);
compoundTerms.Add(compoundTerm);
await queryDispatcher.RunQueryAsync(compoundTerms).ConfigureAwait(false);

Просмотреть файл

@ -69,12 +69,12 @@ Mitigate(MetricName="MemoryMB", MetricValue=?MetricValue) :- ?MetricValue >= 102
## Disk
Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
Mitigate(ErrorCode=?ErrorCode) :- ?ErrorCode == "FO042" || ?ErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
?repairCount < 4,
CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50),
DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false).
Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
Mitigate(ErrorCode=?ErrorCode) :- ?ErrorCode == "FO042" || ?ErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
?repairCount < 4,
CheckFolderSize("%SOMEPATHVAR%", MaxFolderSizeGB=50),
DeleteFiles("%SOMEPATHVAR%", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false).
@ -101,6 +101,7 @@ Mitigate(AppName=?AppName, MetricName="Threads", MetricValue=?MetricValue) :- ?A
## Threads - Any app service. 5 repairs within 5 hour window. This means if FO warns on Thread count, then heal. There are no conditional checks (on MetricValue) to take place.
## Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
Mitigate(ServiceName=?ServiceName) :- ?ServiceName != null, TimeScopedRestartReplica(5, 05:00:00).
## Internal Predicates

Просмотреть файл

@ -2,9 +2,12 @@
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
<metadata minClientVersion="3.3.0">
<id>%PACKAGE_ID%</id>
<version>1.0.15</version>
<version>1.1.0</version>
<releaseNotes>
Code improvements.
- FabricHealer no longer requires FabricObserver to be deployed in the same cluster.
- Any .NET SF service can now interoperate with FabricHealer using the FabricHealerLib .NET Standard library.
- FabricHealer now requires Microsoft.ServiceFabric.Services Version 5.0.516 and higher. Lesser runtime versions than 8.0.516 are no longer supported.
- FabricHealer can now be deployed via ARM, directly from the repo.
</releaseNotes>
<authors>Microsoft</authors>
<license type="expression">MIT</license>
@ -19,6 +22,7 @@ Code improvements.
</contentFiles>
<dependencies>
<dependency id="Microsoft.Logic.Guan" version="1.0.4" />
<dependency id="Microsoft.ServiceFabric.Services" version="5.0.516" />
</dependencies>
<projectUrl>https://github.com/microsoft/service-fabric-healer</projectUrl>
<tags>azure servicefabric fabrichealer fabricobserver auto-mitigation logic-programming guan</tags>

Просмотреть файл

@ -10,9 +10,13 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
.editorconfig = .editorconfig
.gitignore = .gitignore
Build-FabricHealer.ps1 = Build-FabricHealer.ps1
Build-FHLib.ps1 = Build-FHLib.ps1
Build-FHLibNupkg.ps1 = Build-FHLibNupkg.ps1
Build-NugetPackages.ps1 = Build-NugetPackages.ps1
Build-SFPKGs.ps1 = Build-SFPKGs.ps1
FabricHealer.nuspec.template = FabricHealer.nuspec.template
FabricHealerLib.nuspec.template = FabricHealerLib.nuspec.template
FabricHealerLibnuget.md = FabricHealerLibnuget.md
fhnuget.md = fhnuget.md
icon.png = icon.png
Documentation\LogicWorkflows.md = Documentation\LogicWorkflows.md
@ -27,6 +31,8 @@ Project("{A07B5EB6-E848-4116-A8D0-A826331D98C6}") = "FabricHealerApp", "FabricHe
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TelemetryLib", "TelemetryLib\TelemetryLib.csproj", "{7BC6991F-C840-413E-B1CD-4025947CF5FA}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FabricHealerLib", "FabricHealerLib\FabricHealerLib.csproj", "{C6180826-6E9D-488E-B212-999540122211}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@ -71,6 +77,14 @@ Global
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|Any CPU.Build.0 = Release|Any CPU
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|x64.ActiveCfg = Release|x64
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|x64.Build.0 = Release|x64
{C6180826-6E9D-488E-B212-999540122211}.Debug|Any CPU.ActiveCfg = Debug|x64
{C6180826-6E9D-488E-B212-999540122211}.Debug|Any CPU.Build.0 = Debug|x64
{C6180826-6E9D-488E-B212-999540122211}.Debug|x64.ActiveCfg = Debug|x64
{C6180826-6E9D-488E-B212-999540122211}.Debug|x64.Build.0 = Debug|x64
{C6180826-6E9D-488E-B212-999540122211}.Release|Any CPU.ActiveCfg = Release|x64
{C6180826-6E9D-488E-B212-999540122211}.Release|Any CPU.Build.0 = Release|x64
{C6180826-6E9D-488E-B212-999540122211}.Release|x64.ActiveCfg = Release|x64
{C6180826-6E9D-488E-B212-999540122211}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

Просмотреть файл

@ -12,8 +12,8 @@
<RuntimeIdentifier>win-x64</RuntimeIdentifier>-->
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>
<Product>FabricHealer</Product>
<Version>1.0.15</Version>
<FileVersion>1.0.15</FileVersion>
<Version>1.1.0</Version>
<FileVersion>1.1.0</FileVersion>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<IsServiceFabricServiceProject>true</IsServiceFabricServiceProject>
<StartupObject>FabricHealer.Program</StartupObject>
@ -26,8 +26,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.ApplicationInsights" Version="2.17.0" />
<PackageReference Include="Microsoft.Logic.Guan" Version="1.0.4" />
<PackageReference Include="Microsoft.ServiceFabric" Version="7.2.452" />
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="4.2.452" />
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="5.0.516" />
<PackageReference Include="Microsoft.CSharp" Version="4.7.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="NLog" Version="4.7.9" />

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -28,6 +28,6 @@ namespace FabricHealer.Interfaces
Task<bool> SafeRestartServiceFabricNodeAsync(RepairConfiguration repairConfiguration, RepairTask repairTask, CancellationToken cancellationToken);
Task StartRepairWorkflowAsync(TelemetryData foHealthData, List<string> repairRules, CancellationToken cancellationToken);
Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken);
}
}

Просмотреть файл

@ -0,0 +1,87 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Utilities.Telemetry;
using System;
using System.Fabric.Health;
namespace FabricHealer.Interfaces
{
/// <summary>
/// Shape of the telemetry data that describes a repair target and the problem reported for it.
/// Each member's comment states whether the value is required, optional, or not to be set by the caller.
/// </summary>
public interface ITelemetryData
{
/// <summary>
/// Required if the repair target is a Service. Service Fabric ApplicationName as a string value (OriginalString on a Uri instance, for example).
/// </summary>
string ApplicationName { get; set; }
/// <summary>
/// Required. The supported error code.
/// </summary>
string Code { get; set; }
/// <summary>
/// Required if the repair target is a container. The id of the container.
/// </summary>
string ContainerId { get; set; }
/// <summary>
/// Optional. The description of the problem being reported. This is unrelated to the Description of the Health Event that will be generated by ServiceFabricHealthReporter.
/// </summary>
string Description { get; set; }
/// <summary>
/// The Service Fabric entity type to be repaired.
/// </summary>
EntityType EntityType { get; set; }
/// <summary>
/// The health state, expressed as a System.Fabric.Health.HealthState enum value (not a string).
/// NOTE(review): the previous comment said this value is set by the consuming function - confirm against callers.
/// </summary>
HealthState HealthState { get; set; }
/// <summary>
/// Required. The supported resource usage metric name.
/// </summary>
string Metric { get; set; }
/// <summary>
/// Required. The name of the node where the entity resides.
/// </summary>
string NodeName { get; set; }
/// <summary>
/// The OS hosting Service Fabric. This is read-only.
/// </summary>
string OS { get; }
/// <summary>
/// Required if the repair target is a Service. The Partition Id (as a Guid) where the replica or instance resides that is in Error or Warning state.
/// </summary>
Guid PartitionId { get; set; }
/// <summary>
/// Optional. The host process id of the Service entity.
/// </summary>
long ProcessId { get; set; }
/// <summary>
/// Required if the repair target is a Service. The Replica or Instance id of the target Service replica.
/// </summary>
long ReplicaId { get; set; }
/// <summary>
/// Required if the repair target is a Service. The name of the service (as a string). This would be the same value as the OriginalString property of the ServiceName Uri instance.
/// </summary>
string ServiceName { get; set; }
/// <summary>
/// Required. This is the name of the service (as a string) that is generating the health report with this TelemetryData instance.
/// </summary>
string Source { get; set; }
/// <summary>
/// Optional in general, but required if you are targeting a Service Fabric System Service process. In that case, you should also supply the related value for ProcessId.
/// </summary>
string SystemServiceProcessName { get; set; }
/// <summary>
/// Optional. The supported resource usage metric value. NOTE: This value must be supplied if you expect to use this fact in related FabricHealer logic rules.
/// </summary>
double Value { get; set; }
/// <summary>
/// Don't set. The Fabric node type. FabricHealer will set this.
/// </summary>
string NodeType { get; set; }
/// <summary>
/// Required. This will be used as the Health Event Property.
/// </summary>
string Property { get; set; }
}
}

Просмотреть файл

@ -88,9 +88,7 @@ namespace FabricHealer.Interfaces
/// </summary>
/// <param name="telemetryData">TelemetryData instance.</param>
/// <param name="cancellationToken">CancellationToken instance.</param>
Task ReportMetricAsync(
TelemetryData telemetryData,
CancellationToken cancellationToken);
Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken cancellationToken);
/// <summary>
/// Calls telemetry provider to report a metric.

Просмотреть файл

@ -7,13 +7,13 @@
## | ServiceName | Name of the SF service, format is fabric:/SomeApp/SomeService |
## | NodeName | Name of the node |
## | NodeType | Type of node |
## | ObserverName | Name of Observer that generated the event. |
## | ObserverName | Name of Observer that generated the event (if the data comes from FabricObserver service) |
## | PartitionId | Id of the partition |
## | ReplicaOrInstanceId | Id of the replica or instance |
## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") |
## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS from which the FO data was collected (Linux or Windows) |
## | ErrorCode | Supported Error Code emitted by caller (e.g. "FO002") |
## | MetricName | Name of the metric (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS from which the data was collected (Linux or Windows) |
## Application-related Metric Names.
## | Name |
@ -153,6 +153,9 @@ Mitigate(AppName=?AppName, MetricName="Threads", MetricValue=?MetricValue) :- ?A
## Threads - Any app service. 5 repairs within 5 hour window. This means if FO warns on Thread count, then heal. There are no conditional checks (on MetricValue) to take place.
## Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
## Generic rule for any service (no facts besides ServiceName). This means any service that is in Error or Warning state and
## also specified in the serialized TelemetryData instance that forms the Description of the related Service level Health Event will be restarted.
Mitigate(ReplicaOrInstanceId=?ReplicaOrInstanceId) :- ?ReplicaOrInstanceId > 0, TimeScopedRestartReplica(5, 05:00:00).
## Internal Predicates

Просмотреть файл

@ -5,11 +5,11 @@
## |---------------------------|----------------------------------------------------------------------------------------------|
## | NodeName | Name of the node |
## | NodeType | Type of node |
## | ObserverName | Name of Observer that generated the event |
## | FOErrorCode | Error Code emitted by FO (Disk codes are FO007-FO010, FO042, F0043) |
## | MetricName | Name of the metric supplied by FO |
## | MetricValue | Corresponding value for supplied metric name |
## | OS | The name of the OS from which the FO data was collected (Linux or Windows) |
## | ObserverName | Name of Observer that generated the event (if data comes from the FabricObserver service) |
## | ErrorCode | Supported Error Code emitted by caller (Disk codes are FO007-FO010, FO042, FO043) |
## | MetricName | Name of the supplied metric |
## | MetricValue | Corresponding value for supplied metric |
## | OS | The name of the OS from which the data was collected (Linux or Windows) |
## Disk-related Metric Names.
## | Name |
@ -67,19 +67,19 @@ Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHi
## Folder size Warning, check ErrorCode from FO (you could also check MetricName -> MetricName=FolderSizeMB, but we already do that in several places).
## See the SupportedErrorCodes class for the list of codes and the renaming function (GetMetricNameFromErrorCode).
Mitigate(FOErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00),
Mitigate(ErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00),
?repairCount < 4,
CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50),
DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false).
## Constrain on folder size Error or Warning code.
Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
Mitigate(ErrorCode=?ErrorCode) :- ?ErrorCode == "FO042" || ?ErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
?repairCount < 4,
CheckFolderSize("C:\fabric_observer_logs", MaxFolderSizeMB=250),
DeleteFiles("C:\fabric_observer_logs", SortOrder=Ascending, MaxFilesToDelete=5, RecurseSubdirectories=true, SearchPattern="*.dmp").
## Constrain on folder size Error or Warning code; use environment variable for/in supplied path. Note: Environment variable string must be enclosed in quotes.
Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
Mitigate(ErrorCode=?ErrorCode) :- ?ErrorCode == "FO042" || ?ErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00),
?repairCount < 4,
CheckFolderSize("%SOMEPATHVAR%", MaxFolderSizeGB=50),
DeleteFiles("%SOMEPATHVAR%", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false).

Просмотреть файл

@ -1,13 +1,11 @@
## Logic rules for Service Fabric Node repairs. These are not used today. FabricObserver does not monitor Fabric nodes.
## Fabric nodes are only put into Warning or Error health state by FO, as configured by user, if the underlying VM is having issues.
## See SystemAppRules.config.txt for logic rules related to system service process issues detected by FabricObserver.
## Logic rules for Service Fabric Node repairs.
## First check if we are inside the run interval. If so, cut (!).
## This is commented out by default. Just uncomment and set the global run interval for app Fabric node level repairs to suit your needs.
## Mitigate() :- CheckInsideRunInterval(02:00:00), !.
## This rule means that whatever the warning data from FabricObserver happens to be (related to node level healing repairs, of course), restart the target node if
## This rule means that whatever the warning data from the issuing service happens to be (related to node level healing repairs, of course), restart the target node if
## the repair hasn't run 4 times in the last 8 hours.
Mitigate() :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, RestartFabricNode().

Просмотреть файл

@ -6,11 +6,11 @@
## | AppName* | Name of the SF System Application. *This is always fabric:/System (FO monitors SF system service processes). |
## | NodeName | Name of the node |
## | NodeType | Type of node |
## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") |
## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) |
## | SystemServiceProcessName | The name of a Fabric system service process supplied in FO health data |
## | OS | The name of the OS from which the FO data was collected (Linux or Windows) |
## | ErrorCode | Supported Error Code emitted by caller (e.g. "FO002") |
## | MetricName | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric value (e.g. "85" indicating 85% CPU usage) |
## | SystemServiceProcessName | The name of a Fabric system service process supplied in TelemetryData instance |
## | OS | The name of the OS from which the data was collected (Linux or Windows) |
## System Service-related Metric Names.
## | Name |

Просмотреть файл

@ -5,10 +5,10 @@
## |---------------------------|----------------------------------------------------------------------------------------------|
## | NodeName | Name of the node |
## | NodeType | Type of node |
## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") |
## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS from which the FO data was collected (Linux or Windows) |
## | ErrorCode | Supported Error Code emitted by caller (e.g. "FO002") |
## | MetricName | Name of the Metric (e.g., CpuPercent or MemoryMB, etc.) |
## | MetricValue | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
## | OS | The name of the OS from which the data was collected (Linux or Windows) |
## VM-related Metric Names.
## | Name |

Просмотреть файл

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<ServiceManifest Name="FabricHealerPkg"
Version="1.0.15"
Version="1.1.0"
xmlns="http://schemas.microsoft.com/2011/01/fabric"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
@ -11,7 +11,7 @@
</ServiceTypes>
<!-- Code package is your service executable. -->
<CodePackage Name="Code" Version="1.0.15">
<CodePackage Name="Code" Version="1.1.0">
<EntryPoint>
<ExeHost>
<Program>FabricHealer</Program>
@ -21,5 +21,5 @@
<!-- Config package is the contents of the Config directory under PackageRoot that contains an
independently-updateable and versioned set of custom configuration settings for your service. -->
<ConfigPackage Name="Config" Version="1.0.15" />
<ConfigPackage Name="Config" Version="1.1.0" />
</ServiceManifest>

Просмотреть файл

@ -13,6 +13,7 @@ using System.Threading;
using System.Threading.Tasks;
using FabricHealer.Utilities;
using FabricHealer.Utilities.Telemetry;
using FabricHealer.Interfaces;
namespace FabricHealer.Repair
{
@ -72,8 +73,18 @@ namespace FabricHealer.Repair
repairTask.State = RepairTaskState.Restoring;
repairTask.ResultStatus = RepairTaskResult.Cancelled;
_ = await fabricClient.RepairManager.UpdateRepairExecutionStateAsync(repairTask).ConfigureAwait(false);
try
{
_ = await fabricClient.RepairManager.UpdateRepairExecutionStateAsync(repairTask).ConfigureAwait(false);
}
catch (FabricException fe)
{
// TODO: Decide how (or whether) a failure to update the repair task's execution state should be handled.
if (fe.ErrorCode == FabricErrorCode.SequenceNumberCheckFailed)
{
// TODO: Handling is undecided, so this is intentionally a no-op for now. This failure occurs intermittently and appears to be timing-related.
}
}
break;
case RepairTaskState.Invalid:
@ -268,7 +279,7 @@ namespace FabricHealer.Repair
public static async Task<bool> IsLastCompletedFHRepairTaskWithinTimeRangeAsync(
TimeSpan interval,
FabricClient fabricClient,
TelemetryData foHealthData,
TelemetryData repairData,
CancellationToken cancellationToken)
{
@ -289,11 +300,11 @@ namespace FabricHealer.Repair
var orderedRepairList = allRecentFHRepairTasksCompleted.OrderByDescending(o => o.CompletedTimestamp).ToList();
// There could be several repairs of this type for the same repair target in RM's db.
if (orderedRepairList.Any(r => r.ExecutorData.Contains(foHealthData.RepairId)))
if (orderedRepairList.Any(r => r.ExecutorData.Contains(repairData.RepairId)))
{
foreach (var repair in orderedRepairList)
{
if (repair.ExecutorData.Contains(foHealthData.RepairId))
if (repair.ExecutorData.Contains(repairData.RepairId))
{
// Completed aborted/cancelled repair tasks should not block repairs if they are inside run interval.
return repair.CompletedTimestamp != null && repair.Flags != RepairTaskFlags.AbortRequested && repair.Flags != RepairTaskFlags.CancelRequested && DateTime.UtcNow.Subtract(repair.CompletedTimestamp.Value) <= interval;
@ -304,8 +315,8 @@ namespace FabricHealer.Repair
// VM repairs - IS is executor, ExecutorData is supplied by IS. Custom FH repair id supplied as repair Description.
foreach (var repair in allRecentFHRepairTasksCompleted.Where(r => r.ResultStatus == RepairTaskResult.Succeeded))
{
if (repair.Executor != $"fabric:/System/InfrastructureService/{foHealthData.NodeType}" ||
repair.Description != foHealthData.RepairId)
if (repair.Executor != $"fabric:/System/InfrastructureService/{repairData.NodeType}" ||
repair.Description != repairData.RepairId)
{
continue;
}
@ -329,7 +340,7 @@ namespace FabricHealer.Repair
public static async Task<int> GetCompletedRepairCountWithinTimeRangeAsync(
TimeSpan timeWindow,
FabricClient fabricClient,
TelemetryData foHealthData,
TelemetryData repairData,
CancellationToken cancellationToken)
{
var allRecentFHRepairTasksCompleted =
@ -360,7 +371,7 @@ namespace FabricHealer.Repair
continue;
}
if (foHealthData.RepairId != fhExecutorData.RepairPolicy.RepairId)
if (repairData.RepairId != fhExecutorData.RepairPolicy.RepairId)
{
continue;
}
@ -373,7 +384,7 @@ namespace FabricHealer.Repair
}
}
// VM repairs (IS is executor, ExecutorData supplied by IS. Custom FH repair id supplied as repair Description.)
else if (repair.Executor == $"{RepairTaskEngine.InfrastructureServiceName}/{foHealthData.NodeType}" && repair.Description == foHealthData.RepairId)
else if (repair.Executor == $"{RepairTaskEngine.InfrastructureServiceName}/{repairData.NodeType}" && repair.Description == repairData.RepairId)
{
if (repair.CompletedTimestamp == null || !repair.CompletedTimestamp.HasValue)
{

Просмотреть файл

@ -18,7 +18,7 @@ namespace FabricHealer.Repair.Guan
{
private static CheckFolderSizePredicateType Instance;
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private class Resolver : BooleanPredicateResolver
{
@ -110,7 +110,7 @@ namespace FabricHealer.Repair.Guan
}
string message =
$"Repair {FOHealthData.RepairId}: Supplied Maximum folder size value ({(maxFolderSizeGB > 0 ? maxFolderSizeGB + "GB" : maxFolderSizeMB + "MB")}) " +
$"Repair {RepairData.RepairId}: Supplied Maximum folder size value ({(maxFolderSizeGB > 0 ? maxFolderSizeGB + "GB" : maxFolderSizeMB + "MB")}) " +
$"for path {folderPath} is less than computed folder size ({size}{(maxFolderSizeGB > 0 ? "GB" : "MB")}). " +
"Will not attempt repair.";
@ -141,9 +141,9 @@ namespace FabricHealer.Repair.Guan
}
}
public static CheckFolderSizePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static CheckFolderSizePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
FOHealthData = foHealthData;
RepairData = repairData;
RepairTaskManager = repairTaskManager;
return Instance ??= new CheckFolderSizePredicateType(name);

Просмотреть файл

@ -15,7 +15,7 @@ namespace FabricHealer.Repair.Guan
{
private static CheckInsideRunIntervalPredicateType Instance;
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private class Resolver : BooleanPredicateResolver
{
@ -47,7 +47,7 @@ namespace FabricHealer.Repair.Guan
bool insideRunInterval = await FabricRepairTasks.IsLastCompletedFHRepairTaskWithinTimeRangeAsync(
interval,
RepairTaskManager.FabricClientInstance,
FOHealthData,
RepairData,
RepairTaskManager.Token).ConfigureAwait(false);
if (!insideRunInterval)
@ -55,22 +55,22 @@ namespace FabricHealer.Repair.Guan
return false;
}
string message = $"Repair with ID {FOHealthData.RepairId} has already run once within the specified run interval ({(runInterval > TimeSpan.MinValue ? runInterval : interval)}).{Environment.NewLine}" +
string message = $"Repair with ID {RepairData.RepairId} has already run once within the specified run interval ({(runInterval > TimeSpan.MinValue ? runInterval : interval)}).{Environment.NewLine}" +
"Will not attempt repair at this time.";
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"CheckInsideRunInterval::{FOHealthData.RepairId}",
$"CheckInsideRunInterval::{RepairData.RepairId}",
message,
RepairTaskManager.Token).ConfigureAwait(false);
return true;
}
}
public static CheckInsideRunIntervalPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static CheckInsideRunIntervalPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new CheckInsideRunIntervalPredicateType(name);
}

Просмотреть файл

@ -16,7 +16,7 @@ namespace FabricHealer.Repair.Guan
public class DeleteFilesPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static DeleteFilesPredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -28,15 +28,22 @@ namespace FabricHealer.Repair.Guan
{
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(FOHealthData.ServiceName) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new DiskRepairPolicy()
AppName = null,
ErrorCode = RepairData.Code,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = default,
ReplicaOrInstanceId = 0,
ServiceName = null,
MetricValue = RepairData.Value,
RepairPolicy = new DiskRepairPolicy
{
RepairAction = RepairActionType.DeleteFiles,
TargetType = RepairTargetType.VirtualMachine,
RepairId = RepairData.RepairId
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
@ -114,12 +121,7 @@ namespace FabricHealer.Repair.Guan
}
}
// RepairPolicy (base)
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.DeleteFiles;
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.VirtualMachine;
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
// DiskRepairPolicy (derives from RepairPolicy)
// DiskRepairPolicy
(repairConfiguration.RepairPolicy as DiskRepairPolicy).FolderPath = path;
(repairConfiguration.RepairPolicy as DiskRepairPolicy).MaxNumberOfFilesToDelete = maxFilesToDelete;
(repairConfiguration.RepairPolicy as DiskRepairPolicy).FileAgeSortOrder = direction;
@ -132,7 +134,6 @@ namespace FabricHealer.Repair.Guan
repairConfiguration,
RepairTaskManager.Token),
RepairTaskManager.Token).ConfigureAwait(false);
if (repairTask == null)
{
return false;
@ -149,9 +150,9 @@ namespace FabricHealer.Repair.Guan
}
}
public static DeleteFilesPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static DeleteFilesPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
FOHealthData = foHealthData;
RepairData = repairData;
RepairTaskManager = repairTaskManager;
return Instance ??= new DeleteFilesPredicateType(name);

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class GetHealthEventHistoryPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static GetHealthEventHistoryPredicateType Instance;
private class Resolver : GroundPredicateResolver
@ -32,7 +32,7 @@ namespace FabricHealer.Repair.Guan
if (timeRange > TimeSpan.MinValue)
{
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(FOHealthData.HealthEventProperty, timeRange);
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(RepairData.Property, timeRange);
}
else
{
@ -41,7 +41,7 @@ namespace FabricHealer.Repair.Guan
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"GetHealthEventHistoryPredicateType::{FOHealthData.HealthEventProperty}",
$"GetHealthEventHistoryPredicateType::{RepairData.Property}",
message,
RepairTaskManager.Token).ConfigureAwait(false);
}
@ -52,10 +52,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static GetHealthEventHistoryPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static GetHealthEventHistoryPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new GetHealthEventHistoryPredicateType(name);
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class GetRepairHistoryPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static GetRepairHistoryPredicateType Instance;
private class Resolver : GroundPredicateResolver
@ -35,7 +35,7 @@ namespace FabricHealer.Repair.Guan
repairCount = await FabricRepairTasks.GetCompletedRepairCountWithinTimeRangeAsync(
timeWindow,
RepairTaskManager.FabricClientInstance,
FOHealthData,
RepairData,
RepairTaskManager.Token).ConfigureAwait(false);
}
else
@ -46,7 +46,7 @@ namespace FabricHealer.Repair.Guan
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"GetRepairHistoryPredicate::{FOHealthData.RepairId}",
$"GetRepairHistoryPredicate::{RepairData.RepairId}",
message,
RepairTaskManager.Token).ConfigureAwait(false);
}
@ -57,10 +57,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static GetRepairHistoryPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static GetRepairHistoryPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new GetRepairHistoryPredicateType(name);
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class RestartCodePackagePredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static RestartCodePackagePredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -27,26 +27,29 @@ namespace FabricHealer.Repair.Guan
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
ContainerId = FOHealthData.ContainerId,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(FOHealthData.ServiceName) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new RepairPolicy()
AppName = !string.IsNullOrWhiteSpace(RepairData.ApplicationName) ? new Uri(RepairData.ApplicationName) : null,
ContainerId = RepairData.ContainerId,
ErrorCode = RepairData.Code,
EntityType = RepairData.EntityType,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = RepairData.PartitionId != Guid.Empty ? RepairData.PartitionId : default,
ReplicaOrInstanceId = RepairData.ReplicaId > 0 ? RepairData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(RepairData.ServiceName) ? new Uri(RepairData.ServiceName) : null,
MetricValue = RepairData.Value,
RepairPolicy = new RepairPolicy
{
RepairAction = RepairActionType.RestartCodePackage,
RepairId = RepairData.RepairId,
TargetType = RepairTargetType.Application
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
protected override async Task<bool> CheckAsync()
{
// RepairPolicy
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
int count = Input.Arguments.Count;
for (int i = 0; i < count; i++)
@ -90,10 +93,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static RestartCodePackagePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static RestartCodePackagePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new RestartCodePackagePredicateType(name);
}

Просмотреть файл

@ -17,7 +17,7 @@ namespace FabricHealer.Repair.Guan
private static RepairTaskManager RepairTaskManager;
private static RepairExecutorData RepairExecutorData;
private static RepairTaskEngine RepairTaskEngine;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static RestartFabricNodePredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -29,25 +29,28 @@ namespace FabricHealer.Repair.Guan
{
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = (!string.IsNullOrWhiteSpace(FOHealthData.ServiceName) && FOHealthData.ServiceName.Contains("fabric:/")) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new RepairPolicy()
AppName = !string.IsNullOrWhiteSpace(RepairData.ApplicationName) ? new Uri(RepairData.ApplicationName) : null,
ErrorCode = RepairData.Code,
EntityType = RepairData.EntityType,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = default,
ReplicaOrInstanceId = RepairData.ReplicaId > 0 ? RepairData.ReplicaId : 0,
ServiceName = (!string.IsNullOrWhiteSpace(RepairData.ServiceName) && RepairData.ServiceName.Contains("fabric:/")) ? new Uri(RepairData.ServiceName) : null,
MetricValue = RepairData.Value,
RepairPolicy = new RepairPolicy
{
RepairAction = RepairActionType.RestartFabricNode,
RepairId = RepairData.RepairId,
TargetType = RepairData.ApplicationName == RepairConstants.SystemAppName ? RepairTargetType.Application : RepairTargetType.Node
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
protected override async Task<bool> CheckAsync()
{
// Repair Policy
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
repairConfiguration.RepairPolicy.TargetType = FOHealthData.ApplicationName == RepairConstants.SystemAppName ? RepairTargetType.Application : RepairTargetType.Node;
int count = Input.Arguments.Count;
for (int i = 0; i < count; i++)
@ -95,11 +98,11 @@ namespace FabricHealer.Repair.Guan
if (isNodeRepairAlreadyInProgress)
{
string message =
$"A Fabric Node repair, {FOHealthData.RepairId}, is already in progress in the cluster. Will not attempt repair at this time.";
$"A Fabric Node repair, {RepairData.RepairId}, is already in progress in the cluster. Will not attempt repair at this time.";
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"RestartFabricNodePredicateType::{FOHealthData.RepairId}",
$"RestartFabricNodePredicateType::{RepairData.RepairId}",
message,
RepairTaskManager.Token).ConfigureAwait(false);
return false;
@ -133,12 +136,12 @@ namespace FabricHealer.Repair.Guan
RepairTaskManager repairTaskManager,
RepairExecutorData repairExecutorData,
RepairTaskEngine repairTaskEngine,
TelemetryData foHealthData)
TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
RepairExecutorData = repairExecutorData;
RepairTaskEngine = repairTaskEngine;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new RestartFabricNodePredicateType(name);
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class RestartFabricSystemProcessPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static RestartFabricSystemProcessPredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -27,18 +27,26 @@ namespace FabricHealer.Repair.Guan
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
ContainerId = FOHealthData.ContainerId,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ProcessId = FOHealthData.ProcessId > 0 ? FOHealthData.ProcessId : -1,
SystemServiceProcessName = !string.IsNullOrWhiteSpace(FOHealthData.SystemServiceProcessName) ? FOHealthData.SystemServiceProcessName : string.Empty,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(FOHealthData.ServiceName) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new RepairPolicy()
AppName = !string.IsNullOrWhiteSpace(RepairData.ApplicationName) ? new Uri(RepairData.ApplicationName) : null,
ContainerId = RepairData.ContainerId,
ErrorCode = RepairData.Code,
EntityType = RepairData.EntityType,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = default,
ProcessId = (int)(RepairData.ProcessId > 0 ? RepairData.ProcessId : -1),
SystemServiceProcessName = !string.IsNullOrWhiteSpace(RepairData.SystemServiceProcessName) ? RepairData.SystemServiceProcessName : string.Empty,
ReplicaOrInstanceId = RepairData.ReplicaId > 0 ? RepairData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(RepairData.ServiceName) ? new Uri(RepairData.ServiceName) : null,
MetricValue = RepairData.Value,
RepairPolicy = new RepairPolicy
{
RepairAction = RepairActionType.RestartProcess,
RepairId = RepairData.RepairId,
TargetType = RepairTargetType.Application
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
@ -50,11 +58,6 @@ namespace FabricHealer.Repair.Guan
return false;
}
// RepairPolicy
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
int count = Input.Arguments.Count;
for (int i = 0; i < count; i++)
@ -98,10 +101,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static RestartFabricSystemProcessPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static RestartFabricSystemProcessPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new RestartFabricSystemProcessPredicateType(name);
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class RestartReplicaPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static RestartReplicaPredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -26,24 +26,28 @@ namespace FabricHealer.Repair.Guan
{
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(FOHealthData.ServiceName) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new RepairPolicy()
AppName = !string.IsNullOrWhiteSpace(RepairData.ApplicationName) ? new Uri(RepairData.ApplicationName) : null,
ErrorCode = RepairData.Code,
EntityType = RepairData.EntityType,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = RepairData.PartitionId != Guid.Empty ? RepairData.PartitionId : default,
ReplicaOrInstanceId = RepairData.ReplicaId > 0 ? RepairData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(RepairData.ServiceName) ? new Uri(RepairData.ServiceName) : null,
MetricValue = RepairData.Value,
RepairPolicy = new RepairPolicy
{
RepairId = RepairData.RepairId,
TargetType = RepairTargetType.Application,
RepairAction = RepairActionType.RestartReplica
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
protected override async Task<bool> CheckAsync()
{
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
int count = Input.Arguments.Count;
for (int i = 0; i < count; i++)
@ -87,10 +91,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static RestartReplicaPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static RestartReplicaPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new RestartReplicaPredicateType(name);
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace FabricHealer.Repair.Guan
public class RestartVMPredicateType : PredicateType
{
private static RepairTaskManager RepairTaskManager;
private static TelemetryData FOHealthData;
private static TelemetryData RepairData;
private static RestartVMPredicateType Instance;
private class Resolver : BooleanPredicateResolver
@ -26,25 +26,28 @@ namespace FabricHealer.Repair.Guan
{
repairConfiguration = new RepairConfiguration
{
AppName = !string.IsNullOrWhiteSpace(FOHealthData.ApplicationName) ? new Uri(FOHealthData.ApplicationName) : null,
FOErrorCode = FOHealthData.Code,
NodeName = FOHealthData.NodeName,
NodeType = FOHealthData.NodeType,
PartitionId = !string.IsNullOrWhiteSpace(FOHealthData.PartitionId) ? new Guid(FOHealthData.PartitionId) : default,
ReplicaOrInstanceId = FOHealthData.ReplicaId > 0 ? FOHealthData.ReplicaId : 0,
ServiceName = !string.IsNullOrWhiteSpace(FOHealthData.ServiceName) ? new Uri(FOHealthData.ServiceName) : null,
FOHealthMetricValue = FOHealthData.Value,
RepairPolicy = new RepairPolicy()
AppName = null,
ErrorCode = RepairData.Code,
EntityType = RepairData.EntityType,
NodeName = RepairData.NodeName,
NodeType = RepairData.NodeType,
PartitionId = default,
ReplicaOrInstanceId = 0,
ServiceName = null,
MetricValue = RepairData.Value,
RepairPolicy = new RepairPolicy
{
RepairAction = RepairActionType.RestartVM,
RepairId = RepairData.RepairId,
TargetType = RepairTargetType.VirtualMachine
},
EventSourceId = RepairData.Source,
EventProperty = RepairData.Property
};
}
protected override async Task<bool> CheckAsync()
{
// Repair Policy
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartVM;
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.VirtualMachine;
int count = Input.Arguments.Count;
for (int i = 0; i < count; i++)
@ -71,17 +74,17 @@ namespace FabricHealer.Repair.Guan
var repairTaskEngine = new RepairTaskEngine(RepairTaskManager.FabricClientInstance);
var isRepairAlreadyInProgress =
await repairTaskEngine.IsFHRepairTaskRunningAsync(
$"{RepairTaskEngine.InfrastructureServiceName}/{FOHealthData.NodeType}",
$"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
repairConfiguration,
RepairTaskManager.Token).ConfigureAwait(false);
if (isRepairAlreadyInProgress)
{
string message = $"VM Repair {FOHealthData.RepairId} is already in progress. Will not attempt repair at this time.";
string message = $"VM Repair {RepairData.RepairId} is already in progress. Will not attempt repair at this time.";
await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"RestartVMPredicateType::{FOHealthData.RepairId}",
$"RestartVMPredicateType::{RepairData.RepairId}",
message,
RepairTaskManager.Token).ConfigureAwait(false);
return false;
@ -96,10 +99,10 @@ namespace FabricHealer.Repair.Guan
}
}
public static RestartVMPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData foHealthData)
public static RestartVMPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
{
RepairTaskManager = repairTaskManager;
FOHealthData = foHealthData;
RepairData = repairData;
return Instance ??= new RestartVMPredicateType(name);
}

Просмотреть файл

@ -3,6 +3,7 @@
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Utilities.Telemetry;
using System;
namespace FabricHealer.Repair
@ -54,18 +55,33 @@ namespace FabricHealer.Repair
get; set;
}
public string FOErrorCode
public string ErrorCode
{
get; set;
}
public object FOHealthMetricValue
public object MetricValue
{
get; set;
}
/// <summary>
/// Gets or sets the event property for this repair. The repair predicates copy it from TelemetryData.Property
/// when they build the repair configuration.
/// </summary>
public string EventProperty { get; set; }
/// <summary>
/// Gets or sets the name of the Fabric system service process for this repair. The system-process restart
/// predicate copies it from TelemetryData.SystemServiceProcessName, substituting an empty string when that
/// value is null or whitespace.
/// </summary>
public string SystemServiceProcessName { get; set; }
/// <summary>
/// Gets or sets the event source id for this repair. The repair predicates copy it from TelemetryData.Source
/// when they build the repair configuration.
/// </summary>
public string EventSourceId { get; set; }
/// <summary>
/// Gets or sets the entity type for this repair. The repair predicates that set it copy the value from
/// TelemetryData.EntityType.
/// </summary>
public EntityType EntityType { get; set; }
}

Просмотреть файл

@ -44,7 +44,7 @@ namespace FabricHealer.Repair
public const string SystemAppRepairPolicySectionName = "SystemAppRepairPolicy";
public const string VmRepairPolicySectionName = "VMRepairPolicy";
// RepairPolicy Settings Parameters.
// RepairPolicy
public const string Enabled = "Enabled";
// Mitigate Argument names.
@ -54,11 +54,12 @@ namespace FabricHealer.Repair
public const string NodeType = "NodeType";
public const string PartitionId = "PartitionId";
public const string ReplicaOrInstanceId = "ReplicaOrInstanceId";
public const string FOErrorCode = "FOErrorCode";
public const string ErrorCode = "ErrorCode";
public const string MetricName = "MetricName";
public const string MetricValue = "MetricValue";
public const string OS = "OS";
public const string SystemServiceProcessName = "SystemServiceProcessName";
public const string HealthState = "HealthState";
// Repair Actions.
public const string DeleteFiles = "DeleteFiles";

Просмотреть файл

@ -22,6 +22,7 @@ using System.Linq;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.ComponentModel;
using Newtonsoft.Json;
namespace FabricHealer.Repair
{
@ -83,17 +84,7 @@ namespace FabricHealer.Repair
}
else
{
await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);
replicaList = await fabricClient.QueryManager.GetReplicaListAsync(
repairConfiguration.PartitionId,
null,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken).ConfigureAwait(false);
if (replicaList.Count == 0)
{
await telemetryUtilities.EmitTelemetryEtwHealthEventAsync(
await telemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartCodePackageAsync",
$"Execution failure: Replica {repairConfiguration.ReplicaOrInstanceId} " +
@ -101,12 +92,7 @@ namespace FabricHealer.Repair
cancellationToken,
repairConfiguration,
FabricHealerManager.ConfigSettings.EnableVerboseLogging).ConfigureAwait(false);
return null;
}
replica = replicaList.First(r => r.ReplicaStatus == ServiceReplicaStatus.Ready);
replicaId = replica.Id;
return null;
}
ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(partitionSelector, replicaId);
@ -132,7 +118,7 @@ namespace FabricHealer.Repair
if (restartCodePackageResult != null)
{
UpdateRepairHistory(repairConfiguration);
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken, RepairConstants.AppObserver).ConfigureAwait(false);
ClearEntityHealthWarnings(repairConfiguration);
}
return restartCodePackageResult;
@ -477,11 +463,12 @@ namespace FabricHealer.Repair
/// <param name="repairConfiguration">RepairConfiguration instance.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns></returns>
public async Task<RestartReplicaResult> RestartReplicaAsync(RepairConfiguration repairConfiguration, CancellationToken cancellationToken)
public async Task<ReplicaResult> RestartReplicaAsync(RepairConfiguration repairConfiguration, CancellationToken cancellationToken)
{
RestartReplicaResult replicaResult;
ReplicaResult replicaResult = null;
string actionMessage = $"Attempting to restart stateful replica {repairConfiguration.ReplicaOrInstanceId} " +
$"on partition {repairConfiguration.PartitionId} on node {repairConfiguration.NodeName}.";
await telemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartReplicaAsync::Start",
@ -516,31 +503,43 @@ namespace FabricHealer.Repair
}
replicaSelector = ReplicaSelector.ReplicaIdOf(partitionSelector, repairConfiguration.ReplicaOrInstanceId);
replicaResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
fabricClient.FaultManager.RestartReplicaAsync(
replicaSelector,
CompletionMode.Verify,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken), cancellationToken).ConfigureAwait(false);
string statusSuccess =
$"Successfully restarted stateful replica {repairConfiguration.ReplicaOrInstanceId} " +
$"on partition {repairConfiguration.PartitionId} " +
$"on node {repairConfiguration.NodeName}.";
await telemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartReplicaAsync::Success",
statusSuccess,
cancellationToken,
repairConfiguration,
FabricHealerManager.ConfigSettings.EnableVerboseLogging).ConfigureAwait(false);
UpdateRepairHistory(repairConfiguration);
try
{
replicaResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
() =>
fabricClient.FaultManager.RestartReplicaAsync(
replicaSelector,
CompletionMode.Verify,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken), cancellationToken).ConfigureAwait(false);
}
catch (FabricException fe)
{
// This would mean the stateful service replica is volatile (no persisted state), so we have to Remove it.
if (fe.ErrorCode == FabricErrorCode.InvalidReplicaOperation && fe.InnerException.Message == "0x80071C3A")
{
replicaResult = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(false);
}
}
if (replicaResult != null)
{
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken).ConfigureAwait(false);
string statusSuccess =
$"Successfully restarted stateful replica {repairConfiguration.ReplicaOrInstanceId} " +
$"on partition {repairConfiguration.PartitionId} " +
$"on node {repairConfiguration.NodeName}.";
await telemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairExecutor.RestartReplicaAsync::Success",
statusSuccess,
cancellationToken,
repairConfiguration,
FabricHealerManager.ConfigSettings.EnableVerboseLogging).ConfigureAwait(false);
UpdateRepairHistory(repairConfiguration);
ClearEntityHealthWarnings(repairConfiguration);
}
}
catch (Exception e) when (e is FabricException || e is TimeoutException || e is OperationCanceledException)
@ -562,6 +561,7 @@ namespace FabricHealer.Repair
FabricHealerManager.RepairHistory.FailedRepairs++;
return null;
}
return replicaResult;
}
@ -603,7 +603,7 @@ namespace FabricHealer.Repair
UpdateRepairHistory(repairConfiguration);
// Clear Warning from FO. If in fact the issue has not been solved, then FO will generate a new health report for the target and the game will be played again.
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken, RepairConstants.FabricSystemObserver).ConfigureAwait(false);
ClearEntityHealthWarnings(repairConfiguration);
}
catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException || e is Win32Exception)
{
@ -751,7 +751,7 @@ namespace FabricHealer.Repair
if (replicaResult != null)
{
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken).ConfigureAwait(false);
ClearEntityHealthWarnings(repairConfiguration);
}
}
catch (Exception e) when (e is FabricException || e is TimeoutException || e is OperationCanceledException)
@ -910,7 +910,7 @@ namespace FabricHealer.Repair
UpdateRepairHistory(repairConfiguration);
}
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Node, cancellationToken, RepairConstants.DiskObserver).ConfigureAwait(false);
ClearEntityHealthWarnings(repairConfiguration);
return true;
}
@ -953,106 +953,74 @@ namespace FabricHealer.Repair
/// <summary>
/// Clears existing health warnings for target repair entity. This should only be called after a repair operation succeeds.
/// </summary>
/// <param name="repairConfiguration">RepairConfiguration instance used for repair.</param>
/// <param name="healthScope">Repair target health scope (FO, by default, produces 2 types of reports: Application and Node)</param>
/// <param name="cancellationToken">CancellationToken instance.</param>
/// <param name="source">Optional: The name of the source Observer (AppObserver, DiskObserver, etc...)</param>
/// <returns></returns>
private async Task ClearHealthWarningsAsync(RepairConfiguration repairConfiguration, HealthScope healthScope, CancellationToken cancellationToken, string source = null)
/// <param name="repairConfiguration">RepairConfiguration instance.</param>
private void ClearEntityHealthWarnings(RepairConfiguration repairConfiguration)
{
try
{
if (healthScope == HealthScope.Application)
var telemetryData = new TelemetryData
{
var appHealth = await fabricClient.HealthManager.GetApplicationHealthAsync(
repairConfiguration.AppName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken).ConfigureAwait(false);
ApplicationName = repairConfiguration.AppName?.OriginalString,
ServiceName = repairConfiguration.ServiceName?.OriginalString,
Code = "FO000",
HealthState = HealthState.Ok,
Description = $"{(repairConfiguration.EventSourceId == RepairConstants.FabricSystemObserver ? repairConfiguration.SystemServiceProcessName : repairConfiguration.ServiceName.OriginalString)} has been repaired.",
NodeName = repairConfiguration.NodeName,
NodeType = repairConfiguration.NodeType,
Source = RepairTaskEngine.FabricHealerExecutorName,
SystemServiceProcessName = $"{(repairConfiguration.EventSourceId == RepairConstants.FabricSystemObserver ? repairConfiguration.SystemServiceProcessName : string.Empty)}",
};
var unhealthyFOAppEvents = appHealth.HealthEvents?.Where(
s => s.HealthInformation.SourceId.Contains(source ?? "Observer")
&& (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)
&& JsonSerializationUtility.TryDeserialize(s.HealthInformation.Description, out TelemetryData foHealthData)
&& repairConfiguration.NodeName == foHealthData.NodeName
&& foHealthData.ApplicationName == repairConfiguration.AppName.OriginalString
&& (!string.IsNullOrWhiteSpace(foHealthData.SystemServiceProcessName) ? repairConfiguration.SystemServiceProcessName == foHealthData.SystemServiceProcessName : foHealthData.ServiceName == repairConfiguration.ServiceName.OriginalString));
var telemetryData = new TelemetryData
{
ApplicationName = repairConfiguration.AppName?.OriginalString,
ServiceName = repairConfiguration.ServiceName?.OriginalString,
Code = "FO000",
HealthState = "Ok",
Description = $"{(source == RepairConstants.FabricSystemObserver ? repairConfiguration.SystemServiceProcessName : repairConfiguration.ServiceName.OriginalString)} has been repaired.",
NodeName = repairConfiguration.NodeName,
NodeType = repairConfiguration.NodeType,
Source = RepairTaskEngine.FabricHealerExecutorName,
SystemServiceProcessName = $"{(source == RepairConstants.FabricSystemObserver ? repairConfiguration.SystemServiceProcessName : string.Empty)}",
};
if (unhealthyFOAppEvents != null)
{
foreach (var evt in unhealthyFOAppEvents)
{
if (repairConfiguration.ServiceName != null || repairConfiguration.SystemServiceProcessName != null)
{
var healthInfo = new HealthInformation(evt.HealthInformation.SourceId, evt.HealthInformation.Property, HealthState.Ok)
{
Description = JsonSerializationUtility.TrySerialize(telemetryData, out string data)
? data
: $"{(source == RepairConstants.FabricSystemObserver ? repairConfiguration.SystemServiceProcessName : repairConfiguration.ServiceName.OriginalString)} has been repaired.",
TimeToLive = TimeSpan.FromMinutes(5),
RemoveWhenExpired = true,
};
var healthReport = new ApplicationHealthReport(repairConfiguration.AppName, healthInfo);
fabricClient.HealthManager.ReportHealth(healthReport, new HealthReportSendOptions {Immediate = true});
}
await Task.Delay(250, cancellationToken);
}
}
}
else
var healthInformation = new HealthInformation(repairConfiguration.EventSourceId, repairConfiguration.EventProperty, HealthState.Ok)
{
var nodeHealth = await fabricClient.HealthManager.GetNodeHealthAsync(
repairConfiguration.NodeName,
FabricHealerManager.ConfigSettings.AsyncTimeout,
cancellationToken).ConfigureAwait(false);
Description = JsonConvert.SerializeObject(telemetryData),
TimeToLive = TimeSpan.FromMinutes(5),
RemoveWhenExpired = true
};
var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where(
s => s.HealthInformation.SourceId.Contains(source ?? "Observer")
&& (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)
&& JsonSerializationUtility.TryDeserialize(s.HealthInformation.Description, out TelemetryData foHealthData)
&& foHealthData?.NodeName == repairConfiguration.NodeName
&& foHealthData?.Code == repairConfiguration.FOErrorCode);
var sendOptions = new HealthReportSendOptions { Immediate = false };
var telemetryData = new TelemetryData
{
Code = "FO000",
HealthState = "Ok",
Description = $"{repairConfiguration.NodeName} has been repaired.",
NodeName = repairConfiguration.NodeName,
NodeType = repairConfiguration.NodeType,
Source = RepairTaskEngine.FabricHealerExecutorName,
};
switch (repairConfiguration.EntityType)
{
case EntityType.Application when repairConfiguration.AppName != null:
foreach (var evt in unhealthyFONodeEvents)
{
cancellationToken.ThrowIfCancellationRequested();
var appHealthReport = new ApplicationHealthReport(repairConfiguration.AppName, healthInformation);
fabricClient.HealthManager.ReportHealth(appHealthReport, sendOptions);
break;
var healthInfo = new HealthInformation(evt.HealthInformation.SourceId, evt.HealthInformation.Property, HealthState.Ok)
{
Description = JsonSerializationUtility.TrySerialize(telemetryData, out string data) ? data : $"{repairConfiguration.NodeName} has been repaired.",
TimeToLive = TimeSpan.FromMinutes(5),
RemoveWhenExpired = true,
};
case EntityType.Service when repairConfiguration.ServiceName != null:
var healthReport = new NodeHealthReport(repairConfiguration.NodeName, healthInfo);
fabricClient.HealthManager.ReportHealth(healthReport, new HealthReportSendOptions { Immediate = true });
var serviceHealthReport = new ServiceHealthReport(repairConfiguration.ServiceName, healthInformation);
fabricClient.HealthManager.ReportHealth(serviceHealthReport, sendOptions);
break;
await Task.Delay(250, cancellationToken);
}
case EntityType.StatefulService when repairConfiguration.PartitionId != null && repairConfiguration.ReplicaOrInstanceId > 0:
var statefulServiceHealthReport = new StatefulServiceReplicaHealthReport(repairConfiguration.PartitionId, repairConfiguration.ReplicaOrInstanceId, healthInformation);
fabricClient.HealthManager.ReportHealth(statefulServiceHealthReport, sendOptions);
break;
case EntityType.StatelessService when repairConfiguration.PartitionId != null && repairConfiguration.ReplicaOrInstanceId > 0:
var statelessServiceHealthReport = new StatelessServiceInstanceHealthReport(repairConfiguration.PartitionId, repairConfiguration.ReplicaOrInstanceId, healthInformation);
fabricClient.HealthManager.ReportHealth(statelessServiceHealthReport, sendOptions);
break;
case EntityType.Partition when repairConfiguration.PartitionId != null:
var partitionHealthReport = new PartitionHealthReport(repairConfiguration.PartitionId, healthInformation);
fabricClient.HealthManager.ReportHealth(partitionHealthReport, sendOptions);
break;
case EntityType.DeployedApplication when repairConfiguration != null && !string.IsNullOrWhiteSpace(repairConfiguration.NodeName):
var deployedApplicationHealthReport = new DeployedApplicationHealthReport(repairConfiguration.AppName, repairConfiguration.NodeName, healthInformation);
fabricClient.HealthManager.ReportHealth(deployedApplicationHealthReport, sendOptions);
break;
case EntityType.Node when !string.IsNullOrWhiteSpace(repairConfiguration.NodeName):
var nodeHealthReport = new NodeHealthReport(repairConfiguration.NodeName, healthInformation);
fabricClient.HealthManager.ReportHealth(nodeHealthReport, sendOptions);
break;
}
}
catch (Exception e) when (e is FabricException || e is TimeoutException)

Просмотреть файл

@ -45,7 +45,7 @@ namespace FabricHealer.Repair
}
[DataMember]
public string FOErrorCode
public string ErrorCode
{
get; set;
}

Просмотреть файл

@ -29,7 +29,7 @@ namespace FabricHealer.Repair
public async Task<RepairTask> CreateFabricHealerRepairTask(RepairExecutorData executorData, CancellationToken token)
{
if (executorData == null || executorData.NodeName == null || executorData.FOErrorCode == null)
if (executorData == null || executorData.NodeName == null)
{
return null;
}
@ -71,7 +71,7 @@ namespace FabricHealer.Repair
// Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
// So, do not do health checks if customer configures FO to emit Error health reports.
// In general, FO should *not* be configured to emit Error events. See FO documentation.
if (executorData.FOErrorCode != null && FOErrorWarningCodes.GetErrorWarningNameFromCode(executorData.FOErrorCode).Contains("Error"))
if (executorData.ErrorCode != null && SupportedErrorCodes.GetCodeNameFromErrorCode(executorData.ErrorCode).Contains("Error"))
{
doHealthChecks = false;
}
@ -124,7 +124,7 @@ namespace FabricHealer.Repair
}
string taskId = $"{FHTaskIdPrefix}/{HostVMReboot}/{(uint)repairConfiguration.NodeName.GetHashCode()}/{repairConfiguration.NodeType}";
bool doHealthChecks = !FOErrorWarningCodes.GetErrorWarningNameFromCode(repairConfiguration.FOErrorCode).Contains("Error");
bool doHealthChecks = !SupportedErrorCodes.GetCodeNameFromErrorCode(repairConfiguration.ErrorCode).Contains("Error");
// Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
// So, do not do health checks if customer configures FO to emit Error health reports.

Просмотреть файл

@ -86,13 +86,13 @@ namespace FabricHealer.Repair
return true;
}
public async Task StartRepairWorkflowAsync(TelemetryData foHealthData, List<string> repairRules, CancellationToken cancellationToken)
public async Task StartRepairWorkflowAsync(TelemetryData repairData, List<string> repairRules, CancellationToken cancellationToken)
{
Node node = null;
if (foHealthData.NodeName != null)
if (repairData.NodeName != null)
{
node = await GetFabricNodeFromNodeNameAsync(foHealthData.NodeName, cancellationToken).ConfigureAwait(false);
node = await GetFabricNodeFromNodeNameAsync(repairData.NodeName, cancellationToken).ConfigureAwait(false);
}
if (node == null)
@ -144,11 +144,14 @@ namespace FabricHealer.Repair
return;
}
foHealthData.NodeType = node.NodeType;
if (string.IsNullOrEmpty(repairData.NodeType))
{
repairData.NodeType = node.NodeType;
}
try
{
await RunGuanQueryAsync(foHealthData, repairRules);
await RunGuanQueryAsync(repairData, repairRules);
}
catch (GuanException ge)
{
@ -166,29 +169,29 @@ namespace FabricHealer.Repair
/// This is the entry point to Guan parsing and query execution. It creates the necessary Guan objects to successfully execute logic rules based on supplied FO data
/// and related repair rules.
/// </summary>
/// <param name="foHealthData">Health data from FO for target SF entity</param>
/// <param name="repairData">Health data from FO for target SF entity</param>
/// <param name="repairRules">Repair rules that are related to target SF entity</param>
/// <param name="repairExecutorData">Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example)</param>
/// <returns></returns>
public async Task RunGuanQueryAsync(TelemetryData foHealthData, List<string> repairRules, RepairExecutorData repairExecutorData = null)
public async Task RunGuanQueryAsync(TelemetryData repairData, List<string> repairRules, RepairExecutorData repairExecutorData = null)
{
// Add predicate types to functor table. Note that all health information data from FO are automatically passed to all predicates.
FunctorTable functorTable = new FunctorTable();
// Add external helper predicates.
functorTable.Add(CheckFolderSizePredicateType.Singleton(RepairConstants.CheckFolderSize, this, foHealthData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, this, foHealthData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, foHealthData));
functorTable.Add(CheckInsideRunIntervalPredicateType.Singleton(RepairConstants.CheckInsideRunInterval, this, foHealthData));
functorTable.Add(CheckFolderSizePredicateType.Singleton(RepairConstants.CheckFolderSize, this, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, this, repairData));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, repairData));
functorTable.Add(CheckInsideRunIntervalPredicateType.Singleton(RepairConstants.CheckInsideRunInterval, this, repairData));
functorTable.Add(EmitMessagePredicateType.Singleton(RepairConstants.EmitMessage, this));
// Add external repair predicates.
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, this, foHealthData));
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, this, foHealthData));
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, this, repairExecutorData, repairTaskEngine, foHealthData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, this, foHealthData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, this, foHealthData));
functorTable.Add(RestartVMPredicateType.Singleton(RepairConstants.RestartVM, this, foHealthData));
functorTable.Add(DeleteFilesPredicateType.Singleton(RepairConstants.DeleteFiles, this, repairData));
functorTable.Add(RestartCodePackagePredicateType.Singleton(RepairConstants.RestartCodePackage, this, repairData));
functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, this, repairExecutorData, repairTaskEngine, repairData));
functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, this, repairData));
functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, this, repairData));
functorTable.Add(RestartVMPredicateType.Singleton(RepairConstants.RestartVM, this, repairData));
// Parse rules.
Module module = Module.Parse("external", repairRules, functorTable);
@ -205,22 +208,23 @@ namespace FabricHealer.Repair
// The type of metric that led FO to generate the unhealthy evaluation for the entity (App, Node, VM, Replica, etc).
// We rename these for brevity for simplified use in logic rule composition (e.g., MetricName="Threads" instead of MetricName="Total Thread Count").
foHealthData.Metric = FOErrorWarningCodes.GetMetricNameFromCode(foHealthData.Code);
repairData.Metric = SupportedErrorCodes.GetMetricNameFromErrorCode(repairData.Code);
// These args hold the related values supplied by FO and are available anywhere Mitigate is used as a rule head.
// Think of these as facts from FabricObserver.
compoundTerm.AddArgument(new Constant(foHealthData.ApplicationName), RepairConstants.AppName);
compoundTerm.AddArgument(new Constant(foHealthData.Code), RepairConstants.FOErrorCode);
compoundTerm.AddArgument(new Constant(foHealthData.Metric), RepairConstants.MetricName);
compoundTerm.AddArgument(new Constant(foHealthData.NodeName), RepairConstants.NodeName);
compoundTerm.AddArgument(new Constant(foHealthData.NodeType), RepairConstants.NodeType);
compoundTerm.AddArgument(new Constant(foHealthData.ObserverName), RepairConstants.ObserverName);
compoundTerm.AddArgument(new Constant(foHealthData.OS), RepairConstants.OS);
compoundTerm.AddArgument(new Constant(foHealthData.ServiceName), RepairConstants.ServiceName);
compoundTerm.AddArgument(new Constant(foHealthData.SystemServiceProcessName), RepairConstants.SystemServiceProcessName);
compoundTerm.AddArgument(new Constant(foHealthData.PartitionId), RepairConstants.PartitionId);
compoundTerm.AddArgument(new Constant(foHealthData.ReplicaId), RepairConstants.ReplicaOrInstanceId);
compoundTerm.AddArgument(new Constant(Convert.ToInt64(foHealthData.Value)), RepairConstants.MetricValue);
compoundTerm.AddArgument(new Constant(repairData.ApplicationName), RepairConstants.AppName);
compoundTerm.AddArgument(new Constant(repairData.Code), RepairConstants.ErrorCode);
compoundTerm.AddArgument(new Constant(Enum.GetName(typeof(HealthState), repairData.HealthState)), RepairConstants.HealthState);
compoundTerm.AddArgument(new Constant(repairData.Metric), RepairConstants.MetricName);
compoundTerm.AddArgument(new Constant(repairData.NodeName), RepairConstants.NodeName);
compoundTerm.AddArgument(new Constant(repairData.NodeType), RepairConstants.NodeType);
compoundTerm.AddArgument(new Constant(repairData.ObserverName), RepairConstants.ObserverName);
compoundTerm.AddArgument(new Constant(repairData.OS), RepairConstants.OS);
compoundTerm.AddArgument(new Constant(repairData.ServiceName), RepairConstants.ServiceName);
compoundTerm.AddArgument(new Constant(repairData.SystemServiceProcessName), RepairConstants.SystemServiceProcessName);
compoundTerm.AddArgument(new Constant(repairData.PartitionId), RepairConstants.PartitionId);
compoundTerm.AddArgument(new Constant(repairData.ReplicaId), RepairConstants.ReplicaOrInstanceId);
compoundTerm.AddArgument(new Constant(Convert.ToInt64(repairData.Value)), RepairConstants.MetricValue);
compoundTerms.Add(compoundTerm);
// Run Guan query.
@ -548,8 +552,8 @@ namespace FabricHealer.Repair
var executorData = new RepairExecutorData
{
ExecutorTimeoutInMinutes = (int)MaxWaitTimeForInfraRepairTaskCompleted.TotalMinutes,
FOErrorCode = repairConfiguration.FOErrorCode,
FOMetricValue = repairConfiguration.FOHealthMetricValue,
ErrorCode = repairConfiguration.ErrorCode,
FOMetricValue = repairConfiguration.MetricValue,
RepairPolicy = repairConfiguration.RepairPolicy,
NodeName = repairConfiguration.NodeName,
NodeType = repairConfiguration.NodeType,
@ -774,6 +778,8 @@ namespace FabricHealer.Repair
var replica = repList[0];
var partition = await FabricClientInstance.QueryManager.GetPartitionAsync(repairConfiguration.PartitionId);
var partitionKind = partition[0].PartitionInformation.Kind;
// Restart - stateful replica.
if (replica.ServiceKind == ServiceKind.Stateful)
{
@ -1041,20 +1047,20 @@ namespace FabricHealer.Repair
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
h => JsonSerializationUtility.TryDeserialize(
h.HealthInformation.Description,
out TelemetryData foHealthData)
&& foHealthData.NodeName == repairConfig.NodeName
&& foHealthData.SystemServiceProcessName == repairConfig.SystemServiceProcessName
&& foHealthData.HealthState.ToLower() == "ok");
out TelemetryData repairData)
&& repairData.NodeName == repairConfig.NodeName
&& repairData.SystemServiceProcessName == repairConfig.SystemServiceProcessName
&& repairData.HealthState == HealthState.Ok);
}
else // Application repairs (code package restarts)
{
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
h => JsonSerializationUtility.TryDeserialize(
h.HealthInformation.Description,
out TelemetryData foHealthData)
&& foHealthData.NodeName == repairConfig.NodeName
&& foHealthData.ApplicationName == repairConfig.AppName.OriginalString
&& foHealthData.HealthState.ToLower() == "ok");
out TelemetryData repairData)
&& repairData.NodeName == repairConfig.NodeName
&& repairData.ApplicationName == repairConfig.AppName.OriginalString
&& repairData.HealthState == HealthState.Ok);
}
return isTargetAppHealedOnTargetNode ? HealthState.Ok : appHealth.AggregatedHealthState;
@ -1072,9 +1078,9 @@ namespace FabricHealer.Repair
bool isTargetNodeHealed = nodeHealth.HealthEvents.Any(
h => JsonSerializationUtility.TryDeserialize(
h.HealthInformation.Description,
out TelemetryData foHealthData)
&& foHealthData.NodeName == repairConfig.NodeName
&& foHealthData.HealthState.ToLower() == "ok");
out TelemetryData repairData)
&& repairData.NodeName == repairConfig.NodeName
&& repairData.HealthState == HealthState.Ok);
return isTargetNodeHealed ? HealthState.Ok : nodeHealth.AggregatedHealthState;
}

Просмотреть файл

@ -104,7 +104,7 @@ namespace FabricHealer.Utilities
private set;
}
public bool EnableNodeRepair
public bool EnableFabricNodeRepair
{
get;
private set;
@ -225,7 +225,7 @@ namespace FabricHealer.Utilities
if (bool.TryParse(GetConfigSettingValue(RepairConstants.FabricNodeRepairPolicySectionName, RepairConstants.Enabled), out bool nodeRepairEnabled))
{
EnableNodeRepair = nodeRepairEnabled;
EnableFabricNodeRepair = nodeRepairEnabled;
}
if (bool.TryParse(GetConfigSettingValue(RepairConstants.ReplicaRepairPolicySectionName, RepairConstants.Enabled), out bool replicaRepairEnabled))

Просмотреть файл

@ -1,252 +0,0 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Repair;
using System.Collections.Generic;
using System.Linq;
namespace FabricHealer.Utilities
{
// FabricObserver Error/Warning/Ok Codes.
public static class FOErrorWarningCodes
{
// Code constants. Each value is a string of the form "FOnnn"; most metrics define an Error code and a Warning code,
// split into App* (application-level) and Node* (node-level) names. The dictionaries below map each code back to its name.

// Ok: the single non-error, non-warning code.
public const string Ok = "FO000";
// CPU
public const string AppErrorCpuPercent = "FO001";
public const string AppWarningCpuPercent = "FO002";
public const string NodeErrorCpuPercent = "FO003";
public const string NodeWarningCpuPercent = "FO004";
// Certificate
public const string ErrorCertificateExpiration = "FO005";
public const string WarningCertificateExpiration = "FO006";
// Disk (the folder size codes, FO042/FO043, are numbered out of sequence with the rest of this group)
public const string NodeErrorDiskSpacePercent = "FO007";
public const string NodeErrorDiskSpaceMB = "FO008";
public const string NodeWarningDiskSpacePercent = "FO009";
public const string NodeWarningDiskSpaceMB = "FO010";
public const string NodeErrorDiskAverageQueueLength = "FO011";
public const string NodeWarningDiskAverageQueueLength = "FO012";
public const string NodeErrorFolderSizeMB = "FO042";
public const string NodeWarningFolderSizeMB = "FO043";
// Memory
public const string AppErrorMemoryPercent = "FO013";
public const string AppWarningMemoryPercent = "FO014";
public const string AppErrorMemoryMB = "FO015";
public const string AppWarningMemoryMB = "FO016";
public const string NodeErrorMemoryPercent = "FO017";
public const string NodeWarningMemoryPercent = "FO018";
public const string NodeErrorMemoryMB = "FO019";
public const string NodeWarningMemoryMB = "FO020";
// Networking (the ephemeral port percentage codes, FO044-FO047, are numbered out of sequence with the rest of this group)
public const string AppErrorNetworkEndpointUnreachable = "FO021";
public const string AppWarningNetworkEndpointUnreachable = "FO022";
public const string AppErrorTooManyActiveTcpPorts = "FO023";
public const string AppWarningTooManyActiveTcpPorts = "FO024";
public const string NodeErrorTooManyActiveTcpPorts = "FO025";
public const string NodeWarningTooManyActiveTcpPorts = "FO026";
public const string ErrorTooManyFirewallRules = "FO027";
public const string WarningTooManyFirewallRules = "FO028";
public const string AppErrorTooManyActiveEphemeralPorts = "FO029";
public const string AppWarningTooManyActiveEphemeralPorts = "FO030";
public const string NodeErrorTooManyActiveEphemeralPorts = "FO031";
public const string NodeWarningTooManyActiveEphemeralPorts = "FO032";
public const string AppErrorActiveEphemeralPortsPercent = "FO044";
public const string AppWarningActiveEphemeralPortsPercent = "FO045";
public const string NodeErrorActiveEphemeralPortsPercent = "FO046";
public const string NodeWarningActiveEphemeralPortsPercent = "FO047";
// Process owned File Handles / File Descriptors - Linux (File Descriptors) and Windows (File Handles)
public const string AppErrorTooManyOpenFileHandles = "FO033";
public const string AppWarningTooManyOpenFileHandles = "FO034";
// System-wide open File Handles / File Descriptors - Linux only.
public const string NodeErrorTotalOpenFileHandlesPercent = "FO035";
public const string NodeWarningTotalOpenFileHandlesPercent = "FO036";
public const string NodeErrorTooManyOpenFileHandles = "FO037";
public const string NodeWarningTooManyOpenFileHandles = "FO038";
// Process Threads (threads running in process)
public const string AppErrorTooManyThreads = "FO039";
public const string AppWarningTooManyThreads = "FO040";
// Process KVS LVIDs (Percent of maximum available currently in use). Only a Warning code is defined for this metric.
public const string AppWarningKvsLvidsPercentUsed = "FO041";
/// <summary>
/// Maps each application-level code (plus the Ok code) to its descriptive name.
/// </summary>
public static Dictionary<string, string> AppErrorCodesDictionary { get; } = new Dictionary<string, string>
{
    [Ok] = "Ok",

    // CPU
    [AppErrorCpuPercent] = "AppErrorCpuPercent",
    [AppWarningCpuPercent] = "AppWarningCpuPercent",

    // Memory
    [AppErrorMemoryPercent] = "AppErrorMemoryPercent",
    [AppWarningMemoryPercent] = "AppWarningMemoryPercent",
    [AppErrorMemoryMB] = "AppErrorMemoryMB",
    [AppWarningMemoryMB] = "AppWarningMemoryMB",

    // Networking
    [AppErrorNetworkEndpointUnreachable] = "AppErrorNetworkEndpointUnreachable",
    [AppWarningNetworkEndpointUnreachable] = "AppWarningNetworkEndpointUnreachable",
    [AppErrorTooManyActiveTcpPorts] = "AppErrorTooManyActiveTcpPorts",
    [AppWarningTooManyActiveTcpPorts] = "AppWarningTooManyActiveTcpPorts",
    [AppErrorTooManyActiveEphemeralPorts] = "AppErrorTooManyActiveEphemeralPorts",
    [AppWarningTooManyActiveEphemeralPorts] = "AppWarningTooManyActiveEphemeralPorts",
    [AppErrorActiveEphemeralPortsPercent] = "AppErrorActiveEphemeralPortsPercent",
    [AppWarningActiveEphemeralPortsPercent] = "AppWarningActiveEphemeralPortsPercent",

    // File handles
    [AppErrorTooManyOpenFileHandles] = "AppErrorTooManyOpenFileHandles",
    [AppWarningTooManyOpenFileHandles] = "AppWarningTooManyOpenFileHandles",

    // Threads
    [AppErrorTooManyThreads] = "AppErrorTooManyThreads",
    [AppWarningTooManyThreads] = "AppWarningTooManyThreads",

    // KVS LVIDs
    [AppWarningKvsLvidsPercentUsed] = "AppWarningKvsLvidsPercentUsed",
};
/// <summary>
/// Maps each node-level code (plus the Ok, certificate and firewall codes) to its descriptive name.
/// </summary>
public static Dictionary<string, string> NodeErrorCodesDictionary { get; } = new Dictionary<string, string>
{
    [Ok] = "Ok",

    // CPU
    [NodeErrorCpuPercent] = "NodeErrorCpuPercent",
    [NodeWarningCpuPercent] = "NodeWarningCpuPercent",

    // Certificate
    [ErrorCertificateExpiration] = "ErrorCertificateExpiration",
    [WarningCertificateExpiration] = "WarningCertificateExpiration",

    // Disk
    [NodeErrorDiskSpacePercent] = "NodeErrorDiskSpacePercent",
    [NodeErrorDiskSpaceMB] = "NodeErrorDiskSpaceMB",
    [NodeWarningDiskSpacePercent] = "NodeWarningDiskSpacePercent",
    [NodeWarningDiskSpaceMB] = "NodeWarningDiskSpaceMB",
    [NodeErrorDiskAverageQueueLength] = "NodeErrorDiskAverageQueueLength",
    [NodeWarningDiskAverageQueueLength] = "NodeWarningDiskAverageQueueLength",
    [NodeErrorFolderSizeMB] = "NodeErrorFolderSizeMB",
    [NodeWarningFolderSizeMB] = "NodeWarningFolderSizeMB",

    // Memory
    [NodeErrorMemoryPercent] = "NodeErrorMemoryPercent",
    [NodeWarningMemoryPercent] = "NodeWarningMemoryPercent",
    [NodeErrorMemoryMB] = "NodeErrorMemoryMB",
    [NodeWarningMemoryMB] = "NodeWarningMemoryMB",

    // Networking. The firewall names carry a "Node" prefix that the constant identifiers do not have.
    [NodeErrorTooManyActiveTcpPorts] = "NodeErrorTooManyActiveTcpPorts",
    [NodeWarningTooManyActiveTcpPorts] = "NodeWarningTooManyActiveTcpPorts",
    [ErrorTooManyFirewallRules] = "NodeErrorTooManyFirewallRules",
    [WarningTooManyFirewallRules] = "NodeWarningTooManyFirewallRules",
    [NodeErrorTooManyActiveEphemeralPorts] = "NodeErrorTooManyActiveEphemeralPorts",
    [NodeWarningTooManyActiveEphemeralPorts] = "NodeWarningTooManyActiveEphemeralPorts",
    [NodeErrorActiveEphemeralPortsPercent] = "NodeErrorActiveEphemeralPortsPercent",
    [NodeWarningActiveEphemeralPortsPercent] = "NodeWarningActiveEphemeralPortsPercent",

    // File handles
    [NodeErrorTotalOpenFileHandlesPercent] = "NodeErrorTotalOpenFileHandlesPercent",
    [NodeWarningTotalOpenFileHandlesPercent] = "NodeWarningTotalOpenFileHandlesPercent",
    [NodeErrorTooManyOpenFileHandles] = "NodeErrorTooManyOpenFileHandles",
    [NodeWarningTooManyOpenFileHandles] = "NodeWarningTooManyOpenFileHandles",
};
/// <summary>
/// Returns the error or warning name registered for the supplied code.
/// AppErrorCodesDictionary is consulted first, then NodeErrorCodesDictionary.
/// </summary>
/// <param name="id">The error/warning code to use as the lookup key.</param>
/// <returns>
/// The name mapped to the code, or null when the code is null, empty, whitespace,
/// or not present in either dictionary.
/// </returns>
public static string GetErrorWarningNameFromCode(string id)
{
    if (string.IsNullOrWhiteSpace(id))
    {
        return null;
    }

    // Keyed lookups: the previous Any()/First() pair enumerated each dictionary linearly, twice.
    if (AppErrorCodesDictionary.TryGetValue(id, out string appCodeName))
    {
        return appCodeName;
    }

    return NodeErrorCodesDictionary.TryGetValue(id, out string nodeCodeName) ? nodeCodeName : null;
}
/// <summary>
/// Returns the metric name associated with the supplied error/warning code.
/// </summary>
/// <remarks>
/// Each check is a substring match of a RepairConstants token against the name registered
/// for the code (see GetIsResourceType), so a more specific token must be tested before any
/// token it contains: EphemeralPortsPercent before EphemeralPorts, and FileHandlesPercent
/// before FileHandles.
/// </remarks>
/// <param name="code">The error/warning code.</param>
/// <returns>The matching RepairConstants metric name, or null when no metric matches the code.</returns>
public static string GetMetricNameFromCode(string code)
{
    if (GetIsResourceType(code, RepairConstants.ActiveTcpPorts))
    {
        return RepairConstants.ActiveTcpPorts;
    }

    if (GetIsResourceType(code, RepairConstants.Certificate))
    {
        return RepairConstants.Certificate;
    }

    // Matched on the Cpu token, reported as the CpuPercent metric.
    if (GetIsResourceType(code, RepairConstants.Cpu))
    {
        return RepairConstants.CpuPercent;
    }

    if (GetIsResourceType(code, RepairConstants.DiskAverageQueueLength))
    {
        return RepairConstants.DiskAverageQueueLength;
    }

    if (GetIsResourceType(code, RepairConstants.DiskSpaceMB))
    {
        return RepairConstants.DiskSpaceMB;
    }

    if (GetIsResourceType(code, RepairConstants.DiskSpacePercent))
    {
        return RepairConstants.DiskSpacePercent;
    }

    if (GetIsResourceType(code, RepairConstants.FolderSizeMB))
    {
        return RepairConstants.FolderSizeMB;
    }

    if (GetIsResourceType(code, RepairConstants.EndpointUnreachable))
    {
        return RepairConstants.EndpointUnreachable;
    }

    if (GetIsResourceType(code, RepairConstants.EphemeralPortsPercent))
    {
        return RepairConstants.EphemeralPortsPercent;
    }

    if (GetIsResourceType(code, RepairConstants.EphemeralPorts))
    {
        return RepairConstants.EphemeralPorts;
    }

    if (GetIsResourceType(code, RepairConstants.FirewallRules))
    {
        return RepairConstants.FirewallRules;
    }

    if (GetIsResourceType(code, RepairConstants.MemoryMB))
    {
        return RepairConstants.MemoryMB;
    }

    if (GetIsResourceType(code, RepairConstants.MemoryPercent))
    {
        return RepairConstants.MemoryPercent;
    }

    if (GetIsResourceType(code, RepairConstants.Threads))
    {
        return RepairConstants.Threads;
    }

    // NOTE(review): assumes the FileHandles token is a substring of the FileHandlesPercent token,
    // as the constant names suggest. Testing FileHandles first made this branch unreachable for
    // the *OpenFileHandlesPercent codes, which were then reported as FileHandles.
    if (GetIsResourceType(code, RepairConstants.FileHandlesPercent))
    {
        return RepairConstants.FileHandlesPercent;
    }

    if (GetIsResourceType(code, RepairConstants.FileHandles))
    {
        return RepairConstants.FileHandles;
    }

    return null;
}
/// <summary>
/// Determines whether the name registered for a code contains the given resource token.
/// </summary>
/// <param name="id">The error/warning code used as the dictionary key.</param>
/// <param name="resourceType">The token to look for inside the registered name.</param>
/// <returns>
/// True when the code is present in AppErrorCodesDictionary or NodeErrorCodesDictionary and its
/// name contains <paramref name="resourceType"/>; false otherwise, including when
/// <paramref name="id"/> is null, empty, or whitespace.
/// </returns>
private static bool GetIsResourceType(string id, string resourceType)
{
    if (string.IsNullOrWhiteSpace(id))
    {
        return false;
    }

    // Keyed lookup replaces a linear Any() scan over every entry; the outcome is the same.
    if (AppErrorCodesDictionary.TryGetValue(id, out string appCodeName) && appCodeName.Contains(resourceType))
    {
        return true;
    }

    return NodeErrorCodesDictionary.TryGetValue(id, out string nodeCodeName) && nodeCodeName.Contains(resourceType);
}
}
}

Просмотреть файл

@ -0,0 +1,397 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealer.Repair;
using System.Collections.Generic;
using System.Linq;
namespace FabricHealer.Utilities
{
/// <summary>
/// Error and Warning Codes related to machine resource usage metrics at the machine and service levels.
/// FabricHealer understands these codes.
/// </summary>
public sealed class SupportedErrorCodes
{
/// <summary>
/// Ok HealthState
/// </summary>
public const string Ok = "FO000";
/// <summary>
/// FO001 Percentage of total CPU usage has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorCpuPercent = "FO001";
/// <summary>
/// FO002 Percentage of total CPU usage has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningCpuPercent = "FO002";
/// <summary>
/// FO003 Percentage of total CPU usage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorCpuPercent = "FO003";
/// <summary>
/// FO004 Percentage of total CPU usage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningCpuPercent = "FO004";
/// <summary>
/// FO005 Error: Certificate expiration has occurred.
/// </summary>
public const string ErrorCertificateExpiration = "FO005";
/// <summary>
/// FO006 Warning: Certificate expiration is imminent.
/// </summary>
public const string WarningCertificateExpiration = "FO006";
/// <summary>
/// FO007 Disk usage percentage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorDiskSpacePercent = "FO007";
/// <summary>
/// FO008 Disk usage space (MB) has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorDiskSpaceMB = "FO008";
/// <summary>
/// FO009 Disk usage percentage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningDiskSpacePercent = "FO009";
/// <summary>
/// FO010 Disk usage space (MB) has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningDiskSpaceMB = "FO010";
/// <summary>
/// FO011 Average disk queue length has exceeded configured Error threshold.
/// </summary>
public const string NodeErrorDiskAverageQueueLength = "FO011";
/// <summary>
/// FO012 Average disk queue length has exceeded configured Warning threshold.
/// </summary>
public const string NodeWarningDiskAverageQueueLength = "FO012";
/// <summary>
/// FO042 Folder size (MB) has exceeded configured Error threshold
/// </summary>
public const string NodeErrorFolderSizeMB = "FO042";
/// <summary>
/// FO043 Folder size (MB) has exceeded configured Warning threshold
/// </summary>
public const string NodeWarningFolderSizeMB = "FO043";
/// <summary>
/// FO013 Percentage of total physical memory usage has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorMemoryPercent = "FO013";
/// <summary>
/// FO014 Percentage of total physical memory usage has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningMemoryPercent = "FO014";
/// <summary>
/// FO015 Committed memory (MB) has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorMemoryMB = "FO015";
/// <summary>
/// FO016 Committed memory (MB) has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningMemoryMB = "FO016";
/// <summary>
/// FO017 Percentage of total physical memory usage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorMemoryPercent = "FO017";
/// <summary>
/// FO018 Percentage of total physical memory usage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningMemoryPercent = "FO018";
/// <summary>
/// FO019 Total Committed memory (MB) has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorMemoryMB = "FO019";
/// <summary>
/// FO020 Total Committed memory (MB) has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningMemoryMB = "FO020";
/// <summary>
/// FO021 Error: Configured endpoint detected as unreachable.
/// </summary>
public const string AppErrorNetworkEndpointUnreachable = "FO021";
/// <summary>
/// FO022 Warning: Configured endpoint detected as unreachable.
/// </summary>
public const string AppWarningNetworkEndpointUnreachable = "FO022";
/// <summary>
/// FO023 Number of active TCP ports at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyActiveTcpPorts = "FO023";
/// <summary>
/// FO024 Number of active TCP ports at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyActiveTcpPorts = "FO024";
/// <summary>
/// FO025 Number of active TCP ports at or exceeding configured Error threshold on a machine.
/// </summary>
public const string NodeErrorTooManyActiveTcpPorts = "FO025";
/// <summary>
/// FO026 Number of active TCP ports at or exceeding configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningTooManyActiveTcpPorts = "FO026";
/// <summary>
/// FO027 Number of enabled Firewall Rules at or exceeding configured Error threshold on a machine.
/// </summary>
public const string ErrorTooManyFirewallRules = "FO027";
/// <summary>
/// FO028 Number of enabled Firewall Rules at or exceeding configured Warning threshold on a machine.
/// </summary>
public const string WarningTooManyFirewallRules = "FO028";
/// <summary>
/// FO029 Number of active Ephemeral TCP ports (ports in the dynamic range) at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyActiveEphemeralPorts = "FO029";
/// <summary>
/// FO030 Number of active Ephemeral TCP ports (ports in the dynamic range) at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyActiveEphemeralPorts = "FO030";
/// <summary>
/// FO031 Number of active Ephemeral TCP ports (ports in the Windows dynamic port range) at or exceeding configured Error threshold for a machine.
/// </summary>
public const string NodeErrorTooManyActiveEphemeralPorts = "FO031";
/// <summary>
/// FO032 Number of active Ephemeral TCP ports (ports in the Windows dynamic port range) at or exceeding configured Warning threshold for a machine.
/// </summary>
public const string NodeWarningTooManyActiveEphemeralPorts = "FO032";
/// <summary>
/// FO044 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorActiveEphemeralPortsPercent = "FO044";
/// <summary>
/// FO045 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningActiveEphemeralPortsPercent = "FO045";
/// <summary>
/// FO046 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Error threshold for a machine.
/// </summary>
public const string NodeErrorActiveEphemeralPortsPercent = "FO046";
/// <summary>
/// FO047 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Warning threshold for a machine.
/// </summary>
public const string NodeWarningActiveEphemeralPortsPercent = "FO047";
/// <summary>
/// FO033 Number of allocated File Handles is at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyOpenFileHandles = "FO033";
/// <summary>
/// FO034 Number of allocated File Handles is at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyOpenFileHandles = "FO034";
/// <summary>
/// FO035 Percentage of Maximum number of File Descriptors in use is at or exceeding configured Error threshold on a Linux machine.
/// </summary>
public const string NodeErrorTotalOpenFileHandlesPercent = "FO035";
/// <summary>
/// FO036 Percentage of Maximum number of File Descriptors in use is at or exceeding configured Warning threshold on a Linux machine.
/// </summary>
public const string NodeWarningTotalOpenFileHandlesPercent = "FO036";
/// <summary>
/// FO037 Number of allocated File Handles is at or exceeding configured Error threshold on a Linux machine.
/// </summary>
public const string NodeErrorTooManyOpenFileHandles = "FO037";
/// <summary>
/// FO038 Number of allocated File Handles is at or exceeding configured Warning threshold on a Linux machine.
/// </summary>
public const string NodeWarningTooManyOpenFileHandles = "FO038";
/// <summary>
/// FO039 Number of threads at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyThreads = "FO039";
/// <summary>
/// FO040 Number of threads at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyThreads = "FO040";
/// <summary>
/// FO041 Percentage of Maximum number of KVS LVIDs in use is at or exceeding internal Warning threshold (75%) for an app service process.
/// The related threshold is non-configurable and Windows-only.
/// </summary>
public const string AppWarningKvsLvidsPercentUsed = "FO041";
/// <summary>
/// Maps each app service level error/warning code to its name.
/// Also contains the Ok code.
/// </summary>
public static Dictionary<string, string> AppErrorCodesDictionary { get; } = new Dictionary<string, string>
{
    { Ok, nameof(Ok) },
    { AppErrorCpuPercent, nameof(AppErrorCpuPercent) },
    { AppWarningCpuPercent, nameof(AppWarningCpuPercent) },
    { AppErrorMemoryPercent, nameof(AppErrorMemoryPercent) },
    { AppWarningMemoryPercent, nameof(AppWarningMemoryPercent) },
    { AppErrorMemoryMB, nameof(AppErrorMemoryMB) },
    { AppWarningMemoryMB, nameof(AppWarningMemoryMB) },
    { AppErrorNetworkEndpointUnreachable, nameof(AppErrorNetworkEndpointUnreachable) },
    { AppWarningNetworkEndpointUnreachable, nameof(AppWarningNetworkEndpointUnreachable) },
    { AppErrorTooManyActiveTcpPorts, nameof(AppErrorTooManyActiveTcpPorts) },
    { AppWarningTooManyActiveTcpPorts, nameof(AppWarningTooManyActiveTcpPorts) },
    { AppErrorTooManyActiveEphemeralPorts, nameof(AppErrorTooManyActiveEphemeralPorts) },
    { AppWarningTooManyActiveEphemeralPorts, nameof(AppWarningTooManyActiveEphemeralPorts) },
    { AppErrorActiveEphemeralPortsPercent, nameof(AppErrorActiveEphemeralPortsPercent) },
    { AppWarningActiveEphemeralPortsPercent, nameof(AppWarningActiveEphemeralPortsPercent) },
    { AppErrorTooManyOpenFileHandles, nameof(AppErrorTooManyOpenFileHandles) },
    { AppWarningTooManyOpenFileHandles, nameof(AppWarningTooManyOpenFileHandles) },
    { AppErrorTooManyThreads, nameof(AppErrorTooManyThreads) },
    { AppWarningTooManyThreads, nameof(AppWarningTooManyThreads) },
    { AppWarningKvsLvidsPercentUsed, nameof(AppWarningKvsLvidsPercentUsed) }
};
/// <summary>
/// Maps each node (machine) level error/warning code to its name.
/// Also contains the Ok code.
/// </summary>
public static Dictionary<string, string> NodeErrorCodesDictionary { get; } = new Dictionary<string, string>
{
    { Ok, nameof(Ok) },
    { NodeErrorCpuPercent, nameof(NodeErrorCpuPercent) },
    { NodeWarningCpuPercent, nameof(NodeWarningCpuPercent) },
    { ErrorCertificateExpiration, nameof(ErrorCertificateExpiration) },
    { WarningCertificateExpiration, nameof(WarningCertificateExpiration) },
    { NodeErrorDiskSpacePercent, nameof(NodeErrorDiskSpacePercent) },
    { NodeErrorDiskSpaceMB, nameof(NodeErrorDiskSpaceMB) },
    { NodeWarningDiskSpacePercent, nameof(NodeWarningDiskSpacePercent) },
    { NodeWarningDiskSpaceMB, nameof(NodeWarningDiskSpaceMB) },
    { NodeErrorDiskAverageQueueLength, nameof(NodeErrorDiskAverageQueueLength) },
    { NodeWarningDiskAverageQueueLength, nameof(NodeWarningDiskAverageQueueLength) },
    { NodeErrorFolderSizeMB, nameof(NodeErrorFolderSizeMB) },
    { NodeWarningFolderSizeMB, nameof(NodeWarningFolderSizeMB) },
    { NodeErrorMemoryPercent, nameof(NodeErrorMemoryPercent) },
    { NodeWarningMemoryPercent, nameof(NodeWarningMemoryPercent) },
    { NodeErrorMemoryMB, nameof(NodeErrorMemoryMB) },
    { NodeWarningMemoryMB, nameof(NodeWarningMemoryMB) },
    { NodeErrorTooManyActiveTcpPorts, nameof(NodeErrorTooManyActiveTcpPorts) },
    { NodeWarningTooManyActiveTcpPorts, nameof(NodeWarningTooManyActiveTcpPorts) },
    // The firewall names carry a "Node" prefix that the constant identifiers lack, so they stay literals.
    { ErrorTooManyFirewallRules, "NodeErrorTooManyFirewallRules" },
    { WarningTooManyFirewallRules, "NodeWarningTooManyFirewallRules" },
    { NodeErrorTooManyActiveEphemeralPorts, nameof(NodeErrorTooManyActiveEphemeralPorts) },
    { NodeWarningTooManyActiveEphemeralPorts, nameof(NodeWarningTooManyActiveEphemeralPorts) },
    { NodeErrorActiveEphemeralPortsPercent, nameof(NodeErrorActiveEphemeralPortsPercent) },
    { NodeWarningActiveEphemeralPortsPercent, nameof(NodeWarningActiveEphemeralPortsPercent) },
    { NodeErrorTotalOpenFileHandlesPercent, nameof(NodeErrorTotalOpenFileHandlesPercent) },
    { NodeWarningTotalOpenFileHandlesPercent, nameof(NodeWarningTotalOpenFileHandlesPercent) },
    { NodeErrorTooManyOpenFileHandles, nameof(NodeErrorTooManyOpenFileHandles) },
    { NodeWarningTooManyOpenFileHandles, nameof(NodeWarningTooManyOpenFileHandles) }
};
/// <summary>
/// Takes a SupportedErrorCodes code (key) and returns the name of the error or warning (value).
/// AppErrorCodesDictionary is consulted first, then NodeErrorCodesDictionary.
/// </summary>
/// <param name="code">The SupportedErrorCodes code to use as the lookup key.</param>
/// <returns>
/// The name of the error or warning code, or null when the code is null, empty, whitespace,
/// or not present in either dictionary.
/// </returns>
public static string GetCodeNameFromErrorCode(string code)
{
    if (string.IsNullOrWhiteSpace(code))
    {
        return null;
    }

    // Keyed lookups: the previous Any()/First() pair enumerated each dictionary linearly, twice.
    if (AppErrorCodesDictionary.TryGetValue(code, out string appCodeName))
    {
        return appCodeName;
    }

    return NodeErrorCodesDictionary.TryGetValue(code, out string nodeCodeName) ? nodeCodeName : null;
}
/// <summary>
/// Given a SupportedErrorCodes code (key), return the associated Metric Name.
/// </summary>
/// <remarks>
/// Each check is a substring match of a RepairConstants token against the name registered
/// for the code (see GetIsResourceType), so a more specific token must be tested before any
/// token it contains: EphemeralPortsPercent before EphemeralPorts, and FileHandlesPercent
/// before FileHandles.
/// </remarks>
/// <param name="code">The error code.</param>
/// <returns>
/// Name of associated metric that put the entity into Error or Warning state,
/// or null when no metric matches the code.
/// </returns>
public static string GetMetricNameFromErrorCode(string code)
{
    if (GetIsResourceType(code, RepairConstants.ActiveTcpPorts))
    {
        return RepairConstants.ActiveTcpPorts;
    }

    if (GetIsResourceType(code, RepairConstants.Certificate))
    {
        return RepairConstants.Certificate;
    }

    // Matched on the Cpu token, reported as the CpuPercent metric.
    if (GetIsResourceType(code, RepairConstants.Cpu))
    {
        return RepairConstants.CpuPercent;
    }

    if (GetIsResourceType(code, RepairConstants.DiskAverageQueueLength))
    {
        return RepairConstants.DiskAverageQueueLength;
    }

    if (GetIsResourceType(code, RepairConstants.DiskSpaceMB))
    {
        return RepairConstants.DiskSpaceMB;
    }

    if (GetIsResourceType(code, RepairConstants.DiskSpacePercent))
    {
        return RepairConstants.DiskSpacePercent;
    }

    if (GetIsResourceType(code, RepairConstants.FolderSizeMB))
    {
        return RepairConstants.FolderSizeMB;
    }

    if (GetIsResourceType(code, RepairConstants.EndpointUnreachable))
    {
        return RepairConstants.EndpointUnreachable;
    }

    if (GetIsResourceType(code, RepairConstants.EphemeralPortsPercent))
    {
        return RepairConstants.EphemeralPortsPercent;
    }

    if (GetIsResourceType(code, RepairConstants.EphemeralPorts))
    {
        return RepairConstants.EphemeralPorts;
    }

    if (GetIsResourceType(code, RepairConstants.FirewallRules))
    {
        return RepairConstants.FirewallRules;
    }

    if (GetIsResourceType(code, RepairConstants.MemoryMB))
    {
        return RepairConstants.MemoryMB;
    }

    if (GetIsResourceType(code, RepairConstants.MemoryPercent))
    {
        return RepairConstants.MemoryPercent;
    }

    if (GetIsResourceType(code, RepairConstants.Threads))
    {
        return RepairConstants.Threads;
    }

    // NOTE(review): assumes the FileHandles token is a substring of the FileHandlesPercent token,
    // as the constant names suggest. Testing FileHandles first made this branch unreachable for
    // FO035/FO036 (Node*TotalOpenFileHandlesPercent), which were then reported as FileHandles.
    if (GetIsResourceType(code, RepairConstants.FileHandlesPercent))
    {
        return RepairConstants.FileHandlesPercent;
    }

    if (GetIsResourceType(code, RepairConstants.FileHandles))
    {
        return RepairConstants.FileHandles;
    }

    return null;
}
/// <summary>
/// Determines whether the name registered for a code contains the given resource token.
/// </summary>
/// <param name="id">The SupportedErrorCodes code used as the dictionary key.</param>
/// <param name="resourceType">The token to look for inside the registered name.</param>
/// <returns>
/// True when the code is present in AppErrorCodesDictionary or NodeErrorCodesDictionary and its
/// name contains <paramref name="resourceType"/>; false otherwise, including when
/// <paramref name="id"/> is null, empty, or whitespace.
/// </returns>
private static bool GetIsResourceType(string id, string resourceType)
{
    if (string.IsNullOrWhiteSpace(id))
    {
        return false;
    }

    // Keyed lookup replaces a linear Any() scan over every entry; the outcome is the same.
    if (AppErrorCodesDictionary.TryGetValue(id, out string appCodeName) && appCodeName.Contains(resourceType))
    {
        return true;
    }

    return NodeErrorCodesDictionary.TryGetValue(id, out string nodeCodeName) && nodeCodeName.Contains(resourceType);
}
}
}

Просмотреть файл

@ -197,11 +197,11 @@ namespace FabricHealer.Utilities.Telemetry
{ "ClusterId", telemetryData.ClusterId ?? string.Empty },
{ "ErrorCode", telemetryData.Code ?? string.Empty },
{ "Description", telemetryData.Description ?? string.Empty },
{ "HealthState", telemetryData.HealthState ?? string.Empty },
{ "HealthState", Enum.GetName(typeof(HealthState), telemetryData.HealthState) },
{ "Metric", telemetryData.Metric ?? string.Empty },
{ "NodeName", telemetryData.NodeName ?? string.Empty },
{ "ObserverName", telemetryData.ObserverName ?? string.Empty },
{ "Partition", telemetryData.PartitionId ?? string.Empty },
{ "Partition", telemetryData.PartitionId != null ? telemetryData.PartitionId.ToString() : string.Empty },
{ "Replica", telemetryData.ReplicaId.ToString() },
{ "Source", telemetryData.Source ?? string.Empty },
{ "Value", telemetryData.Value.ToString() },

Просмотреть файл

@ -0,0 +1,46 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
namespace FabricHealer.Utilities.Telemetry
{
/// <summary>
/// Service Fabric entity types.
/// Members declare no explicit values, so their numeric values follow declaration order
/// starting at 0 (Application); inserting or reordering members changes those values.
/// </summary>
public enum EntityType
{
/// <summary>
/// Application type.
/// </summary>
Application,
/// <summary>
/// Node type.
/// </summary>
Node,
/// <summary>
/// Service type.
/// </summary>
Service,
/// <summary>
/// StatefulService type.
/// </summary>
StatefulService,
/// <summary>
/// StatelessService type.
/// </summary>
StatelessService,
/// <summary>
/// Partition type.
/// </summary>
Partition,
/// <summary>
/// DeployedApplication type.
/// </summary>
DeployedApplication,
/// <summary>
/// Process type. This is only for direct process restarts of a Service Fabric system service executable.
/// </summary>
Process
}
}

Просмотреть файл

@ -67,19 +67,19 @@ namespace FabricHealer.Utilities.Telemetry
string instanceName = null)
{
string jsonPayload = JsonConvert.SerializeObject(
new
{
id = $"FH_{Guid.NewGuid()}",
datetime = DateTime.UtcNow,
source = "FabricHealer",
property = propertyName,
healthScope = scope.ToString(),
healthState = state.ToString(),
healthEvaluation = unhealthyEvaluations,
osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
serviceName = serviceName ?? string.Empty,
instanceName = instanceName ?? string.Empty,
});
new
{
id = $"FH_{Guid.NewGuid()}",
datetime = DateTime.UtcNow,
source = "FabricHealer",
property = propertyName,
healthScope = scope.ToString(),
healthState = state.ToString(),
healthEvaluation = unhealthyEvaluations,
osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
serviceName = serviceName ?? string.Empty,
instanceName = instanceName ?? string.Empty,
});
await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(false);
}
@ -106,15 +106,15 @@ namespace FabricHealer.Utilities.Telemetry
CancellationToken cancellationToken)
{
string jsonPayload = JsonConvert.SerializeObject(
new
{
id = $"FH_{Guid.NewGuid()}",
datetime = DateTime.UtcNow,
source,
osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
property = name,
value,
});
new
{
id = $"FH_{Guid.NewGuid()}",
datetime = DateTime.UtcNow,
source,
osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
property = name,
value,
});
await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(false);

Просмотреть файл

@ -5,19 +5,21 @@
using Newtonsoft.Json;
using System.Runtime.InteropServices;
using FabricHealer.TelemetryLib;
using FabricHealer.Interfaces;
using System.Fabric.Health;
using System;
namespace FabricHealer.Utilities.Telemetry
{
public class TelemetryData
public class TelemetryData : ITelemetryData
{
private readonly string _os;
public string ApplicationName
{
get; set;
}
public string ClusterId => ClusterInformation.ClusterInfoTuple.ClusterId;
public string Code
{
get; set;
@ -28,16 +30,26 @@ namespace FabricHealer.Utilities.Telemetry
get; set;
}
public string ClusterId
{
get; set;
}
public string Description
{
get; set;
}
public string HealthState
public EntityType EntityType
{
get; set;
}
public HealthState HealthState
{
get; set;
}
public string Metric
{
get; set;
@ -48,22 +60,25 @@ namespace FabricHealer.Utilities.Telemetry
get; set;
}
/// <summary>
/// The name of the FabricObserver observer that generated the health information.
/// </summary>
public string ObserverName
{
get; set;
}
public string OS
{
get;
} = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux";
public string PartitionId
get { return _os; }
}
public Guid PartitionId
{
get; set;
}
public int ProcessId
public long ProcessId
{
get; set;
}
@ -97,13 +112,15 @@ namespace FabricHealer.Utilities.Telemetry
{
get; set;
}
/// <summary>
/// The Repair Id.
/// </summary>
public string RepairId
{
get; set;
}
public string HealthEventProperty
public string Property
{
get; set;
}
@ -111,7 +128,7 @@ namespace FabricHealer.Utilities.Telemetry
[JsonConstructor]
public TelemetryData()
{
_os = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux";
}
}
}

Просмотреть файл

@ -5,6 +5,7 @@
using FabricHealer.Interfaces;
using FabricHealer.Repair;
using FabricHealer.TelemetryLib;
using System;
using System.Fabric;
using System.Fabric.Health;
@ -117,11 +118,12 @@ namespace FabricHealer.Utilities.Telemetry
var telemData = new TelemetryData()
{
ApplicationName = repairConfig?.AppName?.OriginalString ?? string.Empty,
ClusterId = ClusterInformation.ClusterInfoTuple.ClusterId,
Description = description,
HealthState = Enum.GetName(typeof(HealthState), healthState),
HealthState = healthState,
Metric = repairAction,
NodeName = repairConfig?.NodeName ?? string.Empty,
PartitionId = repairConfig?.PartitionId.ToString() ?? string.Empty,
PartitionId = repairConfig?.PartitionId != null ? repairConfig.PartitionId : default,
ReplicaId = repairConfig != null ? repairConfig.ReplicaOrInstanceId : 0,
ServiceName = repairConfig?.ServiceName?.OriginalString ?? string.Empty,
Source = source,
@ -135,22 +137,23 @@ namespace FabricHealer.Utilities.Telemetry
if (FabricHealerManager.ConfigSettings.EtwEnabled)
{
ServiceEventSource.Current.Write(
RepairConstants.EventSourceEventName,
new
{
ApplicationName = repairConfig?.AppName?.OriginalString ?? string.Empty,
Description = description,
HealthState = Enum.GetName(typeof(HealthState), healthState),
Metric = repairAction,
PartitionId = repairConfig?.PartitionId.ToString() ?? string.Empty,
ReplicaId = repairConfig?.ReplicaOrInstanceId.ToString() ?? string.Empty,
Level = level,
NodeName = repairConfig?.NodeName ?? string.Empty,
OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
ServiceName = repairConfig?.ServiceName?.OriginalString ?? string.Empty,
Source = source,
SystemServiceProcessName = repairConfig?.SystemServiceProcessName ?? string.Empty,
});
RepairConstants.EventSourceEventName,
new
{
ApplicationName = repairConfig?.AppName?.OriginalString ?? string.Empty,
ClusterInformation.ClusterInfoTuple.ClusterId,
Description = description,
HealthState = Enum.GetName(typeof(HealthState), healthState),
Metric = repairAction,
PartitionId = repairConfig?.PartitionId.ToString() ?? string.Empty,
ReplicaId = repairConfig?.ReplicaOrInstanceId.ToString() ?? string.Empty,
Level = level,
NodeName = repairConfig?.NodeName ?? string.Empty,
OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
ServiceName = repairConfig?.ServiceName?.OriginalString ?? string.Empty,
Source = source,
SystemServiceProcessName = repairConfig?.SystemServiceProcessName ?? string.Empty,
});
}
}
}

Просмотреть файл

@ -1,7 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.0.15" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<ApplicationManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ApplicationTypeName="FabricHealerType" ApplicationTypeVersion="1.1.0" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<Parameter Name="FabricHealer_InstanceCount" DefaultValue="-1" />
<!-- FabricHealerManager Settings -->
<Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
<Parameter Name="EventSourceProviderEnabled" DefaultValue="true" />
@ -16,7 +15,7 @@
<!-- Repair Policy Enablement -->
<Parameter Name="EnableAppRepair" DefaultValue="true" />
<Parameter Name="EnableDiskRepair" DefaultValue="false" />
<Parameter Name="EnableNodeRepair" DefaultValue="false" />
<Parameter Name="EnableFabricNodeRepair" DefaultValue="true" />
<Parameter Name="EnableReplicaRepair" DefaultValue="false" />
<Parameter Name="EnableSystemAppRepair" DefaultValue="false" />
<Parameter Name="EnableVMRepair" DefaultValue="false" />
@ -25,7 +24,7 @@
should match the Name and Version attributes of the ServiceManifest element defined in the
ServiceManifest.xml file. -->
<ServiceManifestImport>
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.0.15" />
<ServiceManifestRef ServiceManifestName="FabricHealerPkg" ServiceManifestVersion="1.1.0" />
<ConfigOverrides>
<ConfigOverride Name="Config">
<Settings>
@ -47,7 +46,7 @@
<Parameter Name="Enabled" Value="[EnableDiskRepair]" />
</Section>
<Section Name="FabricNodeRepairPolicy">
<Parameter Name="Enabled" Value="[EnableNodeRepair]" />
<Parameter Name="Enabled" Value="[EnableFabricNodeRepair]" />
</Section>
<Section Name="ReplicaRepairPolicy">
<Parameter Name="Enabled" Value="[EnableReplicaRepair]" />
@ -65,18 +64,6 @@
<RunAsPolicy CodePackageRef="Code" UserRef="SystemUser" />
</Policies>
</ServiceManifestImport>
<DefaultServices>
<!-- The section below creates instances of service types, when an instance of this
application type is created. You can also create one or more instances of service type using the
ServiceFabric PowerShell module.
The attribute ServiceTypeName below must match the name defined in the imported ServiceManifest.xml file. -->
<Service Name="FabricHealer" ServicePackageActivationMode="ExclusiveProcess">
<StatelessService ServiceTypeName="FabricHealerType" InstanceCount="[FabricHealer_InstanceCount]">
<SingletonPartition />
</StatelessService>
</Service>
</DefaultServices>
<!-- Because of the actions FabricHealer takes in a cluster, it must run as Admin user on Windows and root on Linux.
LocalSystem AccountType maps to System on Windows and root on Linux. -->
<Principals>

Просмотреть файл

@ -1,6 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<Application xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Name="fabric:/FabricHealer" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<Parameter Name="FabricHealer_InstanceCount" Value="-1" />
</Parameters>
<Parameters />
</Application>

Просмотреть файл

@ -26,6 +26,9 @@
<None Include="PublishProfiles\Local.1Node.xml" />
<None Include="PublishProfiles\Local.5Node.xml" />
<None Include="Scripts\Deploy-FabricApplication.ps1" />
<None Include="StartupServiceParameters\Cloud.xml" />
<None Include="StartupServiceParameters\Local.1Node.xml" />
<None Include="StartupServiceParameters\Local.5Node.xml" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\FabricHealer\FabricHealer.csproj" />
@ -33,6 +36,7 @@
<ItemGroup>
<Content Include="packages.config" />
<Content Include="PublishProfiles\Cloud.xml" />
<Content Include="StartupServices.xml" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" />
<PropertyGroup>

Просмотреть файл

@ -1,11 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<PublishProfile xmlns="http://schemas.microsoft.com/2015/05/fabrictools">
<ClusterConnectionParameters ConnectionEndpoint=""
X509Credential="true"
ServerCertThumbprint=""
FindType="FindByThumbprint"
FindValue=""
StoreLocation="LocalMachine"
StoreName="My" />
<ApplicationParameterFile Path="..\ApplicationParameters\Cloud.xml" />
<ClusterConnectionParameters />
<ApplicationParameterFile Path="..\ApplicationParameters\Cloud.xml" />
<StartupServiceParameterFile Path="..\StartupServiceParameters\Cloud.xml" />
<CopyPackageParameters CompressPackage="true" />
</PublishProfile>

Просмотреть файл

@ -1,11 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<PublishProfile xmlns="http://schemas.microsoft.com/2015/05/fabrictools">
<!-- ClusterConnectionParameters allows you to specify the PowerShell parameters to use when connecting to the Service Fabric cluster.
<!-- ClusterConnectionParameters allows you to specify the PowerShell parameters to use when connecting to the Service Fabric cluster.
Valid parameters are any that are accepted by the Connect-ServiceFabricCluster cmdlet.
For a local cluster, you would typically not use any parameters.
For example: <ClusterConnectionParameters />
-->
<ClusterConnectionParameters />
<ApplicationParameterFile Path="..\ApplicationParameters\Local.1Node.xml" />
<ClusterConnectionParameters />
<ApplicationParameterFile Path="..\ApplicationParameters\Local.1Node.xml" />
<StartupServiceParameterFile Path="..\StartupServiceParameters\Local.1Node.xml" />
</PublishProfile>

Просмотреть файл

@ -1,11 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<PublishProfile xmlns="http://schemas.microsoft.com/2015/05/fabrictools">
<!-- ClusterConnectionParameters allows you to specify the PowerShell parameters to use when connecting to the Service Fabric cluster.
<!-- ClusterConnectionParameters allows you to specify the PowerShell parameters to use when connecting to the Service Fabric cluster.
Valid parameters are any that are accepted by the Connect-ServiceFabricCluster cmdlet.
For a local cluster, you would typically not use any parameters.
For example: <ClusterConnectionParameters />
-->
<ClusterConnectionParameters />
<ApplicationParameterFile Path="..\ApplicationParameters\Local.5Node.xml" />
<ClusterConnectionParameters />
<ApplicationParameterFile Path="..\ApplicationParameters\Local.5Node.xml" />
<StartupServiceParameterFile Path="..\StartupServiceParameters\Local.5Node.xml" />
</PublishProfile>

Просмотреть файл

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<StartupServices xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<Parameter Name="FabricHealer_InstanceCount" Value="-1" />
</Parameters>
</StartupServices>

Просмотреть файл

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<StartupServices xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<Parameter Name="FabricHealer_InstanceCount" Value="-1" />
</Parameters>
</StartupServices>

Просмотреть файл

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<StartupServices xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<Parameter Name="FabricHealer_InstanceCount" Value="-1" />
</Parameters>
</StartupServices>

Просмотреть файл

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Defines the FabricHealer stateless service (service type FabricHealerType, singleton partition) and the parameter that controls its instance count. -->
<StartupServicesManifest xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
<Parameters>
<!-- Default used when a StartupServiceParameters file does not supply a Value for FabricHealer_InstanceCount.
     In Service Fabric, an InstanceCount of -1 places one instance of a stateless service on every node. -->
<Parameter Name="FabricHealer_InstanceCount" DefaultValue="-1" />
</Parameters>
<Services>
<!-- ExclusiveProcess: each activation of the service package gets its own host process. -->
<Service Name="FabricHealer" ServicePackageActivationMode="ExclusiveProcess">
<!-- [FabricHealer_InstanceCount] is replaced with the parameter value declared above. -->
<StatelessService ServiceTypeName="FabricHealerType" InstanceCount="[FabricHealer_InstanceCount]">
<SingletonPartition />
</StatelessService>
</Service>
</Services>
</StartupServicesManifest>

Просмотреть файл

@ -0,0 +1,35 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- NuGet package specification template for FabricHealerLib.
     Build-FHLibNupkg.ps1 reads this file, replaces the %PACKAGE_ID% and %ROOT_PATH% placeholders, writes the result
     next to the published binaries and then runs "nuget.exe pack" against it. Do not replace the placeholders by hand. -->
<package xmlns="http://schemas.microsoft.com/packaging/2013/05/nuspec.xsd">
<metadata minClientVersion="3.3.0">
<!-- Substituted with the package id passed to Build-Nuget in Build-FHLibNupkg.ps1. -->
<id>%PACKAGE_ID%</id>
<version>1.0.0</version>
<releaseNotes>
</releaseNotes>
<authors>Microsoft</authors>
<license type="expression">MIT</license>
<requireLicenseAcceptance>true</requireLicenseAcceptance>
<title>FabricHealerLib: Utility library for communicating with FabricHealer service.</title>
<icon>icon.png</icon>
<readme>FabricHealerLibnuget.md</readme>
<language>en-US</language>
<description>This package contains FabricHealerLib, a utility library that provides a very simple, structured way to communicate Service Fabric entity information to FabricHealer via Service Fabric health reporting. Simply put, you can execute FabricHealer auto-mitigation workflows with just a few lines of C# code from any Service Fabric service running in the same cluster as FabricHealer.</description>
<contentFiles>
<files include="**" buildAction="None" copyToOutput="true" />
</contentFiles>
<!-- These versions mirror the PackageReference versions in FabricHealerLib.csproj; keep the two in sync. -->
<dependencies>
<group targetFramework=".NETStandard2.0">
<dependency id="Newtonsoft.Json" version="13.0.1" />
<dependency id="Microsoft.ServiceFabric.Services" version="5.0.516" />
</group>
</dependencies>
<projectUrl>https://github.com/microsoft/FabricHealerLib</projectUrl>
<tags>FabricHealerLib service-fabric netstandard20 netcore csharp</tags>
<copyright>© Microsoft Corporation. All rights reserved.</copyright>
</metadata>
<files>
<!-- Relative src paths are resolved against the -basepath argument that Build-FHLibNupkg.ps1 passes to nuget.exe (the netstandard2.0 publish folder). -->
<file src="FabricHealerLib.dll" target="lib\netstandard2.0" />
<file src="FabricHealerLib.xml" target="lib\netstandard2.0" />
<!-- %ROOT_PATH% is substituted with the directory that contains Build-FHLibNupkg.ps1. -->
<file src="%ROOT_PATH%\icon.png" target="" />
<file src="%ROOT_PATH%\FabricHealerLibnuget.md" target="" />
</files>
</package>

Просмотреть файл

@ -0,0 +1,46 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
namespace FabricHealerLib
{
/// <summary>
/// The kinds of Service Fabric entity that a repair request can target.
/// The numeric values are spelled out because <see cref="Application"/> (0) is the value an unset EntityType field holds.
/// </summary>
public enum EntityType
{
    /// <summary>
    /// A Service Fabric application. This is the default (zero) value.
    /// </summary>
    Application = 0,

    /// <summary>
    /// A Service Fabric node.
    /// </summary>
    Node = 1,

    /// <summary>
    /// A Service Fabric service.
    /// </summary>
    Service = 2,

    /// <summary>
    /// A stateful Service Fabric service.
    /// </summary>
    StatefulService = 3,

    /// <summary>
    /// A stateless Service Fabric service.
    /// </summary>
    StatelessService = 4,

    /// <summary>
    /// A service partition. NOTE: Partition repair is not currently supported.
    /// </summary>
    Partition = 5,

    /// <summary>
    /// An application as deployed on a specific node.
    /// </summary>
    DeployedApplication = 6,

    /// <summary>
    /// A process. This is only for direct process restarts of a Service Fabric system service executable.
    /// </summary>
    Process = 7
}
}

Просмотреть файл

@ -0,0 +1,11 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<Platforms>x64</Platforms>
<!-- Emits FabricHealerLib.xml next to the assembly; FabricHealerLib.nuspec.template packs that file into the nupkg. -->
<GenerateDocumentationFile>True</GenerateDocumentationFile>
</PropertyGroup>
<ItemGroup>
<!-- FabricHealerLib.nuspec.template lists these same packages and versions as nupkg dependencies; keep the two in sync. -->
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="5.0.516" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,346 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System;
using System.Fabric;
using System.Fabric.Health;
using Newtonsoft.Json;
using System.Threading.Tasks;
using System.Threading;
using System.Linq;
using System.Fabric.Query;
namespace FabricHealerLib
{
/// <summary>
/// FabricHealer utility library that provides a very simple, structured way to share Service Fabric entity repair information with FabricHealer
/// via Service Fabric entity health reporting.
/// </summary>
public class FabricHealerProxy : IDisposable
{
    // Number of times a failed Fabric client operation is retried before the failure is surfaced to the caller.
    private const int MaxRetries = 3;

    // Scheme prefix that every Service Fabric application/service name starts with.
    private const string FabricScheme = "fabric:/";

    // Pause between two attempts.
    private static readonly TimeSpan RetryDelay = TimeSpan.FromSeconds(1);

    // TTL of the generated health report when the caller does not supply repairDataLifetime.
    private static readonly TimeSpan DefaultRepairDataLifetime = TimeSpan.FromMinutes(20);

    private readonly FabricClient _fabricClient;
    private bool _disposedValue;

    /// <summary>
    /// Creates an instance of FabricHealerProxy.
    /// </summary>
    public FabricHealerProxy()
    {
        var settings = new FabricClientSettings
        {
            HealthOperationTimeout = TimeSpan.FromSeconds(30),
            HealthReportSendInterval = TimeSpan.FromSeconds(1),
            HealthReportRetrySendInterval = TimeSpan.FromSeconds(3)
        };

        _fabricClient = new FabricClient(settings);
    }

    /// <summary>
    /// This function generates a specially-crafted Service Fabric Health Report that the FabricHealer service will understand and act upon given the facts supplied
    /// in the RepairData instance. Nothing is reported when the token is already cancelled or when RepairData.HealthState is Ok.
    /// Failed Fabric client operations (FabricException, TimeoutException) are retried up to 3 times before the failure is surfaced.
    /// </summary>
    /// <param name="repairData">A RepairData instance. This is a well-known (ITelemetryData) data type that contains facts that FabricHealer will use
    /// in the execution of its entity repair logic rules and related mitigation functions. Missing facts that can be derived (entity type, application name,
    /// partition/replica ids, source, description, property) are filled in on this instance.</param>
    /// <param name="cancellationToken">CancellationToken used to ensure this function stops processing when the token is cancelled.</param>
    /// <param name="repairDataLifetime">The amount of time for the repair data to remain active (TTL of associated health report). Default is 20 mins.</param>
    /// <exception cref="ArgumentNullException">Thrown when RepairData instance is null.</exception>
    /// <exception cref="FabricException">Thrown when an internal Service Fabric operation fails.</exception>
    /// <exception cref="FabricNodeNotFoundException">Thrown when specified RepairData.NodeName does not exist in the cluster.</exception>
    /// <exception cref="FabricServiceNotFoundException">Thrown when specified service doesn't exist in the cluster or has no ready replica on the specified node.</exception>
    /// <exception cref="MissingFieldException">Thrown when required RepairData fields are missing in the supplied instance.</exception>
    /// <exception cref="UriFormatException">Thrown when required ApplicationName or ServiceName value is a malformed Uri string.</exception>
    /// <exception cref="TimeoutException">Thrown when internal Fabric client API calls timeout.</exception>
    public async Task RepairEntityAsync(RepairData repairData, CancellationToken cancellationToken, TimeSpan repairDataLifetime = default)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return;
        }

        // The null check must come before any member access on repairData.
        if (repairData == null)
        {
            throw new ArgumentNullException(nameof(repairData), "Supplied null for repairData argument. You must supply an instance of RepairData.");
        }

        if (repairData.HealthState == HealthState.Ok)
        {
            return;
        }

        if (string.IsNullOrEmpty(repairData.NodeName))
        {
            throw new MissingFieldException("RepairData.NodeName is a required field.");
        }

        await InferEntityTypeAsync(repairData, cancellationToken).ConfigureAwait(false);

        // The attempt counter is local so that calls never influence each other's retry budget.
        for (int attempt = 0; ; attempt++)
        {
            try
            {
                await SendRepairDataAsync(repairData, cancellationToken, repairDataLifetime).ConfigureAwait(false);
                return;
            }
            catch (OperationCanceledException)
            {
                // Covers TaskCanceledException too. Cancellation is not an error for callers.
                return;
            }
            catch (FabricServiceNotFoundException e)
            {
                throw new FabricServiceNotFoundException($"Specified ServiceName {repairData.ServiceName} does not exist in the cluster.", e);
            }
            catch (Exception e) when ((e is FabricException || e is TimeoutException) && attempt < MaxRetries)
            {
                // Transient failure with retries left: fall through to the delay below and try again.
                // Once the budget is used up the filter no longer matches and the exception reaches the caller.
            }

            try
            {
                await Task.Delay(RetryDelay, cancellationToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                return;
            }
        }
    }

    // Derives the entity type (Service or Node) when neither an application name nor an explicit entity type was supplied,
    // and looks up the node type for node repairs.
    private async Task InferEntityTypeAsync(RepairData repairData, CancellationToken cancellationToken)
    {
        // Application (0) is the EntityType enum default, so it is treated as "not specified" here.
        if (!string.IsNullOrWhiteSpace(repairData.ApplicationName) || repairData.EntityType != EntityType.Application)
        {
            return;
        }

        if (!string.IsNullOrWhiteSpace(repairData.ServiceName))
        {
            repairData.EntityType = EntityType.Service;
            return;
        }

        // Only NodeName was supplied.
        repairData.EntityType = EntityType.Node;

        if (!string.IsNullOrWhiteSpace(repairData.NodeType))
        {
            return;
        }

        NodeList nodes =
            await _fabricClient.QueryManager.GetNodeListAsync(repairData.NodeName, TimeSpan.FromSeconds(30), cancellationToken).ConfigureAwait(false);

        if (nodes == null || nodes.Count == 0)
        {
            throw new FabricNodeNotFoundException($"NodeName {repairData.NodeName} does not exist in this cluster.");
        }

        repairData.NodeType = nodes[0].NodeType;
    }

    // Performs one attempt: completes missing facts on repairData, then sends the health report for the target entity.
    private async Task SendRepairDataAsync(RepairData repairData, CancellationToken cancellationToken, TimeSpan repairDataLifetime)
    {
        if (string.IsNullOrWhiteSpace(repairData.Source))
        {
            CodePackageActivationContext context =
                await FabricRuntime.GetActivationContextAsync(TimeSpan.FromSeconds(30), cancellationToken).ConfigureAwait(false);

            repairData.Source = context.GetServiceManifestName();
        }

        switch (repairData.EntityType)
        {
            // Support for repair data that does not contain replica/partition facts for service level repair.
            case EntityType.Application:
            case EntityType.DeployedApplication:
            case EntityType.Service:
            case EntityType.StatefulService:
            case EntityType.StatelessService:

                if (repairData.PartitionId == Guid.Empty || repairData.ReplicaId == 0)
                {
                    bool canRepair = await TryCompleteAppOrServiceFactsAsync(repairData, cancellationToken).ConfigureAwait(false);

                    if (!canRepair)
                    {
                        return;
                    }
                }

                break;

            case EntityType.Node:

                if (string.IsNullOrWhiteSpace(repairData.Description))
                {
                    repairData.Description = $"{repairData.NodeName} of type {repairData.NodeType} has been designated as Unhealthy by {repairData.Source}.";
                }

                if (string.IsNullOrWhiteSpace(repairData.Property))
                {
                    repairData.Property = $"{repairData.NodeName}_{repairData.Metric ?? repairData.NodeType}_FHRepair";
                }

                break;
        }

        var healthInformation = new HealthInformation(repairData.Source, repairData.Property, repairData.HealthState)
        {
            Description = JsonConvert.SerializeObject(repairData),
            TimeToLive = repairDataLifetime == default ? DefaultRepairDataLifetime : repairDataLifetime,
            RemoveWhenExpired = true
        };

        HealthReport healthReport = CreateHealthReport(repairData, healthInformation);

        // No report is produced when the facts required for the entity type are missing (or the type has no report kind).
        if (healthReport != null)
        {
            _fabricClient.HealthManager.ReportHealth(healthReport, new HealthReportSendOptions { Immediate = true });
        }
    }

    // Fills in application name, partition/replica/process ids, description and property for application and service targets.
    // Returns false when the located replica is a stateful replica whose role is neither Primary nor ActiveSecondary,
    // in which case nothing is reported.
    private async Task<bool> TryCompleteAppOrServiceFactsAsync(RepairData repairData, CancellationToken cancellationToken)
    {
        Uri appName;

        if (string.IsNullOrWhiteSpace(repairData.ApplicationName))
        {
            if (!TryValidateAndFixFabricUriString(repairData.ServiceName, out Uri serviceName))
            {
                throw new UriFormatException($"Specified ServiceName, {repairData.ServiceName}, is invalid.");
            }

            ApplicationNameResult appNameResult =
                await _fabricClient.QueryManager.GetApplicationNameAsync(serviceName, TimeSpan.FromSeconds(60), cancellationToken).ConfigureAwait(false);

            appName = appNameResult.ApplicationName;
            repairData.ApplicationName = appName.OriginalString;
        }
        else if (!TryValidateAndFixFabricUriString(repairData.ApplicationName, out appName))
        {
            throw new UriFormatException($"Specified ApplicationName, {repairData.ApplicationName}, is invalid.");
        }

        bool hasServiceName = !string.IsNullOrWhiteSpace(repairData.ServiceName);

        if (!hasServiceName
            && repairData.EntityType != EntityType.Application
            && repairData.EntityType != EntityType.DeployedApplication)
        {
            throw new MissingFieldException("RepairData.ServiceName is a required field for service repairs.");
        }

        if (hasServiceName)
        {
            var depReplicas =
                await _fabricClient.QueryManager.GetDeployedReplicaListAsync(
                        repairData.NodeName, appName, null, null, TimeSpan.FromSeconds(60), cancellationToken).ConfigureAwait(false);

            var depReplica =
                depReplicas.FirstOrDefault(
                    r => r.ReplicaStatus == ServiceReplicaStatus.Ready
                         && string.Equals(r.ServiceName.OriginalString, repairData.ServiceName, StringComparison.OrdinalIgnoreCase));

            if (depReplica == null)
            {
                throw new FabricServiceNotFoundException(
                    $"No ready replica or instance of service {repairData.ServiceName} was found on node {repairData.NodeName}.");
            }

            long replicaId;

            switch (depReplica)
            {
                case DeployedStatefulServiceReplica stateful
                    when stateful.ReplicaRole == ReplicaRole.Primary || stateful.ReplicaRole == ReplicaRole.ActiveSecondary:
                    replicaId = stateful.ReplicaId;
                    break;

                case DeployedStatelessServiceInstance stateless:
                    replicaId = stateless.InstanceId;
                    break;

                default:
                    return false;
            }

            repairData.PartitionId = depReplica.Partitionid;
            repairData.ProcessId = depReplica.HostProcessId;
            repairData.ReplicaId = replicaId;
        }

        if (string.IsNullOrWhiteSpace(repairData.Description))
        {
            string target = hasServiceName ? repairData.ServiceName : repairData.ApplicationName;
            repairData.Description = $"{target} has been designated as Unhealthy by {repairData.Source}.";
        }

        if (string.IsNullOrWhiteSpace(repairData.Property))
        {
            // Service name without its application prefix, or the application name without the fabric:/ scheme.
            string shortName = hasServiceName
                ? RemovePrefix(repairData.ServiceName, repairData.ApplicationName + "/")
                : RemovePrefix(repairData.ApplicationName, FabricScheme);

            repairData.Property = $"{repairData.NodeName}_{shortName}_{repairData.Metric ?? "FHRepair"}";
        }

        return true;
    }

    // Builds the health report matching repairData.EntityType, or returns null when the facts that report kind needs are missing.
    private static HealthReport CreateHealthReport(RepairData repairData, HealthInformation healthInformation)
    {
        switch (repairData.EntityType)
        {
            case EntityType.Application when repairData.ApplicationName != null:
                return new ApplicationHealthReport(new Uri(repairData.ApplicationName), healthInformation);

            case EntityType.Service when repairData.ServiceName != null:
                return new ServiceHealthReport(new Uri(repairData.ServiceName), healthInformation);

            case EntityType.StatefulService when repairData.PartitionId != Guid.Empty && repairData.ReplicaId > 0:
                return new StatefulServiceReplicaHealthReport(repairData.PartitionId, repairData.ReplicaId, healthInformation);

            case EntityType.StatelessService when repairData.PartitionId != Guid.Empty && repairData.ReplicaId > 0:
                return new StatelessServiceInstanceHealthReport(repairData.PartitionId, repairData.ReplicaId, healthInformation);

            case EntityType.Partition when repairData.PartitionId != Guid.Empty:
                return new PartitionHealthReport(repairData.PartitionId, healthInformation);

            case EntityType.DeployedApplication when repairData.ApplicationName != null:
                return new DeployedApplicationHealthReport(new Uri(repairData.ApplicationName), repairData.NodeName, healthInformation);

            case EntityType.Node:
                return new NodeHealthReport(repairData.NodeName, healthInformation);

            default:
                return null;
        }
    }

    // Returns value without the leading prefix; value is returned unchanged when it does not start with prefix.
    private static string RemovePrefix(string value, string prefix)
    {
        bool hasPrefix = value.Length > prefix.Length && value.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
        return hasPrefix ? value.Substring(prefix.Length) : value;
    }

    // Tries to turn an application/service name into a valid fabric:/ Uri, repairing common mistakes
    // (embedded spaces, "://" instead of ":/", missing scheme). Returns false for null/blank or unrepairable input.
    private static bool TryValidateAndFixFabricUriString(string uriString, out Uri fixedUri)
    {
        fixedUri = null;

        if (string.IsNullOrWhiteSpace(uriString))
        {
            return false;
        }

        // Strip spaces first so that a leading space cannot hide an existing fabric:/ prefix.
        string candidate = uriString.Replace(" ", string.Empty).Replace("://", ":/");

        if (!candidate.StartsWith(FabricScheme, StringComparison.Ordinal))
        {
            candidate = FabricScheme + candidate.TrimStart('/');
        }

        return Uri.IsWellFormedUriString(candidate, UriKind.RelativeOrAbsolute)
               && Uri.TryCreate(candidate, UriKind.Absolute, out fixedUri);
    }

    private void Dispose(bool disposing)
    {
        if (_disposedValue)
        {
            return;
        }

        if (disposing)
        {
            _fabricClient?.Dispose();
        }

        _disposedValue = true;
    }

    /// <summary>
    /// Disposes the FabricClient instance owned by this proxy.
    /// </summary>
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }
}
}

Просмотреть файл

@ -0,0 +1,21 @@
using System;
namespace FabricHealerLib
{
/// <summary>
/// Exception thrown when specified node does not exist in the cluster.
/// </summary>
[Serializable]
public class FabricNodeNotFoundException : Exception
{
    /// <summary>
    /// Creates an instance of FabricNodeNotFoundException with the default message.
    /// </summary>
    public FabricNodeNotFoundException()
    {
    }

    /// <summary>
    /// Creates an instance of FabricNodeNotFoundException.
    /// </summary>
    /// <param name="message">Message that describes the error, for example which node name could not be found.</param>
    public FabricNodeNotFoundException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates an instance of FabricNodeNotFoundException that wraps the exception that caused it.
    /// </summary>
    /// <param name="message">Message that describes the error.</param>
    /// <param name="innerException">The exception that caused this exception.</param>
    public FabricNodeNotFoundException(string message, Exception innerException)
        : base(message, innerException)
    {
    }

    /// <summary>
    /// Creates an instance of FabricNodeNotFoundException from serialized data. Required for a [Serializable] exception to be deserializable.
    /// </summary>
    /// <param name="info">The serialized object data.</param>
    /// <param name="context">The source or destination context of the serialization.</param>
    protected FabricNodeNotFoundException(System.Runtime.Serialization.SerializationInfo info, System.Runtime.Serialization.StreamingContext context)
        : base(info, context)
    {
    }
}
}

Просмотреть файл

@ -0,0 +1,89 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System;
using System.Fabric.Health;
namespace FabricHealerLib.Interfaces
{
/// <summary>
/// Contract for the facts that describe a repair target. RepairData implements this interface, and
/// FabricHealerProxy.RepairEntityAsync serializes the instance into the Description of a Service Fabric health report.
/// </summary>
public interface ITelemetryData
{
/// <summary>
/// Required if the repair target is a Service. Service Fabric ApplicationName as a string value (OriginalString on a Uri instance, for example).
/// When this is empty and ServiceName is supplied, FabricHealerProxy.RepairEntityAsync queries the cluster for the owning application name.
/// </summary>
string ApplicationName { get; set; }
/// <summary>
/// The supported error code (see SupportedErrorCodes).
/// </summary>
string Code { get; set; }
/// <summary>
/// Required if the repair target is a container. The id of the container.
/// </summary>
string ContainerId { get; set; }
/// <summary>
/// Optional. The description of the problem being reported.
/// </summary>
string Description { get; set; }
/// <summary>
/// The Service Fabric entity type to be repaired.
/// </summary>
EntityType EntityType { get; set; }
/// <summary>
/// The health state (System.Fabric.Health.HealthState enum value) reported for the target entity.
/// RepairData defaults this to Warning; FabricHealerProxy.RepairEntityAsync reports nothing when it is Ok.
/// </summary>
HealthState HealthState { get; set; }
/// <summary>
/// The supported resource usage metric name. FabricHealerProxy.RepairEntityAsync falls back to a default when it builds a Property value and this is null.
/// </summary>
string Metric { get; set; }
/// <summary>
/// Required. The name of the node where the entity resides.
/// </summary>
string NodeName { get; set; }
/// <summary>
/// The OS hosting Service Fabric. This is read-only.
/// </summary>
string OS { get; }
/// <summary>
/// Required if the repair target is a Service. The id (Guid) of the partition where the replica or instance resides that is in Error or Warning state.
/// </summary>
Guid PartitionId { get; set; }
/// <summary>
/// Optional. The host process id of the Service entity.
/// </summary>
long ProcessId { get; set; }
/// <summary>
/// Required if the repair target is a Service. The Replica or Instance id of the target Service replica.
/// </summary>
long ReplicaId { get; set; }
/// <summary>
/// Required if the repair target is a Service. The name of the service (as a string). This would be the same value as the OriginalString property of the ServiceName Uri instance.
/// </summary>
string ServiceName { get; set; }
/// <summary>
/// The name of the service (as a string) that is generating the health report with this TelemetryData instance.
/// When empty, FabricHealerProxy.RepairEntityAsync uses the service manifest name of the calling service.
/// </summary>
string Source { get; set; }
/// <summary>
/// Optional. This is required if you are targeting Service Fabric System Service process. In this case, you should also supply the related value for ProcessId.
/// </summary>
string SystemServiceProcessName { get; set; }
/// <summary>
/// Optional. The supported resource usage metric value. NOTE: This value must be supplied if you expect to use this fact in related FabricHealer logic rules.
/// </summary>
double Value { get; set; }
/// <summary>
/// The Fabric node type. Callers normally leave this unset: FabricHealerProxy.RepairEntityAsync looks it up when a repair targets a node identified only by NodeName.
/// </summary>
string NodeType { get; set; }
/// <summary>
/// This will be used as the Health Event Property. When empty, FabricHealerProxy.RepairEntityAsync generates a value for node targets and for
/// application/service targets whose partition or replica id was not supplied.
/// </summary>
string Property { get; set; }
}
}

120
FabricHealerLib/README.md Normal file
Просмотреть файл

@ -0,0 +1,120 @@
# FabricHealerLib
FabricHealerLib is a utility library that provides a very simple, structured way to communicate Service Fabric entity repair information to FabricHealer via Service Fabric health
reporting. Simply put, you can execute FabricHealer auto-mitigation workflows with just a few lines of C# code from any .NET Service Fabric service running in the same cluster
as FabricHealer. You can install it into your Service Fabric service from the [nuget.org package gallery](...).
### Example
- Deploy FabricHealer to your cluster (Do note that if you deploy FabricHealer as a singleton partition 1 (versus -1), then FH will only conduct SF-related repairs).
- Install FabricHealerLib nupkg into your own service from where you want to repair SF entities.
- Add code like below to your own service to heal some target Service Fabric entity. Note that only Service Fabric entities (not including System services) are supported (not machines or Disks, for example).
FabricHealer will execute entity-related logic rules (housed in its FabricNodeRules.guan file in this case), and if any of the rules succeed, then FH will create a Repair Job with pre and post safety checks (default),
orchestrate RM through to repair completion (FH will be the executor of the repair), emit repair step information via telemetry, local logging, and ETW.
```C#
using System;
using System.Fabric;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.ServiceFabric.Services.Runtime;
using FabricHealerLib;
namespace Stateless1
{
/// <summary>
/// An instance of this class is created for each service instance by the Service Fabric runtime.
/// </summary>
internal sealed class Stateless1 : StatelessService
{
public Stateless1(StatelessServiceContext context)
: base(context)
{
}
/// <summary>
/// This is the main entry point for your service instance.
/// </summary>
/// <param name="cancellationToken">Canceled when Service Fabric needs to shut down this service instance.</param>
protected override async Task RunAsync(CancellationToken cancellationToken)
{
// This specifies that you want FabricHealer to repair a service instance deployed to a Fabric node named NodeName.
// FabricHealer supports both Replica and CodePackage restarts of services. The logic rules will dictate which one of these happens,
// so make sure to craft a specific logic rule that makes sense for you (and use some logic!).
// Note that, out of the box, FabricHealer's AppRules.guan file already has a restart replica catch-all rule that will restart the primary replica of
// the specified service below, deployed to a node named NodeName.
var repairDataServiceTarget = new RepairData
{
ServiceName = "fabric:/HealthMetrics/DoctorActorServiceType",
NodeName = Context.NodeContext.NodeName
};
// This specifies that you want FabricHealer to repair a Fabric node named NodeName. The only supported repair in FabricHealer is a Restart.
// So, implicitly, this means you want FabricHealer to restart _Node_0.
var repairDataNodeTarget = new RepairData
{
NodeName = "_Node_0"
};
// In this case, you must place this using declaration of FabricHealerProxy instance at function scope (so, not within the try below).
// Failure to do so will result in nothing happening as the FabricClient instance that FabricHealerProxy creates will have closed before
// Service Fabric's HealthManager has completed its related work.
using var fabricHealer = new FabricHealerProxy();
// Service repair.
try
{
await fabricHealer.RepairEntityAsync(repairDataServiceTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false);
}
catch (MissingFieldException)
{
// This means a required RepairData field was not specified. For example, RepairData.NodeName was not set.
}
catch (FabricNodeNotFoundException)
{
// The Fabric node you specified in RepairData.NodeName does not exist.
}
catch (FabricServiceNotFoundException)
{
// The Fabric service you specified in RepairData.ServiceName does not exist.
}
catch (FabricException)
{
// Thrown when an internal Service Fabric operation fails. Internally, RepairEntityAsync will retry failed Fabric client operations 3 times.
// This will have already led to 3 internal retries before surfacing here.
}
catch (TimeoutException)
{
// Thrown when a Fabric client API call times out. This will have already led to 3 internal retries before surfacing here.
}
// Node repair.
try
{
await fabricHealer.RepairEntityAsync(repairDataNodeTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false);
}
catch (FabricNodeNotFoundException)
{
// Check your spelling..
}
catch (FabricException)
{
// No-op unless you want to re-run RepairEntityAsync again..
}
catch (TimeoutException)
{
// ClusterManager service could be hammered (flooded with queries), for example. You could retry RepairEntityAsync again after you wait a bit..
}
// Do nothing and wait.
while (!cancellationToken.IsCancellationRequested)
{
await Task.Delay(TimeSpan.FromSeconds(30), cancellationToken);
}
}
}
}
```

Просмотреть файл

@ -0,0 +1,120 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using FabricHealerLib.Interfaces;
using Newtonsoft.Json;
using System;
using System.Fabric.Health;
using System.Runtime.InteropServices;
namespace FabricHealerLib
{
/// <summary>
/// Carries the facts that FabricHealer expects and therefore understands. FabricHealerProxy.RepairEntityAsync serializes an instance of this type to JSON
/// and uses it as the Description of the related Service Fabric health report, which is how the facts travel between services.
/// </summary>
public class RepairData : ITelemetryData
{
    /// <inheritdoc/>
    public string ApplicationName { get; set; }

    /// <inheritdoc/>
    public string Code { get; set; }

    /// <inheritdoc/>
    public string ContainerId { get; set; }

    /// <inheritdoc/>
    public string Description { get; set; }

    /// <inheritdoc/>
    public EntityType EntityType { get; set; }

    /// <inheritdoc/>
    public HealthState HealthState { get; set; } = HealthState.Warning;

    /// <inheritdoc/>
    public string Metric { get; set; }

    /// <inheritdoc/>
    public string NodeName { get; set; }

    /// <inheritdoc/>
    public string NodeType { get; set; }

    /// <inheritdoc/>
    public string OS { get; }

    /// <inheritdoc/>
    public Guid PartitionId { get; set; }

    /// <inheritdoc/>
    public long ProcessId { get; set; }

    /// <inheritdoc/>
    public long ReplicaId { get; set; }

    /// <inheritdoc/>
    public string ServiceName { get; set; }

    /// <inheritdoc/>
    public string Source { get; set; }

    /// <inheritdoc/>
    public string SystemServiceProcessName { get; set; }

    /// <inheritdoc/>
    public string Property { get; set; }

    /// <inheritdoc/>
    public double Value { get; set; }

    /// <summary>
    /// Creates an instance of RepairData and records the hosting OS: "Windows" when running on Windows, "Linux" otherwise.
    /// This is also the constructor used for JSON deserialization.
    /// </summary>
    [JsonConstructor]
    public RepairData()
    {
        bool isWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);
        OS = isWindows ? "Windows" : "Linux";
    }
}
}

Просмотреть файл

@ -0,0 +1,305 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;
namespace FabricHealerLib
{
/// <summary>
/// Error and Warning Codes related to machine resource usage metrics at the machine and service levels.
/// FabricHealer understands these codes.
/// </summary>
public sealed class SupportedErrorCodes
{
/// <summary>
/// Ok HealthState
/// </summary>
public const string Ok = "FO000";
/// <summary>
/// FO001 Percentage of total CPU usage has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorCpuPercent = "FO001";
/// <summary>
/// FO002 Percentage of total CPU usage has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningCpuPercent = "FO002";
/// <summary>
/// FO003 Percentage of total CPU usage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorCpuPercent = "FO003";
/// <summary>
/// FO004 Percentage of total CPU usage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningCpuPercent = "FO004";
/// <summary>
/// FO005 Error: Certificate expiration has occurred.
/// </summary>
public const string ErrorCertificateExpiration = "FO005";
/// <summary>
/// FO006 Warning: Certificate expiration is imminent.
/// </summary>
public const string WarningCertificateExpiration = "FO006";
/// <summary>
/// FO007 Disk usage percentage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorDiskSpacePercent = "FO007";
/// <summary>
/// FO008 Disk usage space (MB) has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorDiskSpaceMB = "FO008";
/// <summary>
/// FO009 Disk usage percentage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningDiskSpacePercent = "FO009";
/// <summary>
/// FO010 Disk usage space (MB) has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningDiskSpaceMB = "FO010";
/// <summary>
/// FO011 Average disk queue length has exceeded configured Error threshold.
/// </summary>
public const string NodeErrorDiskAverageQueueLength = "FO011";
/// <summary>
/// FO012 Average disk queue length has exceeded configured Warning threshold.
/// </summary>
public const string NodeWarningDiskAverageQueueLength = "FO012";
/// <summary>
/// FO042 Folder size (MB) has exceeded configured Error threshold
/// </summary>
public const string NodeErrorFolderSizeMB = "FO042";
/// <summary>
/// FO043 Folder size (MB) has exceeded configured Warning threshold
/// </summary>
public const string NodeWarningFolderSizeMB = "FO043";
/// <summary>
/// FO013 Percentage of total physical memory usage has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorMemoryPercent = "FO013";
/// <summary>
/// FO014 Percentage of total physical memory usage has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningMemoryPercent = "FO014";
/// <summary>
/// FO015 Committed memory (MB) has exceeded configured Error threshold for an app service process.
/// </summary>
public const string AppErrorMemoryMB = "FO015";
/// <summary>
/// FO016 Committed memory (MB) has exceeded configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningMemoryMB = "FO016";
/// <summary>
/// FO017 Percentage of total physical memory usage has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorMemoryPercent = "FO017";
/// <summary>
/// FO018 Percentage of total physical memory usage has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningMemoryPercent = "FO018";
/// <summary>
/// FO019 Total Committed memory (MB) has exceeded configured Error threshold on a machine.
/// </summary>
public const string NodeErrorMemoryMB = "FO019";
/// <summary>
/// FO020 Total Committed memory (MB) has exceeded configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningMemoryMB = "FO020";
/// <summary>
/// FO021 Error: Configured endpoint detected as unreachable.
/// </summary>
public const string AppErrorNetworkEndpointUnreachable = "FO021";
/// <summary>
/// FO022 Warning: Configured endpoint detected as unreachable.
/// </summary>
public const string AppWarningNetworkEndpointUnreachable = "FO022";
/// <summary>
/// FO023 Number of active TCP ports at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyActiveTcpPorts = "FO023";
/// <summary>
/// FO024 Number of active TCP ports at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyActiveTcpPorts = "FO024";
/// <summary>
/// FO025 Number of active TCP ports at or exceeding configured Error threshold on a machine.
/// </summary>
public const string NodeErrorTooManyActiveTcpPorts = "FO025";
/// <summary>
/// FO026 Number of active TCP ports at or exceeding configured Warning threshold on a machine.
/// </summary>
public const string NodeWarningTooManyActiveTcpPorts = "FO026";
/// <summary>
/// FO027 Number of enabled Firewall Rules at or exceeding configured Error threshold on a machine.
/// </summary>
public const string ErrorTooManyFirewallRules = "FO027";
/// <summary>
/// FO028 Number of enabled Firewall Rules at or exceeding configured Warning threshold on a machine.
/// </summary>
public const string WarningTooManyFirewallRules = "FO028";
/// <summary>
/// FO029 Number of active Ephemeral TCP ports (ports in the dynamic range) at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyActiveEphemeralPorts = "FO029";
/// <summary>
/// FO030 Number of active Ephemeral TCP ports (ports in the dynamic range) at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyActiveEphemeralPorts = "FO030";
/// <summary>
/// FO031 Number of active Ephemeral TCP ports (ports in the Windows dynamic port range) at or exceeding configured Error threshold for a machine.
/// </summary>
public const string NodeErrorTooManyActiveEphemeralPorts = "FO031";
/// <summary>
/// FO032 Number of active Ephemeral TCP ports (ports in the Windows dynamic port range) at or exceeding configured Warning threshold for a machine.
/// </summary>
public const string NodeWarningTooManyActiveEphemeralPorts = "FO032";
/// <summary>
/// FO044 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorActiveEphemeralPortsPercent = "FO044";
/// <summary>
/// FO045 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningActiveEphemeralPortsPercent = "FO045";
/// <summary>
/// FO046 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Error threshold for a machine.
/// </summary>
public const string NodeErrorActiveEphemeralPortsPercent = "FO046";
/// <summary>
/// FO047 Percentage of active Ephemeral TCP ports in use is at or exceeding configured Warning threshold for a machine.
/// </summary>
public const string NodeWarningActiveEphemeralPortsPercent = "FO047";
/// <summary>
/// FO033 Number of allocated File Handles is at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyOpenFileHandles = "FO033";
/// <summary>
/// FO034 Number of allocated File Handles is at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyOpenFileHandles = "FO034";
/// <summary>
/// FO035 Percentage of Maximum number of File Descriptors in use is at or exceeding configured Error threshold on a Linux machine.
/// </summary>
public const string NodeErrorTotalOpenFileHandlesPercent = "FO035";
/// <summary>
/// FO036 Percentage of Maximum number of File Descriptors in use is at or exceeding configured Warning threshold on a Linux machine.
/// </summary>
public const string NodeWarningTotalOpenFileHandlesPercent = "FO036";
/// <summary>
/// FO037 Number of allocated File Handles is at or exceeding configured Error threshold on a Linux machine.
/// </summary>
public const string NodeErrorTooManyOpenFileHandles = "FO037";
/// <summary>
/// FO038 Number of allocated File Handles is at or exceeding configured Warning threshold on a Linux machine.
/// </summary>
public const string NodeWarningTooManyOpenFileHandles = "FO038";
/// <summary>
/// FO039 Number of threads at or exceeding configured Error threshold for an app service process.
/// </summary>
public const string AppErrorTooManyThreads = "FO039";
/// <summary>
/// FO040 Number of threads at or exceeding configured Warning threshold for an app service process.
/// </summary>
public const string AppWarningTooManyThreads = "FO040";
/// <summary>
/// FO041 Percentage of Maximum number of KVS LVIDs in use is at or exceeding internal Warning threshold (75%) for an app service process.
/// The related threshold is non-configurable and Windows-only.
/// </summary>
public const string AppWarningKvsLvidsPercentUsed = "FO041";
/// <summary>
/// Lookup table for app service process level codes. Each key is one of the code
/// constants of this class (plus Ok); each value is the name of that constant.
/// </summary>
public static Dictionary<string, string> AppErrorCodesDictionary { get; } =
    new Dictionary<string, string>()
    {
        { Ok, "Ok" },

        // CPU
        { AppErrorCpuPercent, "AppErrorCpuPercent" },
        { AppWarningCpuPercent, "AppWarningCpuPercent" },

        // Memory
        { AppErrorMemoryPercent, "AppErrorMemoryPercent" },
        { AppWarningMemoryPercent, "AppWarningMemoryPercent" },
        { AppErrorMemoryMB, "AppErrorMemoryMB" },
        { AppWarningMemoryMB, "AppWarningMemoryMB" },

        // Network endpoints and ports
        { AppErrorNetworkEndpointUnreachable, "AppErrorNetworkEndpointUnreachable" },
        { AppWarningNetworkEndpointUnreachable, "AppWarningNetworkEndpointUnreachable" },
        { AppErrorTooManyActiveTcpPorts, "AppErrorTooManyActiveTcpPorts" },
        { AppWarningTooManyActiveTcpPorts, "AppWarningTooManyActiveTcpPorts" },
        { AppErrorTooManyActiveEphemeralPorts, "AppErrorTooManyActiveEphemeralPorts" },
        { AppWarningTooManyActiveEphemeralPorts, "AppWarningTooManyActiveEphemeralPorts" },
        { AppErrorActiveEphemeralPortsPercent, "AppErrorActiveEphemeralPortsPercent" },
        { AppWarningActiveEphemeralPortsPercent, "AppWarningActiveEphemeralPortsPercent" },

        // File handles, threads, KVS LVIDs
        { AppErrorTooManyOpenFileHandles, "AppErrorTooManyOpenFileHandles" },
        { AppWarningTooManyOpenFileHandles, "AppWarningTooManyOpenFileHandles" },
        { AppErrorTooManyThreads, "AppErrorTooManyThreads" },
        { AppWarningTooManyThreads, "AppWarningTooManyThreads" },
        { AppWarningKvsLvidsPercentUsed, "AppWarningKvsLvidsPercentUsed" },
    };
/// <summary>
/// Lookup table for machine (node) level codes. Each key is one of the code constants
/// of this class (plus Ok); each value is the name reported for that code.
/// </summary>
public static Dictionary<string, string> NodeErrorCodesDictionary { get; } =
    new Dictionary<string, string>()
    {
        { Ok, "Ok" },

        // CPU
        { NodeErrorCpuPercent, "NodeErrorCpuPercent" },
        { NodeWarningCpuPercent, "NodeWarningCpuPercent" },

        // Certificates
        { ErrorCertificateExpiration, "ErrorCertificateExpiration" },
        { WarningCertificateExpiration, "WarningCertificateExpiration" },

        // Disk and folders
        { NodeErrorDiskSpacePercent, "NodeErrorDiskSpacePercent" },
        { NodeErrorDiskSpaceMB, "NodeErrorDiskSpaceMB" },
        { NodeWarningDiskSpacePercent, "NodeWarningDiskSpacePercent" },
        { NodeWarningDiskSpaceMB, "NodeWarningDiskSpaceMB" },
        { NodeErrorDiskAverageQueueLength, "NodeErrorDiskAverageQueueLength" },
        { NodeWarningDiskAverageQueueLength, "NodeWarningDiskAverageQueueLength" },
        { NodeErrorFolderSizeMB, "NodeErrorFolderSizeMB" },
        { NodeWarningFolderSizeMB, "NodeWarningFolderSizeMB" },

        // Memory
        { NodeErrorMemoryPercent, "NodeErrorMemoryPercent" },
        { NodeWarningMemoryPercent, "NodeWarningMemoryPercent" },
        { NodeErrorMemoryMB, "NodeErrorMemoryMB" },
        { NodeWarningMemoryMB, "NodeWarningMemoryMB" },

        // Ports and firewall rules.
        // The firewall entries map to Node-prefixed names, which differ from the
        // identifiers of the constants used as keys.
        { NodeErrorTooManyActiveTcpPorts, "NodeErrorTooManyActiveTcpPorts" },
        { NodeWarningTooManyActiveTcpPorts, "NodeWarningTooManyActiveTcpPorts" },
        { ErrorTooManyFirewallRules, "NodeErrorTooManyFirewallRules" },
        { WarningTooManyFirewallRules, "NodeWarningTooManyFirewallRules" },
        { NodeErrorTooManyActiveEphemeralPorts, "NodeErrorTooManyActiveEphemeralPorts" },
        { NodeWarningTooManyActiveEphemeralPorts, "NodeWarningTooManyActiveEphemeralPorts" },
        { NodeErrorActiveEphemeralPortsPercent, "NodeErrorActiveEphemeralPortsPercent" },
        { NodeWarningActiveEphemeralPortsPercent, "NodeWarningActiveEphemeralPortsPercent" },

        // File handles
        { NodeErrorTotalOpenFileHandlesPercent, "NodeErrorTotalOpenFileHandlesPercent" },
        { NodeWarningTotalOpenFileHandlesPercent, "NodeWarningTotalOpenFileHandlesPercent" },
        { NodeErrorTooManyOpenFileHandles, "NodeErrorTooManyOpenFileHandles" },
        { NodeWarningTooManyOpenFileHandles, "NodeWarningTooManyOpenFileHandles" },
    };
/// <summary>
/// This function takes a SupportedErrorWarningCodes code (key) and returns the name of the error or warning (value).
/// AppErrorCodesDictionary is consulted first, then NodeErrorCodesDictionary.
/// </summary>
/// <param name="code">The SupportedErrorWarningCodes code to use as a lookup key.</param>
/// <returns>
/// The name of the error or warning code, or null when <paramref name="code"/> is null, empty,
/// or not present in either dictionary.
/// </returns>
public static string GetNameFromErrorCode(string code)
{
    if (string.IsNullOrEmpty(code))
    {
        return null;
    }

    // Use the dictionaries' hash lookup rather than scanning their entries with LINQ;
    // the result is the same because keys are compared ordinally in both cases.
    if (AppErrorCodesDictionary.TryGetValue(code, out string appCodeName))
    {
        return appCodeName;
    }

    if (NodeErrorCodesDictionary.TryGetValue(code, out string nodeCodeName))
    {
        return nodeCodeName;
    }

    return null;
}
}
}

Просмотреть файл

@ -0,0 +1,105 @@
// ------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
// ------------------------------------------------------------
namespace FabricHealerLib
{
/// <summary>
/// Names of the metrics that FabricHealer expects and therefore knows how to act on.
/// </summary>
public sealed class SupportedMetricNames
{
/// <summary>
/// CPU Time (Percent)
/// </summary>
public const string CpuTime = "CPU Time (Percent)";
/// <summary>
/// Memory Usage (MB)
/// </summary>
public const string MemoryConsumptionMb = "Memory Usage (MB)";
/// <summary>
/// Memory Usage (Percent)
/// </summary>
public const string MemoryConsumptionPercentage = "Memory Usage (Percent)";
/// <summary>
/// Certificate Expiration
/// </summary>
public const string CertificateExpiration = "Certificate Expiration";
/// <summary>
/// Average Disk Queue Length
/// </summary>
public const string DiskAverageQueueLength = "Average Disk Queue Length";
/// <summary>
/// Disk Space Usage (Percent)
/// </summary>
public const string DiskSpaceUsagePercentage = "Disk Space Usage (Percent)";
/// <summary>
/// Disk Space Usage (MB)
/// </summary>
public const string DiskSpaceUsageMb = "Disk Space Usage (MB)";
/// <summary>
/// Disk Space Available (MB)
/// </summary>
public const string DiskSpaceAvailableMb = "Disk Space Available (MB)";
/// <summary>
/// Disk Space Total (MB)
/// </summary>
public const string DiskSpaceTotalMb = "Disk Space Total (MB)";
/// <summary>
/// Folder Size (MB)
/// </summary>
public const string FolderSizeMB = "Folder Size (MB)";
/// <summary>
/// Outbound Internet Connection Failure
/// </summary>
public const string InternetConnectionFailure = "Outbound Internet Connection Failure";
/// <summary>
/// Active Firewall Rules
/// </summary>
public const string ActiveFirewallRules = "Active Firewall Rules";
/// <summary>
/// Active TCP Ports
/// </summary>
public const string ActiveTcpPorts = "Active TCP Ports";
/// <summary>
/// Active Ephemeral Ports
/// </summary>
public const string ActiveEphemeralPorts = "Active Ephemeral Ports";
/// <summary>
/// Active Ephemeral Ports (Percent)
/// </summary>
public const string ActiveEphemeralPortsPercentage = "Active Ephemeral Ports (Percent)";
/// <summary>
/// Total Ephemeral Ports
/// </summary>
public const string TotalEphemeralPorts = "Total Ephemeral Ports";
/// <summary>
/// Allocated File Handles
/// </summary>
public const string AllocatedFileHandles = "Allocated File Handles";
/// <summary>
/// Allocated File Handles (Percent)
/// </summary>
public const string AllocatedFileHandlesPct = "Allocated File Handles (Percent)";
/// <summary>
/// Thread Count
/// </summary>
public const string ThreadCount = "Thread Count";
/// <summary>
/// Child Process Count
/// </summary>
public const string ChildProcessCount = "Child Process Count";
/// <summary>
/// LVID Usage (Percent)
/// </summary>
public const string KvsLvidsPercent = "LVID Usage (Percent)";
}
}

3
FabricHealerLibnuget.md Normal file
Просмотреть файл

@ -0,0 +1,3 @@
# FabricHealerLib
### Getting Started

Просмотреть файл

@ -1,20 +1,30 @@
## FabricHealer 1.0.15
## FabricHealer 1.1.0
### Configuration as Logic and auto-mitigation in Service Fabric clusters
FabricHealer (FH) is a Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric applications (including containers), host virtual machines, and logical disks (scoped to space usage problems only). These repairs mostly employ a set of Service Fabric API calls, but can also be fully customizable (like Disk repair). All repairs are safely orchestrated through the Service Fabric RepairManager system service. Repair workflow configuration is written as [Prolog](http://www.let.rug.nl/bos/lpn//lpnpage.php?pageid=online)-like [logic](https://github.com/microsoft/service-fabric-healer/blob/main/FabricHealer/PackageRoot/Config/LogicRules) with [supporting external predicates](https://github.com/microsoft/service-fabric-healer/blob/main/FabricHealer/Repair/Guan) written in C#.
FabricHealer (FH) is a Service Fabric application that attempts to automatically fix a set of reliably solvable problems that can take place in Service Fabric
applications (including containers), host virtual machines, and logical disks (scoped to space usage problems only). These repairs mostly employ a set of Service Fabric API calls,
but can also be fully customizable (like Disk repair). All repairs are safely orchestrated through the Service Fabric RepairManager system service.
Repair workflow configuration is written as [Prolog](http://www.let.rug.nl/bos/lpn//lpnpage.php?pageid=online)-like [logic](https://github.com/microsoft/service-fabric-healer/blob/main/FabricHealer/PackageRoot/Config/LogicRules) with [supporting external predicates](https://github.com/microsoft/service-fabric-healer/blob/main/FabricHealer/Repair/Guan) written in C#.
FabricHealer's Configuration-as-Logic feature is made possible by a new logic programming library for .NET, [Guan](https://github.com/microsoft/guan). The fun starts when FabricHealer detects supported error or warning health events reported by [FabricObserver](https://github.com/microsoft/service-fabric-observer).
FabricHealer's Configuration-as-Logic feature is made possible by a new logic programming library for .NET, [Guan](https://github.com/microsoft/guan).
The fun starts when FabricHealer detects supported error or warning health events reported by [FabricObserver](https://github.com/microsoft/service-fabric-observer), for example.
You can use FabricHealer even if you don't deploy FabricObserver. Just install FabricHealerLib into your .NET Service Fabric project and you can leverage the power of FH from there.
There is a very simple "interface" to FabricHealer that begins with some service generating a Service Fabric Health Report. This health report must contain a specially-crafted
Description value: a serialized instance of a well-known (to FH) type (must implement ITelemetryData). As mentioned above, just use FabricHealerLib to push FH into motion from your
Service Fabric service.
FabricHealer is implemented as a stateless singleton service that runs on all nodes in a Linux or Windows Service Fabric cluster. It is a .NET Core 3.1 application and has been tested on Windows (2016/2019) and Ubuntu (16/18.04).
FabricHealer is implemented as a stateless singleton service that runs on all nodes in a Linux or Windows Service Fabric cluster.
It is a .NET Core 3.1 application and has been tested on Windows (2016/2019) and Ubuntu (16/18.04).
All warning and error health reports created by [FabricObserver](https://github.com/microsoft/service-fabric-observer) and subsequently repaired by FabricHealer are user-configured - developer control extends from unhealthy event source to related healing operations.
All warning and error health reports created by [FabricObserver](https://github.com/microsoft/service-fabric-observer) and subsequently repaired by FabricHealer are user-configured;
developer control extends from the unhealthy event source to the related healing operations.
FabricObserver and FabricHealer are part of a family of highly configurable Service Fabric observability tools that work together to keep your clusters green.
To learn more about FabricHealer's configuration-as-logic model, [click here.](https://github.com/microsoft/service-fabric-healer/blob/main/Documentation/LogicWorkflows.md)
```
FabricHealer requires that FabricObserver and RepairManager (RM) service are deployed.
FabricHealer requires that RepairManager (RM) service is deployed.
```
```
For VM level repair, InfrastructureService (IS) service must be deployed.

Просмотреть файл

@ -26,7 +26,6 @@ namespace FabricHealer.TelemetryLib
private static string diagnosticsClusterId;
private static XmlDocument clusterManifestXdoc;
private static readonly object _lock = new object();
private static readonly object _lock2 = new object();
private static (string ClusterId, string ClusterType, string TenantId) _clusterInfoTuple;
public static (string ClusterId, string ClusterType, string TenantId) ClusterInfoTuple
@ -78,15 +77,11 @@ namespace FabricHealer.TelemetryLib
{
using (var xreader = XmlReader.Create(sreader, new XmlReaderSettings { XmlResolver = null }))
{
// TODO: This is not necessary as the only caller is a property getter, where a lock is already in place around the call to this function.
lock (_lock2)
{
clusterManifestXdoc?.Load(xreader);
// Get values from cluster manifest, clusterId if it exists in either Paas or Diagnostics section.
GetValuesFromClusterManifest();
}
clusterManifestXdoc?.Load(xreader);
// Get values from cluster manifest, clusterId if it exists in either Paas or Diagnostics section.
GetValuesFromClusterManifest();
if (paasClusterId != null)
{
clusterId = paasClusterId;

Просмотреть файл

@ -15,7 +15,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ApplicationInsights" Version="2.17.0" />
<PackageReference Include="Microsoft.ServiceFabric" Version="7.2.452" />
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="5.0.516" />
<PackageReference Include="Microsoft.Win32.Registry" Version="5.0.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
</ItemGroup>