diff --git a/FHTest/FHUnitTests.cs b/FHTest/FHUnitTests.cs
index 30f5f06..1a1f228 100644
--- a/FHTest/FHUnitTests.cs
+++ b/FHTest/FHUnitTests.cs
@@ -510,7 +510,7 @@ namespace FHTest
var (generatedWarning, data) = await IsEntityInWarningStateAsync(null, RepairFactsServiceTarget.ServiceName);
Assert.IsTrue(generatedWarning);
- Assert.IsTrue(data is TelemetryData);
+ Assert.IsTrue(data != null);
}
[TestMethod]
@@ -529,7 +529,7 @@ namespace FHTest
var (generatedWarning, data) = await IsEntityInWarningStateAsync(null, null, NodeName);
Assert.IsTrue(generatedWarning);
- Assert.IsTrue(data is TelemetryData);
+ Assert.IsTrue(data != null);
}
[TestMethod]
@@ -624,13 +624,13 @@ namespace FHTest
{
var (generatedWarningService, sdata) = await IsEntityInWarningStateAsync(null, repair.ServiceName);
Assert.IsTrue(generatedWarningService);
- Assert.IsTrue(sdata is TelemetryData);
+ Assert.IsTrue(sdata != null);
}
else if (repair.EntityType == EntityType.Disk || repair.EntityType == EntityType.Machine || repair.EntityType == EntityType.Node)
{
var (generatedWarningNode, ndata) = await IsEntityInWarningStateAsync(null, null, NodeName);
Assert.IsTrue(generatedWarningNode);
- Assert.IsTrue(ndata is TelemetryData);
+ Assert.IsTrue(ndata != null);
}
// FHProxy creates or renames Source with trailing id ("FabricHealerProxy");
diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs
index 98fc841..3c7ea41 100644
--- a/FabricHealer/FabricHealerManager.cs
+++ b/FabricHealer/FabricHealerManager.cs
@@ -1430,7 +1430,7 @@ namespace FabricHealer
else
{
// Nothing to do here.
- if (repairData.EntityType == EntityType.Invalid)
+ if (repairData.EntityType == EntityType.Unknown)
{
continue;
}
diff --git a/FabricHealer/Repair/RepairTaskManager.cs b/FabricHealer/Repair/RepairTaskManager.cs
index 76fc6f3..acee75f 100644
--- a/FabricHealer/Repair/RepairTaskManager.cs
+++ b/FabricHealer/Repair/RepairTaskManager.cs
@@ -522,7 +522,8 @@ namespace FabricHealer.Repair
Stopwatch stopWatch = Stopwatch.StartNew();
bool isApproved = false;
- var repairs = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, cancellationToken);
+ var repairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, cancellationToken);
if (repairs.All(repair => repair.TaskId != repairTask.TaskId))
{
@@ -687,7 +688,7 @@ namespace FabricHealer.Repair
if (repairData.PartitionId == null)
{
success = false;
- await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
@@ -707,7 +708,7 @@ namespace FabricHealer.Repair
if (repList.Count == 0)
{
success = false;
- await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
$"Stateless Instance {repairData.ReplicaId} not found on partition " +
@@ -732,7 +733,7 @@ namespace FabricHealer.Repair
if (repairData.PartitionId == null)
{
success = false;
- await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition",
$"No partition specified.",
@@ -818,8 +819,9 @@ namespace FabricHealer.Repair
switch (repairData.EntityType)
{
- // Try and handle the case where EntityType is not specified or is set to Invalid for some reason.
- case EntityType.Invalid:
+ // Try and handle the case where EntityType is not specified (facts from FHProxy, for example)
+ // or is explicitly set to Unknown for some reason.
+ case EntityType.Unknown:
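+ // Facts arrived without an EntityType; infer the repair target from whichever identifying fields are present.
+ // If none identify a target, there is nothing FH can do with this health event.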
if (!string.IsNullOrWhiteSpace(repairData.ServiceName))
{
@@ -833,7 +835,18 @@ namespace FabricHealer.Repair
{
goto case EntityType.Node;
}
- break;
+ else if (repairData.ReplicaId > 0)
+ {
+ goto case EntityType.Replica;
+ }
+ else if (!string.IsNullOrWhiteSpace(repairData.ProcessName) || repairData.ProcessId > 0)
+ {
+ goto case EntityType.Process;
+ }
+ else
+ {
+ return false;
+ }
case EntityType.Application:
diff --git a/FabricHealer/Utilities/Telemetry/EntityType.cs b/FabricHealer/Utilities/Telemetry/EntityType.cs
index 8d4f7a5..0ebbb32 100644
--- a/FabricHealer/Utilities/Telemetry/EntityType.cs
+++ b/FabricHealer/Utilities/Telemetry/EntityType.cs
@@ -11,9 +11,9 @@ namespace FabricHealer.Utilities.Telemetry
public enum EntityType
{
/// <summary>
- /// Invalid (default value).
+ /// Unknown (default).
/// </summary>
- Invalid,
+ Unknown,
/// <summary>
/// Application type.
/// </summary>
diff --git a/FabricHealerManager.cs b/FabricHealerManager.cs
new file mode 100644
index 0000000..395ac60
--- /dev/null
+++ b/FabricHealerManager.cs
@@ -0,0 +1,2161 @@
+// ------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
+// ------------------------------------------------------------
+
+using FabricHealer.Utilities;
+using FabricHealer.Repair;
+using FabricHealer.Utilities.Telemetry;
+using System;
+using System.Collections.Generic;
+using System.Fabric;
+using System.Fabric.Health;
+using System.IO;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using HealthReport = FabricHealer.Utilities.HealthReport;
+using System.Fabric.Repair;
+using System.Fabric.Query;
+using FabricHealer.TelemetryLib;
+using Octokit;
+using System.Fabric.Description;
+using System.Runtime.InteropServices;
+
+namespace FabricHealer
+{
+ public sealed class FabricHealerManager : IDisposable
+ {
+ internal static TelemetryUtilities TelemetryUtilities;
+ internal static RepairData RepairHistory;
+
+ // Folks often use their own version numbers. This is for internal diagnostic telemetry.
+ private const string InternalVersionNumber = "1.1.1.960";
+ private static FabricHealerManager singleton;
+ private static FabricClient _fabricClient;
+ private bool disposedValue;
+ private readonly StatelessServiceContext serviceContext;
+ private readonly RepairTaskManager repairTaskManager;
+ private readonly RepairTaskEngine repairTaskEngine;
+ private readonly Uri systemAppUri = new Uri(RepairConstants.SystemAppName);
+ private readonly Uri repairManagerServiceUri = new Uri(RepairConstants.RepairManagerAppName);
+ private readonly FabricHealthReporter healthReporter;
+ private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1);
+ private readonly string sfRuntimeVersion;
+ private int nodeCount;
+ private DateTime StartDateTime;
+ private long _instanceCount;
+ private static readonly object lockObj = new object();
+
+ internal static Logger RepairLogger
+ {
+ get;
+ private set;
+ }
+
+ // CancellationToken from FabricHealer.RunAsync.
+ internal static CancellationToken Token
+ {
+ get;
+ private set;
+ }
+
+ private DateTime LastTelemetrySendDate
+ {
+ get; set;
+ }
+
+ private DateTime LastVersionCheckDateTime
+ {
+ get; set;
+ }
+
+ public static ConfigSettings ConfigSettings
+ {
+ get; set;
+ }
+
+ /// <summary>
+ /// Singleton FabricClient instance used throughout FH. Thread-safe.
+ /// </summary>
+ public static FabricClient FabricClientSingleton
+ {
+ get
+ {
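+ // Double-checked locking: only take the lock when the client has not been created yet.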
+ if (_fabricClient == null)
+ {
+ lock (lockObj)
+ {
+ if (_fabricClient == null)
+ {
+ _fabricClient = new FabricClient();
+ _fabricClient.Settings.HealthReportSendInterval = TimeSpan.FromSeconds(1);
+ _fabricClient.Settings.HealthReportRetrySendInterval = TimeSpan.FromSeconds(3);
+ return _fabricClient;
+ }
+ }
+ }
+ else
+ {
+ try
+ {
+ // This call will throw an ObjectDisposedException if fabricClient was disposed by, say, a plugin or if the runtime
+ // disposed of it for some random (unlikely) reason. This is just a test to ensure it is not in a disposed state.
+ if (_fabricClient.Settings.HealthReportSendInterval > TimeSpan.MinValue)
+ {
+ return _fabricClient;
+ }
+ }
+ catch (Exception e) when (e is ObjectDisposedException || e is InvalidComObjectException)
+ {
+ lock (lockObj)
+ {
+ _fabricClient = null;
+ _fabricClient = new FabricClient();
+ _fabricClient.Settings.HealthReportSendInterval = TimeSpan.FromSeconds(1);
+ _fabricClient.Settings.HealthReportRetrySendInterval = TimeSpan.FromSeconds(3);
+ return _fabricClient;
+ }
+ }
+ }
+
+ return _fabricClient;
+ }
+ }
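+ // Callers should access FabricClientSingleton each time rather than caching the returned instance,
+ // since the client can be recreated here after a disposal elsewhere.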
+
+ private FabricHealerManager(StatelessServiceContext context, CancellationToken token)
+ {
+ serviceContext = context;
+ Token = token;
+ serviceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent;
+ ConfigSettings = new ConfigSettings(context);
+ TelemetryUtilities = new TelemetryUtilities(context);
+ repairTaskEngine = new RepairTaskEngine();
+ repairTaskManager = new RepairTaskManager(serviceContext, Token);
+ RepairLogger = new Logger(RepairConstants.FabricHealer, ConfigSettings.LocalLogPathParameter)
+ {
+ EnableVerboseLogging = ConfigSettings.EnableVerboseLogging
+ };
+
+ RepairHistory = new RepairData();
+ healthReporter = new FabricHealthReporter(RepairLogger);
+ sfRuntimeVersion = GetServiceFabricRuntimeVersion();
+ }
+
+ /// <summary>
+ /// This is the static singleton instance of FabricHealerManager type. FabricHealerManager does not support
+ /// multiple instantiations. It does not provide a public constructor.
+ /// </summary>
+ /// <param name="context">StatelessServiceContext instance.</param>
+ /// <param name="token">CancellationToken instance.</param>
+ /// <returns>The singleton instance of FabricHealerManager.</returns>
+ public static FabricHealerManager Instance(StatelessServiceContext context, CancellationToken token)
+ {
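+ // Note that a new FabricClient is created on every call, even when the FabricHealerManager singleton already exists.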
+ _fabricClient = new FabricClient();
+ return singleton ??= new FabricHealerManager(context ?? throw new ArgumentException("ServiceContext can't be null.", nameof(context)), token);
+ }
+
+ /// <summary>
+ /// Checks if the RepairManager system service is deployed in the cluster.
+ /// </summary>
+ /// <returns>true if repair manager application is present in cluster, otherwise false.</returns>
+ private async Task<bool> InitializeAsync()
+ {
+ string okMessage = $"{repairManagerServiceUri} is deployed.";
+ bool isRmDeployed = true;
+ var healthReport = new HealthReport
+ {
+ NodeName = serviceContext.NodeContext.NodeName,
+ AppName = new Uri(RepairConstants.FabricHealerAppName),
+ EntityType = EntityType.Application,
+ HealthMessage = okMessage,
+ State = HealthState.Ok,
+ Property = "RequirementCheck::RMDeployed",
+ HealthReportTimeToLive = TimeSpan.FromMinutes(5),
+ SourceId = RepairConstants.FabricHealer,
+ };
+ ServiceList serviceList = await FabricClientSingleton.QueryManager.GetServiceListAsync(
+ systemAppUri,
+ repairManagerServiceUri,
+ ConfigSettings.AsyncTimeout,
+ Token);
+
+ if ((serviceList?.Count ?? 0) == 0)
+ {
+ string warnMessage =
+ $"{repairManagerServiceUri} could not be found, " +
+ $"FabricHealer Service requires {repairManagerServiceUri} system service to be deployed in the cluster. " +
+ "Consider adding a RepairManager section in your cluster manifest.";
+
+ healthReport.HealthMessage = warnMessage;
+ healthReport.State = HealthState.Warning;
+ healthReport.Code = SupportedErrorCodes.Ok;
+ healthReport.HealthReportTimeToLive = TimeSpan.MaxValue;
+ healthReport.SourceId = "CheckRepairManagerDeploymentStatusAsync";
+ isRmDeployed = false;
+ }
+
+ healthReporter.ReportHealthToServiceFabric(healthReport);
+
+ // Set the service replica instance count (FH is a Stateless singleton service, so it will either be -1 or the number of nodes FH is deployed to).
+ _instanceCount = await GetServiceInstanceCountAsync();
+
+ return isRmDeployed;
+ }
+
+ private async Task<long> GetServiceInstanceCountAsync()
+ {
+ ServiceDescription serviceDesc =
+ await FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(serviceContext.ServiceName, ConfigSettings.AsyncTimeout, Token);
+
+ return (serviceDesc as StatelessServiceDescription).InstanceCount;
+ }
+
+ /// <summary>
+ /// Gets a parameter value from the specified config section or returns supplied default value if
+ /// not specified in config.
+ /// </summary>
+ /// <param name="context">StatelessServiceContext instance.</param>
+ /// <param name="sectionName">Name of the section.</param>
+ /// <param name="parameterName">Name of the parameter.</param>
+ /// <param name="defaultValue">Default value.</param>
+ /// <returns>parameter value.</returns>
+ private static string GetSettingParameterValue(
+ StatelessServiceContext context,
+ string sectionName,
+ string parameterName,
+ string defaultValue = null)
+ {
+ if (string.IsNullOrWhiteSpace(sectionName) || string.IsNullOrWhiteSpace(parameterName))
+ {
+ return null;
+ }
+
+ if (context == null)
+ {
+ return null;
+ }
+
+ try
+ {
+ var serviceConfiguration = context.CodePackageActivationContext.GetConfigurationPackageObject("Config");
+
+ if (serviceConfiguration.Settings.Sections.All(sec => sec.Name != sectionName))
+ {
+ return !string.IsNullOrWhiteSpace(defaultValue) ? defaultValue : null;
+ }
+
+ if (serviceConfiguration.Settings.Sections[sectionName].Parameters.All(param => param.Name != parameterName))
+ {
+ return !string.IsNullOrWhiteSpace(defaultValue) ? defaultValue : null;
+ }
+
+ string setting = serviceConfiguration.Settings.Sections[sectionName].Parameters[parameterName]?.Value;
+
+ if (string.IsNullOrWhiteSpace(setting) && defaultValue != null)
+ {
+ return defaultValue;
+ }
+
+ return setting;
+ }
+ catch (Exception e) when (e is ArgumentException || e is KeyNotFoundException)
+ {
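+ // Swallow config lookup failures and return null below.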
+
+ }
+
+ return null;
+ }
+
+ // This function starts the detection workflow, which involves querying event store for
+ // Warning/Error heath events, looking for well-known FabricObserver error codes with supported
+ // repair actions, scheduling and executing related repair tasks.
+ public async Task StartAsync()
+ {
+ StartDateTime = DateTime.UtcNow;
+
+ if (!ConfigSettings.EnableAutoMitigation)
+ {
+ return;
+ }
+
+ bool initialized = await InitializeAsync();
+
+ if (!initialized)
+ {
+ return;
+ }
+
+ try
+ {
+ RepairLogger.LogInfo("Starting FabricHealer Health Detection loop.");
+
+ var nodeList =
+ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
+ () => FabricClientSingleton.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, Token),
+ Token);
+
+ nodeCount = nodeList.Count;
+
+ // First, let's clean up any orphan non-node level FabricHealer repair tasks left pending
+ // when the FabricHealer process is killed or otherwise ungracefully closed.
+ // This call will return quickly if FH was gracefully closed as there will be
+ // no outstanding repair tasks left orphaned.
+ await CancelOrResumeAllRunningFHRepairsAsync();
+
+ // Run until RunAsync token is cancelled.
+ while (!Token.IsCancellationRequested)
+ {
+ if (!ConfigSettings.EnableAutoMitigation)
+ {
+ break;
+ }
+
+ await MonitorHealthEventsAsync();
+
+ // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in
+ // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more
+ // than 90 days. Please consider enabling this to help the SF team make this technology better.
+ if (ConfigSettings.OperationalTelemetryEnabled && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval)
+ {
+ try
+ {
+ using var telemetryEvents = new TelemetryEvents(serviceContext);
+ var fhData = GetFabricHealerInternalTelemetryData();
+
+ if (fhData != null)
+ {
+ string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_operational_telemetry.log");
+
+ if (telemetryEvents.EmitFabricHealerOperationalEvent(fhData, OperationalTelemetryRunInterval, filepath))
+ {
+ LastTelemetrySendDate = DateTime.UtcNow;
+ ResetInternalDataCounters();
+ }
+ }
+ }
+ catch
+ {
+ // Telemetry is non-critical and should not take down FH.
+ // TelemetryLib will log exception details to file in top level FH log folder.
+ }
+ }
+
+ // Check for new version once a day.
+ if (DateTime.UtcNow.Subtract(LastVersionCheckDateTime) >= OperationalTelemetryRunInterval)
+ {
+ await CheckGithubForNewVersionAsync();
+ LastVersionCheckDateTime = DateTime.UtcNow;
+ }
+
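+ // Sleep between health checks; defaults to 10 seconds if HealthCheckIntervalInSeconds is not set to a positive value.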
+ await Task.Delay(
+ TimeSpan.FromSeconds(
+ ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);
+ }
+
+ RepairLogger.LogInfo("Shutdown signaled. Stopping.");
+ await ClearExistingHealthReportsAsync();
+ }
+ catch (AggregateException)
+ {
+ // This check is necessary to prevent cancelling outstanding repair tasks if
+ // one of the handled exceptions originated from another operation unrelated to
+ // shutdown (like an async operation that timed out).
+ if (Token.IsCancellationRequested)
+ {
+ RepairLogger.LogInfo("Shutdown signaled. Stopping.");
+ await ClearExistingHealthReportsAsync();
+ }
+ }
+ catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException)
+ {
+ // This check is necessary to prevent cancelling outstanding repair tasks if
+ // one of the handled exceptions originated from another operation unrelated to
+ // shutdown (like an async operation that timed out).
+ if (Token.IsCancellationRequested)
+ {
+ RepairLogger.LogInfo("Shutdown signaled. Stopping.");
+ await ClearExistingHealthReportsAsync();
+ }
+ }
+ catch (Exception e)
+ {
+ var message = $"Unhandled Exception in FabricHealerManager:{Environment.NewLine}{e}";
+ RepairLogger.LogError(message);
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Warning,
+ RepairConstants.FabricHealer,
+ message,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // FH Critical Error telemetry (no PII, no user stack) sent to SF team (FabricHealer dev). This information is helpful in understanding what went
+ // wrong that led to the FH process going down (assuming it went down with an exception that can be caught).
+ // Please consider enabling this to help the SF team make this technology better.
+ if (ConfigSettings.OperationalTelemetryEnabled)
+ {
+ try
+ {
+ using var telemetryEvents = new TelemetryEvents(serviceContext);
+ var fhData = new FabricHealerCriticalErrorEventData
+ {
+ Source = nameof(FabricHealerManager),
+ ErrorMessage = e.Message,
+ ErrorStack = e.StackTrace,
+ CrashTime = DateTime.UtcNow.ToString("o"),
+ Version = InternalVersionNumber,
+ SFRuntimeVersion = sfRuntimeVersion
+ };
+
+ string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_critical_error_telemetry.log");
+ _ = telemetryEvents.EmitFabricHealerCriticalErrorEvent(fhData, filepath);
+ }
+ catch
+ {
+ // Telemetry is non-critical and should not take down FH.
+ }
+ }
+
+ // Don't swallow the exception.
+ // Take down FH process. Fix the bug.
+ throw;
+ }
+ }
+
+ private void ResetInternalDataCounters()
+ {
+ RepairHistory.Repairs.Clear();
+ RepairHistory.FailedRepairs = 0;
+ RepairHistory.SuccessfulRepairs = 0;
+ RepairHistory.RepairCount = 0;
+ RepairHistory.EnabledRepairCount = 0;
+ }
+
+ private FabricHealerOperationalEventData GetFabricHealerInternalTelemetryData()
+ {
+ FabricHealerOperationalEventData opsTelemData = null;
+
+ try
+ {
+ RepairHistory.EnabledRepairCount = GetEnabledRepairRuleCount();
+
+ opsTelemData = new FabricHealerOperationalEventData
+ {
+ UpTime = DateTime.UtcNow.Subtract(StartDateTime).ToString(),
+ Version = InternalVersionNumber,
+ RepairData = RepairHistory,
+ SFRuntimeVersion = sfRuntimeVersion
+ };
+ }
+ catch
+ {
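+ // Operational telemetry collection is best-effort; on failure, null is returned and nothing is emitted.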
+
+ }
+
+ return opsTelemData;
+ }
+
+ /// <summary>
+ /// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state).
+ /// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing.
+ /// </summary>
+ /// <returns>A Task.</returns>
+ private async Task CancelOrResumeAllRunningFHRepairsAsync()
+ {
+ try
+ {
+ var currentFHRepairTasksInProgress =
+ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
+ () => repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
+ RepairTaskEngine.FabricHealerExecutorName,
+ Token),
+ Token);
+
+ if (currentFHRepairTasksInProgress.Count == 0)
+ {
+ return;
+ }
+
+ foreach (var repair in currentFHRepairTasksInProgress)
+ {
+ if (repair.State == RepairTaskState.Restoring)
+ {
+ continue;
+ }
+
+ // Grab the executor data from existing repair.
+ var executorData = repair.ExecutorData;
+
+ if (string.IsNullOrWhiteSpace(executorData))
+ {
+ continue;
+ }
+
+ if (!JsonSerializationUtility.TryDeserializeObject(executorData, out RepairExecutorData repairExecutorData))
+ {
+ continue;
+ }
+
+ // Don't do anything if the orphaned repair was for a different node than this one:
+ if (_instanceCount == -1 && repairExecutorData.RepairData.NodeName != serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+
+ // Try and cancel existing repair. We may need to create a new one for abandoned repairs where FH goes down for some reason.
+ // Note: CancelRepairTaskAsync handles exceptions (IOE) that may be thrown by RM due to state change policy.
+ // The repair state could change to Completed after this call is made, for example, and before RM API call.
+ if (repair.State != RepairTaskState.Completed)
+ {
+ await FabricRepairTasks.CancelRepairTaskAsync(repair);
+ }
+
+ /* Resume interrupted Fabric Node restart repairs */
+
+ // There is no need to resume simple repairs that do not require multiple repair steps (e.g., codepackage/process/replica restarts).
+ if (repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode)
+ {
+ continue;
+ }
+
+ string errorCode = repairExecutorData.RepairData.Code;
+
+ if (string.IsNullOrWhiteSpace(errorCode))
+ {
+ continue;
+ }
+
+ // File Deletion repair is a node-level (VM) repair, but is not multi-step. Ignore.
+ if (SupportedErrorCodes.GetCodeNameFromErrorCode(errorCode).Contains("Disk")
+ || repairExecutorData.RepairData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles)
+ {
+ continue;
+ }
+
+ // Fabric System service warnings/errors from FO can be Node level repair targets (e.g., Fabric binary needs to be restarted).
+ // FH will restart the node hosting the troubled SF system service if specified in related logic rules.
+ var repairRules =
+ GetRepairRulesFromConfiguration(
+ !string.IsNullOrWhiteSpace(
+ repairExecutorData.RepairData.ProcessName) ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.FabricNodeRepairPolicySectionName);
+
+ var repairData = new TelemetryData
+ {
+ NodeName = repairExecutorData.RepairData.NodeName,
+ Code = errorCode,
+ };
+
+ await repairTaskManager.RunGuanQueryAsync(repairData, repairRules, repairExecutorData);
+ }
+ }
+ catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException)
+ {
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "CancelOrResumeAllRunningFHRepairsAsync",
+ $"Could not cancel or resume repair tasks. Failed with:{Environment.NewLine}{e}",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+ }
+ }
+
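+ // SF raises this event when the Config package is upgraded; FH clears its existing health reports and re-reads settings in place.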
+ private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs<ConfigurationPackage> e)
+ {
+ await ClearExistingHealthReportsAsync();
+ ConfigSettings.UpdateConfigSettings(e.NewPackage.Settings);
+ }
+
+ /* Potential TODOs. This list should grow and external predicates should be written to support related workflow composition in logic rule file(s).
+
+    Symptom                                                  Mitigation
+    -------------------------------------------------------  ---------------------------------------------------
+    Expired Certificate [TP Scenario]                        Modify the cluster manifest AEPCC to true (we already have an automation script for this scenario)
+    Node crash due to lease issue                            Restart the neighboring VM
+    Node crash due to slow network issue                     Restart the VM
+    System Service in quorum loss                            Repair the partition/Restart the VM
+    Node stuck in disabling state due to MR [safety check]   Address safety issue through automation
+    [MR Scenario] Node in down state: MR unable
+    to send the Remove-ServiceFabricNodeState in time        Remove-ServiceFabricNodeState
+    Unused containers fill the disk space                    Call docker prune cmd
+    Primary replica for system service in IB state forever   Restart the primary replica
+ */
+
+ private async Task MonitorHealthEventsAsync()
+ {
+ try
+ {
+ var clusterHealth = await FabricClientSingleton.HealthManager.GetClusterHealthAsync(ConfigSettings.AsyncTimeout, Token);
+
+ if (clusterHealth.AggregatedHealthState == HealthState.Ok)
+ {
+ return;
+ }
+
+ // Check cluster upgrade status. If the cluster is upgrading to a new version (or rolling back)
+ // then do not attempt any repairs.
+ try
+ {
+ string udInClusterUpgrade = await UpgradeChecker.GetCurrentUDWhereFabricUpgradeInProgressAsync(Token);
+
+ if (!string.IsNullOrWhiteSpace(udInClusterUpgrade))
+ {
+ string telemetryDescription = $"Cluster is currently upgrading in UD \"{udInClusterUpgrade}\". Will not schedule or execute repairs at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorHealthEventsAsync::ClusterUpgradeDetected",
+ telemetryDescription,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+ return;
+ }
+ }
+ catch (Exception e) when (e is FabricException || e is TimeoutException)
+ {
+#if DEBUG
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorHealthEventsAsync::HandledException",
+ $"Failure in MonitorHealthEventsAsync::Node:{Environment.NewLine}{e}",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+#endif
+ }
+
+ var unhealthyEvaluations = clusterHealth.UnhealthyEvaluations;
+
+ foreach (var evaluation in unhealthyEvaluations)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ string kind = Enum.GetName(typeof(HealthEvaluationKind), evaluation.Kind);
+
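+ // Substring match handles both singular and aggregate evaluation kinds (Node/Nodes, Application/Applications).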
+ if (kind != null && kind.Contains("Node"))
+ {
+ if (!ConfigSettings.EnableMachineRepair && !ConfigSettings.EnableDiskRepair && !ConfigSettings.EnableFabricNodeRepair)
+ {
+ continue;
+ }
+
+ try
+ {
+ await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates);
+ }
+ catch (Exception e) when (e is FabricException || e is TimeoutException)
+ {
+#if DEBUG
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorHealthEventsAsync::HandledException",
+ $"Failure in MonitorHealthEventsAsync::Node:{Environment.NewLine}{e}",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+#endif
+ }
+ }
+ else if (kind != null && kind.Contains("Application"))
+ {
+ if (!ConfigSettings.EnableAppRepair && !ConfigSettings.EnableSystemAppRepair)
+ {
+ continue;
+ }
+
+ foreach (var app in clusterHealth.ApplicationHealthStates)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ try
+ {
+ var entityHealth =
+ await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(app.ApplicationName, ConfigSettings.AsyncTimeout, Token);
+
+ if (app.AggregatedHealthState == HealthState.Ok)
+ {
+ continue;
+ }
+
+ if (entityHealth.ServiceHealthStates != null && entityHealth.ServiceHealthStates.Any(
+ s => s.AggregatedHealthState == HealthState.Error || s.AggregatedHealthState == HealthState.Warning))
+ {
+ foreach (var service in entityHealth.ServiceHealthStates.Where(
+ s => s.AggregatedHealthState == HealthState.Error || s.AggregatedHealthState == HealthState.Warning))
+ {
+ await ProcessServiceHealthAsync(service);
+ }
+ }
+ else
+ {
+ await ProcessApplicationHealthAsync(app);
+ }
+ }
+ catch (Exception e) when (e is FabricException || e is TimeoutException)
+ {
+#if DEBUG
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorHealthEventsAsync::HandledException",
+ $"Failure in MonitorHealthEventsAsync::Application:{Environment.NewLine}{e}",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+#endif
+ }
+ }
+ }
+ }
+ }
+ catch (Exception e) when (e is ArgumentException || e is FabricException)
+ {
+ // Don't crash.
+ }
+ catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException))
+ {
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Error,
+ "MonitorHealthEventsAsync::UnhandledException",
+ $"Failure in MonitorHealthEventsAsync:{Environment.NewLine}{e}",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ RepairLogger.LogWarning($"Unhandled exception in MonitorHealthEventsAsync:{Environment.NewLine}{e}");
+
+ // Fix the bug(s).
+ throw;
+ }
+ }
+
+ /// <summary>
+ /// Processes SF Application health events, including System Application events (which could have Fabric Node impact repairs).
+ /// </summary>
+ /// <param name="appHealthState">ApplicationHealthState object.</param>
+ /// <returns>A task.</returns>
+ private async Task ProcessApplicationHealthAsync(ApplicationHealthState appHealthState)
+ {
+ ApplicationHealth appHealth = null;
+ Uri appName = appHealthState.ApplicationName;
+
+ // System app target? Do not proceed if system app repair is not enabled.
+ if (appName.OriginalString == RepairConstants.SystemAppName && !ConfigSettings.EnableSystemAppRepair)
+ {
+ return;
+ }
+
+ // User app target? Do not proceed if App repair is not enabled.
+ if (appName.OriginalString != RepairConstants.SystemAppName && !ConfigSettings.EnableAppRepair)
+ {
+ return;
+ }
+
+ appHealth = await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(appName, ConfigSettings.AsyncTimeout, Token);
+
+ if (appName.OriginalString != RepairConstants.SystemAppName)
+ {
+ try
+ {
+ var appUpgradeStatus = await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(appName);
+
+ if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
+ || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress
+ || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending)
+ {
+ List<int> udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(appName, Token);
+ string udText = string.Empty;
+
+ // -1 means no upgrade in progress for application.
+ if (udInAppUpgrade.Any(ud => ud > -1))
+ {
+ udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}";
+ }
+
+ string telemetryDescription = $"{appName} is upgrading {udText}. Will not attempt application repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorRepairableHealthEventsAsync::AppUpgradeDetected",
+ telemetryDescription,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ }
+ catch (FabricException)
+ {
+ // This upgrade check should not prevent moving forward if the fabric client call fails with an FE.
+ }
+ }
+
+ var healthEvents = appHealth.HealthEvents.Where(
+ s => s.HealthInformation.HealthState == HealthState.Warning
+ || s.HealthInformation.HealthState == HealthState.Error);
+
+ foreach (var evt in healthEvents)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description))
+ {
+ continue;
+ }
+
+ // If health data is not a serialized TelemetryData instance, then move along.
+ if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData))
+ {
+ continue;
+ }
+
+ // Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node.
+ // This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case).
+ if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+ else if (_instanceCount > 1)
+ {
+ // Randomly wait to decrease chances of simultaneous ownership among FH instances.
+ await RandomWaitAsync();
+ }
+
+ if (repairData.Code != null && !SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(repairData.Code)
+ || repairData.Code == SupportedErrorCodes.AppErrorNetworkEndpointUnreachable
+ || repairData.Code == SupportedErrorCodes.AppWarningNetworkEndpointUnreachable)
+ {
+ // Network endpoint test failures have no general mitigation.
+ continue;
+ }
+
+ // Get configuration settings related to Application (service code package) repair.
+ List<string> repairRules;
+ string repairId;
+ string system = string.Empty;
+
+ if (appName.OriginalString == RepairConstants.SystemAppName)
+ {
+ if (!ConfigSettings.EnableSystemAppRepair)
+ {
+ continue;
+ }
+
+ // Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster.
+ var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ if (fhRepairTasks.Count > 0)
+ {
+ foreach (var repair in fhRepairTasks)
+ {
+ var executorData = JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData) ? exData : null;
+
+ if (executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartFabricNode &&
+ executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartProcess)
+ {
+ continue;
+ }
+
+ string message = $"A Service Fabric System service repair ({repair.TaskId}) is already in progress in the cluster (state: {repair.State}). " +
+ $"Will not attempt repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"ProcessApplicationHealth::System::{repair.TaskId}",
+ message,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ }
+
+ repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules?.Count == 0)
+ {
+ continue;
+ }
+
+ repairId = $"{repairData.NodeName}_{repairData.ProcessName}_{repairData.Code}";
+ system = "System ";
+
+ var currentRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // Is a repair for the target app service instance already happening in the cluster?
+ // There can be multiple Warnings emitted by FO for a single app at the same time.
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairData.ProcessName)))
+ {
+ var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairData.ProcessName));
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync::{repairData.ProcessName}",
+ $"There is already a repair in progress for Fabric system service {repairData.ProcessName} (state: {repair.State})",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ continue;
+ }
+
+ // Repair already in progress?
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ continue;
+ }
+ }
+ else
+ {
+ if (!ConfigSettings.EnableAppRepair)
+ {
+ continue;
+ }
+
+ // Don't restart thyself.
+ if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+
+ repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ // Nothing to do here.
+ if (repairRules == null || repairRules?.Count == 0)
+ {
+ continue;
+ }
+
+ string serviceProcessName = $"{repairData.ServiceName?.Replace("fabric:/", "").Replace("/", "")}";
+ var currentRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // This is the way each FH repair is ID'd. This data is stored in the related Repair Task's ExecutorData property.
+ repairId = $"{repairData.NodeName}_{serviceProcessName}_{repairData.Metric?.Replace(" ", string.Empty)}";
+
+ // Is a repair for the target app service instance already happening in the cluster?
+ // There can be multiple Warnings emitted by FO for a single app at the same time.
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairId));
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync::{repairData.ServiceName}",
+ $"{appName} already has a repair in progress for service {repairData.ServiceName} (state: {repair.State})",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ continue;
+ }
+ }
+
+ /* Start repair workflow */
+
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = evt.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (evt.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync:{repairId}",
+ $"Detected {errOrWarn} state for Application {repairData.ApplicationName}{Environment.NewLine}" +
+ $"SourceId: {evt.HealthInformation.SourceId}{Environment.NewLine}" +
+ $"Property: {evt.HealthInformation.Property}{Environment.NewLine}" +
+ $"{system}Application repair policy is enabled. " +
+ $"{repairRules.Count} Logic rules found for {system}Application-level repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ this.repairTaskManager.DetectedHealthEvents.Add(evt);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+ }
+
+ private async Task ProcessServiceHealthAsync(ServiceHealthState serviceHealthState)
+ {
+ // This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
+ var nodeQueryDesc = new NodeQueryDescription
+ {
+ MaxResults = 3,
+ };
+
+ NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
+ () => FabricClientSingleton.QueryManager.GetNodePagedListAsync(
+ nodeQueryDesc,
+ ConfigSettings.AsyncTimeout,
+ Token),
+ Token);
+
+ ServiceHealth serviceHealth;
+ Uri appName;
+ Uri serviceName = serviceHealthState.ServiceName;
+
+ // User service target? Do not proceed if App repair is not enabled.
+ if (!serviceName.OriginalString.Contains(RepairConstants.SystemAppName) && !ConfigSettings.EnableAppRepair)
+ {
+ return;
+ }
+
+ // System service target? Do not proceed if system app repair is not enabled.
+ if (serviceName.OriginalString.Contains(RepairConstants.SystemAppName) && !ConfigSettings.EnableSystemAppRepair)
+ {
+ return;
+ }
+
+ serviceHealth = await FabricClientSingleton.HealthManager.GetServiceHealthAsync(serviceName, ConfigSettings.AsyncTimeout, Token);
+ var name = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token);
+ appName = name.ApplicationName;
+
+ // user service - upgrade check.
+ if (!appName.OriginalString.Contains(RepairConstants.SystemAppName) && !serviceName.OriginalString.Contains(RepairConstants.SystemAppName))
+ {
+ try
+ {
+ var app = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token);
+ var appUpgradeStatus = await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(app.ApplicationName);
+
+ if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress
+ || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress
+ || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending)
+ {
+ List<int> udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(serviceName, Token);
+ string udText = string.Empty;
+
+ // -1 means no upgrade in progress for application.
+ if (udInAppUpgrade.Any(ud => ud > -1))
+ {
+ udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}";
+ }
+
+ string telemetryDescription = $"{app.ApplicationName} is upgrading {udText}. Will not attempt service repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ "MonitorRepairableHealthEventsAsync::AppUpgradeDetected",
+ telemetryDescription,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ }
+ catch (FabricException)
+ {
+ // This upgrade check should not prevent moving forward if the fabric client call fails with an FE.
+ }
+ }
+
+ var healthEvents = serviceHealth.HealthEvents.Where(
+ e => e.HealthInformation.HealthState == HealthState.Warning || e.HealthInformation.HealthState == HealthState.Error);
+
+ // Replica repair. This only makes sense if a partition is in Error or Warning state (and Replica repair is still experimental for FH).
+ if (ConfigSettings.EnableReplicaRepair)
+ {
+ var partitionHealthStates = serviceHealth.PartitionHealthStates.Where(
+ p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error);
+
+ if (partitionHealthStates.Any())
+ {
+ await ProcessReplicaHealthAsync(serviceHealth);
+ return;
+ }
+ }
+
+ foreach (var evt in healthEvents)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description))
+ {
+ continue;
+ }
+
+ // If health data is not a serialized instance of a type that implements ITelemetryData, then move along.
+ if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData))
+ {
+ continue;
+ }
+
+ // No service name provided. No service.
+ if (string.IsNullOrEmpty(repairData.ServiceName))
+ {
+ continue;
+ }
+
+ if (repairData.ServiceName.ToLower() != serviceName.OriginalString.ToLower())
+ {
+ continue;
+ }
+
+ // PartitionId is Guid? in ITelemetryData.
+ if (repairData.PartitionId == null)
+ {
+ continue;
+ }
+
+ if (string.IsNullOrWhiteSpace(repairData.ApplicationName))
+ {
+ repairData.ApplicationName = appName.OriginalString;
+ }
+
+ // Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node.
+ // This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case).
+ if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+ else if (_instanceCount > 1)
+ {
+ // Randomly wait to decrease chances of simultaneous ownership among FH instances.
+ await RandomWaitAsync();
+ }
+
+ if (repairData.Code != null && !SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(repairData.Code)
+ || repairData.Code == SupportedErrorCodes.AppErrorNetworkEndpointUnreachable
+ || repairData.Code == SupportedErrorCodes.AppWarningNetworkEndpointUnreachable)
+ {
+ // Network endpoint test failures have no general mitigation.
+ continue;
+ }
+
+ // Get configuration settings related to Application (service code package) repair.
+ List<string> repairRules;
+ string repairId;
+ string system = string.Empty;
+
+ if (serviceName.OriginalString.Contains(RepairConstants.SystemAppName))
+ {
+ if (!ConfigSettings.EnableSystemAppRepair)
+ {
+ continue;
+ }
+
+ // Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster.
+ var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(
+ RepairTaskEngine.FabricHealerExecutorName,
+ Token);
+
+ if (fhRepairTasks.Count > 0)
+ {
+ foreach (var repair in fhRepairTasks)
+ {
+ var executorData = JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData) ? exData : null;
+
+ if (executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartFabricNode &&
+ executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartProcess)
+ {
+ continue;
+ }
+
+ string message = $"A Service Fabric System service repair ({repair.TaskId}) is already in progress in the cluster (state: {repair.State}). " +
+ $"Will not attempt repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"ProcessApplicationHealth::System::{repair.TaskId}",
+ message,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ }
+
+ repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules?.Count == 0)
+ {
+ continue;
+ }
+
+ repairId = $"{repairData.NodeName}_{serviceName.OriginalString}_{repairData.Code}";
+ system = "System ";
+
+ var currentRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // Is a repair for the target app service instance already happening in the cluster?
+ // There can be multiple Warnings emitted by FO for a single app at the same time.
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(serviceName.OriginalString)))
+ {
+ var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(serviceName.OriginalString));
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync::{serviceName.OriginalString}",
+ $"There is already a repair in progress for Fabric system service {serviceName.OriginalString} (state: {repair.State})",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ continue;
+ }
+
+ // Repair already in progress?
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ continue;
+ }
+ }
+ else
+ {
+ if (!ConfigSettings.EnableAppRepair)
+ {
+ continue;
+ }
+
+ // Don't restart thyself.
+ if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+
+ repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ // Nothing to do here.
+ if (repairRules == null || repairRules?.Count == 0)
+ {
+ continue;
+ }
+
+ string serviceProcessName = $"{repairData.ServiceName.Replace("fabric:/", "").Replace("/", "")}";
+ var currentFHRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // This is the way each FH repair is ID'd. This data is stored in the related Repair Task's ExecutorData property.
+ repairId = $"{repairData.NodeName}_{serviceProcessName}_{repairData.Metric?.Replace(" ", string.Empty)}";
+
+ // All FH repairs have serialized instances of RepairExecutorData set as the value for a RepairTask's ExecutorData property.
+ if (currentFHRepairs?.Count > 0)
+ {
+ // This prevents creating a new repair if another instance of the service running on a different node needs to be restarted, for example.
+ // Think of this as a UD walk across the nodes hosting service instances in need of repair.
+ if (ConfigSettings.EnableRollingServiceRestarts
+ && nodes?.Count > 1
+ && currentFHRepairs.Any(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData)
+ && execData?.RepairData?.ServiceName?.ToLower() == repairData.ServiceName.ToLower()))
+ {
+ var repair = currentFHRepairs.FirstOrDefault(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData)
+ && execData.RepairData.ServiceName.ToLower() == repairData.ServiceName.ToLower());
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"{serviceProcessName}_RollingRepairInProgress",
+ $"There is currently a rolling repair in progress for service {repairData.ServiceName}. Current node: {repair.Target}. Repair State: {repair.State}.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ // For the case where a service repair is still not Completed (e.g., the repair state is Restoring, which happens after the repair executor has completed
+ // its work but RM is performing post-repair safety checks; safety checks can be enabled/disabled in logic rules).
+ else if (currentFHRepairs.Any(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData)
+ && execData?.RepairData?.RepairPolicy?.RepairId == repairId))
+ {
+ var repair = currentFHRepairs.FirstOrDefault(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData)
+ && execData.RepairData.RepairPolicy.RepairId == repairId);
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"{serviceProcessName}_RepairAlreadyInProgress",
+ $"There is currently a repair in progress for service {repairData.ServiceName} on node {repairData.NodeName}. Repair State: {repair.State}.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ // This means no other service instance will get repaired anywhere in the cluster if a service repair job for the same service is
+ // already taking place on any node. This only takes effect if config setting EnableRollingServiceRestarts is set to true.
+ }
+ }
+
+ /* Start repair workflow */
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = evt.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (evt.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync:{repairId}",
+ $"Detected {errOrWarn} state for Service {repairData.ServiceName}{Environment.NewLine}" +
+ $"SourceId: {evt.HealthInformation.SourceId}{Environment.NewLine}" +
+ $"Property: {evt.HealthInformation.Property}{Environment.NewLine}" +
+ $"{system}Application repair policy is enabled. " +
+ $"{repairRules.Count} Logic rules found for {system}Service-level repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(evt);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+ }
+
+ private async Task ProcessNodeHealthAsync(IEnumerable<NodeHealthState> nodeHealthStates)
+ {
+ // This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
+ var nodeQueryDesc = new NodeQueryDescription
+ {
+ MaxResults = 3,
+ };
+
+ NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
+ () => FabricClientSingleton.QueryManager.GetNodePagedListAsync(
+ nodeQueryDesc,
+ ConfigSettings.AsyncTimeout,
+ Token),
+ Token);
+
+ // Node/Machine repair is not supported in onebox (single node) clusters.
+ if (nodes?.Count == 1)
+ {
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"ProcessMachineHealth::NotSupported",
+ "Machine/Fabric Node repair is not supported in clusters with 1 Fabric node.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+
+ var supportedNodeHealthStates =
+ nodeHealthStates.Where(a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error);
+
+ foreach (var node in supportedNodeHealthStates)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ var nodeList = await FabricClientSingleton.QueryManager.GetNodeListAsync(node.NodeName, ConfigSettings.AsyncTimeout, Token);
+ string nodeType = nodeList[0].NodeType;
+ string nodeUD = nodeList[0].UpgradeDomain;
+ var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(node.NodeName, ConfigSettings.AsyncTimeout, Token);
+ var nodeHealthEvents =
+ nodeHealth.HealthEvents.Where(
+ s => (s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error));
+
+ // Ensure a node in Error is not in error due to being down as part of a cluster upgrade or infra update in its UD.
+ if (node.AggregatedHealthState == HealthState.Error)
+ {
+ string udInClusterUpgrade = await UpgradeChecker.GetCurrentUDWhereFabricUpgradeInProgressAsync(Token);
+
+ if (!string.IsNullOrWhiteSpace(udInClusterUpgrade) && udInClusterUpgrade == nodeUD)
+ {
+ string telemetryDescription =
+ $"Cluster is currently upgrading in UD \"{udInClusterUpgrade}\", which is the UD for node {node.NodeName}, which is down. " +
+ "Will not schedule or execute node-level repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"{node.NodeName}_down_ClusterUpgrade({nodeUD})",
+ telemetryDescription,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ continue;
+ }
+
+ // Check to see if an Azure tenant/platform update is in progress for target node. Do not conduct repairs if so.
+ if (await UpgradeChecker.IsAzureUpdateInProgress(nodeType, node.NodeName, Token))
+ {
+ string telemetryDescription = $"{node.NodeName} is down due to Infra repair job (UD = {nodeUD}). Will not attempt node repair at this time.";
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"{node.NodeName}_down_InfraRepair",
+ telemetryDescription,
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ continue;
+ }
+ }
+
+ foreach (var evt in nodeHealthEvents)
+ {
+ Token.ThrowIfCancellationRequested();
+
+ // Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
+ await RandomWaitAsync();
+
+ // Was health event generated by FO or FHProxy?
+ if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData))
+ {
+ // This will enable Machine-level repair (reboot, reimage) based on a detected SF Node Health Event that was not generated by FO/FHProxy.
+ repairData = new TelemetryData
+ {
+ NodeName = node.NodeName,
+ NodeType = nodeType,
+ EntityType = EntityType.Machine,
+ Description = evt.HealthInformation.Description,
+ HealthState = evt.HealthInformation.HealthState,
+ Source = RepairConstants.FabricHealer
+ };
+ }
+ else
+ {
+ // Nothing to do here.
+ if (repairData.EntityType == EntityType.Unknown)
+ {
+ continue;
+ }
+
+ // Disk?
+ if (repairData.EntityType == EntityType.Disk && ConfigSettings.EnableDiskRepair)
+ {
+ await ProcessDiskHealthAsync(evt, repairData);
+ continue;
+ }
+
+ // Fabric node?
+ if (repairData.EntityType == EntityType.Node && ConfigSettings.EnableFabricNodeRepair)
+ {
+ // FabricHealerProxy-generated report, so a restart fabric node request, for example.
+ await ProcessFabricNodeHealthAsync(evt, repairData);
+ continue;
+ }
+ }
+
+ // Machine repair \\
+
+ if (!ConfigSettings.EnableMachineRepair)
+ {
+ continue;
+ }
+
+ // If there are multiple instances of FH deployed to the cluster (like -1 InstanceCount), then don't schedule machine repairs for the node this
+ // FH instance runs on; another instance on a different node will take the job. Only DiskObserver-generated repair data has to be handled on the node
+ // where FO's DiskObserver emitted the related information (like disk space issues and the need to clean folders specified in logic rules), for example.
+ if ((_instanceCount == -1 || _instanceCount > 2) && node.NodeName == serviceContext.NodeContext.NodeName)
+ {
+ continue;
+ }
+
+ // Make sure that there is not already an Infra repair in progress for the target node.
+ var currentISRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync($"{RepairConstants.InfrastructureServiceName}/{nodeType}", Token);
+
+ if (currentISRepairs?.Count > 0)
+ {
+ if (currentISRepairs.Any(r => r.Description.Contains(node.NodeName)))
+ {
+ var repair = currentISRepairs.FirstOrDefault(r => r.Description.Contains(node.NodeName));
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"{node.NodeName}_MachineRepairAlreadyInProgress",
+ $"There is currently a Machine repair in progress for node {node.NodeName}. Repair State: {repair.State}.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+ }
+
+ // Get repair rules for supplied facts (TelemetryData).
+ var repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules.Count == 0)
+ {
+ continue;
+ }
+
+ /* Start repair workflow */
+
+ string repairId = $"Machine_Repair_{nodeType}_{repairData.NodeName}";
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = evt.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (evt.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"Detected Fabric node {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Machine repair target specified. {repairRules.Count} logic rules found for Machine repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(evt);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+ }
+ }
+
+ private async Task ProcessDiskHealthAsync(HealthEvent evt, TelemetryData repairData)
+ {
+ // Can only repair local disks.
+ if (repairData.NodeName != serviceContext.NodeContext.NodeName)
+ {
+ return;
+ }
+
+ // Get repair rules for supported source Observer.
+ var repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules.Count == 0)
+ {
+ return;
+ }
+
+ /* Start repair workflow */
+
+ string repairId = $"Disk_Repair_{repairData.Code}{repairData.NodeName}_DeleteFiles";
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = evt.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (evt.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"Detected {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Disk repair is enabled. {repairRules.Count} logic rules found for Disk repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(evt);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+
+ private async Task ProcessFabricNodeHealthAsync(HealthEvent healthEvent, TelemetryData repairData)
+ {
+ var repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules.Count == 0)
+ {
+ return;
+ }
+
+ // There is only one supported repair for a FabricNode: Restart.
+ string repairId = $"{repairData.NodeName}_{repairData.NodeType}_Restart";
+
+ var currentRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // Block attempts to schedule another Fabric node-level repair for the same node while a current repair has not yet completed.
+ var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairId));
+
+ if (repair != null)
+ {
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"There is already a repair in progress for Fabric node {repairData.NodeName}(state: {repair.State})",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = healthEvent.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (repairData.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"Detected {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Fabric Node repair is enabled. {repairRules.Count} Logic rules found for Fabric Node repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(healthEvent);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+
+ // This is an example of a repair for a non-FO-originating health event. This function needs some work, but you get the basic idea here.
+ // FO does not support replica monitoring and as such it does not emit specific error codes that FH recognizes.
+ // *This is an experimental function/workflow in need of more testing.*
+ private async Task ProcessReplicaHealthAsync(ServiceHealth serviceHealth)
+ {
+ // Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
+ await RandomWaitAsync();
+
+ /* Example of repairable problem at Replica level, as health event:
+
+ [SourceId] ='System.RAP' reported Warning/Error for property...
+ [Property] = 'IStatefulServiceReplica.ChangeRole(N)Duration'.
+ [Description] = The api IStatefulServiceReplica.ChangeRole(N) on node [NodeName] is stuck.
+
+ Start Time (UTC): 2020-04-26 19:22:55.492.
+ */
+
+ List<HealthEvent> healthEvents = new List<HealthEvent>();
+ var partitionHealthStates = serviceHealth.PartitionHealthStates.Where(
+ p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error);
+
+ foreach (var partitionHealthState in partitionHealthStates)
+ {
+ PartitionHealth partitionHealth =
+ await FabricClientSingleton.HealthManager.GetPartitionHealthAsync(partitionHealthState.PartitionId, ConfigSettings.AsyncTimeout, Token);
+
+ List<ReplicaHealthState> replicaHealthStates = partitionHealth.ReplicaHealthStates.Where(
+ p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error).ToList();
+
+ if (replicaHealthStates != null && replicaHealthStates.Count > 0)
+ {
+ foreach (var rep in replicaHealthStates)
+ {
+ var replicaHealth =
+ await FabricClientSingleton.HealthManager.GetReplicaHealthAsync(partitionHealthState.PartitionId, rep.Id, ConfigSettings.AsyncTimeout, Token);
+
+ if (replicaHealth != null)
+ {
+ healthEvents = replicaHealth.HealthEvents.Where(
+ h => h.HealthInformation.HealthState == HealthState.Warning || h.HealthInformation.HealthState == HealthState.Error).ToList();
+
+ foreach (HealthEvent healthEvent in healthEvents)
+ {
+ if (!healthEvent.HealthInformation.SourceId.Contains("System.RAP"))
+ {
+ continue;
+ }
+
+ if (!healthEvent.HealthInformation.Property.Contains("IStatefulServiceReplica.ChangeRole") &&
+ !healthEvent.HealthInformation.Property.Contains("IReplicator.BuildReplica"))
+ {
+ continue;
+ }
+
+ if (!healthEvent.HealthInformation.Description.Contains("is stuck"))
+ {
+ continue;
+ }
+
+ var app = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(
+ serviceHealth.ServiceName,
+ ConfigSettings.AsyncTimeout,
+ Token);
+
+ var replicaList = await FabricClientSingleton.QueryManager.GetReplicaListAsync(
+ rep.PartitionId,
+ rep.Id,
+ ConfigSettings.AsyncTimeout,
+ Token);
+ // Replica still exists?
+ if (replicaList.Count == 0)
+ {
+ continue;
+ }
+
+ var appName = app?.ApplicationName?.OriginalString;
+ var replica = replicaList[0];
+ var nodeName = replica?.NodeName;
+
+ // Get configuration settings related to Replica repair.
+ var repairRules = GetRepairRulesFromConfiguration(RepairConstants.ReplicaRepairPolicySectionName);
+
+ if (repairRules == null || !repairRules.Any())
+ {
+ continue;
+ }
+
+ var repairData = new TelemetryData
+ {
+ ApplicationName = appName,
+ EntityType = EntityType.Replica,
+ HealthState = healthEvent.HealthInformation.HealthState,
+ NodeName = nodeName,
+ ReplicaId = rep.Id,
+ PartitionId = rep.PartitionId,
+ ServiceName = serviceHealth.ServiceName.OriginalString,
+ Source = RepairConstants.FabricHealerAppName
+ };
+
+ string errOrWarn = "Error";
+
+ if (healthEvent.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ string repairId = $"{nodeName}_{serviceHealth.ServiceName.OriginalString.Remove(0, appName.Length + 1)}_{repairData.PartitionId}";
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = healthEvent.HealthInformation.Property;
+
+ // Repair already in progress?
+ var currentRepairs = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ continue;
+ }
+
+ /* Start repair workflow */
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync:Replica_{rep.Id}_{errOrWarn}",
+ $"Detected Replica {rep.Id} on Partition " +
+ $"{rep.PartitionId} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Replica repair policy is enabled. " +
+ $"{repairRules.Count} Logic rules found for Replica repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+ }
+ }
+ }
+ }
+ }
+
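+ // Maps a FabricObserver-generated error/warning code to the repair policy section whose logic rules handle it.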
+ private List<string> GetRepairRulesFromErrorCode(string errorCode, string app = null)
+ {
+ if (!SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(errorCode)
+ && !SupportedErrorCodes.NodeErrorCodesDictionary.ContainsKey(errorCode))
+ {
+ return null;
+ }
+
+ string repairPolicySectionName;
+
+ switch (errorCode)
+ {
+ // App repair (user and system).
+ case SupportedErrorCodes.AppErrorCpuPercent:
+ case SupportedErrorCodes.AppErrorMemoryMB:
+ case SupportedErrorCodes.AppErrorMemoryPercent:
+ case SupportedErrorCodes.AppErrorTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.AppErrorTooManyActiveTcpPorts:
+ case SupportedErrorCodes.AppErrorTooManyOpenFileHandles:
+ case SupportedErrorCodes.AppErrorTooManyThreads:
+ case SupportedErrorCodes.AppWarningCpuPercent:
+ case SupportedErrorCodes.AppWarningMemoryMB:
+ case SupportedErrorCodes.AppWarningMemoryPercent:
+ case SupportedErrorCodes.AppWarningTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.AppWarningTooManyActiveTcpPorts:
+ case SupportedErrorCodes.AppWarningTooManyOpenFileHandles:
+ case SupportedErrorCodes.AppWarningTooManyThreads:
+
+ repairPolicySectionName =
+ app == RepairConstants.SystemAppName ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // VM repair.
+ case SupportedErrorCodes.NodeErrorCpuPercent:
+ case SupportedErrorCodes.NodeErrorMemoryMB:
+ case SupportedErrorCodes.NodeErrorMemoryPercent:
+ case SupportedErrorCodes.NodeErrorTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.NodeErrorTooManyActiveTcpPorts:
+ case SupportedErrorCodes.NodeErrorTotalOpenFileHandlesPercent:
+ case SupportedErrorCodes.NodeWarningCpuPercent:
+ case SupportedErrorCodes.NodeWarningMemoryMB:
+ case SupportedErrorCodes.NodeWarningMemoryPercent:
+ case SupportedErrorCodes.NodeWarningTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.NodeWarningTooManyActiveTcpPorts:
+ case SupportedErrorCodes.NodeWarningTotalOpenFileHandlesPercent:
+
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ // Disk repair.
+ case SupportedErrorCodes.NodeWarningDiskSpaceMB:
+ case SupportedErrorCodes.NodeErrorDiskSpaceMB:
+ case SupportedErrorCodes.NodeWarningDiskSpacePercent:
+ case SupportedErrorCodes.NodeErrorDiskSpacePercent:
+ case SupportedErrorCodes.NodeWarningFolderSizeMB:
+ case SupportedErrorCodes.NodeErrorFolderSizeMB:
+
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
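+ // Maps a supported FabricObserver observer name to its repair policy section.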
+ private List<string> GetRepairRulesForSupportedObserver(string observerName)
+ {
+ string repairPolicySectionName;
+
+ switch (observerName)
+ {
+ // App repair (user).
+ case RepairConstants.AppObserver:
+
+ repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // System service repair.
+ case RepairConstants.FabricSystemObserver:
+ repairPolicySectionName = RepairConstants.SystemServiceRepairPolicySectionName;
+ break;
+
+ // Disk repair
+ case RepairConstants.DiskObserver:
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ // VM repair.
+ case RepairConstants.NodeObserver:
+
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
+ /// <summary>
+ /// Get a list of rules based on facts held in a TelemetryData instance.
+ /// </summary>
+ /// <param name="repairData">Instance of TelemetryData that FabricHealer created itself or deserialized from a Health Event Description.</param>
+ /// <returns>A list of repair rules, or null if none apply.</returns>
+ private List<string> GetRepairRulesForTelemetryData(TelemetryData repairData)
+ {
+ string repairPolicySectionName;
+
+ switch (repairData.EntityType)
+ {
+ case EntityType.Unknown:
+
+ return null;
+
+ // App/Service repair (user).
+ case EntityType.Application when repairData.ApplicationName?.ToLower() != RepairConstants.SystemAppName.ToLower():
+ case EntityType.Service:
+ case EntityType.StatefulService:
+ case EntityType.StatelessService:
+ repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // System service process repair.
+ case EntityType.Application when repairData.ApplicationName?.ToLower() == RepairConstants.SystemAppName.ToLower() && repairData.ProcessName != null:
+ case EntityType.Process when repairData.ProcessName != null || repairData.ProcessId > 0:
+ repairPolicySectionName = RepairConstants.SystemServiceRepairPolicySectionName;
+ break;
+
+ // Disk repair.
+ case EntityType.Disk when serviceContext.NodeContext.NodeName == repairData.NodeName:
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ // Machine repair.
+ case EntityType.Machine:
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ // Fabric Node repair (from FabricHealerProxy, for example, where there is no concept of Observer).
+ case EntityType.Node:
+ repairPolicySectionName = RepairConstants.FabricNodeRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
+ private List<string> GetRepairRulesFromConfiguration(string repairPolicySectionName)
+ {
+ try
+ {
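+ // The rules file name comes from the policy section's LogicRulesConfigurationFile parameter in Settings.xml.
+ // Illustrative layout (section and file names shown are hypothetical):
+ //   <Section Name="AppRepairPolicy">
+ //     <Parameter Name="LogicRulesConfigurationFile" Value="AppRules.guan" />
+ //   </Section>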
+ string logicRulesConfigFileName =
+ GetSettingParameterValue(serviceContext, repairPolicySectionName, RepairConstants.LogicRulesConfigurationFile);
+ var configPath = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path;
+ var rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName);
+ var rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName);
+
+ if (!File.Exists(rulesFilePath))
+ {
+ return null;
+ }
+
+ string[] rules = File.ReadAllLines(rulesFilePath);
+
+ if (rules.Length == 0)
+ {
+ return null;
+ }
+
+ List<string> repairRules = ParseRulesFile(rules);
+ return repairRules;
+ }
+ catch (Exception ex) when (ex is ArgumentException || ex is IOException)
+ {
+ return null;
+ }
+ }
+
+ private int GetEnabledRepairRuleCount()
+ {
+ var config = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
+ int count = 0;
+
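+ // A repair type is considered enabled when its *RepairPolicy section in Settings.xml has Enabled set to "true".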
+ foreach (var section in config.Settings.Sections)
+ {
+ if (!section.Name.Contains(RepairConstants.RepairPolicy))
+ {
+ continue;
+ }
+
+ if (section.Parameters[RepairConstants.Enabled]?.Value?.ToLower() == "true")
+ {
+ count++;
+ }
+ }
+
+ return count;
+ }
+
+ private void Dispose(bool disposing)
+ {
+ if (disposedValue)
+ {
+ return;
+ }
+
+ if (disposing)
+ {
+ _fabricClient?.Dispose();
+ }
+
+ disposedValue = true;
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ }
+
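+ // Parses a logic rules file into individual rules: '##' lines are comments, a rule terminates with '.', and a rule
+ // may span multiple lines (ptr1 marks the first line of the current rule, ptr2 scans forward to its terminating line).
+ // Illustrative rule shape (not a verbatim rule): Mitigate(MetricName="DiskSpacePercent") :- SomeRepairPredicate(...).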
+ private static List<string> ParseRulesFile(string[] rules)
+ {
+ var repairRules = new List<string>();
+ int ptr1 = 0, ptr2 = 0;
+ rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToArray();
+
+ while (ptr1 < rules.Length && ptr2 < rules.Length)
+ {
+ // Single line comments removal.
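+ // Note: this assumes a '##' comment line never appears in the middle of a multi-line rule.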
+ if (rules[ptr2].TrimStart().StartsWith("##"))
+ {
+ ptr1++;
+ ptr2++;
+ continue;
+ }
+
+ if (rules[ptr2].TrimEnd().EndsWith("."))
+ {
+ if (ptr1 == ptr2)
+ {
+ string rule = rules[ptr2].TrimEnd();
+ repairRules.Add(rule.Remove(rule.Length - 1, 1));
+ }
+ else
+ {
+ string rule = rules[ptr1].Trim();
+
+ for (int i = ptr1 + 1; i <= ptr2; i++)
+ {
+ rule = rule + ' ' + rules[i].Replace('\t', ' ').Trim();
+ }
+
+ repairRules.Add(rule.Remove(rule.Length - 1, 1));
+ }
+ ptr2++;
+ ptr1 = ptr2;
+ }
+ else
+ {
+ ptr2++;
+ }
+ }
+
+ return repairRules;
+ }
+
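+ // The wait window scales with cluster size, lowering the odds that multiple FH instances act on the same event concurrently.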
+ private async Task RandomWaitAsync()
+ {
+ var random = new Random();
+ int waitTimeMS = random.Next(random.Next(100, nodeCount * 100), 1000 * nodeCount);
+
+ await Task.Delay(waitTimeMS, Token);
+ }
+
+ // https://stackoverflow.com/questions/25678690/how-can-i-check-github-releases-in-c
+ private async Task CheckGithubForNewVersionAsync()
+ {
+ try
+ {
+ var githubClient = new GitHubClient(new ProductHeaderValue(RepairConstants.FabricHealer));
+ IReadOnlyList<Release> releases = await githubClient.Repository.Release.GetAll("microsoft", "service-fabric-healer");
+
+ if (releases.Count == 0)
+ {
+ return;
+ }
+
+ string releaseAssetName = releases[0].Name;
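+ // Assumes the release Name has the form "<product> <version>", e.g. "FabricHealer 1.1.0".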
+ string latestVersion = releaseAssetName.Split(" ")[1];
+ Version latestGitHubVersion = new Version(latestVersion);
+ Version localVersion = new Version(InternalVersionNumber);
+ int versionComparison = localVersion.CompareTo(latestGitHubVersion);
+
+ if (versionComparison < 0)
+ {
+ string message = $"A newer version of FabricHealer is available: {latestVersion}";
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ RepairConstants.FabricHealer,
+ message,
+ Token,
+ null,
+ true,
+ TimeSpan.FromDays(1),
+ "NewVersionAvailable",
+ EntityType.Application);
+ }
+ }
+ catch (Exception e)
+ {
+ // Don't take down FH due to an error in the version check.
+ RepairLogger.LogWarning($"Failure in CheckGithubForNewVersionAsync:{Environment.NewLine}{e}");
+ }
+ }
+
+ private async Task ClearExistingHealthReportsAsync()
+ {
+ try
+ {
+ var healthReporter = new FabricHealthReporter(RepairLogger);
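+ // "Clearing" means re-reporting each FH-sourced event with HealthState.Ok and a short TTL so it expires quickly.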
+ var healthReport = new HealthReport
+ {
+ HealthMessage = "Clearing existing health reports as FabricHealer is stopping or updating.",
+ NodeName = serviceContext.NodeContext.NodeName,
+ State = HealthState.Ok,
+ HealthReportTimeToLive = TimeSpan.FromMinutes(5),
+ };
+
+ var appName = new Uri(RepairConstants.FabricHealerAppName);
+ var appHealth = await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(appName);
+ var FHAppEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains($"{RepairConstants.FabricHealer}."));
+
+ foreach (HealthEvent evt in FHAppEvents)
+ {
+ healthReport.AppName = appName;
+ healthReport.Property = evt.HealthInformation.Property;
+ healthReport.SourceId = evt.HealthInformation.SourceId;
+ healthReport.EntityType = EntityType.Application;
+
+ healthReporter.ReportHealthToServiceFabric(healthReport);
+ Thread.Sleep(50);
+ }
+
+ var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(serviceContext.NodeContext.NodeName);
+ var FHNodeEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(RepairConstants.FabricHealer));
+
+ foreach (HealthEvent evt in FHNodeEvents)
+ {
+ healthReport.Property = evt.HealthInformation.Property;
+ healthReport.SourceId = evt.HealthInformation.SourceId;
+ healthReport.EntityType = EntityType.Node;
+
+ healthReporter.ReportHealthToServiceFabric(healthReport);
+ Thread.Sleep(50);
+ }
+ }
+ catch (FabricException)
+ {
+ // Best effort: ignore transient Fabric errors while clearing reports during shutdown or upgrade.
+ }
+ }
+
+ private string GetServiceFabricRuntimeVersion()
+ {
+ try
+ {
+ var config = ServiceFabricConfiguration.Instance;
+ return config.FabricVersion;
+ }
+ catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException))
+ {
+ RepairLogger.LogWarning($"GetServiceFabricRuntimeVersion failure:{Environment.NewLine}{e}");
+ }
+
+ return null;
+ }
+ }
+}
diff --git a/FabricHealerProxy/EntityType.cs b/FabricHealerProxy/EntityType.cs
index c0bbf77..b6bd855 100644
--- a/FabricHealerProxy/EntityType.cs
+++ b/FabricHealerProxy/EntityType.cs
@@ -11,9 +11,9 @@ namespace FabricHealer
public enum EntityType
{
/// <summary>
- /// Invalid (default value).
+ /// Unknown (default value).
/// </summary>
- Invalid,
+ Unknown,
/// <summary>
/// Application type.
/// </summary>
diff --git a/FabricHealerProxy/FabricHealerProxy.cs b/FabricHealerProxy/FabricHealerProxy.cs
index c8f9041..d5e24b1 100644
--- a/FabricHealerProxy/FabricHealerProxy.cs
+++ b/FabricHealerProxy/FabricHealerProxy.cs
@@ -254,22 +254,22 @@ namespace FabricHealer
ManageRepairDataHistory(cancellationToken);
// Support not specifying EntityType.
- if (!string.IsNullOrWhiteSpace(repairData.ServiceName) && repairData.EntityType == EntityType.Invalid)
+ if (!string.IsNullOrWhiteSpace(repairData.ServiceName) && repairData.EntityType == EntityType.Unknown)
{
repairData.EntityType = EntityType.Service;
}
- else if (repairData.ReplicaId > 0 && repairData.EntityType == EntityType.Invalid)
+ else if (repairData.ReplicaId > 0 && repairData.EntityType == EntityType.Unknown)
{
repairData.EntityType = EntityType.Replica;
}
- else if ((repairData.ProcessId > 0 || !string.IsNullOrWhiteSpace(repairData.ProcessName)) && repairData.EntityType == EntityType.Invalid)
+ else if ((repairData.ProcessId > 0 || !string.IsNullOrWhiteSpace(repairData.ProcessName)) && repairData.EntityType == EntityType.Unknown)
{
repairData.EntityType = EntityType.Process;
}
else if (!string.IsNullOrEmpty(repairData.NodeName) &&
string.IsNullOrWhiteSpace(repairData.ApplicationName) &&
string.IsNullOrWhiteSpace(repairData.ServiceName) &&
- repairData.EntityType == EntityType.Invalid || repairData.EntityType == EntityType.Machine)
+ repairData.EntityType == EntityType.Unknown || repairData.EntityType == EntityType.Machine)
{
repairData.EntityType = repairData.EntityType == EntityType.Machine ? EntityType.Machine : EntityType.Node;