diff --git a/FHTest/FHUnitTests.cs b/FHTest/FHUnitTests.cs index 30f5f06..1a1f228 100644 --- a/FHTest/FHUnitTests.cs +++ b/FHTest/FHUnitTests.cs @@ -510,7 +510,7 @@ namespace FHTest var (generatedWarning, data) = await IsEntityInWarningStateAsync(null, RepairFactsServiceTarget.ServiceName); Assert.IsTrue(generatedWarning); - Assert.IsTrue(data is TelemetryData); + Assert.IsTrue(data != null); } [TestMethod] @@ -529,7 +529,7 @@ namespace FHTest var (generatedWarning, data) = await IsEntityInWarningStateAsync(null, null, NodeName); Assert.IsTrue(generatedWarning); - Assert.IsTrue(data is TelemetryData); + Assert.IsTrue(data != null); } [TestMethod] @@ -624,13 +624,13 @@ namespace FHTest { var (generatedWarningService, sdata) = await IsEntityInWarningStateAsync(null, repair.ServiceName); Assert.IsTrue(generatedWarningService); - Assert.IsTrue(sdata is TelemetryData); + Assert.IsTrue(sdata != null); } else if (repair.EntityType == EntityType.Disk || repair.EntityType == EntityType.Machine || repair.EntityType == EntityType.Node) { var (generatedWarningNode, ndata) = await IsEntityInWarningStateAsync(null, null, NodeName); Assert.IsTrue(generatedWarningNode); - Assert.IsTrue(ndata is TelemetryData); + Assert.IsTrue(ndata != null); } // FHProxy creates or renames Source with trailing id ("FabricHealerProxy"); diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index 98fc841..3c7ea41 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -1430,7 +1430,7 @@ namespace FabricHealer else { // Nothing to do here. - if (repairData.EntityType == EntityType.Invalid) + if (repairData.EntityType == EntityType.Unknown) { continue; } diff --git a/FabricHealer/Repair/RepairTaskManager.cs b/FabricHealer/Repair/RepairTaskManager.cs index 76fc6f3..acee75f 100644 --- a/FabricHealer/Repair/RepairTaskManager.cs +++ b/FabricHealer/Repair/RepairTaskManager.cs @@ -522,7 +522,8 @@ namespace FabricHealer.Repair Stopwatch stopWatch = Stopwatch.StartNew(); bool isApproved = false; - var repairs = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, cancellationToken); + var repairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, cancellationToken); if (repairs.All(repair => repair.TaskId != repairTask.TaskId)) { @@ -687,7 +688,7 @@ namespace FabricHealer.Repair if (repairData.PartitionId == null) { success = false; - await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition", $"No partition specified.", @@ -707,7 +708,7 @@ namespace FabricHealer.Repair if (repList.Count == 0) { success = false; - await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica", $"Stateless Instance {repairData.ReplicaId} not found on partition " + @@ -732,7 +733,7 @@ namespace FabricHealer.Repair if (repairData.PartitionId == null) { success = false; - await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoPartition", 
$"No partition specified.", @@ -818,8 +819,9 @@ namespace FabricHealer.Repair switch (repairData.EntityType) { - // Try and handle the case where EntityType is not specified or is set to Invalid for some reason. - case EntityType.Invalid: + // Try and handle the case where EntityType is not specified (facts from FHProxy, for example) + // or is explicitly set to Invalid for some reason. + case EntityType.Unknown: if (!string.IsNullOrWhiteSpace(repairData.ServiceName)) { @@ -833,7 +835,18 @@ namespace FabricHealer.Repair { goto case EntityType.Node; } - break; + else if (repairData.ReplicaId > 0) + { + goto case EntityType.Replica; + } + else if (!string.IsNullOrWhiteSpace(repairData.ProcessName) || repairData.ProcessId > 0) + { + goto case EntityType.Process; + } + else + { + return false; + } case EntityType.Application: diff --git a/FabricHealer/Utilities/Telemetry/EntityType.cs b/FabricHealer/Utilities/Telemetry/EntityType.cs index 8d4f7a5..0ebbb32 100644 --- a/FabricHealer/Utilities/Telemetry/EntityType.cs +++ b/FabricHealer/Utilities/Telemetry/EntityType.cs @@ -11,9 +11,9 @@ namespace FabricHealer.Utilities.Telemetry public enum EntityType { /// - /// Invalid (default value). + /// Unknown (default). /// - Invalid, + Unknown, /// /// Application type. /// diff --git a/FabricHealerManager.cs b/FabricHealerManager.cs new file mode 100644 index 0000000..395ac60 --- /dev/null +++ b/FabricHealerManager.cs @@ -0,0 +1,2161 @@ +// ------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License (MIT). See License.txt in the repo root for license information. +// ------------------------------------------------------------ + +using FabricHealer.Utilities; +using FabricHealer.Repair; +using FabricHealer.Utilities.Telemetry; +using System; +using System.Collections.Generic; +using System.Fabric; +using System.Fabric.Health; +using System.IO; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using HealthReport = FabricHealer.Utilities.HealthReport; +using System.Fabric.Repair; +using System.Fabric.Query; +using FabricHealer.TelemetryLib; +using Octokit; +using System.Fabric.Description; +using System.Runtime.InteropServices; + +namespace FabricHealer +{ + public sealed class FabricHealerManager : IDisposable + { + internal static TelemetryUtilities TelemetryUtilities; + internal static RepairData RepairHistory; + + // Folks often use their own version numbers. This is for internal diagnostic telemetry. + private const string InternalVersionNumber = "1.1.1.960"; + private static FabricHealerManager singleton; + private static FabricClient _fabricClient; + private bool disposedValue; + private readonly StatelessServiceContext serviceContext; + private readonly RepairTaskManager repairTaskManager; + private readonly RepairTaskEngine repairTaskEngine; + private readonly Uri systemAppUri = new Uri(RepairConstants.SystemAppName); + private readonly Uri repairManagerServiceUri = new Uri(RepairConstants.RepairManagerAppName); + private readonly FabricHealthReporter healthReporter; + private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); + private readonly string sfRuntimeVersion; + private int nodeCount; + private DateTime StartDateTime; + private long _instanceCount; + private static readonly object lockObj = new object(); + + internal static Logger RepairLogger + { + get; + private set; + } + + // CancellationToken from FabricHealer.RunAsync. 
+ internal static CancellationToken Token + { + get; + private set; + } + + private DateTime LastTelemetrySendDate + { + get; set; + } + + private DateTime LastVersionCheckDateTime + { + get; set; + } + + public static ConfigSettings ConfigSettings + { + get; set; + } + + /// + /// Singleton FabricClient instance used throughout FH. Thread-safe. + /// + public static FabricClient FabricClientSingleton + { + get + { + if (_fabricClient == null) + { + lock (lockObj) + { + if (_fabricClient == null) + { + _fabricClient = new FabricClient(); + _fabricClient.Settings.HealthReportSendInterval = TimeSpan.FromSeconds(1); + _fabricClient.Settings.HealthReportRetrySendInterval = TimeSpan.FromSeconds(3); + return _fabricClient; + } + } + } + else + { + try + { + // This call with throw an ObjectDisposedException if fabricClient was disposed by, say, a plugin or if the runtime + // disposed of it for some random (unlikely..) reason. This is just a test to ensure it is not in a disposed state. + if (_fabricClient.Settings.HealthReportSendInterval > TimeSpan.MinValue) + { + return _fabricClient; + } + } + catch (Exception e) when (e is ObjectDisposedException || e is InvalidComObjectException) + { + lock (lockObj) + { + _fabricClient = null; + _fabricClient = new FabricClient(); + _fabricClient.Settings.HealthReportSendInterval = TimeSpan.FromSeconds(1); + _fabricClient.Settings.HealthReportRetrySendInterval = TimeSpan.FromSeconds(3); + return _fabricClient; + } + } + } + + return _fabricClient; + } + } + + private FabricHealerManager(StatelessServiceContext context, CancellationToken token) + { + serviceContext = context; + Token = token; + serviceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent; + ConfigSettings = new ConfigSettings(context); + TelemetryUtilities = new TelemetryUtilities(context); + repairTaskEngine = new RepairTaskEngine(); + repairTaskManager = new RepairTaskManager(serviceContext, Token); + RepairLogger = new Logger(RepairConstants.FabricHealer, ConfigSettings.LocalLogPathParameter) + { + EnableVerboseLogging = ConfigSettings.EnableVerboseLogging + }; + + RepairHistory = new RepairData(); + healthReporter = new FabricHealthReporter(RepairLogger); + sfRuntimeVersion = GetServiceFabricRuntimeVersion(); + } + + /// + /// This is the static singleton instance of FabricHealerManager type. FabricHealerManager does not support + /// multiple instantiations. It does not provide a public constructor. + /// + /// StatelessServiceContext instance. + /// CancellationToken instance. + /// The singleton instance of FabricHealerManager. + public static FabricHealerManager Instance(StatelessServiceContext context, CancellationToken token) + { + _fabricClient = new FabricClient(); + return singleton ??= new FabricHealerManager(context ?? 
throw new ArgumentException("ServiceContext can't be null..", nameof(context)), token); + } + + /// + /// Checks if repair manager is enabled in the cluster or not + /// + /// + /// cancellation token to stop the async operation + /// true if repair manager application is present in cluster, otherwise false + private async Task InitializeAsync() + { + string okMessage = $"{repairManagerServiceUri} is deployed."; + bool isRmDeployed = true; + var healthReport = new HealthReport + { + NodeName = serviceContext.NodeContext.NodeName, + AppName = new Uri(RepairConstants.FabricHealerAppName), + EntityType = EntityType.Application, + HealthMessage = okMessage, + State = HealthState.Ok, + Property = "RequirementCheck::RMDeployed", + HealthReportTimeToLive = TimeSpan.FromMinutes(5), + SourceId = RepairConstants.FabricHealer, + }; + ServiceList serviceList = await FabricClientSingleton.QueryManager.GetServiceListAsync( + systemAppUri, + repairManagerServiceUri, + ConfigSettings.AsyncTimeout, + Token); + + if ((serviceList?.Count ?? 0) == 0) + { + string warnMessage = + $"{repairManagerServiceUri} could not be found, " + + $"FabricHealer Service requires {repairManagerServiceUri} system service to be deployed in the cluster. " + + "Consider adding a RepairManager section in your cluster manifest."; + + healthReport.HealthMessage = warnMessage; + healthReport.State = HealthState.Warning; + healthReport.Code = SupportedErrorCodes.Ok; + healthReport.HealthReportTimeToLive = TimeSpan.MaxValue; + healthReport.SourceId = "CheckRepairManagerDeploymentStatusAsync"; + isRmDeployed = false; + } + + healthReporter.ReportHealthToServiceFabric(healthReport); + + // Set the service replica instance count (FH is a Stateless singleton service, so it will either be -1 or the number of nodes FH is deployed to). + _instanceCount = await GetServiceInstanceCountAsync(); + + return isRmDeployed; + } + + private async Task GetServiceInstanceCountAsync() + { + ServiceDescription serviceDesc = + await FabricClientSingleton.ServiceManager.GetServiceDescriptionAsync(serviceContext.ServiceName, ConfigSettings.AsyncTimeout, Token); + + return (serviceDesc as StatelessServiceDescription).InstanceCount; + } + + /// + /// Gets a parameter value from the specified config section or returns supplied default value if + /// not specified in config. + /// + /// + /// Name of the section. + /// Name of the parameter. + /// Default value. + /// parameter value. + private static string GetSettingParameterValue( + StatelessServiceContext context, + string sectionName, + string parameterName, + string defaultValue = null) + { + if (string.IsNullOrWhiteSpace(sectionName) || string.IsNullOrWhiteSpace(parameterName)) + { + return null; + } + + if (context == null) + { + return null; + } + + try + { + var serviceConfiguration = context.CodePackageActivationContext.GetConfigurationPackageObject("Config"); + + if (serviceConfiguration.Settings.Sections.All(sec => sec.Name != sectionName)) + { + return !string.IsNullOrWhiteSpace(defaultValue) ? defaultValue : null; + } + + if (serviceConfiguration.Settings.Sections[sectionName].Parameters.All(param => param.Name != parameterName)) + { + return !string.IsNullOrWhiteSpace(defaultValue) ? 
defaultValue : null; + } + + string setting = serviceConfiguration.Settings.Sections[sectionName].Parameters[parameterName]?.Value; + + if (string.IsNullOrWhiteSpace(setting) && defaultValue != null) + { + return defaultValue; + } + + return setting; + } + catch (Exception e) when (e is ArgumentException || e is KeyNotFoundException) + { + + } + + return null; + } + + // This function starts the detection workflow, which involves querying event store for + // Warning/Error heath events, looking for well-known FabricObserver error codes with supported + // repair actions, scheduling and executing related repair tasks. + public async Task StartAsync() + { + StartDateTime = DateTime.UtcNow; + + if (!ConfigSettings.EnableAutoMitigation) + { + return; + } + + bool initialized = await InitializeAsync(); + + if (!initialized) + { + return; + } + + try + { + RepairLogger.LogInfo("Starting FabricHealer Health Detection loop."); + + var nodeList = + await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientSingleton.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, Token), + Token); + + nodeCount = nodeList.Count; + + // First, let's clean up any orphan non-node level FabricHealer repair tasks left pending + // when the FabricHealer process is killed or otherwise ungracefully closed. + // This call will return quickly if FH was gracefully closed as there will be + // no outstanding repair tasks left orphaned. + await CancelOrResumeAllRunningFHRepairsAsync(); + + // Run until RunAsync token is cancelled. + while (!Token.IsCancellationRequested) + { + if (!ConfigSettings.EnableAutoMitigation) + { + break; + } + + await MonitorHealthEventsAsync(); + + // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in + // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more + // than 90 days. Please consider enabling this to help the SF team make this technology better. + if (ConfigSettings.OperationalTelemetryEnabled && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval) + { + try + { + using var telemetryEvents = new TelemetryEvents(serviceContext); + var fhData = GetFabricHealerInternalTelemetryData(); + + if (fhData != null) + { + string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_operational_telemetry.log"); + + if (telemetryEvents.EmitFabricHealerOperationalEvent(fhData, OperationalTelemetryRunInterval, filepath)) + { + LastTelemetrySendDate = DateTime.UtcNow; + ResetInternalDataCounters(); + } + } + } + catch + { + // Telemetry is non-critical and should not take down FH. + // TelemetryLib will log exception details to file in top level FH log folder. + } + } + + // Check for new version once a day. + if (DateTime.UtcNow.Subtract(LastVersionCheckDateTime) >= OperationalTelemetryRunInterval) + { + await CheckGithubForNewVersionAsync(); + LastVersionCheckDateTime = DateTime.UtcNow; + } + + await Task.Delay( + TimeSpan.FromSeconds( + ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token); + } + + RepairLogger.LogInfo("Shutdown signaled. Stopping."); + await ClearExistingHealthReportsAsync(); + } + catch (AggregateException) + { + // This check is necessary to prevent cancelling outstanding repair tasks if + // one of the handled exceptions originated from another operation unrelated to + // shutdown (like an async operation that timed out). 
+ if (Token.IsCancellationRequested) + { + RepairLogger.LogInfo("Shutdown signaled. Stopping."); + await ClearExistingHealthReportsAsync(); + } + } + catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) + { + // This check is necessary to prevent cancelling outstanding repair tasks if + // one of the handled exceptions originated from another operation unrelated to + // shutdown (like an async operation that timed out). + if (Token.IsCancellationRequested) + { + RepairLogger.LogInfo("Shutdown signaled. Stopping."); + await ClearExistingHealthReportsAsync(); + } + } + catch (Exception e) + { + var message = $"Unhandeld Exception in FabricHealerManager:{Environment.NewLine}{e}"; + RepairLogger.LogError(message); + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Warning, + RepairConstants.FabricHealer, + message, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + // FH Critical Error telemetry (no PII, no user stack) sent to SF team (FabricHealer dev). This information is helpful in understanding what went + // wrong that lead to the FH process going down (assuming it went down with an exception that can be caught). + // Please consider enabling this to help the SF team make this technology better. + if (ConfigSettings.OperationalTelemetryEnabled) + { + try + { + using var telemetryEvents = new TelemetryEvents(serviceContext); + var fhData = new FabricHealerCriticalErrorEventData + { + Source = nameof(FabricHealerManager), + ErrorMessage = e.Message, + ErrorStack = e.StackTrace, + CrashTime = DateTime.UtcNow.ToString("o"), + Version = InternalVersionNumber, + SFRuntimeVersion = sfRuntimeVersion + }; + + string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_critical_error_telemetry.log"); + _ = telemetryEvents.EmitFabricHealerCriticalErrorEvent(fhData, filepath); + } + catch + { + // Telemetry is non-critical and should not take down FH. + } + } + + // Don't swallow the exception. + // Take down FH process. Fix the bug. + throw; + } + } + + private void ResetInternalDataCounters() + { + RepairHistory.Repairs.Clear(); + RepairHistory.FailedRepairs = 0; + RepairHistory.SuccessfulRepairs = 0; + RepairHistory.RepairCount = 0; + RepairHistory.EnabledRepairCount = 0; + } + + private FabricHealerOperationalEventData GetFabricHealerInternalTelemetryData() + { + FabricHealerOperationalEventData opsTelemData = null; + + try + { + RepairHistory.EnabledRepairCount = GetEnabledRepairRuleCount(); + + opsTelemData = new FabricHealerOperationalEventData + { + UpTime = DateTime.UtcNow.Subtract(StartDateTime).ToString(), + Version = InternalVersionNumber, + RepairData = RepairHistory, + SFRuntimeVersion = sfRuntimeVersion + }; + } + catch + { + + } + + return opsTelemData; + } + + /// + /// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state). + /// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing. + /// + /// A Task. 
+ private async Task CancelOrResumeAllRunningFHRepairsAsync() + { + try + { + var currentFHRepairTasksInProgress = + await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync( + RepairTaskEngine.FabricHealerExecutorName, + Token), + Token); + + if (currentFHRepairTasksInProgress.Count == 0) + { + return; + } + + foreach (var repair in currentFHRepairTasksInProgress) + { + if (repair.State == RepairTaskState.Restoring) + { + continue; + } + + // Grab the executor data from existing repair. + var executorData = repair.ExecutorData; + + if (string.IsNullOrWhiteSpace(executorData)) + { + continue; + } + + if (!JsonSerializationUtility.TryDeserializeObject(executorData, out RepairExecutorData repairExecutorData)) + { + continue; + } + + // Don't do anything if the orphaned repair was for a different node than this one: + if (_instanceCount == -1 && repairExecutorData.RepairData.NodeName != serviceContext.NodeContext.NodeName) + { + continue; + } + + // Try and cancel existing repair. We may need to create a new one for abandoned repairs where FH goes down for some reason. + // Note: CancelRepairTaskAsync handles exceptions (IOE) that may be thrown by RM due to state change policy. + // The repair state could change to Completed after this call is made, for example, and before RM API call. + if (repair.State != RepairTaskState.Completed) + { + await FabricRepairTasks.CancelRepairTaskAsync(repair); + } + + /* Resume interrupted Fabric Node restart repairs */ + + // There is no need to resume simple repairs that do not require multiple repair steps (e.g., codepackage/process/replica restarts). + if (repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode) + { + continue; + } + + string errorCode = repairExecutorData.RepairData.Code; + + if (string.IsNullOrWhiteSpace(errorCode)) + { + continue; + } + + // File Deletion repair is a node-level (VM) repair, but is not multi-step. Ignore. + if (SupportedErrorCodes.GetCodeNameFromErrorCode(errorCode).Contains("Disk") + || repairExecutorData.RepairData.RepairPolicy.RepairAction == RepairActionType.DeleteFiles) + { + continue; + } + + // Fabric System service warnings/errors from FO can be Node level repair targets (e.g., Fabric binary needs to be restarted). + // FH will restart the node hosting the troubled SF system service if specified in related logic rules. + var repairRules = + GetRepairRulesFromConfiguration( + !string.IsNullOrWhiteSpace( + repairExecutorData.RepairData.ProcessName) ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.FabricNodeRepairPolicySectionName); + + var repairData = new TelemetryData + { + NodeName = repairExecutorData.RepairData.NodeName, + Code = errorCode, + }; + + await repairTaskManager.RunGuanQueryAsync(repairData, repairRules, repairExecutorData); + } + } + catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException) + { + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "CancelOrResumeAllRunningFHRepairsAsync", + $"Could not cancel or resume repair tasks. 
Failed with:{Environment.NewLine}{e}", + Token, + null, + ConfigSettings.EnableVerboseLogging); + } + } + + private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) + { + await ClearExistingHealthReportsAsync(); + ConfigSettings.UpdateConfigSettings(e.NewPackage.Settings); + } + + /* Potential TODOs. This list should grow and external predicates should be written to support related workflow composition in logic rule file(s). + + Symptom Mitigation + ------------------------------------------------------ --------------------------------------------------- + Expired Certificate [TP Scenario] Modify the cluster manifest AEPCC to true (we already have an automation script for this scenario) + Node crash due to lease issue Restart the neighboring VM + Node Crash due to slow network issue Restart the VM + System Service in quorum loss Repair the partition/Restart the VM + Node stuck in disabling state due to MR [safety check] Address safety issue through automation + [MR Scenario] Node in down state: MR unable + to send the Remove-ServiceFabricNodeState in time Remove-ServiceFabricNodeState + Unused container fill the disk space Call docker prune cmd + Primary replica for system service in IB state forever Restart the primary replica + */ + + private async Task MonitorHealthEventsAsync() + { + try + { + var clusterHealth = await FabricClientSingleton.HealthManager.GetClusterHealthAsync(ConfigSettings.AsyncTimeout, Token); + + if (clusterHealth.AggregatedHealthState == HealthState.Ok) + { + return; + } + + // Check cluster upgrade status. If the cluster is upgrading to a new version (or rolling back) + // then do not attempt any repairs. + try + { + string udInClusterUpgrade = await UpgradeChecker.GetCurrentUDWhereFabricUpgradeInProgressAsync(Token); + + if (!string.IsNullOrWhiteSpace(udInClusterUpgrade)) + { + string telemetryDescription = $"Cluster is currently upgrading in UD \"{udInClusterUpgrade}\". 
Will not schedule or execute repairs at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorHealthEventsAsync::ClusterUpgradeDetected", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging); + return; + } + } + catch (Exception e) when (e is FabricException || e is TimeoutException) + { +#if DEBUG + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorHealthEventsAsync::HandledException", + $"Failure in MonitorHealthEventsAsync::Node:{Environment.NewLine}{e}", + Token, + null, + ConfigSettings.EnableVerboseLogging); +#endif + } + + var unhealthyEvaluations = clusterHealth.UnhealthyEvaluations; + + foreach (var evaluation in unhealthyEvaluations) + { + Token.ThrowIfCancellationRequested(); + + string kind = Enum.GetName(typeof(HealthEvaluationKind), evaluation.Kind); + + if (kind != null && kind.Contains("Node")) + { + if (!ConfigSettings.EnableMachineRepair && !ConfigSettings.EnableDiskRepair && !ConfigSettings.EnableFabricNodeRepair) + { + continue; + } + + try + { + await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates); + } + catch (Exception e) when (e is FabricException || e is TimeoutException) + { +#if DEBUG + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorHealthEventsAsync::HandledException", + $"Failure in MonitorHealthEventsAsync::Node:{Environment.NewLine}{e}", + Token, + null, + ConfigSettings.EnableVerboseLogging); +#endif + } + } + else if (kind != null && kind.Contains("Application")) + { + if (!ConfigSettings.EnableAppRepair && !ConfigSettings.EnableSystemAppRepair) + { + continue; + } + + foreach (var app in clusterHealth.ApplicationHealthStates) + { + Token.ThrowIfCancellationRequested(); + + try + { + var entityHealth = + await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(app.ApplicationName, ConfigSettings.AsyncTimeout, Token); + + if (app.AggregatedHealthState == HealthState.Ok) + { + continue; + } + + if (entityHealth.ServiceHealthStates != null && entityHealth.ServiceHealthStates.Any( + s => s.AggregatedHealthState == HealthState.Error || s.AggregatedHealthState == HealthState.Warning)) + { + foreach (var service in entityHealth.ServiceHealthStates.Where( + s => s.AggregatedHealthState == HealthState.Error || s.AggregatedHealthState == HealthState.Warning)) + { + await ProcessServiceHealthAsync(service); + } + } + else + { + await ProcessApplicationHealthAsync(app); + } + } + catch (Exception e) when (e is FabricException || e is TimeoutException) + { +#if DEBUG + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorHealthEventsAsync::HandledException", + $"Failure in MonitorHealthEventsAsync::Application:{Environment.NewLine}{e}", + Token, + null, + ConfigSettings.EnableVerboseLogging); +#endif + } + } + } + } + } + catch (Exception e) when (e is ArgumentException || e is FabricException) + { + // Don't crash. + } + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + { + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Error, + "MonitorHealthEventsAsync::UnhandledException", + $"Failure in MonitorHealthEventsAsync:{Environment.NewLine}{e}", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + RepairLogger.LogWarning($"Unhandled exception in MonitorHealthEventsAsync:{Environment.NewLine}{e}"); + + // Fix the bug(s). 
+ throw; + } + } + + /// + /// Processes SF Application health events, including System Application events (which could have Fabric Node impact repairs). + /// + /// Collection of ApplicationHealthState objects. + /// A task. + private async Task ProcessApplicationHealthAsync(ApplicationHealthState appHealthState) + { + ApplicationHealth appHealth = null; + Uri appName = appHealthState.ApplicationName; + + // System app target? Do not proceed if system app repair is not enabled. + if (appName.OriginalString == RepairConstants.SystemAppName && !ConfigSettings.EnableSystemAppRepair) + { + return; + } + + // User app target? Do not proceed if App repair is not enabled. + if (appName.OriginalString != RepairConstants.SystemAppName && !ConfigSettings.EnableAppRepair) + { + return; + } + + appHealth = await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(appName, ConfigSettings.AsyncTimeout, Token); + + if (appName.OriginalString != RepairConstants.SystemAppName) + { + try + { + var appUpgradeStatus = await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(appName); + + if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending) + { + List udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(appName, Token); + string udText = string.Empty; + + // -1 means no upgrade in progress for application. + if (udInAppUpgrade.Any(ud => ud > -1)) + { + udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}"; + } + + string telemetryDescription = $"{appName} is upgrading {udText}. Will not attempt application repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorRepairableHealthEventsAsync::AppUpgradeDetected", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + } + catch (FabricException) + { + // This upgrade check should not prevent moving forward if the fabric client call fails with an FE. + } + } + + var healthEvents = appHealth.HealthEvents.Where( + s => s.HealthInformation.HealthState == HealthState.Warning + || s.HealthInformation.HealthState == HealthState.Error); + + foreach (var evt in healthEvents) + { + Token.ThrowIfCancellationRequested(); + + if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description)) + { + continue; + } + + // If health data is not a serialized TelemetryData instance, then move along. + if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData)) + { + continue; + } + + // Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node. + // This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case). + if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName) + { + continue; + } + else if (_instanceCount > 1) + { + // Randomly wait to decrease chances of simultaneous ownership among FH instances. 
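// Editorial sketch: RandomWaitAsync is referenced just below but not shown in this diff. The
// intent — a short random delay so that multiple FabricHealer instances reacting to the same
// health event are unlikely to create duplicate repair tasks at the same instant — could be
// approximated as follows (hypothetical helper, not FH's actual implementation):
//
//     private static readonly Random Rng = new Random();
//
//     private static Task RandomJitterAsync(CancellationToken token, int maxSeconds = 15)
//     {
//         int waitSeconds;
//
//         lock (Rng)
//         {
//             // System.Random is not thread-safe; serialize access since FH work is concurrent.
//             waitSeconds = Rng.Next(0, maxSeconds + 1);
//         }
//
//         return Task.Delay(TimeSpan.FromSeconds(waitSeconds), token);
//     }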
+ await RandomWaitAsync(); + } + + if (repairData.Code != null && !SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(repairData.Code) + || repairData.Code == SupportedErrorCodes.AppErrorNetworkEndpointUnreachable + || repairData.Code == SupportedErrorCodes.AppWarningNetworkEndpointUnreachable) + { + // Network endpoint test failures have no general mitigation. + continue; + } + + // Get configuration settings related to Application (service code package) repair. + List repairRules; + string repairId; + string system = string.Empty; + + if (appName.OriginalString == RepairConstants.SystemAppName) + { + if (!ConfigSettings.EnableSystemAppRepair) + { + continue; + } + + // Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster. + var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token); + + if (fhRepairTasks.Count > 0) + { + foreach (var repair in fhRepairTasks) + { + var executorData = JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData) ? exData : null; + + if (executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartFabricNode && + executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartProcess) + { + continue; + } + + string message = $"A Service Fabric System service repair ({repair.TaskId}) is already in progress in the cluster(state: {repair.State}). " + + $"Will not attempt repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"ProcessApplicationHealth::System::{repair.TaskId}", + message, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + } + + repairRules = GetRepairRulesForTelemetryData(repairData); + + if (repairRules == null || repairRules?.Count == 0) + { + continue; + } + + repairId = $"{repairData.NodeName}_{repairData.ProcessName}_{repairData.Code}"; + system = "System "; + + var currentRepairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token); + + // Is a repair for the target app service instance already happening in the cluster? + // There can be multiple Warnings emitted by FO for a single app at the same time. + if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairData.ProcessName))) + { + var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairData.ProcessName)); + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"MonitorRepairableHealthEventsAsync::{repairData.ProcessName}", + $"There is already a repair in progress for Fabric system service {repairData.ProcessName}(state: {repair.State})", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + continue; + } + + // Repair already in progress? + if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId))) + { + continue; + } + } + else + { + if (!ConfigSettings.EnableAppRepair) + { + continue; + } + + // Don't restart thyself. + if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName) + { + continue; + } + + repairRules = GetRepairRulesForTelemetryData(repairData); + + // Nothing to do here. 
+ if (repairRules == null || repairRules?.Count == 0) + { + continue; + } + + string serviceProcessName = $"{repairData.ServiceName?.Replace("fabric:/", "").Replace("/", "")}"; + var currentRepairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token); + + // This is the way each FH repair is ID'd. This data is stored in the related Repair Task's ExecutorData property. + repairId = $"{repairData.NodeName}_{serviceProcessName}_{repairData.Metric?.Replace(" ", string.Empty)}"; + + // Is a repair for the target app service instance already happening in the cluster? + // There can be multiple Warnings emitted by FO for a single app at the same time. + if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId))) + { + var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairId)); + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"MonitorRepairableHealthEventsAsync::{repairData.ServiceName}", + $"{appName} already has a repair in progress for service {repairData.ServiceName}(state: {repair.State})", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + continue; + } + } + + /* Start repair workflow */ + + repairData.RepairPolicy = new RepairPolicy + { + RepairId = repairId + }; + repairData.Property = evt.HealthInformation.Property; + string errOrWarn = "Error"; + + if (evt.HealthInformation.HealthState == HealthState.Warning) + { + errOrWarn = "Warning"; + } + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"MonitorRepairableHealthEventsAsync:{repairId}", + $"Detected {errOrWarn} state for Application {repairData.ApplicationName}{Environment.NewLine}" + + $"SourceId: {evt.HealthInformation.SourceId}{Environment.NewLine}" + + $"Property: {evt.HealthInformation.Property}{Environment.NewLine}" + + $"{system}Application repair policy is enabled. " + + $"{repairRules.Count} Logic rules found for {system}Application-level repair.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + // Update the in-memory HealthEvent List. + this.repairTaskManager.DetectedHealthEvents.Add(evt); + + // Start the repair workflow. + await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token); + } + } + + private async Task ProcessServiceHealthAsync(ServiceHealthState serviceHealthState) + { + // This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes. + var nodeQueryDesc = new NodeQueryDescription + { + MaxResults = 3, + }; + + NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientSingleton.QueryManager.GetNodePagedListAsync( + nodeQueryDesc, + ConfigSettings.AsyncTimeout, + Token), + Token); + + ServiceHealth serviceHealth; + Uri appName; + Uri serviceName = serviceHealthState.ServiceName; + + // User service target? Do not proceed if App repair is not enabled. + if (!serviceName.OriginalString.Contains(RepairConstants.SystemAppName) && !ConfigSettings.EnableAppRepair) + { + return; + } + + // System service target? Do not proceed if system app repair is not enabled. 
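// Editorial note: the service-level repair id built above follows the pattern
// "{NodeName}_{serviceProcessName}_{Metric}", where serviceProcessName is the service URI with
// "fabric:/" and "/" stripped and spaces are removed from the metric name. A worked example
// (hypothetical values):
//
//     ServiceName = "fabric:/MyApp/MyService", NodeName = "_Node_0", Metric = "MemoryMB"
//     serviceProcessName = "MyAppMyService"
//     repairId           = "_Node_0_MyAppMyService_MemoryMB"
//
// This id is what gets carried in the repair task's ExecutorData and later matched against to
// detect a repair that is already in progress.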
+ if (serviceName.OriginalString.Contains(RepairConstants.SystemAppName) && !ConfigSettings.EnableSystemAppRepair) + { + return; + } + + serviceHealth = await FabricClientSingleton.HealthManager.GetServiceHealthAsync(serviceName, ConfigSettings.AsyncTimeout, Token); + var name = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token); + appName = name.ApplicationName; + + // user service - upgrade check. + if (!appName.OriginalString.Contains(RepairConstants.SystemAppName) && !serviceName.OriginalString.Contains(RepairConstants.SystemAppName)) + { + try + { + var app = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(serviceName, ConfigSettings.AsyncTimeout, Token); + var appUpgradeStatus = await FabricClientSingleton.ApplicationManager.GetApplicationUpgradeProgressAsync(app.ApplicationName); + + if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending) + { + List udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(serviceName, Token); + string udText = string.Empty; + + // -1 means no upgrade in progress for application. + if (udInAppUpgrade.Any(ud => ud > -1)) + { + udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}"; + } + + string telemetryDescription = $"{app.ApplicationName} is upgrading {udText}. Will not attempt service repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorRepairableHealthEventsAsync::AppUpgradeDetected", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + } + catch (FabricException) + { + // This upgrade check should not prevent moving forward if the fabric client call fails with an FE. + } + } + + var healthEvents = serviceHealth.HealthEvents.Where( + e => e.HealthInformation.HealthState == HealthState.Warning || e.HealthInformation.HealthState == HealthState.Error); + + // Replica repair. This only makes sense if a partition is in Error or Warning state (and Replica repair is still experimental for FH). + if (ConfigSettings.EnableReplicaRepair) + { + var partitionHealthStates = serviceHealth.PartitionHealthStates.Where( + p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error); + + if (partitionHealthStates.Any()) + { + await ProcessReplicaHealthAsync(serviceHealth); + return; + } + } + + foreach (var evt in healthEvents) + { + Token.ThrowIfCancellationRequested(); + + if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description)) + { + continue; + } + + // If health data is not a serialized instance of a type that implements ITelemetryData, then move along. + if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData)) + { + continue; + } + + // No service name provided. No service. + if (string.IsNullOrEmpty(repairData.ServiceName)) + { + continue; + } + + if (repairData.ServiceName.ToLower() != serviceName.OriginalString.ToLower()) + { + continue; + } + + // PartitionId is Guid? in ITelemetryData. 
+ if (repairData.PartitionId == null) + { + continue; + } + + if (string.IsNullOrWhiteSpace(repairData.ApplicationName)) + { + repairData.ApplicationName = appName.OriginalString; + } + + // Since FH can run on each node (-1 InstanceCount), if this is the case then have FH only try to repair app services that are also running on the same node. + // This removes the need to try and orchestrate repairs across nodes (which we will have to do in the non -1 case). + if (_instanceCount == -1 && repairData.NodeName != serviceContext.NodeContext.NodeName) + { + continue; + } + else if (_instanceCount > 1) + { + // Randomly wait to decrease chances of simultaneous ownership among FH instances. + await RandomWaitAsync(); + } + + if (repairData.Code != null && !SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(repairData.Code) + || repairData.Code == SupportedErrorCodes.AppErrorNetworkEndpointUnreachable + || repairData.Code == SupportedErrorCodes.AppWarningNetworkEndpointUnreachable) + { + // Network endpoint test failures have no general mitigation. + continue; + } + + // Get configuration settings related to Application (service code package) repair. + List repairRules; + string repairId; + string system = string.Empty; + + if (serviceName.OriginalString.Contains(RepairConstants.SystemAppName)) + { + if (!ConfigSettings.EnableSystemAppRepair) + { + continue; + } + + // Block attempts to schedule node-level or system service restart repairs if one is already executing in the cluster. + var fhRepairTasks = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync( + RepairTaskEngine.FabricHealerExecutorName, + Token); + + if (fhRepairTasks.Count > 0) + { + foreach (var repair in fhRepairTasks) + { + var executorData = JsonSerializationUtility.TryDeserializeObject(repair.ExecutorData, out RepairExecutorData exData) ? exData : null; + + if (executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartFabricNode && + executorData?.RepairData?.RepairPolicy?.RepairAction != RepairActionType.RestartProcess) + { + continue; + } + + string message = $"A Service Fabric System service repair ({repair.TaskId}) is already in progress in the cluster(state: {repair.State}). " + + $"Will not attempt repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"ProcessApplicationHealth::System::{repair.TaskId}", + message, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + } + + repairRules = GetRepairRulesForTelemetryData(repairData); + + if (repairRules == null || repairRules?.Count == 0) + { + continue; + } + + repairId = $"{repairData.NodeName}_{serviceName.OriginalString}_{repairData.Code}"; + system = "System "; + + var currentRepairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token); + + // Is a repair for the target app service instance already happening in the cluster? + // There can be multiple Warnings emitted by FO for a single app at the same time. 
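// Editorial sketch: the guard below scans the ExecutorData payload of every in-flight
// FabricHealer repair task for an identifying substring (the service name, or the computed
// repair id); FH stores a serialized RepairExecutorData, which carries that id, in each repair
// task's ExecutorData property. Distilled to a self-contained predicate (using System.Linq),
// the duplicate check amounts to:
//
//     static bool IsRepairInFlight(IEnumerable<string> executorDataBlobs, string marker)
//     {
//         return executorDataBlobs != null
//                && executorDataBlobs.Any(blob => !string.IsNullOrEmpty(blob) && blob.Contains(marker));
//     }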
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(serviceName.OriginalString))) + { + var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(serviceName.OriginalString)); + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"MonitorRepairableHealthEventsAsync::{serviceName.OriginalString}", + $"There is already a repair in progress for Fabric system service {serviceName.OriginalString}(state: {repair.State})", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + continue; + } + + // Repair already in progress? + if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId))) + { + continue; + } + } + else + { + if (!ConfigSettings.EnableAppRepair) + { + continue; + } + + // Don't restart thyself. + if (repairData.ServiceName == serviceContext.ServiceName.OriginalString && repairData.NodeName == serviceContext.NodeContext.NodeName) + { + continue; + } + + repairRules = GetRepairRulesForTelemetryData(repairData); + + // Nothing to do here. + if (repairRules == null || repairRules?.Count == 0) + { + continue; + } + + string serviceProcessName = $"{repairData.ServiceName.Replace("fabric:/", "").Replace("/", "")}"; + var currentFHRepairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token); + + // This is the way each FH repair is ID'd. This data is stored in the related Repair Task's ExecutorData property. + repairId = $"{repairData.NodeName}_{serviceProcessName}_{repairData.Metric?.Replace(" ", string.Empty)}"; + + // All FH repairs have serialized instances of RepairExecutorData set as the value for a RepairTask's ExecutorData property. + if (currentFHRepairs?.Count > 0) + { + // This prevents starting creating a new repair if another service running on a different node needs to be resarted, for example. + // Thing of this as a UD Walk across nodes of service instances in need of repair. + if (ConfigSettings.EnableRollingServiceRestarts + && nodes?.Count > 1 + && currentFHRepairs.Any(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData) + && execData?.RepairData?.ServiceName?.ToLower() == repairData.ServiceName.ToLower())) + { + var repair = currentFHRepairs.FirstOrDefault(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData) + && execData.RepairData.ServiceName.ToLower() == repairData.ServiceName.ToLower()); + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"{serviceProcessName}_RollingRepairInProgress", + $"There is currently a rolling repair in progress for service {repairData.ServiceName}. Current node: {repair.Target}. Repair State: {repair.State})", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + // For the case where a service repair is still not Completed (e.g., the repair status is Restoring, which would happen after the repair executor has completed + // its work, but RM is performing post safety checks (safety checks can be enabled/disabled in logic rules). 
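// Editorial sketch: the rolling-restart gate above (active when EnableRollingServiceRestarts is
// set and the cluster has more than one node) deserializes each in-flight repair task's
// ExecutorData back into RepairExecutorData and defers if any of them targets the same service
// name, so only one instance of a given service is repaired at a time across nodes. Reduced to a
// predicate over already-deserialized payloads, the check is roughly:
//
//     static bool IsRollingRepairInProgress(IEnumerable<RepairExecutorData> inFlight, string serviceName)
//     {
//         return inFlight.Any(x => string.Equals(
//             x?.RepairData?.ServiceName, serviceName, StringComparison.OrdinalIgnoreCase));
//     }
//
// (string.Equals with OrdinalIgnoreCase expresses the same case-insensitive comparison as the
// ToLower() calls above without allocating intermediate lowered strings.)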
+ else if (currentFHRepairs.Any(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData) + && execData?.RepairData?.RepairPolicy?.RepairId == repairId)) + { + var repair = currentFHRepairs.FirstOrDefault(r => JsonSerializationUtility.TryDeserializeObject(r.ExecutorData, out RepairExecutorData execData) + && execData.RepairData.RepairPolicy.RepairId == repairId); + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"{serviceProcessName}_RepairAlreadyInProgress", + $"There is currently a repair in progress for service {repairData.ServiceName} on node {repairData.NodeName}. Repair State: {repair.State}.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + // This means no other service instance will get repaired anywhere in the cluster if a service repair job for the same service is + // already taking place on any node. This only takes effect if config setting EnableRollingServiceRestarts is set to true. + } + } + + /* Start repair workflow */ + repairData.RepairPolicy = new RepairPolicy + { + RepairId = repairId + }; + repairData.Property = evt.HealthInformation.Property; + string errOrWarn = "Error"; + + if (evt.HealthInformation.HealthState == HealthState.Warning) + { + errOrWarn = "Warning"; + } + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"MonitorRepairableHealthEventsAsync:{repairId}", + $"Detected {errOrWarn} state for Service {repairData.ServiceName}{Environment.NewLine}" + + $"SourceId: {evt.HealthInformation.SourceId}{Environment.NewLine}" + + $"Property: {evt.HealthInformation.Property}{Environment.NewLine}" + + $"{system}Application repair policy is enabled. " + + $"{repairRules.Count} Logic rules found for {system}Service-level repair.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + // Update the in-memory HealthEvent List. + repairTaskManager.DetectedHealthEvents.Add(evt); + + // Start the repair workflow. + await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token); + } + } + + private async Task ProcessNodeHealthAsync(IEnumerable nodeHealthStates) + { + // This is just used to make sure there is more than 1 node in the cluster. We don't need a list of all nodes. + var nodeQueryDesc = new NodeQueryDescription + { + MaxResults = 3, + }; + + NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientSingleton.QueryManager.GetNodePagedListAsync( + nodeQueryDesc, + ConfigSettings.AsyncTimeout, + Token), + Token); + + // Node/Machine repair not supported in onebox clusters.. 
+ if (nodes?.Count == 1) + { + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"ProcessMachineHealth::NotSupported", + "Machine/Fabric Node repair is not supported in clusters with 1 Fabric node.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + + var supportedNodeHealthStates = + nodeHealthStates.Where(a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); + + foreach (var node in supportedNodeHealthStates) + { + Token.ThrowIfCancellationRequested(); + + var nodeList = await FabricClientSingleton.QueryManager.GetNodeListAsync(node.NodeName, ConfigSettings.AsyncTimeout, Token); + string nodeType = nodeList[0].NodeType; + string nodeUD = nodeList[0].UpgradeDomain; + var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(node.NodeName, ConfigSettings.AsyncTimeout, Token); + var nodeHealthEvents = + nodeHealth.HealthEvents.Where( + s => (s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error)); + + // Ensure a node in Error is not in error due to being down as part of a cluster upgrade or infra update in its UD. + if (node.AggregatedHealthState == HealthState.Error) + { + string udInClusterUpgrade = await UpgradeChecker.GetCurrentUDWhereFabricUpgradeInProgressAsync(Token); + + if (!string.IsNullOrWhiteSpace(udInClusterUpgrade) && udInClusterUpgrade == nodeUD) + { + string telemetryDescription = + $"Cluster is currently upgrading in UD \"{udInClusterUpgrade}\", which is the UD for node {node.NodeName}, which is down. " + + "Will not schedule or execute node-level repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"{node.NodeName}_down_ClusterUpgrade({nodeUD})", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + continue; + } + + // Check to see if an Azure tenant/platform update is in progress for target node. Do not conduct repairs if so. + if (await UpgradeChecker.IsAzureUpdateInProgress(nodeType, node.NodeName, Token)) + { + string telemetryDescription = $"{node.NodeName} is down due to Infra repair job (UD = {nodeUD}). Will not attempt node repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"{node.NodeName}_down_InfraRepair", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging); + + continue; + } + } + + foreach (var evt in nodeHealthEvents) + { + Token.ThrowIfCancellationRequested(); + + // Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances. + await RandomWaitAsync(); + + // Was health event generated by FO or FHProxy? + if (!JsonSerializationUtility.TryDeserializeObject(evt.HealthInformation.Description, out TelemetryData repairData)) + { + // This will enable Machine level repair (reboot, reimage) based on detected SF Node Health Event not generated by FO/FHProxy. + repairData = new TelemetryData + { + NodeName = node.NodeName, + NodeType = nodeType, + EntityType = EntityType.Machine, + Description = evt.HealthInformation.Description, + HealthState = evt.HealthInformation.HealthState, + Source = RepairConstants.FabricHealer + }; + } + else + { + // Nothing to do here. + if (repairData.EntityType == EntityType.Unknown) + { + continue; + } + + // Disk? 
+ if (repairData.EntityType == EntityType.Disk && ConfigSettings.EnableDiskRepair) + { + await ProcessDiskHealthAsync(evt, repairData); + continue; + } + + // Fabric node? + if (repairData.EntityType == EntityType.Node && ConfigSettings.EnableFabricNodeRepair) + { + // FabricHealerProxy-generated report, so a restart fabric node request, for example. + await ProcessFabricNodeHealthAsync(evt, repairData); + continue; + } + } + + // Machine repair \\ + + if (!ConfigSettings.EnableMachineRepair) + { + continue; + } + + // If there are mulitple instances of FH deployed to the cluster (like -1 InstanceCount), then don't do machine repairs if this instance of FH + // detects a need to do so. Another instance on a different node will take the job. Only DiskObserver-generated repair data has to be done on the node + // where FO's DiskObserver emitted the related information, for example (like Disk space issues and the need to clean specified (in logic rules) folders). + if ((_instanceCount == -1 || _instanceCount > 2) && node.NodeName == serviceContext.NodeContext.NodeName) + { + continue; + } + + // Make sure that there is not already an Infra repair in progress for the target node. + var currentISRepairs = + await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync($"{RepairConstants.InfrastructureServiceName}/{nodeType}", Token); + + if (currentISRepairs?.Count > 0) + { + if (currentISRepairs.Any(r => r.Description.Contains(node.NodeName))) + { + var repair = currentISRepairs.FirstOrDefault(r => r.Description.Contains(node.NodeName)); + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + $"{node.NodeName}_MachineRepairAlreadyInProgress", + $"There is currently a Machine repair in progress for node {node.NodeName}. Repair State: {repair.State}.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + return; + } + } + + // Get repair rules for supplied facts (TelemetryData). + var repairRules = GetRepairRulesForTelemetryData(repairData); + + if (repairRules == null || repairRules.Count == 0) + { + continue; + } + + /* Start repair workflow */ + + string repairId = $"Machine_Repair_{nodeType}_{repairData.NodeName}"; + repairData.RepairPolicy = new RepairPolicy + { + RepairId = repairId + }; + repairData.Property = evt.HealthInformation.Property; + string errOrWarn = "Error"; + + if (evt.HealthInformation.HealthState == HealthState.Warning) + { + errOrWarn = "Warning"; + } + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + repairId, + $"Detected Fabric node {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" + + $"Machine repair target specified. {repairRules.Count} logic rules found for Machine repair.", + Token, + null, + ConfigSettings.EnableVerboseLogging); + + // Update the in-memory HealthEvent List. + repairTaskManager.DetectedHealthEvents.Add(evt); + + // Start the repair workflow. + await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token); + } + } + } + + private async Task ProcessDiskHealthAsync(HealthEvent evt, TelemetryData repairData) + { + // Can only repair local disks. + if (repairData.NodeName != serviceContext.NodeContext.NodeName) + { + return; + } + + // Get repair rules for supported source Observer. 
+ var repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules.Count == 0)
+ {
+ return;
+ }
+
+ /* Start repair workflow */
+
+ string repairId = $"Disk_Repair_{repairData.Code}{repairData.NodeName}_DeleteFiles";
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = evt.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (evt.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"Detected {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Disk repair is enabled. {repairRules.Count} logic rules found for Disk repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(evt);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+
+ private async Task ProcessFabricNodeHealthAsync(HealthEvent healthEvent, TelemetryData repairData)
+ {
+ var repairRules = GetRepairRulesForTelemetryData(repairData);
+
+ if (repairRules == null || repairRules.Count == 0)
+ {
+ return;
+ }
+
+ // There is only one supported repair for a Fabric node: Restart.
+ string repairId = $"{repairData.NodeName}_{repairData.NodeType}_Restart";
+
+ var currentRepairs =
+ await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ // Block attempts to reschedule another Fabric node-level repair for the same node if a current repair has not yet completed.
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ var repair = currentRepairs.FirstOrDefault(r => r.ExecutorData.Contains(repairId));
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"There is already a repair in progress for Fabric node {repairData.NodeName} (state: {repair.State}).",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ return;
+ }
+
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = healthEvent.HealthInformation.Property;
+ string errOrWarn = "Error";
+
+ if (repairData.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ repairId,
+ $"Detected {repairData.NodeName} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Fabric Node repair is enabled. {repairRules.Count} logic rules found for Fabric Node repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Update the in-memory HealthEvent List.
+ repairTaskManager.DetectedHealthEvents.Add(healthEvent);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+
+ // This is an example of a repair for a non-FO-originating health event. This function needs some work, but you get the basic idea here.
+ // FO does not support replica monitoring and as such it does not emit specific error codes that FH recognizes.
+ // *This is an experimental function/workflow in need of more testing.*
+ private async Task ProcessReplicaHealthAsync(ServiceHealth serviceHealth)
+ {
+ // Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
+ await RandomWaitAsync();
+
+ /* Example of repairable problem at Replica level, as health event:
+
+ [SourceId] = 'System.RAP' reported Warning/Error for property...
+ [Property] = 'IStatefulServiceReplica.ChangeRole(N)Duration'.
+ [Description] = The api IStatefulServiceReplica.ChangeRole(N) on node [NodeName] is stuck.
+
+ Start Time (UTC): 2020-04-26 19:22:55.492.
+ */
+
+ List<HealthEvent> healthEvents = new List<HealthEvent>();
+ var partitionHealthStates = serviceHealth.PartitionHealthStates.Where(
+ p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error);
+
+ foreach (var partitionHealthState in partitionHealthStates)
+ {
+ PartitionHealth partitionHealth =
+ await FabricClientSingleton.HealthManager.GetPartitionHealthAsync(partitionHealthState.PartitionId, ConfigSettings.AsyncTimeout, Token);
+
+ List<ReplicaHealthState> replicaHealthStates = partitionHealth.ReplicaHealthStates.Where(
+ p => p.AggregatedHealthState == HealthState.Warning || p.AggregatedHealthState == HealthState.Error).ToList();
+
+ if (replicaHealthStates != null && replicaHealthStates.Count > 0)
+ {
+ foreach (var rep in replicaHealthStates)
+ {
+ var replicaHealth =
+ await FabricClientSingleton.HealthManager.GetReplicaHealthAsync(partitionHealthState.PartitionId, rep.Id, ConfigSettings.AsyncTimeout, Token);
+
+ if (replicaHealth != null)
+ {
+ healthEvents = replicaHealth.HealthEvents.Where(
+ h => h.HealthInformation.HealthState == HealthState.Warning || h.HealthInformation.HealthState == HealthState.Error).ToList();
+
+ foreach (HealthEvent healthEvent in healthEvents)
+ {
+ if (!healthEvent.HealthInformation.SourceId.Contains("System.RAP"))
+ {
+ continue;
+ }
+
+ if (!healthEvent.HealthInformation.Property.Contains("IStatefulServiceReplica.ChangeRole") &&
+ !healthEvent.HealthInformation.Property.Contains("IReplicator.BuildReplica"))
+ {
+ continue;
+ }
+
+ if (!healthEvent.HealthInformation.Description.Contains("is stuck"))
+ {
+ continue;
+ }
+
+ var app = await FabricClientSingleton.QueryManager.GetApplicationNameAsync(
+ serviceHealth.ServiceName,
+ ConfigSettings.AsyncTimeout,
+ Token);
+
+ var replicaList = await FabricClientSingleton.QueryManager.GetReplicaListAsync(
+ rep.PartitionId,
+ rep.Id,
+ ConfigSettings.AsyncTimeout,
+ Token);
+
+ // Replica still exists?
+ if (replicaList.Count == 0)
+ {
+ continue;
+ }
+
+ var appName = app?.ApplicationName?.OriginalString;
+ var replica = replicaList[0];
+ var nodeName = replica?.NodeName;
+
+ // Get configuration settings related to Replica repair.
+ var repairRules = GetRepairRulesFromConfiguration(RepairConstants.ReplicaRepairPolicySectionName);
+
+ if (repairRules == null || !repairRules.Any())
+ {
+ continue;
+ }
+
+ var repairData = new TelemetryData
+ {
+ ApplicationName = appName,
+ EntityType = EntityType.Replica,
+ HealthState = healthEvent.HealthInformation.HealthState,
+ NodeName = nodeName,
+ ReplicaId = rep.Id,
+ PartitionId = rep.PartitionId,
+ ServiceName = serviceHealth.ServiceName.OriginalString,
+ Source = RepairConstants.FabricHealerAppName
+ };
+
+ string errOrWarn = "Error";
+
+ if (healthEvent.HealthInformation.HealthState == HealthState.Warning)
+ {
+ errOrWarn = "Warning";
+ }
+
+ string repairId = $"{nodeName}_{serviceHealth.ServiceName.OriginalString.Remove(0, appName.Length + 1)}_{repairData.PartitionId}";
+ repairData.RepairPolicy = new RepairPolicy
+ {
+ RepairId = repairId
+ };
+ repairData.Property = healthEvent.HealthInformation.Property;
+
+ // Repair already in progress?
+ var currentRepairs = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, Token);
+
+ if (currentRepairs.Count > 0 && currentRepairs.Any(r => r.ExecutorData.Contains(repairId)))
+ {
+ continue;
+ }
+
+ /* Start repair workflow */
+
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ $"MonitorRepairableHealthEventsAsync:Replica_{rep.Id}_{errOrWarn}",
+ $"Detected Replica {rep.Id} on Partition " +
+ $"{rep.PartitionId} is in {errOrWarn}.{Environment.NewLine}" +
+ $"Replica repair policy is enabled. " +
+ $"{repairRules.Count} logic rules found for Replica repair.",
+ Token,
+ null,
+ ConfigSettings.EnableVerboseLogging);
+
+ // Start the repair workflow.
+ await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private List<string> GetRepairRulesFromErrorCode(string errorCode, string app = null)
+ {
+ if (!SupportedErrorCodes.AppErrorCodesDictionary.ContainsKey(errorCode)
+ && !SupportedErrorCodes.NodeErrorCodesDictionary.ContainsKey(errorCode))
+ {
+ return null;
+ }
+
+ string repairPolicySectionName;
+
+ switch (errorCode)
+ {
+ // App repair (user and system).
+ case SupportedErrorCodes.AppErrorCpuPercent:
+ case SupportedErrorCodes.AppErrorMemoryMB:
+ case SupportedErrorCodes.AppErrorMemoryPercent:
+ case SupportedErrorCodes.AppErrorTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.AppErrorTooManyActiveTcpPorts:
+ case SupportedErrorCodes.AppErrorTooManyOpenFileHandles:
+ case SupportedErrorCodes.AppErrorTooManyThreads:
+ case SupportedErrorCodes.AppWarningCpuPercent:
+ case SupportedErrorCodes.AppWarningMemoryMB:
+ case SupportedErrorCodes.AppWarningMemoryPercent:
+ case SupportedErrorCodes.AppWarningTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.AppWarningTooManyActiveTcpPorts:
+ case SupportedErrorCodes.AppWarningTooManyOpenFileHandles:
+ case SupportedErrorCodes.AppWarningTooManyThreads:
+
+ repairPolicySectionName =
+ app == RepairConstants.SystemAppName ? RepairConstants.SystemServiceRepairPolicySectionName : RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // VM repair.
+ case SupportedErrorCodes.NodeErrorCpuPercent:
+ case SupportedErrorCodes.NodeErrorMemoryMB:
+ case SupportedErrorCodes.NodeErrorMemoryPercent:
+ case SupportedErrorCodes.NodeErrorTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.NodeErrorTooManyActiveTcpPorts:
+ case SupportedErrorCodes.NodeErrorTotalOpenFileHandlesPercent:
+ case SupportedErrorCodes.NodeWarningCpuPercent:
+ case SupportedErrorCodes.NodeWarningMemoryMB:
+ case SupportedErrorCodes.NodeWarningMemoryPercent:
+ case SupportedErrorCodes.NodeWarningTooManyActiveEphemeralPorts:
+ case SupportedErrorCodes.NodeWarningTooManyActiveTcpPorts:
+ case SupportedErrorCodes.NodeWarningTotalOpenFileHandlesPercent:
+
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ // Disk repair.
+ case SupportedErrorCodes.NodeWarningDiskSpaceMB:
+ case SupportedErrorCodes.NodeErrorDiskSpaceMB:
+ case SupportedErrorCodes.NodeWarningDiskSpacePercent:
+ case SupportedErrorCodes.NodeErrorDiskSpacePercent:
+ case SupportedErrorCodes.NodeWarningFolderSizeMB:
+ case SupportedErrorCodes.NodeErrorFolderSizeMB:
+
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
+ private List<string> GetRepairRulesForSupportedObserver(string observerName)
+ {
+ string repairPolicySectionName;
+
+ switch (observerName)
+ {
+ // App repair (user).
+ case RepairConstants.AppObserver:
+
+ repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // System service repair.
+ case RepairConstants.FabricSystemObserver:
+ repairPolicySectionName = RepairConstants.SystemServiceRepairPolicySectionName;
+ break;
+
+ // Disk repair.
+ case RepairConstants.DiskObserver:
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ // VM repair.
+ case RepairConstants.NodeObserver:
+
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
+ /// <summary>
+ /// Get a list of rules based on facts held in a TelemetryData instance.
+ /// </summary>
+ /// <param name="repairData">Instance of TelemetryData that FabricHealer created itself or deserialized from a Health Event Description.</param>
+ /// <returns></returns>
+ private List<string> GetRepairRulesForTelemetryData(TelemetryData repairData)
+ {
+ string repairPolicySectionName;
+
+ switch (repairData.EntityType)
+ {
+ case EntityType.Unknown:
+
+ return null;
+
+ // App/Service repair (user).
+ case EntityType.Application when repairData.ApplicationName.ToLower() != RepairConstants.SystemAppName.ToLower():
+ case EntityType.Service:
+ case EntityType.StatefulService:
+ case EntityType.StatelessService:
+ repairPolicySectionName = RepairConstants.AppRepairPolicySectionName;
+ break;
+
+ // System service process repair.
+ case EntityType.Application when repairData.ApplicationName.ToLower() == RepairConstants.SystemAppName.ToLower() && repairData.ProcessName != null:
+ case EntityType.Process when repairData.ProcessName != null || repairData.ProcessId > 0:
+ repairPolicySectionName = RepairConstants.SystemServiceRepairPolicySectionName;
+ break;
+
+ // Disk repair.
+ case EntityType.Disk when serviceContext.NodeContext.NodeName == repairData.NodeName:
+ repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName;
+ break;
+
+ // Machine repair.
+ case EntityType.Machine:
+ repairPolicySectionName = RepairConstants.MachineRepairPolicySectionName;
+ break;
+
+ // Fabric Node repair (from FabricHealerProxy, for example, where there is no concept of Observer).
+ case EntityType.Node:
+ repairPolicySectionName = RepairConstants.FabricNodeRepairPolicySectionName;
+ break;
+
+ default:
+ return null;
+ }
+
+ return GetRepairRulesFromConfiguration(repairPolicySectionName);
+ }
+
+ private List<string> GetRepairRulesFromConfiguration(string repairPolicySectionName)
+ {
+ try
+ {
+ string logicRulesConfigFileName =
+ GetSettingParameterValue(serviceContext, repairPolicySectionName, RepairConstants.LogicRulesConfigurationFile);
+ var configPath = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path;
+ var rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName);
+ var rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName);
+
+ if (!File.Exists(rulesFilePath))
+ {
+ return null;
+ }
+
+ string[] rules = File.ReadAllLines(rulesFilePath);
+
+ if (rules.Length == 0)
+ {
+ return null;
+ }
+
+ List<string> repairRules = ParseRulesFile(rules);
+ return repairRules;
+ }
+ catch (Exception ex) when (ex is ArgumentException || ex is IOException)
+ {
+ return null;
+ }
+ }
+
+ private int GetEnabledRepairRuleCount()
+ {
+ var config = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
+ int count = 0;
+
+ foreach (var section in config.Settings.Sections)
+ {
+ if (!section.Name.Contains(RepairConstants.RepairPolicy))
+ {
+ continue;
+ }
+
+ if (section.Parameters[RepairConstants.Enabled]?.Value?.ToLower() == "true")
+ {
+ count++;
+ }
+ }
+
+ return count;
+ }
+
+ private void Dispose(bool disposing)
+ {
+ if (disposedValue)
+ {
+ return;
+ }
+
+ if (disposing)
+ {
+ _fabricClient?.Dispose();
+ }
+
+ disposedValue = true;
+ }
+
+ public void Dispose()
+ {
+ Dispose(true);
+ }
+
+ private static List<string> ParseRulesFile(string[] rules)
+ {
+ var repairRules = new List<string>();
+ int ptr1 = 0, ptr2 = 0;
+ rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToArray();
+
+ while (ptr1 < rules.Length && ptr2 < rules.Length)
+ {
+ // Single line comments removal.
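+ // For example, a hedged sketch of the rule text this loop expects (predicate names here are illustrative, not taken from the shipped rule files):
+ //
+ //   ## A single-line comment like this is skipped by the check below.
+ //   Mitigate(ErrorCode=?code) :- ?code == "...",
+ //       SomeRepairAction().
+ //
+ // A rule may span multiple physical lines; the lines are joined into one rule string and the terminating '.' is removed.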
+ if (rules[ptr2].TrimStart().StartsWith("##"))
+ {
+ ptr1++;
+ ptr2++;
+ continue;
+ }
+
+ if (rules[ptr2].TrimEnd().EndsWith("."))
+ {
+ if (ptr1 == ptr2)
+ {
+ repairRules.Add(rules[ptr2].TrimEnd().Remove(rules[ptr2].Length - 1, 1));
+ }
+ else
+ {
+ string rule = rules[ptr1].Trim();
+
+ for (int i = ptr1 + 1; i <= ptr2; i++)
+ {
+ rule = rule + ' ' + rules[i].Replace('\t', ' ').Trim();
+ }
+
+ repairRules.Add(rule.Remove(rule.Length - 1, 1));
+ }
+ ptr2++;
+ ptr1 = ptr2;
+ }
+ else
+ {
+ ptr2++;
+ }
+ }
+
+ return repairRules;
+ }
+
+ private async Task RandomWaitAsync()
+ {
+ var random = new Random();
+ int waitTimeMS = random.Next(random.Next(100, nodeCount * 100), 1000 * nodeCount);
+
+ await Task.Delay(waitTimeMS, Token);
+ }
+
+ // https://stackoverflow.com/questions/25678690/how-can-i-check-github-releases-in-c
+ private async Task CheckGithubForNewVersionAsync()
+ {
+ try
+ {
+ var githubClient = new GitHubClient(new ProductHeaderValue(RepairConstants.FabricHealer));
+ IReadOnlyList<Release> releases = await githubClient.Repository.Release.GetAll("microsoft", "service-fabric-healer");
+
+ if (releases.Count == 0)
+ {
+ return;
+ }
+
+ string releaseAssetName = releases[0].Name;
+ string latestVersion = releaseAssetName.Split(" ")[1];
+ Version latestGitHubVersion = new Version(latestVersion);
+ Version localVersion = new Version(InternalVersionNumber);
+ int versionComparison = localVersion.CompareTo(latestGitHubVersion);
+
+ if (versionComparison < 0)
+ {
+ string message = $"A newer version of FabricHealer is available: {latestVersion}";
+ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+ LogLevel.Info,
+ RepairConstants.FabricHealer,
+ message,
+ Token,
+ null,
+ true,
+ TimeSpan.FromDays(1),
+ "NewVersionAvailable",
+ EntityType.Application);
+ }
+ }
+ catch (Exception e)
+ {
+ // Don't take down FH due to an error in the version check.
+ RepairLogger.LogWarning($"Failure in CheckGithubForNewVersionAsync:{Environment.NewLine}{e}"); + } + } + + private async Task ClearExistingHealthReportsAsync() + { + try + { + var healthReporter = new FabricHealthReporter(RepairLogger); + var healthReport = new HealthReport + { + HealthMessage = "Clearing existing health reports as FabricHealer is stopping or updating.", + NodeName = serviceContext.NodeContext.NodeName, + State = HealthState.Ok, + HealthReportTimeToLive = TimeSpan.FromMinutes(5), + }; + + var appName = new Uri(RepairConstants.FabricHealerAppName); + var appHealth = await FabricClientSingleton.HealthManager.GetApplicationHealthAsync(appName); + var FHAppEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains($"{RepairConstants.FabricHealer}.")); + + foreach (HealthEvent evt in FHAppEvents) + { + healthReport.AppName = appName; + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + healthReport.EntityType = EntityType.Application; + + healthReporter.ReportHealthToServiceFabric(healthReport); + Thread.Sleep(50); + } + + var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(serviceContext.NodeContext.NodeName); + var FHNodeEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(RepairConstants.FabricHealer)); + + foreach (HealthEvent evt in FHNodeEvents) + { + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + healthReport.EntityType = EntityType.Node; + + healthReporter.ReportHealthToServiceFabric(healthReport); + Thread.Sleep(50); + } + } + catch (FabricException) + { + + } + } + + private string GetServiceFabricRuntimeVersion() + { + try + { + var config = ServiceFabricConfiguration.Instance; + return config.FabricVersion; + } + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + { + RepairLogger.LogWarning($"GetServiceFabricRuntimeVersion failure:{Environment.NewLine}{e}"); + } + + return null; + } + } +} diff --git a/FabricHealerProxy/EntityType.cs b/FabricHealerProxy/EntityType.cs index c0bbf77..b6bd855 100644 --- a/FabricHealerProxy/EntityType.cs +++ b/FabricHealerProxy/EntityType.cs @@ -11,9 +11,9 @@ namespace FabricHealer public enum EntityType { /// - /// Invalid (default value). + /// Unknown (default value). /// - Invalid, + Unknown, /// /// Application type. /// diff --git a/FabricHealerProxy/FabricHealerProxy.cs b/FabricHealerProxy/FabricHealerProxy.cs index c8f9041..d5e24b1 100644 --- a/FabricHealerProxy/FabricHealerProxy.cs +++ b/FabricHealerProxy/FabricHealerProxy.cs @@ -254,22 +254,22 @@ namespace FabricHealer ManageRepairDataHistory(cancellationToken); // Support not specifying EntityType. 
- if (!string.IsNullOrWhiteSpace(repairData.ServiceName) && repairData.EntityType == EntityType.Invalid) + if (!string.IsNullOrWhiteSpace(repairData.ServiceName) && repairData.EntityType == EntityType.Unknown) { repairData.EntityType = EntityType.Service; } - else if (repairData.ReplicaId > 0 && repairData.EntityType == EntityType.Invalid) + else if (repairData.ReplicaId > 0 && repairData.EntityType == EntityType.Unknown) { repairData.EntityType = EntityType.Replica; } - else if ((repairData.ProcessId > 0 || !string.IsNullOrWhiteSpace(repairData.ProcessName)) && repairData.EntityType == EntityType.Invalid) + else if ((repairData.ProcessId > 0 || !string.IsNullOrWhiteSpace(repairData.ProcessName)) && repairData.EntityType == EntityType.Unknown) { repairData.EntityType = EntityType.Process; } else if (!string.IsNullOrEmpty(repairData.NodeName) && string.IsNullOrWhiteSpace(repairData.ApplicationName) && string.IsNullOrWhiteSpace(repairData.ServiceName) && - repairData.EntityType == EntityType.Invalid || repairData.EntityType == EntityType.Machine) + repairData.EntityType == EntityType.Unknown || repairData.EntityType == EntityType.Machine) { repairData.EntityType = repairData.EntityType == EntityType.Machine ? EntityType.Machine : EntityType.Node;
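 // For illustration (a hedged caller-side sketch; the RepairFacts type name is an assumption, not shown in this hunk):
 //   var facts = new RepairFacts { NodeName = "_Node_0" };   // EntityType stays at its default, Unknown
 //   => the branch above infers EntityType.Node, while setting EntityType = EntityType.Machine explicitly keeps it a Machine-level repair.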