Operational Telemetry impl.
This commit is contained in:
Родитель
78e62cf4e1
Коммит
db4d6c6da3
|
@ -1,7 +1,7 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netcoreapp3.1</TargetFramework>
|
||||
<TargetFramework>net5.0</TargetFramework>
|
||||
|
||||
<IsPackable>false</IsPackable>
|
||||
|
||||
|
|
|
@ -51,10 +51,10 @@ namespace FHTest
|
|||
|
||||
// Set this to the full path to your Rules directory in the FabricHealer project's PackageRoot\Config directory.
|
||||
// e.g., if developing on Windows, then something like @"C:\Users\[me]\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\";
|
||||
private const string FHRulesDirectory = @"C:\Users\ctorre\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\";
|
||||
private const string FHRulesDirectory = @"C:\Users\[me]\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\";
|
||||
|
||||
/* GuanLogic Tests */
|
||||
// TODO: More of them.
|
||||
// TODO: Add more tests.
|
||||
|
||||
// This test ensures your actual rule files contain legitimate rules. This will catch bugs in your
|
||||
// logic. Of course, you should have caught these flaws in your end-to-end tests. This is just an extra precaution.
|
||||
|
|
|
@ -23,6 +23,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FHTest", "FHTest\FHTest.csp
|
|||
EndProject
|
||||
Project("{A07B5EB6-E848-4116-A8D0-A826331D98C6}") = "FabricHealerApp", "FabricHealerApp\FabricHealerApp.sfproj", "{A977C8E0-2183-4845-95EA-7F3C3E795310}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TelemetryLib", "TelemetryLib\TelemetryLib.csproj", "{7BC6991F-C840-413E-B1CD-4025947CF5FA}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
@ -59,6 +61,14 @@ Global
|
|||
{A977C8E0-2183-4845-95EA-7F3C3E795310}.Release|x64.ActiveCfg = Release|x64
|
||||
{A977C8E0-2183-4845-95EA-7F3C3E795310}.Release|x64.Build.0 = Release|x64
|
||||
{A977C8E0-2183-4845-95EA-7F3C3E795310}.Release|x64.Deploy.0 = Release|x64
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Debug|x64.Build.0 = Debug|x64
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|x64.ActiveCfg = Release|x64
|
||||
{7BC6991F-C840-413E-B1CD-4025947CF5FA}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<ProjectGuid>{9A19103F-16F7-4668-BE54-9A1E7A4F7556}</ProjectGuid>
|
||||
<TargetFramework>netcoreapp3.1</TargetFramework>
|
||||
<TargetFramework>net5.0</TargetFramework>
|
||||
<PlatformTarget>x64</PlatformTarget>
|
||||
<OutputType>Exe</OutputType>
|
||||
<!-- ***NOTE***:
|
||||
|
@ -13,8 +13,8 @@
|
|||
<RuntimeIdentifiers>linux-x64;win-x64</RuntimeIdentifiers>-->
|
||||
<RootNamespace>FabricHealer</RootNamespace>
|
||||
<AssemblyName>FabricHealer</AssemblyName>
|
||||
<AssemblyVersion>0.4.2</AssemblyVersion>
|
||||
<FileVersion>0.4.2</FileVersion>
|
||||
<AssemblyVersion>1.0.0</AssemblyVersion>
|
||||
<FileVersion>1.0.0</FileVersion>
|
||||
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
|
||||
<IsServiceFabricServiceProject>true</IsServiceFabricServiceProject>
|
||||
<StartupObject>FabricHealer.Program</StartupObject>
|
||||
|
@ -27,12 +27,8 @@
|
|||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.ApplicationInsights" Version="2.17.0" />
|
||||
<PackageReference Include="Microsoft.Logic.Guan" Version="1.0.0-Preview" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Data" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Data.Extensions" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Data.Interfaces" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Diagnostics.Internal" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Services.Remoting" Version="5.0.516" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric" Version="7.2.452" />
|
||||
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="4.2.452" />
|
||||
<PackageReference Include="Microsoft.CSharp" Version="4.7.0" />
|
||||
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
|
||||
<PackageReference Include="NLog" Version="4.7.9" />
|
||||
|
@ -40,4 +36,7 @@
|
|||
<PackageReference Include="System.Data.DataSetExtensions" Version="4.5.0" />
|
||||
<PackageReference Include="System.Configuration.ConfigurationManager" Version="5.0.0" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\TelemetryLib\TelemetryLib.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
|
@ -17,12 +17,18 @@ using System.Threading.Tasks;
|
|||
using HealthReport = FabricHealer.Utilities.HealthReport;
|
||||
using System.Fabric.Repair;
|
||||
using System.Fabric.Query;
|
||||
using System.Fabric.Description;
|
||||
using FabricHealer.TelemetryLib;
|
||||
|
||||
namespace FabricHealer
|
||||
{
|
||||
public sealed class FabricHealerManager : IDisposable
|
||||
{
|
||||
internal static TelemetryUtilities TelemetryUtilities;
|
||||
internal static RepairData RepairHistory;
|
||||
|
||||
// Folks often use their own version numbers. This is for internal diagnostic telemetry.
|
||||
private const string InternalVersionNumber = "1.0.0-Preview";
|
||||
private static FabricHealerManager singleton;
|
||||
private bool disposedValue;
|
||||
private readonly StatelessServiceContext serviceContext;
|
||||
private readonly FabricClient fabricClient;
|
||||
|
@ -31,8 +37,9 @@ namespace FabricHealer
|
|||
private readonly Uri systemAppUri = new Uri("fabric:/System");
|
||||
private readonly Uri repairManagerServiceUri = new Uri("fabric:/System/RepairManagerService");
|
||||
private readonly FabricHealthReporter healthReporter;
|
||||
private static FabricHealerManager singleton;
|
||||
internal static TelemetryUtilities TelemetryUtilities;
|
||||
private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1);
|
||||
private int nodeCount;
|
||||
private DateTime StartDateTime;
|
||||
|
||||
internal static Logger RepairLogger
|
||||
{
|
||||
|
@ -46,12 +53,27 @@ namespace FabricHealer
|
|||
private set;
|
||||
}
|
||||
|
||||
private bool FabricHealerOperationalTelemetryEnabled
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
|
||||
// CancellationToken from FabricHealer.RunAsync. See ctor.
|
||||
private CancellationToken Token
|
||||
{
|
||||
get;
|
||||
}
|
||||
|
||||
private DateTime LastTelemetrySendDate
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
|
||||
private bool EtwEnabled
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
|
||||
private FabricHealerManager(StatelessServiceContext context, CancellationToken token)
|
||||
{
|
||||
serviceContext = context;
|
||||
|
@ -62,26 +84,12 @@ namespace FabricHealer
|
|||
TelemetryUtilities = new TelemetryUtilities(fabricClient, context);
|
||||
repairTaskEngine = new RepairTaskEngine(fabricClient);
|
||||
repairTaskManager = new RepairTaskManager(fabricClient, serviceContext, Token);
|
||||
RepairLogger = new Logger("FabricHealer")
|
||||
RepairLogger = new Logger("FabricHealer", ConfigSettings.LocalLogPathParameter)
|
||||
{
|
||||
EnableVerboseLogging = ConfigSettings.EnableVerboseLogging,
|
||||
EnableVerboseLogging = ConfigSettings.EnableVerboseLogging
|
||||
};
|
||||
|
||||
// Local Logger setup.
|
||||
string logFolderBasePath;
|
||||
string localLogPath = ConfigSettings.LocalLogPathParameter;
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(localLogPath))
|
||||
{
|
||||
logFolderBasePath = localLogPath;
|
||||
}
|
||||
else
|
||||
{
|
||||
string logFolderBase = Path.Combine($@"{Environment.CurrentDirectory}", "fabrichealer_logs");
|
||||
logFolderBasePath = logFolderBase;
|
||||
}
|
||||
|
||||
RepairLogger.LogFolderBasePath = logFolderBasePath;
|
||||
RepairHistory = new RepairData();
|
||||
healthReporter = new FabricHealthReporter(fabricClient);
|
||||
}
|
||||
|
||||
|
@ -141,7 +149,6 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
healthReporter.ReportHealthToServiceFabric(healthReport);
|
||||
|
||||
return isRmDeployed;
|
||||
}
|
||||
|
||||
|
@ -206,6 +213,8 @@ namespace FabricHealer
|
|||
// repair actions, scheduling and executing related repair tasks.
|
||||
public async Task StartAsync()
|
||||
{
|
||||
StartDateTime = DateTime.UtcNow;
|
||||
|
||||
if (!ConfigSettings.EnableAutoMitigation || !await CheckRepairManagerDeploymentStatusAsync(repairManagerServiceUri, Token).ConfigureAwait(true))
|
||||
{
|
||||
return;
|
||||
|
@ -215,6 +224,12 @@ namespace FabricHealer
|
|||
{
|
||||
RepairLogger.LogInfo("Starting FabricHealer Health Detection loop.");
|
||||
|
||||
var nodeList =
|
||||
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => fabricClient.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, Token),
|
||||
Token).ConfigureAwait(true);
|
||||
nodeCount = nodeList.Count;
|
||||
|
||||
// First, let's clean up any orphan non-node level FabricHealer repair tasks left pending
|
||||
// when the FabricHealer process is killed or otherwise ungracefully closed.
|
||||
// This call will return quickly if FH was gracefully closed as there will be
|
||||
|
@ -234,8 +249,39 @@ namespace FabricHealer
|
|||
continue;
|
||||
}
|
||||
|
||||
GC.Collect(2, GCCollectionMode.Forced, true, false);
|
||||
GC.Collect(2, GCCollectionMode.Forced, true, false);
|
||||
// Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in
|
||||
// understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more
|
||||
// than 90 days.
|
||||
if (ConfigSettings.OperationalTelemetryEnabled && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval)
|
||||
{
|
||||
try
|
||||
{
|
||||
using var telemetryEvents = new TelemetryEvents(
|
||||
fabricClient,
|
||||
serviceContext,
|
||||
ServiceEventSource.Current,
|
||||
Token,
|
||||
EtwEnabled);
|
||||
|
||||
var fhData = GetFabricHealerInternalTelemetryData();
|
||||
|
||||
if (fhData != null)
|
||||
{
|
||||
string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_operational_telemetry.log");
|
||||
|
||||
if (telemetryEvents.EmitFabricObserverOperationalEvent(fhData, OperationalTelemetryRunInterval, filepath))
|
||||
{
|
||||
LastTelemetrySendDate = DateTime.UtcNow;
|
||||
ResetInternalDataCounters();
|
||||
}
|
||||
}
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Telemetry is non-critical and should not take down FH.
|
||||
// TelemetryLib will log exception details to file in top level FH log folder.
|
||||
}
|
||||
}
|
||||
|
||||
await Task.Delay(
|
||||
TimeSpan.FromSeconds(
|
||||
|
@ -255,7 +301,7 @@ namespace FabricHealer
|
|||
RepairLogger.LogInfo("Shutdown signaled. Stopping.");
|
||||
}
|
||||
}
|
||||
catch (Exception e) when (e is OperationCanceledException || e is TimeoutException)
|
||||
catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException)
|
||||
{
|
||||
// This check is necessary to prevent cancelling outstanding repair tasks if
|
||||
// one of the handled exceptions originated from another operation unrelated to
|
||||
|
@ -285,12 +331,74 @@ namespace FabricHealer
|
|||
});
|
||||
}
|
||||
|
||||
// Operational telemetry sent to the FH developer for use in understanding generic behavior of FH in the real world (no PII).
|
||||
if (ConfigSettings.OperationalTelemetryEnabled)
|
||||
{
|
||||
try
|
||||
{
|
||||
using var telemetryEvents = new TelemetryEvents(
|
||||
fabricClient,
|
||||
serviceContext,
|
||||
ServiceEventSource.Current,
|
||||
Token,
|
||||
EtwEnabled);
|
||||
|
||||
var fhData = new FabricHealerCriticalErrorEventData
|
||||
{
|
||||
Source = nameof(FabricHealerManager),
|
||||
ErrorMessage = e.Message,
|
||||
ErrorStack = e.StackTrace,
|
||||
CrashTime = DateTime.UtcNow.ToString("o"),
|
||||
Version = InternalVersionNumber
|
||||
};
|
||||
|
||||
string filepath = Path.Combine(RepairLogger.LogFolderBasePath, $"fh_critical_error_telemetry.log");
|
||||
_ = telemetryEvents.EmitFabricObserverCriticalErrorEvent(fhData, filepath);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Telemetry is non-critical and should not take down FH.
|
||||
}
|
||||
}
|
||||
|
||||
// Don't swallow the exception.
|
||||
// Take down FH process. Fix the bugs.
|
||||
// Take down FH process. Fix the bug.
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Clears the per-repair tally and zeroes the aggregate counters held by the static
/// <see cref="RepairHistory"/> instance. The health loop invokes this after an
/// operational telemetry event has been emitted so the next reporting window starts from zero.
/// </summary>
private void ResetInternalDataCounters()
{
    // Empty the per-repair-name tally first, then zero each aggregate counter.
    FabricHealerManager.RepairHistory.Repairs.Clear();

    FabricHealerManager.RepairHistory.EnabledRepairCount = 0;
    FabricHealerManager.RepairHistory.RepairCount = 0;
    FabricHealerManager.RepairHistory.SuccessfulRepairs = 0;
    FabricHealerManager.RepairHistory.FailedRepairs = 0;
}
|
||||
|
||||
/// <summary>
/// Builds the operational telemetry payload: FabricHealer up time (elapsed since StartDateTime),
/// the internal version number, and the current <see cref="RepairHistory"/> data
/// (after refreshing its enabled repair rule count).
/// </summary>
/// <returns>
/// The populated event data, or null when the data could not be built. Callers treat null as "nothing to send".
/// </returns>
private FabricHealerOperationalEventData GetFabricHealerInternalTelemetryData()
{
    try
    {
        RepairHistory.EnabledRepairCount = GetEnabledRepairRuleCount();

        return new FabricHealerOperationalEventData
        {
            UpTime = DateTime.UtcNow.Subtract(StartDateTime).ToString(),
            Version = InternalVersionNumber,
            RepairData = RepairHistory
        };
    }
    catch (Exception e)
    {
        // Telemetry is best-effort and must never take down FH, so the failure is not rethrown.
        // It is logged (rather than silently dropped) so a persistently missing telemetry event can be diagnosed.
        RepairLogger.LogInfo($"Unable to build operational telemetry data:{Environment.NewLine}{e}");

        return null;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Cancels all FabricHealer repair tasks currently in flight (unless in Restoring state).
|
||||
/// OR Resumes fabric node-level repairs that were abandoned due to FH going down while they were processing.
|
||||
|
@ -393,14 +501,9 @@ namespace FabricHealer
|
|||
}
|
||||
}
|
||||
|
||||
private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs<ConfigurationPackage> e)
|
||||
private void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs<ConfigurationPackage> e)
|
||||
{
|
||||
ConfigSettings.UpdateConfigSettings(e.NewPackage.Settings);
|
||||
|
||||
if (ConfigSettings.EnableAutoMitigation)
|
||||
{
|
||||
await StartAsync().ConfigureAwait(true);
|
||||
}
|
||||
}
|
||||
|
||||
/* Potential TODOs. This list should grow and external predicates should be written to support related workflow composition in logic rule file(s).
|
||||
|
@ -572,6 +675,7 @@ namespace FabricHealer
|
|||
private async Task ProcessApplicationHealthAsync(IEnumerable<ApplicationHealthState> appHealthStates)
|
||||
{
|
||||
var supportedAppHealthStates = appHealthStates.Where(a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error);
|
||||
var nodeList = await fabricClient.QueryManager.GetNodeListAsync().ConfigureAwait(false);
|
||||
|
||||
foreach (var app in supportedAppHealthStates)
|
||||
{
|
||||
|
@ -619,7 +723,7 @@ namespace FabricHealer
|
|||
|
||||
// Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
|
||||
var random = new Random();
|
||||
int waitTimeMS = random.Next(250, 10000);
|
||||
int waitTimeMS = random.Next(random.Next(0, nodeCount * 100), 1000 * nodeCount);
|
||||
await Task.Delay(waitTimeMS, Token).ConfigureAwait(true);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description))
|
||||
|
@ -787,7 +891,7 @@ namespace FabricHealer
|
|||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Node targetNode = nodeList[0];
|
||||
|
||||
// Check to see if a VM-level repair is already in flight in the cluster.
|
||||
|
@ -819,7 +923,7 @@ namespace FabricHealer
|
|||
|
||||
// Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
|
||||
var random = new Random();
|
||||
int waitTimeMS = random.Next(250, 10000);
|
||||
int waitTimeMS = random.Next(random.Next(0, nodeCount * 100), 1000 * nodeCount);
|
||||
await Task.Delay(waitTimeMS, Token).ConfigureAwait(true);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(evt.HealthInformation.Description))
|
||||
|
@ -920,7 +1024,7 @@ namespace FabricHealer
|
|||
|
||||
// Random wait to limit potential duplicate (concurrent) repair job creation from other FH instances.
|
||||
var random = new Random();
|
||||
int waitTimeMS = random.Next(250, 10000);
|
||||
int waitTimeMS = random.Next(random.Next(0, nodeCount * 100), 1000 * nodeCount);
|
||||
await Task.Delay(waitTimeMS, Token).ConfigureAwait(true);
|
||||
|
||||
var service = await fabricClient.QueryManager.GetServiceNameAsync(
|
||||
|
@ -1105,6 +1209,27 @@ namespace FabricHealer
|
|||
return repairRules;
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts the configuration sections in the "Config" package whose name contains "RepairPolicy"
/// and whose "Enabled" parameter is set to true (case-insensitive).
/// </summary>
/// <returns>The number of enabled repair policy sections.</returns>
private int GetEnabledRepairRuleCount()
{
    var config = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config");
    int count = 0;

    foreach (var section in config.Settings.Sections)
    {
        if (!section.Name.Contains("RepairPolicy"))
        {
            continue;
        }

        // The keyed-collection indexer throws when the key is absent (a null-conditional on the
        // result does not help), so check for the parameter before reading it. A section without
        // an "Enabled" parameter is simply not counted.
        if (!section.Parameters.Contains("Enabled"))
        {
            continue;
        }

        // Ordinal, case-insensitive comparison: this is a machine-readable setting, not UI text.
        if (string.Equals(section.Parameters["Enabled"].Value, "true", StringComparison.OrdinalIgnoreCase))
        {
            count++;
        }
    }

    return count;
}
|
||||
|
||||
private void Dispose(bool disposing)
|
||||
{
|
||||
if (disposedValue)
|
||||
|
|
|
@ -30,9 +30,9 @@
|
|||
## *Network-related repair is not supported today.
|
||||
|
||||
## RunInterval rule: Are we inside the run interval for an app repair? This is optional. It's up to you if you want to employ this type of constraint.
|
||||
## This below rule means that if any of the repairs for any of the apps in this rule file has run within the past 5 minutes, cut (!),
|
||||
## The rule below means that if any of the repairs for any of the apps in this rule file has run within the past 10 minutes, cut (!),
|
||||
## which is a special operator that means, effectively, stop processing rules; do not backtrack.
|
||||
## By having this as a top level rule, it means no subsequent rules in this file will run if we are inside the run interval.
|
||||
## By having this as a top level rule, it means no subsequent rules in this file will run if we are inside the specified run interval.
|
||||
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=00:10:00), !.
|
||||
|
||||
|
@ -89,7 +89,7 @@ TimeScopedRestartCodePackage(?count, ?time) :- GetRepairHistory(?repairCount, Ti
|
|||
|
||||
## If we get here, it means the number of repairs for a target has not exceeded the maximum number specified to run within a time window.
|
||||
## Note you can add up to two optional arguments to RestartCodePackage, name them whatever you want or omit the names, it just has to be either a TimeSpan value for how long to wait
|
||||
## for the repair target to become healthy or a bool for whether or not RM should do health checks before and after the repair executes.
|
||||
## Obviously, you can supply both. See below for an example using both optional arguments (named for clarity..you could also do RestartCodePackage(true, 00:10:00)).
|
||||
## for the repair target to become healthy and/or a bool for whether or not RM should do health checks before and after the repair executes.
|
||||
## See below for an example using both optional arguments (arguments are named for clarity (generally a good idea)...you could also supply RestartCodePackage(true, 00:10:00)).
|
||||
|
||||
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00).
|
|
@ -19,8 +19,8 @@
|
|||
## | EphemeralPorts |
|
||||
## | MemoryMB |
|
||||
## | MemoryPercent |
|
||||
## | FileHandles |
|
||||
## | FileHandlesPercent |
|
||||
## | FileHandles (Linux) |
|
||||
## | FileHandlesPercent (Linux)|
|
||||
|
||||
## First, check if we are inside run interval. If so, then cut (!).
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !.
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
<Parameter Name="EventSourceProviderName" Value="FabricHealerETWProvider" />
|
||||
<!-- Big on/off switch. You can be more granular below in the Repair policies sections. -->
|
||||
<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
|
||||
<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
|
||||
</Section>
|
||||
<!-- Repair policies -->
|
||||
<Section Name="AppRepairPolicy">
|
||||
|
|
|
@ -26,10 +26,11 @@ namespace FabricHealer.Repair
|
|||
public const string AppInsightsTelemetryEnabled = "EnableTelemetryProvider";
|
||||
public const string AppInsightsInstrumentationKeyParameter = "AppInsightsInstrumentationKey";
|
||||
public const string EnableEventSourceProvider = "EnableEventSourceProvider";
|
||||
public const string EventSourceProviderName = "EventSourceProviderName";
|
||||
public const string EventSourceProviderName = "FabricHealerETWProvider";
|
||||
public const string HealthCheckLoopSleepTimeSeconds = "HealthCheckLoopSleepTimeSeconds";
|
||||
public const string LocalLogPathParameter = "LocalLogPath";
|
||||
public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
|
||||
public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
|
||||
|
||||
// General Repair Settings Parameters.
|
||||
public const string EnableAutoMitigation = "EnableAutoMitigation";
|
||||
|
|
|
@ -130,6 +130,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
if (restartCodePackageResult != null)
|
||||
{
|
||||
UpdateRepairHistory(repairConfiguration);
|
||||
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken, "AppObserver").ConfigureAwait(true);
|
||||
}
|
||||
|
||||
|
@ -142,11 +143,29 @@ namespace FabricHealer.Repair
|
|||
"RepairExecutor.RestartCodePackageAsync",
|
||||
$"Execution failure:{Environment.NewLine}{e}",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Records one successful repair in the shared <c>FabricHealerManager.RepairHistory</c>:
/// increments the tally kept under the repair action's name, the total repair count,
/// and the successful repair count.
/// </summary>
/// <param name="repairConfiguration">Configuration of the repair that just completed; its RepairPolicy.RepairAction names the tally entry.</param>
private static void UpdateRepairHistory(RepairConfiguration repairConfiguration)
{
    var repairAction = repairConfiguration.RepairPolicy.RepairAction;

    // Enum.GetName returns null for a value that is not a defined member. A null dictionary key
    // would throw, so fall back to the value's string form rather than fail the repair path
    // over telemetry bookkeeping.
    string repairName = Enum.GetName(typeof(RepairActionType), repairAction) ?? repairAction.ToString();

    var repairTally = FabricHealerManager.RepairHistory.Repairs;

    // Single lookup: a missing entry leaves the out value at its default (0), so this both
    // creates a new entry at 1 and increments an existing one.
    repairTally.TryGetValue(repairName, out var timesRun);
    repairTally[repairName] = timesRun + 1;

    FabricHealerManager.RepairHistory.RepairCount++;
    FabricHealerManager.RepairHistory.SuccessfulRepairs++;
}
|
||||
|
||||
/// <summary>
|
||||
/// Safely restarts a Service Fabric Node instance.
|
||||
/// Algorithm:
|
||||
|
@ -175,7 +194,6 @@ namespace FabricHealer.Repair
|
|||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
FabricHealerManager.RepairLogger.LogInfo(info);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -496,6 +514,8 @@ namespace FabricHealer.Repair
|
|||
statusSuccess,
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
UpdateRepairHistory(repairConfiguration);
|
||||
}
|
||||
catch (Exception e) when (e is FabricException || e is TimeoutException || e is OperationCanceledException)
|
||||
{
|
||||
|
@ -514,6 +534,7 @@ namespace FabricHealer.Repair
|
|||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -560,12 +581,14 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
p?.Kill(true);
|
||||
UpdateRepairHistory(repairConfiguration);
|
||||
|
||||
// Clear Warning from FO. If in fact the issue has not been solved, then FO will generate a new health report for the target and the game will be played again.
|
||||
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Application, cancellationToken, "FabricSystemObserver").ConfigureAwait(true);
|
||||
}
|
||||
catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException || e is Win32Exception)
|
||||
{
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return false;
|
||||
}
|
||||
catch (Exception e)
|
||||
|
@ -583,6 +606,8 @@ namespace FabricHealer.Repair
|
|||
err,
|
||||
cancellationToken,
|
||||
repairConfiguration);
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
|
||||
// fix the bug..
|
||||
throw;
|
||||
|
@ -680,6 +705,8 @@ namespace FabricHealer.Repair
|
|||
statusSuccess,
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
UpdateRepairHistory(repairConfiguration);
|
||||
}
|
||||
catch (Exception e) when (e is FabricException || e is TimeoutException || e is OperationCanceledException)
|
||||
{
|
||||
|
@ -697,7 +724,8 @@ namespace FabricHealer.Repair
|
|||
err,
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -796,7 +824,8 @@ namespace FabricHealer.Repair
|
|||
$"Unable to delete specified number of files ({maxFiles}).",
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -807,8 +836,9 @@ namespace FabricHealer.Repair
|
|||
"RepairExecutor.DeleteFilesAsync::IncompleteOperation",
|
||||
"Unable to delete all files.",
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
FabricHealerManager.RepairHistory.FailedRepairs++;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -818,10 +848,11 @@ namespace FabricHealer.Repair
|
|||
$"Successfully deleted {(maxFiles > 0 ? "up to " + maxFiles : "all")} files in {targetFolderPath}",
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
|
||||
UpdateRepairHistory(repairConfiguration);
|
||||
}
|
||||
|
||||
await ClearHealthWarningsAsync(repairConfiguration, HealthScope.Node, cancellationToken, "DiskObserver").ConfigureAwait(true);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -372,7 +372,6 @@ namespace FabricHealer.Repair
|
|||
|
||||
public async Task<bool> RestartDeployedCodePackageAsync(RepairConfiguration repairConfiguration, CancellationToken cancellationToken)
|
||||
{
|
||||
#if DEBUG
|
||||
string actionMessage =
|
||||
"Attempting to restart deployed code package for service " +
|
||||
$"{repairConfiguration.ServiceName.OriginalString} " +
|
||||
|
@ -384,14 +383,14 @@ namespace FabricHealer.Repair
|
|||
actionMessage,
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
#endif
|
||||
|
||||
var result = await RepairExec.RestartDeployedCodePackageAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
|
||||
if (result == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#if DEBUG
|
||||
|
||||
actionMessage =
|
||||
"Successfully restarted deployed code package for service " +
|
||||
$"{repairConfiguration.ServiceName.OriginalString} " +
|
||||
|
@ -403,7 +402,7 @@ namespace FabricHealer.Repair
|
|||
actionMessage,
|
||||
cancellationToken,
|
||||
repairConfiguration).ConfigureAwait(true);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -562,13 +561,13 @@ namespace FabricHealer.Repair
|
|||
|
||||
return false;
|
||||
}
|
||||
#if DEBUG
|
||||
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager::WaitingForApproval",
|
||||
$"Waiting for RM to Approve repair task {repairTask.TaskId}.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
#endif
|
||||
|
||||
while (approvalTimeout >= stopWatch.Elapsed)
|
||||
{
|
||||
repairs = await repairTaskEngine.GetFHRepairTasksCurrentlyProcessingAsync(RepairTaskEngine.FabricHealerExecutorName, cancellationToken).ConfigureAwait(true);
|
||||
|
@ -602,13 +601,11 @@ namespace FabricHealer.Repair
|
|||
|
||||
if (isApproved)
|
||||
{
|
||||
#if DEBUG
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync_Approved",
|
||||
$"RM has Approved repair task {repairTask.TaskId}.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -619,11 +616,9 @@ namespace FabricHealer.Repair
|
|||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
await FabricRepairTasks.CancelRepairTaskAsync(repairTask, FabricClientInstance);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
_ = await FabricRepairTasks.SetFabricRepairJobStateAsync(
|
||||
repairTask,
|
||||
RepairTaskState.Executing,
|
||||
|
@ -650,43 +645,43 @@ namespace FabricHealer.Repair
|
|||
// Note: For SF app container services, RestartDeployedCodePackage API does not work.
|
||||
// Thus, using Restart/Remove(stateful/stateless)Replica API instead, which does restart container instances.
|
||||
case RepairActionType.RestartCodePackage:
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(repairConfiguration.ContainerId))
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(repairConfiguration.ContainerId))
|
||||
success = await RestartDeployedCodePackageAsync(repairConfiguration, cancellationToken)
|
||||
.ConfigureAwait(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Need replica or instance details..
|
||||
var repList = await FabricClientInstance.QueryManager.GetReplicaListAsync(
|
||||
repairConfiguration.PartitionId,
|
||||
repairConfiguration.ReplicaOrInstanceId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = await RestartDeployedCodePackageAsync(repairConfiguration, cancellationToken)
|
||||
.ConfigureAwait(true);
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
|
||||
var rep = repList[0];
|
||||
|
||||
// Restarting stateful replica will restart the container instance.
|
||||
if (rep.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Need replica or instance details..
|
||||
var repList = await FabricClientInstance.QueryManager.GetReplicaListAsync(
|
||||
repairConfiguration.PartitionId,
|
||||
repairConfiguration.ReplicaOrInstanceId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
if (repList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
|
||||
var rep = repList[0];
|
||||
|
||||
// Restarting stateful replica will restart the container instance.
|
||||
if (rep.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless instances, you need to remove the replica, which will
|
||||
// restart the container instance.
|
||||
success = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
// For stateless instances, you need to remove the replica, which will
|
||||
// restart the container instance.
|
||||
success = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RemoveReplica:
|
||||
|
||||
success = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
|
@ -698,65 +693,65 @@ namespace FabricHealer.Repair
|
|||
break;
|
||||
|
||||
case RepairActionType.RestartReplica:
|
||||
{
|
||||
var replicaList = await FabricClientInstance.QueryManager.GetReplicaListAsync(
|
||||
repairConfiguration.PartitionId,
|
||||
repairConfiguration.ReplicaOrInstanceId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
if (replicaList.Count == 0)
|
||||
{
|
||||
var replicaList = await FabricClientInstance.QueryManager.GetReplicaListAsync(
|
||||
repairConfiguration.PartitionId,
|
||||
repairConfiguration.ReplicaOrInstanceId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
if (replicaList.Count == 0)
|
||||
{
|
||||
success = false;
|
||||
success = false;
|
||||
#if DEBUG
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Replica or Instance {repairConfiguration.ReplicaOrInstanceId} not found on partition " +
|
||||
$"{repairConfiguration.PartitionId}.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
|
||||
$"Replica or Instance {repairConfiguration.ReplicaOrInstanceId} not found on partition " +
|
||||
$"{repairConfiguration.PartitionId}.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
var replica = replicaList[0];
|
||||
|
||||
// Restart - stateful replica.
|
||||
if (replica.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless replicas, you need to remove the replica. The runtime will create a new one
|
||||
// and place it..
|
||||
success = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var replica = replicaList[0];
|
||||
|
||||
// Restart - stateful replica.
|
||||
if (replica.ServiceKind == ServiceKind.Stateful)
|
||||
{
|
||||
success = await RestartReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For stateless replicas, you need to remove the replica. The runtime will create a new one
|
||||
// and place it..
|
||||
success = await RemoveReplicaAsync(repairConfiguration, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case RepairActionType.RestartFabricNode:
|
||||
{
|
||||
var executorData = repairTask.ExecutorData;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(executorData))
|
||||
{
|
||||
var executorData = repairTask.ExecutorData;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(executorData))
|
||||
{
|
||||
#if DEBUG
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.SafeRestartFabricNode",
|
||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
"RepairTaskManager.SafeRestartFabricNode",
|
||||
$"Repair {repairTask.TaskId} is missing ExecutorData.",
|
||||
cancellationToken).ConfigureAwait(true);
|
||||
#endif
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
success = await SafeRestartServiceFabricNodeAsync(repairConfiguration.NodeName, repairTask, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
|
||||
break;
|
||||
success = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
success = await SafeRestartServiceFabricNodeAsync(repairConfiguration.NodeName, repairTask, cancellationToken).ConfigureAwait(true);
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
@ -793,7 +788,7 @@ namespace FabricHealer.Repair
|
|||
break;
|
||||
|
||||
default:
|
||||
throw new ArgumentOutOfRangeException();
|
||||
throw new ArgumentException("Unknown repair target type.");
|
||||
}
|
||||
|
||||
if (success)
|
||||
|
@ -876,7 +871,6 @@ namespace FabricHealer.Repair
|
|||
cancellationToken).ConfigureAwait(true);
|
||||
|
||||
await FabricRepairTasks.CancelRepairTaskAsync(repairTask, FabricClientInstance).ConfigureAwait(true);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -914,7 +908,6 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
stopwatch.Stop();
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -929,67 +922,67 @@ namespace FabricHealer.Repair
|
|||
switch (repairConfig.RepairPolicy.TargetType)
|
||||
{
|
||||
case RepairTargetType.Application:
|
||||
{
|
||||
var appHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetApplicationHealthAsync(
|
||||
repairConfig.AppName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
|
||||
bool isTargetAppHealedOnTargetNode = false;
|
||||
|
||||
// System Service repairs (Restarts)
|
||||
if (repairConfig.AppName.OriginalString == "fabric:/System")
|
||||
{
|
||||
var appHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetApplicationHealthAsync(
|
||||
repairConfig.AppName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
|
||||
bool isTargetAppHealedOnTargetNode = false;
|
||||
|
||||
// System Service repairs (Restarts)
|
||||
if (repairConfig.AppName.OriginalString == "fabric:/System")
|
||||
{
|
||||
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.SystemServiceProcessName == repairConfig.SystemServiceProcessName
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
}
|
||||
else // Application repairs.
|
||||
{
|
||||
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.ApplicationName == repairConfig.AppName.OriginalString
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
}
|
||||
|
||||
return isTargetAppHealedOnTargetNode ? HealthState.Ok : appHealth.AggregatedHealthState;
|
||||
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.SystemServiceProcessName == repairConfig.SystemServiceProcessName
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
}
|
||||
else // Application repairs.
|
||||
{
|
||||
isTargetAppHealedOnTargetNode = appHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.ApplicationName == repairConfig.AppName.OriginalString
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
}
|
||||
|
||||
return isTargetAppHealedOnTargetNode ? HealthState.Ok : appHealth.AggregatedHealthState;
|
||||
}
|
||||
case RepairTargetType.Node:
|
||||
case RepairTargetType.VirtualMachine:
|
||||
{
|
||||
var nodeHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetNodeHealthAsync(
|
||||
repairConfig.NodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
{
|
||||
var nodeHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetNodeHealthAsync(
|
||||
repairConfig.NodeName,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
|
||||
bool isTargetNodeHealed = nodeHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
bool isTargetNodeHealed = nodeHealth.HealthEvents.Any(
|
||||
h => JsonSerializationUtility.TryDeserialize(h.HealthInformation.Description,
|
||||
out TelemetryData foHealthData)
|
||||
&& foHealthData.NodeName == repairConfig.NodeName
|
||||
&& foHealthData.HealthState.ToLower() == "ok");
|
||||
|
||||
return isTargetNodeHealed ? HealthState.Ok : nodeHealth.AggregatedHealthState;
|
||||
}
|
||||
return isTargetNodeHealed ? HealthState.Ok : nodeHealth.AggregatedHealthState;
|
||||
}
|
||||
case RepairTargetType.Replica:
|
||||
{
|
||||
// Make sure the Partition where the restarted replica was located is now healthy.
|
||||
var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetPartitionHealthAsync(
|
||||
repairConfig.PartitionId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
return partitionHealth.AggregatedHealthState;
|
||||
}
|
||||
{
|
||||
// Make sure the Partition where the restarted replica was located is now healthy.
|
||||
var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => FabricClientInstance.HealthManager.GetPartitionHealthAsync(
|
||||
repairConfig.PartitionId,
|
||||
FabricHealerManager.ConfigSettings.AsyncTimeout,
|
||||
token),
|
||||
token);
|
||||
return partitionHealth.AggregatedHealthState;
|
||||
}
|
||||
default:
|
||||
return HealthState.Unknown;
|
||||
}
|
||||
|
|
|
@ -7,36 +7,29 @@ using System;
|
|||
using System.Diagnostics.Tracing;
|
||||
using System.Fabric;
|
||||
using System.Threading.Tasks;
|
||||
using FabricHealer.TelemetryLib;
|
||||
using FabricHealer.Repair;
|
||||
|
||||
namespace FabricHealer
|
||||
{
|
||||
[EventSource(Name = "Service-Fabric-FabricHealer", Guid = "344ea295-2ee1-53a6-fc40-4e893e25c60d")]
|
||||
internal sealed class ServiceEventSource : EventSource
|
||||
public sealed class ServiceEventSource : EventSource, ITelemetryEventSource
|
||||
{
|
||||
public static readonly ServiceEventSource Current = new ServiceEventSource();
|
||||
|
||||
static ServiceEventSource()
|
||||
{
|
||||
// A workaround for the problem where ETW activities do not get tracked until Tasks infrastructure is initialized.
|
||||
// This problem will be fixed in .NET Framework 4.6.2.
|
||||
Task.Run(() => { });
|
||||
// This problem is fixed in .NET Framework 4.6.2. If you are running this version or greater, then delete the below code.
|
||||
_ = Task.Run(() => { });
|
||||
}
|
||||
|
||||
// Instance constructor is private to enforce singleton semantics
|
||||
private ServiceEventSource() : base() { }
|
||||
|
||||
#region Keywords
|
||||
// Event keywords can be used to categorize events.
|
||||
// Each keyword is a bit flag. A single event can be associated with multiple keywords (via EventAttribute.Keywords property).
|
||||
// Keywords must be defined as a public class named 'Keywords' inside EventSource that uses them.
|
||||
public static class Keywords
|
||||
// Instance constructor is private to enforce singleton semantics.
|
||||
// The ETW provider name is passed to base.ctor here instead of decorating this class.
|
||||
private ServiceEventSource() : base(RepairConstants.EventSourceProviderName)
|
||||
{
|
||||
public const EventKeywords Requests = (EventKeywords)0x1L;
|
||||
public const EventKeywords ServiceInitialization = (EventKeywords)0x2L;
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Events
|
||||
}
|
||||
|
||||
// Define an instance method for each event you want to record and apply an [Event] attribute to it.
|
||||
// The method name is the name of the event.
|
||||
// Pass any parameters you want to record with the event (only primitive integer types, DateTime, Guid & string are allowed).
|
||||
|
@ -44,18 +37,62 @@ namespace FabricHealer
|
|||
// The number and types of arguments passed to every event method must exactly match what is passed to WriteEvent().
|
||||
// Put [NonEvent] attribute on all methods that do not define an event.
|
||||
// For more information see https://msdn.microsoft.com/en-us/library/system.diagnostics.tracing.eventsource.aspx
|
||||
|
||||
[NonEvent]
|
||||
public void Message(string message, params object[] args)
|
||||
{
|
||||
if (IsEnabled())
|
||||
if (!IsEnabled())
|
||||
{
|
||||
string finalMessage = string.Format(message, args);
|
||||
Message(finalMessage);
|
||||
return;
|
||||
}
|
||||
|
||||
string finalMessage = string.Format(message, args);
|
||||
Message(finalMessage);
|
||||
}
|
||||
|
||||
/// <summary>
/// Writes <paramref name="data"/> through EventSource.Write under the supplied event name,
/// at Verbose level with the ResourceUsage keyword.
/// </summary>
/// <typeparam name="T">Type of the payload object.</typeparam>
/// <param name="eventName">Name to give the written event.</param>
/// <param name="data">Payload to write.</param>
[NonEvent]
public void DataTypeWriteInfo<T>(string eventName, T data)
{
    Write(
        eventName,
        new EventSourceOptions
        {
            Level = EventLevel.Verbose,
            Opcode = EventOpcode.Info,
            Keywords = Keywords.ResourceUsage,
            ActivityOptions = EventActivityOptions.None,
        },
        data);
}
|
||||
|
||||
/// <summary>
/// Writes <paramref name="data"/> through EventSource.Write under the supplied event name,
/// at Warning level with the ErrorOrWarning keyword.
/// </summary>
/// <typeparam name="T">Type of the payload object.</typeparam>
/// <param name="eventName">Name to give the written event.</param>
/// <param name="data">Payload to write.</param>
[NonEvent]
public void DataTypeWriteWarning<T>(string eventName, T data)
{
    Write(
        eventName,
        new EventSourceOptions
        {
            Level = EventLevel.Warning,
            Opcode = EventOpcode.Info,
            Keywords = Keywords.ErrorOrWarning,
            ActivityOptions = EventActivityOptions.None,
        },
        data);
}
|
||||
|
||||
/// <summary>
/// Writes <paramref name="data"/> through EventSource.Write under the supplied event name,
/// at Error level with the ErrorOrWarning keyword.
/// </summary>
/// <typeparam name="T">Type of the payload object.</typeparam>
/// <param name="eventName">Name to give the written event.</param>
/// <param name="data">Payload to write.</param>
[NonEvent]
public void DataTypeWriteError<T>(string eventName, T data)
{
    Write(
        eventName,
        new EventSourceOptions
        {
            Level = EventLevel.Error,
            Opcode = EventOpcode.Info,
            Keywords = Keywords.ErrorOrWarning,
            ActivityOptions = EventActivityOptions.None,
        },
        data);
}
|
||||
|
||||
private const int MessageEventId = 1;
|
||||
|
||||
[Event(MessageEventId, Level = EventLevel.Informational, Message = "{0}")]
|
||||
public void Message(string message)
|
||||
{
|
||||
|
@ -66,27 +103,37 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
[NonEvent]
|
||||
public void ServiceMessage(StatefulServiceContext serviceContext, string message, params object[] args)
|
||||
public void ServiceMessage(StatelessServiceContext serviceContext, string message, params object[] args)
|
||||
{
|
||||
if (IsEnabled())
|
||||
if (!IsEnabled())
|
||||
{
|
||||
string finalMessage = string.Format(message, args);
|
||||
ServiceMessage(
|
||||
serviceContext.ServiceName.ToString(),
|
||||
serviceContext.ServiceTypeName,
|
||||
serviceContext.ReplicaId,
|
||||
serviceContext.PartitionId,
|
||||
serviceContext.CodePackageActivationContext.ApplicationName,
|
||||
serviceContext.CodePackageActivationContext.ApplicationTypeName,
|
||||
serviceContext.NodeContext.NodeName,
|
||||
finalMessage);
|
||||
return;
|
||||
}
|
||||
|
||||
string finalMessage = string.Format(message, args);
|
||||
ServiceMessage(
|
||||
serviceContext.ServiceName.ToString(),
|
||||
serviceContext.ServiceTypeName,
|
||||
serviceContext.InstanceId,
|
||||
serviceContext.PartitionId,
|
||||
serviceContext.CodePackageActivationContext.ApplicationName,
|
||||
serviceContext.CodePackageActivationContext.ApplicationTypeName,
|
||||
serviceContext.NodeContext.NodeName,
|
||||
finalMessage);
|
||||
}
|
||||
|
||||
/// <summary>
/// Formats <paramref name="message"/> with <paramref name="args"/> (string.Format) and forwards the
/// resulting text to the generic VerboseMessage{T} event method.
/// </summary>
/// <param name="message">Composite format string.</param>
/// <param name="args">Values substituted into <paramref name="message"/>.</param>
[NonEvent]
public void VerboseMessage(string message, params object[] args)
{
    string finalMessage = string.Format(message, args);

    // The type argument is spelled out on purpose. An unqualified VerboseMessage(finalMessage) has two
    // candidates with the same effective parameter list (this params overload in its expanded form and
    // VerboseMessage<T> with T = string); C# overload resolution breaks that tie in favour of the
    // non-generic method, which would make this method call itself without end.
    VerboseMessage<string>(finalMessage);
}
|
||||
|
||||
// For very high-frequency events it might be advantageous to raise events using WriteEventCore API.
|
||||
// This results in more efficient parameter handling, but requires explicit allocation of EventData structure and unsafe code.
|
||||
// To enable this code path, define UNSAFE conditional compilation symbol and turn on unsafe code support in project properties.
|
||||
private const int ServiceMessageEventId = 2;
|
||||
|
||||
[Event(ServiceMessageEventId, Level = EventLevel.Informational, Message = "{7}")]
|
||||
private
|
||||
#if UNSAFE
|
||||
|
@ -124,6 +171,7 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
private const int ServiceTypeRegisteredEventId = 3;
|
||||
|
||||
[Event(ServiceTypeRegisteredEventId, Level = EventLevel.Informational, Message = "Service host process {0} registered service type {1}", Keywords = Keywords.ServiceInitialization)]
|
||||
public void ServiceTypeRegistered(int hostProcessId, string serviceType)
|
||||
{
|
||||
|
@ -131,6 +179,7 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
private const int ServiceHostInitializationFailedEventId = 4;
|
||||
|
||||
[Event(ServiceHostInitializationFailedEventId, Level = EventLevel.Error, Message = "Service host initialization failed", Keywords = Keywords.ServiceInitialization)]
|
||||
public void ServiceHostInitializationFailed(string exception)
|
||||
{
|
||||
|
@ -141,6 +190,7 @@ namespace FabricHealer
|
|||
// These activities can be automatically picked up by debugging and profiling tools, which can compute their execution time, child activities,
|
||||
// and other statistics.
|
||||
private const int ServiceRequestStartEventId = 5;
|
||||
|
||||
[Event(ServiceRequestStartEventId, Level = EventLevel.Informational, Message = "Service request '{0}' started", Keywords = Keywords.Requests)]
|
||||
public void ServiceRequestStart(string requestTypeName)
|
||||
{
|
||||
|
@ -148,78 +198,17 @@ namespace FabricHealer
|
|||
}
|
||||
|
||||
private const int ServiceRequestStopEventId = 6;
|
||||
|
||||
// Writes event ServiceRequestStopEventId carrying the request type name and the (optional) exception text.
[Event(ServiceRequestStopEventId, Level = EventLevel.Informational, Message = "Service request '{0}' finished", Keywords = Keywords.Requests)]
public void ServiceRequestStop(string requestTypeName, string exception = "") =>
    WriteEvent(ServiceRequestStopEventId, requestTypeName, exception);
|
||||
|
||||
[NonEvent]
|
||||
public void VerboseMessage(string message, params object[] args)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
string finalMessage = string.Format(message, args);
|
||||
VerboseMessage(finalMessage);
|
||||
}
|
||||
}
|
||||
|
||||
[NonEvent]
|
||||
public void InfoMessage(string message, params object[] args)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
string finalMessage = string.Format(message, args);
|
||||
InfoMessage(finalMessage);
|
||||
}
|
||||
}
|
||||
|
||||
[NonEvent]
|
||||
public void ErrorMessage(string message, params object[] args)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
string finalMessage = string.Format(message, args);
|
||||
ErrorMessage(finalMessage);
|
||||
}
|
||||
}
|
||||
|
||||
private const int ErrorMessageEventId = 7;
|
||||
[Event(ErrorMessageEventId, Level = EventLevel.Error, Message = "{0}")]
|
||||
public void ErrorMessage(string message)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
WriteEvent(ErrorMessageEventId, message);
|
||||
}
|
||||
}
|
||||
|
||||
private const int InfoMessageEventId = 8;
|
||||
[Event(InfoMessageEventId, Level = EventLevel.Informational, Message = "{0}")]
|
||||
public void InfoMessage(string message)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
WriteEvent(InfoMessageEventId, message);
|
||||
}
|
||||
}
|
||||
|
||||
private const int PrintRepairTaskEventId = 9;
|
||||
[Event(PrintRepairTaskEventId, Level = EventLevel.Verbose, Message = "TasksID = {0}, State = {1}, Action = {2}, Executor = {3}, Description = {4}, ExecutorData = {5}, Target = {6}")]
|
||||
public void PrintRepairTasks(string taskId, string state, string action, string executor, string description, string executordata, string target)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
WriteEvent(PrintRepairTaskEventId, taskId, state, action, executor, description, executordata, target);
|
||||
}
|
||||
}
|
||||
|
||||
// TelemetryLib impl \\
|
||||
|
||||
private const int VerboseMessageEventId = 10;
|
||||
private const int VerboseMessageEventId = 7;
|
||||
|
||||
[Event(VerboseMessageEventId, Level = EventLevel.Verbose, Message = "{0}")]
|
||||
public void VerboseMessage(string message)
|
||||
public void VerboseMessage<T>(T message)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
|
@ -227,37 +216,18 @@ namespace FabricHealer
|
|||
}
|
||||
}
|
||||
|
||||
private const int FabricHealerTelemetryEventId = 11;
|
||||
|
||||
[Event(FabricHealerTelemetryEventId, Level = EventLevel.Verbose,
|
||||
Message = "FabricHealer Internal Diagnostic Event, " +
|
||||
"eventSourceId = {0}, applicationVersion = {1}, " +
|
||||
"fabricHealerConfiguration = {2}, " +
|
||||
"fabricHealerHealthState = {3}")]
|
||||
public void FabricHealerRuntimeNodeEvent(
|
||||
string clusterId,
|
||||
string applicationVersion,
|
||||
string fhConfigInfo,
|
||||
string fhHealthInfo)
|
||||
[Event(42, Level = EventLevel.Verbose)]
|
||||
public void InternalFHDataEvent<T>(T data)
|
||||
{
|
||||
if (IsEnabled())
|
||||
{
|
||||
WriteEvent(
|
||||
FabricHealerTelemetryEventId,
|
||||
clusterId,
|
||||
applicationVersion,
|
||||
fhConfigInfo,
|
||||
fhHealthInfo);
|
||||
}
|
||||
Write("FabricHealerOperationalEvent", data);
|
||||
}
|
||||
|
||||
public void FabricObserverRuntimeNodeEvent(string clusterId, string applicationVersion, string foConfigInfo, string foHealthInfo)
|
||||
[Event(43, Level = EventLevel.Error)]
|
||||
public void InternalFHCriticalErrorDataEvent<T>(T data)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
Write("FabricHealerCriticalErrorEvent", data);
|
||||
}
|
||||
#endregion
|
||||
|
||||
#region Private methods
|
||||
#if UNSAFE
|
||||
private int SizeInBytes(string s)
|
||||
{
|
||||
|
@ -271,6 +241,17 @@ namespace FabricHealer
|
|||
}
|
||||
}
|
||||
#endif
|
||||
#endregion
|
||||
|
||||
// Keywords categorize events. Every keyword occupies its own bit, so a single event can be tagged
// with several of them at once (via the EventAttribute.Keywords property).
// Keywords must be defined as a public class named 'Keywords' inside the EventSource that uses them.
public static class Keywords
{
    public const EventKeywords Requests = (EventKeywords)(1L << 0);
    public const EventKeywords ServiceInitialization = (EventKeywords)(1L << 1);
    public const EventKeywords ResourceUsage = (EventKeywords)(1L << 2);
    public const EventKeywords ErrorOrWarning = (EventKeywords)(1L << 3);
    public const EventKeywords InternalData = (EventKeywords)(1L << 4);
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -136,10 +136,14 @@ namespace FabricHealer.Utilities
|
|||
private set;
|
||||
}
|
||||
|
||||
/// <summary>
/// Whether FabricHealer operational telemetry is enabled. The value can only be set from inside this class.
/// </summary>
public bool OperationalTelemetryEnabled { get; private set; }
|
||||
|
||||
/// <summary>
/// Stores the supplied service context and then calls UpdateConfigSettings().
/// </summary>
/// <param name="context">Service context for this instance; must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="context"/> is null.</exception>
public ConfigSettings(StatelessServiceContext context)
{
    // ArgumentNullException derives from ArgumentException, so callers that catch
    // ArgumentException keep working while the failure now names the offending parameter.
    this.context = context ?? throw new ArgumentNullException(nameof(context), "Context can't be null.");

    UpdateConfigSettings();
}
|
||||
|
||||
|
@ -171,7 +175,7 @@ namespace FabricHealer.Utilities
|
|||
ExecutionLoopSleepSeconds = execFrequency;
|
||||
}
|
||||
|
||||
// (Assuming Diagnostics/Analytics cloud service implemented) Telemetry.
|
||||
// Telemetry.
|
||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.AppInsightsTelemetryEnabled), out bool telemEnabled))
|
||||
{
|
||||
TelemetryEnabled = telemEnabled;
|
||||
|
@ -212,6 +216,12 @@ namespace FabricHealer.Utilities
|
|||
EtwProviderName = GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.EventSourceProviderName);
|
||||
}
|
||||
|
||||
// FabricHealer operational telemetry
|
||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.EnableFabricHealerOperationalTelemetry), out bool fhOpTelemEnabled))
|
||||
{
|
||||
OperationalTelemetryEnabled = fhOpTelemEnabled;
|
||||
}
|
||||
|
||||
// Repair Policies
|
||||
if (bool.TryParse(GetConfigSettingValue(RepairConstants.AppRepairPolicySectionName, RepairConstants.Enabled), out bool appRepairEnabled))
|
||||
{
|
||||
|
|
|
@ -82,42 +82,46 @@ namespace FabricHealer.Utilities
|
|||
|
||||
private void InitializeLoggers()
|
||||
{
|
||||
// default log directory.
|
||||
string logFolderBase;
|
||||
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
// Log directory supplied in Settings.xml.
|
||||
if (!string.IsNullOrEmpty(LogFolderBasePath))
|
||||
{
|
||||
string windrive = Environment.SystemDirectory[..3];
|
||||
logFolderBase = windrive + "fabrichealer_logs";
|
||||
}
|
||||
else
|
||||
{
|
||||
logFolderBase = "/tmp/fabrichealer_logs";
|
||||
}
|
||||
logFolderBase = LogFolderBasePath;
|
||||
|
||||
// log directory supplied in config. Set in ObserverManager.
|
||||
if (!string.IsNullOrWhiteSpace(LogFolderBasePath))
|
||||
{
|
||||
// Add current drive letter if not supplied for Windows path target.
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
if (!LogFolderBasePath[..3].Contains(":\\"))
|
||||
// Add current drive letter if not supplied for Windows path target.
|
||||
if (!LogFolderBasePath.Substring(0, 3).Contains(":\\"))
|
||||
{
|
||||
string windrive = Environment.SystemDirectory[..3];
|
||||
string windrive = Environment.SystemDirectory.Substring(0, 3);
|
||||
logFolderBase = windrive + LogFolderBasePath;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Remove supplied drive letter if Linux is the runtime target.
|
||||
if (LogFolderBasePath[..3].Contains(":\\"))
|
||||
if (LogFolderBasePath.Substring(0, 3).Contains(":\\"))
|
||||
{
|
||||
LogFolderBasePath = LogFolderBasePath.Remove(0, 3);
|
||||
logFolderBase = LogFolderBasePath.Remove(0, 3).Replace("\\", "/");
|
||||
}
|
||||
|
||||
logFolderBase = LogFolderBasePath;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
string windrive = Environment.SystemDirectory.Substring(0, 3);
|
||||
logFolderBase = windrive + "fabrichealer_logs";
|
||||
}
|
||||
else
|
||||
{
|
||||
logFolderBase = "/tmp/fabrichealer_logs";
|
||||
}
|
||||
}
|
||||
|
||||
LogFolderBasePath = logFolderBase;
|
||||
string file = Path.Combine(logFolderBase, "fabrichealer.log");
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(FolderName) && !string.IsNullOrWhiteSpace(Filename))
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
<Parameter Name="MonitorLoopSleepSeconds" DefaultValue="10" />
|
||||
<Parameter Name="TelemetryProviderEnabled" DefaultValue="false" />
|
||||
<Parameter Name="VerboseLoggingEnabled" DefaultValue="true" />
|
||||
<Parameter Name="OperationalTelemetryEnabled" DefaultValue="true" />
|
||||
<!-- Repair Policy Enablement -->
|
||||
<Parameter Name="AppRepairEnabled" DefaultValue="true" />
|
||||
<Parameter Name="DiskRepairEnabled" DefaultValue="true" />
|
||||
|
@ -31,6 +32,7 @@
|
|||
<Parameter Name="EnableEventSourceProvider" Value="[EventSourceProviderEnabled]" />
|
||||
<Parameter Name="EnableTelemetryProvider" Value="[TelemetryProviderEnabled]" />
|
||||
<Parameter Name="EnableVerboseLogging" Value="[VerboseLoggingEnabled]" />
|
||||
<Parameter Name="EnableOperationalTelemetry" Value="[OperationalTelemetryEnabled]" />
|
||||
</Section>
|
||||
<!-- Repair policies -->
|
||||
<Section Name="AppRepairPolicy">
|
||||
|
|
|
@ -16,7 +16,7 @@ FabricHealer requires that FabricObserver (v 3.1.8+) and RepairManager (RM) serv
|
|||
For VM level repair, InfrastructureService (IS) service must be deployed.
|
||||
```
|
||||
|
||||
## For Early Adopters while in Private Preview
|
||||
## For Early Adopters while in Preview
|
||||
|
||||
Please [download the Guan nupkg](https://github.com/microsoft/Guan/releases/download/1.0.0-Preview/Microsoft.Logic.Guan.1.0.0-Preview.nupkg) to your local dev machine and install it into your local FH project in order to build FH successfully. This will be unnecessary when FH ships in Public Preview as Guan will be shipping concurrently and the Guan nupkg will be available in the nuget.org package gallery, as will FH.
|
||||
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using Microsoft.Win32;
|
||||
using System;
|
||||
using System.Fabric;
|
||||
using System.Globalization;
|
||||
using System.IO;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using System.Xml;
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
|
||||
/// Helper class to facilitate non-PII identification of cluster.
|
||||
/// </summary>
|
||||
public sealed class ClusterIdentificationUtility
|
||||
{
|
||||
private const string FabricRegistryKeyPath = "Software\\Microsoft\\Service Fabric";
|
||||
private static string paasClusterId;
|
||||
private static string diagnosticsClusterId;
|
||||
private static XmlDocument clusterManifestXdoc;
|
||||
private static readonly object lockObj = new object();
|
||||
|
||||
/// <summary>
/// Gets ClusterID, tenantID and ClusterType for current ServiceFabric cluster.
/// The logic to compute these values closely resembles the logic used in SF runtime's telemetry client.
/// </summary>
/// <param name="fabricClient">Client used to fetch the cluster manifest.</param>
/// <param name="token">Cancellation token passed to the cluster manifest query.</param>
/// <returns>
/// The tuple (ClusterId, TenantId, ClusterType). An element that could not be determined
/// is TelemetryConstants.Undefined.
/// </returns>
public static async Task<(string ClusterId, string TenantId, string ClusterType)> TupleGetClusterIdAndTypeAsync(FabricClient fabricClient, CancellationToken token)
{
    string clusterManifest = await fabricClient.ClusterManager.GetClusterManifestAsync(
                                        TimeSpan.FromSeconds(TelemetryConstants.AsyncOperationTimeoutSeconds),
                                        token);

    // Get tenantId for PaasV1 clusters or SFRP.
    string tenantId = GetTenantId() ?? TelemetryConstants.Undefined;
    string clusterId = TelemetryConstants.Undefined;
    string clusterType = TelemetryConstants.Undefined;

    // No manifest text: nothing to parse, report what is known so far.
    if (string.IsNullOrEmpty(clusterManifest))
    {
        return (clusterId, tenantId, clusterType);
    }

    string paasId;
    string diagnosticsId;

    using (var sreader = new StringReader(clusterManifest))
    {
        using (var xreader = XmlReader.Create(sreader, new XmlReaderSettings { XmlResolver = null }))
        {
            // The document and the id fields are static, i.e. shared by all callers. Replace, load and read them
            // under one lock so a concurrent call cannot swap the document while it is being loaded or parsed.
            lock (lockObj)
            {
                // Safe XML pattern - *Do not use LoadXml*.
                clusterManifestXdoc = new XmlDocument { XmlResolver = null };
                clusterManifestXdoc.Load(xreader);

                // Get values from cluster manifest, clusterId if it exists in either Paas or Diagnostics section.
                // NOTE(review): GetValuesFromClusterManifest is defined outside this view; presumably it fills
                // paasClusterId and diagnosticsClusterId from clusterManifestXdoc - confirm.
                GetValuesFromClusterManifest();

                // Snapshot while still holding the lock; the decisions below use only these locals.
                paasId = paasClusterId;
                diagnosticsId = diagnosticsClusterId;
            }
        }
    }

    // Precedence: Paas section id, then tenant id, then Diagnostics section id.
    if (paasId != null)
    {
        clusterId = paasId;
        clusterType = TelemetryConstants.ClusterTypeSfrp;
    }
    else if (tenantId != TelemetryConstants.Undefined)
    {
        clusterId = tenantId;
        clusterType = TelemetryConstants.ClusterTypePaasV1;
    }
    else if (diagnosticsId != null)
    {
        clusterId = diagnosticsId;
        clusterType = TelemetryConstants.ClusterTypeStandalone;
    }

    return (clusterId, tenantId, clusterType);
}
|
||||
|
||||
/// <summary>
/// Gets the value of a parameter inside a section from the cluster manifest XmlDocument instance (clusterManifestXdoc).
/// </summary>
/// <param name="sectionName">Value of the Name attribute of the Section element to search in.</param>
/// <param name="parameterName">Value of the Name attribute of the Parameter element to read.</param>
/// <returns>
/// The Value attribute of the matching Parameter element, or null when the manifest document has
/// not been created or the section, parameter or Value attribute does not exist.
/// </returns>
private static string GetParamValueFromSection(string sectionName, string parameterName)
{
    if (clusterManifestXdoc == null)
    {
        return null;
    }

    XmlNode sectionNode = clusterManifestXdoc.DocumentElement?.SelectSingleNode("//*[local-name()='Section' and @Name='" + sectionName + "']");

    // The leading "." keeps the search relative to sectionNode. A path that starts with "//" is
    // evaluated from the document root even when called on a child node, which would return the
    // first Parameter with this name from ANY section of the manifest.
    XmlNode parameterNode = sectionNode?.SelectSingleNode(".//*[local-name()='Parameter' and @Name='" + parameterName + "']");
    XmlAttribute attr = parameterNode?.Attributes?["Value"];

    return attr?.Value;
}
|
||||
|
||||
// Reads the ClusterId parameter of the manifest's Paas section; null when it is not present.
private static string GetClusterIdFromPaasSection() => GetParamValueFromSection("Paas", "ClusterId");
|
||||
|
||||
// Reads the ClusterId parameter of the manifest's Diagnostics section; null when it is not present.
private static string GetClusterIdFromDiagnosticsSection() => GetParamValueFromSection("Diagnostics", "ClusterId");
|
||||
|
||||
// Resolves the tenant id with the OS-specific lookup: the Windows implementation when running on
// Windows, the Linux implementation on every other platform.
private static string GetTenantId()
{
    bool runningOnWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);

    return runningOnWindows ? GetTenantIdWindows() : GetTenantIdLinux();
}
|
||||
|
||||
// Reads the tenant id from the "name" attribute of the first Deployment element in the waagent
// hosting environment config file. Returns null when the file, the element or the attribute is
// missing, so the caller can fall back to its default value.
private static string GetTenantIdLinux()
{
    // Implementation copied from https://github.com/microsoft/service-fabric/blob/master/src/prod/src/managed/DCA/product/host/TelemetryConsumerLinux.cs
    const string TenantIdFile = "/var/lib/waagent/HostingEnvironmentConfig.xml";

    if (!File.Exists(TenantIdFile))
    {
        return null;
    }

    // Safe XML pattern: no external entity resolution on either the document or the reader.
    var xmlDoc = new XmlDocument { XmlResolver = null };

    using (var xmlReader = XmlReader.Create(TenantIdFile, new XmlReaderSettings { XmlResolver = null }))
    {
        xmlDoc.Load(xmlReader);
    }

    // Item(0) returns null for an empty node list and GetNamedItem returns null for a missing
    // attribute; navigate null-safely instead of throwing NullReferenceException.
    XmlNode deploymentNode = xmlDoc.GetElementsByTagName("Deployment").Item(0);

    return deploymentNode?.Attributes?.GetNamedItem("name")?.Value;
}
|
||||
|
||||
// Reads the WATenantID value from the Service Fabric key under HKEY_LOCAL_MACHINE.
// Registry.GetValue yields the supplied default (null) when the value name does not exist.
// NOTE(review): NoInlining presumably keeps the Registry type out of callers on non-Windows
// platforms - confirm before removing the attribute.
[MethodImpl(MethodImplOptions.NoInlining)]
private static string GetTenantIdWindows()
{
    string registryKeyPath = Registry.LocalMachine.Name + "\\" + FabricRegistryKeyPath;
    object tenantIdValue = Registry.GetValue(registryKeyPath, "WATenantID", null);

    return (string)tenantIdValue;
}
|
||||
|
||||
// Refreshes the cached cluster ids from the currently loaded cluster manifest document.
// Either field ends up null when its section or ClusterId parameter is absent.
private static void GetValuesFromClusterManifest()
{
    diagnosticsClusterId = GetClusterIdFromDiagnosticsSection();
    paasClusterId = GetClusterIdFromPaasSection();
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
<!-- Application Insights configuration used for FabricHealer operational telemetry. -->
<ApplicationInsights xmlns="http://schemas.microsoft.com/ApplicationInsights/2013/Settings">
  <!-- Intentionally empty: TelemetryEvents assigns the key from TelemetryConstants.AIKey after loading this file. -->
  <InstrumentationKey></InstrumentationKey>
  <!-- NOTE(review): the types below live in Microsoft.AI.* assemblies (Web, WindowsServer, DependencyCollector,
       PerfCounterCollector, ServerTelemetryChannel); confirm those assemblies are deployed next to TelemetryLib. -->
  <TelemetryInitializers>
    <Add Type="Microsoft.ApplicationInsights.DependencyCollector.HttpDependenciesParsingTelemetryInitializer, Microsoft.AI.DependencyCollector"/>
    <Add Type="Microsoft.ApplicationInsights.WindowsServer.AzureRoleEnvironmentTelemetryInitializer, Microsoft.AI.WindowsServer"/>
    <Add Type="Microsoft.ApplicationInsights.WindowsServer.BuildInfoConfigComponentVersionTelemetryInitializer, Microsoft.AI.WindowsServer"/>
    <Add Type="Microsoft.ApplicationInsights.Web.ClientIpHeaderTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.AzureAppServiceRoleNameFromHostNameHeaderInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.OperationNameTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.OperationCorrelationTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.UserTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.AuthenticatedUserIdTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.AccountIdTelemetryInitializer, Microsoft.AI.Web"/>
    <Add Type="Microsoft.ApplicationInsights.Web.SessionTelemetryInitializer, Microsoft.AI.Web"/>
  </TelemetryInitializers>
  <TelemetryModules>
    <Add Type="Microsoft.ApplicationInsights.DependencyCollector.DependencyTrackingTelemetryModule, Microsoft.AI.DependencyCollector">
      <!-- Storage endpoints for which correlation headers are not added to outgoing requests. -->
      <ExcludeComponentCorrelationHttpHeadersOnDomains>
        <Add>core.windows.net</Add>
        <Add>core.chinacloudapi.cn</Add>
        <Add>core.cloudapi.de</Add>
        <Add>core.usgovcloudapi.net</Add>
      </ExcludeComponentCorrelationHttpHeadersOnDomains>
      <IncludeDiagnosticSourceActivities>
        <Add>Microsoft.Azure.EventHubs</Add>
        <Add>Microsoft.Azure.ServiceBus</Add>
      </IncludeDiagnosticSourceActivities>
    </Add>
    <Add Type="Microsoft.ApplicationInsights.WindowsServer.DeveloperModeWithDebuggerAttachedTelemetryModule, Microsoft.AI.WindowsServer"/>
    <Add Type="Microsoft.ApplicationInsights.WindowsServer.UnhandledExceptionTelemetryModule, Microsoft.AI.WindowsServer"/>
  </TelemetryModules>
  <ApplicationIdProvider Type="Microsoft.ApplicationInsights.Extensibility.Implementation.ApplicationId.ApplicationInsightsApplicationIdProvider, Microsoft.ApplicationInsights"/>
  <TelemetrySinks>
    <Add Name="default">
      <TelemetryProcessors>
        <Add Type="Microsoft.ApplicationInsights.Extensibility.PerfCounterCollector.QuickPulse.QuickPulseTelemetryProcessor, Microsoft.AI.PerfCounterCollector"/>
        <Add Type="Microsoft.ApplicationInsights.Extensibility.AutocollectedMetricsExtractor, Microsoft.ApplicationInsights"/>
        <!-- Two adaptive sampling processors, each limited to 5 items per second:
             the first handles every telemetry type except Event, the second handles Event only. -->
        <Add Type="Microsoft.ApplicationInsights.WindowsServer.TelemetryChannel.AdaptiveSamplingTelemetryProcessor, Microsoft.AI.ServerTelemetryChannel">
          <MaxTelemetryItemsPerSecond>5</MaxTelemetryItemsPerSecond>
          <ExcludedTypes>Event</ExcludedTypes>
        </Add>
        <Add Type="Microsoft.ApplicationInsights.WindowsServer.TelemetryChannel.AdaptiveSamplingTelemetryProcessor, Microsoft.AI.ServerTelemetryChannel">
          <MaxTelemetryItemsPerSecond>5</MaxTelemetryItemsPerSecond>
          <IncludedTypes>Event</IncludedTypes>
        </Add>
      </TelemetryProcessors>
      <TelemetryChannel Type="Microsoft.ApplicationInsights.WindowsServer.TelemetryChannel.ServerTelemetryChannel, Microsoft.AI.ServerTelemetryChannel"/>
    </Add>
  </TelemetrySinks>
</ApplicationInsights>
|
|
@ -0,0 +1,37 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Data describing a critical FabricHealer error, sent by TelemetryEvents as a critical error event.
/// </summary>
public class FabricHealerCriticalErrorEventData
{
    /// <summary>Version string, reported as the "FHVersion" event property.</summary>
    public string Version { get; set; }

    /// <summary>Source of the error.</summary>
    public string Source { get; set; }

    /// <summary>Error message, reported as the "ErrorMessage" event property.</summary>
    public string ErrorMessage { get; set; }

    /// <summary>Error stack, reported as the "CrashData" event property.</summary>
    public string ErrorStack { get; set; }

    /// <summary>Crash time, reported as the "CrashTime" event property.</summary>
    public string CrashTime { get; set; }

    /// <summary>
    /// "Windows" when the process runs on Windows; "Linux" on any other platform.
    /// </summary>
    public string OS
    {
        get
        {
            return RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux";
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using System.Runtime.InteropServices;
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Data for the periodic FabricHealer operational event sent by TelemetryEvents.
/// </summary>
public class FabricHealerOperationalEventData
{
    /// <summary>Up time, reported as the "UpTime" event property.</summary>
    public string UpTime { get; set; }

    /// <summary>Version string, reported as the "FHVersion" event property.</summary>
    public string Version { get; set; }

    /// <summary>Repair counters that are reported as event metrics.</summary>
    public RepairData RepairData { get; set; }

    /// <summary>
    /// "Windows" when the process runs on Windows; "Linux" on any other platform.
    /// </summary>
    public string OS
    {
        get
        {
            return RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux";
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Event source used by TelemetryEvents to mirror telemetry payloads when ETW is enabled.
/// </summary>
public interface ITelemetryEventSource
{
    /// <summary>Writes an operational data event.</summary>
    /// <typeparam name="TData">Type of the payload object.</typeparam>
    /// <param name="data">Payload to write.</param>
    void InternalFHDataEvent<TData>(TData data);

    /// <summary>Writes a critical error data event.</summary>
    /// <typeparam name="TData">Type of the payload object.</typeparam>
    /// <param name="data">Payload to write.</param>
    void InternalFHCriticalErrorDataEvent<TData>(TData data);
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using System.Collections.Generic;
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Repair counters carried by the operational event; TelemetryEvents reports them as event metrics.
/// </summary>
public class RepairData
{
    /// <summary>Named repair metrics merged into the event metrics. Starts as an empty dictionary.</summary>
    public Dictionary<string, double> Repairs { get; set; } = new Dictionary<string, double>();

    /// <summary>Reported as the "TotalRepairAttempts" metric.</summary>
    public double RepairCount { get; set; }

    /// <summary>Reported as the "FailedRepairs" metric.</summary>
    public double FailedRepairs { get; set; }

    /// <summary>Reported as the "SuccessfulRepairs" metric.</summary>
    public double SuccessfulRepairs { get; set; }

    /// <summary>Reported as the "EnabledRepairCount" metric.</summary>
    public double EnabledRepairCount { get; set; }
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Constant values shared by the telemetry library.
/// </summary>
public static class TelemetryConstants
{
    /// <summary>Application Insights instrumentation key assigned to the telemetry configuration.</summary>
    public const string AIKey = "c065641b-ec84-43fe-a8e7-c2bcbb697995";

    /// <summary>Timeout, in seconds, used when requesting the cluster manifest.</summary>
    internal const int AsyncOperationTimeoutSeconds = 120;

    /// <summary>Placeholder used when a cluster id, tenant id or cluster type cannot be determined.</summary>
    internal const string Undefined = "undefined";

    // Cluster type names reported with telemetry events.
    internal const string ClusterTypePaasV1 = "PaasV1";
    internal const string ClusterTypeSfrp = "SFRP";
    internal const string ClusterTypeStandalone = "standalone";
}
|
||||
}
|
|
@ -0,0 +1,249 @@
|
|||
// ------------------------------------------------------------
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
|
||||
// ------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Fabric;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using System.Threading;
|
||||
using Microsoft.ApplicationInsights;
|
||||
using Microsoft.ApplicationInsights.Extensibility;
|
||||
using Newtonsoft.Json;
|
||||
|
||||
namespace FabricHealer.TelemetryLib
|
||||
{
|
||||
/// <summary>
/// Contains common FabricHealer telemetry events.
/// Each event is sent to Application Insights, optionally mirrored to ETW, and the data that was
/// sent is also written to a local log file.
/// </summary>
public class TelemetryEvents : IDisposable
{
    private const string OperationalEventName = "OperationalEvent";
    private const string CriticalErrorEventName = "CriticalErrorEvent";
    private const string TaskName = "FabricHealer";

    // Maximum number of attempts TryWriteLogFile makes before giving up.
    private const int Retries = 4;

    private readonly TelemetryClient telemetryClient;
    private readonly ServiceContext serviceContext;
    private readonly ITelemetryEventSource serviceEventSource;
    private readonly string clusterId, tenantId, clusterType;
    private readonly TelemetryConfiguration appInsightsTelemetryConf;
    private readonly bool isEtwEnabled;

    /// <summary>
    /// Loads FHAppInsightsOperational.config from the directory of the executing assembly, creates
    /// the telemetry client and resolves the cluster id, tenant id and cluster type.
    /// Blocks the calling thread until the cluster identification call completes.
    /// </summary>
    /// <param name="fabricClient">Client used to read the cluster manifest.</param>
    /// <param name="context">Service context; its node name is reported as a hash.</param>
    /// <param name="eventSource">Event source that receives the payload when ETW is enabled.</param>
    /// <param name="token">Cancellation token passed to the cluster identification call.</param>
    /// <param name="etwEnabled">True to also write each payload to <paramref name="eventSource"/>.</param>
    public TelemetryEvents(
                FabricClient fabricClient,
                ServiceContext context,
                ITelemetryEventSource eventSource,
                CancellationToken token,
                bool etwEnabled)
    {
        serviceEventSource = eventSource;
        serviceContext = context;
        string config = File.ReadAllText(Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "FHAppInsightsOperational.config"));
        appInsightsTelemetryConf = TelemetryConfiguration.CreateFromConfiguration(config);
        appInsightsTelemetryConf.InstrumentationKey = TelemetryConstants.AIKey;
        telemetryClient = new TelemetryClient(appInsightsTelemetryConf);
        var (ClusterId, TenantId, ClusterType) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).GetAwaiter().GetResult();
        clusterId = ClusterId;
        tenantId = TenantId;
        clusterType = ClusterType;
        isEtwEnabled = etwEnabled;
    }

    /// <summary>
    /// Sends the FabricHealer operational event and logs the sent data to <paramref name="logFilePath"/>.
    /// </summary>
    /// <param name="repairData">Operational data to report.</param>
    /// <param name="runInterval">Reported as the "EventRunInterval" property.</param>
    /// <param name="logFilePath">Local file that receives the sent data, or the exception text on failure.</param>
    /// <returns>
    /// True when the event was tracked; false when telemetry is disabled or an exception occurred.
    /// </returns>
    public bool EmitFabricObserverOperationalEvent(FabricHealerOperationalEventData repairData, TimeSpan runInterval, string logFilePath)
    {
        if (!telemetryClient.IsEnabled())
        {
            return false;
        }

        try
        {
            // ETW
            if (isEtwEnabled)
            {
                serviceEventSource.InternalFHDataEvent(new { FOInternalTelemtryData = JsonConvert.SerializeObject(repairData) });
            }

            IDictionary<string, string> eventProperties = new Dictionary<string, string>
            {
                { "EventName", OperationalEventName},
                { "TaskName", TaskName},
                { "EventRunInterval", runInterval.ToString() },
                { "ClusterId", clusterId },
                { "ClusterType", clusterType },
                { "NodeNameHash", GetNodeNameHashString() },
                { "FHVersion", repairData.Version },
                { "UpTime", repairData.UpTime },
                { "Timestamp", DateTime.UtcNow.ToString("o") },
                { "OS", repairData.OS }
            };

            // The tenant id is only reported for cluster types other than SFRP.
            if (clusterType != TelemetryConstants.ClusterTypeSfrp)
            {
                eventProperties.Add("TenantId", tenantId);
            }

            Dictionary<string, double> eventMetrics = new Dictionary<string, double>
            {
                { "EnabledRepairCount", repairData.RepairData.EnabledRepairCount },
                { "TotalRepairAttempts", repairData.RepairData.RepairCount },
                { "SuccessfulRepairs", repairData.RepairData.SuccessfulRepairs },
                { "FailedRepairs", repairData.RepairData.FailedRepairs },
            };

            Dictionary<string, double> repairs = repairData.RepairData.Repairs;
            eventMetrics.Append(repairs);

            telemetryClient.TrackEvent($"{TaskName}.{OperationalEventName}", eventProperties, eventMetrics);
            telemetryClient.Flush();

            // allow time for flushing
            Thread.Sleep(1000);

            // write a local log file containing the exact information sent to MS \\
            // NOTE(review): values are concatenated without JSON escaping, so the log line is only
            // valid JSON when no value contains quotes, backslashes or control characters.
            string telemetryData = "{" + string.Join(",", eventProperties.Select(kv => $"\"{kv.Key}\":" + $"\"{kv.Value}\"").ToArray());
            telemetryData += "," + string.Join(",", eventMetrics.Select(kv => $"\"{kv.Key}\":" + kv.Value).ToArray()) + "}";
            _ = TryWriteLogFile(logFilePath, telemetryData);

            return true;
        }
        catch (Exception e)
        {
            // Telemetry is non-critical and should not take down FH.
            _ = TryWriteLogFile(logFilePath, $"{e}");
        }

        return false;
    }

    /// <summary>
    /// Sends the FabricHealer critical error event and logs the sent data to <paramref name="logFilePath"/>.
    /// </summary>
    /// <param name="fhErrorData">Error data to report.</param>
    /// <param name="logFilePath">Local file that receives the sent data, or the exception text on failure.</param>
    /// <returns>
    /// True when the event was tracked; false when telemetry is disabled or an exception occurred.
    /// </returns>
    public bool EmitFabricObserverCriticalErrorEvent(FabricHealerCriticalErrorEventData fhErrorData, string logFilePath)
    {
        if (!telemetryClient.IsEnabled())
        {
            return false;
        }

        try
        {
            // ETW
            if (isEtwEnabled)
            {
                serviceEventSource.InternalFHCriticalErrorDataEvent(new { FOCriticalErrorData = JsonConvert.SerializeObject(fhErrorData) });
            }

            IDictionary<string, string> eventProperties = new Dictionary<string, string>
            {
                { "EventName", CriticalErrorEventName},
                { "TaskName", TaskName},
                { "ClusterId", clusterId },
                { "ClusterType", clusterType },
                { "TenantId", tenantId },
                { "NodeNameHash", GetNodeNameHashString() },
                { "FHVersion", fhErrorData.Version },
                { "CrashTime", fhErrorData.CrashTime },
                { "ErrorMessage", fhErrorData.ErrorMessage },
                { "CrashData", fhErrorData.ErrorStack },
                { "Timestamp", DateTime.UtcNow.ToString("o") },
                { "OS", fhErrorData.OS }
            };

            // Track under the critical error event name so the event name agrees with the
            // "EventName" property and is distinguishable from operational events.
            telemetryClient.TrackEvent($"{TaskName}.{CriticalErrorEventName}", eventProperties);
            telemetryClient.Flush();

            // allow time for flushing
            Thread.Sleep(1000);

            // write a local log file containing the exact information sent to MS \\
            // NOTE(review): values are concatenated without JSON escaping (see the error stack in
            // particular), so the log line is not guaranteed to be valid JSON.
            string telemetryData = "{" + string.Join(",", eventProperties.Select(kv => $"\"{kv.Key}\":" + $"\"{kv.Value}\"").ToArray()) + "}";
            _ = TryWriteLogFile(logFilePath, telemetryData);

            return true;
        }
        catch (Exception e)
        {
            // Telemetry is non-critical and should not take down FH.
            _ = TryWriteLogFile(logFilePath, $"{e}");
        }

        return false;
    }

    /// <summary>
    /// Flushes pending telemetry, waits briefly for the flush, then disposes the telemetry configuration.
    /// </summary>
    public void Dispose()
    {
        telemetryClient?.Flush();

        // allow time for flushing.
        Thread.Sleep(1000);
        appInsightsTelemetryConf?.Dispose();
    }

    // Returns the node name's hash code as an unsigned decimal string, or an empty string when
    // there is no service context (or the hash happens to be -1).
    // NOTE(review): string.GetHashCode is not guaranteed to be stable across processes on .NET
    // Core, so this value may differ between restarts - confirm whether that is acceptable.
    private string GetNodeNameHashString()
    {
        int nodeNameHash = serviceContext?.NodeContext.NodeName.GetHashCode() ?? -1;

        return nodeNameHash != -1 ? ((uint)nodeNameHash).ToString() : string.Empty;
    }

    // Best-effort write of content to path (overwriting any existing file), creating the target
    // directory when needed. Makes up to Retries attempts, one second apart.
    // Returns true on success; false when path or content is empty or every attempt failed.
    private bool TryWriteLogFile(string path, string content)
    {
        // An empty path can never succeed; return immediately instead of sleeping through all retries.
        if (string.IsNullOrEmpty(path) || string.IsNullOrEmpty(content))
        {
            return false;
        }

        for (var i = 0; i < Retries; i++)
        {
            try
            {
                string directory = Path.GetDirectoryName(path);

                if (!string.IsNullOrEmpty(directory) && !Directory.Exists(directory))
                {
                    _ = Directory.CreateDirectory(directory);
                }

                File.WriteAllText(path, content);
                return true;
            }
            catch
            {
                // Deliberately swallowed: logging is best-effort and the write is retried below.
            }

            Thread.Sleep(1000);
        }

        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Dictionary helpers used by the telemetry library.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Copies every entry of <paramref name="second"/> into <paramref name="first"/>, overwriting
    /// the value of any key that already exists in <paramref name="first"/>.
    /// </summary>
    /// <param name="first">Dictionary that receives the entries.</param>
    /// <param name="second">Dictionary to copy from; null is treated as empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="first"/> is null.</exception>
    public static void Append<K, V>(this Dictionary<K, V> first, Dictionary<K, V> second)
    {
        if (first == null)
        {
            throw new ArgumentNullException(nameof(first));
        }

        // Nothing to merge. This keeps callers from failing when an optional dictionary was never set.
        if (second == null)
        {
            return;
        }

        // Enumerate directly; no intermediate list copy is needed because only 'first' is modified.
        foreach (KeyValuePair<K, V> pair in second)
        {
            first[pair.Key] = pair.Value;
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <ProjectGuid>{7BC6991F-C840-413E-B1CD-4025947CF5FA}</ProjectGuid>
    <OutputType>Library</OutputType>
    <AppDesignerFolder>Properties</AppDesignerFolder>
    <!-- Matches the namespace declared by every source file in this project (FabricHealer.TelemetryLib). -->
    <RootNamespace>FabricHealer.TelemetryLib</RootNamespace>
    <AssemblyName>TelemetryLib</AssemblyName>
    <TargetFramework>netstandard2.0</TargetFramework>
    <PlatformTarget>x64</PlatformTarget>
    <AssemblyVersion>2.0.0.0</AssemblyVersion>
    <FileVersion>2.0.0.0</FileVersion>
    <Copyright>Copyright © 2020</Copyright>
    <Product>TelemetryLib</Product>
    <Platforms>AnyCPU;x64</Platforms>
  </PropertyGroup>
  <ItemGroup>
    <PackageReference Include="Microsoft.ApplicationInsights" Version="2.17.0" />
    <PackageReference Include="Microsoft.ServiceFabric" Version="7.2.452" />
    <PackageReference Include="Microsoft.Win32.Registry" Version="5.0.0" />
    <PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
  </ItemGroup>
  <ItemGroup>
    <!-- TelemetryEvents reads this file from the directory of the executing assembly at runtime. -->
    <None Update="FHAppInsightsOperational.config">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </None>
  </ItemGroup>
</Project>
|
Загрузка…
Ссылка в новой задаче