New impl for evthealthduration, updated predicates.
This commit is contained in:
Родитель
1eff56ffe5
Коммит
7b892cba3f
|
@ -1024,7 +1024,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
this.repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1346,7 +1346,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1524,7 +1524,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1574,7 +1574,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add(evt);
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
@ -1633,7 +1633,7 @@ namespace FabricHealer
|
|||
ConfigSettings.EnableVerboseLogging);
|
||||
|
||||
// Update the in-memory HealthEvent List.
|
||||
repairTaskManager.detectedHealthEvents.Add(healthEvent);
|
||||
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent));
|
||||
|
||||
// Start the repair workflow.
|
||||
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
|
||||
|
|
|
@ -61,6 +61,9 @@ Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
|
|||
## Don't proceed if there are 2 machine repairs currently active in the cluster.
|
||||
Mitigate() :- CheckOutstandingRepairs(2), !.
|
||||
|
||||
## Don't proceed if the target machine hasn't been in Error (including cyclic Up/Down) state for at least two hours.
|
||||
Mitigate(HealthState=Error) :- GetEntityHealthStateDuration(?duration), ?duration <= 02:00:00, !.
|
||||
|
||||
## Don't schedule a repair if one was scheduled less than 10 minutes ago. Do we want this to account for all repairs (not just FH-scheduled)?
|
||||
Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
|
||||
|
||||
|
|
|
@ -470,229 +470,6 @@ namespace FabricHealer.Repair
|
|||
return count;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the amount of time the target entity (application, node, etc) has been in the specified health state.
|
||||
/// </summary>
|
||||
/// <param name="entityType">EntityType</param>
|
||||
/// <param name="nameOrIdFilter">String representation of the target entity's name or ID (e.g., application name or node name or partition id)</param>
|
||||
/// <param name="healthState">Target HealthState to match.</param>
|
||||
/// <param name="token">CancellationToken</param>
|
||||
/// <returns></returns>
|
||||
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(
|
||||
EntityType entityType,
|
||||
string nameOrIdFilter,
|
||||
HealthState healthState,
|
||||
CancellationToken token)
|
||||
{
|
||||
HealthEventsFilter healthEventsFilter = new HealthEventsFilter();
|
||||
|
||||
if (healthState == HealthState.Warning)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Warning;
|
||||
}
|
||||
else if (healthState == HealthState.Error)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Error;
|
||||
}
|
||||
else if (healthState == HealthState.Ok)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Ok;
|
||||
}
|
||||
else
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.None;
|
||||
}
|
||||
|
||||
switch (entityType)
|
||||
{
|
||||
case EntityType.Application:
|
||||
|
||||
var appqueryDesc = new ApplicationHealthQueryDescription(new Uri(nameOrIdFilter))
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var appHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
|
||||
appqueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (appHealth == null || appHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity transitioned to Error health state in the last hour?
|
||||
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
var appHealthErrorEvents =
|
||||
appHealth.HealthEvents.Where(
|
||||
evt => DateTime.UtcNow.Subtract(
|
||||
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
|
||||
o => o.LastErrorTransitionAt);
|
||||
|
||||
int errorCount = appHealthErrorEvents.Count();
|
||||
|
||||
if (errorCount > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(appHealthErrorEvents.First().LastErrorTransitionAt);
|
||||
}
|
||||
}
|
||||
|
||||
var appHealthEvents = appHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
|
||||
// return the time since the last health event was issued, as a TimeSpan.
|
||||
return DateTime.UtcNow.Subtract(appHealthEvents.First().SourceUtcTimestamp);
|
||||
|
||||
}
|
||||
catch (FabricException)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
case EntityType.Partition:
|
||||
|
||||
var partitionqueryDesc = new PartitionHealthQueryDescription(Guid.Parse(nameOrIdFilter))
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var partitionHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetPartitionHealthAsync(
|
||||
partitionqueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (partitionHealth == null || partitionHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity transitioned to Error health state in the last hour?
|
||||
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
var partitionHealthErrorEvents =
|
||||
partitionHealth.HealthEvents.Where(
|
||||
evt => DateTime.UtcNow.Subtract(
|
||||
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
|
||||
o => o.LastErrorTransitionAt);
|
||||
|
||||
int errorCount = partitionHealthErrorEvents.Count();
|
||||
|
||||
if (errorCount > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(partitionHealthErrorEvents.First().LastErrorTransitionAt);
|
||||
}
|
||||
}
|
||||
|
||||
var partitionHealthEvents = partitionHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(partitionHealthEvents.First().SourceUtcTimestamp);
|
||||
}
|
||||
catch (FabricException)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
case EntityType.Service:
|
||||
|
||||
var servicequeryDesc = new ServiceHealthQueryDescription(new Uri(nameOrIdFilter))
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var serviceHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetServiceHealthAsync(
|
||||
servicequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (serviceHealth == null || serviceHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity transitioned to Error health state in the last hour?
|
||||
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
var serviceHealthErrorEvents =
|
||||
serviceHealth.HealthEvents.Where(
|
||||
evt => DateTime.UtcNow.Subtract(
|
||||
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(1)).OrderByDescending(
|
||||
o => o.LastErrorTransitionAt);
|
||||
|
||||
int errorCount = serviceHealthErrorEvents.Count();
|
||||
|
||||
if (errorCount > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(serviceHealthErrorEvents.First().LastErrorTransitionAt);
|
||||
}
|
||||
}
|
||||
|
||||
var serviceHealthEvents = serviceHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(serviceHealthEvents.First().SourceUtcTimestamp);
|
||||
}
|
||||
catch (FabricException)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
case EntityType.Disk:
|
||||
case EntityType.Machine:
|
||||
case EntityType.Node:
|
||||
|
||||
var nodequeryDesc = new NodeHealthQueryDescription(nameOrIdFilter)
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var nodeHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
|
||||
nodequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity transitioned to Error health state in the last hour?
|
||||
// This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
var nodeHealthErrorEvents =
|
||||
nodeHealth.HealthEvents.Where(
|
||||
evt => DateTime.UtcNow.Subtract(
|
||||
evt.LastErrorTransitionAt) <= TimeSpan.FromHours(2)).OrderByDescending(
|
||||
o => o.LastErrorTransitionAt);
|
||||
|
||||
int errorCount = nodeHealthErrorEvents.Count();
|
||||
|
||||
if (errorCount > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(nodeHealthErrorEvents.First().LastErrorTransitionAt);
|
||||
}
|
||||
}
|
||||
|
||||
var nodeHealthEvents = nodeHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(nodeHealthEvents.First().SourceUtcTimestamp);
|
||||
}
|
||||
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TaskCanceledException || e is TimeoutException)
|
||||
{
|
||||
string message = $"Unable to get {healthState} health state duration for {entityType}: {e.Message}";
|
||||
FabricHealerManager.RepairLogger.LogWarning(message);
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
default:
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
}
|
||||
|
||||
// TOTHINK: Should this look at any repair and apply a probation to it (so, not just FH-scheduled/executed repairs).
|
||||
// This mainly makes sense for node-level repairs (machine).
|
||||
internal static async Task<bool> IsRepairInPostProbationAsync(TimeSpan probationPeriod, TelemetryData repairData, CancellationToken cancellationToken)
|
||||
|
|
|
@ -15,6 +15,7 @@ namespace FabricHealer.Repair.Guan
|
|||
{
|
||||
private static TelemetryData RepairData;
|
||||
private static GetEntityHealthStateDurationPredicateType Instance;
|
||||
private static RepairTaskManager RepairTaskManager;
|
||||
|
||||
private class Resolver : GroundPredicateResolver
|
||||
{
|
||||
|
@ -24,60 +25,21 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
}
|
||||
|
||||
// GetEntityHealthStateDuration(?HealthStateDuration, Machine, State=Error)
|
||||
protected override async Task<Term> GetNextTermAsync()
|
||||
{
|
||||
if (Input.Arguments.Count != 3)
|
||||
{
|
||||
throw new GuanException("GetCurrentEntityHealthStateDuration predicate requires 3 arguments.");
|
||||
}
|
||||
|
||||
TimeSpan duration;
|
||||
duration = await RepairTaskManager.GetEntityCurrentHealthStateDurationAsync(RepairData, FabricHealerManager.Token);
|
||||
|
||||
if (!Enum.TryParse((string)Input.Arguments[1].Value.GetEffectiveTerm().GetObjectValue(), out EntityType entityType))
|
||||
{
|
||||
throw new GuanException("The second argument of GetCurrentEntityHealthStateDuration must be a valid EntityType value (Application, Service, Node, Machine, etc..)");
|
||||
}
|
||||
|
||||
if (!Enum.TryParse((string)Input.Arguments[2].Value.GetEffectiveTerm().GetObjectValue(), out HealthState state))
|
||||
{
|
||||
throw new GuanException("The third argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
|
||||
}
|
||||
|
||||
switch (entityType)
|
||||
{
|
||||
case EntityType.Application:
|
||||
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ApplicationName, state, FabricHealerManager.Token);
|
||||
break;
|
||||
|
||||
case EntityType.Disk:
|
||||
case EntityType.Machine:
|
||||
case EntityType.Node:
|
||||
|
||||
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.NodeName, state, FabricHealerManager.Token);
|
||||
break;
|
||||
|
||||
case EntityType.Partition:
|
||||
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.PartitionId.ToString(), state, FabricHealerManager.Token);
|
||||
break;
|
||||
|
||||
case EntityType.Service:
|
||||
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ServiceName, state, FabricHealerManager.Token);
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported entity type: {entityType}");
|
||||
}
|
||||
|
||||
var result = new CompoundTerm(this.Input.Functor);
|
||||
var result = new CompoundTerm(Input.Functor);
|
||||
result.AddArgument(new Constant(duration), "0");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
public static GetEntityHealthStateDurationPredicateType Singleton(string name, TelemetryData repairData)
|
||||
public static GetEntityHealthStateDurationPredicateType Singleton(string name, TelemetryData repairData, RepairTaskManager repairTaskManager)
|
||||
{
|
||||
RepairData = repairData;
|
||||
RepairTaskManager = repairTaskManager;
|
||||
return Instance ??= new GetEntityHealthStateDurationPredicateType(name);
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
if (timeRange > TimeSpan.MinValue)
|
||||
{
|
||||
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(RepairData.Property, timeRange);
|
||||
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(RepairData, timeRange);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -18,7 +18,7 @@ using FabricHealer.Interfaces;
|
|||
using Guan.Logic;
|
||||
using FabricHealer.Repair.Guan;
|
||||
using FabricHealer.Utilities;
|
||||
using System.Transactions;
|
||||
using System.Fabric.Description;
|
||||
|
||||
namespace FabricHealer.Repair
|
||||
{
|
||||
|
@ -32,12 +32,13 @@ namespace FabricHealer.Repair
|
|||
private readonly DateTime healthEventsListCreationTime = DateTime.UtcNow;
|
||||
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
|
||||
private DateTime lastHealthEventsListClearDateTime;
|
||||
internal readonly List<HealthEvent> detectedHealthEvents = new List<HealthEvent>();
|
||||
internal readonly List<(string entityName, HealthEvent healthEvent)> detectedHealthEvents;
|
||||
|
||||
public RepairTaskManager()
|
||||
{
|
||||
repairExecutor = new RepairExecutor();
|
||||
repairTaskEngine = new RepairTaskEngine();
|
||||
detectedHealthEvents = new List<(string id, HealthEvent healthEvent)>();
|
||||
lastHealthEventsListClearDateTime = healthEventsListCreationTime;
|
||||
}
|
||||
|
||||
|
@ -140,7 +141,7 @@ namespace FabricHealer.Repair
|
|||
functorTable.Add(CheckInsideScheduleIntervalPredicateType.Singleton(RepairConstants.CheckInsideScheduleInterval, repairData));
|
||||
functorTable.Add(CheckOutstandingRepairsPredicateType.Singleton(RepairConstants.CheckOutstandingRepairs, repairData));
|
||||
functorTable.Add(EmitMessagePredicateType.Singleton(RepairConstants.EmitMessage));
|
||||
functorTable.Add(GetEntityHealthStateDurationPredicateType.Singleton(RepairConstants.GetEntityHealthStateDuration, repairData));
|
||||
functorTable.Add(GetEntityHealthStateDurationPredicateType.Singleton(RepairConstants.GetEntityHealthStateDuration, repairData, this));
|
||||
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, repairData));
|
||||
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
|
||||
|
||||
|
@ -987,17 +988,37 @@ namespace FabricHealer.Repair
|
|||
}
|
||||
|
||||
// Support for GetHealthEventHistoryPredicateType, which enables time-scoping logic rules based on health events related to specific SF entities/targets.
|
||||
internal int GetEntityHealthEventCountWithinTimeRange(string property, TimeSpan timeWindow)
|
||||
internal int GetEntityHealthEventCountWithinTimeRange(TelemetryData repairData, TimeSpan timeWindow)
|
||||
{
|
||||
int count = 0;
|
||||
var healthEvents = detectedHealthEvents.Where(evt => evt.HealthInformation.Property == property);
|
||||
|
||||
if (healthEvents == null || !healthEvents.Any())
|
||||
if (repairData == null || detectedHealthEvents == null || !detectedHealthEvents.Any())
|
||||
{
|
||||
return count;
|
||||
}
|
||||
|
||||
foreach (HealthEvent healthEvent in healthEvents)
|
||||
string id = string.Empty;
|
||||
|
||||
switch (repairData.EntityType)
|
||||
{
|
||||
case EntityType.Application:
|
||||
id = repairData.ApplicationName;
|
||||
break;
|
||||
|
||||
case EntityType.Service:
|
||||
id = repairData.ServiceName;
|
||||
break;
|
||||
|
||||
case EntityType.Disk:
|
||||
case EntityType.Machine:
|
||||
case EntityType.Node:
|
||||
id = repairData.NodeName;
|
||||
break;
|
||||
}
|
||||
|
||||
var entityHealthEvents = detectedHealthEvents.Where(
|
||||
evt => evt.entityName == id && evt.healthEvent.HealthInformation.Property == repairData.Property);
|
||||
|
||||
foreach (var (_, healthEvent) in entityHealthEvents)
|
||||
{
|
||||
if (DateTime.UtcNow.Subtract(healthEvent.SourceUtcTimestamp) > timeWindow)
|
||||
{
|
||||
|
@ -1016,6 +1037,165 @@ namespace FabricHealer.Repair
|
|||
return count;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the amount of time the target entity (application, node, etc) has been in the specified health state.
|
||||
/// </summary>
|
||||
/// <param name="repairData">TelemetryData instance that supplies the target EntityType, the entity name (ApplicationName, ServiceName or NodeName), the health event Property, and the HealthState to match.</param>
|
||||
/// <param name="token">CancellationToken</param>
|
||||
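/// <returns>Time elapsed since the relevant health event's SourceUtcTimestamp, or TimeSpan.MinValue if the health query returns no events, a handled exception occurs, or the entity type is not supported.</returns>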
|
||||
internal async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(
|
||||
TelemetryData repairData,
|
||||
CancellationToken token)
|
||||
{
|
||||
HealthEventsFilter healthEventsFilter = new HealthEventsFilter();
|
||||
|
||||
if (repairData.HealthState == HealthState.Warning)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Warning;
|
||||
}
|
||||
else if (repairData.HealthState == HealthState.Error)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Error;
|
||||
}
|
||||
else if (repairData.HealthState == HealthState.Ok)
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.Ok;
|
||||
}
|
||||
else
|
||||
{
|
||||
healthEventsFilter.HealthStateFilterValue = HealthStateFilter.None;
|
||||
}
|
||||
|
||||
switch (repairData.EntityType)
|
||||
{
|
||||
case EntityType.Application:
|
||||
|
||||
var appqueryDesc = new ApplicationHealthQueryDescription(new Uri(repairData.ApplicationName))
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var appHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
|
||||
appqueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (appHealth == null || appHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity been put into Error health state in the last 2 hours?
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
if (GetEntityHealthEventCountWithinTimeRange(repairData, TimeSpan.FromHours(2)) > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(
|
||||
detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.ApplicationName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property).Last().healthEvent.SourceUtcTimestamp);
|
||||
}
|
||||
}
|
||||
|
||||
var appHealthEvents = appHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
|
||||
// return the time since the last health event was issued, as a TimeSpan.
|
||||
return DateTime.UtcNow.Subtract(appHealthEvents.First().SourceUtcTimestamp);
|
||||
|
||||
}
|
||||
catch (FabricException)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
case EntityType.Service:
|
||||
|
||||
var servicequeryDesc = new ServiceHealthQueryDescription(new Uri(repairData.ServiceName))
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var serviceHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetServiceHealthAsync(
|
||||
servicequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (serviceHealth == null || serviceHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity been put into Error health state in the last 2 hours?
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
if (GetEntityHealthEventCountWithinTimeRange(repairData, TimeSpan.FromHours(2)) > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(
|
||||
detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.ServiceName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property).Last().healthEvent.SourceUtcTimestamp);
|
||||
}
|
||||
}
|
||||
|
||||
var serviceHealthEvents = serviceHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(serviceHealthEvents.First().SourceUtcTimestamp);
|
||||
}
|
||||
catch (FabricException)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
case EntityType.Disk:
|
||||
case EntityType.Machine:
|
||||
case EntityType.Node:
|
||||
|
||||
var nodequeryDesc = new NodeHealthQueryDescription(repairData.NodeName)
|
||||
{
|
||||
EventsFilter = healthEventsFilter
|
||||
};
|
||||
|
||||
try
|
||||
{
|
||||
var nodeHealth =
|
||||
await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
|
||||
nodequeryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
|
||||
|
||||
if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
|
||||
{
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
// How many times has the entity been put into Error health state in the last 2 hours?
|
||||
if (healthEventsFilter.HealthStateFilterValue == HealthStateFilter.Error)
|
||||
{
|
||||
if (GetEntityHealthEventCountWithinTimeRange(repairData, TimeSpan.FromHours(2)) > 1)
|
||||
{
|
||||
return DateTime.UtcNow.Subtract(
|
||||
detectedHealthEvents.Where(
|
||||
evt => evt.entityName == repairData.NodeName &&
|
||||
evt.healthEvent.HealthInformation.Property == repairData.Property).Last().healthEvent.SourceUtcTimestamp);
|
||||
}
|
||||
}
|
||||
|
||||
var nodeHealthEvents = nodeHealth.HealthEvents.OrderByDescending(o => o.SourceUtcTimestamp);
|
||||
return DateTime.UtcNow.Subtract(nodeHealthEvents.First().SourceUtcTimestamp);
|
||||
}
|
||||
catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TaskCanceledException || e is TimeoutException)
|
||||
{
|
||||
string message = $"Unable to get {repairData.HealthState} health state duration for {repairData.EntityType}: {e.Message}";
|
||||
FabricHealerManager.RepairLogger.LogWarning(message);
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
|
||||
default:
|
||||
return TimeSpan.MinValue;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// This function checks to see if the target of a repair is healthy after the repair task completed.
|
||||
/// This will signal the result via telemetry and as a health event.
|
||||
|
|
Загрузка…
Ссылка в новой задаче