New impl for evthealthduration, updated predicates.

This commit is contained in:
Charles Torre 2022-11-14 15:34:33 -08:00
Родитель 1eff56ffe5
Коммит 7b892cba3f
6 изменённых файлов: 202 добавлений и 280 удалений

Просмотреть файл

@ -1024,7 +1024,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
this.repairTaskManager.detectedHealthEvents.Add(evt);
this.repairTaskManager.detectedHealthEvents.Add((repairData.ApplicationName, evt));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1346,7 +1346,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add(evt);
repairTaskManager.detectedHealthEvents.Add((repairData.ServiceName, evt));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1524,7 +1524,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add(evt);
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1574,7 +1574,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add(evt);
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, evt));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
@ -1633,7 +1633,7 @@ namespace FabricHealer
ConfigSettings.EnableVerboseLogging);
// Update the in-memory HealthEvent List.
repairTaskManager.detectedHealthEvents.Add(healthEvent);
repairTaskManager.detectedHealthEvents.Add((repairData.NodeName, healthEvent));
// Start the repair workflow.
await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);

Просмотреть файл

@ -61,6 +61,9 @@ Mitigate(HealthState=?healthState) :- not(?healthState == Error), !.
## Don't proceed if there are 2 machine repairs currently active in the cluster.
Mitigate() :- CheckOutstandingRepairs(2), !.
## Don't proceed if the target machine hasn't been in Error (including cyclic Up/Down) state for at least two hours.
Mitigate(HealthState=Error) :- GetEntityHealthStateDuration(?duration), ?duration <= 02:00:00, !.
## Don't schedule a repair if one was scheduled less than 10 minutes ago. Do we want this to account for all repairs (not just FH-scheduled)?
Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.

Просмотреть файл

@ -470,229 +470,6 @@ namespace FabricHealer.Repair
return count;
}
/// <summary>
/// Returns the amount of time the target entity (application, node, etc.) has been in the specified health state.
/// </summary>
/// <param name="entityType">EntityType of the target; selects which health query is issued.</param>
/// <param name="nameOrIdFilter">String representation of the target entity's name or ID (e.g., application name or node name or partition id)</param>
/// <param name="healthState">Target HealthState to match.</param>
/// <param name="token">CancellationToken</param>
/// <returns>
/// The elapsed time computed from the entity's matching health events, or TimeSpan.MinValue when
/// the entity type is not supported, the query returns no matching health events, or the query
/// fails with one of the exception types handled for that entity type.
/// </returns>
internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(
    EntityType entityType,
    string nameOrIdFilter,
    HealthState healthState,
    CancellationToken token)
{
    // Only events in the requested health state are returned by the health queries below.
    var healthEventsFilter = new HealthEventsFilter
    {
        HealthStateFilterValue = healthState switch
        {
            HealthState.Warning => HealthStateFilter.Warning,
            HealthState.Error => HealthStateFilter.Error,
            HealthState.Ok => HealthStateFilter.Ok,
            _ => HealthStateFilter.None
        }
    };

    bool isErrorFilter = healthState == HealthState.Error;

    // Every handled query failure is logged and reported to the caller as TimeSpan.MinValue.
    TimeSpan OnQueryFailure(Exception e)
    {
        string message = $"Unable to get {healthState} health state duration for {entityType}: {e.Message}";
        FabricHealerManager.RepairLogger.LogWarning(message);
        return TimeSpan.MinValue;
    }

    switch (entityType)
    {
        case EntityType.Application:
        {
            var appQueryDesc = new ApplicationHealthQueryDescription(new Uri(nameOrIdFilter))
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var appHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
                            appQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetDurationFromHealthEvents(appHealth, isErrorFilter, TimeSpan.FromHours(1));
            }
            catch (FabricException e)
            {
                return OnQueryFailure(e);
            }
        }

        case EntityType.Partition:
        {
            var partitionQueryDesc = new PartitionHealthQueryDescription(Guid.Parse(nameOrIdFilter))
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var partitionHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetPartitionHealthAsync(
                            partitionQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetDurationFromHealthEvents(partitionHealth, isErrorFilter, TimeSpan.FromHours(1));
            }
            catch (FabricException e)
            {
                return OnQueryFailure(e);
            }
        }

        case EntityType.Service:
        {
            var serviceQueryDesc = new ServiceHealthQueryDescription(new Uri(nameOrIdFilter))
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var serviceHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetServiceHealthAsync(
                            serviceQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetDurationFromHealthEvents(serviceHealth, isErrorFilter, TimeSpan.FromHours(1));
            }
            catch (FabricException e)
            {
                return OnQueryFailure(e);
            }
        }

        case EntityType.Disk:
        case EntityType.Machine:
        case EntityType.Node:
        {
            var nodeQueryDesc = new NodeHealthQueryDescription(nameOrIdFilter)
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var nodeHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
                            nodeQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                // Node-level targets use a 2 hour Error-transition window; the other entity types use 1 hour.
                return GetDurationFromHealthEvents(nodeHealth, isErrorFilter, TimeSpan.FromHours(2));
            }
            catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TaskCanceledException || e is TimeoutException)
            {
                return OnQueryFailure(e);
            }
        }

        default:
            return TimeSpan.MinValue;
    }
}

// Computes the health state duration from the events of a single entity health query result.
private static TimeSpan GetDurationFromHealthEvents(
    EntityHealth entityHealth,
    bool isErrorFilter,
    TimeSpan errorTransitionWindow)
{
    if (entityHealth == null || entityHealth.HealthEvents.Count == 0)
    {
        return TimeSpan.MinValue;
    }

    DateTime utcNow = DateTime.UtcNow;

    // How many times has the entity transitioned to Error health state within the window?
    // This is not going to work if the same event is created in a cycle. TODO: figure out how to do this correctly.
    if (isErrorFilter)
    {
        // Materialize once so the sequence is not re-enumerated for the count and the maximum.
        var recentErrorTransitions =
            entityHealth.HealthEvents
                        .Select(evt => evt.LastErrorTransitionAt)
                        .Where(transitionedAt => utcNow.Subtract(transitionedAt) <= errorTransitionWindow)
                        .ToList();

        if (recentErrorTransitions.Count > 1)
        {
            // Time since the most recent transition to Error.
            return utcNow.Subtract(recentErrorTransitions.Max());
        }
    }

    // Time since the most recent health event was issued, as a TimeSpan.
    return utcNow.Subtract(entityHealth.HealthEvents.Max(evt => evt.SourceUtcTimestamp));
}
// TOTHINK: Should this look at any repair and apply a probation to it (so, not just FH-scheduled/executed repairs).
// This mainly makes sense for node-level repairs (machine).
internal static async Task<bool> IsRepairInPostProbationAsync(TimeSpan probationPeriod, TelemetryData repairData, CancellationToken cancellationToken)

Просмотреть файл

@ -15,6 +15,7 @@ namespace FabricHealer.Repair.Guan
{
private static TelemetryData RepairData;
private static GetEntityHealthStateDurationPredicateType Instance;
private static RepairTaskManager RepairTaskManager;
private class Resolver : GroundPredicateResolver
{
@ -24,60 +25,21 @@ namespace FabricHealer.Repair.Guan
}
// GetEntityHealthStateDuration(?HealthStateDuration, Machine, State=Error)
protected override async Task<Term> GetNextTermAsync()
{
if (Input.Arguments.Count != 3)
{
throw new GuanException("GetCurrentEntityHealthStateDuration predicate requires 3 arguments.");
}
TimeSpan duration;
duration = await RepairTaskManager.GetEntityCurrentHealthStateDurationAsync(RepairData, FabricHealerManager.Token);
if (!Enum.TryParse((string)Input.Arguments[1].Value.GetEffectiveTerm().GetObjectValue(), out EntityType entityType))
{
throw new GuanException("The second argument of GetCurrentEntityHealthStateDuration must be a valid EntityType value (Application, Service, Node, Machine, etc..)");
}
if (!Enum.TryParse((string)Input.Arguments[2].Value.GetEffectiveTerm().GetObjectValue(), out HealthState state))
{
throw new GuanException("The third argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
}
switch (entityType)
{
case EntityType.Application:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ApplicationName, state, FabricHealerManager.Token);
break;
case EntityType.Disk:
case EntityType.Machine:
case EntityType.Node:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.NodeName, state, FabricHealerManager.Token);
break;
case EntityType.Partition:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.PartitionId.ToString(), state, FabricHealerManager.Token);
break;
case EntityType.Service:
duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, RepairData.ServiceName, state, FabricHealerManager.Token);
break;
default:
throw new GuanException($"Unsupported entity type: {entityType}");
}
var result = new CompoundTerm(this.Input.Functor);
var result = new CompoundTerm(Input.Functor);
result.AddArgument(new Constant(duration), "0");
return result;
}
}
public static GetEntityHealthStateDurationPredicateType Singleton(string name, TelemetryData repairData)
public static GetEntityHealthStateDurationPredicateType Singleton(string name, TelemetryData repairData, RepairTaskManager repairTaskManager)
{
RepairData = repairData;
RepairTaskManager = repairTaskManager;
return Instance ??= new GetEntityHealthStateDurationPredicateType(name);
}

Просмотреть файл

@ -32,7 +32,7 @@ namespace FabricHealer.Repair.Guan
if (timeRange > TimeSpan.MinValue)
{
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(RepairData.Property, timeRange);
eventCount = RepairTaskManager.GetEntityHealthEventCountWithinTimeRange(RepairData, timeRange);
}
else
{

Просмотреть файл

@ -18,7 +18,7 @@ using FabricHealer.Interfaces;
using Guan.Logic;
using FabricHealer.Repair.Guan;
using FabricHealer.Utilities;
using System.Transactions;
using System.Fabric.Description;
namespace FabricHealer.Repair
{
@ -32,12 +32,13 @@ namespace FabricHealer.Repair
private readonly DateTime healthEventsListCreationTime = DateTime.UtcNow;
private readonly TimeSpan maxLifeTimeHealthEventsData = TimeSpan.FromDays(2);
private DateTime lastHealthEventsListClearDateTime;
internal readonly List<HealthEvent> detectedHealthEvents = new List<HealthEvent>();
internal readonly List<(string entityName, HealthEvent healthEvent)> detectedHealthEvents;
public RepairTaskManager()
{
repairExecutor = new RepairExecutor();
repairTaskEngine = new RepairTaskEngine();
detectedHealthEvents = new List<(string id, HealthEvent healthEvent)>();
lastHealthEventsListClearDateTime = healthEventsListCreationTime;
}
@ -140,7 +141,7 @@ namespace FabricHealer.Repair
functorTable.Add(CheckInsideScheduleIntervalPredicateType.Singleton(RepairConstants.CheckInsideScheduleInterval, repairData));
functorTable.Add(CheckOutstandingRepairsPredicateType.Singleton(RepairConstants.CheckOutstandingRepairs, repairData));
functorTable.Add(EmitMessagePredicateType.Singleton(RepairConstants.EmitMessage));
functorTable.Add(GetEntityHealthStateDurationPredicateType.Singleton(RepairConstants.GetEntityHealthStateDuration, repairData));
functorTable.Add(GetEntityHealthStateDurationPredicateType.Singleton(RepairConstants.GetEntityHealthStateDuration, repairData, this));
functorTable.Add(GetHealthEventHistoryPredicateType.Singleton(RepairConstants.GetHealthEventHistory, this, repairData));
functorTable.Add(GetRepairHistoryPredicateType.Singleton(RepairConstants.GetRepairHistory, repairData));
@ -987,17 +988,37 @@ namespace FabricHealer.Repair
}
// Support for GetHealthEventHistoryPredicateType, which enables time-scoping logic rules based on health events related to specific SF entities/targets.
internal int GetEntityHealthEventCountWithinTimeRange(string property, TimeSpan timeWindow)
internal int GetEntityHealthEventCountWithinTimeRange(TelemetryData repairData, TimeSpan timeWindow)
{
int count = 0;
var healthEvents = detectedHealthEvents.Where(evt => evt.HealthInformation.Property == property);
if (healthEvents == null || !healthEvents.Any())
if (repairData == null || detectedHealthEvents == null || !detectedHealthEvents.Any())
{
return count;
}
foreach (HealthEvent healthEvent in healthEvents)
string id = string.Empty;
switch (repairData.EntityType)
{
case EntityType.Application:
id = repairData.ApplicationName;
break;
case EntityType.Service:
id = repairData.ServiceName;
break;
case EntityType.Disk:
case EntityType.Machine:
case EntityType.Node:
id = repairData.NodeName;
break;
}
var entityHealthEvents = detectedHealthEvents.Where(
evt => evt.entityName == id && evt.healthEvent.HealthInformation.Property == repairData.Property);
foreach (var (_, healthEvent) in entityHealthEvents)
{
if (DateTime.UtcNow.Subtract(healthEvent.SourceUtcTimestamp) > timeWindow)
{
@ -1016,6 +1037,165 @@ namespace FabricHealer.Repair
return count;
}
/// <summary>
/// Returns the amount of time the target entity (application, service or node) described by the
/// supplied repair data has been in the health state recorded on that repair data.
/// </summary>
/// <param name="repairData">
/// TelemetryData instance that supplies the target's EntityType, its name (ApplicationName, ServiceName or NodeName),
/// the HealthState to match and the health event Property used to look up locally detected events.
/// </param>
/// <param name="token">CancellationToken</param>
/// <returns>
/// The elapsed time computed from the entity's matching health events, or TimeSpan.MinValue when
/// repairData is null, the entity type is not supported, the query returns no matching health events,
/// or the query fails with one of the exception types handled for that entity type.
/// </returns>
internal async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(
    TelemetryData repairData,
    CancellationToken token)
{
    // Same null handling as GetEntityHealthEventCountWithinTimeRange: nothing to evaluate.
    if (repairData == null)
    {
        return TimeSpan.MinValue;
    }

    // Only events in the requested health state are returned by the health queries below.
    var healthEventsFilter = new HealthEventsFilter
    {
        HealthStateFilterValue = repairData.HealthState switch
        {
            HealthState.Warning => HealthStateFilter.Warning,
            HealthState.Error => HealthStateFilter.Error,
            HealthState.Ok => HealthStateFilter.Ok,
            _ => HealthStateFilter.None
        }
    };

    // Every handled query failure is logged and reported to the caller as TimeSpan.MinValue.
    TimeSpan OnQueryFailure(Exception e)
    {
        string message = $"Unable to get {repairData.HealthState} health state duration for {repairData.EntityType}: {e.Message}";
        FabricHealerManager.RepairLogger.LogWarning(message);
        return TimeSpan.MinValue;
    }

    switch (repairData.EntityType)
    {
        case EntityType.Application:
        {
            var appQueryDesc = new ApplicationHealthQueryDescription(new Uri(repairData.ApplicationName))
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var appHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetApplicationHealthAsync(
                            appQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetHealthStateDurationFromEvents(appHealth, repairData.ApplicationName, repairData);
            }
            catch (FabricException e)
            {
                return OnQueryFailure(e);
            }
        }

        case EntityType.Service:
        {
            var serviceQueryDesc = new ServiceHealthQueryDescription(new Uri(repairData.ServiceName))
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var serviceHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetServiceHealthAsync(
                            serviceQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetHealthStateDurationFromEvents(serviceHealth, repairData.ServiceName, repairData);
            }
            catch (FabricException e)
            {
                return OnQueryFailure(e);
            }
        }

        case EntityType.Disk:
        case EntityType.Machine:
        case EntityType.Node:
        {
            var nodeQueryDesc = new NodeHealthQueryDescription(repairData.NodeName)
            {
                EventsFilter = healthEventsFilter
            };

            try
            {
                var nodeHealth =
                    await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
                            nodeQueryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);

                return GetHealthStateDurationFromEvents(nodeHealth, repairData.NodeName, repairData);
            }
            catch (Exception e) when (e is ArgumentException || e is FabricException || e is InvalidOperationException || e is TaskCanceledException || e is TimeoutException)
            {
                return OnQueryFailure(e);
            }
        }

        default:
            return TimeSpan.MinValue;
    }
}

// Computes the health state duration for one entity from its health query result and the
// in-memory list of health events this instance has detected for that entity.
private TimeSpan GetHealthStateDurationFromEvents(
    EntityHealth entityHealth,
    string entityName,
    TelemetryData repairData)
{
    if (entityHealth == null || entityHealth.HealthEvents.Count == 0)
    {
        return TimeSpan.MinValue;
    }

    // How many times has the entity been put into Error health state in the last 2 hours?
    // When it happened more than once, measure from the most recently recorded matching event
    // in detectedHealthEvents rather than from the health query result.
    if (repairData.HealthState == HealthState.Error
        && GetEntityHealthEventCountWithinTimeRange(repairData, TimeSpan.FromHours(2)) > 1)
    {
        var latestDetected = detectedHealthEvents.Last(
            evt => evt.entityName == entityName
                   && evt.healthEvent.HealthInformation.Property == repairData.Property);

        return DateTime.UtcNow.Subtract(latestDetected.healthEvent.SourceUtcTimestamp);
    }

    // Time since the most recent health event was issued, as a TimeSpan.
    return DateTime.UtcNow.Subtract(entityHealth.HealthEvents.Max(evt => evt.SourceUtcTimestamp));
}
/// <summary>
/// This function checks to see if the target of a repair is healthy after the repair task completed.
/// This will signal the result via telemetry and as a health event.