Bug fixes in rules. New feature for rules (DoHealthChecks).
This commit is contained in:
Родитель
aec5a734f3
Коммит
c0d34c5da4
|
@ -51,7 +51,7 @@ namespace FHTest
|
|||
|
||||
// Set this to the full path to your Rules directory in the FabricHealer project's PackageRoot\Config directory.
|
||||
// e.g., if developing on Windows, then something like @"C:\Users\[me]\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\";
|
||||
private const string FHRulesDirectory = @"";
|
||||
private const string FHRulesDirectory = @"C:\Users\ctorre\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\";
|
||||
|
||||
/* GuanLogic Tests */
|
||||
// TODO: More of them.
|
||||
|
@ -72,7 +72,7 @@ namespace FHTest
|
|||
|
||||
var executorData = new RepairExecutorData
|
||||
{
|
||||
RepairPolicy = new RepairPolicy { RepairAction = RepairActionType.RestartCodePackage },
|
||||
RepairPolicy = new RepairPolicy(),
|
||||
};
|
||||
|
||||
foreach (var file in Directory.GetFiles(FHRulesDirectory))
|
||||
|
@ -166,9 +166,9 @@ namespace FHTest
|
|||
/* private Helpers */
|
||||
|
||||
private async Task<bool> TestInitializeGuanAndRunQuery(
|
||||
TelemetryData foHealthData,
|
||||
List<string> repairRules,
|
||||
RepairExecutorData executorData)
|
||||
TelemetryData foHealthData,
|
||||
List<string> repairRules,
|
||||
RepairExecutorData executorData)
|
||||
{
|
||||
var fabricClient = new FabricClient(FabricClientRole.Admin);
|
||||
var repairTaskHelper = new RepairTaskManager(fabricClient, context, token);
|
||||
|
|
|
@ -88,8 +88,8 @@ TimeScopedRestartCodePackage(?count, ?time) :- GetRepairHistory(?repairCount, Ti
|
|||
|
||||
|
||||
## If we get here, it means the number of repairs for a target has not exceeded the maximum number specified to run within a time window.
|
||||
## Note you can add an argument (optional) to RestartCodePackage, name it whatever you want or omit the name, it just has to be a TimeSpan value.
|
||||
## This arg represents how long you want to wait to see if the target app service is healed. Internally, FH will check healthstate of repair target after repair execution (with sleeps)
|
||||
## for a default of 10 minutes for app repair, 30 minutes for a FabricNode repair, and 2 hours for a VM repair. All FH external repair predicates support this optional arg.
|
||||
## Note you can add up to two optional arguments to RestartCodePackage, name them whatever you want or omit the names, it just has to be either a TimeSpan value for how long to wait
|
||||
## for the repair target to become healthy or a bool for whether or not RM should do health checks before and after the repair executes.
|
||||
## Obviously, you can supply both. See below for an example using both optional arguments (named for clarity..you could also do RestartCodePackage(true, 00:10:00)).
|
||||
|
||||
TimeScopedRestartCodePackage() :- RestartCodePackage().
|
||||
TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00).
|
|
@ -1,7 +1,7 @@
|
|||
## Logic rules for Disk repair. Only file management is supported (file deletion).
|
||||
|
||||
## First, check if we are inside run interval. If so, then cut (!).
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !.
|
||||
|
||||
## The sample rule below checks the folder size of SF logs, fabricobserver logs, E:\temp. If the computed size exceeds a supplied threshold, then try and delete the files in the directory.
|
||||
## The CheckFolderSize external predicate takes a folder path string argument and either a MaxFolderSizeGB or MaxFolderSizeMB argument.
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
## Logic rules for Service Fabric Node repairs.
|
||||
|
||||
## First check if we are inside the run interval. If so, cut (!).
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !.
|
||||
|
||||
## This rule means that whatever the warning data from FabricObserver happens to be (related to node level healing repairs, of course), restart the target node if
|
||||
## the repair hasn't run 4 times in the last 8 hours.
|
||||
|
|
|
@ -5,11 +5,11 @@
|
|||
## Start Time (UTC): 2020-04-26 19:22:55.492.
|
||||
|
||||
## First, check if we are inside run interval. If so, then cut (!), which effectively means stop processing rules (no backtracking to subsequent rules in the file).
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=00:15:00), !
|
||||
Mitigate() :- CheckInsideRunInterval(RunInterval=00:15:00), !.
|
||||
|
||||
## Set a repair count variable for use by any rule in this file (NOTE: all rules must have the same TimeWindow value) as an internal predicate, _mitigate(?count),
|
||||
## where ?repairCount and ?count variables are unified when _mitigate(?count) predicate runs. The concept here is sharing a variable value across different rules.
|
||||
Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?repairCount)
|
||||
Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?repairCount).
|
||||
|
||||
## Now, let's say you wanted to only repair specific Apps or Paritions where related repair TimeWindow values are *not* the same, unlike the above "global" variable rule.
|
||||
## You could do something like the below three rules, which would mean the _mitigate internal predicate would only run if the supplied Mitigate argument values are matched:
|
||||
|
@ -21,10 +21,10 @@ Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?re
|
|||
## This means that these predicates either succeed (pass true back) or fail (pass false back) as the result of their execution. So, if one fails, then the next rule will be run, etc.
|
||||
|
||||
## Try this.
|
||||
_mitigate(?count) :- ?count < 4, RestartReplica()
|
||||
_mitigate(?count) :- ?count < 4, RestartReplica().
|
||||
|
||||
## Else, try this.
|
||||
_mitigate(?count) :- ?count < 4, RestartCodePackage()
|
||||
_mitigate(?count) :- ?count < 4, RestartCodePackage().
|
||||
|
||||
## Else, try this.
|
||||
_mitigate(?count) :- ?count < 2, RestartFabricNode()
|
||||
_mitigate(?count) :- ?count < 2, RestartFabricNode().
|
|
@ -42,25 +42,31 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
protected override bool Check()
|
||||
{
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan))
|
||||
{
|
||||
throw new GuanException(
|
||||
"RestartCodePackagePredicate: One optional argument is supported and it must be a TimeSpan " +
|
||||
"(xx:yy:zz format, for example 00:30:00 represents 30 minutes).");
|
||||
}
|
||||
|
||||
// RepairPolicy
|
||||
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage;
|
||||
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
|
||||
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan)
|
||||
{
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
}
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString();
|
||||
switch (typeString)
|
||||
{
|
||||
case "System.TimeSpan":
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
case "System.Boolean":
|
||||
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
// Try to schedule repair with RM.
|
||||
var repairTask = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
|
||||
() => RepairTaskManager.ScheduleFabricHealerRmRepairTaskAsync(
|
||||
|
@ -93,7 +99,7 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
private RestartCodePackagePredicateType(string name)
|
||||
: base(name, true, 0, 1)
|
||||
: base(name, true, 0, 2)
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -43,27 +43,33 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
protected override bool Check()
|
||||
{
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan))
|
||||
{
|
||||
throw new GuanException(
|
||||
"RestartFabricNodePredicate: One optional argument is supported and it must be a TimeSpan " +
|
||||
"(xx:yy:zz format, for example 00:30:00 represents 30 minutes).");
|
||||
}
|
||||
|
||||
RepairTask repairTask;
|
||||
|
||||
// Repair Policy
|
||||
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode;
|
||||
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
|
||||
repairConfiguration.RepairPolicy.TargetType = FOHealthData.ApplicationName == "fabric:/System" ? RepairTargetType.Application : RepairTargetType.Node;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan)
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString();
|
||||
|
||||
switch (typeString)
|
||||
{
|
||||
case "System.TimeSpan":
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
case "System.Boolean":
|
||||
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
RepairTask repairTask;
|
||||
bool success;
|
||||
|
||||
// This means it's a resumed repair.
|
||||
|
@ -141,7 +147,7 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
private RestartFabricNodePredicateType(string name)
|
||||
: base(name, true, 0, 1)
|
||||
: base(name, true, 0, 2)
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -50,23 +50,30 @@ namespace FabricHealer.Repair.Guan
|
|||
return false;
|
||||
}
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan))
|
||||
{
|
||||
throw new GuanException(
|
||||
"RestartFabricSystemProcessPredicate: One optional argument is supported and it must be a TimeSpan " +
|
||||
"(xx:yy:zz format, for example 00:30:00 represents 30 minutes).");
|
||||
}
|
||||
|
||||
// RepairPolicy
|
||||
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartProcess;
|
||||
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
|
||||
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan)
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString();
|
||||
|
||||
switch (typeString)
|
||||
{
|
||||
case "System.TimeSpan":
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
case "System.Boolean":
|
||||
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
// Try to schedule repair with RM.
|
||||
|
@ -101,7 +108,7 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
private RestartFabricSystemProcessPredicateType(string name)
|
||||
: base(name, true, 0, 1)
|
||||
: base(name, true, 0, 2)
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -40,21 +40,29 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
protected override bool Check()
|
||||
{
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan))
|
||||
{
|
||||
throw new GuanException(
|
||||
"RestartReplicaPredicate: One optional argument is supported and it must be a TimeSpan " +
|
||||
"(xx:yy:zz format, for example 00:30:00 represents 30 minutes).");
|
||||
}
|
||||
|
||||
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
|
||||
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application;
|
||||
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartReplica;
|
||||
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan)
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString();
|
||||
|
||||
switch (typeString)
|
||||
{
|
||||
case "System.TimeSpan":
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
case "System.Boolean":
|
||||
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
// Try to schedule repair with RM.
|
||||
|
@ -89,7 +97,7 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
private RestartReplicaPredicateType(string name)
|
||||
: base(name, true, 0, 1)
|
||||
: base(name, true, 0, 2)
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -40,23 +40,30 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
protected override bool Check()
|
||||
{
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan))
|
||||
{
|
||||
throw new GuanException(
|
||||
"RestartVMPredicate: One optional argument is supported and it must be a TimeSpan " +
|
||||
"(xx:yy:zz format, for example 00:30:00 represents 30 minutes).");
|
||||
}
|
||||
|
||||
// Repair Policy
|
||||
repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartVM;
|
||||
repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId;
|
||||
repairConfiguration.RepairPolicy.TargetType = RepairTargetType.VirtualMachine;
|
||||
|
||||
if (count == 1)
|
||||
int count = Input.Arguments.Count;
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString();
|
||||
|
||||
switch (typeString)
|
||||
{
|
||||
case "System.TimeSpan":
|
||||
repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
case "System.Boolean":
|
||||
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue();
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}");
|
||||
}
|
||||
}
|
||||
|
||||
// FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
|
||||
|
@ -99,7 +106,7 @@ namespace FabricHealer.Repair.Guan
|
|||
}
|
||||
|
||||
private RestartVMPredicateType(string name)
|
||||
: base(name, true, 0, 1)
|
||||
: base(name, true, 0, 2)
|
||||
{
|
||||
|
||||
}
|
||||
|
|
|
@ -43,5 +43,14 @@ namespace FabricHealer.Repair
|
|||
{
|
||||
get; set;
|
||||
} = TimeSpan.MinValue;
|
||||
|
||||
/// <summary>
|
||||
/// Whether or not RepairManager should do preparing and restoring health checks before approving the target repair job.
|
||||
/// Setting this to true will increase the time it takes to complete a repair.
|
||||
/// </summary>
|
||||
public bool DoHealthChecks
|
||||
{
|
||||
get; set;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -48,6 +48,15 @@ namespace FabricHealer.Repair
|
|||
string taskId = $"{FHTaskIdPrefix}/{Guid.NewGuid()}/{repair}/{executorData.NodeName}";
|
||||
bool doHealthChecks = impact != NodeImpactLevel.None;
|
||||
|
||||
// Health checks for app level repairs.
|
||||
if (executorData.RepairPolicy.DoHealthChecks && impact == NodeImpactLevel.None &&
|
||||
(repairAction == RepairActionType.RestartCodePackage ||
|
||||
repairAction == RepairActionType.RestartReplica ||
|
||||
repairAction == RepairActionType.RemoveReplica))
|
||||
{
|
||||
doHealthChecks = true;
|
||||
}
|
||||
|
||||
// Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
|
||||
// So, do not do health checks if customer configures FO to emit Error health reports.
|
||||
// In general, FO should *not* be configured to emit Error events. See FO documentation.
|
||||
|
|
Загрузка…
Ссылка в новой задаче