From c0d34c5da4fc6da87e089b8c0fc20755fd7f3802 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 14 Jul 2021 17:47:49 -0700 Subject: [PATCH] Bug fixes in rules. New feature for rules (DoHealthChecks). --- FHTest/FHUnitTests.cs | 10 +++--- .../Config/Rules/AppRules.config.txt | 8 ++--- .../Config/Rules/DiskRules.config.txt | 2 +- .../Config/Rules/FabricNodeRules.config.txt | 2 +- .../Config/Rules/ReplicaRules.config.txt | 10 +++--- .../Guan/RestartCodePackagePredicateType.cs | 34 ++++++++++-------- .../Guan/RestartFabricNodePredicateType.cs | 36 +++++++++++-------- ...RestartFabricSystemProcessPredicateType.cs | 31 +++++++++------- .../Guan/RestartReplicaPredicateType.cs | 32 ++++++++++------- .../Repair/Guan/RestartVMPredicateType.cs | 31 +++++++++------- FabricHealer/Repair/RepairPolicy.cs | 9 +++++ FabricHealer/Repair/RepairTaskEngine.cs | 9 +++++ 12 files changed, 133 insertions(+), 81 deletions(-) diff --git a/FHTest/FHUnitTests.cs b/FHTest/FHUnitTests.cs index e45ad21..fee8a97 100644 --- a/FHTest/FHUnitTests.cs +++ b/FHTest/FHUnitTests.cs @@ -51,7 +51,7 @@ namespace FHTest // Set this to the full path to your Rules directory in the FabricHealer project's PackageRoot\Config directory. // e.g., if developing on Windows, then something like @"C:\Users\[me]\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\"; - private const string FHRulesDirectory = @""; + private const string FHRulesDirectory = @"C:\Users\ctorre\source\repos\service-fabric-healer\FabricHealer\PackageRoot\Config\Rules\"; /* GuanLogic Tests */ // TODO: More of them. @@ -72,7 +72,7 @@ namespace FHTest var executorData = new RepairExecutorData { - RepairPolicy = new RepairPolicy { RepairAction = RepairActionType.RestartCodePackage }, + RepairPolicy = new RepairPolicy(), }; foreach (var file in Directory.GetFiles(FHRulesDirectory)) @@ -166,9 +166,9 @@ namespace FHTest /* private Helpers */ private async Task TestInitializeGuanAndRunQuery( - TelemetryData foHealthData, - List repairRules, - RepairExecutorData executorData) + TelemetryData foHealthData, + List repairRules, + RepairExecutorData executorData) { var fabricClient = new FabricClient(FabricClientRole.Admin); var repairTaskHelper = new RepairTaskManager(fabricClient, context, token); diff --git a/FabricHealer/PackageRoot/Config/Rules/AppRules.config.txt b/FabricHealer/PackageRoot/Config/Rules/AppRules.config.txt index 09fb127..05ba79b 100644 --- a/FabricHealer/PackageRoot/Config/Rules/AppRules.config.txt +++ b/FabricHealer/PackageRoot/Config/Rules/AppRules.config.txt @@ -88,8 +88,8 @@ TimeScopedRestartCodePackage(?count, ?time) :- GetRepairHistory(?repairCount, Ti ## If we get here, it means the number of repairs for a target has not exceeded the maximum number specified to run within a time window. -## Note you can add an argument (optional) to RestartCodePackage, name it whatever you want or omit the name, it just has to be a TimeSpan value. -## This arg represents how long you want to wait to see if the target app service is healed. Internally, FH will check healthstate of repair target after repair execution (with sleeps) -## for a default of 10 minutes for app repair, 30 minutes for a FabricNode repair, and 2 hours for a VM repair. All FH external repair predicates support this optional arg. +## Note you can add up to two optional arguments to RestartCodePackage, name them whatever you want or omit the names, it just has to be either a TimeSpan value for how long to wait +## for the repair target to become healthy or a bool for whether or not RM should do health checks before and after the repair executes. +## Obviously, you can supply both. See below for an example using both optional arguments (named for clarity..you could also do RestartCodePackage(true, 00:10:00)). -TimeScopedRestartCodePackage() :- RestartCodePackage(). \ No newline at end of file +TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00). \ No newline at end of file diff --git a/FabricHealer/PackageRoot/Config/Rules/DiskRules.config.txt b/FabricHealer/PackageRoot/Config/Rules/DiskRules.config.txt index 5b5b137..d1d51b5 100644 --- a/FabricHealer/PackageRoot/Config/Rules/DiskRules.config.txt +++ b/FabricHealer/PackageRoot/Config/Rules/DiskRules.config.txt @@ -1,7 +1,7 @@ ## Logic rules for Disk repair. Only file management is supported (file deletion). ## First, check if we are inside run interval. If so, then cut (!). -Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), ! +Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !. ## The sample rule below checks the folder size of SF logs, fabricobserver logs, E:\temp. If the computed size exceeds a supplied threshold, then try and delete the files in the directory. ## The CheckFolderSize external predicate takes a folder path string argument and either a MaxFolderSizeGB or MaxFolderSizeMB argument. diff --git a/FabricHealer/PackageRoot/Config/Rules/FabricNodeRules.config.txt b/FabricHealer/PackageRoot/Config/Rules/FabricNodeRules.config.txt index b70ed7c..8410faf 100644 --- a/FabricHealer/PackageRoot/Config/Rules/FabricNodeRules.config.txt +++ b/FabricHealer/PackageRoot/Config/Rules/FabricNodeRules.config.txt @@ -1,7 +1,7 @@ ## Logic rules for Service Fabric Node repairs. ## First check if we are inside the run interval. If so, cut (!). -Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), ! +Mitigate() :- CheckInsideRunInterval(RunInterval=02:00:00), !. ## This rule means that whatever the warning data from FabricObserver happens to be (related to node level healing repairs, of course), restart the target node if ## the repair hasn't run 4 times in the last 8 hours. diff --git a/FabricHealer/PackageRoot/Config/Rules/ReplicaRules.config.txt b/FabricHealer/PackageRoot/Config/Rules/ReplicaRules.config.txt index 176e94e..a7a1c5a 100644 --- a/FabricHealer/PackageRoot/Config/Rules/ReplicaRules.config.txt +++ b/FabricHealer/PackageRoot/Config/Rules/ReplicaRules.config.txt @@ -5,11 +5,11 @@ ## Start Time (UTC): 2020-04-26 19:22:55.492. ## First, check if we are inside run interval. If so, then cut (!), which effectively means stop processing rules (no backtracking to subsequent rules in the file). -Mitigate() :- CheckInsideRunInterval(RunInterval=00:15:00), ! +Mitigate() :- CheckInsideRunInterval(RunInterval=00:15:00), !. ## Set a repair count variable for use by any rule in this file (NOTE: all rules must have the same TimeWindow value) as an internal predicate, _mitigate(?count), ## where ?repairCount and ?count variables are unified when _mitigate(?count) predicate runs. The concept here is sharing a variable value across different rules. -Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?repairCount) +Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?repairCount). ## Now, let's say you wanted to only repair specific Apps or Paritions where related repair TimeWindow values are *not* the same, unlike the above "global" variable rule. ## You could do something like the below three rules, which would mean the _mitigate internal predicate would only run if the supplied Mitigate argument values are matched: @@ -21,10 +21,10 @@ Mitigate() :- GetRepairHistory(?repairCount, TimeWindow=01:00:00), _mitigate(?re ## This means that these predicates either succeed (pass true back) or fail (pass false back) as the result of their execution. So, if one fails, then the next rule will be run, etc. ## Try this. -_mitigate(?count) :- ?count < 4, RestartReplica() +_mitigate(?count) :- ?count < 4, RestartReplica(). ## Else, try this. -_mitigate(?count) :- ?count < 4, RestartCodePackage() +_mitigate(?count) :- ?count < 4, RestartCodePackage(). ## Else, try this. -_mitigate(?count) :- ?count < 2, RestartFabricNode() \ No newline at end of file +_mitigate(?count) :- ?count < 2, RestartFabricNode(). \ No newline at end of file diff --git a/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs b/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs index ceec98e..f06ea60 100644 --- a/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs @@ -42,25 +42,31 @@ namespace FabricHealer.Repair.Guan protected override bool Check() { - int count = Input.Arguments.Count; - - if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan)) - { - throw new GuanException( - "RestartCodePackagePredicate: One optional argument is supported and it must be a TimeSpan " + - "(xx:yy:zz format, for example 00:30:00 represents 30 minutes)."); - } - // RepairPolicy repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartCodePackage; repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId; repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application; - if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan) - { - repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); - } + int count = Input.Arguments.Count; + for (int i = 0; i < count; i++) + { + var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString(); + switch (typeString) + { + case "System.TimeSpan": + repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue(); + break; + + case "System.Boolean": + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + break; + + default: + throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}"); + } + } + // Try to schedule repair with RM. var repairTask = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => RepairTaskManager.ScheduleFabricHealerRmRepairTaskAsync( @@ -93,7 +99,7 @@ namespace FabricHealer.Repair.Guan } private RestartCodePackagePredicateType(string name) - : base(name, true, 0, 1) + : base(name, true, 0, 2) { } diff --git a/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs b/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs index 88cbb33..94ae0c2 100644 --- a/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs @@ -43,27 +43,33 @@ namespace FabricHealer.Repair.Guan protected override bool Check() { - int count = Input.Arguments.Count; - - if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan)) - { - throw new GuanException( - "RestartFabricNodePredicate: One optional argument is supported and it must be a TimeSpan " + - "(xx:yy:zz format, for example 00:30:00 represents 30 minutes)."); - } - - RepairTask repairTask; - // Repair Policy repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartFabricNode; repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId; repairConfiguration.RepairPolicy.TargetType = FOHealthData.ApplicationName == "fabric:/System" ? RepairTargetType.Application : RepairTargetType.Node; - - if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan) + + int count = Input.Arguments.Count; + + for (int i = 0; i < count; i++) { - repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString(); + + switch (typeString) + { + case "System.TimeSpan": + repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue(); + break; + + case "System.Boolean": + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + break; + + default: + throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}"); + } } + RepairTask repairTask; bool success; // This means it's a resumed repair. @@ -141,7 +147,7 @@ namespace FabricHealer.Repair.Guan } private RestartFabricNodePredicateType(string name) - : base(name, true, 0, 1) + : base(name, true, 0, 2) { } diff --git a/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs b/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs index eba835d..64a0f55 100644 --- a/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs @@ -50,23 +50,30 @@ namespace FabricHealer.Repair.Guan return false; } - int count = Input.Arguments.Count; - - if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan)) - { - throw new GuanException( - "RestartFabricSystemProcessPredicate: One optional argument is supported and it must be a TimeSpan " + - "(xx:yy:zz format, for example 00:30:00 represents 30 minutes)."); - } - // RepairPolicy repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartProcess; repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId; repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application; - if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan) + int count = Input.Arguments.Count; + + for (int i = 0; i < count; i++) { - repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString(); + + switch (typeString) + { + case "System.TimeSpan": + repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue(); + break; + + case "System.Boolean": + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + break; + + default: + throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}"); + } } // Try to schedule repair with RM. @@ -101,7 +108,7 @@ namespace FabricHealer.Repair.Guan } private RestartFabricSystemProcessPredicateType(string name) - : base(name, true, 0, 1) + : base(name, true, 0, 2) { } diff --git a/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs b/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs index 8f05193..0d0a9cc 100644 --- a/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs @@ -40,21 +40,29 @@ namespace FabricHealer.Repair.Guan protected override bool Check() { - int count = Input.Arguments.Count; - - if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan)) - { - throw new GuanException( - "RestartReplicaPredicate: One optional argument is supported and it must be a TimeSpan " + - "(xx:yy:zz format, for example 00:30:00 represents 30 minutes)."); - } - repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId; repairConfiguration.RepairPolicy.TargetType = RepairTargetType.Application; + repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartReplica; + + int count = Input.Arguments.Count; - if (count == 1 && Input.Arguments[0].Value.GetValue() is TimeSpan) + for (int i = 0; i < count; i++) { - repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString(); + + switch (typeString) + { + case "System.TimeSpan": + repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue(); + break; + + case "System.Boolean": + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + break; + + default: + throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}"); + } } // Try to schedule repair with RM. @@ -89,7 +97,7 @@ namespace FabricHealer.Repair.Guan } private RestartReplicaPredicateType(string name) - : base(name, true, 0, 1) + : base(name, true, 0, 2) { } diff --git a/FabricHealer/Repair/Guan/RestartVMPredicateType.cs b/FabricHealer/Repair/Guan/RestartVMPredicateType.cs index 013aad1..16c363a 100644 --- a/FabricHealer/Repair/Guan/RestartVMPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartVMPredicateType.cs @@ -40,23 +40,30 @@ namespace FabricHealer.Repair.Guan protected override bool Check() { - int count = Input.Arguments.Count; - - if (count == 1 && Input.Arguments[0].Value.GetValue().GetType() != typeof(TimeSpan)) - { - throw new GuanException( - "RestartVMPredicate: One optional argument is supported and it must be a TimeSpan " + - "(xx:yy:zz format, for example 00:30:00 represents 30 minutes)."); - } - // Repair Policy repairConfiguration.RepairPolicy.RepairAction = RepairActionType.RestartVM; repairConfiguration.RepairPolicy.RepairId = FOHealthData.RepairId; repairConfiguration.RepairPolicy.TargetType = RepairTargetType.VirtualMachine; - if (count == 1) + int count = Input.Arguments.Count; + + for (int i = 0; i < count; i++) { - repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + var typeString = Input.Arguments[i].Value.GetValue().GetType().ToString(); + + switch (typeString) + { + case "System.TimeSpan": + repairConfiguration.RepairPolicy.MaxTimePostRepairHealthCheck = (TimeSpan)Input.Arguments[i].Value.GetEffectiveTerm().GetValue(); + break; + + case "System.Boolean": + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetEffectiveTerm().GetValue(); + break; + + default: + throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetValue().GetType()}"); + } } // FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does, @@ -99,7 +106,7 @@ namespace FabricHealer.Repair.Guan } private RestartVMPredicateType(string name) - : base(name, true, 0, 1) + : base(name, true, 0, 2) { } diff --git a/FabricHealer/Repair/RepairPolicy.cs b/FabricHealer/Repair/RepairPolicy.cs index b047a29..f8a43d9 100644 --- a/FabricHealer/Repair/RepairPolicy.cs +++ b/FabricHealer/Repair/RepairPolicy.cs @@ -43,5 +43,14 @@ namespace FabricHealer.Repair { get; set; } = TimeSpan.MinValue; + + /// + /// Whether or not RepairManager should do preparing and restoring health checks before approving the target repair job. + /// Setting this to true will increase the time it takes to complete a repair. + /// + public bool DoHealthChecks + { + get; set; + } } } diff --git a/FabricHealer/Repair/RepairTaskEngine.cs b/FabricHealer/Repair/RepairTaskEngine.cs index 055cc07..b299f0e 100644 --- a/FabricHealer/Repair/RepairTaskEngine.cs +++ b/FabricHealer/Repair/RepairTaskEngine.cs @@ -48,6 +48,15 @@ namespace FabricHealer.Repair string taskId = $"{FHTaskIdPrefix}/{Guid.NewGuid()}/{repair}/{executorData.NodeName}"; bool doHealthChecks = impact != NodeImpactLevel.None; + // Health checks for app level repairs. + if (executorData.RepairPolicy.DoHealthChecks && impact == NodeImpactLevel.None && + (repairAction == RepairActionType.RestartCodePackage || + repairAction == RepairActionType.RestartReplica || + repairAction == RepairActionType.RemoveReplica)) + { + doHealthChecks = true; + } + // Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job). // So, do not do health checks if customer configures FO to emit Error health reports. // In general, FO should *not* be configured to emit Error events. See FO documentation.