Updated rule tracing (ln#), unit tests.
This commit is contained in:
Родитель
941ffd1eb6
Коммит
441937a9b1
|
@ -25,6 +25,7 @@ using System.Xml;
|
|||
using ServiceFabric.Mocks;
|
||||
using static ServiceFabric.Mocks.MockConfigurationPackage;
|
||||
using System.Fabric.Description;
|
||||
using System.Fabric.Query;
|
||||
|
||||
namespace FHTest
|
||||
{
|
||||
|
@ -298,21 +299,28 @@ namespace FHTest
|
|||
[TestMethod]
|
||||
public async Task AllAppRules_EnsureWellFormedRules_QueryInitialized_Successful()
|
||||
{
|
||||
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
|
||||
Guid partition = partitions[0].PartitionInformation.Id;
|
||||
|
||||
// This will be the data used to create a repair task.
|
||||
var repairData = new TelemetryData
|
||||
{
|
||||
ApplicationName = "fabric:/test",
|
||||
ApplicationName = "fabric:/TestApp42",
|
||||
EntityType = EntityType.Service,
|
||||
NodeName = NodeName,
|
||||
Code = SupportedErrorCodes.AppErrorMemoryMB,
|
||||
HealthState = HealthState.Warning,
|
||||
ServiceName = "fabric:/test0/service0",
|
||||
PartitionId = partition.ToString(),
|
||||
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
|
||||
Property = "TestApp42_ChildProcessCreator_MemoryMB",
|
||||
ProcessName = "ChildProcessCreator",
|
||||
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
|
||||
Value = 1024.0
|
||||
};
|
||||
|
||||
repairData.RepairPolicy = new RepairPolicy
|
||||
{
|
||||
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}",
|
||||
RepairId = $"TestApp42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}",
|
||||
AppName = repairData.ApplicationName,
|
||||
RepairIdPrefix = RepairConstants.FHTaskIdPrefix,
|
||||
NodeName = repairData.NodeName,
|
||||
|
@ -423,15 +431,27 @@ namespace FHTest
|
|||
[TestMethod]
|
||||
public async Task AllReplicaRules_EnsureWellFormedRules_QueryInitialized_Successful()
|
||||
{
|
||||
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
|
||||
Guid partition = partitions[0].PartitionInformation.Id;
|
||||
var replicas = await fabricClient.QueryManager.GetReplicaListAsync(partition);
|
||||
Replica replica = replicas[0];
|
||||
long replicaId = replica.Id;
|
||||
|
||||
// This will be the data used to create a repair task.
|
||||
var repairData = new TelemetryData
|
||||
{
|
||||
ApplicationName = "fabric:/test",
|
||||
EntityType = EntityType.Partition,
|
||||
PartitionId = Guid.NewGuid().ToString(),
|
||||
ApplicationName = "fabric:/TestApp42",
|
||||
EntityType = EntityType.Service,
|
||||
NodeName = NodeName,
|
||||
Code = SupportedErrorCodes.AppErrorMemoryMB,
|
||||
HealthState = HealthState.Warning,
|
||||
ServiceName = "fabric:/test0/service0"
|
||||
PartitionId = partition.ToString(),
|
||||
ReplicaId = replicaId,
|
||||
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
|
||||
Property = "TestApp42_ChildProcessCreator_MemoryMB",
|
||||
ProcessName = "ChildProcessCreator",
|
||||
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
|
||||
Value = 1024.0
|
||||
};
|
||||
|
||||
repairData.RepairPolicy = new RepairPolicy
|
||||
|
@ -515,18 +535,25 @@ namespace FHTest
|
|||
{
|
||||
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "testrules_wellformed.guan");
|
||||
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
|
||||
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "testrules_wellformed.guan";
|
||||
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
|
||||
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
|
||||
Guid partition = partitions[0].PartitionInformation.Id;
|
||||
|
||||
// This will be the data used to create a repair task.
|
||||
var repairData = new TelemetryData
|
||||
{
|
||||
ApplicationName = "fabric:/test0",
|
||||
ApplicationName = "fabric:/TestApp42",
|
||||
EntityType = EntityType.Service,
|
||||
NodeName = NodeName,
|
||||
Metric = "Memory",
|
||||
HealthState = HealthState.Warning,
|
||||
Code = SupportedErrorCodes.AppErrorMemoryMB,
|
||||
ServiceName = "fabric:/test0/service0",
|
||||
Value = 42,
|
||||
ReplicaId = default,
|
||||
PartitionId = default
|
||||
HealthState = HealthState.Warning,
|
||||
PartitionId = partition.ToString(),
|
||||
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
|
||||
Property = "TestApp42_ChildProcessCreator_MemoryMB",
|
||||
ProcessName = "ChildProcessCreator",
|
||||
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
|
||||
Value = 1024.0
|
||||
};
|
||||
|
||||
repairData.RepairPolicy = new RepairPolicy
|
||||
|
|
|
@ -127,6 +127,7 @@ Mitigate(AppName="fabric:/MemoryStress", MetricName="MemoryPercent", MetricValue
|
|||
## Memory - Megabytes in use for any SF service process belonging to the specified SF applications. 5 repairs within a 5-hour window.
|
||||
Mitigate(AppName="fabric:/ContainerFoo", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
Mitigate(AppName="fabric:/ContainerFoo2", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
Mitigate(AppName="fabric:/TestApp42", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
|
||||
|
||||
## Note the constraint on HealthState in the head of the rule below, which applies to only one service, fabric:/Voting/VotingData, in this example (just change the fabric URI for your target).
|
||||
## This is important when you have both Warning and Error thresholds specified for some service for some metric in FabricObserver. You would do that
|
||||
|
|
|
@ -26,7 +26,7 @@ Mitigate() :- GetRepairHistory(?repairCount, 01:00:00), _mitigate(?repairCount).
|
|||
## This means that each of these predicates either succeeds (passes true back) or fails (passes false back) as the result of its execution. So, if one fails, the next rule is run, and so on.
|
||||
|
||||
## Try this.
|
||||
_mitigate(?count) :- ?count < 4, RestartReplica(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:10:00).
|
||||
_mitigate(?count) :- ?count < 400, RestartReplica(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:10:00).
|
||||
|
||||
## Else, try this.
|
||||
_mitigate(?count) :- ?count < 4, RestartCodePackage(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:30:00).
|
||||
|
|
|
@ -47,7 +47,7 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
|
|||
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
|
||||
|
||||
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours.
|
||||
Mitigate() :- CheckInsideHealthStateMinDuration(00:01:00), !.
|
||||
Mitigate() :- CheckInsideHealthStateMinDuration(02:00:00), !.
|
||||
|
||||
## For certain environments, the correct mitigation is to deactivate the target node. The below rule schedules a node deactivation (intent is Pause) repair.
|
||||
Mitigate(Source=?source, Property=?property) :- match(?source, "EventLogWatchdog"), match(?property, "CriticalMachineFailure"),
|
||||
|
|
|
@ -78,7 +78,7 @@ namespace FabricHealer.Repair.Guan
|
|||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"{ruleFileName}#{lineNumber}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
|
||||
$"{ruleFileName}#{lineNumber + 1}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
|
||||
$"Executing logic rule \'{rule}\'",
|
||||
FabricHealerManager.Token);
|
||||
}
|
||||
|
|
|
@ -536,7 +536,7 @@ namespace FabricHealer.Repair
|
|||
|
||||
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
|
||||
LogLevel.Info,
|
||||
$"{ruleFileName}#{lineNumber}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
|
||||
$"{ruleFileName}#{lineNumber + 1}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
|
||||
$"Executing logic rule \'{rule}\'",
|
||||
FabricHealerManager.Token);
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче