Updated rule tracing (line number reporting) and unit tests.

This commit is contained in:
Charles Torre 2023-02-27 15:32:15 -08:00
Parent 941ffd1eb6
Commit 441937a9b1
6 changed files with 46 additions and 18 deletions

Просмотреть файл

@ -25,6 +25,7 @@ using System.Xml;
using ServiceFabric.Mocks;
using static ServiceFabric.Mocks.MockConfigurationPackage;
using System.Fabric.Description;
using System.Fabric.Query;
namespace FHTest
{
@ -298,21 +299,28 @@ namespace FHTest
[TestMethod]
public async Task AllAppRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
Guid partition = partitions[0].PartitionInformation.Id;
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test",
ApplicationName = "fabric:/TestApp42",
EntityType = EntityType.Service,
NodeName = NodeName,
Code = SupportedErrorCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
ServiceName = "fabric:/test0/service0",
PartitionId = partition.ToString(),
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
Property = "TestApp42_ChildProcessCreator_MemoryMB",
ProcessName = "ChildProcessCreator",
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
Value = 1024.0
};
repairData.RepairPolicy = new RepairPolicy
{
RepairId = $"Test42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}",
RepairId = $"TestApp42_{SupportedErrorCodes.AppErrorMemoryMB}{NodeName}",
AppName = repairData.ApplicationName,
RepairIdPrefix = RepairConstants.FHTaskIdPrefix,
NodeName = repairData.NodeName,
@ -423,15 +431,27 @@ namespace FHTest
[TestMethod]
public async Task AllReplicaRules_EnsureWellFormedRules_QueryInitialized_Successful()
{
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
Guid partition = partitions[0].PartitionInformation.Id;
var replicas = await fabricClient.QueryManager.GetReplicaListAsync(partition);
Replica replica = replicas[0];
long replicaId = replica.Id;
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test",
EntityType = EntityType.Partition,
PartitionId = Guid.NewGuid().ToString(),
ApplicationName = "fabric:/TestApp42",
EntityType = EntityType.Service,
NodeName = NodeName,
Code = SupportedErrorCodes.AppErrorMemoryMB,
HealthState = HealthState.Warning,
ServiceName = "fabric:/test0/service0"
PartitionId = partition.ToString(),
ReplicaId = replicaId,
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
Property = "TestApp42_ChildProcessCreator_MemoryMB",
ProcessName = "ChildProcessCreator",
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
Value = 1024.0
};
repairData.RepairPolicy = new RepairPolicy
@ -515,18 +535,25 @@ namespace FHTest
{
string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "testrules_wellformed.guan");
string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token);
FabricHealerManager.CurrentlyExecutingLogicRulesFileName = "testrules_wellformed.guan";
List<string> repairRules = FabricHealerManager.ParseRulesFile(rules);
var partitions = await fabricClient.QueryManager.GetPartitionListAsync(new Uri("fabric:/TestApp42/ChildProcessCreator"));
Guid partition = partitions[0].PartitionInformation.Id;
// This will be the data used to create a repair task.
var repairData = new TelemetryData
{
ApplicationName = "fabric:/test0",
ApplicationName = "fabric:/TestApp42",
EntityType = EntityType.Service,
NodeName = NodeName,
Metric = "Memory",
HealthState = HealthState.Warning,
Code = SupportedErrorCodes.AppErrorMemoryMB,
ServiceName = "fabric:/test0/service0",
Value = 42,
ReplicaId = default,
PartitionId = default
HealthState = HealthState.Warning,
PartitionId = partition.ToString(),
Source = $"AppObserver({SupportedErrorCodes.AppErrorMemoryMB})",
Property = "TestApp42_ChildProcessCreator_MemoryMB",
ProcessName = "ChildProcessCreator",
ServiceName = "fabric:/TestApp42/ChildProcessCreator",
Value = 1024.0
};
repairData.RepairPolicy = new RepairPolicy

Просмотреть файл

@ -127,6 +127,7 @@ Mitigate(AppName="fabric:/MemoryStress", MetricName="MemoryPercent", MetricValue
## Memory - Megabytes In Use for Any SF Service Process belonging to the specified SF Applications. 5 repairs within a 5-hour window.
Mitigate(AppName="fabric:/ContainerFoo", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
Mitigate(AppName="fabric:/ContainerFoo2", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
Mitigate(AppName="fabric:/TestApp42", MetricName="MemoryMB") :- TimeScopedRestartCodePackage(5, 05:00:00).
## Note the constraint on HealthState in the head of the rule below, which only applies to one service, fabric:/Voting/VotingData, in this example (just change the fabric Uri for your target).
## This is important when you have both Warning and Error thresholds specified for some service for some metric in FabricObserver. You would do that

Просмотреть файл

@ -26,7 +26,7 @@ Mitigate() :- GetRepairHistory(?repairCount, 01:00:00), _mitigate(?repairCount).
## This means that these predicates either succeed (pass true back) or fail (pass false back) as the result of their execution. So, if one fails, then the next rule will be run, etc.
## Try this.
_mitigate(?count) :- ?count < 4, RestartReplica(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:10:00).
_mitigate(?count) :- ?count < 400, RestartReplica(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:10:00).
## Else, try this.
_mitigate(?count) :- ?count < 4, RestartCodePackage(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:15:00, MaxExecutionTime=00:30:00).

Просмотреть файл

@ -47,7 +47,7 @@ Mitigate() :- CheckInsideScheduleInterval(00:10:00), !.
Mitigate() :- CheckInsideNodeProbationPeriod(00:30:00), !.
## Don't proceed if the target node hasn't been in Error (including cyclic Up/Down) state for at least two hours.
Mitigate() :- CheckInsideHealthStateMinDuration(00:01:00), !.
Mitigate() :- CheckInsideHealthStateMinDuration(02:00:00), !.
## For certain environments, the correct mitigation is to deactivate the target node. The below rule schedules a node deactivation (intent is Pause) repair.
Mitigate(Source=?source, Property=?property) :- match(?source, "EventLogWatchdog"), match(?property, "CriticalMachineFailure"),

Просмотреть файл

@ -78,7 +78,7 @@ namespace FabricHealer.Repair.Guan
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"{ruleFileName}#{lineNumber}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
$"{ruleFileName}#{lineNumber + 1}_{RepairData.RepairPolicy.ProcessName ?? RepairData.NodeName}",
$"Executing logic rule \'{rule}\'",
FabricHealerManager.Token);
}

Просмотреть файл

@ -536,7 +536,7 @@ namespace FabricHealer.Repair
await FabricHealerManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
LogLevel.Info,
$"{ruleFileName}#{lineNumber}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
$"{ruleFileName}#{lineNumber + 1}_{repairData.RepairPolicy.ProcessName ?? repairData.NodeName}",
$"Executing logic rule \'{rule}\'",
FabricHealerManager.Token);