Re-impl (DEV): MachineRepair.

2022-08-30 18:55:44 -07:00 · 2022-08-30 18:55:44 -07:00 · 698bc2b3cc
--- a/FHTest/FHUnitTests.cs
+++ b/FHTest/FHUnitTests.cs
@ -285,7 +285,7 @@ namespace FHTest
            functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, repairTaskManager, executorData, repairTaskEngine, repairData));
            functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, repairTaskManager, repairData));
            functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, repairTaskManager, repairData));
-            functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, repairTaskManager, repairData));
+            functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, repairTaskManager, repairData));

            // Parse rules
            Module module = Module.Parse("external", repairRules, functorTable);
--- a/FabricHealer/FabricHealerManager.cs
+++ b/FabricHealer/FabricHealerManager.cs
@ -356,7 +356,7 @@ namespace FabricHealer

                    await Task.Delay(
                        TimeSpan.FromSeconds(
-                            ConfigSettings.ExecutionLoopSleepSeconds > 0 ? ConfigSettings.ExecutionLoopSleepSeconds : 10), Token);      
+                            ConfigSettings.HealthCheckIntervalInSeconds > 0 ? ConfigSettings.HealthCheckIntervalInSeconds : 10), Token);      
                }

                RepairLogger.LogInfo("Shutdown signaled. Stopping.");
@ -1330,7 +1330,7 @@ namespace FabricHealer
                repairTaskManager.DetectedHealthEvents.Add(evt);

                // Start the repair workflow.
-                await repairTaskManager.StartRepairWorkflowAsync((TelemetryData)repairData, repairRules, Token);
+                await repairTaskManager.StartRepairWorkflowAsync(repairData, repairRules, Token);
            }
        }

@ -1357,11 +1357,11 @@ namespace FabricHealer
                Token.ThrowIfCancellationRequested();
            
                var nodeHealth = await FabricClientSingleton.HealthManager.GetNodeHealthAsync(node.NodeName);
-                var observerHealthEvents =
+                var nodeHealthEvents =
                    nodeHealth.HealthEvents.Where(
                                s => (s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error));
                
-                foreach (var evt in observerHealthEvents)
+                foreach (var evt in nodeHealthEvents)
                {
                    Token.ThrowIfCancellationRequested();

@ -1373,7 +1373,11 @@ namespace FabricHealer
                        continue;
                    }

+                    // TODO: Remove this hard requirement (TelemetryData-only). FH can just read health event data and learn what the problem is if FO/FH Proxy did not generate the health event. This is important
+                    // for cases where customers are not using FO or FHProxy, but want to use FH.
                    // Check to see if the event Description is a serialized instance of TelemetryData, which would mean the health report was generated in a supported way.
+                    // In the case where there is no TelemetryData involved, create a new TelemetryData instance and populate it with the minimum set of facts required to accomplish the goal.
+                    // This is trivial for the Machine repair case, but will get more complicated for other entities. That said, it is very doable.
                    if (!JsonSerializationUtility.TryDeserialize(evt.HealthInformation.Description, out TelemetryData repairData))
                    {
                        continue;
@ -1423,7 +1427,7 @@ namespace FabricHealer
                        continue;
                    }

-                    // Get repair rules for supported source Observer.
+                    // Get repair rules for supplied facts (TelemetryData).
                    var repairRules = GetRepairRulesForTelemetryData(repairData);

                    if (repairRules == null || repairRules.Count == 0)
--- a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan
+++ b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan
@ -79,8 +79,10 @@ Mitigate(AppName="fabric:/PortEater42", MetricName="EphemeralPorts", MetricValue
 ## and where at least 3 health events identifying this problem were produced in the last 15 minutes. This is useful to ensure you don't mitigate a transient (short-lived)
 ## problem as they will self-correct.

-Mitigate(AppName="fabric:/CpuStress", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 15,
-	GetHealthEventHistory(?HealthEventCount, 00:15:00),
+## How long has it been in unhealthy state and how long has it not been in unhealthy state (healthy state).
+
+Mitigate(ServiceName="fabric:/CpuStress/CpuStressor", MetricName="CpuPercent", MetricValue=?MetricValue) :- ?MetricValue >= 85,
+	GetHealthEventHistory(?HealthEventCount, 00:30:00),
 	?HealthEventCount >= 3,
 	TimeScopedRestartCodePackage(1, 00:15:00).

@ -162,7 +164,7 @@ Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPortsPercent", MetricVa
 Mitigate(AppName=?AppName, MetricName="Threads", MetricValue=?MetricValue) :- ?AppName != "fabric:/FabricObserver", ?MetricValue >= 400, TimeScopedRestartCodePackage(5, 05:00:00).

 ## Threads - Any app service. 5 repairs within 5 hour window. This means if FO warns on Thread count, then heal. There are no conditional checks (on MetricValue) to take place.
-## Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).
+Mitigate(MetricName="Threads") :- TimeScopedRestartCodePackage(5, 05:00:00).

 ## Generic rule for restarting any service in Warning or Error. This means any service that is in Error or Warning state and
 ## also specified in the serialized TelemetryData instance that forms the Description of the related Service level Health Event will be restarted.
@ -186,5 +188,5 @@ TimeScopedRestartReplica(?count, ?time) :- GetRepairHistory(?repairCount, ?time)
 ## See below for an example using both optional arguments. Named arguments are just used for clarity below; you could also just specify RestartCodePackage(true, 00:10:00), for example.
 ## Note: It's up to you to decide if you want RepairManager to conduct pre and post health checks.

-TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:10:00).
+TimeScopedRestartCodePackage() :- RestartCodePackage(DoHealthChecks=false, MaxWaitTimeForHealthStateOk=00:10:00).
 TimeScopedRestartReplica() :- RestartReplica(DoHealthChecks=true, MaxWaitTimeForHealthStateOk=00:01:00).
--- a/FabricHealer/PackageRoot/Config/LogicRules/MachineRules.guan
+++ b/FabricHealer/PackageRoot/Config/LogicRules/MachineRules.guan
@ -1,6 +1,6 @@
-## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today.
+## Logic rules for Machine level repairs in the cluster. Only OS reboot is supported today for VMSS-managed clusters.

-## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, Renamed for brevity by FH.
+## Applicable Named Arguments for Mitigate. Corresponding facts are supplied by FabricObserver or FabricHealerProxy, Renamed for brevity by FH.
 ## | Argument Name             | Definition                                                                                   |
 ## |---------------------------|----------------------------------------------------------------------------------------------|
 ## | NodeName                  | Name of the node                                                                             | 
@ -22,37 +22,75 @@
 ## | FileHandles (Linux)       | 
 ## | FileHandlesPercent (Linux)| 

+## Supported repair action names.
+## | Name                         |
+## |------------------------------|
+## | System.Reboot                |
+## | System.ReimageOS             |
+## | System.FullReimage           |
+## | System.Azure.HostReboot      |
+## | System.Azure.HostRepaveData  |
+
 ## First, check if we are inside run interval. If so, then cut (!).
 ## This is commented out by default. Just uncomment and set the global run interval for VM level repairs to suit your needs.

 ## Mitigate() :- CheckInsideRunInterval(02:00:00), !.

-## TimeScopedRestartVM is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window. 
+## TimeScopedRebootMachine is an internal predicate to check for the number of times a VM reboot repair has run to completion within a supplied time window. 
 ## If Completed VM Repair count is less then supplied value, then run RestartVM mitigation.

-TimeScopedRestartVM(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
+TimeScopedRebootMachine(?count, ?time) :- GetRepairHistory(?repairCount, ?time),
    ?repairCount < ?count, 
-    RestartVM().
+    ScheduleMachineRepair("System.Reboot").

 ## Percent Memory in Use (of total physical).

-Mitigate(MetricName="MemoryPercent", MetricValue=?MetricValue) :- ?MetricValue >= 95,
+Mitigate(MetricName=MemoryPercent, MetricValue=?MetricValue) :- ?MetricValue >= 95,
 	GetHealthEventHistory(?HealthEventCount, 00:15:00),
 	?HealthEventCount >= 3,
-    TimeScopedRestartVM(4, 08:00:00).
-
+    TimeScopedRebootMachine(4, 08:00:00).

 ## File Handles/FDs. Linux-only.
 ## Percent Allocated, System-wide.

-Mitigate(MetricName="FileHandlesPercent", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 95,
+Mitigate(MetricName=FileHandlesPercent, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 95,
 	GetHealthEventHistory(?HealthEventCount, 00:15:00),
 	?HealthEventCount >= 3,
-    TimeScopedRestartVM(2, 08:00:00).
+    TimeScopedRebootMachine(2, 08:00:00).

 ## Total Allocated, System-wide.

-Mitigate(MetricName="FileHandles", MetricValue=?MetricValue, OS="Linux") :- ?MetricValue >= 1000000,
+Mitigate(MetricName=FileHandles, MetricValue=?MetricValue, OS=Linux) :- ?MetricValue >= 1000000,
 	GetHealthEventHistory(?HealthEventCount, 00:15:00),
 	?HealthEventCount >= 3,
-    TimeScopedRestartVM(2, 08:00:00).
+    TimeScopedRebootMachine(2, 08:00:00). 
+
+## Reboot/Reimage. The rules below satisfy the following requirements: 
+## Repair throttling (MaxOutstandingRepairTasks), time-in-Error checking (pre-probation - GetCurrentEntityHealthStateDuration), 
+## cap repair job attempts within time intervals (how many times to try a repair before trying something else, for example, escalation..), 
+## time-to-wait after repair completes (post-probation - ProbationToHealthyWaitDurationPostRepair), 
+## attempt scheduling disk reimage if rebooting didn't work (user-configured repair escalation).
+
+## Logic workflow. Note: Since we are in this rule file, the context is already known to be Machine. FH has already determined what the facts are.
+## If Health State is Error, then proceed. This constraint is in the head of the rule for convenience and readability.
+## How long has the node been in Error state (probation)? 
+## If at least 2 hours, then continue. Else, stop processing rules (end cut (!)).
+## How many times has this machine repair been run in the last 4 hours? If less than twice, continue. This is added logic, does not exist in RPE. Like all of this, customer owns the configuration.
+## Only schedule the repair if there are fewer than 2 machine-level repairs currently in flight in the cluster.
+## "Currently in flight" means RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing.
+## Wait 30 mins for green node after IS completes execution and RM has restored node (no health checks for Error state, by default. Only safety checks.)
+## If any of the sub-rules do not succeed, then Guan will go to the next rule, which in this case will attempt to schedule a Disk reimage repair job with the same logical constraints/workflow.
+
+Mitigate(NodeName=?NodeName, HealthState=Error) :- GetCurrentEntityHealthStateDuration(?HealthStateDuration, Entity=Machine, Target=?NodeName, State=Error),
+    ?HealthStateDuration <= 02:00:00, !.
+
+Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
+	?repairCount < 2,
+	ScheduleMachineRepair("System.Azure.HostReboot", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
+
+## Data disk reimage. Not supported on VMSS-managed virtual machines.
+Mitigate(NodeName=?NodeName, HealthState=Error) :- GetRepairHistory(?repairCount, 04:00:00),
+	?repairCount < 2,
+	ScheduleMachineRepair("System.Azure.HostRepaveData", MaxOutstandingRepairTasks=2, ProbationToHealthyWaitDurationPostRepair=00:30:00).
+
+## Human intervention is required (Triage)
--- a/FabricHealer/PackageRoot/Config/Settings.xml
+++ b/FabricHealer/PackageRoot/Config/Settings.xml
@ -1,57 +1,110 @@
 <?xml version="1.0" encoding="utf-8" ?>
 <Settings xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/2011/01/fabric">
-  <Section Name="RepairManagerConfiguration">
-    <!-- ***Overridable Parameters***
-         These must be set in ApplicationManifest.xml -->
-    <Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="" MustOverride="true" />
-    <Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
-    <Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
-    <Parameter Name="EnableETW" Value="" MustOverride="true" />
-    <Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
-    <Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
-	<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
-    <!-- Folder name for local log output. You can use a full path or just a folder name. -->
-    <Parameter Name="LocalLogPath" Value="" MustOverride="true" />
+	<Section Name="RepairManagerConfiguration">
+	<!-- ***Overridable Parameters*** These must be set in ApplicationManifest.xml -->
+		<!-- Interval in seconds for how often FabricHealer wakes up and scans health states to schedule repairs. -->
+		<Parameter Name="HealthCheckIntervalInSeconds" Value="" MustOverride="true" />
+		<Parameter Name="EnableVerboseLogging" Value="" MustOverride="true" />
+		<Parameter Name="EnableTelemetry" Value="" MustOverride="true" />
+		<Parameter Name="EnableETW" Value="" MustOverride="true" />
+		<!-- Big Red Button: You can turn FabricHealer on and off with a versionless parameter-only application upgrade. -->
+		<Parameter Name="EnableAutoMitigation" Value="" MustOverride="true" />
+		<Parameter Name="EnableOperationalTelemetry" Value="" MustOverride="true" />
+		<Parameter Name="EnableRollingServiceRestarts" Value="" MustOverride="true" />
+		<!-- Folder name for local log output. You can use a full path or just a folder name. -->
+		<Parameter Name="LocalLogPath" Value="" MustOverride="true" />
    
-    <!-- ***Non-Overridable Parameters*** These must be set in this file. -->
+		<!-- ***Non-Overridable Parameters*** These must be set in this file. -->
 	  
-    <!-- Default timeout for async SF API calls. -->
-    <Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
-    <!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
-    <Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
-    <!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
-    <Parameter Name="AppInsightsInstrumentationKey" Value="" />
-    <!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
-    <Parameter Name="LogAnalyticsWorkspaceId" Value="" />
-    <!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
-    <Parameter Name="LogAnalyticsSharedKey" Value="" />
-    <!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
-    <Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
-  </Section>
+		<!-- Default timeout for async SF API calls. -->
+		<Parameter Name="AsyncOperationTimeoutSeconds" Value="120" />
+		<!-- Required-If EnableTelemetry is set to true in ApplicationManifest. Values can be either AzureApplicationInsights or AzureLogAnalytics -->
+		<Parameter Name="TelemetryProvider" Value="AzureLogAnalytics" />
+		<!-- Required-If TelemetryProvider is AzureApplicationInsights. -->
+		<Parameter Name="AppInsightsInstrumentationKey" Value="" />
+		<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
+		<Parameter Name="LogAnalyticsWorkspaceId" Value="" />
+		<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
+		<Parameter Name="LogAnalyticsSharedKey" Value="" />
+		<!-- Required-If TelemetryProvider is AzureLogAnalytics. -->
+		<Parameter Name="LogAnalyticsLogType" Value="FabricHealer" />
+	</Section>
+
+	<!-- Repair Policies - Overridable Parameters. Must be set in ApplicationManifest.xml. -->
+	<Section Name="AppRepairPolicy">
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" />
+	</Section>
+	<Section Name="DiskRepairPolicy">
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
+	</Section>
+	<Section Name="FabricNodeRepairPolicy">
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
+	</Section>
+	<Section Name="ReplicaRepairPolicy">
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
+	</Section>
+	<Section Name="SystemServiceRepairPolicy">
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
+	</Section>
 	
-  <!-- Repair Policies.Overridable Parameters. Must be set in ApplicationManifest.xml. -->
-  <Section Name="AppRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" />
-  </Section>
-  <Section Name="DiskRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
-  </Section>
-  <Section Name="FabricNodeRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
-  </Section>
-  <Section Name="ReplicaRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
-  </Section>
-  <Section Name="SystemServiceRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
-  </Section>
-  <Section Name="MachineRepairPolicy">
-    <Parameter Name="Enabled" Value="" MustOverride="true" />
-    <Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
-  </Section>
+	<!-- Machine Repair. -->
+	<Section Name="MachineRepairPolicy">
+		<!-- FabricHealer is not allowed to schedule a machine repair more often than this interval.
+		     This prevents FabricHealer from scheduling too many machine repairs in a short time. -->
+		<Parameter Name="ActionSchedulingIntervalInSeconds" Value="600" />
+		<Parameter Name="ProbationToFailingWaitDurationInSeconds" Value="7200" />
+		<Parameter Name="ProbationToHealthyWaitDurationInSeconds" Value="1800" />
+		<Parameter Name="MinimumHealthyDurationInSeconds" Value="300" />
+		<Parameter Name="Enabled" Value="" MustOverride="true" />
+		<Parameter Name="LogicRulesConfigurationFile" Value="" MustOverride="true" />
+	</Section>
+	<!-- Supported Machine level repair actions. -->
+	<!-- RM Repair action strings (built into RM).
+		========================================================================
+		Alias               VM (default)             Host 
+        ========================================================================
+		System.Reboot       System.Azure.Reboot      System.Azure.HostReboot 
+		System.ReimageOS    System.Azure.ReimageOS 
+		System.FullReimage  System.Azure.RepaveData  System.Azure.HostRepaveData
+		========================================================================-->
+	<Section Name="NodeRepairActionList">
+		<!-- This is the name of the repair that will be provided to RM by FH when scheduling the repair. -->
+		<Parameter Name="System.Azure.HostReboot" Value="RepairPolicyHostReboot" />
+		<!-- This is not supported for VMSS-based machine clusters. -->
+		<!--<Parameter Name="System.Azure.HostRepaveData" Value="RepairPolicyHostRepaveData" />-->
+		<Parameter Name="ManualTriageNeeded" Value="RepairPolicyTriage" />
+	</Section>
+	<Section Name="RepairPolicyHostReboot">
+		<!-- Required - How many repair tasks of this kind can be scheduled concurrently. -->
+		<Parameter Name="MaxOutstandingRepairTasks" Value="2"  />
+		<!-- Required - Probationary period to failing after this repair -->
+		<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
+		<!-- Required - Probationary period to healthy after this repair -->
+		<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
+		<!-- Required - Minimum time of error state before this repair is applied -->
+		<Parameter Name="PolicyActionTimeInSeconds" Value="7200" />
+		<!-- This is optional and defaults to true -->
+		<Parameter Name="IsEnabled" Value="true"  />
+	</Section>
+	<!-- This is not supported for VMSS-based machine clusters. -->
+	<Section Name="RepairPolicyHostRepaveData">
+		<Parameter Name="MaxOutstandingRepairTasks" Value="2" />
+		<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
+		<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
+		<Parameter Name="PolicyActionTimeInSeconds" Value="14400" />
+		<Parameter Name="IsEnabled" Value="false" />
+	</Section>
+	<Section Name="RepairPolicyTriage">
+		<Parameter Name="MaxOutstandingRepairTasks" Value="2" MustOverride="false" />
+		<Parameter Name="ProbationToFailingWaitDurationPostRepairInSeconds" Value="7200" />
+		<Parameter Name="ProbationToHealthyWaitDurationPostRepairInSeconds" Value="1800" />
+		<Parameter Name="PolicyActionTimeInSeconds" Value="9000" />
+		<Parameter Name="IsEnabled" Value="true" />
+	</Section>
+	<!-- End Machine Repair -->
 </Settings>
--- a/FabricHealer/Repair/FabricRepairTasks.cs
+++ b/FabricHealer/Repair/FabricRepairTasks.cs
@ -6,6 +6,8 @@
 using System;
 using System.Collections.Generic;
 using System.Fabric;
+using System.Fabric.Description;
+using System.Fabric.Health;
 using System.Fabric.Query;
 using System.Fabric.Repair;
 using System.Linq;
@ -154,19 +156,20 @@ namespace FabricHealer.Repair

            await Task.Delay(new Random().Next(100, 1500));

-            var isRepairAlreadyInProgress =
-                    await repairTaskEngine.IsFHRepairTaskRunningAsync(executorName, repairData, token);
+            bool isRepairInProgress = await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, token);

-            if (isRepairAlreadyInProgress)
+            if (isRepairInProgress)
            {
                return null;
            }

            switch (repairAction)
            {
-                case RepairActionType.RestartVM:
+                case RepairActionType.RebootMachine:
+                case RepairActionType.ReimageDisk:
+                case RepairActionType.ReimageOS:

-                    repairTask = await repairTaskEngine.CreateVmRebootISRepairTaskAsync(repairData, executorName, token);
+                    repairTask = await repairTaskEngine.CreateMachineRepairTaskAsync(repairData, executorName, token);
                    break;

                case RepairActionType.DeleteFiles:
@ -209,7 +212,7 @@ namespace FabricHealer.Repair
            try
            {
                var isRepairAlreadyInProgress =
-                    await repairTaskEngine.IsFHRepairTaskRunningAsync(repairTask.Executor, repairData, token);
+                    await repairTaskEngine.IsRepairInProgressAsync(repairTask.Executor, repairData, token);

                if (!isRepairAlreadyInProgress)
                {
@ -395,5 +398,69 @@ namespace FabricHealer.Repair

            return count;
        }
+
+        /// <summary>
+        /// Gets how long the target entity has continuously been in the supplied health state, based on its unexpired health events.
+        /// Only Machine and Node entities are currently supported; every other entity type yields <see cref="TimeSpan.MinValue"/>.
+        /// </summary>
+        /// <param name="entityType">Type of the target entity.</param>
+        /// <param name="entityFilter">Identifies the target entity. For Machine and Node this is the node name.</param>
+        /// <param name="state">Health state whose duration is requested (Ok, Warning or Error).</param>
+        /// <param name="token">Cancellation token passed to the health query.</param>
+        /// <returns>Time spent in <paramref name="state"/>, or <see cref="TimeSpan.MinValue"/> when it cannot be determined.</returns>
+        internal static async Task<TimeSpan> GetEntityCurrentHealthStateDurationAsync(EntityType entityType, string entityFilter, HealthState state, CancellationToken token)
+        {
+            // Application and Service are not implemented yet, and no other entity type is supported.
+            if (entityType != EntityType.Machine && entityType != EntityType.Node)
+            {
+                return TimeSpan.MinValue;
+            }
+
+            HealthStateFilter stateFilter;
+            Func<HealthEvent, DateTime> enteredStateAt;
+
+            switch (state)
+            {
+                case HealthState.Error:
+                    stateFilter = HealthStateFilter.Error;
+                    enteredStateAt = evt => evt.LastErrorTransitionAt;
+                    break;
+
+                case HealthState.Warning:
+                    stateFilter = HealthStateFilter.Warning;
+                    enteredStateAt = evt => evt.LastWarningTransitionAt;
+                    break;
+
+                case HealthState.Ok:
+                    stateFilter = HealthStateFilter.Ok;
+                    enteredStateAt = evt => evt.LastOkTransitionAt;
+                    break;
+
+                default:
+                    // Invalid/Unknown previously mapped to HealthStateFilter.None, which matches no events.
+                    return TimeSpan.MinValue;
+            }
+
+            var queryDesc = new NodeHealthQueryDescription(entityFilter)
+            {
+                EventsFilter = new HealthEventsFilter { HealthStateFilterValue = stateFilter }
+            };
+
+            var nodeHealth =
+                await FabricHealerManager.FabricClientSingleton.HealthManager.GetNodeHealthAsync(
+                            queryDesc, FabricHealerManager.ConfigSettings.AsyncTimeout, token);
+
+            if (nodeHealth == null || nodeHealth.HealthEvents.Count == 0)
+            {
+                return TimeSpan.MinValue;
+            }
+
+            // SourceUtcTimestamp is refreshed each time the source re-reports, so it measures report age rather than time in state.
+            // The Last*TransitionAt value of an event that is currently in the state is when that event first entered it,
+            // and the earliest such value across all unexpired events is taken as the time the entity entered the state.
+            var transitions = nodeHealth.HealthEvents.Where(evt => !evt.IsExpired).Select(enteredStateAt).ToList();
+
+            return transitions.Count > 0 ? DateTime.UtcNow.Subtract(transitions.Min()) : TimeSpan.MinValue;
+        }
    }
 }
--- a/FabricHealer/Repair/Guan/GetCurrentEntityHealthStateDurationPredicateType.cs
+++ b/FabricHealer/Repair/Guan/GetCurrentEntityHealthStateDurationPredicateType.cs
@ -0,0 +1,83 @@
+// ------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License (MIT). See License.txt in the repo root for license information.
+// ------------------------------------------------------------
+
+using System;
+using System.Threading.Tasks;
+using Guan.Logic;
+using FabricHealer.Utilities.Telemetry;
+using System.Fabric.Health;
+
+namespace FabricHealer.Repair.Guan
+{
+    /// <summary>Guan predicate that binds its first argument to how long the target entity has been in a given health state.</summary>
+    /// <remarks>Rule usage: GetCurrentEntityHealthStateDuration(?HealthStateDuration, Entity=Machine, Target=?NodeName, State=Error).</remarks>
+    public class GetCurrentEntityHealthStateDurationPredicateType : PredicateType
+    {
+        private static RepairTaskManager RepairTaskManager;
+        private static GetCurrentEntityHealthStateDurationPredicateType Instance;
+
+        private class Resolver : GroundPredicateResolver
+        {
+            public Resolver(CompoundTerm input, Constraint constraint, QueryContext context) : base(input, constraint, context, 1)
+            {
+            }
+
+            // Argument positions: [0] output variable, [1] entity type, [2] target (node name), [3] health state.
+            // Values are read with 'as string' so that a non-string term is reported as a GuanException, not an InvalidCastException.
+            protected override async Task<Term> GetNextTermAsync()
+            {
+                if (Input.Arguments.Count != 4)
+                {
+                    throw new GuanException("GetCurrentEntityHealthStateDuration predicate requires 4 arguments.");
+                }
+
+                if (!Enum.TryParse(Input.Arguments[1].Value.GetEffectiveTerm().GetObjectValue() as string, out EntityType entityType))
+                {
+                    throw new GuanException("The second argument of GetCurrentEntityHealthStateDuration must be a valid EntityType value (Application, Service, Node, Machine, etc..)");
+                }
+
+                string target = Input.Arguments[2].Value.GetEffectiveTerm().GetObjectValue() as string;
+                if (string.IsNullOrWhiteSpace(target))
+                {
+                    throw new GuanException("The third argument of GetCurrentEntityHealthStateDuration must be the name of the target entity.");
+                }
+
+                if (!Enum.TryParse(Input.Arguments[3].Value.GetEffectiveTerm().GetObjectValue() as string, out HealthState state))
+                {
+                    throw new GuanException("The fourth argument of GetCurrentEntityHealthStateDuration must be a valid HealthState value (Error, Warning, etc..)");
+                }
+
+                TimeSpan duration = await FabricRepairTasks.GetEntityCurrentHealthStateDurationAsync(entityType, target, state, RepairTaskManager.Token);
+                var result = new CompoundTerm(this.Input.Functor);
+                result.AddArgument(new Constant(duration), "0");
+                return result;
+            }
+        }
+
+        // The RepairTaskManager reference is replaced on every call; the predicate instance itself is created only once.
+        public static GetCurrentEntityHealthStateDurationPredicateType Singleton(string name, RepairTaskManager repairTaskManager)
+        {
+            RepairTaskManager = repairTaskManager;
+            return Instance ??= new GetCurrentEntityHealthStateDurationPredicateType(name);
+        }
+
+        private GetCurrentEntityHealthStateDurationPredicateType(string name) : base(name, true, 1)
+        {
+        }
+
+        public override PredicateResolver CreateResolver(CompoundTerm input, Constraint constraint, QueryContext context)
+        {
+            return new Resolver(input, constraint, context);
+        }
+
+        public override void AdjustTerm(CompoundTerm term, Rule rule)
+        {
+            if (term.Arguments.Count == 0 || term.Arguments[0].Value.IsGround())
+            {
+                throw new GuanException("The first argument of GetCurrentEntityHealthStateDuration must be a variable: {0}", term);
+            }
+        }
+    }
+}
--- a/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs
+++ b/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs
@ -70,7 +70,7 @@ namespace FabricHealer.Repair.Guan
                // Block attempts to create node-level repair tasks if one is already running in the cluster.
                var repairTaskEngine = new RepairTaskEngine();
                var isNodeRepairAlreadyInProgress =
-                    await repairTaskEngine.IsFHRepairTaskRunningAsync(
+                    await repairTaskEngine.IsRepairInProgressAsync(
                            RepairTaskEngine.FabricHealerExecutorName,
                            RepairData,
                            RepairTaskManager.Token);
--- a/FabricHealer/Repair/Guan/ScheduleMachineRepairPredicateType.cs
+++ b/FabricHealer/Repair/Guan/ScheduleMachineRepairPredicateType.cs
@ -11,11 +11,11 @@ using System.Threading.Tasks;

 namespace FabricHealer.Repair.Guan
 {
-    public class RestartMachinePredicateType : PredicateType
+    public class ScheduleMachineRepairPredicateType : PredicateType
    {
        private static RepairTaskManager RepairTaskManager;
        private static TelemetryData RepairData;
-        private static RestartMachinePredicateType Instance;
+        private static ScheduleMachineRepairPredicateType Instance;

        private class Resolver : BooleanPredicateResolver
        {
@ -27,36 +27,50 @@ namespace FabricHealer.Repair.Guan

            protected override async Task<bool> CheckAsync()
            {
-                RepairData.RepairPolicy.RepairAction = RepairActionType.RestartVM;
+                if (Input.Arguments.Count == 0)
+                {
+                    throw new GuanException("You must provide a repair action name for Infrastructure-level repairs as first argument.");
+                }
+
+                // Repair action name is required.
+                string repairAction = (string)Input.Arguments[0].Value.GetObjectValue();
+
+                /*
+                    public const string SystemReboot = "System.Reboot";
+                    public const string SystemReimageOS = "System.ReimageOS ";
+                    public const string SystemFullReimage = "System.FullReimage";
+                    public const string SystemHostReboot = "System.Azure.HostReboot";
+                    public const string SystemHostRepaveData = "System.Azure.HostRepaveData";
+                */
+
+                switch (repairAction)
+                {
+                    case RepairConstants.SystemReboot:
+                    case RepairConstants.SystemHostReboot:
+                        RepairData.RepairPolicy.RepairAction = RepairActionType.RebootMachine;
+                        break;
+
+                    case RepairConstants.SystemReimageOS:
+                    case RepairConstants.SystemFullReimage:
+                    case RepairConstants.SystemHostRepaveData:
+                        RepairData.RepairPolicy.RepairAction = RepairActionType.ReimageOS;
+                        break;
+
+                    default:
+                        throw new GuanException($"Unrecognized repair action name: {repairAction}. Repair actions are case sensitive.");
+                }

                // FH does not execute repairs for VM level mitigation. InfrastructureService (IS) does,
                // so, FH schedules VM repairs via RM and the execution is taken care of by IS (the executor).
-                // Block attempts to create duplicate repair tasks.
+                // Block attempts to create duplicate repair tasks or more than specified concurrent machine-level repairs.
                var repairTaskEngine = new RepairTaskEngine();
-                var isRepairAlreadyInProgress =
-                    await repairTaskEngine.IsFHRepairTaskRunningAsync(
-                            $"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
-                            RepairData,
-                            RepairTaskManager.Token);
-
-                if (isRepairAlreadyInProgress)
-                {
-                    string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress. Will not attempt repair at this time.";
-
-                    await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
-                            LogLevel.Info,
-                            $"RestartVMPredicateType::{RepairData.RepairPolicy.RepairId}",
-                            message,
-                            RepairTaskManager.Token);
-
-                    return false;
-                }
-
                int count = Input.Arguments.Count;
+                long maxConcurrentRepairs = 0;

                for (int i = 0; i < count; i++)
                {
                    var typeString = Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue().GetType().Name;
+
                    switch (typeString)
                    {
                        case "TimeSpan":
@ -67,13 +81,39 @@ namespace FabricHealer.Repair.Guan
                            RepairData.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue();
                            break;

+                        case "Int64":
+                            maxConcurrentRepairs = (long)Input.Arguments[i].Value.GetObjectValue();
+                            break;
+
                        default:
                            throw new GuanException($"Unsupported input: {Input.Arguments[i].Value.GetObjectValue().GetType()}");
                    }
                }

+                var isRepairAlreadyInProgress =
+                        await repairTaskEngine.IsRepairInProgressAsync(
+                                $"{RepairTaskEngine.InfrastructureServiceName}/{RepairData.NodeType}",
+                                RepairData,
+                                RepairTaskManager.Token,
+                                maxConcurrentRepairs);
+
+                if (isRepairAlreadyInProgress)
+                {
+                    string message = $"VM Repair {RepairData.RepairPolicy.RepairId} is already in progress" +
+                                     $"{(maxConcurrentRepairs > 0 ? " or max number of concurrent machine repairs has been reached" : "")}. " +
+                                     $"Will not attempt repair at this time.";
+
+                    await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
+                            LogLevel.Info,
+                            $"RestartMachinePredicateType::{RepairData.RepairPolicy.RepairId}",
+                            message,
+                            RepairTaskManager.Token);
+
+                    return false;
+                }
+
                bool success = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
-                                        () => RepairTaskManager.ExecuteRMInfrastructureRepairTask(
+                                        () => RepairTaskManager.ScheduleInfrastructureRepairTask(
                                                RepairData,
                                                RepairTaskManager.Token),
                                        RepairTaskManager.Token);
@ -81,15 +121,15 @@ namespace FabricHealer.Repair.Guan
            }
        }

-        public static RestartMachinePredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
+        /// <summary>
+        /// Returns the shared ScheduleMachineRepairPredicateType instance, creating it on first use.
+        /// The static RepairTaskManager and RepairData fields are overwritten on every call, so the values from the most recent call are the ones in effect.
+        /// </summary>
+        /// <param name="name">Predicate name; only used when the instance is first created.</param>
+        /// <param name="repairTaskManager">RepairTaskManager instance stored in the static RepairTaskManager field.</param>
+        /// <param name="repairData">TelemetryData instance stored in the static RepairData field.</param>
+        /// <returns>The single ScheduleMachineRepairPredicateType instance.</returns>
+        public static ScheduleMachineRepairPredicateType Singleton(string name, RepairTaskManager repairTaskManager, TelemetryData repairData)
        {
            RepairTaskManager = repairTaskManager;
            RepairData = repairData;

-            return Instance ??= new RestartMachinePredicateType(name);
+            return Instance ??= new ScheduleMachineRepairPredicateType(name);
        }

-        private RestartMachinePredicateType(string name)
+        private ScheduleMachineRepairPredicateType(string name)
                 : base(name, true, 0)
        {

--- a/FabricHealer/Repair/RepairAction.cs
+++ b/FabricHealer/Repair/RepairAction.cs
@ -22,6 +22,8 @@ namespace FabricHealer.Repair
        RestartFabricNode,
        RestartProcess,
        RestartReplica,
-        RestartVM,
+        RebootMachine,
+        ReimageDisk,
+        ReimageOS
    }
 }
--- a/FabricHealer/Repair/RepairConstants.cs
+++ b/FabricHealer/Repair/RepairConstants.cs
@ -29,7 +29,7 @@ namespace FabricHealer.Repair
        public const string EnableRollingServiceRestartsParameter = "EnableRollingServiceRestarts";
        public const string AppInsightsInstrumentationKeyParameter = "AppInsightsInstrumentationKey";
        public const string EnableETW = "EnableETW";
-        public const string HealthCheckLoopSleepTimeSeconds = "HealthCheckLoopSleepTimeSeconds";
+        public const string HealthCheckIntervalInSeconds = "HealthCheckIntervalInSeconds";
        public const string LocalLogPathParameter = "LocalLogPath";
        public const string AsyncOperationTimeout = "AsyncOperationTimeoutSeconds";
        public const string EnableFabricHealerOperationalTelemetry = "EnableOperationalTelemetry";
@ -72,7 +72,15 @@ namespace FabricHealer.Repair
        public const string RestartFabricNode = "RestartFabricNode";
        public const string RestartFabricSystemProcess = "RestartFabricSystemProcess";
        public const string RestartReplica = "RestartReplica";
-        public const string RestartVM = "RestartVM";
+        public const string ScheduleMachineRepair = "ScheduleMachineRepair";
+        public const string ScheduleDiskReimage = "ScheduleDiskReimage";
+
+        // Infra repair names (RM "commands").
+        public const string SystemReboot = "System.Reboot";
+        // Repair action names are matched exactly (case and whitespace), so the literal must not carry stray spaces.
+        public const string SystemReimageOS = "System.ReimageOS";
+        public const string SystemFullReimage = "System.FullReimage";
+        public const string SystemHostReboot = "System.Azure.HostReboot";
+        public const string SystemHostRepaveData = "System.Azure.HostRepaveData";

        // Helper Predicates.
        public const string CheckInsideRunInterval = "CheckInsideRunInterval";
--- a/FabricHealer/Repair/RepairExecutor.cs
+++ b/FabricHealer/Repair/RepairExecutor.cs
@ -956,42 +956,6 @@ namespace FabricHealer.Repair
            return true;
        }

-        /// <summary>
-        /// Returns a machine name string, given a fabric node name.
-        /// </summary>
-        /// <param name="nodeName">Fabric node name</param>
-        /// <param name="cancellationToken"></param>
-        internal async Task<string> GetMachineHostNameFromFabricNodeNameAsync(string nodeName, CancellationToken cancellationToken)
-        {
-            try
-            {
-                var nodes = await FabricHealerManager.FabricClientSingleton.QueryManager.GetNodeListAsync(
-                                   nodeName,
-                                   FabricHealerManager.ConfigSettings.AsyncTimeout,
-                                   cancellationToken);
-
-                Node targetNode = nodes.Count > 0 ? nodes[0] : null;
-
-                if (targetNode == null)
-                {
-                    return null;
-                }
-
-                string ipOrDnsName = targetNode.IpAddressOrFQDN;
-                var hostEntry = await Dns.GetHostEntryAsync(ipOrDnsName);
-                var machineName = hostEntry.HostName;
-
-                return machineName;
-            }
-            catch (Exception e) when (e is ArgumentException|| e is SocketException|| e is OperationCanceledException || e is TimeoutException)
-            {
-                FabricHealerManager.RepairLogger.LogWarning(
-                    $"Unable to determine machine host name from Fabric node name {nodeName}:{Environment.NewLine}{e}");
-            }
-
-            return null;
-        }
-
        /// <summary>
        /// Clears existing health warnings for target repair entity. This should only be called after a repair operation succeeds.
        /// </summary>
--- a/FabricHealer/Repair/RepairPolicy.cs
+++ b/FabricHealer/Repair/RepairPolicy.cs
@ -32,6 +32,14 @@ namespace FabricHealer.Repair
            get; set;
        }

+        /// <summary>
+        /// The name of the infrastructure repair action to provide to RM (Repair Manager) that IS (InfrastructureService) will execute.
+        /// When this is null, empty or whitespace, RepairTaskEngine falls back to its default machine reboot action when creating the machine repair task.
+        /// </summary>
+        public string InfrastructureRepairName
+        {
+            get; set;
+        }
+
        /// <summary>
        /// Maximum amount of time to check if health state of repaired target entity is Ok.
        /// </summary>
--- a/FabricHealer/Repair/RepairTaskEngine.cs
+++ b/FabricHealer/Repair/RepairTaskEngine.cs
@ -4,7 +4,9 @@
 // ------------------------------------------------------------

 using System;
+using System.ComponentModel.DataAnnotations;
 using System.Fabric.Description;
+using System.Fabric.Health;
 using System.Fabric.Query;
 using System.Fabric.Repair;
 using System.Linq;
@ -12,13 +14,12 @@ using System.Threading;
 using System.Threading.Tasks;
 using FabricHealer.Utilities;
 using FabricHealer.Utilities.Telemetry;
-using Newtonsoft.Json.Linq;

 namespace FabricHealer.Repair
 {
    public sealed class RepairTaskEngine
    {
-        public const string HostVMReboot = "System.Reboot";
+        public const string HostMachineReboot = "System.Reboot";
        public const string FHTaskIdPrefix = "FH";
        public const string AzureTaskIdPrefix = "Azure";
        public const string FabricHealerExecutorName = "FabricHealer";
@ -105,8 +106,14 @@ namespace FabricHealer.Repair
            return repairTasks;
        }

-        // This allows InfrastructureService to schedule and run reboot im concert with VMSS over MR.
-        public async Task<RepairTask> CreateVmRebootISRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
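+        /// <summary>
+        /// Creates a repair task for a machine-level (infrastructure) repair of the target node, to be carried out safely by SF's InfrastructureService.
+        /// The repair action defaults to System.Reboot unless repairData.RepairPolicy.InfrastructureRepairName is set.
+        /// </summary>
+        /// <param name="repairData">TelemetryData instance that identifies the target node (NodeName, NodeType) and carries the repair policy.</param>
+        /// <param name="executorName">Name of the repair executor (for machine repairs, presumably the InfrastructureService instance for the target node type).</param>
+        /// <param name="cancellationToken">CancellationToken.</param>
+        /// <returns>The created RepairTask, or null if the task is not created.</returns>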
+        public async Task<RepairTask> CreateMachineRepairTaskAsync(TelemetryData repairData, string executorName, CancellationToken cancellationToken)
        {
            // This constraint (MaxResults) is used just to make sure there is more than 1 node in the cluster. We don't need a list of all nodes.
            var nodeQueryDesc = new NodeQueryDescription
@ -114,6 +121,13 @@ namespace FabricHealer.Repair
                MaxResults = 3,
            };

+            string repairActionName = HostMachineReboot;
+
+            if (!string.IsNullOrWhiteSpace(repairData.RepairPolicy.InfrastructureRepairName))
+            {
+                repairActionName = repairData.RepairPolicy.InfrastructureRepairName;
+            }
+
            NodeList nodes = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                    () => FabricHealerManager.FabricClientSingleton.QueryManager.GetNodePagedListAsync(
                                            nodeQueryDesc,
@ -126,14 +140,14 @@ namespace FabricHealer.Repair
                return null;
            }

-            string taskId = $"{FHTaskIdPrefix}/{HostVMReboot}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
-            bool doHealthChecks = !SupportedErrorCodes.GetCodeNameFromErrorCode(repairData.Code).Contains("Error");
+            string taskId = $"{FHTaskIdPrefix}/{repairActionName}/{(uint)repairData.NodeName.GetHashCode()}/{repairData.NodeType}";
+            bool doHealthChecks = repairData.HealthState != HealthState.Error;

            // Error health state on target SF entity can block RM from approving the job to repair it (which is the whole point of doing the job).
            // So, do not do health checks if customer configures FO to emit Error health reports.
            // In general, FO should *not* be configured to emit Error events. See FO documentation.

-            var repairTask = new ClusterRepairTask(taskId, HostVMReboot)
+            var repairTask = new ClusterRepairTask(taskId, repairActionName)
            {
                Target = new NodeRepairTargetDescription(repairData.NodeName),
                Description = $"{repairData.RepairPolicy.RepairId}",
@ -146,25 +160,40 @@ namespace FabricHealer.Repair
            return repairTask;
        }

-        public async Task<bool> IsFHRepairTaskRunningAsync(string executorName, TelemetryData repairdData, CancellationToken token)
+        /// <summary>
+        /// Determines if a repair task is already in flight or if the max number of concurrent repairs has been reached for the target using the information specified in repairData instance.
+        /// </summary>
+        /// <param name="executorName">Name of the repair executor.</param>
+        /// <param name="repairData">TelemetryData instance.</param>
+        /// <param name="token">CancellationToken.</param>
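+        /// <param name="maxConcurrentRepairs">Optional: Max number of concurrent machine-level repairs (InfrastructureService executor) for the node type specified in repairData. The limit is only checked when the value is greater than 0; with the default of 0 the concurrency limit check is skipped.</param>
+        /// <returns>true if a matching repair is already in progress or the concurrent repair limit has been reached; otherwise, false.</returns>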
+        public async Task<bool> IsRepairInProgressAsync(string executorName, TelemetryData repairData, CancellationToken token, long maxConcurrentRepairs = 0)
        {
-            // All RepairTasks are prefixed with FH, regardless of repair target type (VM, fabric node, system service process, codepackage, replica).
-            // For VM-level repair, RM will create a new task for IS that replaces FH executor data with IS job info, but the original FH repair task will
-            // remain in an active state which will block any duplicate scheduling by another FH instance.
-            var currentFHRepairTasksInProgress =
-                            await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
-                                    FHTaskIdPrefix,
-                                    RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
-                                    executorName,
-                                    FabricHealerManager.ConfigSettings.AsyncTimeout,
-                                    token);
+            // All RepairTasks are prefixed with FH, regardless of repair target type (VM/Machine, Fabric node, system service process, code package, replica).
+            // For VM-level repairs, RM will create a new task for IS that replaces FH executor data with IS job info.
+            RepairTaskList repairTasksInProgress =
+                    await FabricHealerManager.FabricClientSingleton.RepairManager.GetRepairTaskListAsync(
+                            FHTaskIdPrefix,
+                            RepairTaskStateFilter.Active | RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
+                            executorName,
+                            FabricHealerManager.ConfigSettings.AsyncTimeout,
+                            token);

-            if (currentFHRepairTasksInProgress == null || currentFHRepairTasksInProgress.Count == 0)
+            if (repairTasksInProgress == null || repairTasksInProgress.Count == 0)
            {
                return false;
            }

-            foreach (var repair in currentFHRepairTasksInProgress)
+            // Throttling machine level repairs.
+            if (executorName == $"{InfrastructureServiceName}/{repairData.NodeType}" && 
+                maxConcurrentRepairs > 0 &&
+                repairTasksInProgress.Count(r => r.Executor == executorName) >= maxConcurrentRepairs)
+            {
+                return true;
+            }
+
+            foreach (var repair in repairTasksInProgress)
            {
                // This check is to see if there are any FH-as-executor repairs in flight.
                if (executorName == FabricHealerExecutorName)
@ -179,19 +208,17 @@ namespace FabricHealer.Repair
                        return false;
                    }

-                    // The node repair check ensures that only one node-level repair can take place in a cluster (no concurrent node restarts), by default.
-                    // FH is conservative, by design.
-                    if (repairdData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId ||
-                        executorData.RepairData.RepairPolicy.RepairAction == RepairActionType.RestartFabricNode)
+                    // This check ensures that only one repair can be scheduled at a time for the same target.
+                    if (repairData.RepairPolicy.RepairId == executorData.RepairData.RepairPolicy.RepairId)
                    {
                        return true;
                    }
                }
-                else if (repair.Executor == $"{InfrastructureServiceName}/{repairdData.NodeType}")
+                else if (repair.Executor == $"{InfrastructureServiceName}/{repairData.NodeType}")
                {
-                    // This would block scheduling any VM level operation (reboot) already in flight.
-                    // For IS repairs, unique id is stored in the repair task's Description property.
-                    if (repair.Description == repairdData.RepairPolicy.RepairId)
+                    // This would block rescheduling any VM level operation (reboot) that is already in flight.
+                    // NOTE: For Infrastructure-level repairs (IS is executor), unique id is stored in the repair task's Description property.
+                    if (repair.Description == repairData.RepairPolicy.RepairId)
                    {
                        return true;
                    }
--- a/FabricHealer/Repair/RepairTaskManager.cs
+++ b/FabricHealer/Repair/RepairTaskManager.cs
@ -151,7 +151,7 @@ namespace FabricHealer.Repair
            functorTable.Add(RestartFabricNodePredicateType.Singleton(RepairConstants.RestartFabricNode, this, repairExecutorData, repairTaskEngine, repairData));
            functorTable.Add(RestartFabricSystemProcessPredicateType.Singleton(RepairConstants.RestartFabricSystemProcess, this, repairData));
            functorTable.Add(RestartReplicaPredicateType.Singleton(RepairConstants.RestartReplica, this, repairData));
-            functorTable.Add(RestartMachinePredicateType.Singleton(RepairConstants.RestartVM, this, repairData));
+            functorTable.Add(ScheduleMachineRepairPredicateType.Singleton(RepairConstants.ScheduleMachineRepair, this, repairData));

            // Parse rules.
            Module module = Module.Parse("external", repairRules, functorTable);
@ -181,7 +181,7 @@ namespace FabricHealer.Repair
            compoundTerm.AddArgument(new Constant(repairData.NodeType), RepairConstants.NodeType);
            compoundTerm.AddArgument(new Constant(repairData.ObserverName), RepairConstants.ObserverName);
            compoundTerm.AddArgument(new Constant(repairData.OS), RepairConstants.OS);
-            compoundTerm.AddArgument(new Constant(Enum.GetName(typeof(ServiceKind), repairData.ServiceKind)), RepairConstants.ServiceKind);
+            compoundTerm.AddArgument(new Constant(repairData.ServiceKind), RepairConstants.ServiceKind);
            compoundTerm.AddArgument(new Constant(repairData.ServiceName), RepairConstants.ServiceName);
            compoundTerm.AddArgument(new Constant(repairData.ProcessId), RepairConstants.ProcessId);
            compoundTerm.AddArgument(new Constant(repairData.ProcessName), RepairConstants.ProcessName);
@ -198,7 +198,7 @@ namespace FabricHealer.Repair
        // The repair will be executed by SF Infrastructure service, not FH. This is the case for all
        // VM-level repairs. IS will communicate with VMSS (for example) to guarantee safe repairs in MR-enabled
        // clusters. RM, as usual, will orchestrate the repair cycle.
-        public async Task<bool> ExecuteRMInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
+        public async Task<bool> ScheduleInfrastructureRepairTask(TelemetryData repairData, CancellationToken cancellationToken)
        {
            var infraServices = await FabricRepairTasks.GetInfrastructureServiceInstancesAsync(cancellationToken);
            var arrServices = infraServices as Service[] ?? infraServices.ToArray();
@ -207,7 +207,7 @@ namespace FabricHealer.Repair
            {
                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "ExecuteRMInfrastructureRepairTask",
+                        "ScheduleInfrastructureRepairTask",
                        "Infrastructure Service not found. Will not attemp VM repair.",
                        cancellationToken,
                        repairData,
@ -229,8 +229,8 @@ namespace FabricHealer.Repair

                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "RepairTaskManager.ExecuteRMInfrastructureRepairTask",
-                        $"IS RepairTask {RepairTaskEngine.HostVMReboot} " +
+                        "ScheduleInfrastructureRepairTask",
+                        $"IS RepairTask {RepairTaskEngine.HostMachineReboot} " +
                        $"Executor set to {executorName}.",
                        cancellationToken,
                        repairData,
@ -243,7 +243,7 @@ namespace FabricHealer.Repair
            {
                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "ExecuteRMInfrastructureRepairTask",
+                        "ScheduleInfrastructureRepairTask",
                        "Unable to find InfrastructureService service instance." +
                        "Exiting RepairTaskManager.ScheduleFHRepairTaskAsync.",
                        cancellationToken,
@ -255,19 +255,26 @@ namespace FabricHealer.Repair

            // Make sure there is not already a repair job executing reboot repair for target node.
            var isRepairAlreadyInProgress =
-                    await repairTaskEngine.IsFHRepairTaskRunningAsync(
-                                             executorName,
-                                             repairData,
-                                             cancellationToken);
+                    await repairTaskEngine.IsRepairInProgressAsync(executorName, repairData, cancellationToken);

            if (isRepairAlreadyInProgress)
            {
+                string machineName = repairData.NodeName;
+                
+                try
+                {
+                    machineName = Environment.MachineName;
+                }
+                catch (InvalidOperationException)
+                {
+
+                }
+
                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "ExecuteRMInfrastructureRepairTask",
-                        "Virtual machine repair task for VM " +
-                        $"{await RepairExec.GetMachineHostNameFromFabricNodeNameAsync(repairData.NodeName, cancellationToken)} " +
-                        "is already in progress. Will not schedule another VM repair at this time.",
+                        "ScheduleInfrastructureRepairTask",
+                        $"Virtual machine repair task for {machineName} is already in progress " +
+                        "or max number of concurrent machine repairs has been reached. Will not schedule another machine repair at this time.",
                        cancellationToken,
                        repairData,
                        FabricHealerManager.ConfigSettings.EnableVerboseLogging);
@ -282,7 +289,7 @@ namespace FabricHealer.Repair
            {
                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "ExecuteRMInfrastructureRepairTask",
+                        "ScheduleInfrastructureRepairTask",
                        "Unable to create Repair Task.",
                        cancellationToken,
                        repairData,
@ -293,7 +300,7 @@ namespace FabricHealer.Repair

            await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                    LogLevel.Info,
-                    "ExecuteRMInfrastructureRepairTask",
+                    "ScheduleInfrastructureRepairTask",
                    $"Successfully created Repair Task {repairTask.TaskId}",
                    cancellationToken,
                    repairData,
@ -320,7 +327,7 @@ namespace FabricHealer.Repair

                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                        LogLevel.Info,
-                        "ExecuteRMInfrastructureRepairTask::Completed",
+                        "InfrastructureRepairTask::Completed",
                        $"Successfully completed repair {repairData.RepairPolicy.RepairId}",
                        cancellationToken,
                        repairData,
@ -332,7 +339,7 @@ namespace FabricHealer.Repair

            await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                    LogLevel.Info,
-                    "ExecuteRMInfrastructureRepairTask::Timeout",
+                    "ScheduleInfrastructureRepairTask::Timeout",
                    $"Max wait time of {MaxWaitTimeForInfraRepairTaskCompleted} has elapsed for repair " +
                    $"{repairData.RepairPolicy.RepairId}.",
                    cancellationToken,
@ -441,7 +448,7 @@ namespace FabricHealer.Repair
            await Task.Delay(new Random().Next(100, 1500));

            // Has the repair already been scheduled by a different FH instance?
-            if (await repairTaskEngine.IsFHRepairTaskRunningAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
+            if (await repairTaskEngine.IsRepairInProgressAsync(RepairTaskEngine.FHTaskIdPrefix, repairData, cancellationToken))
            {
                return null;
            }
@ -465,7 +472,7 @@ namespace FabricHealer.Repair
                    }

                    if (repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartFabricNode &&
-                        repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RestartVM)
+                        repairExecutorData.RepairData.RepairPolicy.RepairAction != RepairActionType.RebootMachine)
                    {
                        continue;
                    }
@ -700,7 +707,7 @@ namespace FabricHealer.Repair
                            success = false;
                            await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                                    LogLevel.Info,
-                                    "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
+                                    "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
                                    $"Stateless Instance {repairData.ReplicaId} not found on partition " +
                                    $"{repairData.PartitionId}.",
                                    cancellationToken,
@ -745,7 +752,7 @@ namespace FabricHealer.Repair
                            success = false;
                                await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync(
                                        LogLevel.Info,
-                                        "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync",
+                                        "RepairTaskManager.ExecuteFabricHealerRmRepairTaskAsync::NoReplica",
                                        $"Stateful replica {repairData.ReplicaId} not found on partition " +
                                        $"{repairData.PartitionId}.",
                                        cancellationToken,
@ -941,11 +948,11 @@ namespace FabricHealer.Repair
                // This is done by setting the repair task to Restoring State with ResultStatus Succeeded. RM will then move forward to Restoring
                // (and do any restoring health checks if specified), then Complete the repair job.
                _ = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
-                                                   () => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
-                                                            repairTask,
-                                                            Context, 
-                                                            cancellationToken),
-                                                   cancellationToken);
+                            () => FabricRepairTasks.CompleteCustomActionRepairJobAsync(
+                                    repairTask,
+                                    Context, 
+                                    cancellationToken),
+                            cancellationToken);

                // Let RM catch up.
                await Task.Delay(TimeSpan.FromSeconds(3), cancellationToken);
--- a/FabricHealer/Utilities/ConfigSettings.cs
+++ b/FabricHealer/Utilities/ConfigSettings.cs
@ -23,7 +23,7 @@ namespace FabricHealer.Utilities
            private set;
        }

-        public int ExecutionLoopSleepSeconds
+        public int HealthCheckIntervalInSeconds
        {
            get;
            private set;
@ -169,9 +169,9 @@ namespace FabricHealer.Utilities

            LocalLogPathParameter = GetConfigSettingValue( RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.LocalLogPathParameter);

-            if (int.TryParse( GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckLoopSleepTimeSeconds), out int execFrequency))
+            if (int.TryParse(GetConfigSettingValue(RepairConstants.RepairManagerConfigurationSectionName, RepairConstants.HealthCheckIntervalInSeconds), out int execFrequency))
            {
-                ExecutionLoopSleepSeconds = execFrequency;
+                HealthCheckIntervalInSeconds = execFrequency;
            }

            // Rolling service restarts.
--- a/FabricHealer/Utilities/Telemetry/TelemetryData.cs
+++ b/FabricHealer/Utilities/Telemetry/TelemetryData.cs
@ -10,7 +10,6 @@ using System.Fabric.Health;
 using System;
 using FabricHealer.Repair;
 using System.Diagnostics.Tracing;
-using System.Fabric.Query;

 namespace FabricHealer.Utilities.Telemetry
 {
@ -96,8 +95,7 @@ namespace FabricHealer.Utilities.Telemetry
            get; set;
        }

-        [EventField]
-        public ServiceKind ServiceKind
+        public string ServiceKind
        {
            get; set;
        }
--- a/FabricHealer/Utilities/Telemetry/TelemetryUtilities.cs
+++ b/FabricHealer/Utilities/Telemetry/TelemetryUtilities.cs
@ -12,6 +12,7 @@ using System.Fabric.Health;
 using System.Runtime.InteropServices;
 using System.Threading;
 using System.Threading.Tasks;
+using System.Xml.Linq;

 namespace FabricHealer.Utilities.Telemetry
 {
@ -146,17 +147,22 @@ namespace FabricHealer.Utilities.Telemetry
            // ETW.
            if (FabricHealerManager.ConfigSettings.EtwEnabled)
            {
-                if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
+                if (JsonSerializationUtility.TrySerialize(telemData, out string tData))
                {
-                    ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, telemData);
-                }
-                else if (healthState == HealthState.Warning)
-                {
-                    ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, telemData);
-                }
-                else
-                {
-                    ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, telemData);
+                    var data = new { tData };
+
+                    if (healthState == HealthState.Ok || healthState == HealthState.Unknown || healthState == HealthState.Invalid)
+                    {
+                        ServiceEventSource.Current.DataTypeWriteInfo(RepairConstants.EventSourceEventName, data);
+                    }
+                    else if (healthState == HealthState.Warning)
+                    {
+                        ServiceEventSource.Current.DataTypeWriteWarning(RepairConstants.EventSourceEventName, data);
+                    }
+                    else
+                    {
+                        ServiceEventSource.Current.DataTypeWriteError(RepairConstants.EventSourceEventName, data);
+                    }
                }
            }
        }
--- a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml
+++ b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml
@ -4,7 +4,7 @@
    <!-- FabricHealerManager Settings  -->
    <Parameter Name="AutoMitigationEnabled" DefaultValue="true" />
    <Parameter Name="EnableETW" DefaultValue="true" />
-    <Parameter Name="MonitorLoopSleepSeconds" DefaultValue="5" />
+    <Parameter Name="HealthCheckIntervalInSeconds" DefaultValue="60" />
    <Parameter Name="EnableTelemetry" DefaultValue="true" />
    <!-- Set VerboseLoggingEnabled to true if you want detailed local logging and telemetry/ETW with repair data. 
         This data will live in a folder named RepairData, which will be created in your LocalLogPath directory.
@ -39,7 +39,7 @@
        <Settings>
          <!-- FabricHealerManager -->
          <Section Name="RepairManagerConfiguration">
-            <Parameter Name="HealthCheckLoopSleepTimeSeconds" Value="[MonitorLoopSleepSeconds]" />
+            <Parameter Name="HealthCheckIntervalInSeconds" Value="[HealthCheckIntervalInSeconds]" />
            <Parameter Name="EnableAutoMitigation" Value="[AutoMitigationEnabled]" />
            <Parameter Name="EnableETW" Value="[EnableETW]" />
            <Parameter Name="EnableTelemetry" Value="[EnableTelemetry]" />