From 1aa006fed68b804e5bd3f5fe53afc71ecc54df27 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 19 Apr 2022 13:52:50 -0700 Subject: [PATCH] FHProxy: IEnumerable support. Doc updates. --- FabricHealerProxy.md | 62 +++++++++++++-------------- FabricHealerProxy/FabricHealer.cs | 69 ++++++++++++++++++++++++++----- FabricHealerProxy/README.md | 62 +++++++++++++-------------- 3 files changed, 122 insertions(+), 71 deletions(-) diff --git a/FabricHealerProxy.md b/FabricHealerProxy.md index 939fefa..d1beec8 100644 --- a/FabricHealerProxy.md +++ b/FabricHealerProxy.md @@ -1,6 +1,6 @@ # FabricHealerProxy -FabricHealerProxy is a .NET Standard 2.0 library that provides a very simple and reliable way to share Service Fabric entity repair information to FabricHealer service instances running in the same cluster. You can install FabricHealerProxy into your .NET Service Fabric service from the [nuget.org package gallery](...). +FabricHealerProxy is a .NET Standard 2.0 library that provides a very simple and reliable way for any .NET Service Fabric service to initiate Service Fabric entity repair by the FabricHealer service running in the same cluster. ### How to use FabricHealerProxy @@ -22,6 +22,7 @@ using System.Threading.Tasks; using Microsoft.ServiceFabric.Services.Runtime; using FabricHealerProxy; using FabricHealerProxy.Exceptions; +using System.Collections.Generic; namespace Stateless1 { @@ -45,7 +46,8 @@ namespace Stateless1 // This specifies that you want FabricHealer to repair a service instance deployed to a Fabric node named NodeName. // FabricHealer supports both Replica and CodePackage restarts of services. The logic rules will dictate which one of these happens, // so make sure to craft a specific logic rule that makes sense for you (and use some logic!). 
- // Note that, out of the box, FabricHealer's AppRules.guan file (located in the FabricHealer project's PackageRoot/Config/LogicRules folder) already has a restart replica catch-all (applies to any service) rule that will restart the primary replica of + // Note that, out of the box, FabricHealer's AppRules.guan file (located in the FabricHealer project's PackageRoot/Config/LogicRules folder) + // already has a restart replica catch-all (applies to any service) rule that will restart the primary replica of // the specified service below, deployed to the a specified Fabric node. var repairDataServiceTarget = new RepairData { @@ -53,6 +55,12 @@ namespace Stateless1 NodeName = "_Node_0" }; + var repairDataServiceTarget2 = new RepairData + { + ServiceName = "fabric:/HealthMetrics/BandActorServiceType", + NodeName = "_Node_0" + }; + // This specifies that you want FabricHealer to repair a Fabric node named NodeName. The only supported repair in FabricHealer is a Restart. // Related rules can be found in FabricNodeRepair.guan file in the FabricHealer project's PackageRoot/Config/LogicRules folder. // So, implicitly, this means you want FabricHealer to restart _Node_0. You can of course modify the related logic rules to do something else. It's up to you! @@ -61,10 +69,27 @@ namespace Stateless1 NodeName = "_Node_0" }; - // Service repair. + // For use in the IEnumerable RepairEntityAsync overload. + List repairDataList = new List + { + repairDataNodeTarget, + repairDataServiceTarget, + repairDataServiceTarget2 + }; + + // For use in the single instance RepairData RepairEntityAsync overload. + var repairDataServiceTargetSingle = new RepairData + { + ServiceName = "fabric:/HealthMetrics/HealthMetrics.WebServiceType", + NodeName = "_Node_0" + }; + + // This demonstrates which exceptions will be thrown by the API. The first three represent user error (most likely). The last two are internal SF issues which + // will be thrown only after a series of retries. 
How to handle these is up to you. try { - await FabricHealer.Proxy.RepairEntityAsync(repairDataServiceTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); + await FabricHealer.Proxy.RepairEntityAsync(repairDataServiceTargetSingle, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); + await FabricHealer.Proxy.RepairEntityAsync(repairDataList, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); } catch (MissingRepairDataException) { @@ -89,30 +114,6 @@ namespace Stateless1 // ClusterManager service could be hammered (flooded with queries), for example. You could retry RepairEntityAsync again after you wait a bit.. } - // Node repair. - try - { - await FabricHealer.Proxy.RepairEntityAsync(repairDataNodeTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); - } - catch (FabricNodeNotFoundException) - { - // Check your spelling.. - } - catch (FabricException) - { - // No-op unless you want to re-run RepairEntityAsync again. - } - catch (TimeoutException) - { - // ClusterManager service could be hammered (flooded with queries), for example. You could retry RepairEntityAsync again after you wait a bit.. - } - - var repairDataServiceTarget2 = new RepairData - { - ServiceName = "fabric:/HealthMetrics/BandActorServiceType", - NodeName = "_Node_0" - }; - // Do nothing and wait. while (!cancellationToken.IsCancellationRequested) { @@ -125,8 +126,9 @@ namespace Stateless1 } } - - // Close the Proxy to clear internal state and remove health reports that are still active. + + // Close the proxy. This cleans up state and removes any health report that is currently active (not expired). + // Note: this does not cancel repairs that are in flight or in the FabricHealer internal repair queue. 
await FabricHealer.Proxy.Close(); } } diff --git a/FabricHealerProxy/FabricHealer.cs b/FabricHealerProxy/FabricHealer.cs index 303eb03..a5b52f7 100644 --- a/FabricHealerProxy/FabricHealer.cs +++ b/FabricHealerProxy/FabricHealer.cs @@ -35,7 +35,8 @@ namespace FabricHealerProxy // Use one FC for the lifetime of the consuming SF service process that loads FabricHealerProxy.dll. private static readonly FabricClient fabricClient = new FabricClient(settings); - private static readonly object lockObj = new object(); + private static readonly object instanceLock = new object(); + private static readonly object writeLock = new object(); private static readonly TimeSpan defaultHealthReportTtl = TimeSpan.FromMinutes(15); private static readonly TimeSpan maxDataLifeTime = defaultHealthReportTtl; @@ -60,7 +61,7 @@ namespace FabricHealerProxy { if (instance == null) { - lock (lockObj) + lock (instanceLock) { if (instance == null) { @@ -126,7 +127,30 @@ namespace FabricHealerProxy repairData, repairDataLifetime, fabricClient, - cancellationToken)); + cancellationToken)).ConfigureAwait(false); + } + + /// + /// This function generates a specially-crafted Service Fabric Health Report that the FabricHealer service will understand and act upon given the facts supplied + /// in the RepairData instance. Use this function to supply a list or array of RepairData objects. + /// + /// A collection of RepairData instances. RepairData is a well-known (ITelemetryData) data type that contains facts which FabricHealer will use + /// in the execution of its entity repair logic rules and related mitigation functions. + /// CancellationToken used to ensure this function stops processing when the token is cancelled. + /// The amount of time for the repair data to remain active (TTL of associated health report). Default is 15 mins. + /// Thrown when RepairData instance is null. + /// Thrown when an internal Service Fabric operation fails. 
+ /// Thrown when specified RepairData.NodeName does not exist in the cluster. + /// Thrown when specified service doesn't exist in the cluster. + /// Thrown when RepairData instance is missing values for required non-null members (E.g., NodeName). + /// Thrown when required ApplicationName or ServiceName value is a malformed Uri string. + /// Thrown when internal Fabric client API calls timeout. + public async Task RepairEntityAsync(IEnumerable repairDataCollection, CancellationToken cancellationToken, TimeSpan repairDataLifetime = default) + { + foreach (var repairData in repairDataCollection) + { + await RepairEntityAsync(repairData, cancellationToken, repairDataLifetime).ConfigureAwait(false); + } } private async Task RepairEntityAsyncInternal(RepairData repairData, TimeSpan repairDataLifetime, FabricClient fabricClient, CancellationToken cancellationToken) @@ -137,7 +161,10 @@ namespace FabricHealerProxy } // Remove expired repair data. - ManageRepairDataHistory(); + lock (writeLock) + { + ManageRepairDataHistory(); + } if (string.IsNullOrWhiteSpace(repairData.ApplicationName)) { @@ -288,7 +315,10 @@ namespace FabricHealerProxy } // Add repairData to history. 
- repairDataHistory.Add((DateTime.UtcNow, repairData)); + lock (writeLock) + { + repairDataHistory.Add((DateTime.UtcNow, repairData)); + } } private void ManageRepairDataHistory() @@ -365,7 +395,7 @@ namespace FabricHealerProxy break; } - return await HealthReportExistsAsync(repairData, fabricClient, cancellationToken); + return await HealthReportExistsAsync(repairData, fabricClient, cancellationToken).ConfigureAwait(false); } private async Task HealthReportExistsAsync(RepairData repairData, FabricClient fabricClient, CancellationToken cancellationToken) @@ -466,7 +496,7 @@ namespace FabricHealerProxy TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(3), TimeSpan.FromSeconds(5) - }).ExecuteAsync(() => ClearHealthReportsInternalAsync()); + }).ExecuteAsync(() => ClearHealthReportsInternalAsync()).ConfigureAwait(false); } private async Task ClearHealthReportsInternalAsync() @@ -534,7 +564,13 @@ namespace FabricHealerProxy } - await Task.Delay(250); + await Task.Delay(250).ConfigureAwait(false); + + lock (writeLock) + { + repairDataHistory.RemoveAt(i); + --i; + } } } @@ -544,9 +580,20 @@ namespace FabricHealerProxy public async Task Close() { await ClearHealthReports(); - repairDataHistory?.Clear(); - repairDataHistory = null; - instance = null; + + if (repairDataHistory != null) + { + lock (writeLock) + { + repairDataHistory?.Clear(); + repairDataHistory = null; + + if (instance != null) + { + instance = null; + } + } + } } } } \ No newline at end of file diff --git a/FabricHealerProxy/README.md b/FabricHealerProxy/README.md index 5c42b14..7f98c12 100644 --- a/FabricHealerProxy/README.md +++ b/FabricHealerProxy/README.md @@ -1,6 +1,6 @@ # FabricHealerProxy -FabricHealerProxy is a .NET Standard 2.0 library that provides a very simple and reliable way to share Service Fabric entity repair information to FabricHealer service instances running in the same cluster. 
You can install FabricHealerProxy into your .NET Service Fabric service from the [nuget.org package gallery](...). +FabricHealerProxy is a .NET Standard 2.0 library that provides a very simple and reliable way for any .NET Service Fabric service to initiate Service Fabric entity repair by the FabricHealer service running in the same cluster. You can install FabricHealerProxy into your .NET Service Fabric service from the [nuget.org package gallery](...). ### How to use FabricHealerProxy @@ -22,6 +22,7 @@ using System.Threading.Tasks; using Microsoft.ServiceFabric.Services.Runtime; using FabricHealerProxy; using FabricHealerProxy.Exceptions; +using System.Collections.Generic; namespace Stateless1 { @@ -45,7 +46,8 @@ namespace Stateless1 // This specifies that you want FabricHealer to repair a service instance deployed to a Fabric node named NodeName. // FabricHealer supports both Replica and CodePackage restarts of services. The logic rules will dictate which one of these happens, // so make sure to craft a specific logic rule that makes sense for you (and use some logic!). - // Note that, out of the box, FabricHealer's AppRules.guan file (located in the FabricHealer project's PackageRoot/Config/LogicRules folder) already has a restart replica catch-all (applies to any service) rule that will restart the primary replica of + // Note that, out of the box, FabricHealer's AppRules.guan file (located in the FabricHealer project's PackageRoot/Config/LogicRules folder) + // already has a restart replica catch-all (applies to any service) rule that will restart the primary replica of // the specified service below, deployed to the a specified Fabric node. 
var repairDataServiceTarget = new RepairData { @@ -53,6 +55,12 @@ namespace Stateless1 NodeName = "_Node_0" }; + var repairDataServiceTarget2 = new RepairData + { + ServiceName = "fabric:/HealthMetrics/BandActorServiceType", + NodeName = "_Node_0" + }; + // This specifies that you want FabricHealer to repair a Fabric node named NodeName. The only supported repair in FabricHealer is a Restart. // Related rules can be found in FabricNodeRepair.guan file in the FabricHealer project's PackageRoot/Config/LogicRules folder. // So, implicitly, this means you want FabricHealer to restart _Node_0. You can of course modify the related logic rules to do something else. It's up to you! @@ -61,10 +69,27 @@ namespace Stateless1 NodeName = "_Node_0" }; - // Service repair. + // For use in the IEnumerable RepairEntityAsync overload. + List repairDataList = new List + { + repairDataNodeTarget, + repairDataServiceTarget, + repairDataServiceTarget2 + }; + + // For use in the single instance RepairData RepairEntityAsync overload. + var repairDataServiceTargetSingle = new RepairData + { + ServiceName = "fabric:/HealthMetrics/HealthMetrics.WebServiceType", + NodeName = "_Node_0" + }; + + // This demonstrates which exceptions will be thrown by the API. The first three represent user error (most likely). The last two are internal SF issues which + // will be thrown only after a series of retries. How to handle these is up to you. 
try { - await FabricHealer.Proxy.RepairEntityAsync(repairDataServiceTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); + await FabricHealer.Proxy.RepairEntityAsync(repairDataServiceTargetSingle, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); + await FabricHealer.Proxy.RepairEntityAsync(repairDataList, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); } catch (MissingRepairDataException) { @@ -89,30 +114,6 @@ namespace Stateless1 // ClusterManager service could be hammered (flooded with queries), for example. You could retry RepairEntityAsync again after you wait a bit.. } - // Node repair. - try - { - await FabricHealer.Proxy.RepairEntityAsync(repairDataNodeTarget, cancellationToken, TimeSpan.FromMinutes(5)).ConfigureAwait(false); - } - catch (FabricNodeNotFoundException) - { - // Check your spelling.. - } - catch (FabricException) - { - // No-op unless you want to re-run RepairEntityAsync again. - } - catch (TimeoutException) - { - // ClusterManager service could be hammered (flooded with queries), for example. You could retry RepairEntityAsync again after you wait a bit.. - } - - var repairDataServiceTarget2 = new RepairData - { - ServiceName = "fabric:/HealthMetrics/BandActorServiceType", - NodeName = "_Node_0" - }; - // Do nothing and wait. while (!cancellationToken.IsCancellationRequested) { @@ -125,8 +126,9 @@ namespace Stateless1 } } - - // Close the Proxy to clear internal state and remove health reports that are still active. + + // Close the proxy. This cleans up state and removes any health report that is currently active (not expired). + // Note: this does not cancel repairs that are in flight or in the FabricHealer internal repair queue. await FabricHealer.Proxy.Close(); } }