Fix the LinuxCommunicator constructor failure issue by retrying to connect to HpcMonitoringServer service
This commit is contained in:
Родитель
fdb8eccf6c
Коммит
ebf9090348
|
@ -59,8 +59,6 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator
|
||||||
|
|
||||||
instance = this;
|
instance = this;
|
||||||
this.headNodeFqdn = new Lazy<string>(() => Dns.GetHostEntryAsync(this.HeadNode).Result.HostName, LazyThreadSafetyMode.ExecutionAndPublication);
|
this.headNodeFqdn = new Lazy<string>(() => Dns.GetHostEntryAsync(this.HeadNode).Result.HostName, LazyThreadSafetyMode.ExecutionAndPublication);
|
||||||
this.MonitoringConfigManager = new MonitoringConfigManager(this.headNodeFqdn.Value);
|
|
||||||
this.HostsManager = new HostsFileManager();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public event EventHandler<RegisterEventArgs> RegisterRequested;
|
public event EventHandler<RegisterEventArgs> RegisterRequested;
|
||||||
|
@ -129,6 +127,10 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator
|
||||||
{
|
{
|
||||||
this.Tracer.TraceInfo("Initializing LinuxCommunicator.");
|
this.Tracer.TraceInfo("Initializing LinuxCommunicator.");
|
||||||
|
|
||||||
|
this.MonitoringConfigManager = new MonitoringConfigManager();
|
||||||
|
Task.Run(() => this.MonitoringConfigManager.Initialize(this.headNodeFqdn.Value));
|
||||||
|
this.HostsManager = new HostsFileManager();
|
||||||
|
|
||||||
ServicePointManager.ServerCertificateValidationCallback += (s, cert, chain, sslPolicyErrors) =>
|
ServicePointManager.ServerCertificateValidationCallback += (s, cert, chain, sslPolicyErrors) =>
|
||||||
{
|
{
|
||||||
this.Tracer.TraceDetail("sslPolicyErrors {0}", sslPolicyErrors);
|
this.Tracer.TraceDetail("sslPolicyErrors {0}", sslPolicyErrors);
|
||||||
|
|
|
@ -111,6 +111,7 @@
|
||||||
<Compile Include="Monitoring\MonitoringConfigManager.cs" />
|
<Compile Include="Monitoring\MonitoringConfigManager.cs" />
|
||||||
<Compile Include="Monitoring\PerformanceCounterNames.cs" />
|
<Compile Include="Monitoring\PerformanceCounterNames.cs" />
|
||||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||||
|
<Compile Include="RetryFramework.cs" />
|
||||||
<Compile Include="Startup.cs" />
|
<Compile Include="Startup.cs" />
|
||||||
<Compile Include="TaskExtensionMethods.cs" />
|
<Compile Include="TaskExtensionMethods.cs" />
|
||||||
<Compile Include="WebServer.cs" />
|
<Compile Include="WebServer.cs" />
|
||||||
|
|
|
@ -65,13 +65,31 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator.Monitoring
|
||||||
|
|
||||||
private MetricCountersConfig metricCountersConfig = new MetricCountersConfig();
|
private MetricCountersConfig metricCountersConfig = new MetricCountersConfig();
|
||||||
|
|
||||||
public MonitoringConfigManager(string server)
|
public MonitoringConfigManager() { }
|
||||||
|
|
||||||
|
public void Initialize(string server)
|
||||||
{
|
{
|
||||||
this.Store = MonitoringStoreConnection.Connect(server, "LinuxCommunicator");
|
RetryManager rm = new RetryManager(new PeriodicRetryTimer(30 * 1000));
|
||||||
this.checkConfigTimer.AutoReset = true;
|
while (true)
|
||||||
this.checkConfigTimer.Interval = 5 * 60 * 1000;
|
{
|
||||||
this.checkConfigTimer.Elapsed += checkConfigTimer_Elapsed;
|
try
|
||||||
this.checkConfigTimer_Elapsed(this, null);
|
{
|
||||||
|
this.Store = MonitoringStoreConnection.Connect(server, "LinuxCommunicator");
|
||||||
|
this.checkConfigTimer_Elapsed(this, null);
|
||||||
|
this.checkConfigTimer.AutoReset = true;
|
||||||
|
this.checkConfigTimer.Interval = 5 * 60 * 1000;
|
||||||
|
this.checkConfigTimer.Elapsed += checkConfigTimer_Elapsed;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
if (rm.HasAttemptsLeft)
|
||||||
|
{
|
||||||
|
LinuxCommunicator.Instance.Tracer.TraceException(e, "MonitoringConfigManager initialization failed. Retry count {0}, retry wait time {1}.", rm.RetryCount, rm.NextWaitTime);
|
||||||
|
rm.WaitForNextAttempt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public event EventHandler<ConfigChangedEventArgs> ConfigChanged;
|
public event EventHandler<ConfigChangedEventArgs> ConfigChanged;
|
||||||
|
|
|
@ -0,0 +1,342 @@
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
// <copyright file="RetryFramework.cs" company="Microsoft">
|
||||||
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
// </copyright>
|
||||||
|
// <owner current="true" primary="true">nzeng</owner>
|
||||||
|
// Security review: nzeng 01-11-06
|
||||||
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#region Using directives
|
||||||
|
|
||||||
|
using System;
|
||||||
|
using System.Text;
|
||||||
|
using System.Diagnostics;
|
||||||
|
using System.Threading;
|
||||||
|
|
||||||
|
#endregion
|
||||||
|
|
||||||
|
namespace Microsoft.Hpc
|
||||||
|
{
|
||||||
|
/// <summary>
/// Bookkeeping for a retry loop. Counts the retries made so far, accumulates the time spent
/// waiting between them, and reports whether another attempt is allowed under the configured
/// retry-count and total-wait limits. The delay before each retry is obtained from the
/// <see cref="RetryWaitTimer"/> supplied at construction. Wait times are handed to
/// <see cref="Thread.Sleep(int)"/>, so they are expressed in milliseconds.
/// </summary>
internal class RetryManager
{
    /// <summary>
    /// Maximum-retry value meaning "no limit on the number of retries".
    /// </summary>
    public const int InfiniteRetries = -1;

    RetryWaitTimer _waitTimer;

    int _maxRetries;
    int _totalTimeLimit = Timeout.Infinite;

    int _retryCount = 0;
    int _totalWaitTime = 0;
    int _currentWaitTime = 0;

    /// <summary>
    /// Creates a manager with unlimited retries and no total-wait limit.
    /// </summary>
    public RetryManager(RetryWaitTimer waitTimer) : this(waitTimer, InfiniteRetries) { }

    /// <summary>
    /// Creates a manager with the given retry limit and no total-wait limit.
    /// </summary>
    public RetryManager(RetryWaitTimer waitTimer, int maxRetries) : this(waitTimer, maxRetries, Timeout.Infinite) { }

    /// <summary>
    /// Creates a manager with the given retry limit and total-wait limit.
    /// </summary>
    /// <param name="waitTimer">Supplies the delay before each retry. Must not be null.</param>
    /// <param name="maxRetries">Greater than zero, or <see cref="InfiniteRetries"/>.</param>
    /// <param name="totalTimeLimit">Greater than zero, or <see cref="Timeout.Infinite"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="waitTimer"/> is null.</exception>
    /// <exception cref="ArgumentException">A limit is neither positive nor its "infinite" sentinel.</exception>
    public RetryManager(RetryWaitTimer waitTimer, int maxRetries, int totalTimeLimit)
    {
        if (waitTimer == null)
        {
            // Report the actual parameter name so the exception points at the right argument.
            throw new ArgumentNullException("waitTimer");
        }
        _waitTimer = waitTimer;

        SetMaxRetries(maxRetries);
        SetTotalTimeLimit(totalTimeLimit);
    }

    /// <summary>
    /// Gets the number of retries attempted thus far.
    /// </summary>
    public int RetryCount { get { return _retryCount; } }

    /// <summary>
    /// Gets the total time spent waiting between retries.
    /// The misspelled name is kept so existing callers continue to compile.
    /// </summary>
    public int ElaspsedWaitTime { get { return _totalWaitTime; } }

    /// <summary>
    /// Gets the total time spent waiting between retries
    /// (correctly spelled alias of <see cref="ElaspsedWaitTime"/>).
    /// </summary>
    public int ElapsedWaitTime { get { return _totalWaitTime; } }

    /// <summary>
    /// Gets or sets the maximum number of retries.
    /// </summary>
    /// <exception cref="ArgumentException">The value is neither positive nor <see cref="InfiniteRetries"/>.</exception>
    public int MaxRetryCount
    {
        get { return _maxRetries; }
        set { SetMaxRetries(value); }
    }

    /// <summary>
    /// Gets or sets the total amount of time that may be spent waiting for retries.
    /// </summary>
    /// <exception cref="ArgumentException">The value is neither positive nor <see cref="Timeout.Infinite"/>.</exception>
    public int TotalTimeLimit
    {
        get { return _totalTimeLimit; }
        set { SetTotalTimeLimit(value); }
    }

    // Validates and stores the retry limit.
    void SetMaxRetries(int n)
    {
        if (n <= 0 && n != RetryManager.InfiniteRetries)
        {
            throw new ArgumentException("The maximum number of retries must be greater than zero, or RetryManager.InfiniteRetries");
        }
        _maxRetries = n;
    }

    // Validates and stores the total-wait limit.
    void SetTotalTimeLimit(int t)
    {
        if (t <= 0 && t != Timeout.Infinite)
        {
            throw new ArgumentException("The specified time must be greater than zero, or Timeout.Infinite");
        }
        _totalTimeLimit = t;
    }

    /// <summary>
    /// Returns true if another retry is permitted: the retry count is below the maximum
    /// (or retries are unlimited) and the accumulated wait is below the total time limit
    /// (or the limit is infinite).
    /// </summary>
    public bool HasAttemptsLeft
    {
        get
        {
            return ((_maxRetries == RetryManager.InfiniteRetries || _retryCount < _maxRetries)
                && (_totalTimeLimit == Timeout.Infinite || _totalWaitTime < _totalTimeLimit));
        }
    }

    /// <summary>
    /// Gets the wait that the next retry would use. When a total time limit is set, the
    /// value is clamped so the accumulated wait does not exceed that limit.
    /// </summary>
    public int NextWaitTime
    {
        get
        {
            int waitTime = _waitTimer.GetNextWaitTime(_retryCount, _currentWaitTime);
            if (_totalTimeLimit != Timeout.Infinite)
            {
                // Compare against the remaining budget instead of adding to the running
                // total: the sum can overflow int for very large wait times and would then
                // slip past the clamp.
                int remaining = _totalTimeLimit - _totalWaitTime;
                if (waitTime > remaining)
                {
                    waitTime = remaining;
                }
            }
            return waitTime;
        }
    }

    /// <summary>
    /// Increments the retry count and advances the total wait time without actually waiting.
    /// </summary>
    /// <exception cref="InvalidOperationException">No retry attempts remain.</exception>
    public void SimulateNextAttempt()
    {
        WaitForNextAttempt(false);
    }

    /// <summary>
    /// Waits until the next retry by making the current thread sleep for the appropriate
    /// amount of time. May return immediately if the wait is zero.
    /// </summary>
    /// <exception cref="InvalidOperationException">No retry attempts remain.</exception>
    public void WaitForNextAttempt()
    {
        WaitForNextAttempt(true);
    }

    // Shared implementation: records the attempt and, when doSleep is true, blocks the
    // calling thread for the computed wait.
    void WaitForNextAttempt(bool doSleep)
    {
        if (!HasAttemptsLeft)
        {
            throw new InvalidOperationException("There are no more retry attempts remaining");
        }

        _currentWaitTime = NextWaitTime;

        // Saturate rather than wrap negative when retries are unlimited.
        if (_retryCount < int.MaxValue)
        {
            _retryCount++;
        }

        Debug.Assert(_currentWaitTime >= 0);
        if (_currentWaitTime > 0)
        {
            if (doSleep)
            {
                Thread.Sleep(_currentWaitTime);
            }

            // Accumulate in 64 bits and saturate: with unlimited retries the running total
            // would otherwise wrap negative once it passes int.MaxValue milliseconds.
            long accumulated = (long)_totalWaitTime + _currentWaitTime;
            _totalWaitTime = accumulated > int.MaxValue ? int.MaxValue : (int)accumulated;
        }
    }

    /// <summary>
    /// Resets the retry count, the accumulated wait time and the last wait time to zero.
    /// </summary>
    public void Reset()
    {
        _retryCount = 0;
        _totalWaitTime = 0;
        _currentWaitTime = 0;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Strategy that decides how long a retry manager waits between subsequent retries.
/// </summary>
internal abstract class RetryWaitTimer
{
    /// <summary>
    /// Computes the wait to apply before the next retry.
    /// </summary>
    /// <param name="retryCount">Number of retries made so far.</param>
    /// <param name="currentWaitTime">Wait that was used for the most recent retry.</param>
    /// <returns>The wait for the next retry.</returns>
    internal abstract int GetNextWaitTime(int retryCount, int currentWaitTime);
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Retry timer with no delay: every retry may proceed immediately.
/// </summary>
internal class InstantRetryTimer : RetryWaitTimer
{
    // The single shared instance; the private constructor prevents any other from being created.
    private static readonly InstantRetryTimer _instance = new InstantRetryTimer();

    private InstantRetryTimer() { }

    /// <summary>
    /// Gets the shared instance of this timer.
    /// </summary>
    public static InstantRetryTimer Instance
    {
        get { return _instance; }
    }

    /// <summary>
    /// Always returns a wait of zero; both arguments are ignored.
    /// </summary>
    internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
    {
        return 0;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Retry timer that uses the same fixed wait before every retry.
/// </summary>
internal class PeriodicRetryTimer : RetryWaitTimer
{
    private readonly int _period;

    /// <summary>
    /// Creates a timer that always waits <paramref name="period"/> milliseconds.
    /// </summary>
    /// <param name="period">Non-negative wait, in milliseconds.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="period"/> is negative.</exception>
    public PeriodicRetryTimer(int period)
    {
        bool isNegative = period < 0;
        if (isNegative)
        {
            throw new ArgumentOutOfRangeException("period", "The period must be a non-negative integer (in milliseconds)");
        }

        _period = period;
    }

    /// <summary>
    /// Returns the configured period; both arguments are ignored.
    /// </summary>
    internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
    {
        return _period;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// A retry timer where the wait at retry n depends on the wait at retry n-1.
/// The first retry uses the initial wait; later retries use the value produced by
/// <see cref="GetBackOffValue"/>, limited to the range [0, upper bound].
/// </summary>
/// <remarks>
/// The upper bound is applied only to values produced by <see cref="GetBackOffValue"/>;
/// the initial wait is returned as given.
/// </remarks>
internal abstract class BoundedBackoffRetryTimer : RetryWaitTimer
{
    int _initialWait;
    int _waitUpperBound;

    /// <summary>
    /// Initializes the initial wait and the upper bound shared by all back-off timers.
    /// </summary>
    /// <param name="initialWait">Wait for the first retry, in milliseconds. Must be positive.</param>
    /// <param name="waitUpperBound">
    /// Largest wait returned for later retries, in milliseconds. Must be positive, or
    /// <see cref="Timeout.Infinite"/> for no bound.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">An argument is outside its allowed range.</exception>
    protected BoundedBackoffRetryTimer(int initialWait, int waitUpperBound)
    {
        if (initialWait <= 0)
        {
            throw new ArgumentOutOfRangeException("initialWait", "Initial value must be a positive integer (in milliseconds)");
        }
        if (waitUpperBound <= 0 && waitUpperBound != Timeout.Infinite)
        {
            // Use the real parameter name so ParamName identifies the offending argument.
            throw new ArgumentOutOfRangeException("waitUpperBound", "The wait cap must be greater than zero, or Timeout.Infinite");
        }

        _initialWait = initialWait;
        _waitUpperBound = waitUpperBound;
    }

    /// <summary>
    /// Returns the initial wait when no retry has been made yet; otherwise the backed-off
    /// value derived from <paramref name="currentWaitTime"/>, with negative results
    /// replaced by zero and results above the upper bound replaced by the bound.
    /// </summary>
    internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
    {
        if (retryCount == 0)
        {
            return _initialWait;
        }

        int nextWaitTime = GetBackOffValue(currentWaitTime);
        if (nextWaitTime < 0)
        {
            return 0;
        }
        if (_waitUpperBound != Timeout.Infinite && nextWaitTime > _waitUpperBound)
        {
            return _waitUpperBound;
        }
        return nextWaitTime;
    }

    /// <summary>
    /// Computes the next wait from the previous one, before bounds are applied.
    /// </summary>
    /// <param name="currentValue">The wait used for the previous retry.</param>
    protected abstract int GetBackOffValue(int currentValue);
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Back-off timer whose wait is multiplied by a constant growth factor on every retry.
/// </summary>
internal class ExponentialBackoffRetryTimer : BoundedBackoffRetryTimer
{
    double _growthFactor;

    /// <summary>
    /// Creates a timer with no upper bound and a growth factor of 2.
    /// </summary>
    public ExponentialBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, 2) { }

    /// <summary>
    /// Creates a timer with the given upper bound and a growth factor of 2.
    /// </summary>
    public ExponentialBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, 2) { }

    /// <summary>
    /// Creates a timer with the given upper bound and growth factor.
    /// </summary>
    /// <param name="initialWait">Wait for the first retry, in milliseconds.</param>
    /// <param name="waitUpperBound">Largest wait for later retries, or <see cref="Timeout.Infinite"/>.</param>
    /// <param name="growthFactor">Multiplier applied to the previous wait. Must be positive and finite.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="growthFactor"/> is not a positive finite number, or a base-class argument is invalid.
    /// </exception>
    public ExponentialBackoffRetryTimer(int initialWait, int waitUpperBound, double growthFactor)
        : base(initialWait, waitUpperBound)
    {
        // NaN compares false against everything, so it must be rejected explicitly;
        // an infinite factor would turn a zero wait into NaN.
        if (double.IsNaN(growthFactor) || double.IsInfinity(growthFactor) || growthFactor <= 0)
        {
            throw new ArgumentOutOfRangeException("growthFactor", "The growth factor must be a positive value");
        }
        _growthFactor = growthFactor;
    }

    /// <summary>
    /// Returns the previous wait multiplied by the growth factor, rounded to the nearest
    /// integer and saturated to the int range.
    /// </summary>
    protected override int GetBackOffValue(int currentValue)
    {
        double scaled = Math.Round(currentValue * _growthFactor);

        // An unchecked double-to-int cast of an out-of-range value yields an unspecified
        // result. Saturate instead, so an overflowing wait stays large rather than turning
        // into a negative value that the base class maps to a zero wait.
        if (scaled >= int.MaxValue)
        {
            return int.MaxValue;
        }
        if (scaled <= int.MinValue)
        {
            return int.MinValue;
        }
        return (int)scaled;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Exponential back-off timer that adds a random amount to each wait, so wait times
/// increase exponentially and also vary a bit randomly.
/// </summary>
internal class ExponentialRandomBackoffRetryTimer : ExponentialBackoffRetryTimer
{
    Random _rand = null;

    /// <summary>
    /// Creates a timer with no upper bound and a growth factor of 2.
    /// </summary>
    public ExponentialRandomBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, 2) { }

    /// <summary>
    /// Creates a timer with the given upper bound and a growth factor of 2.
    /// </summary>
    public ExponentialRandomBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, 2) { }

    /// <summary>
    /// Creates a timer with the given upper bound and growth factor.
    /// </summary>
    public ExponentialRandomBackoffRetryTimer(int initialWait, int waitUpperBound, double growthFactor)
        : base(initialWait, waitUpperBound, growthFactor)
    {
        _rand = new Random();
    }

    /// <summary>
    /// Returns the exponential back-off value plus a random amount in
    /// [0, <paramref name="currentValue"/>), saturated at <see cref="int.MaxValue"/>.
    /// </summary>
    protected override int GetBackOffValue(int currentValue)
    {
        int backoff = base.GetBackOffValue(currentValue);

        // Random.Next(0, max) throws when max is negative; a non-positive previous wait
        // simply contributes no jitter.
        int jitter = 0;
        if (currentValue > 0)
        {
            jitter = _rand.Next(0, currentValue);
        }

        // Add in 64 bits so a large back-off plus jitter cannot wrap to a negative int,
        // which the bounded base class would turn into a zero wait.
        long total = (long)backoff + jitter;
        if (total > int.MaxValue)
        {
            return int.MaxValue;
        }
        return (int)total;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/// <summary>
/// Back-off timer whose wait changes by a fixed increment on every retry, so wait times
/// increase linearly.
/// </summary>
internal class LinearBackoffRetryTimer : BoundedBackoffRetryTimer
{
    int _increment;

    /// <summary>
    /// Creates a timer with no upper bound whose increment equals the initial wait.
    /// </summary>
    public LinearBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, initialWait) { }

    /// <summary>
    /// Creates a timer with the given upper bound whose increment equals the initial wait.
    /// </summary>
    public LinearBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, initialWait) { }

    /// <summary>
    /// Creates a timer with the given upper bound and increment.
    /// </summary>
    /// <param name="initialWait">Wait for the first retry, in milliseconds.</param>
    /// <param name="waitUpperBound">Largest wait for later retries, or <see cref="Timeout.Infinite"/>.</param>
    /// <param name="increment">Amount added to the previous wait on each retry. Not validated here.</param>
    public LinearBackoffRetryTimer(int initialWait, int waitUpperBound, int increment)
        : base(initialWait, waitUpperBound)
    {
        _increment = increment;
    }

    /// <summary>
    /// Returns the previous wait plus the increment, saturated to the int range.
    /// </summary>
    protected override int GetBackOffValue(int currentValue)
    {
        // Add in 64 bits: an int sum that wraps negative would be mapped to a zero wait by
        // the bounded base class instead of staying at the largest representable wait.
        long next = (long)currentValue + _increment;
        if (next > int.MaxValue)
        {
            return int.MaxValue;
        }
        if (next < int.MinValue)
        {
            return int.MinValue;
        }
        return (int)next;
    }
}
|
||||||
|
}
|
Загрузка…
Ссылка в новой задаче