Fix the LinuxCommunicator constructor failure issue by retrying to connect to HpcMonitoringServer service

This commit is contained in:
chezhang 2016-07-29 14:48:55 +08:00
Родитель fdb8eccf6c
Коммит ebf9090348
4 изменённых файлов: 371 добавлений и 8 удалений

Просмотреть файл

@ -59,8 +59,6 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator
instance = this; instance = this;
this.headNodeFqdn = new Lazy<string>(() => Dns.GetHostEntryAsync(this.HeadNode).Result.HostName, LazyThreadSafetyMode.ExecutionAndPublication); this.headNodeFqdn = new Lazy<string>(() => Dns.GetHostEntryAsync(this.HeadNode).Result.HostName, LazyThreadSafetyMode.ExecutionAndPublication);
this.MonitoringConfigManager = new MonitoringConfigManager(this.headNodeFqdn.Value);
this.HostsManager = new HostsFileManager();
} }
public event EventHandler<RegisterEventArgs> RegisterRequested; public event EventHandler<RegisterEventArgs> RegisterRequested;
@ -129,6 +127,10 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator
{ {
this.Tracer.TraceInfo("Initializing LinuxCommunicator."); this.Tracer.TraceInfo("Initializing LinuxCommunicator.");
this.MonitoringConfigManager = new MonitoringConfigManager();
Task.Run(() => this.MonitoringConfigManager.Initialize(this.headNodeFqdn.Value));
this.HostsManager = new HostsFileManager();
ServicePointManager.ServerCertificateValidationCallback += (s, cert, chain, sslPolicyErrors) => ServicePointManager.ServerCertificateValidationCallback += (s, cert, chain, sslPolicyErrors) =>
{ {
this.Tracer.TraceDetail("sslPolicyErrors {0}", sslPolicyErrors); this.Tracer.TraceDetail("sslPolicyErrors {0}", sslPolicyErrors);

Просмотреть файл

@ -111,6 +111,7 @@
<Compile Include="Monitoring\MonitoringConfigManager.cs" /> <Compile Include="Monitoring\MonitoringConfigManager.cs" />
<Compile Include="Monitoring\PerformanceCounterNames.cs" /> <Compile Include="Monitoring\PerformanceCounterNames.cs" />
<Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="RetryFramework.cs" />
<Compile Include="Startup.cs" /> <Compile Include="Startup.cs" />
<Compile Include="TaskExtensionMethods.cs" /> <Compile Include="TaskExtensionMethods.cs" />
<Compile Include="WebServer.cs" /> <Compile Include="WebServer.cs" />

Просмотреть файл

@ -65,13 +65,31 @@ namespace Microsoft.Hpc.Communicators.LinuxCommunicator.Monitoring
private MetricCountersConfig metricCountersConfig = new MetricCountersConfig(); private MetricCountersConfig metricCountersConfig = new MetricCountersConfig();
public MonitoringConfigManager(string server) public MonitoringConfigManager() { }
public void Initialize(string server)
{ {
this.Store = MonitoringStoreConnection.Connect(server, "LinuxCommunicator"); RetryManager rm = new RetryManager(new PeriodicRetryTimer(30 * 1000));
this.checkConfigTimer.AutoReset = true; while (true)
this.checkConfigTimer.Interval = 5 * 60 * 1000; {
this.checkConfigTimer.Elapsed += checkConfigTimer_Elapsed; try
this.checkConfigTimer_Elapsed(this, null); {
this.Store = MonitoringStoreConnection.Connect(server, "LinuxCommunicator");
this.checkConfigTimer_Elapsed(this, null);
this.checkConfigTimer.AutoReset = true;
this.checkConfigTimer.Interval = 5 * 60 * 1000;
this.checkConfigTimer.Elapsed += checkConfigTimer_Elapsed;
break;
}
catch (Exception e)
{
if (rm.HasAttemptsLeft)
{
LinuxCommunicator.Instance.Tracer.TraceException(e, "MonitoringConfigManager initialization failed. Retry count {0}, retry wait time {1}.", rm.RetryCount, rm.NextWaitTime);
rm.WaitForNextAttempt();
}
}
}
} }
public event EventHandler<ConfigChangedEventArgs> ConfigChanged; public event EventHandler<ConfigChangedEventArgs> ConfigChanged;

Просмотреть файл

@ -0,0 +1,342 @@
//------------------------------------------------------------------------------
// <copyright file="ExceptionHelper.cs" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
// <owner current="true" primary="true">nzeng</owner>
// Security review: nzeng 01-11-06
//------------------------------------------------------------------------------
#region Using directives
using System;
using System.Text;
using System.Diagnostics;
using System.Threading;
#endregion
namespace Microsoft.Hpc
{
internal class RetryManager
{
public const int InfiniteRetries = -1;
RetryWaitTimer _waitTimer;
int _maxRetries;
int _totalTimeLimit = Timeout.Infinite;
int _retryCount = 0;
int _totalWaitTime = 0;
int _currentWaitTime = 0;
public RetryManager(RetryWaitTimer waitTimer) : this(waitTimer, InfiniteRetries) { }
public RetryManager(RetryWaitTimer waitTimer, int maxRetries) : this(waitTimer, maxRetries, Timeout.Infinite) { }
public RetryManager(RetryWaitTimer waitTimer, int maxRetries, int totalTimeLimit)
{
if (waitTimer == null)
{
throw new ArgumentNullException("wait");
}
_waitTimer = waitTimer;
SetMaxRetries(maxRetries);
SetTotalTimeLimit(totalTimeLimit);
}
/// <summary>
/// Gets the number of retries attempted thus far
/// </summary>
public int RetryCount { get { return _retryCount; } }
/// <summary>
/// Get the total spent waiting between retries
/// </summary>
public int ElaspsedWaitTime { get { return _totalWaitTime; } }
/// <summary>
/// Gets or sets the maximum number of retries
/// </summary>
public int MaxRetryCount
{
get { return _maxRetries; }
set { SetMaxRetries(value); }
}
/// <summary>
/// Gets or sets the total amount of time that may be spend waiting for retries.
/// </summary>
public int TotalTimeLimit
{
get { return _totalTimeLimit; }
set { SetTotalTimeLimit(value); }
}
void SetMaxRetries(int n)
{
if (n <= 0 && n != RetryManager.InfiniteRetries)
{
throw new ArgumentException("The maximum number of retries must be greater than zero, or RetryOperator.InfiniteRetries");
}
_maxRetries = n;
}
void SetTotalTimeLimit(int t)
{
if (t <= 0 && t != Timeout.Infinite)
{
throw new ArgumentException("The specified time must be greater than zero, or Timeout.Infinite");
}
_totalTimeLimit = t;
}
/// <summary>
/// Returns true if there are more retries left
/// </summary>
public bool HasAttemptsLeft
{
get
{
return ((_maxRetries == RetryManager.InfiniteRetries || _retryCount < _maxRetries)
&& (_totalTimeLimit == Timeout.Infinite || _totalWaitTime < _totalTimeLimit));
}
}
/// <summary>
/// Get the next wait time
/// </summary>
public int NextWaitTime
{
get
{
int waitTime = _waitTimer.GetNextWaitTime(_retryCount, _currentWaitTime);
if (_totalTimeLimit != Timeout.Infinite && (_totalWaitTime + waitTime > _totalTimeLimit))
{
waitTime = _totalTimeLimit - _totalWaitTime;
}
return waitTime;
}
}
/// <summary>
/// Increment the retry count and advance the total wait time without actually waiting
/// </summary>
public void SimulateNextAttempt()
{
WaitForNextAttempt(false);
}
/// <summary>
/// Wait until the next retry by making the current thread sleep for the appropriate amount of time.
/// May return immediately if the wait is zero.
/// </summary>
public void WaitForNextAttempt()
{
WaitForNextAttempt(true);
}
void WaitForNextAttempt(bool doSleep)
{
if (!HasAttemptsLeft)
{
throw new InvalidOperationException("There are no more retry attempts remaining");
}
_currentWaitTime = NextWaitTime;
_retryCount++;
Debug.Assert(_currentWaitTime >= 0);
if (_currentWaitTime > 0)
{
if (doSleep)
{
Thread.Sleep(_currentWaitTime);
}
_totalWaitTime += _currentWaitTime;
}
}
/// <summary>
/// Resets the retry manager's retry count
/// </summary>
public void Reset()
{
_retryCount = 0;
_totalWaitTime = 0;
_currentWaitTime = 0;
}
}
/// <summary>
/// Defines how long a retry manager will wait between sub-sequent retries
/// </summary>
internal abstract class RetryWaitTimer
{
internal abstract int GetNextWaitTime(int retryCount, int currentWaitTime);
}
/// <summary>
/// Instantly returns without waiting
/// </summary>
internal class InstantRetryTimer : RetryWaitTimer
{
internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
{
return 0;
}
// This class should be a singleton
private InstantRetryTimer() { }
static InstantRetryTimer _instance = new InstantRetryTimer();
public static InstantRetryTimer Instance
{
get { return _instance; }
}
}
/// <summary>
/// Waits a constant time between subsequent retries
/// </summary>
internal class PeriodicRetryTimer : RetryWaitTimer
{
int _period;
public PeriodicRetryTimer(int period)
{
if (period < 0)
{
throw new ArgumentOutOfRangeException("period", "The period must be a non-negative integer (in milliseconds)");
}
_period = period;
}
internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
{
return _period;
}
}
/// <summary>
/// A retry timer where wait time at retry n depends on the wait at retry n-1.
/// </summary>
internal abstract class BoundedBackoffRetryTimer : RetryWaitTimer
{
int _initialWait;
int _waitUpperBound;
protected BoundedBackoffRetryTimer(int initialWait, int waitUpperBound)
{
if (initialWait <= 0)
{
throw new ArgumentOutOfRangeException("initialWait", "Initial value must be a positive integer (in milliseconds)");
}
if (waitUpperBound <= 0 && waitUpperBound != Timeout.Infinite)
{
throw new ArgumentOutOfRangeException("waitCap", "The wait cap must be greater than zero, or Timeout.Infinite");
}
_initialWait = initialWait;
_waitUpperBound = waitUpperBound;
}
internal override int GetNextWaitTime(int retryCount, int currentWaitTime)
{
if (retryCount == 0)
{
return _initialWait;
}
int nextWaitTime = GetBackOffValue(currentWaitTime);
if (nextWaitTime < 0)
{
return 0;
}
if (_waitUpperBound != Timeout.Infinite && nextWaitTime > _waitUpperBound)
{
return _waitUpperBound;
}
return nextWaitTime;
}
protected abstract int GetBackOffValue(int currentValue);
}
/// <summary>
/// Wait times will increase exponentially
/// </summary>
internal class ExponentialBackoffRetryTimer : BoundedBackoffRetryTimer
{
double _growthFactor;
public ExponentialBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, 2) { }
public ExponentialBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, 2) { }
public ExponentialBackoffRetryTimer(int initialWait, int waitUpperBound, double growthFactor)
: base(initialWait, waitUpperBound)
{
if (growthFactor <= 0)
{
throw new ArgumentOutOfRangeException("growthFactor", "The growth factor must be a positive value");
}
_growthFactor = growthFactor;
}
protected override int GetBackOffValue(int currentValue)
{
return (int)Math.Round(currentValue * _growthFactor);
}
}
/// <summary>
/// Wait times will increase exponentially and also vary a bit randomly
/// </summary>
internal class ExponentialRandomBackoffRetryTimer : ExponentialBackoffRetryTimer
{
Random _rand = null;
public ExponentialRandomBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, 2) { }
public ExponentialRandomBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, 2) { }
public ExponentialRandomBackoffRetryTimer(int initialWait, int waitUpperBound, double growthFactor)
: base(initialWait, waitUpperBound,growthFactor)
{
_rand = new Random();
}
protected override int GetBackOffValue(int currentValue)
{
return ((int)base.GetBackOffValue(currentValue)) + _rand.Next(0, currentValue);
}
}
/// <summary>
/// Wait times will increase linearly
/// </summary>
internal class LinearBackoffRetryTimer : BoundedBackoffRetryTimer
{
int _increment;
public LinearBackoffRetryTimer(int initialWait) : this(initialWait, Timeout.Infinite, initialWait) { }
public LinearBackoffRetryTimer(int initialWait, int waitUpperBound) : this(initialWait, waitUpperBound, initialWait) { }
public LinearBackoffRetryTimer(int initialWait, int waitUpperBound, int increment)
: base(initialWait, waitUpperBound)
{
_increment = increment;
}
protected override int GetBackOffValue(int currentValue)
{
return currentValue + _increment;
}
}
}