зеркало из https://github.com/microsoft/BuildXL.git
Merged PR 784298: Fix ValidateSuccess errors on the workers for builds terminating early
Some instances of ValidateSuccessMatches errors on workers are related to this following scenario: - Worker early-released and we call FinishAsync by capturing 'buildFailure' as null because HasFailed is false. - We wait for worker to drain its pips - Internal error occurs on the orchestrator - Orchestrator terminates and tries to exit each worker. - For early-released workers, there is already a pending FinishAsync due to the early release and the second 'FinishAsync' with the error message is ignored. - Worker receives Exit message with no failure. ValidateSuccess condition fails because we expect the build to succeed, but there are errors logged due to pending/terminated pips on the worker itself. Related work items: #2172783
This commit is contained in:
Родитель
43d3318255
Коммит
b92d1ec815
|
@ -58,8 +58,8 @@ namespace BuildXL.Engine.Distribution
|
|||
private CancellationTokenRegistration m_cancellationTokenRegistration;
|
||||
private bool m_isInitialized;
|
||||
private volatile bool m_isConnectionLost;
|
||||
private bool m_materializeOutputFailed;
|
||||
private bool m_isEarlyReleaseInitiated;
|
||||
private string m_infraFailure;
|
||||
|
||||
/// <inheritdoc />
|
||||
public override bool IsEarlyReleaseInitiated => Volatile.Read(ref m_isEarlyReleaseInitiated);
|
||||
|
@ -333,6 +333,7 @@ namespace BuildXL.Engine.Distribution
|
|||
|
||||
CountConnectionFailure(e.Type);
|
||||
m_isConnectionLost = true;
|
||||
m_infraFailure = $"The connection got lost with the worker. Is ever connected: {EverConnected}. Is early-release initiated: {IsEarlyReleaseInitiated}";
|
||||
|
||||
// Connection is lost. As the worker might have pending tasks,
|
||||
// Scheduler might decide to release the worker due to insufficient amount of remaining work, which is not related to the connection issue.
|
||||
|
@ -551,7 +552,7 @@ namespace BuildXL.Engine.Distribution
|
|||
return true;
|
||||
}
|
||||
|
||||
public override async Task FinishAsync(string buildFailure = null, [CallerMemberName] string callerName = null)
|
||||
public override async Task FinishAsync([CallerMemberName] string callerName = null)
|
||||
{
|
||||
if (await TryInitiateStop())
|
||||
{
|
||||
|
@ -565,7 +566,7 @@ namespace BuildXL.Engine.Distribution
|
|||
Name,
|
||||
callerName);
|
||||
|
||||
await DisconnectAsync(buildFailure, callerName);
|
||||
await DisconnectAsync(callerName);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -608,7 +609,7 @@ namespace BuildXL.Engine.Distribution
|
|||
Scheduler.Tracing.Logger.Log.WorkerReleasedEarly(m_appLoggingContext, Name);
|
||||
}
|
||||
|
||||
private async Task DisconnectAsync(string buildFailure = null, [CallerMemberName] string callerName = null)
|
||||
private async Task DisconnectAsync([CallerMemberName] string callerName = null)
|
||||
{
|
||||
Contract.Requires(AttachCompletionTask.IsCompleted, "AttachCompletionTask needs to be completed before we disconnect the worker");
|
||||
Contract.Requires(Status == WorkerNodeStatus.Stopping, $"Disconnect cannot be called for {Status} status");
|
||||
|
@ -628,18 +629,9 @@ namespace BuildXL.Engine.Distribution
|
|||
Logger.Log.DistributionStreamingNetworkFailure(m_appLoggingContext, Name);
|
||||
}
|
||||
|
||||
string infraFailure = string.Empty;
|
||||
if (!EverAvailable && !IsEarlyReleaseInitiated)
|
||||
{
|
||||
infraFailure = $"{Name} failed to connect to the orchestrator within the timeout ({EngineEnvironmentSettings.MinimumWaitForRemoteWorker.Value.TotalMinutes} minutes), typically due to a delay between orchestrator startup and worker bxl process initiation. Such delays can be expected in certain multi-stage distributed builds.";
|
||||
}
|
||||
else if (m_materializeOutputFailed)
|
||||
{
|
||||
infraFailure = "Some materialize output requests could not be sent.";
|
||||
}
|
||||
else if (m_isConnectionLost)
|
||||
{
|
||||
infraFailure = $"The connection got lost with the worker. Is ever connected: {EverConnected}. Is early-release initiated: {IsEarlyReleaseInitiated}";
|
||||
m_infraFailure = $"{Name} failed to connect to the orchestrator within the timeout ({EngineEnvironmentSettings.MinimumWaitForRemoteWorker.Value.TotalMinutes} minutes), typically due to a delay between orchestrator startup and worker bxl process initiation. Such delays can be expected in certain multi-stage distributed builds.";
|
||||
}
|
||||
|
||||
// If we still have a connection with the worker, we should send a message to worker to make it exit.
|
||||
|
@ -647,15 +639,16 @@ namespace BuildXL.Engine.Distribution
|
|||
{
|
||||
var buildEndData = new BuildEndData();
|
||||
|
||||
buildEndData.Failure = buildFailure ?? infraFailure;
|
||||
// The infrastructure failures should be given higher priority when forwarding them to workers.
|
||||
buildEndData.Failure = m_infraFailure ?? (Environment.HasFailed ? "Distributed build failed. See errors on orchestrator." : string.Empty);
|
||||
var exitCallResult = await m_workerClient.ExitAsync(buildEndData);
|
||||
if (!exitCallResult.Succeeded)
|
||||
{
|
||||
infraFailure += $" Exit call failed with {exitCallResult.LastFailure?.DescribeIncludingInnerFailures() ?? "gRPC call failed"}.";
|
||||
m_infraFailure += $" Exit call failed with {exitCallResult.LastFailure?.DescribeIncludingInnerFailures() ?? "gRPC call failed"}.";
|
||||
}
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(infraFailure))
|
||||
if (!string.IsNullOrEmpty(m_infraFailure))
|
||||
{
|
||||
// We log the following message for each worker if any of these occurs:
|
||||
// (i) The worker has not been ever attached to the orchestrator at any part of the build and the early release has not been initiated for this worker.
|
||||
|
@ -663,7 +656,7 @@ namespace BuildXL.Engine.Distribution
|
|||
// (iii) The worker failed to materialize all outputs.
|
||||
// (iv) The exit call fails to be sent.
|
||||
|
||||
Scheduler.Tracing.Logger.Log.ProblematicWorkerExit(m_appLoggingContext, infraFailure);
|
||||
Scheduler.Tracing.Logger.Log.ProblematicWorkerExit(m_appLoggingContext, m_infraFailure);
|
||||
m_orchestratorService.Counters.IncrementCounter(DistributionCounter.NumProblematicWorkers);
|
||||
Environment.ReportProblematicWorker();
|
||||
}
|
||||
|
@ -991,7 +984,7 @@ namespace BuildXL.Engine.Distribution
|
|||
{
|
||||
// Output replication failures on workers due to connection issues do not fail the distributed build.
|
||||
// Setting the exit failure above ensures that the worker will fail its build and not proceed.
|
||||
m_materializeOutputFailed = true;
|
||||
m_infraFailure = "Some materialize output requests could not be sent.";
|
||||
|
||||
result = new ExecutionResult();
|
||||
result.SetResult(operationContext, PipResultStatus.NotMaterialized);
|
||||
|
@ -1294,7 +1287,8 @@ namespace BuildXL.Engine.Distribution
|
|||
|
||||
if (isInfraError)
|
||||
{
|
||||
await FinishAsync(forwardedEvent.Text);
|
||||
m_infraFailure = forwardedEvent.Text;
|
||||
await FinishAsync();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -417,7 +417,7 @@ namespace BuildXL.Scheduler.Distribution
|
|||
/// Signals that build is finished and that worker should exit
|
||||
/// </summary>
|
||||
#pragma warning disable 1998 // Disable the warning for "This async method lacks 'await'"
|
||||
public virtual async Task FinishAsync(string buildFailure, [CallerMemberName] string callerName = null)
|
||||
public virtual async Task FinishAsync([CallerMemberName] string callerName = null)
|
||||
{
|
||||
Status = WorkerNodeStatus.Stopped;
|
||||
}
|
||||
|
|
|
@ -260,6 +260,12 @@ namespace BuildXL.Scheduler
|
|||
/// Report a problematic worker
|
||||
/// </summary>
|
||||
void ReportProblematicWorker();
|
||||
|
||||
/// <summary>
|
||||
/// Returns a Boolean indicating if the scheduler has so far been successful in executing pips.
|
||||
/// If the pip queue is empty and the scheduler has failed, then the final value of this flag is known.
|
||||
/// </summary>
|
||||
public bool HasFailed { get; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
|
@ -1581,12 +1581,6 @@ namespace BuildXL.Scheduler
|
|||
|
||||
#region Execution
|
||||
|
||||
/// <summary>
|
||||
/// Returns a Boolean indicating if the scheduler has so far been successful in executing pips.
|
||||
/// If the pip queue is empty and the scheduler has failed, then the final value of this flag is known.
|
||||
/// </summary>
|
||||
public bool HasFailed => m_hasFailures;
|
||||
|
||||
/// <summary>
|
||||
/// Returns a Boolean indicating if the scheduler has received a request for cancellation.
|
||||
/// </summary>
|
||||
|
@ -1747,7 +1741,7 @@ namespace BuildXL.Scheduler
|
|||
|
||||
using (PipExecutionCounters.StartStopwatch(PipExecutorCounter.WhenDoneWorkerFinishDuration))
|
||||
{
|
||||
await m_workers.ParallelForEachAsync((worker) => worker.FinishAsync(HasFailed ? "Distributed build failed. See errors on orchestrator." : null));
|
||||
await m_workers.ParallelForEachAsync((worker) => worker.FinishAsync());
|
||||
|
||||
// Wait for all workers to confirm that they have stopped.
|
||||
while (m_workers.Any(w => w.Status != WorkerNodeStatus.Stopped))
|
||||
|
@ -5349,6 +5343,9 @@ namespace BuildXL.Scheduler
|
|||
/// <inheritdoc />
|
||||
public PipExecutionContext Context { get; }
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool HasFailed => m_hasFailures;
|
||||
|
||||
/// <inheritdoc />
|
||||
[SuppressMessage("Microsoft.Design", "CA1033:InterfaceMethodsShouldBeCallableByChildTypes")]
|
||||
bool IPipExecutionEnvironment.MaterializeOutputsInBackground => MaterializeOutputsInBackground;
|
||||
|
|
|
@ -686,6 +686,8 @@ namespace Test.BuildXL.Scheduler
|
|||
public IReadOnlySet<AbsolutePath> TranslatedGlobalUnsafeUntrackedScopes => CollectionUtilities.EmptySet<AbsolutePath>();
|
||||
|
||||
public SchedulerTestHooks SchedulerTestHooks { get; }
|
||||
|
||||
public bool HasFailed => throw new NotImplementedException();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -657,6 +657,8 @@ namespace Test.BuildXL.Scheduler.Utils
|
|||
/// <inheritdoc />
|
||||
public PipSpecificPropertiesConfig PipSpecificPropertiesConfig { get; set; }
|
||||
|
||||
public bool HasFailed => throw new NotImplementedException();
|
||||
|
||||
public SealDirectoryKind GetSealDirectoryKind(DirectoryArtifact directory)
|
||||
{
|
||||
if (m_knownSealedSourceDirectoriesAllDirectories.Contains(directory.Path))
|
||||
|
|
Загрузка…
Ссылка в новой задаче