Merged PR 784298: Fix ValidateSuccess errors on the workers for builds terminating early

Some instances of ValidateSuccessMatches errors on workers are related to the following scenario:

- The worker is early-released and we call FinishAsync, capturing 'buildFailure' as null because HasFailed is false at that point.
- We wait for worker to drain its pips
- Internal error occurs on the orchestrator
- Orchestrator terminates and tries to exit each worker.
- For early-released workers, there is already a pending FinishAsync due to the early release, so the second 'FinishAsync' call carrying the error message is ignored.
- The worker receives an Exit message with no failure. The ValidateSuccess condition then fails because the build is expected to succeed, yet errors have been logged for pending/terminated pips on the worker itself.

Related work items: #2172783
This commit is contained in:
Semih Okur 2024-05-10 22:59:06 +00:00
Родитель 43d3318255
Коммит b92d1ec815
6 изменённых файлов: 29 добавлений и 28 удалений

Просмотреть файл

@ -58,8 +58,8 @@ namespace BuildXL.Engine.Distribution
private CancellationTokenRegistration m_cancellationTokenRegistration;
private bool m_isInitialized;
private volatile bool m_isConnectionLost;
private bool m_materializeOutputFailed;
private bool m_isEarlyReleaseInitiated;
private string m_infraFailure;
/// <inheritdoc />
public override bool IsEarlyReleaseInitiated => Volatile.Read(ref m_isEarlyReleaseInitiated);
@ -333,6 +333,7 @@ namespace BuildXL.Engine.Distribution
CountConnectionFailure(e.Type);
m_isConnectionLost = true;
m_infraFailure = $"The connection got lost with the worker. Is ever connected: {EverConnected}. Is early-release initiated: {IsEarlyReleaseInitiated}";
// Connection is lost. As the worker might have pending tasks,
// Scheduler might decide to release the worker due to insufficient amount of remaining work, which is not related to the connection issue.
@ -551,7 +552,7 @@ namespace BuildXL.Engine.Distribution
return true;
}
public override async Task FinishAsync(string buildFailure = null, [CallerMemberName] string callerName = null)
public override async Task FinishAsync([CallerMemberName] string callerName = null)
{
if (await TryInitiateStop())
{
@ -565,7 +566,7 @@ namespace BuildXL.Engine.Distribution
Name,
callerName);
await DisconnectAsync(buildFailure, callerName);
await DisconnectAsync(callerName);
}
}
@ -608,7 +609,7 @@ namespace BuildXL.Engine.Distribution
Scheduler.Tracing.Logger.Log.WorkerReleasedEarly(m_appLoggingContext, Name);
}
private async Task DisconnectAsync(string buildFailure = null, [CallerMemberName] string callerName = null)
private async Task DisconnectAsync([CallerMemberName] string callerName = null)
{
Contract.Requires(AttachCompletionTask.IsCompleted, "AttachCompletionTask needs to be completed before we disconnect the worker");
Contract.Requires(Status == WorkerNodeStatus.Stopping, $"Disconnect cannot be called for {Status} status");
@ -628,18 +629,9 @@ namespace BuildXL.Engine.Distribution
Logger.Log.DistributionStreamingNetworkFailure(m_appLoggingContext, Name);
}
string infraFailure = string.Empty;
if (!EverAvailable && !IsEarlyReleaseInitiated)
{
infraFailure = $"{Name} failed to connect to the orchestrator within the timeout ({EngineEnvironmentSettings.MinimumWaitForRemoteWorker.Value.TotalMinutes} minutes), typically due to a delay between orchestrator startup and worker bxl process initiation. Such delays can be expected in certain multi-stage distributed builds.";
}
else if (m_materializeOutputFailed)
{
infraFailure = "Some materialize output requests could not be sent.";
}
else if (m_isConnectionLost)
{
infraFailure = $"The connection got lost with the worker. Is ever connected: {EverConnected}. Is early-release initiated: {IsEarlyReleaseInitiated}";
m_infraFailure = $"{Name} failed to connect to the orchestrator within the timeout ({EngineEnvironmentSettings.MinimumWaitForRemoteWorker.Value.TotalMinutes} minutes), typically due to a delay between orchestrator startup and worker bxl process initiation. Such delays can be expected in certain multi-stage distributed builds.";
}
// If we still have a connection with the worker, we should send a message to worker to make it exit.
@ -647,15 +639,16 @@ namespace BuildXL.Engine.Distribution
{
var buildEndData = new BuildEndData();
buildEndData.Failure = buildFailure ?? infraFailure;
// The infrastructure failures should be given higher priority when forwarding them to workers.
buildEndData.Failure = m_infraFailure ?? (Environment.HasFailed ? "Distributed build failed. See errors on orchestrator." : string.Empty);
var exitCallResult = await m_workerClient.ExitAsync(buildEndData);
if (!exitCallResult.Succeeded)
{
infraFailure += $" Exit call failed with {exitCallResult.LastFailure?.DescribeIncludingInnerFailures() ?? "gRPC call failed"}.";
m_infraFailure += $" Exit call failed with {exitCallResult.LastFailure?.DescribeIncludingInnerFailures() ?? "gRPC call failed"}.";
}
}
if (!string.IsNullOrEmpty(infraFailure))
if (!string.IsNullOrEmpty(m_infraFailure))
{
// We log the following message for each worker if any of these occurs:
// (i) The worker has not been ever attached to the orchestrator at any part of the build and the early release has not been initiated for this worker.
@ -663,7 +656,7 @@ namespace BuildXL.Engine.Distribution
// (iii) The worker failed to materialize all outputs.
// (iv) The exit call fails to be sent.
Scheduler.Tracing.Logger.Log.ProblematicWorkerExit(m_appLoggingContext, infraFailure);
Scheduler.Tracing.Logger.Log.ProblematicWorkerExit(m_appLoggingContext, m_infraFailure);
m_orchestratorService.Counters.IncrementCounter(DistributionCounter.NumProblematicWorkers);
Environment.ReportProblematicWorker();
}
@ -991,7 +984,7 @@ namespace BuildXL.Engine.Distribution
{
// Output replication failures on workers due to connection issues do not fail the distributed build.
// Setting the exit failure above ensures that the worker will fail its build and not proceed.
m_materializeOutputFailed = true;
m_infraFailure = "Some materialize output requests could not be sent.";
result = new ExecutionResult();
result.SetResult(operationContext, PipResultStatus.NotMaterialized);
@ -1294,7 +1287,8 @@ namespace BuildXL.Engine.Distribution
if (isInfraError)
{
await FinishAsync(forwardedEvent.Text);
m_infraFailure = forwardedEvent.Text;
await FinishAsync();
return true;
}

Просмотреть файл

@ -417,7 +417,7 @@ namespace BuildXL.Scheduler.Distribution
/// Signals that build is finished and that worker should exit
/// </summary>
#pragma warning disable 1998 // Disable the warning for "This async method lacks 'await'"
public virtual async Task FinishAsync(string buildFailure, [CallerMemberName] string callerName = null)
public virtual async Task FinishAsync([CallerMemberName] string callerName = null)
{
Status = WorkerNodeStatus.Stopped;
}

Просмотреть файл

@ -260,6 +260,12 @@ namespace BuildXL.Scheduler
/// Report a problematic worker
/// </summary>
void ReportProblematicWorker();
/// <summary>
/// Returns a Boolean indicating if the scheduler has so far been successful in executing pips.
/// If the pip queue is empty and the scheduler has failed, then the final value of this flag is known.
/// </summary>
public bool HasFailed { get; }
}
/// <summary>

Просмотреть файл

@ -1581,12 +1581,6 @@ namespace BuildXL.Scheduler
#region Execution
/// <summary>
/// Returns a Boolean indicating if the scheduler has so far been successful in executing pips.
/// If the pip queue is empty and the scheduler has failed, then the final value of this flag is known.
/// </summary>
public bool HasFailed => m_hasFailures;
/// <summary>
/// Returns a Boolean indicating if the scheduler has received a request for cancellation.
/// </summary>
@ -1747,7 +1741,7 @@ namespace BuildXL.Scheduler
using (PipExecutionCounters.StartStopwatch(PipExecutorCounter.WhenDoneWorkerFinishDuration))
{
await m_workers.ParallelForEachAsync((worker) => worker.FinishAsync(HasFailed ? "Distributed build failed. See errors on orchestrator." : null));
await m_workers.ParallelForEachAsync((worker) => worker.FinishAsync());
// Wait for all workers to confirm that they have stopped.
while (m_workers.Any(w => w.Status != WorkerNodeStatus.Stopped))
@ -5349,6 +5343,9 @@ namespace BuildXL.Scheduler
/// <inheritdoc />
public PipExecutionContext Context { get; }
/// <inheritdoc />
public bool HasFailed => m_hasFailures;
/// <inheritdoc />
[SuppressMessage("Microsoft.Design", "CA1033:InterfaceMethodsShouldBeCallableByChildTypes")]
bool IPipExecutionEnvironment.MaterializeOutputsInBackground => MaterializeOutputsInBackground;

Просмотреть файл

@ -686,6 +686,8 @@ namespace Test.BuildXL.Scheduler
public IReadOnlySet<AbsolutePath> TranslatedGlobalUnsafeUntrackedScopes => CollectionUtilities.EmptySet<AbsolutePath>();
public SchedulerTestHooks SchedulerTestHooks { get; }
public bool HasFailed => throw new NotImplementedException();
}
}

Просмотреть файл

@ -657,6 +657,8 @@ namespace Test.BuildXL.Scheduler.Utils
/// <inheritdoc />
public PipSpecificPropertiesConfig PipSpecificPropertiesConfig { get; set; }
public bool HasFailed => throw new NotImplementedException();
public SealDirectoryKind GetSealDirectoryKind(DirectoryArtifact directory)
{
if (m_knownSealedSourceDirectoriesAllDirectories.Contains(directory.Path))