diff --git a/Public/Src/Engine/Scheduler/Scheduler.cs b/Public/Src/Engine/Scheduler/Scheduler.cs
index 03b9921a9..0137ccea0 100644
--- a/Public/Src/Engine/Scheduler/Scheduler.cs
+++ b/Public/Src/Engine/Scheduler/Scheduler.cs
@@ -8137,8 +8137,9 @@ namespace BuildXL.Scheduler
 
                 if (EngineEnvironmentSettings.LimitProblematicWorkerCount &&
                     m_remoteWorkers.Length >= 4 &&
-                    numProblematicWorkers > m_remoteWorkers.Length / 2)
+                    numProblematicWorkers >= (m_remoteWorkers.Length * EngineEnvironmentSettings.LimitProblematicWorkerThreshold))
                 {
+                    // With the default LimitProblematicWorkerThreshold of 0.9, a build with 4 to 9 remote workers is terminated only when all of them are problematic (e.g., 9 * 0.9 = 8.1); from 10 workers on, 90% of them is enough.
                     Logger.Log.HighCountProblematicWorkers(m_loggingContext, numProblematicWorkers, m_remoteWorkers.Length);
                     TerminateForInternalError();
                 }
diff --git a/Public/Src/Utilities/Configuration/EngineEnvironmentSettings.cs b/Public/Src/Utilities/Configuration/EngineEnvironmentSettings.cs
index 83e6a8ec2..0a3441322 100644
--- a/Public/Src/Utilities/Configuration/EngineEnvironmentSettings.cs
+++ b/Public/Src/Utilities/Configuration/EngineEnvironmentSettings.cs
@@ -287,6 +287,14 @@ namespace BuildXL.Utilities.Configuration
         /// </summary>
         public static readonly Setting<bool> LimitProblematicWorkerCount = CreateSetting("BuildXLLimitProblematicWorkerCount", value => string.IsNullOrWhiteSpace(value) || value == "1");
 
+        /// <summary>
+        /// Defines the fraction of remote workers that must be problematic before the build is considered failed.
+        /// </summary>
+        /// <remarks>
+        /// For example, a threshold of 0.9 (the default when the value is missing or cannot be parsed) means that if 90% or more of the workers are problematic, the build will be terminated due to excessive errors.
+        /// </remarks>
+        public static readonly Setting<double> LimitProblematicWorkerThreshold = CreateSetting("BuildXLLimitProblematicWorkerThreshold", value => ParseDouble(value) ?? 0.9);
+
         #endregion
 
         #region Grpc related settings