Applying workaround for race condition in OpenMPI on non-0 exit code from CNTK
This commit is contained in:
Родитель
b931eb69aa
Коммит
1b5fa2dfde
|
@ -79,6 +79,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
#endif
|
||||
}
|
||||
|
||||
// Workaround for the issue with MPI hanging when we have non-0 exit codes from CNTK processes
|
||||
// OpenMPI has a confirmed race condition between killing child processes and handling their non-zero exit statuses, resulting
|
||||
// in a deadlock where all processes have been killed but MPI is still waiting.
|
||||
// This happens when several perfectly synchronized processes (for example, at an MPI barrier)
|
||||
// simultaneously exit with a non-0 exit code.
|
||||
// As a workaround, we simply sleep 50*rank milliseconds, effectively "de-synchronizing" the processes at exit,
|
||||
// allowing MPI to sequentially handle terminations
|
||||
static int s_myRank;
|
||||
static void MPIWorkaroundAtExit() // exit handler; registered with atexit() once the rank is known
|
||||
{
|
||||
// Note: we can't use g_mpi, since the MPI stack is already down at this point; the static copy s_myRank is used instead
|
||||
Sleep(s_myRank*50); // delay grows with this process's rank, so ranks do not all terminate at the same moment
|
||||
}
|
||||
|
||||
public:
|
||||
MPIWrapper()
|
||||
: m_currentComm(MPI_COMM_WORLD)
|
||||
|
@ -98,6 +112,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
|
||||
m_numNodesInUse = m_numMPINodes;
|
||||
|
||||
// Applying MPI workaround
|
||||
s_myRank = m_myRank;
|
||||
atexit(&MPIWrapper::MPIWorkaroundAtExit);
|
||||
|
||||
// by default we use all of them
|
||||
RequestNodes("MPIWrapper");
|
||||
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
#include "Include/Basics.h"
|
||||
#include "Include/MPIWrapper.h"
|
||||
|
||||
int Microsoft::MSR::CNTK::MPIWrapper::s_myRank = -1; // out-of-class definition of the static member; stays -1 until MPIWrapper copies m_myRank into it
|
|
@ -228,6 +228,7 @@
|
|||
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"></PrecompiledHeader>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
|
||||
<ClCompile Include="..\..\Common\MPIWrapper.cpp" />
|
||||
<ClCompile Include="CNTK.cpp" />
|
||||
<ClCompile Include="ExperimentalNetworkBuilder.cpp" />
|
||||
<ClCompile Include="ModelEditLanguage.cpp" />
|
||||
|
@ -243,4 +244,4 @@
|
|||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets" />
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
1
Makefile
1
Makefile
|
@ -427,6 +427,7 @@ CNTK_SRC =\
|
|||
BrainScript/BrainScriptParser.cpp \
|
||||
BrainScript/BrainScriptTest.cpp \
|
||||
MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp \
|
||||
Common/MPIWrapper.cpp \
|
||||
|
||||
|
||||
ifdef CUDA_PATH
|
||||
|
|
Загрузка…
Ссылка в новой задаче