Applying workaround for race condition in OpenMPI on non-0 exit code from CNTK

This commit is contained in:
Vladimir Ivanov 2015-10-30 16:16:09 -07:00
Родитель b931eb69aa
Коммит 1b5fa2dfde
4 изменённых файлов: 25 добавлений и 1 удалений

Просмотреть файл

@ -79,6 +79,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}
// Workaround for the issue with MPI hanging when we have non-0 exit codes from CNTK processes
// OpenMPI has a confirmed race condition between killing child processes and handling their non-zero exit statuses,
// resulting in a deadlock where all processes are killed but MPI is still waiting.
// This happens when several perfectly synchronized processes (for example on MPI barrier)
// simultaneously exit with non-0 exit code.
// As a workaround, we simply sleep 50*rank milliseconds, effectively "de-synchronizing" the processes at exit,
// allowing MPI to sequentially handle terminations
// Rank of this process, kept in a static so the exit handler below can read it
// without touching any MPIWrapper instance.
static int s_myRank;

// Exit handler: stalls this process for 50 ms per rank before it terminates,
// so that processes with different ranks do not exit at the same instant.
static void MPIWorkaroundAtExit()
{
    // Note: g_mpi must not be used here, since the MPI stack is already down at this point
    const int exitDelayMs = 50 * s_myRank;
    Sleep(exitDelayMs);
}
public:
MPIWrapper()
: m_currentComm(MPI_COMM_WORLD)
@ -98,6 +112,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
m_numNodesInUse = m_numMPINodes;
// Applying MPI workaround
s_myRank = m_myRank;
atexit(&MPIWrapper::MPIWorkaroundAtExit);
// by default we use all of them
RequestNodes("MPIWrapper");

4
Common/MPIWrapper.cpp Normal file
Просмотреть файл

@ -0,0 +1,4 @@
#include "Include/Basics.h"
#include "Include/MPIWrapper.h"
int Microsoft::MSR::CNTK::MPIWrapper::s_myRank = -1;

Просмотреть файл

@ -228,6 +228,7 @@
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"></PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp" />
<ClCompile Include="..\..\Common\MPIWrapper.cpp" />
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="ExperimentalNetworkBuilder.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />
@ -243,4 +244,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

Просмотреть файл

@ -427,6 +427,7 @@ CNTK_SRC =\
BrainScript/BrainScriptParser.cpp \
BrainScript/BrainScriptTest.cpp \
MachineLearning/CNTK/ExperimentalNetworkBuilder.cpp \
Common/MPIWrapper.cpp \
ifdef CUDA_PATH