Merge remote-tracking branch 'origin/master' into mahilleb/pr/1030
This commit is contained in:
Коммит
ac3866a0e5
|
@ -42,6 +42,10 @@
|
|||
|
||||
<UseZip>false</UseZip>
|
||||
<UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
|
||||
|
||||
<HasMultiverso>false</HasMultiverso>
|
||||
<HasMultiverso Condition="Exists('$(SolutionDir)\Source\Multiverso\include\multiverso')">true</HasMultiverso>
|
||||
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
|
|
20
CNTK.sln
20
CNTK.sln
|
@ -2257,11 +2257,11 @@ Global
|
|||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = Debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = Debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = Debug|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
|
||||
|
@ -2272,11 +2272,11 @@ Global
|
|||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = Release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = Release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = Release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = Release|x64
|
||||
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = Release|x64
|
||||
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
|
||||
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
|
||||
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 40743f9c86297f63b29c99c259199f59f16c0c7c
|
||||
Subproject commit 86d29abf747edbca73ec9d85618e16279492d72a
|
|
@ -425,7 +425,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
{
|
||||
// Synchronize all ranks before proceeding to ensure that
|
||||
// rank 0 has finished writing the previous model file
|
||||
BarrierWorkers();
|
||||
SynchronizeWorkers();
|
||||
|
||||
// (re-)initialize 1-bit SGD
|
||||
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
|
||||
|
@ -728,7 +728,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
// Synchronize all ranks before proceeding to ensure that
|
||||
// nobody tries reading the checkpoint file at the same time
|
||||
// as rank 0 deleting it below
|
||||
BarrierWorkers();
|
||||
SynchronizeWorkers();
|
||||
|
||||
// Persist model and check-point info
|
||||
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
|
||||
|
@ -797,7 +797,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
// Synchronize all ranks before proceeding to ensure that
|
||||
// rank 0 has finished writing the model file
|
||||
// TODO[DataASGD]: should the other ranks wait here in async mode?
|
||||
BarrierWorkers();
|
||||
SynchronizeWorkers();
|
||||
|
||||
// progress tracing for compute cluster management
|
||||
ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
|
||||
|
|
|
@ -597,7 +597,7 @@ private:
|
|||
return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber) || UsingAsyncGradientAggregation(epochNumber);
|
||||
}
|
||||
|
||||
void BarrierWorkers()
|
||||
void SynchronizeWorkers()
|
||||
{
|
||||
if (m_mpi != nullptr && GetParallelizationMethod() != ParallelizationMethod::dataParallelASGD)
|
||||
{
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
|
@ -149,4 +149,7 @@
|
|||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets" />
|
||||
<Target Name="CheckDependencies">
|
||||
<Error Condition="!$(HasMultiverso) And '$(CNTK_ENABLE_ASGD)'!='false'" Text="CNTK requires Multiverso to build, Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#getting-cntk-source-code for installation instructions." />
|
||||
</Target>
|
||||
</Project>
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -5,17 +5,17 @@
|
|||
#dataDir="."
|
||||
ConfigDir=$TEST_DIR
|
||||
LogFileName="ASGDMultiGPU"
|
||||
Instances=4
|
||||
Instances=2
|
||||
NumCPUThreads=$(threadsPerInstance $Instances)
|
||||
parallelizationMethod="DataParallelASGD"
|
||||
|
||||
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
|
||||
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=512 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
|
||||
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=256 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
|
||||
ExitCode=$?
|
||||
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank0
|
||||
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank1
|
||||
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
|
||||
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3
|
||||
#sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
|
||||
#sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3
|
||||
|
||||
# Delete the test data if copied
|
||||
[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir"
|
||||
|
|
|
@ -18,12 +18,16 @@ testCases:
|
|||
patterns:
|
||||
- ^MPI Rank {{integer}}
|
||||
- Finished Epoch[{{integer}} of {{integer}}]
|
||||
- CE = {{float,tolerance=1}}
|
||||
- Err = {{float,tolerance=1}}
|
||||
|
||||
Per-minibatch training results must match for each MPI Rank:
|
||||
patterns:
|
||||
- ^MPI Rank {{integer}}
|
||||
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
|
||||
- " * {{integer}}; "
|
||||
- CE = {{float,tolerance=1}}
|
||||
- Err = {{float,tolerance=1}}
|
||||
|
||||
DataParallelASGD training parameters must match for each MPI Rank:
|
||||
patterns:
|
||||
|
|
Загрузка…
Ссылка в новой задаче