Merge remote-tracking branch 'origin/master' into mahilleb/pr/1030

This commit is contained in:
Mark Hillebrand 2016-11-14 13:07:33 +00:00
Родитель b89762fe96 0383a0134f
Коммит ac3866a0e5
9 изменённых файлов: 601 добавлений и 1613 удалений

Просмотреть файл

@ -42,6 +42,10 @@
<UseZip>false</UseZip>
<UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
<HasMultiverso>false</HasMultiverso>
<HasMultiverso Condition="Exists('$(SolutionDir)\Source\Multiverso\include\multiverso')">true</HasMultiverso>
</PropertyGroup>
<PropertyGroup>

Просмотреть файл

@ -2257,11 +2257,11 @@ Global
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = Debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = Debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = Debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = Debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
@ -2272,11 +2272,11 @@ Global
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = Release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = Release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = Release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = Release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64

@ -1 +1 @@
Subproject commit 40743f9c86297f63b29c99c259199f59f16c0c7c
Subproject commit 86d29abf747edbca73ec9d85618e16279492d72a

Просмотреть файл

@ -425,7 +425,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
{
// Synchronize all ranks before proceeding to ensure that
// rank 0 has finished writing the previous model file
BarrierWorkers();
SynchronizeWorkers();
// (re-)initialize 1-bit SGD
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
@ -728,7 +728,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Synchronize all ranks before proceeding to ensure that
// nobody tries reading the checkpoint file at the same time
// as rank 0 deleting it below
BarrierWorkers();
SynchronizeWorkers();
// Persist model and check-point info
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
@ -797,7 +797,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Synchronize all ranks before proceeding to ensure that
// rank 0 has finished writing the model file
// TODO[DataASGD]: should the other ranks wait in async mode?
BarrierWorkers();
SynchronizeWorkers();
// progress tracing for compute cluster management
ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);

Просмотреть файл

@ -597,7 +597,7 @@ private:
return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber) || UsingAsyncGradientAggregation(epochNumber);
}
void BarrierWorkers()
void SynchronizeWorkers()
{
if (m_mpi != nullptr && GetParallelizationMethod() != ParallelizationMethod::dataParallelASGD)
{

Просмотреть файл

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -149,4 +149,7 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
<Target Name="CheckDependencies">
<Error Condition="!$(HasMultiverso) And '$(CNTK_ENABLE_ASGD)'!='false'" Text="CNTK requires Multiverso to build, Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#getting-cntk-source-code for installation instructions." />
</Target>
</Project>

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -5,17 +5,17 @@
#dataDir="."
ConfigDir=$TEST_DIR
LogFileName="ASGDMultiGPU"
Instances=4
Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
parallelizationMethod="DataParallelASGD"
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=512 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=256 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3
#sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
#sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3
# Delete the test data if copied
[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir"

Просмотреть файл

@ -18,12 +18,16 @@ testCases:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
- CE = {{float,tolerance=1}}
- Err = {{float,tolerance=1}}
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- " * {{integer}}; "
- CE = {{float,tolerance=1}}
- Err = {{float,tolerance=1}}
DataParallelASGD training parameters must match for each MPI Rank:
patterns: