Integrate qiwye/asgd-dev into master

Commit e618b917fe by Project Philly, 2016-11-12 03:38:06 -08:00
Parents: 1f2558c650 acc6d4a3c3
90 changed files: 4443 additions and 113 deletions

.gitmodules (vendored, 3 changes)
View file

@ -1,3 +1,6 @@
[submodule "Source/1BitSGD"]
path = Source/1BitSGD
url = https://git.codeplex.com/cntk1bitsgd
[submodule "Source/Multiverso"]
path = Source/Multiverso
url = https://github.com/Microsoft/Multiverso
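Note: the commit adds Multiverso as a new submodule at Source/Multiverso; on an existing checkout it would typically be fetched with `git submodule update --init Source/Multiverso` (a suggested step, not part of this diff).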

View file

@ -137,6 +137,9 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
ProjectSection(ProjectDependencies) = postProject
{16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
ProjectSection(SolutionItems) = preProject
@ -1285,6 +1288,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryDistributionTests"
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiverso\src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}"
ProjectSection(ProjectDependencies) = postProject
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
@ -2245,6 +2252,56 @@ Global
{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|Mixed Platforms.Build.0 = Release|x64
{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = debug|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = release|x64
{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Any CPU.ActiveCfg = Debug|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.Build.0 = Debug|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.ActiveCfg = Debug|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.Build.0 = Debug|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Any CPU.ActiveCfg = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.ActiveCfg = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.Build.0 = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.ActiveCfg = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.Build.0 = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
@ -2447,6 +2504,8 @@ Global
{E844AB9A-A48F-4A99-9625-F528C5C46D83} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
EndGlobalSection
EndGlobal

View file

@ -0,0 +1,102 @@
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>
command = trainNetwork
precision = "float"; traceLevel = 1 ; deviceId = "auto"
rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
outputDir = "./Output" ;
modelPath = "$outputDir$/Models/06_OneConvRegrMultiNode"
#stderr = "$outputDir$/06_OneConvRegr_bs_out"
parallelizationMethod=DataParallelSGD
# TRAINING CONFIG
trainNetwork = {
action = "train"
BrainScriptNetworkBuilder = {
imageShape = 28:28:1 # image dimensions, 1 channel only
labelDim = 10 # number of distinct labels
featScale = 1/256
Scale{f} = x => Constant(f) .* x
model = Sequential (
Scale {featScale} :
ConvolutionalLayer {16, (5:5), pad = true} : ReLU :
MaxPoolingLayer {(2:2), stride=(2:2)} :
DenseLayer {64} : ReLU :
LinearLayer {labelDim}
)
# inputs
features = Input {imageShape}
labels = Input {labelDim}
# apply model to features
z = model (features)
# loss and error computation
sqErr = SquareError (labels, z)
rmse = Sqrt (sqErr / labelDim)
# declare special nodes
featureNodes = (features)
labelNodes = (labels)
criterionNodes = (rmse)
evaluationNodes = (rmse)
outputNodes = (z)
}
SGD = {
epochSize = 0
minibatchSize = 64
maxEpochs = 15
learningRatesPerSample = 0.001*5:0.0005
momentumAsTimeConstant = 1024
numMBsToShowResult = 500
ParallelTrain = [
parallelizationMethod = $parallelizationMethod$
distributedMBReading = "true"
parallelizationStartEpoch = 1
DataParallelSGD = [
gradientBits = 32
]
ModelAveragingSGD = [
blockSizePerWorker = 64
]
DataParallelASGD = [
syncPeriod = 64
usePipeline = false
]
}
reader = {
readerType = "CNTKTextFormatReader"
# See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
file = "$DataDir$/Train-28x28_cntk_text.txt"
input = {
features = { dim = 784 ; format = "dense" }
labels = { dim = 10 ; format = "dense" }
}
}
}
# TEST CONFIG
testNetwork = {
action = "test"
minibatchSize = 1024 # reduce this if you run out of memory
reader = {
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Test-28x28_cntk_text.txt"
input = {
features = { dim = 784 ; format = "dense" }
labels = { dim = 10 ; format = "dense" }
}
}
}

View file

@ -101,3 +101,12 @@ In the fifth example, we show how CNTK can be used to perform a regression task.
`cntk configFile=05_OneConvRegr.cntk`
The trained network achieves a root-mean-square error (RMSE) of 0.0039. For more sophisticated regression examples, please refer to [Regression](../Regression).
### 06_OneConvRegrMultiNode.cntk
In the sixth example, we show how to train CNTK with multiple processes (GPUs) for a regression task. CNTK uses MPI for multi-node training and currently supports four parallel SGD algorithms: DataParallelSGD, BlockMomentumSGD, ModelAveragingSGD, and DataParallelASGD. We reuse the same network architecture as in `05_OneConvRegr` and only add a parallel-training block. To run this example on a single machine, use the following command:
`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelSGD`
You can change parallelizationMethod to any of the other three options; for instance, the asynchronous variant can be selected as shown below. For a more detailed guide on training with multiple GPUs and machines, please refer to [Multiple GPUs and machines](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
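Illustrative example (the same command as above with a different method value):
`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelASGD`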

View file

@ -516,6 +516,7 @@ $(CNTKLIBRARY_DISTRIBUTION_TESTS): $(CNTKLIBRARY_DISTRIBUTION_TESTS_OBJ) | $(CNT
EVAL:=eval
SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/ASGDHelper.cpp \
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
@ -551,7 +552,7 @@ $(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) $(PROTOBUF_PATH)/lib/libprotobuf.a
########################################
# Eval Sample clients
@ -570,7 +571,7 @@ $(EVAL_CLIENT): $(EVAL_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)
EVAL_EXTENDED_CLIENT:=$(BINDIR)/cppevalextendedclient
@ -586,7 +587,7 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_EXTENDED_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)
########################################
# Eval V2 Sample client
@ -893,6 +894,71 @@ endif
# temporarily adding to 1bit, need to work with others to fix it
endif
########################################
# ASGD(multiverso) setup
########################################
ifeq ("$(CNTK_ENABLE_ASGD)","true")
ifeq (,$(wildcard Source/Multiverso/include/multiverso/*.h))
$(error Build with Multiverso was requested but cannot find the code. Please check https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#24-data-parallel-asgd to learn more.)
endif
lMULTIVERSO:=-lmultiverso
INCLUDEPATH += $(SOURCEDIR)/Multiverso/include
COMMON_FLAGS += -DASGD_PARALLEL_SUPPORT
MULTIVERSO_LIB:=$(LIBDIR)/libmultiverso.so
ALL_LIBS+=$(MULTIVERSO_LIB)
ifeq ("$(BUILDTYPE)","release")
MULTIVERSO_CMAKE_BUILDTYPE=Release
endif
ifeq ("$(BUILDTYPE)","debug")
MULTIVERSO_CMAKE_BUILDTYPE=Debug
endif
$(MULTIVERSO_LIB):
@echo "Build Multiverso lib"
@mkdir -p $(LIBDIR)
@mkdir -p $(BINDIR)
@mkdir -p $(SOURCEDIR)/Multiverso/build/$(BUILDTYPE)
@cmake -DCMAKE_VERBOSE_MAKEFILE=TRUE \
-DBoost_NO_BOOST_CMAKE=TRUE \
-DBoost_NO_SYSTEM_PATHS=TRUE \
-DBOOST_ROOT:PATHNAME=$(BOOST_PATH) \
-DBOOST_LIBRARY_DIRS:FILEPATH=$(BOOST_PATH) \
-DLIBRARY_OUTPUT_PATH=$(shell readlink -f $(LIBDIR)) \
-DEXECUTABLE_OUTPUT_PATH=$(shell readlink -f $(BINDIR)) \
-DCMAKE_BUILD_TYPE=$(MULTIVERSO_CMAKE_BUILDTYPE) \
-B./Source/Multiverso/build/$(BUILDTYPE) -H./Source/Multiverso
@make VERBOSE=1 -C ./Source/Multiverso/build/$(BUILDTYPE) -j multiverso
UNITTEST_MULTIVERSO_SRC = \
$(SOURCEDIR)/Multiverso/Test/unittests/test_array.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_blob.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_kv.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_message.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_multiverso.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_node.cpp \
$(SOURCEDIR)/Multiverso/Test/unittests/test_sync.cpp \
UNITTEST_MULTIVERSO_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MULTIVERSO_SRC))
UNITTEST_MULTIVERSO := $(BINDIR)/multiversotests
ALL += $(UNITTEST_MULTIVERSO)
$(UNITTEST_MULTIVERSO): $(UNITTEST_MULTIVERSO_OBJ) | $(MULTIVERSO_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(lMULTIVERSO) -ldl
endif
########################################
# cntk
########################################
@ -926,11 +992,11 @@ CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB) $(MULTIVERSO_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -967,7 +1033,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
@ -1027,11 +1093,11 @@ UNITTEST_NETWORK := $(BINDIR)/networktests
ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER) $(MULTIVERSO_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) $(lMULTIVERSO) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \

View file

@ -373,6 +373,9 @@ void PrintBuiltInfo()
#ifdef _WITH_1BITSGD_
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif

View file

@ -85,7 +85,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -113,7 +113,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">

View file

@ -22,6 +22,7 @@ set p_CNTK_ENABLE_1BitSGD=%~3
set p_CudaPath=%~4
set p_CUDNN_PATH=%~5
set p_CUB_PATH=%~6
set p_CNTK_ENABLE_ASGD=%~7
echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$
@ -75,7 +76,12 @@ if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
) else (
echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)
:: CNTK_ENABLE_ASGD is assumed to default to true
if "%p_CNTK_ENABLE_ASGD%" == "false" (
echo #define _WITH_ASGD_ "no">>buildinfo.h$$
) else (
echo #define _WITH_ASGD_ "yes">>buildinfo.h$$
)
if not %l_build_target% == CPU-only (
if "%p_CudaPath%" == "" (
echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$

View file

@ -0,0 +1,67 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <list>
#include "ComputationNetwork.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// class AdjustLearningRateAtBeginning
// Provides options for DataParallelASGD training, so that every node
// can adjust the learning rate every minibatch during the first N epochs.
// -----------------------------------------------------------------------
// TODO: We can remove these options once we can adjust the learning rate at the minibatch level
enum class AdjustLearningRateAtBeginning : int
{
None = 0, // default, don't adjust learning rate
Linearly = 1, // linear adjustment: the learning rate ramps up from 0 to learningRatesPerMB
Staircase = (1 << 1), // staircase adjustment: the learning rate steps up from 0 to learningRatesPerMB every adjustNbMinibatch minibatches
};
template<class ElemType = float>
class ASGDHelper
{
public:
virtual ~ASGDHelper() { }
// -----------------------------------------------------------------------
// InitModel() -- Uploads the initialized model (pre-computed by CNTK) to the parameter servers,
// so that every node starts training from the same model
// -----------------------------------------------------------------------
virtual void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) = 0;
// -----------------------------------------------------------------------
// PushAndPullModel() -- Pushes the parameters of learnableNodes to the parameter servers, then gets the latest model back.
// -----------------------------------------------------------------------
virtual bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced = 0) = 0;
// -----------------------------------------------------------------------
// WaitAll() -- Barrier: wait for all other nodes to reach this point
// -----------------------------------------------------------------------
virtual void WaitAll() = 0;
// -----------------------------------------------------------------------
// WaitAsyncBuffer() -- Wait for the pipeline thread to finish its work when useAsyncBuffer is true
// -----------------------------------------------------------------------
virtual void WaitAsyncBuffer() = 0;
}; // Class ASGDHelper
// Factory method to create an ASGDHelper instance
template<class ElemType = float>
ASGDHelper<ElemType>* NewASGDHelper(
const std::list<ComputationNodeBasePtr> & learnableNodes, // Parameters that need to be trained
size_t nodeNumRanks, // Number of working nodes
bool useAsyncBuffered = true, // Use an asynchronous buffer to hide communication cost
bool isSimulatedModelAveragingSGD = false, // Use parameter-server-based MA rather than ASGD
AdjustLearningRateAtBeginning adjusttype =
AdjustLearningRateAtBeginning::None, // Adjust the learning rate per minibatch at the very beginning of training
double adjustCoef = 0.2, // see DecayCoefficient()
size_t adjustPerMinibatches = 600, //
int traceLevel = 0, // log level
int syncPerfStats = 0); // show perf data every syncPerfStats syncs
}}}
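To make the intended call sequence concrete, here is a minimal usage sketch of this interface (illustrative only, not part of the commit); it assumes an existing list of learnable nodes and a known worker count, and the surrounding minibatch loop is elided:

#include <list>
#include <memory>
#include "ASGDHelper.h"
using namespace Microsoft::MSR::CNTK;

void TrainWithASGD(const std::list<ComputationNodeBasePtr>& learnableNodes, size_t numWorkers)
{
    // With ASGD_PARALLEL_SUPPORT this returns a MultiversoHelper, otherwise the no-op NoneASGDHelper.
    std::unique_ptr<ASGDHelper<float>> helper(NewASGDHelper<float>(
        learnableNodes, numWorkers,
        /*useAsyncBuffered=*/true,
        /*isSimulatedModelAveragingSGD=*/false,
        AdjustLearningRateAtBeginning::Linearly, /*adjustCoef=*/0.2, /*adjustPerMinibatches=*/600));

    // Upload the initial model so every worker starts from the same parameters.
    helper->InitModel(learnableNodes);

    const size_t syncPeriod = 64;  // samples between syncs, cf. syncPeriod in DataParallelASGD configs
    size_t samplesSinceLastSync = 0;
    bool moreData = true;
    while (moreData)
    {
        // ... read a minibatch, run forward/backward, apply the local SGD update ...
        samplesSinceLastSync += 64;  // assume a minibatch of 64 samples
        moreData = false;            // placeholder: a real loop would check the reader
        if (samplesSinceLastSync >= syncPeriod)
        {
            helper->PushAndPullModel(learnableNodes, samplesSinceLastSync);
            samplesSinceLastSync = 0;
        }
    }
    helper->WaitAsyncBuffer(); // drain the pipeline thread before shutting down
    helper->WaitAll();         // barrier with the other workers
}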

View file

@ -99,6 +99,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
int argc = 0;
char **argv = NULL;
// TODO(qiwye) Multiverso(parameter server) will benefit from MPI_THREAD_MULTIPLE .
int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
int provided;
int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);

Source/Multiverso (new submodule, 1 change)

@ -0,0 +1 @@
Subproject commit 40743f9c86297f63b29c99c259199f59f16c0c7c

View file

@ -0,0 +1,670 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// ASGDHelper.cpp : Implements ASGDHelper interface. The implementation is based on Multiverso.
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "ASGDHelper.h"
#include "MPIWrapper.h"
#include "ComputationNetwork.h"
#include "TimerUtility.h"
#include <functional>
#include <thread>
#include <unordered_map>
#include <numeric>
#include <algorithm>
#ifdef ASGD_PARALLEL_SUPPORT
#include <multiverso/multiverso.h>
#include <multiverso/util/configure.h>
#include <multiverso/table/array_table.h>
#include <multiverso/updater/updater.h>
#pragma comment(lib, "Multiverso.lib")
#endif
#ifndef CPUONLY
#include <cuda_runtime.h>
#pragma comment (lib, "cudart.lib") // for cudaMemcpyAsync()
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
#include <cuda_runtime.h>
// -----------------------------------------------------------------------
// Error handling
// -----------------------------------------------------------------------
template <typename ERRTYPE>
static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode)
{
if (retCode != successCode)
{
try
{
#ifdef _WIN32
const char* hostname = getenv("COMPUTERNAME");
#else
char hostname[HOST_NAME_MAX];
if (gethostname(hostname, HOST_NAME_MAX) != 0)
strcpy(hostname, "?");
#endif
int currentCudaDevice;
cudaGetDevice(&currentCudaDevice);
Microsoft::MSR::CNTK::RuntimeError("%s failure %d; GPU=%d ; hostname=%s ; expr=%s", libName, (int)retCode, currentCudaDevice, hostname ? hostname : "?", exprString);
}
catch (const std::exception& e) // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
{
std::cerr << e.what() << std::endl;
throw;
}
}
}
#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#endif // CPUONLY
#ifdef ASGD_PARALLEL_SUPPORT
// MultiversoHelper is the implementation of ASGDHelper interface with Multiverso
template<class ElemType = float>
class MultiversoHelper : public ASGDHelper<ElemType>
{
public:
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
MultiversoHelper(const std::list<ComputationNodeBasePtr> & learnableNodes, // Parameters that need to be trained
size_t nodeNumRanks, // Number of working nodes
bool useAsyncBuffer = true, // Use an asynchronous buffer to hide communication cost
bool isSimulatedModelAveragingSGD = false, // Use parameter-server-based MA rather than ASGD
AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None, // Adjust the learning rate per minibatch at the very beginning of training;
// this can be used to tackle the instability of ASGD
double adjustCoef = 0.2, // see DecayCoefficient()
size_t adjustPerMinibatches = 600, //
int traceLevel = 0, // log level
int syncPerfStats = 0) : // show perf data every syncPerfStats syncs
m_parameterSyncCounter(0), m_adjustLearningRateAtBeginningType(adjusttype),
m_adjustCoefficient(adjustCoef), m_adjustMBNumber(adjustPerMinibatches),
m_totalClientNumber(nodeNumRanks), m_useAsyncBuffer(useAsyncBuffer),
m_traceLevel(traceLevel), m_ModelAveragingSGDSimulating(isSimulatedModelAveragingSGD), m_doesEveryNodesShouldSynced(false),
m_syncPerfStats(syncPerfStats)
{
if (m_ModelAveragingSGDSimulating)
{
m_doesEveryNodesShouldSynced = true;
m_useAsyncBuffer = false;
}
// Pipeline-related variables
m_localBufferNum = m_useAsyncBuffer ? 2 : 1;
m_bufferSwapIndex = new int[m_localBufferNum];
// CPU asynchronous buffer
m_cpuAsyncBuffer = new ElemType*[m_localBufferNum];
// Get option used by multiverso sparse update
m_getOptions.reserve(m_localBufferNum);
m_addOptions.reserve(m_localBufferNum);
#ifndef CPUONLY
// GPU asynchronous buffer
m_gpuAsyncBuffer.resize(m_localBufferNum);
// create a communication stream for the data transfer between GPU and CPU
CUDA_CALL(cudaStreamCreate(&_commStream));
#endif
m_bufferIndexInUse = 0;
for (int i = 0; i < m_localBufferNum; i++)
m_bufferSwapIndex[i] = (i + 1) % m_localBufferNum;
m_aysncBufferThread = nullptr;
multiverso::SetCMDFlag("logtostderr", true);
if (m_doesEveryNodesShouldSynced)
multiverso::SetCMDFlag("sync", true);
MultiversoInit(learnableNodes);
}
~MultiversoHelper()
{
fprintf(stderr, "~MultiversoHelper\n");
fflush(stderr);
if (m_useAsyncBuffer && m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
m_aysncBufferThread->join();
delete m_bufferSwapIndex, m_deltaArray;
for (size_t i = 0; i < m_localBufferNum; i++)
{
#ifndef CPUONLY
CUDA_CALL(cudaFreeHost(m_cpuAsyncBuffer[i]));
#else
delete m_cpuAsyncBuffer[i];
#endif
}
delete m_cpuAsyncBuffer;
#ifndef CPUONLY
CUDA_CALL(cudaStreamDestroy(_commStream));
#endif
multiverso::MV_ShutDown(false);
}
void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) override
{
float factor = 1.0f / m_totalClientNumber;
int i = 0; // indicate the index of learnable nodes
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
Matrix<ElemType> &mat = node->Value();
#ifndef CPUONLY
for (int j = 0; j < m_localBufferNum; j++)
m_gpuAsyncBuffer[j].push_back(mat.DeepClone());
#endif
ElemType* px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
mat.CopyToArray(px, m_tableLength[i]);
}
for (int i = 1; i < m_localBufferNum; i++)
memcpy(m_cpuAsyncBuffer[i], m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);
memcpy(m_deltaArray, m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);
// The parameter server subtracts the delta on the server side, so we send the negated initial model to the server.
std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), -factor));
m_workerArray->Add(m_deltaArray, m_totalModelSize);
m_workerArray->Get(m_deltaArray, m_totalModelSize);
WaitAll();
m_workerArray->Get(m_deltaArray, m_totalModelSize);
if (std::equal(m_deltaArray, m_deltaArray + m_totalModelSize, m_cpuAsyncBuffer[0]))
fprintf(stderr, "multiverso initial model loaded.\n");
m_reportTimer.Start();
}
bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override
{
m_parameterSyncCounter++;
double fromCPUToGPUTime;
double fromGPUToCPUTime;
double networkTime;
double swapTimeOnGPU;
m_reportTimer.Restart();
WaitAsyncBuffer();
m_reportTimer.Stop();
// reset statistics for profiling
if (m_traceLevel > 2 && m_syncPerfStats > 0 && m_parameterSyncCounter % m_syncPerfStats == 0)
{
fromCPUToGPUTime = 0;
fromGPUToCPUTime = 0;
networkTime = 0;
swapTimeOnGPU = 0;
}
m_bufferIndexInUse = m_bufferSwapIndex[m_bufferIndexInUse];
int i = 0; // indicate the index of learnable nodes
if (m_useAsyncBuffer)
{
m_reportTimer.Restart();
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();
#ifndef CPUONLY
// CNTK model -> GPU buffer
CUDA_CALL(cudaMemcpy(m_gpuAsyncBuffer[m_bufferIndexInUse][i].Data(),
mat.Data(),
mat.GetNumElements() * sizeof(ElemType),
cudaMemcpyDeviceToDevice));
// GPU buffer -> CNTK model
CUDA_CALL(cudaMemcpy(mat.Data(),
m_gpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]][i].Data(),
mat.GetNumElements() * sizeof(ElemType),
cudaMemcpyDeviceToDevice));
#else
ElemType * px = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[i];
mat.CopyToArray(px, m_tableLength[i]);
ElemType * py = m_cpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]] + m_tableOffsets[i];
mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), py);
delete px;
#endif
}
m_reportTimer.Stop();
if (m_traceLevel > 2)
{
swapTimeOnGPU = m_reportTimer.ElapsedSeconds();
}
#ifndef CPUONLY
m_aysncBufferThread = new thread([&]()
{
float factor = DecayCoefficient();
int deviceId = m_gpuAsyncBuffer[m_bufferIndexInUse][0].GetDeviceId();
CUDA_CALL(cudaSetDevice(deviceId));
Timer threadTimer;
threadTimer.Restart();
for (int widx = 0; widx < m_tableCount; widx++)
{
ElemType * px = m_deltaArray + m_tableOffsets[widx];
// GPU buffer -> CPU buffer
CUDA_CALL(cudaMemcpyAsync(px,
m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
cudaMemcpyDeviceToHost,
_commStream));
}
// wait until the copy from GPU to CPU has finished
CUDA_CALL(cudaStreamSynchronize(_commStream));
threadTimer.Stop();
if (m_traceLevel > 3)
{
double time = threadTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
}
// delta = gradient * learning_rate
std::transform(m_cpuAsyncBuffer[m_bufferIndexInUse],
m_cpuAsyncBuffer[m_bufferIndexInUse] + m_totalModelSize,
m_deltaArray, m_deltaArray,
std::minus<ElemType>());
threadTimer.Restart();
// lr decay
std::transform(m_deltaArray,
m_deltaArray + m_totalModelSize,
m_deltaArray,
std::bind1st(std::multiplies<ElemType>(), factor));
ElemType* px = m_deltaArray;
ElemType* py = m_cpuAsyncBuffer[m_bufferIndexInUse];
m_workerArray->AddAsync(px, m_totalModelSize);
m_workerArray->Get(py, m_totalModelSize);
threadTimer.Stop();
if (m_traceLevel > 3)
{
double time = threadTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
}
threadTimer.Restart();
// copy parameters from CPU buffer to GPU buffer
for (int widx = 0; widx < m_tableCount; widx++)
{
ElemType * py = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[widx];
CUDA_CALL(cudaMemcpyAsync(m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
py,
m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
cudaMemcpyHostToDevice,
_commStream));
}
CUDA_CALL(cudaStreamSynchronize(_commStream));
threadTimer.Stop();
if (m_traceLevel > 3)
{
double time = threadTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
}
});
#else
m_aysncBufferThread = new thread([&]()
{
float factor = DecayCoefficient();
int t_cacheIdx = m_bufferIndexInUse;
std::transform(m_cpuAsyncBuffer[t_cacheIdx], m_cpuAsyncBuffer[t_cacheIdx] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());
std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
ElemType* px = m_deltaArray;
ElemType* py = m_cpuAsyncBuffer[t_cacheIdx];
m_workerArray->AddAsync(px, m_totalModelSize);
m_workerArray->Get(py, m_totalModelSize);
});
#endif
}
else
{
m_reportTimer.Restart();
float factor = DecayCoefficient();
i = 0;
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();
ElemType * px = m_deltaArray + m_tableOffsets[i];
mat.CopyToArray(px, m_tableLength[i]);
}
m_reportTimer.Stop();
if (m_traceLevel > 3)
{
double time = m_reportTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
}
std::transform(m_cpuAsyncBuffer[0], m_cpuAsyncBuffer[0] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());
// lr decay
if (m_ModelAveragingSGDSimulating)
{
factor = ModelAggregationCoefficient(sampleSinceLastSynced);
std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
if (m_traceLevel > 2 && m_syncPerfStats != 0)
{
if (m_parameterSyncCounter % m_syncPerfStats == 0)
ReportPerfStats(m_totalClientNumber * m_sampleSinceLastReport, m_sampleSinceLastReport);
else
m_sampleSinceLastReport += sampleSinceLastSynced;
}
}
else
{
std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
}
m_reportTimer.Restart();
ElemType* px = m_deltaArray;
ElemType* py = m_cpuAsyncBuffer[0];
m_workerArray->AddAsync(px, m_totalModelSize);
m_workerArray->Get(py, m_totalModelSize);
m_reportTimer.Stop();
if (m_traceLevel > 3)
{
double time = m_reportTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
}
m_reportTimer.Restart();
i = 0;
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();
ElemType * px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), px);
}
m_reportTimer.Stop();
if (m_traceLevel > 3)
{
double time = m_reportTimer.ElapsedSeconds();
fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
}
}
return true;
}
void WaitAll() override
{
multiverso::MV_Barrier();
}
void WaitAsyncBuffer() override
{
if (m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
{
m_aysncBufferThread->join();
delete m_aysncBufferThread;
m_aysncBufferThread = nullptr;
}
}
private:
void MultiversoInit(const std::list<ComputationNodeBasePtr> & learnableNodes)
{
// The parameter server offers a variety of updaters; we only use the SGD updater for this simple case.
multiverso::SetCMDFlag<std::string>(std::string("updater_type"), std::string("sgd"));
multiverso::MV_Init();
int i = 0;
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
Matrix<ElemType> &mat = node->Value();
size_t layerSize = mat.GetNumElements();
m_tableLength.push_back(layerSize);
}
m_tableCount = m_tableLength.size();
// calculate the total size of the learnable nodes
m_totalModelSize = accumulate(m_tableLength.begin(), m_tableLength.end(), 0);
m_serverArray = new multiverso::ArrayServer<ElemType>(m_totalModelSize);
m_workerArray = new multiverso::ArrayWorker<ElemType>(m_totalModelSize);
multiverso::MV_Barrier();
size_t idx = 0;
for (size_t len : m_tableLength)
{
m_tableOffsets.push_back(idx);
idx += len;
}
#ifndef CPUONLY
for (int i = 0; i < m_localBufferNum; i++)
m_gpuAsyncBuffer[i].reserve(m_tableCount);
// create pinned memory
for (int i = 0; i < m_localBufferNum; ++i)
CUDA_CALL(cudaMallocHost((void **)&m_cpuAsyncBuffer[i], sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));
CUDA_CALL(cudaMallocHost((void **)&m_deltaArray, sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));
#else
for (int i = 0; i < m_localBufferNum; i++)
m_cpuAsyncBuffer[i] = new ElemType[m_totalModelSize];
#endif
}
float DecayCoefficient()
{
float f = 1.f;
switch (m_adjustLearningRateAtBeginningType)
{
case AdjustLearningRateAtBeginning::None:
break;
case AdjustLearningRateAtBeginning::Linearly:
f = min(f, max(0.f, (float)(m_adjustCoefficient + (1 - m_adjustCoefficient) / m_adjustMBNumber * m_parameterSyncCounter)));
break;
case AdjustLearningRateAtBeginning::Staircase:
f = min(f, max(0.f, (float)(m_adjustCoefficient * (m_parameterSyncCounter / m_adjustMBNumber + 1))));
break;
default:
break;
}
return f;
}
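// Illustrative note (not in the original source): with the defaults adjustCoefficient = 0.2 and
// adjustPerMinibatches = 600, Linearly gives f = 0.2 + (0.8 / 600) * m_parameterSyncCounter, so f
// starts at 0.2 and reaches 1.0 after 600 syncs; Staircase gives f = 0.2 for the first 600 syncs,
// 0.4 for the next 600, and so on, capped at 1.0.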
float ModelAggregationCoefficient(size_t samplesSinceLastSync)
{
float factor = 0;
int nTotalSamples = samplesSinceLastSync;
// m_pMPI->AllReduce(&nTotalSamples, 1);
if (nTotalSamples <= 0)
{
factor = 1.0f / m_pMPI->NumNodesInUse();
// give an estimated one
}
else
{
factor = (samplesSinceLastSync + 0.0f) / nTotalSamples;
}
factor = 1.0f / m_pMPI->NumNodesInUse();
return factor;
}
inline void transpose(ElemType *src, ElemType *dst, const int N, const int M)
{
for (auto n = 0; n < N*M; n++) {
auto i = n / N;
auto j = n%N;
dst[n] = src[M*j + i];
}
}
void ReportPerfStats(size_t totalSamplesProcessedSinceLastReport,
size_t localSamplesProcessedSinceLastReport)
{
m_reportTimer.Stop();
double secondsSinceLastReport = m_reportTimer.ElapsedSeconds();
m_reportTimer.Restart();
float totalThroughput = secondsSinceLastReport > 0 ? (float)totalSamplesProcessedSinceLastReport / ((float)secondsSinceLastReport * 1000.0f) : 0.0f;
float throughputPerWorker = totalThroughput / m_totalClientNumber;
string prefix = "\t\t(sim-model aggregation stats) %d-th sync: %8.2f seconds since last report ; %d samples processed by %d workers (%d by me);\n"
"\t\t(sim-model aggregation stats) %d-th sync: totalThroughput = %.2fk samplesPerSecond , throughputPerWorker = %.2fk samplesPerSecond\n";
fprintf(stderr, prefix.c_str(), (int)m_parameterSyncCounter, secondsSinceLastReport, (int)totalSamplesProcessedSinceLastReport, (int)m_totalClientNumber, (int)localSamplesProcessedSinceLastReport,
(int)m_parameterSyncCounter, totalThroughput, throughputPerWorker);
m_sampleSinceLastReport = 0;
}
multiverso::ArrayServer<ElemType>* m_serverArray;
multiverso::ArrayWorker<ElemType>* m_workerArray;
thread * m_aysncBufferThread;
bool m_doesEveryNodesShouldSynced;
bool m_ModelAveragingSGDSimulating;
int m_totalClientNumber;
int m_traceLevel;
int m_syncPerfStats;
Timer m_reportTimer;
size_t m_parameterSyncCounter;
size_t m_sampleSinceLastReport;
bool m_useAsyncBuffer;
int m_localBufferNum;
int * m_bufferSwapIndex;
int m_bufferIndexInUse;
std::vector<multiverso::GetOption*> m_getOptions; // used by sparse table
std::vector<multiverso::AddOption*> m_addOptions; // used by sparse table
AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginningType;
double m_adjustCoefficient;
size_t m_adjustMBNumber;
vector<size_t> m_tableLength;
size_t m_totalModelSize;
vector<size_t> m_tableOffsets;
//shared_ptr<ElemType> m_deltaArray;
ElemType * m_deltaArray;
//std::vector<shared_ptr<ElemType> > m_cpuAsyncBuffer;
ElemType ** m_cpuAsyncBuffer;
MPIWrapperPtr m_pMPI;
// GPU double buffer
std::vector<std::vector<Matrix<ElemType> >> m_gpuAsyncBuffer;
int m_tableCount;
#ifndef CPUONLY
cudaStream_t _commStream;
#endif
}; // Class MultiversoHelper
#endif
// A no-op implementation of the ASGDHelper interface which does nothing
// This is used when CNTK_ENABLE_ASGD = false
template<class ElemType = float>
class NoneASGDHelper : public ASGDHelper<ElemType>
{
public:
NoneASGDHelper(const std::list<ComputationNodeBasePtr> & learnableNodes,
int nodeNumRanks,
bool useAsyncBuffer = true,
bool isSimModelAveragingSGD = false,
AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None,
double adjustcoef = 0.2,
size_t adjustnbmb = 600,
int traceLevel = 0,
int syncPerfStats = 0,
const MPIWrapperPtr& pMPI = nullptr) { }
~NoneASGDHelper() { }
void InitModel(const std::list<ComputationNodeBasePtr> & learnableNode) override { }
bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override {
return true;
}
void WaitAll() override { }
void WaitAsyncBuffer() override { }
};
template<class ElemType>
ASGDHelper<ElemType>* NewASGDHelper(
const std::list<ComputationNodeBasePtr> & learnableNodes, // Parameters that need to be trained
size_t nodeNumRanks, // Number of working nodes
bool useAsyncBuffer, // Use an asynchronous buffer to hide communication cost
bool isSimulatedModelAveragingSGD,
AdjustLearningRateAtBeginning adjusttype,
double adjustCoef,
size_t adjustPerMinibatches,
int traceLevel,
int syncPerfStats)
{
#ifdef ASGD_PARALLEL_SUPPORT
return new MultiversoHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#else
return new NoneASGDHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#endif
}
template ASGDHelper<float>* NewASGDHelper<float>(
const std::list<ComputationNodeBasePtr> & learnableNodes,
size_t nodeNumRanks,
bool useAsyncBuffer,
bool isSimulatedModelAveragingSGD,
AdjustLearningRateAtBeginning adjusttype,
double adjustCoef,
size_t adjustPerMinibatches,
int traceLevel,
int syncPerfStats);
template ASGDHelper<double>* NewASGDHelper<double>(
const std::list<ComputationNodeBasePtr> & learnableNodes,
size_t nodeNumRanks,
bool useAsyncBuffer,
bool isSimulatedModelAveragingSGD,
AdjustLearningRateAtBeginning adjusttype,
double adjustCoef,
size_t adjustPerMinibatches,
int traceLevel,
int syncPerfStats);
}}}

View file

@ -25,6 +25,8 @@
#include "V2AllReduceDistGradAggregator.h"
#endif
#include "ASGDHelper.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"
#include "ProgressTracing.h"
@ -403,15 +405,27 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR);
}
// Multiverso wrapper for ASGD logic init
if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
{
m_pASGDHelper.reset(NewASGDHelper<ElemType>(learnableNodes,
m_mpi->NumNodesInUse(),
m_isAsyncBufferEnabled,
m_isSimulateMA,
m_adjustLearningRateAtBeginning,
m_adjustCoefficient,
m_adjustPerMinibatches,
m_traceLevel,
m_syncStatsTrace));
m_pASGDHelper->InitModel(learnableNodes);
}
// --- MAIN EPOCH LOOP
for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
{
// Synchronize all ranks before proceeding to ensure that
// rank 0 has finished writing the previous model file
if (m_mpi != nullptr)
{
m_mpi->WaitAll();
}
BarrierWorkers();
// (re-)initialize 1-bit SGD
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
@ -575,7 +589,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
{
SimpleEvaluator<ElemType> evalforvalidation(net, m_mpi, m_enableDistributedMBReading);
// TODO(dataASGD): make the evaluator non-distributed when using ASGD, since Multiverso has another background thread using MPI.
// Making the evaluation serial (non-distributed) will slow down training, especially when the validation set is large.
SimpleEvaluator<ElemType> evalforvalidation(net, UsingAsyncGradientAggregation(i + 1) ? nullptr : m_mpi, m_enableDistributedMBReading);
vector<wstring> cvSetTrainAndEvalNodes;
if (criterionNodes.size() > 0)
{
@ -712,10 +728,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Synchronize all ranks before proceeding to ensure that
// nobody tries reading the checkpoint file at the same time
// as rank 0 deleting it below
if (m_mpi != nullptr)
{
m_mpi->WaitAll();
}
BarrierWorkers();
// Persist model and check-point info
if ((m_mpi == nullptr) || m_mpi->IsMainNode())
@ -783,10 +796,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Synchronize all ranks before proceeding to ensure that
// rank 0 has finished writing the model file
if (m_mpi != nullptr)
{
m_mpi->WaitAll();
}
// TODO[DataASGD]: other ranks should wait here in async mode
BarrierWorkers();
// progress tracing for compute cluster management
ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
@ -803,6 +814,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
}
delete inputMatrices;
if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
m_pASGDHelper.reset();
}
// -----------------------------------------------------------------------
@ -846,6 +859,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
bool useGradientAggregation = UsingGradientAggregation(epochNumber);
bool useModelAggregation = UsingModelAggregation(epochNumber);
bool useAsyncGradientAggregation = UsingAsyncGradientAggregation(epochNumber);
bool useParallelTrain = UsingParallelTrain(epochNumber);
// Find all evaluation nodes that accumulate error on their own.
@ -981,6 +995,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
double readTime = 0;
double computeTime = 0;
double parameterUpdateTime = 0;
double parameterSyncTime = 0; // communication time spent on parameter synchronization
if (m_perfTraceLevel > 0)
fineGrainedPerfMeasurementTimer.Start();
@ -1241,15 +1256,14 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
}
if (m_perfTraceLevel > 0)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(net->GetDeviceId()));
mainStreamSyncEvent->SynchronizeEvent();
fineGrainedPerfMeasurementTimer.Stop();
parameterUpdateTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
PREPENDTS(stderr);
fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs, Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, (int)aggregateNumSamples);
fineGrainedPerfMeasurementTimer.Start();
}
// aggregation by model averaging or block momentum
@ -1270,11 +1284,38 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
}
timer.Stop();
numMBsRun++;
// using parameter server for parameter update
if (useAsyncGradientAggregation && m_mpi->NumNodesInUse() > 1)
{
// Determine if any samples were processed across any of the ranks
if (useDistributedMBReading)
{
noMoreSamplesToProcess = !wasDataRead;
}
if (nSamplesSinceLastModelSync >= m_nFramesBetweenASGDSync[epochNumber])
{
m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
nSamplesSinceLastModelSync = 0;
}
}
if (m_perfTraceLevel > 0)
{
fineGrainedPerfMeasurementTimer.Stop();
parameterSyncTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
}
timer.Stop();
if (m_perfTraceLevel > 0)
{
PREPENDTS(stderr);
fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs; Parameter sync = %.5gs; Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, parameterSyncTime, (int)aggregateNumSamples);
}
numMBsRun++;
totalTimeInMBs += timer.ElapsedSeconds();
//trainSamplesSinceLastLogged += (int)aggregateNumSamplesWithLabel; // now inside epochCriterionLastLogged
// log
// This shows the criterion since last logged.
@ -1404,6 +1445,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
nSamplesSinceLastModelSync = 0;
}
if (useAsyncGradientAggregation && (m_mpi->NumNodesInUse() > 1))
{
m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
nSamplesSinceLastModelSync = 0;
}
// hoist the accumulated criterion value from GPU side to our 'out' variables
// (unless we useGradientAggregation, in which case they are accumulated in the 'out' variables directly)
if (!useGradientAggregation)
@ -2555,7 +2602,8 @@ static ParallelizationMethod ParseParallelizationMethod(const wstring& s)
else if (EqualCI(s, L"DataParallelSGD")) return ParallelizationMethod::dataParallelSGD;
else if (EqualCI(s, L"ModelAveragingSGD")) return ParallelizationMethod::modelAveragingSGD;
else if (EqualCI(s, L"BlockMomentumSGD")) return ParallelizationMethod::blockMomentumSGD;
else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD)");
else if (EqualCI(s, L"dataParallelASGD")) return ParallelizationMethod::dataParallelASGD;
else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD | dataParallelASGD)");
}
static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
@ -2569,6 +2617,16 @@ static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
else InvalidArgument("autoAdjustLR: Invalid learning rate search type. Valid values are (none | searchBeforeEpoch | adjustAfterEpoch)");
}
#ifdef ASGD_PARALLEL_SUPPORT
static AdjustLearningRateAtBeginning AdjustLearningRateAtBeginningType(const wstring& s)
{
if (EqualCI(s.c_str(), L"") || EqualCI(s.c_str(), L"none")) return AdjustLearningRateAtBeginning::None;
else if (EqualCI(s.c_str(), L"linearly")) return AdjustLearningRateAtBeginning::Linearly;
else if (EqualCI(s.c_str(), L"staircase")) return AdjustLearningRateAtBeginning::Staircase;
else InvalidArgument("AdjustLearningRateatBeginningType: Invalid Type. Valid values are (None | Linearly | Staircase)");
}
#endif
template<class ConfigRecordType>
SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
{
@ -2938,7 +2996,26 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
}
#endif
InitializeAndCheckBlockMomentumSGDParameters();
}
if (configParallelTrain.Exists(L"DataParallelASGD"))
{
#ifndef ASGD_PARALLEL_SUPPORT
InvalidArgument("DataParallelASGD is not enabled in this version.\n");
#else
const ConfigRecordType & configDataParallelASGD(configParallelTrain(L"DataParallelASGD", ConfigRecordType::Record()));
m_nFramesBetweenASGDSync = configDataParallelASGD(L"syncPeriod", ConfigRecordType::Array(intargvector(vector<int>{256})));
m_isAsyncBufferEnabled = configDataParallelASGD(L"UsePipeline", false);
m_isSimulateMA = configDataParallelASGD(L"SimModelAverage", false); // use the parameter-server-based version of ModelAveragingSGD
if (configDataParallelASGD.Exists(L"AdjustLearningRateAtBeginning")) // adjust the learning rate per m_adjustNumInBatch minibatches until it reaches the original value;
// this option can be used to tackle the instability of ASGD
{
const ConfigRecordType & configAdjustLearningRateAtBeginning(configDataParallelASGD(L"AdjustLearningRateAtBeginning", ConfigRecordType::Record()));
m_adjustLearningRateAtBeginning = AdjustLearningRateAtBeginningType(configAdjustLearningRateAtBeginning(L"adjustType", L"None"));
m_adjustCoefficient = configAdjustLearningRateAtBeginning(L"adjustCoefficient", (double)0.1);
m_adjustPerMinibatches = configAdjustLearningRateAtBeginning(L"adjustPerMinibatches", (size_t)256);
}
#endif
}
} // if (!pMPI)
} // if (configSGD.Exists(L"ParallelTrain"))

View file

@ -19,7 +19,7 @@
#include <random>
#include "Profiler.h"
#include "MASGD.h"
#include "ASGDHelper.h"
using namespace std; // ugh! TODO: get rid of this from .h files!!!
#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
@ -60,6 +60,7 @@ enum class ParallelizationMethod : int
dataParallelSGD = 1,
modelAveragingSGD = 2,
blockMomentumSGD = 3,
dataParallelASGD = 4,
modelParallelSGD = (1 << 8) // Currently unsupported
};
@ -286,6 +287,14 @@ protected:
double m_L2RegWeight;
double m_L1RegWeight;
// Parallel training options related to ASGD
intargvector m_nFramesBetweenASGDSync;
bool m_isAsyncBufferEnabled;
bool m_isSimulateMA;
AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginning;
double m_adjustCoefficient;
size_t m_adjustPerMinibatches;
// sequence training
double m_hSmoothingWeight;
double m_frameDropThresh;
@ -564,20 +573,41 @@ protected:
private:
void MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNetworkPtr& net, const ComputationNodeBasePtr& criterionNode);
std::shared_ptr<ASGDHelper<ElemType>> m_pASGDHelper;
bool UsingGradientAggregation(size_t epochNumber) const
{
return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD) && (epochNumber >= m_parallelizationStartEpochNum));
}
bool UsingModelAggregation(size_t epochNumber) const
{
return ((GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) &&
(epochNumber >= m_parallelizationStartEpochNum));
}
bool UsingParallelTrain(size_t epochNumber) const
bool UsingAsyncGradientAggregation(size_t epochNumber)
{
return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber);
return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD) && (epochNumber >= m_parallelizationStartEpochNum));
}
bool UsingParallelTrain(size_t epochNumber)
{
return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber) || UsingAsyncGradientAggregation(epochNumber);
}
void BarrierWorkers()
{
if (m_mpi != nullptr && GetParallelizationMethod() != ParallelizationMethod::dataParallelASGD)
{
m_mpi->WaitAll();
}
if (m_mpi != nullptr && GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD)
{
m_pASGDHelper->WaitAll();
}
return;
}
};

View file

@ -43,10 +43,12 @@
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\CNTKv2LibraryDll;$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_ASGD)'!='false'">$(SolutionDir)Source\multiverso\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PrecompiledHeader>
</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'!='false'">ASGD_PARALLEL_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
@ -101,6 +103,7 @@
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\Config.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\ASGDHelper.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\DataWriter.h" />
<ClInclude Include="..\Common\Include\File.h" />
@ -138,6 +141,7 @@
<ClInclude Include="V2SimpleDistGradAggregator.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="ASGDHelper.cpp" />
<ClCompile Include="PostComputingActions.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />

Просмотреть файл

@ -13,6 +13,9 @@
<ClCompile Include="PostComputingActions.cpp">
<Filter>Stat</Filter>
</ClCompile>
<ClCompile Include="ASGDHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@ -135,6 +138,9 @@
<ClInclude Include="V2SimpleDistGradAggregator.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\ASGDHelper.h">
<Filter>Parallelization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

0
Tests/EndToEndTests/Image/AlexNet/Composite/run-test Normal file → Executable file
Просмотреть файл

Просмотреть файл

@ -0,0 +1,147 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
makeMode = true
RootDir = "."
configName = "ssgd"
minibatch = 128
epochSize = 5
parallelizationMethod = "DataParallelSGD"
asyncBuffer = "true"
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output-$configName$"
ModelDir = "$OutputDir$/Models"
ndlMacros = "$ConfigDir$/Macros.ndl"
precision = "float"
DeviceId = "auto"
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true
prefetch = "true"
parallelTrain = "false"
command = Train
stderr = "$OutputDir$/03_ResNet"
traceLevel = 1
Proj16to32Filename = "$ConfigDir$/16to32.txt"
Proj32to64Filename = "$ConfigDir$/32to64.txt"
Train = [
action = "train"
modelPath = "$ModelDir$/03_ResNet"
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/03_ResNet.ndl"
]
SGD = [
epochSize = 0
minibatchSize = $minibatch$
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerSample = 0.004*80:0.0004*40:0.00004
momentumPerMB = 0
maxEpochs = $epochsize$
L2RegWeight = 0.0001
dropoutRate = 0
perfTraceLevel = 0
firstMBsToShowResult = 1
numMBsToShowResult = 10
ParallelTrain = [
parallelizationMethod = $parallelizationMethod$
distributedMBReading = "true"
parallelizationStartEpoch = 1
DataParallelSGD = [
gradientBits = 32
useBufferedAsyncGradientAggregation = $asyncBuffer$
]
ModelAveragingSGD = [
blockSizePerWorker = 128
]
DataParallelASGD = [
syncPeriod = 128
usePipeline = $asyncBuffer$
]
]
]
reader = [
readerType = "ImageReader"
file = "$DataDir$/train_map.txt"
randomize = "auto"
features = [
width = 32
height = 32
channels = 3
cropType = "random"
cropRatio = 0.8
jitterType = "uniRatio"
interpolations = "linear"
meanFile = "$DataDir$/CIFAR-10_mean.xml"
]
labels = [
labelDim = 10
]
]
cvReader = [
readerType = "ImageReader"
file = "$DataDir$/test_map.txt"
randomize = "none"
features = [
width = 32
height = 32
channels = 3
cropType = "center"
cropRatio = 1
jitterType = "uniRatio"
interpolations = "linear"
meanFile = "$DataDir$/CIFAR-10_mean.xml"
]
labels = [
labelDim = 10
]
]
]
Test = [
action = "test"
modelPath = "$ModelDir$/03_ResNet"
# Set minibatch size for testing.
minibatchSize = 256
reader = [
readerType = "ImageReader"
file = "$DataDir$/cifar-10-batches-py/test_map.txt"
randomize = "none"
features = [
width = 32
height = 32
channels = 3
cropType = "center"
cropRatio = 1
jitterType = "uniRatio"
interpolations = "linear"
meanFile = "$DataDir$/cifar-10-batches-py/CIFAR-10_mean.xml"
]
labels = [
labelDim = 10
]
]
]
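Note how this configuration is parameterized through the variables defined at its top (configName, minibatch, epochSize, parallelizationMethod, asyncBuffer): the parallel end-to-end tests further down reuse this single file and select plain DataParallelSGD, ModelAveragingSGD, or DataParallelASGD purely by overriding those variables on the cntk command line, as the DataParallelASGD run-test below does with parallelizationMethod=DataParallelASGD, minibatch=512 and asyncBuffer="false".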

Просмотреть файл

@ -0,0 +1,67 @@
load=LocalMacros
run=DNN
LocalMacros = [
ImageW = 32
ImageH = 32
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
labels = Input(LabelDim, tag = label)
convWScale = 7.07
convBValue = 0
fc1WScale = 0.4
fc1BValue = 0
scValue = 1
# Batch normalization time constant.
bnTimeConst = 4096
kW = 3
kH = 3
hStride1 = 1
vStride1 = 1
]
DNN=[
conv1WScale = 0.26
cMap1 = 16
conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, bnTimeConst)
rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
# Global average pooling
poolW = 8
poolH = 8
poolhStride = 1
poolvStride = 1
pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = $imageLayout$)
ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ClassificationError(labels, ol, tag = Eval)
OutputNodes = ol
]

Просмотреть файл

@ -0,0 +1,32 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Просмотреть файл

@ -0,0 +1,64 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
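Both projection files are just zero-padded identity matrices: 16to32.txt holds the 32x16 weights for rn2_1_Wproj and 32to64.txt the 64x32 weights for rn3_1_Wproj, i.e. an identity block on top and zero rows below, used as fixed 1x1-convolution weights where the ResNet doubles its channel count. A small sketch that would regenerate them (the generator is not part of the commit; output matches the files above up to whitespace):

#include <fstream>

// Write `rows` lines of `cols` space-separated values: 1 on the diagonal of the
// top `cols` rows, 0 everywhere else.
static void WriteProjection(const char* path, int rows, int cols)
{
    std::ofstream out(path);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            out << (r == c ? 1 : 0) << (c + 1 < cols ? ' ' : '\n');
}

int main()
{
    WriteProjection("16to32.txt", 32, 16);  // rn2_1_Wproj: Parameter(32, 16)
    WriteProjection("32to64.txt", 64, 32);  // rn3_1_Wproj: Parameter(64, 32)
    return 0;
}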

Просмотреть файл

@ -0,0 +1,148 @@
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
p = Plus(c, b)
y = RectifiedLinear(p)
]
ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
p = Plus(c, b)
y = RectifiedLinear(p)
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]
ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
p = Plus(c2, inp)
y = RectifiedLinear(p)
]
ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst, Wproj)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
#c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)
p = Plus(c2, c_proj)
y = RectifiedLinear(p)
]
ResNetNode2Inc2(inp, inMap, outMap, inWCount, wCount, kW, kH, wScale, w1Scale, bValue, scValue, bnTimeConst)
[
pool = MaxPooling(inp, 1, 1, 2, 2, imageLayout = $imageLayout$)
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)
c3 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)
p = Plus(c2, pool)
r = RowStack(p, c3)
y = RectifiedLinear(r)
]
DnnReLULayer(inDim, outDim, x, wScale, bValue)
[
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
y = RectifiedLinear(z)
]
DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue)
[
W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
y = RectifiedLinear(z)
]
DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
[
W = LearnableParameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
]
DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
]

File diff not shown because it is too large.

Просмотреть файл

Просмотреть файл

@ -0,0 +1,23 @@
#!/bin/bash
. $TEST_DIR/run-test-common
#dataDir="."
ConfigDir=$TEST_DIR
LogFileName="ASGDMultiGPU"
Instances=4
NumCPUThreads=$(threadsPerInstance $Instances)
parallelizationMethod="DataParallelASGD"
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=512 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3
# Delete the test data if copied
[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir"
exit $ExitCode

Просмотреть файл

@ -0,0 +1,35 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
export MKL_NUM_THREADS=4
export MKL_CBWR=COMPATIBLE
export OMP_NUM_THREADS=1
ConfigDir=$TEST_DIR
if [[ ! -d $TEST_DATA_DIR || ! -e $TEST_DATA_DIR/Train_cntk_text.txt || ! -e $TEST_DATA_DIR/train_map.txt ]]; then
# Cannot find test data locally.
# Try external test data directory (not part of the CNTK repository) as an alternative.
if [[ -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then
if [ "$OS" == "Windows_NT" ]; then
DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/CIFAR/v0
else
DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/CIFAR/v0
fi
# Copy the test data to the test run directory
DataDir=$TEST_RUN_DIR/TestData
mkdir $DataDir
mkdir $DataDir/cifar-10-batches-py
cp -R $DataSourceDir/*_cntk_text.txt $DataDir || exit $?
cp -R $DataSourceDir/cifar-10-batches-py/data.zip $DataDir/cifar-10-batches-py || exit $?
cp -R $DataSourceDir/cifar-10-batches-py/CIFAR-10_mean.xml $DataDir || exit $?
cp -R $DataSourceDir/cifar-10-batches-py/*_map.txt $DataDir || exit $?
Copied=1
else
echo Error: cannot find data. Please see Examples/Image/DataSets/CIFAR10/README.md for instructions to get it.
exit 1
fi
fi

Просмотреть файл

@ -0,0 +1,31 @@
dataDir: .
tags:
# running on every BVT job in 'P' (Parallel) leg in Debug-GPU on Linux configurations:
# TODO: Enable windows test when Jenkins ready
- bvt-p (build_sku == 'gpu') and (flavor=='debug') and (os == 'linux') and (device == 'gpu')
# running unconditionally on every Nightly job in 'P' leg
- nightly-p (build_sku == 'gpu') and (os == 'linux') and (device == 'gpu')
testCases:
Must train epochs in exactly same order and parameters for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
Epochs must be finished with expected results for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Finished Epoch[{{integer}} of {{integer}}]
Per-minibatch training results must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
- " * {{integer}}; "
DataParallelASGD training parameters must match for each MPI Rank:
patterns:
- ^MPI Rank {{integer}}
- Starting minibatch loop

0
Tests/EndToEndTests/PyTest/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/ScriptTest/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/Speech/DNN/ParallelBM/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/Speech/DNN/PlotDNN/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/Text/SLU/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/UnitTests/EvalTests/run-test Normal file → Executable file
Просмотреть файл

Просмотреть файл

0
Tests/EndToEndTests/UnitTests/MathTests/run-test Normal file → Executable file
Просмотреть файл

Просмотреть файл

@ -0,0 +1,115 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz
Hardware threads: 40
Total Memory: 264118516 kB
-------------------------------------------------------------------
Running 8 test cases...
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 4 elapse = 3.26092ms average = 0.81523ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 5 elapse = 0.035872ms average = 0.0071744ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 4 elapse = 0.079631ms average = 0.0199077ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 5 elapse = 0.055307ms average = 0.0110614ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 5 elapse = 0.014141ms average = 0.0028282ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 3 elapse = 3.64047ms average = 1.21349ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 4 elapse = 0.35004ms average = 0.08751ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a sync server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 6 elapse = 3.35131ms average = 0.558552ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 7 elapse = 0.049346ms average = 0.00704943ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 6 elapse = 0.110051ms average = 0.0183418ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 7 elapse = 0.068958ms average = 0.00985114ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 7 elapse = 0.018843ms average = 0.00269186ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 4 elapse = 3.83984ms average = 0.959961ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 5 elapse = 0.420295ms average = 0.084059ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
Test module "multiverso" has passed with:
8 test cases out of 8 passed
83 assertions out of 83 passed
Test suite "array_test" has passed with:
2 test cases out of 2 passed
34 assertions out of 34 passed
Test case "array_test/array_access" has passed with:
20 assertions out of 20 passed
Test case "array_test/array_partition" has passed with:
14 assertions out of 14 passed
Test suite "blob" has passed with:
2 test cases out of 2 passed
7 assertions out of 7 passed
Test case "blob/blob_constructor_test" has passed with:
3 assertions out of 3 passed
Test case "blob/blob_access_test" has passed with:
4 assertions out of 4 passed
Test suite "test_kv" has passed with:
1 test case out of 1 passed
3 assertions out of 3 passed
Test case "test_kv/access" has passed with:
3 assertions out of 3 passed
Test suite "message" has passed with:
1 test case out of 1 passed
11 assertions out of 11 passed
Test case "message/message_access" has passed with:
11 assertions out of 11 passed
Test suite "node" has passed with:
1 test case out of 1 passed
8 assertions out of 8 passed
Test case "node/node_role" has passed with:
8 assertions out of 8 passed
Test suite "test_sync" has passed with:
1 test case out of 1 passed
20 assertions out of 20 passed
Test case "test_sync/sync" has passed with:
20 assertions out of 20 passed

Просмотреть файл

@ -0,0 +1,6 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
. $TEST_DIR/../run-boost-test-common
boosttestrun multiversotests

Просмотреть файл

@ -0,0 +1,19 @@
dataDir: .
tags:
# CPU only, at this stage.
# TODO move from l to separate leg, requires infra changes
- bvt-l (build_sku == 'cpu') or (build_sku == '1bitsgd')
- nightly-l (build_sku == 'cpu') or (build_sku == '1bitsgd')
testCases:
Test cases pass:
patterns:
- "Test case"
- "passed with"
Test suites pass:
patterns:
- "Test suite"
- "passed with"

0
Tests/EndToEndTests/UnitTests/NetworkTests/run-test Normal file → Executable file
Просмотреть файл

0
Tests/EndToEndTests/UnitTests/ReaderTests/run-test Normal file → Executable file
Просмотреть файл

Просмотреть файл

@ -43,7 +43,7 @@ checkEmptyStdout \
checkEmptyStdout \
"git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/1BitSGD" \
"git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/Multiverso | grep -v Source/1BitSGD" \
"files that are neither marked as binary nor text; should extend .gitattributes"
# TODO line ending checks

Просмотреть файл

@ -55,9 +55,10 @@ makebuildinfo()
local CUDA_PATH=$6
local CUB_PATH=$7
local WITH_1BITSGD=$8
local BUILDER=$9
local BUILDMACHINE=${10}
local BUILDPATH=${11}
local WITH_ASGD=$9
local BUILDER=${10}
local BUILDMACHINE=${11}
local BUILDPATH=${12}
(
printf "#ifndef _BUILDINFO_H\n"
@ -84,6 +85,11 @@ makebuildinfo()
else
printf "#define _WITH_1BITSGD_ \"no\"\n"
fi
if [ ! -z "$WITH_ASGD" ]; then
printf "#define _WITH_ASGD_ \"yes\"\n"
else
printf "#define _WITH_ASGD_ \"no\"\n"
fi
printf "#define _BUILDER_ \"%s\"\n" "$BUILDER"
printf "#define _BUILDMACHINE_ \"%s\"\n" "$BUILDMACHINE"
printf "#define _BUILDPATH_ \"%s\"\n" "$BUILDPATH"
@ -152,6 +158,7 @@ makebuildinfo \
"$CUDAPATH" \
"$CUBPATH" \
"$CNTK_ENABLE_1BitSGD" \
"$CNTK_ENABLE_ASGD" \
"$BUILDER" \
"$BUILDMACHINE" \
"$BUILDPATH"

19
configure vendored
Просмотреть файл

@ -90,6 +90,9 @@ enable_1bitsgd=$default_use_1bitsgd
default_use_code_coverage=no
enable_code_coverage=$default_use_code_coverage
default_use_asgd=yes
enable_asgd=$default_use_asgd
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local"
@ -322,6 +325,7 @@ function show_help ()
echo " --with-build-top=directory build directory $(show_default $build_top)"
echo " --add directory add directory to library search path"
echo " --1bitsgd[=(yes|no)] use 1Bit SGD $(show_default ${default_use_1bitsgd})"
echo " --asgd[=(yes|no)] use ASGD powered by Multiverso $(show_default $(default_use_asgd))"
echo " --cuda[=(yes|no)] use cuda GPU $(show_default $(default_use_cuda))"
echo " --python[=(yes|no)] with Python bindings $(show_default $(default_use_python))"
echo " --with-cuda[=directory] $(show_default $(find_cuda))"
@ -402,6 +406,17 @@ do
fi
;;
--asgd*)
if test x$optarg = xyes || test x$optarg = xno
then
enable_asgd=$optarg
else
echo "Invalid value for --asgd $optarg"
show_help
exit
fi
;;
--cuda*)
if test x$optarg = xyes || test x$optarg = xno
then
@ -1040,6 +1055,10 @@ if test x$protobuf_path != x; then
echo PROTOBUF_PATH=$protobuf_path >> $config
fi
if test $enable_asgd = yes ; then
echo CNTK_ENABLE_ASGD=true >> $config
fi
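For example, running ./configure with no --asgd flag (or with --asgd=yes) writes CNTK_ENABLE_ASGD=true into the generated configuration, which is what the build then keys off, analogous to the CNTK_ENABLE_ASGD conditions in the vcxproj shown earlier; ./configure --asgd=no leaves the variable unset, so the ASGD sources and the Multiverso dependency are presumably left out of the build.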
# If we are not in the configure directory, generate a trampoline Makefile
makefile=$build_top/Makefile
if test $(is_hardlinked "$configure" "$build_top/configure") = no