Integrate qiwye/asgd-dev into master
This commit is contained in:
Commit e618b917fe
.gitmodules
@@ -1,3 +1,6 @@
[submodule "Source/1BitSGD"]
	path = Source/1BitSGD
	url = https://git.codeplex.com/cntk1bitsgd
[submodule "Source/Multiverso"]
	path = Source/Multiverso
	url = https://github.com/Microsoft/Multiverso
CNTK.sln (59 changes)
@@ -137,6 +137,9 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
	ProjectSection(ProjectDependencies) = postProject
		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849}
	EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
	ProjectSection(SolutionItems) = preProject
@@ -1285,6 +1288,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryDistributionTests"
		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiverso\src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}"
	ProjectSection(ProjectDependencies) = postProject
		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
@@ -2245,6 +2252,56 @@ Global
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|Mixed Platforms.Build.0 = Release|x64
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Any CPU.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.Build.0 = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.Build.0 = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Any CPU.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.Build.0 = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.Build.0 = Release|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
@@ -2447,6 +2504,8 @@ Global
		{E844AB9A-A48F-4A99-9625-F528C5C46D83} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
		{CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
		{5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
	EndGlobalSection
EndGlobal
06_OneConvRegrMultiNode.cntk (new file)
@@ -0,0 +1,102 @@
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>

command = trainNetwork

precision = "float"; traceLevel = 1 ; deviceId = "auto"

rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
outputDir = "./Output" ;

modelPath = "$outputDir$/Models/06_OneConvRegrMultiNode"
#stderr = "$outputDir$/06_OneConvRegr_bs_out"

parallelizationMethod = DataParallelSGD

# TRAINING CONFIG
trainNetwork = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = 28:28:1    # image dimensions, 1 channel only
        labelDim = 10           # number of distinct labels
        featScale = 1/256
        Scale{f} = x => Constant(f) .* x

        model = Sequential (
            Scale {featScale} :
            ConvolutionalLayer {16, (5:5), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {64} : ReLU :
            LinearLayer {labelDim}
        )

        # inputs
        features = Input {imageShape}
        labels = Input {labelDim}

        # apply model to features
        z = model (features)

        # loss and error computation
        sqErr = SquareError (labels, z)
        rmse = Sqrt (sqErr / labelDim)

        # declare special nodes
        featureNodes    = (features)
        labelNodes      = (labels)
        criterionNodes  = (rmse)
        evaluationNodes = (rmse)
        outputNodes     = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 64
        maxEpochs = 15
        learningRatesPerSample = 0.001*5:0.0005
        momentumAsTimeConstant = 1024
        numMBsToShowResult = 500
        ParallelTrain = [
            parallelizationMethod = $parallelizationMethod$
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = [
                gradientBits = 32
            ]
            ModelAveragingSGD = [
                blockSizePerWorker = 64
            ]
            DataParallelASGD = [
                syncPeriod = 64
                usePipeline = false
            ]
        ]
    }

    reader = {
        readerType = "CNTKTextFormatReader"
        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
        file = "$DataDir$/Train-28x28_cntk_text.txt"
        input = {
            features = { dim = 784 ; format = "dense" }
            labels   = { dim = 10  ; format = "dense" }
        }
    }
}

# TEST CONFIG
testNetwork = {
    action = "test"
    minibatchSize = 1024    # reduce this if you run out of memory

    reader = {
        readerType = "CNTKTextFormatReader"
        file = "$DataDir$/Test-28x28_cntk_text.txt"
        input = {
            features = { dim = 784 ; format = "dense" }
            labels   = { dim = 10  ; format = "dense" }
        }
    }
}
README.md
@@ -101,3 +101,12 @@ In the fifth example, we show how CNTK can be used to perform a regression task.
`cntk configFile=05_OneConvRegr.cntk`

The trained network achieves a root-mean-square error (RMSE) of 0.0039. To see more sophisticated examples of regression tasks, please refer to [Regression](../Regression).

### 06_OneConvRegrMultiNode.cntk

In the sixth example, we show how to train CNTK with multiple processes (GPUs) on a regression task. CNTK uses MPI for multi-node tasks, and it currently supports four parallel SGD algorithms: DataParallelSGD, BlockMomentumSGD, ModelAveragingSGD, and DataParallelASGD. We reuse the network architecture of `05_OneConvRegr` and only add a parallel-training block. To run this example on a single machine, use the following command:

`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelSGD`

You can set parallelizationMethod to any of the other three options, as shown in the example below. For a more detailed guide on training with multiple GPUs and machines, please refer to [Multiple GPUs and machines](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
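For instance, to train with the asynchronous parameter-server method instead, the same invocation pattern should presumably work with only the method name changed (the config file above already carries a `DataParallelASGD` sub-block):

`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelASGD`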
Makefile (82 changes)
@@ -516,6 +516,7 @@ $(CNTKLIBRARY_DISTRIBUTION_TESTS): $(CNTKLIBRARY_DISTRIBUTION_TESTS_OBJ) | $(CNT
EVAL:=eval

SGDLIB_SRC=\
	$(SOURCEDIR)/SGDLib/ASGDHelper.cpp \
	$(SOURCEDIR)/SGDLib/Profiler.cpp \
	$(SOURCEDIR)/SGDLib/SGD.cpp \
	$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
@@ -551,7 +552,7 @@ $(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a
+	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) $(PROTOBUF_PATH)/lib/libprotobuf.a

########################################
# Eval Sample clients
@@ -570,7 +571,7 @@ $(EVAL_CLIENT): $(EVAL_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

EVAL_EXTENDED_CLIENT:=$(BINDIR)/cppevalextendedclient
@@ -586,7 +587,7 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_EXTENDED_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

########################################
# Eval V2 Sample client
@@ -893,6 +894,71 @@ endif
# temporarily adding to 1bit, need to work with others to fix it
endif


########################################
# ASGD (Multiverso) setup
########################################


ifeq ("$(CNTK_ENABLE_ASGD)","true")

ifeq (,$(wildcard Source/Multiverso/include/multiverso/*.h))
  $(error Build with Multiverso was requested but cannot find the code. Please check https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#24-data-parallel-asgd to learn more.)
endif

lMULTIVERSO:=-lmultiverso

INCLUDEPATH += $(SOURCEDIR)/Multiverso/include
COMMON_FLAGS += -DASGD_PARALLEL_SUPPORT

MULTIVERSO_LIB:=$(LIBDIR)/libmultiverso.so

ALL_LIBS+=$(MULTIVERSO_LIB)
ifeq ("$(BUILDTYPE)","release")
MULTIVERSO_CMAKE_BUILDTYPE=Release
endif
ifeq ("$(BUILDTYPE)","debug")
MULTIVERSO_CMAKE_BUILDTYPE=Debug
endif

$(MULTIVERSO_LIB):
	@echo "Build Multiverso lib"
	@mkdir -p $(LIBDIR)
	@mkdir -p $(BINDIR)
	@mkdir -p $(SOURCEDIR)/Multiverso/build/$(BUILDTYPE)
	@cmake -DCMAKE_VERBOSE_MAKEFILE=TRUE \
		-DBoost_NO_BOOST_CMAKE=TRUE \
		-DBoost_NO_SYSTEM_PATHS=TRUE \
		-DBOOST_ROOT:PATHNAME=$(BOOST_PATH) \
		-DBOOST_LIBRARY_DIRS:FILEPATH=$(BOOST_PATH) \
		-DLIBRARY_OUTPUT_PATH=$(shell readlink -f $(LIBDIR)) \
		-DEXECUTABLE_OUTPUT_PATH=$(shell readlink -f $(BINDIR)) \
		-DCMAKE_BUILD_TYPE=$(MULTIVERSO_CMAKE_BUILDTYPE) \
		-B./Source/Multiverso/build/$(BUILDTYPE) -H./Source/Multiverso
	@make VERBOSE=1 -C ./Source/Multiverso/build/$(BUILDTYPE) -j multiverso

UNITTEST_MULTIVERSO_SRC = \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_array.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_blob.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_kv.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_message.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_multiverso.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_node.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_sync.cpp \

UNITTEST_MULTIVERSO_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MULTIVERSO_SRC))

UNITTEST_MULTIVERSO := $(BINDIR)/multiversotests

ALL += $(UNITTEST_MULTIVERSO)

$(UNITTEST_MULTIVERSO): $(UNITTEST_MULTIVERSO_OBJ) | $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(lMULTIVERSO) -ldl
endif
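Note: with the guard above, ASGD support is opt-in at build time. Assuming a checkout where the Multiverso submodule has been initialized (`git submodule update --init Source/Multiverso`), a release build with the parameter server enabled would presumably be requested as `make BUILDTYPE=release CNTK_ENABLE_ASGD=true -j` (a sketch; the usual configure flow can also set this variable).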
########################################
# cntk
########################################
@@ -926,11 +992,11 @@ CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)

-$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
+$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB) $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a

# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@@ -967,7 +1033,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
@@ -1027,11 +1093,11 @@ UNITTEST_NETWORK := $(BINDIR)/networktests
ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)

-$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
+$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER) $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) $(lMULTIVERSO) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a

UNITTEST_MATH_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
Source/CNTK/CNTK.cpp
@@ -373,6 +373,9 @@ void PrintBuiltInfo()
#ifdef _WITH_1BITSGD_
    LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
    LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
    LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
@@ -85,7 +85,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
-      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
+      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">
@@ -113,7 +113,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
-      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
+      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
prebuild.bat
@@ -22,6 +22,7 @@ set p_CNTK_ENABLE_1BitSGD=%~3
set p_CudaPath=%~4
set p_CUDNN_PATH=%~5
set p_CUB_PATH=%~6
set p_CNTK_ENABLE_ASGD=%~7

echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$

@@ -75,7 +76,12 @@ if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
) else (
    echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)

:: CNTK_ENABLE_ASGD defaults to true
if "%p_CNTK_ENABLE_ASGD%" == "false" (
    echo #define _WITH_ASGD_ "no">>buildinfo.h$$
) else (
    echo #define _WITH_ASGD_ "yes">>buildinfo.h$$
)
if not %l_build_target% == CPU-only (
    if "%p_CudaPath%" == "" (
        echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$
Source/SGDLib/ASGDHelper.h (new file)
@@ -0,0 +1,67 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#include <list>
#include "ComputationNetwork.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// class AdjustLearningRateAtBeginning
// Provides options for DataParallelASGD training, so that every node
// can adjust its learning rate every minibatch during the first N epochs.
// -----------------------------------------------------------------------
// TODO: We can remove these options once we can adjust the learning rate at the minibatch level
enum class AdjustLearningRateAtBeginning : int
{
    None = 0,             // default, don't adjust the learning rate
    Linearly = 1,         // linear adjustment: the learning rate ramps from 0 to learningRatesPerMB
    Staircase = (1 << 1), // staircase adjustment: the learning rate steps from 0 to learningRatesPerMB every adjustNbMinibatch
};

template<class ElemType = float>
class ASGDHelper
{
public:
    virtual ~ASGDHelper() { }

    // -----------------------------------------------------------------------
    // InitModel() -- upload the initialized model (pre-computed by CNTK logic)
    // to the parameter servers, so that every node starts training from the same model
    // -----------------------------------------------------------------------
    virtual void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) = 0;

    // -----------------------------------------------------------------------
    // PushAndPullModel() -- push the parameters of learnableNodes to the parameter servers, then get the latest model back
    // -----------------------------------------------------------------------
    virtual bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced = 0) = 0;

    // -----------------------------------------------------------------------
    // WaitAll() -- barrier; wait for all other nodes to reach this point
    // -----------------------------------------------------------------------
    virtual void WaitAll() = 0;

    // -----------------------------------------------------------------------
    // WaitAsyncBuffer() -- wait for the pipeline thread to finish its job when useAsyncBuffer is true
    // -----------------------------------------------------------------------
    virtual void WaitAsyncBuffer() = 0;

}; // class ASGDHelper

// factory method to create an ASGDHelper instance
template<class ElemType = float>
ASGDHelper<ElemType>* NewASGDHelper(
    const std::list<ComputationNodeBasePtr> & learnableNodes,  // parameters to be trained
    size_t nodeNumRanks,                                       // number of working nodes
    bool useAsyncBuffered = true,                              // use an asynchronous buffer to hide communication cost
    bool isSimulatedModelAveragingSGD = false,                 // use parameter-server-based MA rather than ASGD
    AdjustLearningRateAtBeginning adjusttype =
        AdjustLearningRateAtBeginning::None,                   // adjust the learning rate per minibatch at the very beginning of training
    double adjustCoef = 0.2,                                   // see DecayCoefficient()
    size_t adjustPerMinibatches = 600,
    int traceLevel = 0,                                        // log level
    int syncPerfStats = 0);                                    // show perf data every syncPerfStats syncs

}}}
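To make the contract above concrete, here is a minimal, hypothetical sketch of how a trainer drives this interface; it mirrors the SGD.cpp changes further down, but `trainOneMinibatch` and `syncPeriod` are illustrative stand-ins for state owned by the real trainer, not CNTK API:

// Usage sketch for ASGDHelper (assumptions noted above; not the actual SGD.cpp code)
#include <list>
#include <memory>
#include "ASGDHelper.h"

using namespace Microsoft::MSR::CNTK;

bool trainOneMinibatch(size_t* samplesSeen); // hypothetical: one local SGD step; returns false when data is exhausted

void TrainWithASGD(const std::list<ComputationNodeBasePtr>& learnableNodes,
                   size_t numWorkers, size_t syncPeriod)
{
    std::unique_ptr<ASGDHelper<float>> asgd(
        NewASGDHelper<float>(learnableNodes, numWorkers,
                             /*useAsyncBuffered=*/true,
                             /*isSimulatedModelAveragingSGD=*/false));
    asgd->InitModel(learnableNodes); // every worker starts from the same weights

    size_t samplesSinceSync = 0;
    while (trainOneMinibatch(&samplesSinceSync))
    {
        if (samplesSinceSync >= syncPeriod) // cf. nFramesBetweenASGDSync in SGD.cpp
        {
            asgd->PushAndPullModel(learnableNodes, samplesSinceSync);
            samplesSinceSync = 0;
        }
    }
    asgd->WaitAll(); // barrier so no worker tears down the parameter server early
}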
MPIWrapper.h
@@ -99,6 +99,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>

    int argc = 0;
    char **argv = NULL;
    // TODO(qiwye): Multiverso (parameter server) would benefit from MPI_THREAD_MULTIPLE.
    int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
    int provided;
    int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);
Source/Multiverso (new submodule)
@@ -0,0 +1 @@
Subproject commit 40743f9c86297f63b29c99c259199f59f16c0c7c
Source/SGDLib/ASGDHelper.cpp (new file)
@@ -0,0 +1,670 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// ASGDHelper.cpp : Implements the ASGDHelper interface. The implementation is based on Multiverso.
//

#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings

#include "ASGDHelper.h"
#include "MPIWrapper.h"
#include "ComputationNetwork.h"
#include "TimerUtility.h"

#include <functional>
#include <thread>
#include <unordered_map>
#include <numeric>
#include <algorithm>

#ifdef ASGD_PARALLEL_SUPPORT

#include <multiverso/multiverso.h>
#include <multiverso/util/configure.h>
#include <multiverso/table/array_table.h>
#include <multiverso/updater/updater.h>

#pragma comment(lib, "Multiverso.lib")

#endif


#ifndef CPUONLY
#include <cuda_runtime.h>
#pragma comment (lib, "cudart.lib") // for cudaMemcpyAsync()
#endif

namespace Microsoft { namespace MSR { namespace CNTK {

#ifndef CPUONLY

// -----------------------------------------------------------------------
// Error handling
// -----------------------------------------------------------------------

template <typename ERRTYPE>
static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode)
{
    if (retCode != successCode)
    {
        try
        {
#ifdef _WIN32
            const char* hostname = getenv("COMPUTERNAME");
#else
            char hostname[HOST_NAME_MAX];
            if (gethostname(hostname, HOST_NAME_MAX) != 0)
                strcpy(hostname, "?");
#endif
            int currentCudaDevice;
            cudaGetDevice(&currentCudaDevice);
            Microsoft::MSR::CNTK::RuntimeError("%s failure %d; GPU=%d ; hostname=%s ; expr=%s", libName, (int)retCode, currentCudaDevice, hostname ? hostname : "?", exprString);
        }
        catch (const std::exception& e) // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
        {
            std::cerr << e.what() << std::endl;
            throw;
        }
    }
}

#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#endif // CPUONLY

#ifdef ASGD_PARALLEL_SUPPORT

// MultiversoHelper is the implementation of the ASGDHelper interface on top of Multiverso
template<class ElemType = float>
class MultiversoHelper : public ASGDHelper<ElemType>
{
public:
    typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;

    MultiversoHelper(const std::list<ComputationNodeBasePtr> & learnableNodes,              // parameters to be trained
        size_t nodeNumRanks,                                                                // number of working nodes
        bool useAsyncBuffer = true,                                                         // use an asynchronous buffer to hide communication cost
        bool isSimulatedModelAveragingSGD = false,                                          // use parameter-server-based MA rather than ASGD
        AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None,     // adjust the learning rate per minibatch at the very beginning of training;
                                                                                            // this can be used to tackle the instability of ASGD
        double adjustCoef = 0.2,                                                            // see DecayCoefficient()
        size_t adjustPerMinibatches = 600,
        int traceLevel = 0,                                                                 // log level
        int syncPerfStats = 0) :                                                            // show perf data every syncPerfStats syncs
        m_parameterSyncCounter(0), m_adjustLearningRateAtBeginningType(adjusttype),
        m_adjustCoefficient(adjustCoef), m_adjustMBNumber(adjustPerMinibatches),
        m_totalClientNumber(nodeNumRanks), m_useAsyncBuffer(useAsyncBuffer),
        m_traceLevel(traceLevel), m_ModelAveragingSGDSimulating(isSimulatedModelAveragingSGD), m_doesEveryNodesShouldSynced(false),
        m_syncPerfStats(syncPerfStats)
    {
        if (m_ModelAveragingSGDSimulating)
        {
            m_doesEveryNodesShouldSynced = true;
            m_useAsyncBuffer = false;
        }
        // pipeline-related variables
        m_localBufferNum = m_useAsyncBuffer ? 2 : 1;
        m_bufferSwapIndex = new int[m_localBufferNum];

        // CPU asynchronous buffer
        m_cpuAsyncBuffer = new ElemType*[m_localBufferNum];

        // get/add options used by the Multiverso sparse update
        m_getOptions.reserve(m_localBufferNum);
        m_addOptions.reserve(m_localBufferNum);

#ifndef CPUONLY
        // GPU asynchronous buffer
        m_gpuAsyncBuffer.resize(m_localBufferNum);
        // create a communication stream for the data transfer between GPU and CPU
        CUDA_CALL(cudaStreamCreate(&_commStream));
#endif
        m_bufferIndexInUse = 0;
        for (int i = 0; i < m_localBufferNum; i++)
            m_bufferSwapIndex[i] = (i + 1) % m_localBufferNum;

        m_aysncBufferThread = nullptr;

        multiverso::SetCMDFlag("logtostderr", true);

        if (m_doesEveryNodesShouldSynced)
            multiverso::SetCMDFlag("sync", true);

        MultiversoInit(learnableNodes);
    }

    ~MultiversoHelper()
    {
        fprintf(stderr, "~MultiversoHelper\n");
        fflush(stderr);

        if (m_useAsyncBuffer && m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
            m_aysncBufferThread->join();

        delete[] m_bufferSwapIndex; // note: a single 'delete a, b;' statement would free only the first pointer (comma operator)
#ifndef CPUONLY
        CUDA_CALL(cudaFreeHost(m_deltaArray)); // m_deltaArray is pinned memory from cudaMallocHost
#else
        delete[] m_deltaArray;
#endif

        for (size_t i = 0; i < m_localBufferNum; i++)
        {
#ifndef CPUONLY
            CUDA_CALL(cudaFreeHost(m_cpuAsyncBuffer[i]));
#else
            delete[] m_cpuAsyncBuffer[i];
#endif
        }
        delete[] m_cpuAsyncBuffer;
#ifndef CPUONLY
        CUDA_CALL(cudaStreamDestroy(_commStream));
#endif
        multiverso::MV_ShutDown(false);
    }

    void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) override
    {
        float factor = 1.0f / m_totalClientNumber;

        int i = 0; // index of the learnable node
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
        {
            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
            Matrix<ElemType> &mat = node->Value();

#ifndef CPUONLY
            for (int j = 0; j < m_localBufferNum; j++)
                m_gpuAsyncBuffer[j].push_back(mat.DeepClone());
#endif
            ElemType* px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
            mat.CopyToArray(px, m_tableLength[i]);
        }

        for (int i = 1; i < m_localBufferNum; i++)
            memcpy(m_cpuAsyncBuffer[i], m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);

        memcpy(m_deltaArray, m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);

        // because the parameter server subtracts the delta on the server side, we send the negated initial model to the server
        std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), -factor));

        m_workerArray->Add(m_deltaArray, m_totalModelSize);
        m_workerArray->Get(m_deltaArray, m_totalModelSize);
        WaitAll();
        m_workerArray->Get(m_deltaArray, m_totalModelSize);

        if (std::equal(m_deltaArray, m_deltaArray + m_totalModelSize, m_cpuAsyncBuffer[0]))
            fprintf(stderr, "multiverso initial model loaded.\n");
        m_reportTimer.Start();
    }

    bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override
    {
        m_parameterSyncCounter++;

        double fromCPUToGPUTime;
        double fromGPUToCPUTime;
        double networkTime;
        double swapTimeOnGPU;
        m_reportTimer.Restart();
        WaitAsyncBuffer();
        m_reportTimer.Stop();

        // reset statistics for profiling
        if (m_traceLevel > 2 && m_syncPerfStats > 0 && m_parameterSyncCounter % m_syncPerfStats == 0)
        {
            fromCPUToGPUTime = 0;
            fromGPUToCPUTime = 0;
            networkTime = 0;
            swapTimeOnGPU = 0;
        }

        m_bufferIndexInUse = m_bufferSwapIndex[m_bufferIndexInUse];

        int i = 0; // index of the learnable node
        if (m_useAsyncBuffer)
        {
            m_reportTimer.Restart();
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();
#ifndef CPUONLY
                // CNTK model -> GPU buffer
                CUDA_CALL(cudaMemcpy(m_gpuAsyncBuffer[m_bufferIndexInUse][i].Data(),
                                     mat.Data(),
                                     mat.GetNumElements() * sizeof(ElemType),
                                     cudaMemcpyDeviceToDevice));

                // GPU buffer -> CNTK model
                CUDA_CALL(cudaMemcpy(mat.Data(),
                                     m_gpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]][i].Data(),
                                     mat.GetNumElements() * sizeof(ElemType),
                                     cudaMemcpyDeviceToDevice));
#else
                ElemType * px = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[i];
                mat.CopyToArray(px, m_tableLength[i]);
                ElemType * py = m_cpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]] + m_tableOffsets[i];
                mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), py);
                // (px points into m_cpuAsyncBuffer, which this helper owns; it must not be deleted here)
#endif
            }
            m_reportTimer.Stop();
            if (m_traceLevel > 2)
            {
                swapTimeOnGPU = m_reportTimer.ElapsedSeconds();
            }
#ifndef CPUONLY
            m_aysncBufferThread = new thread([&]()
            {
                float factor = DecayCoefficient();
                int deviceId = m_gpuAsyncBuffer[m_bufferIndexInUse][0].GetDeviceId();

                CUDA_CALL(cudaSetDevice(deviceId));

                Timer threadTimer;
                threadTimer.Restart();
                for (int widx = 0; widx < m_tableCount; widx++)
                {
                    ElemType * px = m_deltaArray + m_tableOffsets[widx];
                    // GPU buffer -> CPU buffer
                    CUDA_CALL(cudaMemcpyAsync(px,
                                              m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
                                              m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
                                              cudaMemcpyDeviceToHost,
                                              _commStream));
                }
                // wait until the copy from GPU to CPU has finished
                CUDA_CALL(cudaStreamSynchronize(_commStream));
                threadTimer.Stop();

                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
                }

                // delta = gradient * learning_rate
                std::transform(m_cpuAsyncBuffer[m_bufferIndexInUse],
                               m_cpuAsyncBuffer[m_bufferIndexInUse] + m_totalModelSize,
                               m_deltaArray, m_deltaArray,
                               std::minus<ElemType>());

                threadTimer.Restart();
                // lr decay
                std::transform(m_deltaArray,
                               m_deltaArray + m_totalModelSize,
                               m_deltaArray,
                               std::bind1st(std::multiplies<ElemType>(), factor));

                ElemType* px = m_deltaArray;
                ElemType* py = m_cpuAsyncBuffer[m_bufferIndexInUse];
                m_workerArray->AddAsync(px, m_totalModelSize);
                m_workerArray->Get(py, m_totalModelSize);

                threadTimer.Stop();
                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
                }

                threadTimer.Restart();
                // copy parameters from CPU buffer to GPU buffer
                for (int widx = 0; widx < m_tableCount; widx++)
                {
                    ElemType * py = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[widx];

                    CUDA_CALL(cudaMemcpyAsync(m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
                                              py,
                                              m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
                                              cudaMemcpyHostToDevice,
                                              _commStream));
                }
                CUDA_CALL(cudaStreamSynchronize(_commStream));
                threadTimer.Stop();
                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
                }
            });
#else
            m_aysncBufferThread = new thread([&]()
            {
                float factor = DecayCoefficient();
                int t_cacheIdx = m_bufferIndexInUse;

                std::transform(m_cpuAsyncBuffer[t_cacheIdx], m_cpuAsyncBuffer[t_cacheIdx] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));

                ElemType* px = m_deltaArray;
                ElemType* py = m_cpuAsyncBuffer[t_cacheIdx];
                m_workerArray->AddAsync(px, m_totalModelSize);
                m_workerArray->Get(py, m_totalModelSize);
            });
#endif
        }
        else
        {
            m_reportTimer.Restart();
            float factor = DecayCoefficient();
            i = 0;
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();

                ElemType * px = m_deltaArray + m_tableOffsets[i];
                mat.CopyToArray(px, m_tableLength[i]);
            }

            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
            }
            std::transform(m_cpuAsyncBuffer[0], m_cpuAsyncBuffer[0] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());

            // lr decay
            if (m_ModelAveragingSGDSimulating)
            {
                factor = ModelAggregationCoefficient(sampleSinceLastSynced);
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
                if (m_traceLevel > 2 && m_syncPerfStats != 0)
                {
                    if (m_parameterSyncCounter % m_syncPerfStats == 0)
                        ReportPerfStats(m_totalClientNumber * m_sampleSinceLastReport, m_sampleSinceLastReport);
                    else
                        m_sampleSinceLastReport += sampleSinceLastSynced;
                }
            }
            else
            {
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
            }
            m_reportTimer.Restart();

            ElemType* px = m_deltaArray;
            ElemType* py = m_cpuAsyncBuffer[0];
            m_workerArray->AddAsync(px, m_totalModelSize);
            m_workerArray->Get(py, m_totalModelSize);

            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
            }
            m_reportTimer.Restart();
            i = 0;
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();

                ElemType * px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
                mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), px);
            }
            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
            }
        }
        return true;
    }

    void WaitAll() override
    {
        multiverso::MV_Barrier();
    }

    void WaitAsyncBuffer() override
    {
        if (m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
        {
            m_aysncBufferThread->join();
            delete m_aysncBufferThread;
            m_aysncBufferThread = nullptr;
        }
    }

private:
    void MultiversoInit(const std::list<ComputationNodeBasePtr> & learnableNodes)
    {
        // the parameter server offers a variety of updaters; we only use the SGD updater for this simple case
        multiverso::SetCMDFlag<std::string>(std::string("updater_type"), std::string("sgd"));
        multiverso::MV_Init();

        int i = 0;
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
        {
            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
            Matrix<ElemType> &mat = node->Value();
            size_t layerSize = mat.GetNumElements();

            m_tableLength.push_back(layerSize);
        }

        m_tableCount = m_tableLength.size();

        // calculate the total size of the learnable nodes
        m_totalModelSize = accumulate(m_tableLength.begin(), m_tableLength.end(), 0);

        m_serverArray = new multiverso::ArrayServer<ElemType>(m_totalModelSize);
        m_workerArray = new multiverso::ArrayWorker<ElemType>(m_totalModelSize);

        multiverso::MV_Barrier();

        size_t idx = 0;
        for (size_t len : m_tableLength)
        {
            m_tableOffsets.push_back(idx);
            idx += len;
        }

#ifndef CPUONLY
        for (int i = 0; i < m_localBufferNum; i++)
            m_gpuAsyncBuffer[i].reserve(m_tableCount);

        // create pinned memory
        for (int i = 0; i < m_localBufferNum; ++i)
            CUDA_CALL(cudaMallocHost((void **)&m_cpuAsyncBuffer[i], sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));

        CUDA_CALL(cudaMallocHost((void **)&m_deltaArray, sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));
#else
        for (int i = 0; i < m_localBufferNum; i++)
            m_cpuAsyncBuffer[i] = new ElemType[m_totalModelSize];
        m_deltaArray = new ElemType[m_totalModelSize]; // also needed on the CPU-only path (used by InitModel and PushAndPullModel)
#endif
    }

    float DecayCoefficient()
    {
        float f = 1.f;
        switch (m_adjustLearningRateAtBeginningType)
        {
        case AdjustLearningRateAtBeginning::None:
            break;
        case AdjustLearningRateAtBeginning::Linearly:
            f = min(f, max(0.f, (float)(m_adjustCoefficient + (1 - m_adjustCoefficient) / m_adjustMBNumber * m_parameterSyncCounter)));
            break;
        case AdjustLearningRateAtBeginning::Staircase:
            f = min(f, max(0.f, (float)(m_adjustCoefficient * (m_parameterSyncCounter / m_adjustMBNumber + 1))));
            break;
        default:
            break;
        }
        return f;
    }

    float ModelAggregationCoefficient(size_t samplesSinceLastSync)
    {
        float factor = 0;
        int nTotalSamples = samplesSinceLastSync;
        // m_pMPI->AllReduce(&nTotalSamples, 1);

        if (nTotalSamples <= 0)
        {
            factor = 1.0f / m_pMPI->NumNodesInUse();
            // give an estimated one
        }
        else
        {
            factor = (samplesSinceLastSync + 0.0f) / nTotalSamples;
        }
        factor = 1.0f / m_pMPI->NumNodesInUse();
        return factor;
    }

    inline void transpose(ElemType *src, ElemType *dst, const int N, const int M)
    {
        for (auto n = 0; n < N*M; n++) {
            auto i = n / N;
            auto j = n % N;
            dst[n] = src[M*j + i];
        }
    }

    void ReportPerfStats(size_t totalSamplesProcessedSinceLastReport,
                         size_t localSamplesProcessedSinceLastReport)
    {
        m_reportTimer.Stop();
        double secondsSinceLastReport = m_reportTimer.ElapsedSeconds();
        m_reportTimer.Restart();

        float totalThroughput = secondsSinceLastReport > 0 ? (float)totalSamplesProcessedSinceLastReport / ((float)secondsSinceLastReport * 1000.0f) : 0.0f;
        float throughputPerWorker = totalThroughput / m_totalClientNumber;

        string prefix = "\t\t(sim-model aggregation stats) %d-th sync: %8.2f seconds since last report ; %d samples processed by %d workers (%d by me);\n"
                        "\t\t(sim-model aggregation stats) %d-th sync: totalThroughput = %.2fk samplesPerSecond , throughputPerWorker = %.2fk samplesPerSecond\n";
        fprintf(stderr, prefix.c_str(), (int)m_parameterSyncCounter, secondsSinceLastReport, (int)totalSamplesProcessedSinceLastReport, (int)m_totalClientNumber, (int)localSamplesProcessedSinceLastReport,
                (int)m_parameterSyncCounter, totalThroughput, throughputPerWorker);
        m_sampleSinceLastReport = 0;
    }

    multiverso::ArrayServer<ElemType>* m_serverArray;
    multiverso::ArrayWorker<ElemType>* m_workerArray;

    thread * m_aysncBufferThread;
    bool m_doesEveryNodesShouldSynced;
    bool m_ModelAveragingSGDSimulating;

    int m_totalClientNumber;
    int m_traceLevel;
    int m_syncPerfStats;
    Timer m_reportTimer;
    size_t m_parameterSyncCounter;
    size_t m_sampleSinceLastReport;

    bool m_useAsyncBuffer;
    int m_localBufferNum;
    int * m_bufferSwapIndex;
    int m_bufferIndexInUse;
    std::vector<multiverso::GetOption*> m_getOptions; // used by the sparse table
    std::vector<multiverso::AddOption*> m_addOptions; // used by the sparse table

    AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginningType;
    double m_adjustCoefficient;
    size_t m_adjustMBNumber;

    vector<size_t> m_tableLength;
    size_t m_totalModelSize;
    vector<size_t> m_tableOffsets;
    //shared_ptr<ElemType> m_deltaArray;
    ElemType * m_deltaArray;
    //std::vector<shared_ptr<ElemType> > m_cpuAsyncBuffer;
    ElemType ** m_cpuAsyncBuffer;

    MPIWrapperPtr m_pMPI;

    // GPU double buffer
    std::vector<std::vector<Matrix<ElemType> >> m_gpuAsyncBuffer;
    int m_tableCount;

#ifndef CPUONLY
    cudaStream_t _commStream;
#endif
}; // class MultiversoHelper

#endif

// A no-op implementation of the ASGDHelper interface, which does nothing.
// This is used when CNTK_ENABLE_ASGD = false
template<class ElemType = float>
class NoneASGDHelper : public ASGDHelper<ElemType>
{
public:
    NoneASGDHelper(const std::list<ComputationNodeBasePtr> & learnableNodes,
        int nodeNumRanks,
        bool useAsyncBuffer = true,
        bool isSimModelAveragingSGD = false,
        AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None,
        double adjustcoef = 0.2,
        size_t adjustnbmb = 600,
        int traceLevel = 0,
        int syncPerfStats = 0,
        const MPIWrapperPtr& pMPI = nullptr) { }

    ~NoneASGDHelper() { }

    void InitModel(const std::list<ComputationNodeBasePtr> & learnableNode) override { }

    bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override {
        return true;
    }

    void WaitAll() override { }

    void WaitAsyncBuffer() override { }
};

template<class ElemType>
ASGDHelper<ElemType>* NewASGDHelper(
    const std::list<ComputationNodeBasePtr> & learnableNodes,  // parameters to be trained
    size_t nodeNumRanks,                                       // number of working nodes
    bool useAsyncBuffer,                                       // use an asynchronous buffer to hide communication cost
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats)
{
#ifdef ASGD_PARALLEL_SUPPORT
    return new MultiversoHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
                                          adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#else
    return new NoneASGDHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
                                        adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#endif
}

template ASGDHelper<float>* NewASGDHelper<float>(
    const std::list<ComputationNodeBasePtr> & learnableNodes,
    size_t nodeNumRanks,
    bool useAsyncBuffer,
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats);

template ASGDHelper<double>* NewASGDHelper<double>(
    const std::list<ComputationNodeBasePtr> & learnableNodes,
    size_t nodeNumRanks,
    bool useAsyncBuffer,
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats);

}}}
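As a quick worked example of DecayCoefficient() above, with the defaults adjustCoef = 0.2 and adjustPerMinibatches = 600: in Linearly mode the factor starts at 0.2 and grows by (1 - 0.2)/600 per sync, reaching the 1.0 clamp after 600 syncs; in Staircase mode it holds at 0.2 for the first 600 syncs, then 0.4, 0.6, and so on, reaching 1.0 after 2400 syncs. The factor scales the delta pushed to the parameter server, which damps updates while the workers' models are still far apart early in training.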
Source/SGDLib/SGD.cpp
@@ -25,6 +25,8 @@
#include "V2AllReduceDistGradAggregator.h"
#endif

#include "ASGDHelper.h"

#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"
#include "ProgressTracing.h"
@@ -403,15 +405,27 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
                                          m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR);
    }

    // Multiverso wrapper for ASGD logic init
    if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
    {
        m_pASGDHelper.reset(NewASGDHelper<ElemType>(learnableNodes,
                                                    m_mpi->NumNodesInUse(),
                                                    m_isAsyncBufferEnabled,
                                                    m_isSimulateMA,
                                                    m_adjustLearningRateAtBeginning,
                                                    m_adjustCoefficient,
                                                    m_adjustPerMinibatches,
                                                    m_traceLevel,
                                                    m_syncStatsTrace));
        m_pASGDHelper->InitModel(learnableNodes);
    }

    // --- MAIN EPOCH LOOP
    for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
    {
        // Synchronize all ranks before proceeding to ensure that
        // rank 0 has finished writing the previous model file
-        if (m_mpi != nullptr)
-        {
-            m_mpi->WaitAll();
-        }
+        BarrierWorkers();

        // (re-)initialize 1-bit SGD
        if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
@@ -575,7 +589,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,

        if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
        {
-            SimpleEvaluator<ElemType> evalforvalidation(net, m_mpi, m_enableDistributedMBReading);
+            // TODO(dataASGD): make the evaluator non-distributed when using ASGD, since Multiverso has another background thread using MPI.
+            //                 Making the evaluation serial (non-distributed) will slow down training, especially when the validation set is large.
+            SimpleEvaluator<ElemType> evalforvalidation(net, UsingAsyncGradientAggregation(i + 1) ? nullptr : m_mpi, m_enableDistributedMBReading);
            vector<wstring> cvSetTrainAndEvalNodes;
            if (criterionNodes.size() > 0)
            {
@@ -712,10 +728,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
        // Synchronize all ranks before proceeding to ensure that
        // nobody tries reading the checkpoint file at the same time
        // as rank 0 deleting it below
-        if (m_mpi != nullptr)
-        {
-            m_mpi->WaitAll();
-        }
+        BarrierWorkers();

        // Persist model and check-point info
        if ((m_mpi == nullptr) || m_mpi->IsMainNode())
@@ -783,10 +796,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,

    // Synchronize all ranks before proceeding to ensure that
    // rank 0 has finished writing the model file
-    if (m_mpi != nullptr)
-    {
-        m_mpi->WaitAll();
-    }
+    // TODO[DataASGD]: the other ranks should wait here in async mode as well
+    BarrierWorkers();

    // progress tracing for compute cluster management
    ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
@@ -803,6 +814,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
    }

    delete inputMatrices;
    if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
        m_pASGDHelper.reset();
}

// -----------------------------------------------------------------------
@@ -846,6 +859,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,

    bool useGradientAggregation = UsingGradientAggregation(epochNumber);
    bool useModelAggregation = UsingModelAggregation(epochNumber);
    bool useAsyncGradientAggregation = UsingAsyncGradientAggregation(epochNumber);
    bool useParallelTrain = UsingParallelTrain(epochNumber);

    // Find all evaluation nodes that accumulate error on their own.
@@ -981,6 +995,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    double readTime = 0;
    double computeTime = 0;
    double parameterUpdateTime = 0;
    double parameterSyncTime = 0; // perf: communication time between syncs
    if (m_perfTraceLevel > 0)
        fineGrainedPerfMeasurementTimer.Start();
@@ -1241,15 +1256,14 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
            }
        }

        if (m_perfTraceLevel > 0)
        {
            std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(net->GetDeviceId()));
            mainStreamSyncEvent->SynchronizeEvent();
            fineGrainedPerfMeasurementTimer.Stop();
            parameterUpdateTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();

            PREPENDTS(stderr);
            fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs, Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, (int)aggregateNumSamples);
            fineGrainedPerfMeasurementTimer.Start();
        }

        // aggregation by model averaging or block momentum
@ -1270,11 +1284,38 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
}
|
||||
}
|
||||
|
||||
timer.Stop();
|
||||
numMBsRun++;
|
||||
// using parameter server for parameter update
|
||||
if (useAsyncGradientAggregation && m_mpi->NumNodesInUse() > 1)
|
||||
{
|
||||
// Determine if any samples were processed across any of the ranks
|
||||
if (useDistributedMBReading)
|
||||
{
|
||||
noMoreSamplesToProcess = !wasDataRead;
|
||||
}
|
||||
|
||||
if (nSamplesSinceLastModelSync >= m_nFramesBetweenASGDSync[epochNumber])
|
||||
{
|
||||
m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
|
||||
nSamplesSinceLastModelSync = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (m_perfTraceLevel > 0)
|
||||
{
|
||||
fineGrainedPerfMeasurementTimer.Stop();
|
||||
parameterSyncTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
|
||||
}
|
||||
|
||||
timer.Stop();
|
||||
if (m_perfTraceLevel > 0)
|
||||
{
|
||||
PREPENDTS(stderr);
|
||||
fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs; Parameter sync = %.5gs; Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, parameterSyncTime, (int)aggregateNumSamples);
|
||||
}
|
||||
|
||||
numMBsRun++;
|
||||
totalTimeInMBs += timer.ElapsedSeconds();
|
||||
//trainSamplesSinceLastLogged += (int)aggregateNumSamplesWithLabel; // now inside epochCriterionLastLogged
|
||||
|
||||
// log
|
||||
// This shows the criterion since last logged.
|
||||
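The hunk above is the core of the asynchronous path: instead of a blocking gradient aggregation on every minibatch, each worker merely counts samples and talks to the parameter server once per syncPeriod. A minimal self-contained sketch of that cadence (type and method names here are illustrative, not the SGDLib API):

    #include <cstddef>

    // Hypothetical stand-ins for the SGDLib members used in TrainOneEpoch above.
    struct AsgdWorkerSketch
    {
        size_t nSamplesSinceLastModelSync = 0;
        size_t syncPeriod = 256; // plays the role of m_nFramesBetweenASGDSync[epochNumber]

        void PushAndPullModel(size_t /*samplesSinceLastSync*/)
        {
            // Stands in for ASGDHelper::PushAndPullModel: send the local model
            // delta to the Multiverso parameter server, pull the global model back.
        }

        // Called once per processed minibatch; note there is no barrier here.
        void OnMinibatchCompleted(size_t minibatchSamples)
        {
            nSamplesSinceLastModelSync += minibatchSamples;
            if (nSamplesSinceLastModelSync >= syncPeriod)
            {
                PushAndPullModel(nSamplesSinceLastModelSync);
                nSamplesSinceLastModelSync = 0; // restart the sample counter
            }
        }
    };

Workers therefore never wait for each other inside the minibatch loop; staleness is bounded only by syncPeriod.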
@@ -1404,6 +1445,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        nSamplesSinceLastModelSync = 0;
    }

+   if (useAsyncGradientAggregation && (m_mpi->NumNodesInUse() > 1))
+   {
+       m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
+       nSamplesSinceLastModelSync = 0;
+   }
+
    // hoist the accumulated criterion value from GPU side to our 'out' variables
    // (unless we useGradientAggregation, in which case they are accumulated in the 'out' variables directly)
    if (!useGradientAggregation)
@@ -2555,7 +2602,8 @@ static ParallelizationMethod ParseParallelizationMethod(const wstring& s)
    else if (EqualCI(s, L"DataParallelSGD"))   return ParallelizationMethod::dataParallelSGD;
    else if (EqualCI(s, L"ModelAveragingSGD")) return ParallelizationMethod::modelAveragingSGD;
    else if (EqualCI(s, L"BlockMomentumSGD"))  return ParallelizationMethod::blockMomentumSGD;
-   else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD)");
+   else if (EqualCI(s, L"dataParallelASGD"))  return ParallelizationMethod::dataParallelASGD;
+   else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD | dataParallelASGD)");
}

static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
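Because the comparison goes through EqualCI, the config value is matched case-insensitively; a hypothetical sanity check of the new mapping (not part of the commit, and it assumes the function and enum above are in scope):

    #include <cassert>

    // Hypothetical check: EqualCI makes the parse case-insensitive, so any
    // casing of the new value maps to dataParallelASGD.
    static void CheckAsgdParse()
    {
        assert(ParseParallelizationMethod(L"DataParallelASGD") == ParallelizationMethod::dataParallelASGD);
        assert(ParseParallelizationMethod(L"dataparallelasgd") == ParallelizationMethod::dataParallelASGD);
    }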
@@ -2569,6 +2617,16 @@ static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
    else InvalidArgument("autoAdjustLR: Invalid learning rate search type. Valid values are (none | searchBeforeEpoch | adjustAfterEpoch)");
}

+#ifdef ASGD_PARALLEL_SUPPORT
+static AdjustLearningRateAtBeginning AdjustLearningRateAtBeginningType(const wstring& s)
+{
+    if (EqualCI(s.c_str(), L"") || EqualCI(s.c_str(), L"none")) return AdjustLearningRateAtBeginning::None;
+    else if (EqualCI(s.c_str(), L"linearly"))  return AdjustLearningRateAtBeginning::Linearly;
+    else if (EqualCI(s.c_str(), L"staircase")) return AdjustLearningRateAtBeginning::Staircase;
+    else InvalidArgument("AdjustLearningRateAtBeginningType: Invalid Type. Valid values are (None | Linearly | Staircase)");
+}
+#endif
+
template<class ConfigRecordType>
SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
{
@@ -2938,7 +2996,26 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
                m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
            }
#endif
            InitializeAndCheckBlockMomentumSGDParameters();
        }

+       if (configParallelTrain.Exists(L"DataParallelASGD"))
+       {
+#ifndef ASGD_PARALLEL_SUPPORT
+           InvalidArgument("DataParallelASGD is not enabled in this version.\n");
+#else
+           const ConfigRecordType& configDataParallelASGD(configParallelTrain(L"DataParallelASGD", ConfigRecordType::Record()));
+           m_nFramesBetweenASGDSync = configDataParallelASGD(L"syncPeriod", ConfigRecordType::Array(intargvector(vector<int>{256})));
+           m_isAsyncBufferEnabled = configDataParallelASGD(L"UsePipeline", false);
+           m_isSimulateMA = configDataParallelASGD(L"SimModelAverage", false); // use the parameter-server-based version of ModelAveragingSGD
+           if (configDataParallelASGD.Exists(L"AdjustLearningRateAtBeginning")) // adjust the learning rate every m_adjustPerMinibatches minibatches until it reaches the original one;
+                                                                                // this option can be used to tackle the instability of ASGD
+           {
+               const ConfigRecordType& configAdjustLearningRateAtBeginning(configDataParallelASGD(L"AdjustLearningRateAtBeginning", ConfigRecordType::Record()));
+               m_adjustLearningRateAtBeginning = AdjustLearningRateAtBeginningType(configAdjustLearningRateAtBeginning(L"adjustType", L"None"));
+               m_adjustCoefficient = configAdjustLearningRateAtBeginning(L"adjustCoefficient", (double)0.1);
+               m_adjustPerMinibatches = configAdjustLearningRateAtBeginning(L"adjustPerMinibatches", (size_t)256);
+           }
+#endif
+       }
    } // if (!pMPI)
} // if (configSGD.Exists(L"ParallelTrain"))
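Note that syncPeriod is parsed as an intargvector, so it is an epoch-indexed schedule rather than a single scalar, using the same value*count:value expansion as learningRatesPerSample elsewhere in this commit. A small sketch (hypothetical names) of how the m_nFramesBetweenASGDSync[epochNumber] lookup in TrainOneEpoch behaves under such a schedule:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative only: "syncPeriod = 256*2:512" would expand to {256, 256, 512},
    // i.e. sync every 256 samples for the first two epochs and every 512 after
    // that; like other CNTK epoch schedules, the last entry is reused once the
    // schedule runs out.
    static const std::vector<int> nFramesBetweenASGDSync = {256, 256, 512};

    static int SyncPeriodForEpoch(size_t epochNumber)
    {
        return nFramesBetweenASGDSync[std::min(epochNumber, nFramesBetweenASGDSync.size() - 1)];
    }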
@@ -19,7 +19,7 @@
#include <random>
#include "Profiler.h"
#include "MASGD.h"
-
+#include "ASGDHelper.h"
using namespace std; // ugh! TODO: get rid of this from .h files!!!

#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number

@@ -60,6 +60,7 @@ enum class ParallelizationMethod : int
    dataParallelSGD = 1,
    modelAveragingSGD = 2,
    blockMomentumSGD = 3,
+   dataParallelASGD = 4,
    modelParallelSGD = (1 << 8) // Currently unsupported
};

@@ -286,6 +287,14 @@ protected:
    double m_L2RegWeight;
    double m_L1RegWeight;

+   // Parallel-training parameters related to ASGD
+   intargvector m_nFramesBetweenASGDSync;
+   bool m_isAsyncBufferEnabled;
+   bool m_isSimulateMA;
+   AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginning;
+   double m_adjustCoefficient;
+   size_t m_adjustPerMinibatches;
+
    // sequence training
    double m_hSmoothingWeight;
    double m_frameDropThresh;

@@ -564,20 +573,41 @@ protected:

private:
    void MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNetworkPtr& net, const ComputationNodeBasePtr& criterionNode);
+   std::shared_ptr<ASGDHelper<ElemType>> m_pASGDHelper;

    bool UsingGradientAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD) && (epochNumber >= m_parallelizationStartEpochNum));
    }

    bool UsingModelAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
                 GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) &&
                (epochNumber >= m_parallelizationStartEpochNum));
    }

-   bool UsingParallelTrain(size_t epochNumber) const
-   {
-       return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber);
-   }
+   bool UsingAsyncGradientAggregation(size_t epochNumber)
+   {
+       return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD) && (epochNumber >= m_parallelizationStartEpochNum));
+   }
+
+   bool UsingParallelTrain(size_t epochNumber)
+   {
+       return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber) || UsingAsyncGradientAggregation(epochNumber);
+   }
+
+   void BarrierWorkers()
+   {
+       if (m_mpi != nullptr && GetParallelizationMethod() != ParallelizationMethod::dataParallelASGD)
+       {
+           m_mpi->WaitAll();
+       }
+       if (m_mpi != nullptr && GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD)
+       {
+           m_pASGDHelper->WaitAll();
+       }
+       return;
+   }
};
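The new BarrierWorkers() helper above is the single switch point between the two synchronization worlds: ASGD jobs must not block on the MPI communicator directly, because Multiverso drives MPI from its own background thread (see the TODO(dataASGD) comment earlier), so the barrier is delegated to the ASGD helper. A condensed restatement (illustrative generic template, not SGDLib code):

    // Sketch of the dispatch in BarrierWorkers(), with hypothetical interfaces.
    template <class MpiPtr, class AsgdHelperPtr>
    void BarrierWorkersSketch(const MpiPtr& mpi, const AsgdHelperPtr& asgdHelper, bool isDataParallelASGD)
    {
        if (mpi == nullptr)
            return;                 // single-process run: nothing to synchronize
        if (isDataParallelASGD)
            asgdHelper->WaitAll();  // parameter-server-side barrier via Multiverso
        else
            mpi->WaitAll();         // ordinary MPI barrier
    }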
@@ -43,10 +43,12 @@
  <ClCompile>
    <AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\CNTKv2LibraryDll;$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
    <AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+   <AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_ASGD)'!='false'">$(SolutionDir)Source\multiverso\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
    <PrecompiledHeader>
    </PrecompiledHeader>
    <PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+   <PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'!='false'">ASGD_PARALLEL_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    <DisableSpecificWarnings>4819</DisableSpecificWarnings>
  </ClCompile>
  <Link>

@@ -101,6 +103,7 @@
    <ClInclude Include="..\Common\Include\BestGpu.h" />
    <ClInclude Include="..\Common\Include\Config.h" />
    <ClInclude Include="..\Common\Include\DataReader.h" />
+   <ClInclude Include="..\Common\Include\ASGDHelper.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\DataWriter.h" />
    <ClInclude Include="..\Common\Include\File.h" />

@@ -138,6 +141,7 @@
    <ClInclude Include="V2SimpleDistGradAggregator.h" />
  </ItemGroup>
  <ItemGroup>
+   <ClCompile Include="ASGDHelper.cpp" />
    <ClCompile Include="PostComputingActions.cpp" />
    <ClCompile Include="Profiler.cpp" />
    <ClCompile Include="SGD.cpp" />

@@ -13,6 +13,9 @@
    <ClCompile Include="PostComputingActions.cpp">
      <Filter>Stat</Filter>
    </ClCompile>
+   <ClCompile Include="ASGDHelper.cpp">
+     <Filter>Parallelization</Filter>
+   </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\Include\fileutil.h">

@@ -135,6 +138,9 @@
    <ClInclude Include="V2SimpleDistGradAggregator.h">
      <Filter>Parallelization</Filter>
    </ClInclude>
+   <ClInclude Include="..\Common\Include\ASGDHelper.h">
+     <Filter>Parallelization</Filter>
+   </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="Common">
The following test drivers changed mode from Normal file to Executable file (0 lines changed):

Tests/EndToEndTests/Examples/Image/Deprecated/CIFAR-10/02_BatchNormConv/run-test
Tests/EndToEndTests/Examples/Image/Deprecated/CIFAR-10/05_ConvLocal/run-test
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/01_OneHidden_ndl/run-test
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/02_Convolution_ndl/run-test
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/03_ConvBatchNorm_ndl/run-test
Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/run-test
Tests/EndToEndTests/Examples/Speech/TIMIT/EvalSimpleNetwork/run-test
Tests/EndToEndTests/Examples/Speech/TIMIT/TrainSimpleNetwork/run-test
Tests/EndToEndTests/Examples/Speech/TIMIT/TrainWithPreTrain/run-test
@@ -0,0 +1,147 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples/...
#
makeMode = true
RootDir = "."

configName = "ssgd"
minibatch = 128
epochSize = 5
parallelizationMethod = "DataParallelSGD"
asyncBuffer = "true"

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output-$configName$"
ModelDir = "$OutputDir$/Models"

ndlMacros = "$ConfigDir$/Macros.ndl"

precision = "float"
DeviceId = "auto"
imageLayout = "cudnn"

# override the above as follows when running on CPU:
# deviceId = -1

# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly = true

prefetch = "true"
parallelTrain = "false"

command = Train

stderr = "$OutputDir$/03_ResNet"
traceLevel = 1

Proj16to32Filename = "$ConfigDir$/16to32.txt"
Proj32to64Filename = "$ConfigDir$/32to64.txt"

Train = [
    action = "train"
    modelPath = "$ModelDir$/03_ResNet"

    NDLNetworkBuilder = [
        networkDescription = "$ConfigDir$/03_ResNet.ndl"
    ]

    SGD = [
        epochSize = 0
        minibatchSize = $minibatch$
        # Note that learning rates are 10x more than in the paper due to a different
        # momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
        learningRatesPerSample = 0.004*80:0.0004*40:0.00004
        momentumPerMB = 0
        maxEpochs = $epochSize$
        L2RegWeight = 0.0001
        dropoutRate = 0
        perfTraceLevel = 0

        firstMBsToShowResult = 1
        numMBsToShowResult = 10

        ParallelTrain = [
            parallelizationMethod = $parallelizationMethod$
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = [
                gradientBits = 32
                useBufferedAsyncGradientAggregation = $asyncBuffer$
            ]
            ModelAveragingSGD = [
                blockSizePerWorker = 128
            ]
            DataParallelASGD = [
                syncPeriod = 128
                usePipeline = $asyncBuffer$
            ]
        ]
    ]

    reader = [
        readerType = "ImageReader"
        file = "$DataDir$/train_map.txt"
        randomize = "auto"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "random"
            cropRatio = 0.8
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]

    cvReader = [
        readerType = "ImageReader"
        file = "$DataDir$/test_map.txt"
        randomize = "none"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "center"
            cropRatio = 1
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]
]

Test = [
    action = "test"
    modelPath = "$ModelDir$/03_ResNet"
    # Set minibatch size for testing.
    minibatchSize = 256

    reader = [
        readerType = "ImageReader"
        file = "$DataDir$/cifar-10-batches-py/test_map.txt"
        randomize = "none"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "center"
            cropRatio = 1
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/cifar-10-batches-py/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]
]
@@ -0,0 +1,67 @@
load=LocalMacros
run=DNN

LocalMacros = [
    ImageW = 32
    ImageH = 32
    ImageC = 3
    LabelDim = 10

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
    labels = Input(LabelDim, tag = label)

    convWScale = 7.07
    convBValue = 0

    fc1WScale = 0.4
    fc1BValue = 0

    scValue = 1

    # Batch normalization time constant.
    bnTimeConst = 4096

    kW = 3
    kH = 3

    hStride1 = 1
    vStride1 = 1
]

DNN=[
    conv1WScale = 0.26
    cMap1 = 16
    conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, bnTimeConst)

    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    cMap2 = 32
    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
    rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
    #rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    cMap3 = 64
    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
    rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
    #rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    # Global average pooling
    poolW = 8
    poolH = 8
    poolhStride = 1
    poolvStride = 1
    pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = $imageLayout$)

    ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -0,0 +1,32 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -0,0 +1,64 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -0,0 +1,148 @@
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
    p = Plus(c, b)
    y = RectifiedLinear(p)
]

ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
    c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
    p = Plus(c, b)
    y = RectifiedLinear(p)
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
    W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]

ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
    c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
    y = RectifiedLinear(c)
]

ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
    p = Plus(c2, inp)
    y = RectifiedLinear(p)
]

ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst, Wproj)
[
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)

    # Projection convolution layer.
    c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
    #c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)

    p = Plus(c2, c_proj)
    y = RectifiedLinear(p)
]

ResNetNode2Inc2(inp, inMap, outMap, inWCount, wCount, kW, kH, wScale, w1Scale, bValue, scValue, bnTimeConst)
[
    pool = MaxPooling(inp, 1, 1, 2, 2, imageLayout = $imageLayout$)
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)
    c3 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)

    p = Plus(c2, pool)
    r = RowStack(p, c3)
    y = RectifiedLinear(r)
]

DnnReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue)
[
    W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
    W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
    y = RectifiedLinear(bn)
]

DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
    W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
    y = RectifiedLinear(bn)
]

DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
[
    W = LearnableParameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]

DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
    W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]
(File diff not shown because of its large size.)
@@ -0,0 +1,23 @@
#!/bin/bash

. $TEST_DIR/run-test-common

#dataDir="."
ConfigDir=$TEST_DIR
LogFileName="ASGDMultiGPU"
Instances=4
NumCPUThreads=$(threadsPerInstance $Instances)
parallelizationMethod="DataParallelASGD"

# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=512 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3

# Delete the test data if copied
[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir"

exit $ExitCode
@@ -0,0 +1,35 @@
#!/bin/bash

. $TEST_ROOT_DIR/run-test-common

export MKL_NUM_THREADS=4
export MKL_CBWR=COMPATIBLE
export OMP_NUM_THREADS=1

ConfigDir=$TEST_DIR

if [[ ! -d $TEST_DATA_DIR || ! -e $TEST_DATA_DIR/Train_cntk_text.txt || ! -e $TEST_DATA_DIR/train_map.txt ]]; then
    # Cannot find test data locally.
    # Try external test data directory (not part of the CNTK repository) as an alternative.
    if [[ -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then
        if [ "$OS" == "Windows_NT" ]; then
            DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/CIFAR/v0
        else
            DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/CIFAR/v0
        fi

        # Copy the test data to the test run directory
        DataDir=$TEST_RUN_DIR/TestData
        mkdir $DataDir
        mkdir $DataDir/cifar-10-batches-py
        cp -R $DataSourceDir/*_cntk_text.txt $DataDir || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/data.zip $DataDir/cifar-10-batches-py || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/CIFAR-10_mean.xml $DataDir || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/*_map.txt $DataDir || exit $?
        Copied=1
    else
        echo Error: cannot find data. Please see Examples/Image/DataSets/CIFAR10/README.md for instructions to get it.
        exit 1
    fi
fi
@@ -0,0 +1,31 @@
dataDir: .

tags:
  # runs in every BVT job in the 'P' (Parallel) leg, in Debug-GPU Linux configurations only:
  # TODO: enable the Windows test when Jenkins is ready
  - bvt-p (build_sku == 'gpu') and (flavor=='debug') and (os == 'linux') and (device == 'gpu')
  # runs unconditionally in every Nightly job in the 'P' leg
  - nightly-p (build_sku == 'gpu') and (os == 'linux') and (device == 'gpu')

testCases:
  Must train epochs in exactly the same order and with the same parameters for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting Epoch {{integer}}
      - learning rate per sample = {{float}}

  Epochs must finish with expected results for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Finished Epoch[{{integer}} of {{integer}}]

  Per-minibatch training results must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
      - " * {{integer}}; "

  DataParallelASGD training parameters must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting minibatch loop
The following test drivers changed mode from Normal file to Executable file (0 lines changed):

Tests/EndToEndTests/Speech/HTKDeserializers/DNN/DiscriminativePreTraining/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/Parallel1BitQuantization/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelBufferedAsyncGradientAggregation/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelNoQuantization/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelNoQuantizationBufferedAsyncGradientAggregation/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/WriteCommand/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/LSTM/FullUtterance/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/AdaptLearnRate/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/CrossValidateSimpleNetwork/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/EvalSimpleNetwork/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainAutoEncoder/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainLstm/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainMultiInput/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainMultiTask/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainNdlNetwork/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainSimpleNetwork/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainWithPreTrain/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/WriteBottleneck/run-test
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/WriteScaledLogLike/run-test
@@ -0,0 +1,115 @@
CPU info:
    CPU Model Name: Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz
    Hardware threads: 40
    Total Memory: 264118516 kB
-------------------------------------------------------------------
Running 8 test cases...
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 4 elapse = 3.26092ms average = 0.81523ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 5 elapse = 0.035872ms average = 0.0071744ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 4 elapse = 0.079631ms average = 0.0199077ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 5 elapse = 0.055307ms average = 0.0110614ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 5 elapse = 0.014141ms average = 0.0028282ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 3 elapse = 3.64047ms average = 1.21349ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 4 elapse = 0.35004ms average = 0.08751ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a sync server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 6 elapse = 3.35131ms average = 0.558552ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 7 elapse = 0.049346ms average = 0.00704943ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 6 elapse = 0.110051ms average = 0.0183418ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 7 elapse = 0.068958ms average = 0.00985114ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 7 elapse = 0.018843ms average = 0.00269186ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 4 elapse = 3.83984ms average = 0.959961ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 5 elapse = 0.420295ms average = 0.084059ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully

Test module "multiverso" has passed with:
  8 test cases out of 8 passed
  83 assertions out of 83 passed

  Test suite "array_test" has passed with:
    2 test cases out of 2 passed
    34 assertions out of 34 passed

    Test case "array_test/array_access" has passed with:
      20 assertions out of 20 passed

    Test case "array_test/array_partition" has passed with:
      14 assertions out of 14 passed

  Test suite "blob" has passed with:
    2 test cases out of 2 passed
    7 assertions out of 7 passed

    Test case "blob/blob_constructor_test" has passed with:
      3 assertions out of 3 passed

    Test case "blob/blob_access_test" has passed with:
      4 assertions out of 4 passed

  Test suite "test_kv" has passed with:
    1 test case out of 1 passed
    3 assertions out of 3 passed

    Test case "test_kv/access" has passed with:
      3 assertions out of 3 passed

  Test suite "message" has passed with:
    1 test case out of 1 passed
    11 assertions out of 11 passed

    Test case "message/message_access" has passed with:
      11 assertions out of 11 passed

  Test suite "node" has passed with:
    1 test case out of 1 passed
    8 assertions out of 8 passed

    Test case "node/node_role" has passed with:
      8 assertions out of 8 passed

  Test suite "test_sync" has passed with:
    1 test case out of 1 passed
    20 assertions out of 20 passed

    Test case "test_sync/sync" has passed with:
      20 assertions out of 20 passed
@@ -0,0 +1,6 @@
#!/bin/bash

. $TEST_ROOT_DIR/run-test-common
. $TEST_DIR/../run-boost-test-common

boosttestrun multiversotests
@@ -0,0 +1,19 @@
dataDir: .

tags:
  # CPU only, at this stage.
  # TODO: move from the 'L' leg to a separate leg; requires infra changes
  - bvt-l (build_sku == 'cpu') or (build_sku == '1bitsgd')
  - nightly-l (build_sku == 'cpu') or (build_sku == '1bitsgd')

testCases:
  Test cases pass:
    patterns:
      - "Test case"
      - "passed with"

  Test suites pass:
    patterns:
      - "Test suite"
      - "passed with"
@@ -43,7 +43,7 @@ checkEmptyStdout \

checkEmptyStdout \
-   "git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/1BitSGD" \
+   "git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/Multiverso | grep -v Source/1BitSGD" \
    "files that are neither marked as binary nor text; should extend .gitattributes"

# TODO line ending checks
@@ -55,9 +55,10 @@ makebuildinfo()
    local CUDA_PATH=$6
    local CUB_PATH=$7
    local WITH_1BITSGD=$8
-   local BUILDER=$9
-   local BUILDMACHINE=${10}
-   local BUILDPATH=${11}
+   local WITH_ASGD=$9
+   local BUILDER=${10}
+   local BUILDMACHINE=${11}
+   local BUILDPATH=${12}

    (
        printf "#ifndef _BUILDINFO_H\n"

@@ -84,6 +85,11 @@ makebuildinfo()
    else
        printf "#define _WITH_1BITSGD_ \"no\"\n"
    fi
+   if [ ! -z "$WITH_ASGD" ]; then
+       printf "#define _WITH_ASGD_ \"yes\"\n"
+   else
+       printf "#define _WITH_ASGD_ \"no\"\n"
+   fi
    printf "#define _BUILDER_ \"%s\"\n" "$BUILDER"
    printf "#define _BUILDMACHINE_ \"%s\"\n" "$BUILDMACHINE"
    printf "#define _BUILDPATH_ \"%s\"\n" "$BUILDPATH"

@@ -152,6 +158,7 @@ makebuildinfo \
    "$CUDAPATH" \
    "$CUBPATH" \
    "$CNTK_ENABLE_1BitSGD" \
+   "$CNTK_ENABLE_ASGD" \
    "$BUILDER" \
    "$BUILDMACHINE" \
    "$BUILDPATH"
@@ -90,6 +90,9 @@ enable_1bitsgd=$default_use_1bitsgd
default_use_code_coverage=no
enable_code_coverage=$default_use_code_coverage

+default_use_asgd=yes
+enable_asgd=$default_use_asgd
+
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local"

@@ -322,6 +325,7 @@ function show_help ()
    echo "  --with-build-top=directory build directory $(show_default $build_top)"
    echo "  --add directory add directory to library search path"
    echo "  --1bitsgd[=(yes|no)] use 1Bit SGD $(show_default ${default_use_1bitsgd})"
+   echo "  --asgd[=(yes|no)] use ASGD powered by Multiverso $(show_default ${default_use_asgd})"
    echo "  --cuda[=(yes|no)] use cuda GPU $(show_default $(default_use_cuda))"
    echo "  --python[=(yes|no)] with Python bindings $(show_default $(default_use_python))"
    echo "  --with-cuda[=directory] $(show_default $(find_cuda))"

@@ -402,6 +406,17 @@
            fi
            ;;

+       --asgd*)
+           if test x$optarg = xyes || test x$optarg = xno
+           then
+               enable_asgd=$optarg
+           else
+               echo "Invalid value for --asgd $optarg"
+               show_help
+               exit
+           fi
+           ;;
+
        --cuda*)
            if test x$optarg = xyes || test x$optarg = xno
            then

@@ -1040,6 +1055,10 @@ if test x$protobuf_path != x; then
    echo PROTOBUF_PATH=$protobuf_path >> $config
fi

+if test $enable_asgd = yes ; then
+    echo CNTK_ENABLE_ASGD=true >> $config
+fi
+
# If we are not in the configure directory, generate a trampoline Makefile
makefile=$build_top/Makefile
if test $(is_hardlinked "$configure" "$build_top/configure") = no