Integrate qiwye/asgd-dev into master
Commit e618b917fe
.gitmodules
@@ -1,3 +1,6 @@
[submodule "Source/1BitSGD"]
	path = Source/1BitSGD
	url = https://git.codeplex.com/cntk1bitsgd
[submodule "Source/Multiverso"]
	path = Source/Multiverso
	url = https://github.com/Microsoft/Multiverso
CNTK.sln | 59
@@ -137,6 +137,9 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
	ProjectSection(ProjectDependencies) = postProject
		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {16F14058-B116-49D9-8BA0-209F3AFFE849}
	EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
	ProjectSection(SolutionItems) = preProject
@@ -1285,6 +1288,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryDistributionTests"
		{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiverso\src\Multiverso.vcxproj", "{16F14058-B116-49D9-8BA0-209F3AFFE849}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}"
	ProjectSection(ProjectDependencies) = postProject
		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
@@ -2245,6 +2252,56 @@ Global
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|Mixed Platforms.Build.0 = Release|x64
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Any CPU.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|Mixed Platforms.Build.0 = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.ActiveCfg = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Debug|x64.Build.0 = debug|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Any CPU.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|Mixed Platforms.Build.0 = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.ActiveCfg = release|x64
		{16F14058-B116-49D9-8BA0-209F3AFFE849}.Release|x64.Build.0 = release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Any CPU.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|Mixed Platforms.Build.0 = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.ActiveCfg = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Debug|x64.Build.0 = Debug|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Any CPU.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.Build.0 = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.ActiveCfg = Release|x64
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.Build.0 = Release|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
		{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
@@ -2447,6 +2504,8 @@ Global
		{E844AB9A-A48F-4A99-9625-F528C5C46D83} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
		{CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
		{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
		{16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
		{EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
		{5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
	EndGlobalSection
EndGlobal
06_OneConvRegrMultiNode.cntk
@@ -0,0 +1,102 @@
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>

command = trainNetwork

precision = "float"; traceLevel = 1 ; deviceId = "auto"

rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
outputDir = "./Output" ;

modelPath = "$outputDir$/Models/06_OneConvRegrMultiNode"
#stderr = "$outputDir$/06_OneConvRegr_bs_out"

parallelizationMethod=DataParallelSGD

# TRAINING CONFIG
trainNetwork = {
    action = "train"

    BrainScriptNetworkBuilder = {
        imageShape = 28:28:1 # image dimensions, 1 channel only
        labelDim = 10        # number of distinct labels
        featScale = 1/256
        Scale{f} = x => Constant(f) .* x

        model = Sequential (
            Scale {featScale} :
            ConvolutionalLayer {16, (5:5), pad = true} : ReLU :
            MaxPoolingLayer {(2:2), stride=(2:2)} :
            DenseLayer {64} : ReLU :
            LinearLayer {labelDim}
        )
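        # shape check (derived from the layers above): 28x28x1 -> 5x5 conv, 16 maps,
        # pad=true -> 28x28x16 -> 2x2 max-pool, stride 2 -> 14x14x16 -> dense 64 -> linear 10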

        # inputs
        features = Input {imageShape}
        labels = Input {labelDim}

        # apply model to features
        z = model (features)

        # loss and error computation
        sqErr = SquareError (labels, z)
        rmse = Sqrt (sqErr / labelDim)

        # declare special nodes
        featureNodes    = (features)
        labelNodes      = (labels)
        criterionNodes  = (rmse)
        evaluationNodes = (rmse)
        outputNodes     = (z)
    }

    SGD = {
        epochSize = 0
        minibatchSize = 64
        maxEpochs = 15
        learningRatesPerSample = 0.001*5:0.0005
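        # schedule syntax: 0.001 for the first 5 epochs, then 0.0005 for the rest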
        momentumAsTimeConstant = 1024
        numMBsToShowResult = 500
        ParallelTrain = [
            parallelizationMethod = $parallelizationMethod$
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = [
                gradientBits = 32
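                # 32 = send full-precision gradients; setting this to 1 would enable
                # 1-bit SGD quantization (which requires the 1BitSGD submodule)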
            ]
            ModelAveragingSGD = [
                blockSizePerWorker = 64
            ]
            DataParallelASGD = [
                syncPeriod = 64
                usePipeline = false
            ]
        ]
    }
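    # Only the sub-block matching $parallelizationMethod$ above is read at run
    # time; the other two sub-blocks are ignored, so all three can stay in one file.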

    reader = {
        readerType = "CNTKTextFormatReader"
        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
        file = "$DataDir$/Train-28x28_cntk_text.txt"
        input = {
            features = { dim = 784 ; format = "dense" }
            labels   = { dim = 10 ; format = "dense" }
        }
    }
}

# TEST CONFIG
testNetwork = {
    action = "test"
    minibatchSize = 1024    # reduce this if you run out of memory

    reader = {
        readerType = "CNTKTextFormatReader"
        file = "$DataDir$/Test-28x28_cntk_text.txt"
        input = {
            features = { dim = 784 ; format = "dense" }
            labels   = { dim = 10 ; format = "dense" }
        }
    }
}
README.md
@@ -101,3 +101,12 @@ In the fifth example, we show how CNTK can be used to perform a regression task.

`cntk configFile=05_OneConvRegr.cntk`

The trained network achieves a root-mean-square error (RMSE) of 0.0039. To see more sophisticated examples of regression tasks, please refer to [Regression](../Regression).

### 06_OneConvRegrMultiNode.cntk

In the sixth example, we show how to train CNTK with multiple processes (GPUs) on a regression task. CNTK uses MPI for multi-node training and currently supports four parallel SGD algorithms: DataParallelSGD, BlockMomentumSGD, ModelAveragingSGD, and DataParallelASGD. We reuse the network architecture of `05_OneConvRegr` and only add a parallel-training block. To run this example on a single machine, use the following command:

`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelSGD`

You can change `parallelizationMethod` to any of the other three options. For a more detailed guide on training with multiple GPUs and machines, please refer to [Multiple GPUs and machines](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
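For instance, switching to data-parallel ASGD only changes the override on the command line (following the same pattern as above):

`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelASGD`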
Makefile | 82
@@ -516,6 +516,7 @@ $(CNTKLIBRARY_DISTRIBUTION_TESTS): $(CNTKLIBRARY_DISTRIBUTION_TESTS_OBJ) | $(CNT
EVAL:=eval

SGDLIB_SRC=\
	$(SOURCEDIR)/SGDLib/ASGDHelper.cpp \
	$(SOURCEDIR)/SGDLib/Profiler.cpp \
	$(SOURCEDIR)/SGDLib/SGD.cpp \
	$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
@@ -551,7 +552,7 @@ $(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) $(PROTOBUF_PATH)/lib/libprotobuf.a

########################################
# Eval Sample clients

@@ -570,7 +571,7 @@ $(EVAL_CLIENT): $(EVAL_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

EVAL_EXTENDED_CLIENT:=$(BINDIR)/cppevalextendedclient

@@ -586,7 +587,7 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_EXTENDED_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

########################################
# Eval V2 Sample client
@@ -893,6 +894,71 @@ endif
# temporarily added to 1bit; needs work with others to fix it
endif


########################################
# ASGD (Multiverso) setup
########################################


ifeq ("$(CNTK_ENABLE_ASGD)","true")

ifeq (,$(wildcard Source/Multiverso/include/multiverso/*.h))
  $(error Build with Multiverso was requested but cannot find the code. Please check https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines#24-data-parallel-asgd to learn more.)
endif

lMULTIVERSO:=-lmultiverso

INCLUDEPATH += $(SOURCEDIR)/Multiverso/include
COMMON_FLAGS += -DASGD_PARALLEL_SUPPORT

MULTIVERSO_LIB:=$(LIBDIR)/libmultiverso.so

ALL_LIBS+=$(MULTIVERSO_LIB)
ifeq ("$(BUILDTYPE)","release")
MULTIVERSO_CMAKE_BUILDTYPE=Release
endif
ifeq ("$(BUILDTYPE)","debug")
MULTIVERSO_CMAKE_BUILDTYPE=Debug
endif

$(MULTIVERSO_LIB):
	@echo "Build Multiverso lib"
	@mkdir -p $(LIBDIR)
	@mkdir -p $(BINDIR)
	@mkdir -p $(SOURCEDIR)/Multiverso/build/$(BUILDTYPE)
	@cmake -DCMAKE_VERBOSE_MAKEFILE=TRUE \
		-DBoost_NO_BOOST_CMAKE=TRUE \
		-DBoost_NO_SYSTEM_PATHS=TRUE \
		-DBOOST_ROOT:PATHNAME=$(BOOST_PATH) \
		-DBOOST_LIBRARY_DIRS:FILEPATH=$(BOOST_PATH) \
		-DLIBRARY_OUTPUT_PATH=$(shell readlink -f $(LIBDIR)) \
		-DEXECUTABLE_OUTPUT_PATH=$(shell readlink -f $(BINDIR)) \
		-DCMAKE_BUILD_TYPE=$(MULTIVERSO_CMAKE_BUILDTYPE) \
		-B./Source/Multiverso/build/$(BUILDTYPE) -H./Source/Multiverso
	@make VERBOSE=1 -C ./Source/Multiverso/build/$(BUILDTYPE) -j multiverso

UNITTEST_MULTIVERSO_SRC = \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_array.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_blob.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_kv.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_message.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_multiverso.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_node.cpp \
	$(SOURCEDIR)/Multiverso/Test/unittests/test_sync.cpp \

UNITTEST_MULTIVERSO_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MULTIVERSO_SRC))

UNITTEST_MULTIVERSO := $(BINDIR)/multiversotests

ALL += $(UNITTEST_MULTIVERSO)

$(UNITTEST_MULTIVERSO): $(UNITTEST_MULTIVERSO_OBJ) | $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(lMULTIVERSO) -ldl
endif
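
# A usage sketch (assuming the usual CNTK make workflow): fetch the submodule with
#   git submodule update --init Source/Multiverso
# and then activate the whole section above from the command line, e.g.
#   make BUILDTYPE=release CNTK_ENABLE_ASGD=true -j 8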

########################################
# cntk
########################################

@@ -926,11 +992,11 @@ CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)

$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB) $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a

# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs

@@ -967,7 +1033,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

# TODO: create project-specific makefiles or rules to avoid adding project-specific paths to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader

@@ -1027,11 +1093,11 @@ UNITTEST_NETWORK := $(BINDIR)/networktests
ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)

$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER) $(MULTIVERSO_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) $(lMULTIVERSO) -l$(CNTKMATH) -fopenmp $(PROTOBUF_PATH)/lib/libprotobuf.a

UNITTEST_MATH_SRC = \
	$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
CNTK.cpp
@@ -336,7 +336,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
            fprintf(stderr, "\n");
            if (traceLevel > 0)
            {
                LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());
            }

            NDLScript<ElemType> ndlScript;

@@ -373,6 +373,9 @@ void PrintBuiltInfo()
#ifdef _WITH_1BITSGD_
    LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
    LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
    LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif

@@ -718,9 +721,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
    {
        for (int i = 0; i < command.size(); i++) // append all 'command' entries
        {
            logpath += L"_";
            logpath += (wstring)command[i];
        }
    }
    logpath += L".log"; // append .log
}

@@ -744,21 +747,21 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
    if (traceLevel > 0)
    {
        PrintBuiltInfo();
        PrintGpuInfo();
    }

#ifdef _DEBUG
    if (traceLevel > 0)
    {
        // This simply merges all the different config parameters specified (eg, via config files or via command line directly),
        // and prints it.
        fprintf(stderr, "\nConfiguration, Raw:\n\n");
        LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());

        // Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
        // all of these assignments will appear, even though only the last assignment matters.
        fprintf(stderr, "\nConfiguration After Variable Resolution:\n\n");
        LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
    }
#endif

@@ -769,12 +772,12 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
    if (traceLevel > 0)
    {
        fprintf(stderr, "\nConfiguration After Processing and Variable Resolution:\n\n");
        config.dumpWithResolvedVariables();

        LOGPRINTF(stderr, "Commands:");
        for (int i = 0; i < command.size(); i++)
            fprintf(stderr, " %s", command[i].c_str());
        fprintf(stderr, "\n");
    }

    // run commands
CNTK.vcxproj
@@ -85,7 +85,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">
@@ -113,7 +113,7 @@
      <StackReserveSize>100000000</StackReserveSize>
    </Link>
    <PreBuildEvent>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
      <Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
    </PreBuildEvent>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
prebuild.bat
@@ -22,6 +22,7 @@ set p_CNTK_ENABLE_1BitSGD=%~3
set p_CudaPath=%~4
set p_CUDNN_PATH=%~5
set p_CUB_PATH=%~6
set p_CNTK_ENABLE_ASGD=%~7

echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$
@@ -75,7 +76,12 @@ if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
) else (
    echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)

:: CNTK_ENABLE_ASGD is assumed to default to true
if "%p_CNTK_ENABLE_ASGD%" == "false" (
    echo #define _WITH_ASGD_ "no">>buildinfo.h$$
) else (
    echo #define _WITH_ASGD_ "yes">>buildinfo.h$$
)
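:: e.g. with the default (flag unset or "true"), buildinfo.h will contain:
::   #define _WITH_ASGD_ "yes"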
if not %l_build_target% == CPU-only (
    if "%p_CudaPath%" == "" (
        echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$
ASGDHelper.h
@@ -0,0 +1,67 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#include <list>
#include "ComputationNetwork.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// class AdjustLearningRateAtBeginning
// Provides options for DataParallelASGD training, so that every node
// can adjust its learning rate every minibatch during the first N epochs.
// -----------------------------------------------------------------------
// TODO: We can remove these options once we can adjust the learning rate at the minibatch level
enum class AdjustLearningRateAtBeginning : int
{
    None = 0,             // default: don't adjust the learning rate
    Linearly = 1,         // linear adjustment: the learning rate ramps from 0 up to learningRatesPerMB
    Staircase = (1 << 1), // staircase adjustment: the learning rate steps from 0 up to learningRatesPerMB every adjustNbMinibatch minibatches
};

template<class ElemType = float>
class ASGDHelper
{
public:
    virtual ~ASGDHelper() { }
    // -----------------------------------------------------------------------
    // InitModel() -- Uploads the initialized model (pre-computed by CNTK logic)
    // to the parameter servers, so that every node starts training from the same model.
    // -----------------------------------------------------------------------
    virtual void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) = 0;

    // -----------------------------------------------------------------------
    // PushAndPullModel() -- Pushes the parameters of learnableNodes to the parameter servers, then gets the latest model back.
    // -----------------------------------------------------------------------
    virtual bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced = 0) = 0;

    // -----------------------------------------------------------------------
    // WaitAll() -- Barrier: waits for all other nodes to reach this point.
    // -----------------------------------------------------------------------
    virtual void WaitAll() = 0;

    // -----------------------------------------------------------------------
    // WaitAsyncBuffer() -- Waits for the pipeline thread to finish its job when useAsyncBuffer is true.
    // -----------------------------------------------------------------------
    virtual void WaitAsyncBuffer() = 0;

}; // class ASGDHelper

// Factory method to create an ASGDHelper instance
template<class ElemType = float>
ASGDHelper<ElemType>* NewASGDHelper(
    const std::list<ComputationNodeBasePtr> & learnableNodes, // parameters that need to be trained
    size_t nodeNumRanks,                                      // number of working nodes
    bool useAsyncBuffered = true,                             // use an asynchronous buffer to hide communication cost
    bool isSimulatedModelAveragingSGD = false,                // use parameter-server-based model averaging rather than ASGD
    AdjustLearningRateAtBeginning adjusttype =
        AdjustLearningRateAtBeginning::None,                  // adjust the learning rate per minibatch at the very beginning of training
    double adjustCoef = 0.2,                                  // see DecayCoefficient()
    size_t adjustPerMinibatches = 600,
    int traceLevel = 0,                                       // log level
    int syncPerfStats = 0);                                   // show perf data every syncPerfStats syncs

}}}
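
To make the interface above concrete, here is a minimal usage sketch. It is an illustration only: the learnable nodes, worker count, and minibatch loop are assumed to come from the surrounding SGD setup (as in the SGD.cpp changes later in this diff), and the constants simply mirror the factory's defaults.

// Hedged usage sketch of ASGDHelper (illustration, not part of the commit).
template <class ElemType>
void RunASGDTrainingSketch(const std::list<ComputationNodeBasePtr>& learnableNodes,
                           size_t numWorkers, size_t totalMinibatches)
{
    ASGDHelper<ElemType>* helper = NewASGDHelper<ElemType>(
        learnableNodes,
        numWorkers,
        /*useAsyncBuffered=*/true,               // hide communication behind the pipeline thread
        /*isSimulatedModelAveragingSGD=*/false,  // plain ASGD rather than simulated model averaging
        AdjustLearningRateAtBeginning::Linearly, // warm updates up at the start of training
        /*adjustCoef=*/0.2,
        /*adjustPerMinibatches=*/600,
        /*traceLevel=*/0,
        /*syncPerfStats=*/0);

    helper->InitModel(learnableNodes);           // every worker starts from the same weights
    for (size_t mb = 0; mb < totalMinibatches; ++mb)
    {
        // ... forward/backward pass and local parameter update for one minibatch ...
        helper->PushAndPullModel(learnableNodes, /*sampleSinceLastSynced=*/64);
    }
    helper->WaitAll();                           // barrier before checkpointing
    delete helper;
}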
MPIWrapper.h
@@ -99,6 +99,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>

        int argc = 0;
        char** argv = NULL;
        // TODO(qiwye): Multiverso (the parameter server) would benefit from MPI_THREAD_MULTIPLE.
        int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
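        // (MPI background: SERIALIZED allows several threads to make MPI calls as long
        // as the calls never overlap in time; MULTIPLE would let Multiverso's background
        // communication thread and the trainer thread call MPI concurrently.)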
        int provided;
        int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);
Source/Multiverso (new submodule)
@@ -0,0 +1 @@
Subproject commit 40743f9c86297f63b29c99c259199f59f16c0c7c
ASGDHelper.cpp
@@ -0,0 +1,670 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// ASGDHelper.cpp : Implements the ASGDHelper interface. The implementation is based on Multiverso.
//

#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings

#include "ASGDHelper.h"
#include "MPIWrapper.h"
#include "ComputationNetwork.h"
#include "TimerUtility.h"

#include <functional>
#include <thread>
#include <unordered_map>
#include <numeric>
#include <algorithm>

#ifdef ASGD_PARALLEL_SUPPORT

#include <multiverso/multiverso.h>
#include <multiverso/util/configure.h>
#include <multiverso/table/array_table.h>
#include <multiverso/updater/updater.h>

#pragma comment(lib, "Multiverso.lib")

#endif

#ifndef CPUONLY
#include <cuda_runtime.h>
#pragma comment (lib, "cudart.lib") // for cudaMemcpyAsync()
#endif

namespace Microsoft { namespace MSR { namespace CNTK {

#ifndef CPUONLY

#include <cuda_runtime.h>

// -----------------------------------------------------------------------
// Error handling
// -----------------------------------------------------------------------

template <typename ERRTYPE>
static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode)
{
    if (retCode != successCode)
    {
        try
        {
#ifdef _WIN32
            const char* hostname = getenv("COMPUTERNAME");
#else
            char hostname[HOST_NAME_MAX];
            if (gethostname(hostname, HOST_NAME_MAX) != 0)
                strcpy(hostname, "?");
#endif
            int currentCudaDevice;
            cudaGetDevice(&currentCudaDevice);
            Microsoft::MSR::CNTK::RuntimeError("%s failure %d; GPU=%d ; hostname=%s ; expr=%s", libName, (int)retCode, currentCudaDevice, hostname ? hostname : "?", exprString);
        }
        catch (const std::exception& e) // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error
        {
            std::cerr << e.what() << std::endl;
            throw;
        }
    }
}

#define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess))
#endif // CPUONLY

#ifdef ASGD_PARALLEL_SUPPORT

// MultiversoHelper is the implementation of the ASGDHelper interface on top of Multiverso
template<class ElemType = float>
class MultiversoHelper : public ASGDHelper<ElemType>
{
public:
    typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;

    MultiversoHelper(const std::list<ComputationNodeBasePtr> & learnableNodes, // parameters that need to be trained
        size_t nodeNumRanks,                       // number of working nodes
        bool useAsyncBuffer = true,                // use an asynchronous buffer to hide communication cost
        bool isSimulatedModelAveragingSGD = false, // use parameter-server-based model averaging rather than ASGD
        AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None, // adjust the learning rate per minibatch at the very beginning of training;
                                                   // this can be used to tackle the instability of ASGD
        double adjustCoef = 0.2,                   // see DecayCoefficient()
        size_t adjustPerMinibatches = 600,
        int traceLevel = 0,                        // log level
        int syncPerfStats = 0) :                   // show perf data every syncPerfStats syncs
        m_parameterSyncCounter(0), m_adjustLearningRateAtBeginningType(adjusttype),
        m_adjustCoefficient(adjustCoef), m_adjustMBNumber(adjustPerMinibatches),
        m_totalClientNumber(nodeNumRanks), m_useAsyncBuffer(useAsyncBuffer),
        m_traceLevel(traceLevel), m_ModelAveragingSGDSimulating(isSimulatedModelAveragingSGD), m_doesEveryNodesShouldSynced(false),
        m_syncPerfStats(syncPerfStats)
    {
        if (m_ModelAveragingSGDSimulating)
        {
            m_doesEveryNodesShouldSynced = true;
            m_useAsyncBuffer = false;
        }
        // pipeline-related variables
        m_localBufferNum = m_useAsyncBuffer ? 2 : 1;
        m_bufferSwapIndex = new int[m_localBufferNum];

        // CPU asynchronous buffer
        m_cpuAsyncBuffer = new ElemType*[m_localBufferNum];

        // options used by the Multiverso sparse update
        m_getOptions.reserve(m_localBufferNum);
        m_addOptions.reserve(m_localBufferNum);

#ifndef CPUONLY
        // GPU asynchronous buffer
        m_gpuAsyncBuffer.resize(m_localBufferNum);
        // create a communication stream for the data transfer between GPU and CPU
        CUDA_CALL(cudaStreamCreate(&_commStream));
#endif
        m_bufferIndexInUse = 0;
        for (int i = 0; i < m_localBufferNum; i++)
            m_bufferSwapIndex[i] = (i + 1) % m_localBufferNum;

        m_aysncBufferThread = nullptr;

        multiverso::SetCMDFlag("logtostderr", true);

        if (m_doesEveryNodesShouldSynced)
            multiverso::SetCMDFlag("sync", true);

        MultiversoInit(learnableNodes);
    }

    ~MultiversoHelper()
    {
        fprintf(stderr, "~MultiversoHelper\n");
        fflush(stderr);

        if (m_useAsyncBuffer && m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
            m_aysncBufferThread->join();

        delete[] m_bufferSwapIndex;

        for (size_t i = 0; i < m_localBufferNum; i++)
        {
#ifndef CPUONLY
            CUDA_CALL(cudaFreeHost(m_cpuAsyncBuffer[i]));
#else
            delete[] m_cpuAsyncBuffer[i];
#endif
        }
        delete[] m_cpuAsyncBuffer;
#ifndef CPUONLY
        CUDA_CALL(cudaStreamDestroy(_commStream));
#endif
        multiverso::MV_ShutDown(false);
    }

    void InitModel(const std::list<ComputationNodeBasePtr> & learnableNodes) override
    {
        float factor = 1.0f / m_totalClientNumber;

        int i = 0; // index of the learnable node
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
        {
            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
            Matrix<ElemType> &mat = node->Value();

#ifndef CPUONLY
            for (int j = 0; j < m_localBufferNum; j++)
                m_gpuAsyncBuffer[j].push_back(mat.DeepClone());
#endif
            ElemType* px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
            mat.CopyToArray(px, m_tableLength[i]);
        }

        for (int i = 1; i < m_localBufferNum; i++)
            memcpy(m_cpuAsyncBuffer[i], m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);

        memcpy(m_deltaArray, m_cpuAsyncBuffer[0], sizeof(ElemType) * m_totalModelSize);

        // The parameter server subtracts the delta on the server side, so we send the negated initial model.
        std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), -factor));

        m_workerArray->Add(m_deltaArray, m_totalModelSize);
        m_workerArray->Get(m_deltaArray, m_totalModelSize);
        WaitAll();
        m_workerArray->Get(m_deltaArray, m_totalModelSize);

        if (std::equal(m_deltaArray, m_deltaArray + m_totalModelSize, m_cpuAsyncBuffer[0]))
            fprintf(stderr, "multiverso initial model loaded.\n");
        m_reportTimer.Start();
    }

    bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override
    {
        m_parameterSyncCounter++;

        double fromCPUToGPUTime;
        double fromGPUToCPUTime;
        double networkTime;
        double swapTimeOnGPU;
        m_reportTimer.Restart();
        WaitAsyncBuffer();
        m_reportTimer.Stop();

        // reset statistics for profiling
        if (m_traceLevel > 2 && m_syncPerfStats > 0 && m_parameterSyncCounter % m_syncPerfStats == 0)
        {
            fromCPUToGPUTime = 0;
            fromGPUToCPUTime = 0;
            networkTime = 0;
            swapTimeOnGPU = 0;
        }

        m_bufferIndexInUse = m_bufferSwapIndex[m_bufferIndexInUse];

        int i = 0; // index of the learnable node
        if (m_useAsyncBuffer)
        {
            m_reportTimer.Restart();
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();
#ifndef CPUONLY
                // CNTK model -> GPU buffer
                CUDA_CALL(cudaMemcpy(m_gpuAsyncBuffer[m_bufferIndexInUse][i].Data(),
                    mat.Data(),
                    mat.GetNumElements() * sizeof(ElemType),
                    cudaMemcpyDeviceToDevice));

                // GPU buffer -> CNTK model
                CUDA_CALL(cudaMemcpy(mat.Data(),
                    m_gpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]][i].Data(),
                    mat.GetNumElements() * sizeof(ElemType),
                    cudaMemcpyDeviceToDevice));
#else
                ElemType * px = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[i];
                mat.CopyToArray(px, m_tableLength[i]);
                ElemType * py = m_cpuAsyncBuffer[m_bufferSwapIndex[m_bufferIndexInUse]] + m_tableOffsets[i];
                mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), py);
#endif
            }
            m_reportTimer.Stop();
            if (m_traceLevel > 2)
            {
                swapTimeOnGPU = m_reportTimer.ElapsedSeconds();
            }
#ifndef CPUONLY
            m_aysncBufferThread = new thread([&]()
            {
                float factor = DecayCoefficient();
                int deviceId = m_gpuAsyncBuffer[m_bufferIndexInUse][0].GetDeviceId();

                CUDA_CALL(cudaSetDevice(deviceId));

                Timer threadTimer;
                threadTimer.Restart();
                for (int widx = 0; widx < m_tableCount; widx++)
                {
                    ElemType * px = m_deltaArray + m_tableOffsets[widx];
                    // GPU buffer -> CPU buffer
                    CUDA_CALL(cudaMemcpyAsync(px,
                        m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
                        m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
                        cudaMemcpyDeviceToHost,
                        _commStream));
                }
                // wait until the copy from GPU to CPU has finished
                CUDA_CALL(cudaStreamSynchronize(_commStream));
                threadTimer.Stop();

                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
                }

                // delta = gradient * learning_rate
                std::transform(m_cpuAsyncBuffer[m_bufferIndexInUse],
                    m_cpuAsyncBuffer[m_bufferIndexInUse] + m_totalModelSize,
                    m_deltaArray, m_deltaArray,
                    std::minus<ElemType>());

                threadTimer.Restart();
                // lr decay
                std::transform(m_deltaArray,
                    m_deltaArray + m_totalModelSize,
                    m_deltaArray,
                    std::bind1st(std::multiplies<ElemType>(), factor));

                ElemType* px = m_deltaArray;
                ElemType* py = m_cpuAsyncBuffer[m_bufferIndexInUse];
                m_workerArray->AddAsync(px, m_totalModelSize);
                m_workerArray->Get(py, m_totalModelSize);

                threadTimer.Stop();
                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
                }

                threadTimer.Restart();
                // copy parameters from CPU buffer to GPU buffer
                for (int widx = 0; widx < m_tableCount; widx++)
                {
                    ElemType * py = m_cpuAsyncBuffer[m_bufferIndexInUse] + m_tableOffsets[widx];

                    CUDA_CALL(cudaMemcpyAsync(m_gpuAsyncBuffer[m_bufferIndexInUse][widx].Data(),
                        py,
                        m_gpuAsyncBuffer[m_bufferIndexInUse][widx].GetNumElements() * sizeof(ElemType),
                        cudaMemcpyHostToDevice,
                        _commStream));
                }
                CUDA_CALL(cudaStreamSynchronize(_commStream));
                threadTimer.Stop();
                if (m_traceLevel > 3)
                {
                    double time = threadTimer.ElapsedSeconds();
                    fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
                }
            });
#else
            m_aysncBufferThread = new thread([&]()
            {
                float factor = DecayCoefficient();
                int t_cacheIdx = m_bufferIndexInUse;

                std::transform(m_cpuAsyncBuffer[t_cacheIdx], m_cpuAsyncBuffer[t_cacheIdx] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));

                ElemType* px = m_deltaArray;
                ElemType* py = m_cpuAsyncBuffer[t_cacheIdx];
                m_workerArray->AddAsync(px, m_totalModelSize);
                m_workerArray->Get(py, m_totalModelSize);
            });
#endif
        }
        else
        {
            m_reportTimer.Restart();
            float factor = DecayCoefficient();
            i = 0;
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();

                ElemType * px = m_deltaArray + m_tableOffsets[i];
                mat.CopyToArray(px, m_tableLength[i]);
            }

            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, GPU -> CPU time %lf \n", time);
            }
            std::transform(m_cpuAsyncBuffer[0], m_cpuAsyncBuffer[0] + m_totalModelSize, m_deltaArray, m_deltaArray, std::minus<ElemType>());

            // lr decay
            if (m_ModelAveragingSGDSimulating)
            {
                factor = ModelAggregationCoefficient(sampleSinceLastSynced);
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
                if (m_traceLevel > 2 && m_syncPerfStats != 0)
                {
                    if (m_parameterSyncCounter % m_syncPerfStats == 0)
                        ReportPerfStats(m_totalClientNumber * m_sampleSinceLastReport, m_sampleSinceLastReport);
                    else
                        m_sampleSinceLastReport += sampleSinceLastSynced;
                }
            }
            else
            {
                std::transform(m_deltaArray, m_deltaArray + m_totalModelSize, m_deltaArray, std::bind1st(std::multiplies<ElemType>(), factor));
            }
            m_reportTimer.Restart();

            ElemType* px = m_deltaArray;
            ElemType* py = m_cpuAsyncBuffer[0];
            m_workerArray->AddAsync(px, m_totalModelSize);
            m_workerArray->Get(py, m_totalModelSize);

            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, Worker <--> Multiverso time %lf \n", time);
            }
            m_reportTimer.Restart();
            i = 0;
            for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
            {
                ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
                Microsoft::MSR::CNTK::Matrix<ElemType> &mat = node->Value();

                ElemType * px = m_cpuAsyncBuffer[0] + m_tableOffsets[i];
                mat.SetValue(mat.GetNumRows(), mat.GetNumCols(), mat.GetDeviceId(), px);
            }
            m_reportTimer.Stop();
            if (m_traceLevel > 3)
            {
                double time = m_reportTimer.ElapsedSeconds();
                fprintf(stderr, "\t\t -- pullAndRequest, CPU -> GPU time %lf \n", time);
            }
        }
        return true;
    }

    void WaitAll() override
    {
        multiverso::MV_Barrier();
    }

    void WaitAsyncBuffer() override
    {
        if (m_aysncBufferThread != nullptr && m_aysncBufferThread->joinable())
        {
            m_aysncBufferThread->join();
            delete m_aysncBufferThread;
            m_aysncBufferThread = nullptr;
        }
    }

private:
    void MultiversoInit(const std::list<ComputationNodeBasePtr> & learnableNodes)
    {
        // The parameter server offers a variety of updaters; we only use the SGD updater for this simple case.
        multiverso::SetCMDFlag<std::string>(std::string("updater_type"), std::string("sgd"));
        multiverso::MV_Init();

        int i = 0;
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, i++)
        {
            ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
            Matrix<ElemType> &mat = node->Value();
            size_t layerSize = mat.GetNumElements();

            m_tableLength.push_back(layerSize);
        }

        m_tableCount = m_tableLength.size();

        // calculate the total size of the learnable nodes
        m_totalModelSize = accumulate(m_tableLength.begin(), m_tableLength.end(), 0);

        m_serverArray = new multiverso::ArrayServer<ElemType>(m_totalModelSize);
        m_workerArray = new multiverso::ArrayWorker<ElemType>(m_totalModelSize);

        multiverso::MV_Barrier();

        size_t idx = 0;
        for (size_t len : m_tableLength)
        {
            m_tableOffsets.push_back(idx);
            idx += len;
        }

#ifndef CPUONLY
        for (int i = 0; i < m_localBufferNum; i++)
            m_gpuAsyncBuffer[i].reserve(m_tableCount);

        // create pinned memory
        for (int i = 0; i < m_localBufferNum; ++i)
            CUDA_CALL(cudaMallocHost((void **)&m_cpuAsyncBuffer[i], sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));

        CUDA_CALL(cudaMallocHost((void **)&m_deltaArray, sizeof(ElemType) * (m_totalModelSize), cudaHostAllocPortable));
#else
        for (int i = 0; i < m_localBufferNum; i++)
            m_cpuAsyncBuffer[i] = new ElemType[m_totalModelSize];
        m_deltaArray = new ElemType[m_totalModelSize]; // delta buffer (allocated as pinned memory in GPU builds)
#endif
    }

    float DecayCoefficient()
    {
        float f = 1.f;
        switch (m_adjustLearningRateAtBeginningType)
        {
        case AdjustLearningRateAtBeginning::None:
            break;
        case AdjustLearningRateAtBeginning::Linearly:
            f = min(f, max(0.f, (float)(m_adjustCoefficient + (1 - m_adjustCoefficient) / m_adjustMBNumber * m_parameterSyncCounter)));
            break;
        case AdjustLearningRateAtBeginning::Staircase:
            f = min(f, max(0.f, (float)(m_adjustCoefficient * (m_parameterSyncCounter / m_adjustMBNumber + 1))));
            break;
        default:
            break;
        }
        return f;
    }
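
    // In other words: with the Linearly policy the effective scale after t syncs is
    // f(t) = min(1, c + (1 - c) * t / N), where c = m_adjustCoefficient and
    // N = m_adjustMBNumber; e.g. c = 0.2, N = 600 starts updates at 20% strength and
    // reaches full strength by sync 600. Staircase instead grows in steps of c every
    // N syncs, starting at c.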

    float ModelAggregationCoefficient(size_t samplesSinceLastSync)
    {
        float factor = 0;
        int nTotalSamples = samplesSinceLastSync;
        // m_pMPI->AllReduce(&nTotalSamples, 1);

        if (nTotalSamples <= 0)
        {
            factor = 1.0f / m_pMPI->NumNodesInUse();
            // give an estimated one
        }
        else
        {
            factor = (samplesSinceLastSync + 0.0f) / nTotalSamples;
        }
        factor = 1.0f / m_pMPI->NumNodesInUse();
        return factor;
    }

    inline void transpose(ElemType *src, ElemType *dst, const int N, const int M)
    {
        for (auto n = 0; n < N*M; n++) {
            auto i = n / N;
            auto j = n % N;
            dst[n] = src[M*j + i];
        }
    }

    void ReportPerfStats(size_t totalSamplesProcessedSinceLastReport,
        size_t localSamplesProcessedSinceLastReport)
    {
        m_reportTimer.Stop();
        double secondsSinceLastReport = m_reportTimer.ElapsedSeconds();
        m_reportTimer.Restart();

        float totalThroughput = secondsSinceLastReport > 0 ? (float)totalSamplesProcessedSinceLastReport / ((float)secondsSinceLastReport * 1000.0f) : 0.0f;
        float throughputPerWorker = totalThroughput / m_totalClientNumber;

        string prefix = "\t\t(sim-model aggregation stats) %d-th sync: %8.2f seconds since last report ; %d samples processed by %d workers (%d by me);\n"
            "\t\t(sim-model aggregation stats) %d-th sync: totalThroughput = %.2fk samplesPerSecond , throughputPerWorker = %.2fk samplesPerSecond\n";
        fprintf(stderr, prefix.c_str(), (int)m_parameterSyncCounter, secondsSinceLastReport, (int)totalSamplesProcessedSinceLastReport, (int)m_totalClientNumber, (int)localSamplesProcessedSinceLastReport,
            (int)m_parameterSyncCounter, totalThroughput, throughputPerWorker);
        m_sampleSinceLastReport = 0;
    }

    multiverso::ArrayServer<ElemType>* m_serverArray;
    multiverso::ArrayWorker<ElemType>* m_workerArray;

    thread * m_aysncBufferThread;
    bool m_doesEveryNodesShouldSynced;
    bool m_ModelAveragingSGDSimulating;

    int m_totalClientNumber;
    int m_traceLevel;
    int m_syncPerfStats;
    Timer m_reportTimer;
    size_t m_parameterSyncCounter;
    size_t m_sampleSinceLastReport;

    bool m_useAsyncBuffer;
    int m_localBufferNum;
    int * m_bufferSwapIndex;
    int m_bufferIndexInUse;
    std::vector<multiverso::GetOption*> m_getOptions; // used by the sparse table
    std::vector<multiverso::AddOption*> m_addOptions; // used by the sparse table

    AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginningType;
    double m_adjustCoefficient;
    size_t m_adjustMBNumber;

    vector<size_t> m_tableLength;
    size_t m_totalModelSize;
    vector<size_t> m_tableOffsets;
    //shared_ptr<ElemType> m_deltaArray;
    ElemType * m_deltaArray;
    //std::vector<shared_ptr<ElemType> > m_cpuAsyncBuffer;
    ElemType ** m_cpuAsyncBuffer;

    MPIWrapperPtr m_pMPI;

    // GPU double buffer
    std::vector<std::vector<Matrix<ElemType> >> m_gpuAsyncBuffer;
    int m_tableCount;

#ifndef CPUONLY
    cudaStream_t _commStream;
#endif
}; // class MultiversoHelper

#endif

// A no-op implementation of the ASGDHelper interface, which does nothing.
// This is used when CNTK_ENABLE_ASGD = false.
template<class ElemType = float>
class NoneASGDHelper : public ASGDHelper<ElemType>
{
public:
    NoneASGDHelper(const std::list<ComputationNodeBasePtr> & learnableNodes,
        int nodeNumRanks,
        bool useAsyncBuffer = true,
        bool isSimModelAveragingSGD = false,
        AdjustLearningRateAtBeginning adjusttype = AdjustLearningRateAtBeginning::None,
        double adjustcoef = 0.2,
        size_t adjustnbmb = 600,
        int traceLevel = 0,
        int syncPerfStats = 0,
        const MPIWrapperPtr& pMPI = nullptr) { }

    ~NoneASGDHelper() { }

    void InitModel(const std::list<ComputationNodeBasePtr> & learnableNode) override { }

    bool PushAndPullModel(const std::list<ComputationNodeBasePtr> & learnableNodes, size_t sampleSinceLastSynced) override {
        return true;
    }

    void WaitAll() override { }

    void WaitAsyncBuffer() override { }
};

template<class ElemType>
ASGDHelper<ElemType>* NewASGDHelper(
    const std::list<ComputationNodeBasePtr> & learnableNodes, // parameters that need to be trained
    size_t nodeNumRanks,                                      // number of working nodes
    bool useAsyncBuffer,                                      // use an asynchronous buffer to hide communication cost
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats)
{
#ifdef ASGD_PARALLEL_SUPPORT
    return new MultiversoHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
        adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#else
    return new NoneASGDHelper<ElemType>(learnableNodes, nodeNumRanks, useAsyncBuffer, isSimulatedModelAveragingSGD,
        adjusttype, adjustCoef, adjustPerMinibatches, traceLevel, syncPerfStats);
#endif
}

template ASGDHelper<float>* NewASGDHelper<float>(
    const std::list<ComputationNodeBasePtr> & learnableNodes,
    size_t nodeNumRanks,
    bool useAsyncBuffer,
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats);

template ASGDHelper<double>* NewASGDHelper<double>(
    const std::list<ComputationNodeBasePtr> & learnableNodes,
    size_t nodeNumRanks,
    bool useAsyncBuffer,
    bool isSimulatedModelAveragingSGD,
    AdjustLearningRateAtBeginning adjusttype,
    double adjustCoef,
    size_t adjustPerMinibatches,
    int traceLevel,
    int syncPerfStats);

}}}
@ -25,6 +25,8 @@
|
|||
#include "V2AllReduceDistGradAggregator.h"
|
||||
#endif
|
||||
|
||||
#include "ASGDHelper.h"
|
||||
|
||||
#include "SimpleDistGradAggregator.h"
|
||||
#include "V2SimpleDistGradAggregator.h"
|
||||
#include "ProgressTracing.h"
|
||||
|
@ -403,15 +405,27 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR);
|
||||
}
|
||||
|
||||
// Multiverso Warpper for ASGD logic init
|
||||
if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
|
||||
{
|
||||
m_pASGDHelper.reset(NewASGDHelper<ElemType>(learnableNodes,
|
||||
m_mpi->NumNodesInUse(),
|
||||
m_isAsyncBufferEnabled,
|
||||
m_isSimulateMA,
|
||||
m_adjustLearningRateAtBeginning,
|
||||
m_adjustCoefficient,
|
||||
m_adjustPerMinibatches,
|
||||
m_traceLevel,
|
||||
m_syncStatsTrace));
|
||||
m_pASGDHelper->InitModel(learnableNodes);
|
||||
}
|
||||
|
||||
// --- MAIN EPOCH LOOP
|
||||
for (int i = startEpoch; i < (int) m_maxEpochs; i++) // TODO: why is this an int, and not a size_t?
|
||||
{
|
||||
// Synchronize all ranks before proceeding to ensure that
|
||||
// rank 0 has finished writing the previous model file
|
||||
if (m_mpi != nullptr)
|
||||
{
|
||||
m_mpi->WaitAll();
|
||||
}
|
||||
BarrierWorkers();
|
||||
|
||||
// (re-)initialize 1-bit SGD
|
||||
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD &&
|
||||
|
@@ -575,7 +589,9 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,

        if (validationSetDataReader != trainSetDataReader && validationSetDataReader != nullptr)
        {
            SimpleEvaluator<ElemType> evalforvalidation(net, m_mpi, m_enableDistributedMBReading);
            // TODO(dataASGD): make the evaluator non-distributed when using ASGD, since Multiverso has another background thread using MPI.
            //                 Note that serial (non-distributed) evaluation slows training down, especially when the validation set is large.
            SimpleEvaluator<ElemType> evalforvalidation(net, UsingAsyncGradientAggregation(i + 1) ? nullptr : m_mpi, m_enableDistributedMBReading);
            vector<wstring> cvSetTrainAndEvalNodes;
            if (criterionNodes.size() > 0)
            {
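The one-line change above encodes a deliberate trade-off: when the upcoming epoch runs under ASGD, the evaluator receives a null MPI communicator so validation runs serially instead of colliding with Multiverso's background MPI thread. A tiny sketch of the pattern, with Mpi, Evaluator, and MakeEvaluator as illustrative stand-ins:

struct Mpi {};
struct Evaluator
{
    explicit Evaluator(Mpi* mpi) : m_mpi(mpi) {} // mpi == nullptr => serial (non-distributed) evaluation
    Mpi* m_mpi;
};

Evaluator MakeEvaluator(bool usingAsgdThisEpoch, Mpi* mpi)
{
    return Evaluator(usingAsgdThisEpoch ? nullptr : mpi);
}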
@@ -712,10 +728,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
        // Synchronize all ranks before proceeding to ensure that
        // nobody tries to read the checkpoint file at the same time
        // as rank 0 deletes it below
        if (m_mpi != nullptr)
        {
            m_mpi->WaitAll();
        }
        BarrierWorkers();

        // Persist model and check-point info
        if ((m_mpi == nullptr) || m_mpi->IsMainNode())
@@ -783,10 +796,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,

    // Synchronize all ranks before proceeding to ensure that
    // rank 0 has finished writing the model file
    if (m_mpi != nullptr)
    {
        m_mpi->WaitAll();
    }
    // TODO[DataASGD]: should the other ranks wait here in async mode?
    BarrierWorkers();

    // progress tracing for compute cluster management
    ProgressTracing::TraceProgressPercentage(m_maxEpochs, 0.0, true);
@@ -803,6 +814,8 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
    }

    delete inputMatrices;
    if (m_parallelizationMethod == ParallelizationMethod::dataParallelASGD)
        m_pASGDHelper.reset();
}

// -----------------------------------------------------------------------
@@ -846,6 +859,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,

    bool useGradientAggregation = UsingGradientAggregation(epochNumber);
    bool useModelAggregation = UsingModelAggregation(epochNumber);
    bool useAsyncGradientAggregation = UsingAsyncGradientAggregation(epochNumber);
    bool useParallelTrain = UsingParallelTrain(epochNumber);

    // Find all evaluation nodes that accumulate error on their own.
@@ -981,6 +995,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
    double readTime = 0;
    double computeTime = 0;
    double parameterUpdateTime = 0;
    double parameterSyncTime = 0; // communication time spent on parameter syncs (perf tracing)
    if (m_perfTraceLevel > 0)
        fineGrainedPerfMeasurementTimer.Start();
@@ -1241,15 +1256,14 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        }
    }

    if (m_perfTraceLevel > 0)
    {
        std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(net->GetDeviceId()));
        mainStreamSyncEvent->SynchronizeEvent();
        fineGrainedPerfMeasurementTimer.Stop();
        parameterUpdateTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();

        PREPENDTS(stderr);
        fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs, Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, (int)aggregateNumSamples);
        fineGrainedPerfMeasurementTimer.Start();
    }

    // aggregation by model averaging or block momentum
@@ -1270,11 +1284,38 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        }
    }

    timer.Stop();
    numMBsRun++;
    // use the parameter server for the parameter update
    if (useAsyncGradientAggregation && m_mpi->NumNodesInUse() > 1)
    {
        // Determine if any samples were processed across any of the ranks
        if (useDistributedMBReading)
        {
            noMoreSamplesToProcess = !wasDataRead;
        }

        if (nSamplesSinceLastModelSync >= m_nFramesBetweenASGDSync[epochNumber])
        {
            m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
            nSamplesSinceLastModelSync = 0;
        }
    }

    if (m_perfTraceLevel > 0)
    {
        fineGrainedPerfMeasurementTimer.Stop();
        parameterSyncTime = fineGrainedPerfMeasurementTimer.ElapsedSeconds();
    }

    timer.Stop();
    if (m_perfTraceLevel > 0)
    {
        PREPENDTS(stderr);
        fprintf(stderr, "Perf trace: Worker MB size = %d, Read = %.5gs; Compute = %.5gs; Parameter update = %.5gs; Parameter sync = %.5gs; Aggregate MB size = %d\n", (int)actualMBSize, readTime, computeTime, parameterUpdateTime, parameterSyncTime, (int)aggregateNumSamples);
    }

    numMBsRun++;
    totalTimeInMBs += timer.ElapsedSeconds();
    //trainSamplesSinceLastLogged += (int)aggregateNumSamplesWithLabel; // now inside epochCriterionLastLogged

    // log
    // This shows the criterion since last logged.
@@ -1404,6 +1445,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
        nSamplesSinceLastModelSync = 0;
    }

    if (useAsyncGradientAggregation && (m_mpi->NumNodesInUse() > 1))
    {
        m_pASGDHelper->PushAndPullModel(learnableNodes, nSamplesSinceLastModelSync);
        nSamplesSinceLastModelSync = 0;
    }

    // hoist the accumulated criterion value from GPU side to our 'out' variables
    // (unless we useGradientAggregation, in which case they are accumulated in the 'out' variables directly)
    if (!useGradientAggregation)
@@ -2555,7 +2602,8 @@ static ParallelizationMethod ParseParallelizationMethod(const wstring& s)
    else if (EqualCI(s, L"DataParallelSGD"))   return ParallelizationMethod::dataParallelSGD;
    else if (EqualCI(s, L"ModelAveragingSGD")) return ParallelizationMethod::modelAveragingSGD;
    else if (EqualCI(s, L"BlockMomentumSGD"))  return ParallelizationMethod::blockMomentumSGD;
    else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD)");
    else if (EqualCI(s, L"dataParallelASGD"))  return ParallelizationMethod::dataParallelASGD;
    else InvalidArgument("ParseParallelizationMethod: Invalid Parallelization Method. Valid values are (none | DataParallelSGD | ModelAveragingSGD | BlockMomentumSGD | dataParallelASGD)");
}

static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
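ParseParallelizationMethod above follows CNTK's usual keyword-parsing convention: case-insensitive match, plus an error message that lists every accepted spelling (now including dataParallelASGD). A self-contained sketch of the same convention; Par, ParseMethod, and the local EqualCI are stand-ins for the real CNTK helpers of the same purpose:

#include <cwctype>
#include <stdexcept>
#include <string>

enum class Par { none, dataParallelSGD, modelAveragingSGD, blockMomentumSGD, dataParallelASGD };

static bool EqualCI(const std::wstring& a, const wchar_t* b) // stand-in: case-insensitive compare
{
    size_t i = 0;
    for (; i < a.size() && b[i] != 0; ++i)
        if (std::towlower(a[i]) != std::towlower(b[i]))
            return false;
    return i == a.size() && b[i] == 0;
}

Par ParseMethod(const std::wstring& s)
{
    if (s.empty() || EqualCI(s, L"none"))  return Par::none;
    if (EqualCI(s, L"DataParallelSGD"))    return Par::dataParallelSGD;
    if (EqualCI(s, L"ModelAveragingSGD"))  return Par::modelAveragingSGD;
    if (EqualCI(s, L"BlockMomentumSGD"))   return Par::blockMomentumSGD;
    if (EqualCI(s, L"dataParallelASGD"))   return Par::dataParallelASGD;
    throw std::invalid_argument("invalid parallelizationMethod"); // the real code calls InvalidArgument(...)
}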
@@ -2568,8 +2616,18 @@ static LearningRateSearchAlgorithm ParseLearningRateSearchType(const wstring& s)
    else if (EqualCI(s, L"afterEpoch") || EqualCI(s, L"after")) return LearningRateSearchAlgorithm::AdjustAfterEpoch;
    else InvalidArgument("autoAdjustLR: Invalid learning rate search type. Valid values are (none | searchBeforeEpoch | adjustAfterEpoch)");
}

template <class ConfigRecordType>

#ifdef ASGD_PARALLEL_SUPPORT
static AdjustLearningRateAtBeginning AdjustLearningRateAtBeginningType(const wstring& s)
{
    if (EqualCI(s.c_str(), L"") || EqualCI(s.c_str(), L"none")) return AdjustLearningRateAtBeginning::None;
    else if (EqualCI(s.c_str(), L"linearly"))  return AdjustLearningRateAtBeginning::Linearly;
    else if (EqualCI(s.c_str(), L"staircase")) return AdjustLearningRateAtBeginning::Staircase;
    else InvalidArgument("AdjustLearningRateAtBeginningType: Invalid Type. Valid values are (None | Linearly | Staircase)");
}
#endif

template<class ConfigRecordType>
SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
{
    floatargvector learningRatesPerMB = configSGD(L"learningRatesPerMB", ConfigRecordType::Array(floatargvector()));
@@ -2830,15 +2888,15 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
    else
    {
        size_t numMPIWorkers = pMPI->NumNodesInUse();
        const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
        m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
        m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int) 1) - 1; // Internally, epoch numbers are 0-based
        if (m_parallelizationStartEpochNum < 0 /* sic */)
        const ConfigRecordType& configParallelTrain(configSGD(L"ParallelTrain", ConfigRecordType::Record()));
        m_parallelizationMethod = ParseParallelizationMethod(configParallelTrain(L"parallelizationMethod", L"none"));
        m_parallelizationStartEpochNum = configParallelTrain(L"parallelizationStartEpoch", (int)1) - 1; // Internally, epoch numbers are 0-based
        if (m_parallelizationStartEpochNum < 0 /* sic */)
            // Be explicit that user-facing epoch numbers are 1-based
            InvalidArgument("parallelizationStartEpoch must be greater or equal to 1");
        m_enableDistributedMBReadingNotSpecified = !configParallelTrain.Exists(L"distributedMBReading");
        m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
        m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int) 0);
        m_enableDistributedMBReadingNotSpecified = !configParallelTrain.Exists(L"distributedMBReading");
        m_enableDistributedMBReading = configParallelTrain(L"distributedMBReading", false);
        m_syncStatsTrace = configParallelTrain(L"syncPerfStats", (int)0);

        if (configParallelTrain.Exists(L"DataParallelSGD"))
        {
@@ -2856,62 +2914,62 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
        if (configParallelTrain.Exists(L"ModelAveragingSGD"))
        {
            const ConfigRecordType& configMASGD(configParallelTrain(L"ModelAveragingSGD", ConfigRecordType::Record()));
            if (configMASGD.Exists(L"blockSizePerWorker") && configMASGD.Exists(L"blockSize"))
                InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
            else if (configMASGD.Exists(L"blockSize"))
                m_modelAggregationBlockSize = configMASGD(L"blockSize");
            else if (configMASGD.Exists(L"blockSizePerWorker"))
            {
                m_modelAggregationBlockSize = configMASGD(L"blockSizePerWorker");
                m_modelAggregationBlockSize *= numMPIWorkers;
            }
            else
                m_modelAggregationBlockSize = 40000 * numMPIWorkers; // default value
            if (configMASGD.Exists(L"blockSizePerWorker") && configMASGD.Exists(L"blockSize"))
                InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
            else if (configMASGD.Exists(L"blockSize"))
                m_modelAggregationBlockSize = configMASGD(L"blockSize");
            else if (configMASGD.Exists(L"blockSizePerWorker"))
            {
                m_modelAggregationBlockSize = configMASGD(L"blockSizePerWorker");
                m_modelAggregationBlockSize *= numMPIWorkers;
            }
            else
                m_modelAggregationBlockSize = 40000 * numMPIWorkers; // default value
#if 1 // legacy option
            if (configMASGD.Exists(L"syncFrequencyInFrames"))
            {
                if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                    InvalidArgument("syncFrequencyInFrames is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configMASGD(L"syncFrequencyInFrames");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncFrequencyInFrames in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead\n");
            }
            if (configMASGD.Exists(L"syncPeriod"))
            {
                if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                    InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configMASGD(L"syncPeriod");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncPeriod in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
                if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                    InvalidArgument("syncFrequencyInFrames is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configMASGD(L"syncFrequencyInFrames");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncFrequencyInFrames in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead\n");
            }
            if (configMASGD.Exists(L"syncPeriod"))
            {
                if (configMASGD.Exists(L"blockSizePerWorker") || configMASGD.Exists(L"blockSize"))
                    InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configMASGD(L"syncPeriod");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncPeriod in ModelAveragingSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
            }
#endif
        }
        if (configParallelTrain.Exists(L"BlockMomentumSGD"))
        {
#ifndef CNTK_PARALLEL_TRAINING_SUPPORT
            InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
            InvalidArgument("BlockMomentumSGD is not enabled in this version.\n");
#else
            const ConfigRecordType& configBMSGD(configParallelTrain(L"BlockMomentumSGD", ConfigRecordType::Record()));
            if (configBMSGD.Exists(L"blockSize") && configBMSGD.Exists(L"blockSizePerWorker"))
                InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
            else if (configBMSGD.Exists(L"blockSizePerWorker"))
            {
                m_modelAggregationBlockSize = configBMSGD(L"blockSizePerWorker");
                m_modelAggregationBlockSize *= numMPIWorkers;
            }
            else if (configBMSGD.Exists(L"blockSize"))
                m_modelAggregationBlockSize = configBMSGD(L"blockSize");
            else
                m_modelAggregationBlockSize = 120000 * numMPIWorkers; // default value
            if (configBMSGD.Exists(L"blockSize") && configBMSGD.Exists(L"blockSizePerWorker"))
                InvalidArgument("It is only allowed to set blockSizePerWorker or blockSize, not both of them");
            else if (configBMSGD.Exists(L"blockSizePerWorker"))
            {
                m_modelAggregationBlockSize = configBMSGD(L"blockSizePerWorker");
                m_modelAggregationBlockSize *= numMPIWorkers;
            }
            else if (configBMSGD.Exists(L"blockSize"))
                m_modelAggregationBlockSize = configBMSGD(L"blockSize");
            else
                m_modelAggregationBlockSize = 120000 * numMPIWorkers; // default value
#if 1 // legacy option
            if (configBMSGD.Exists(L"syncPeriod"))
            {
                if (configBMSGD.Exists(L"blockSizePerWorker") || configBMSGD.Exists(L"blockSize"))
                    InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configBMSGD(L"syncPeriod");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncPeriod in BlockMomentumSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
            }
            if (configBMSGD.Exists(L"syncPeriod"))
            {
                if (configBMSGD.Exists(L"blockSizePerWorker") || configBMSGD.Exists(L"blockSize"))
                    InvalidArgument("syncPeriod is a deprecated alias of blockSizePerWorker. It is not allowed to specify both of them");
                m_modelAggregationBlockSize = configBMSGD(L"syncPeriod");
                m_modelAggregationBlockSize *= numMPIWorkers;
                fprintf(stderr, "WARNING: option syncPeriod in BlockMomentumSGD is going to be deprecated. Please use blockSizePerWorker instead in the future.\n");
            }
#endif
            m_resetSGDMomentum = configBMSGD(L"resetSGDMomentum", true);
            m_useNesterovBlockMomentum = configBMSGD(L"useNesterovMomentum", true);
@@ -2929,16 +2987,35 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
            else if (configBMSGD.Exists(L"blockMomentumPerSync"))
            {
                double blockMomentum = configBMSGD(L"blockMomentumPerSync");
                m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
                m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
            }
#endif
            else /*if (!configBMSGD.Exists(L"blockMomentumPerSync") && !configBMSGD.Exists(L"blockMomentumAsTimeConstant"))*/
            {
                double blockMomentum = 1.0 - 1.0 / (double)numMPIWorkers; // this is a default value which ensures each block update contributes equally
                m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
                double blockMomentum = 1.0 - 1.0 / (double)numMPIWorkers; // this is a default value which ensures each block update contributes equally
                m_blockMomentumAsTimeConstant = BlockMomentumSGD<double>::Momentum2TimeConstant(blockMomentum, m_modelAggregationBlockSize);
            }
#endif
            InitializeAndCheckBlockMomentumSGDParameters();
        }

        if (configParallelTrain.Exists(L"DataParallelASGD"))
        {
#ifndef ASGD_PARALLEL_SUPPORT
            InvalidArgument("DataParallelASGD is not enabled in this version.\n");
#else
            const ConfigRecordType & configDataParallelASGD(configParallelTrain(L"DataParallelASGD", ConfigRecordType::Record()));
            m_nFramesBetweenASGDSync = configDataParallelASGD(L"syncPeriod", ConfigRecordType::Array(intargvector(vector<int>{256})));
            m_isAsyncBufferEnabled = configDataParallelASGD(L"UsePipeline", false);
            m_isSimulateMA = configDataParallelASGD(L"SimModelAverage", false); // use the parameter-server-based version of ModelAveragingSGD
            if (configDataParallelASGD.Exists(L"AdjustLearningRateAtBeginning")) // adjust the learning rate every m_adjustPerMinibatches minibatches until it reaches the original one;
                                                                                 // this option can be used to tackle the instability of ASGD at the beginning of training
            {
                const ConfigRecordType & configAdjustLearningRateAtBeginning(configDataParallelASGD(L"AdjustLearningRateAtBeginning", ConfigRecordType::Record()));
                m_adjustLearningRateAtBeginning = AdjustLearningRateAtBeginningType(configAdjustLearningRateAtBeginning(L"adjustType", L"None"));
                m_adjustCoefficient = configAdjustLearningRateAtBeginning(L"adjustCoefficient", (double)0.1);
                m_adjustPerMinibatches = configAdjustLearningRateAtBeginning(L"adjustPerMinibatches", (size_t)256);
            }
#endif
        }
    } // if (!pMPI)
} // if (configSGD.Exists(L"ParallelTrain"))
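The AdjustLearningRateAtBeginning block parsed above only stores three knobs (adjustType, adjustCoefficient, adjustPerMinibatches); the ramp itself is applied inside the Multiverso helper, which this diff does not show. A sketch of one plausible reading of those knobs: scale the learning rate down by adjustCoefficient at the start of training, then return to the base rate in windows of adjustPerMinibatches minibatches, with Staircase jumping once per window and Linearly interpolating. The actual schedule lives in MultiversoHelper and may differ; WarmupLr and its arguments are illustrative:

#include <algorithm>
#include <cstddef>

enum class AdjustType { None, Linearly, Staircase };

double WarmupLr(double baseLr, AdjustType type, double coef, size_t perMinibatches, size_t mbSeen)
{
    if (type == AdjustType::None || coef <= 0.0 || coef >= 1.0 || perMinibatches == 0)
        return baseLr;
    const size_t windows = static_cast<size_t>(1.0 / coef);      // windows until back to baseLr
    const size_t window  = std::min(mbSeen / perMinibatches, windows);
    const double scale   = (type == AdjustType::Staircase)
        ? coef * double(window + 1)                               // jump once per window
        : coef + (1.0 - coef) * double(window) / double(windows); // linear ramp per window
    return baseLr * std::min(scale, 1.0);
}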
@@ -19,7 +19,7 @@
#include <random>
#include "Profiler.h"
#include "MASGD.h"

#include "ASGDHelper.h"
using namespace std; // ugh! TODO: get rid of this from .h files!!!

#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
@@ -60,6 +60,7 @@ enum class ParallelizationMethod : int
    dataParallelSGD = 1,
    modelAveragingSGD = 2,
    blockMomentumSGD = 3,
    dataParallelASGD = 4,
    modelParallelSGD = (1 << 8) // Currently unsupported
};
@@ -286,6 +287,14 @@ protected:
    double m_L2RegWeight;
    double m_L1RegWeight;

    // Parallel training settings related to ASGD
    intargvector m_nFramesBetweenASGDSync;
    bool m_isAsyncBufferEnabled;
    bool m_isSimulateMA;
    AdjustLearningRateAtBeginning m_adjustLearningRateAtBeginning;
    double m_adjustCoefficient;
    size_t m_adjustPerMinibatches;

    // sequence training
    double m_hSmoothingWeight;
    double m_frameDropThresh;
@@ -352,7 +361,7 @@ public:

        if (m_mpi == nullptr)
            m_parallelizationMethod = ParallelizationMethod::none;
    }
}

void Train(shared_ptr<ComputationNetwork> net, DEVICEID_TYPE deviceId,
           IDataReader* trainSetDataReader,
@@ -564,20 +573,41 @@ protected:

private:
    void MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNetworkPtr& net, const ComputationNodeBasePtr& criterionNode);
    std::shared_ptr<ASGDHelper<ElemType>> m_pASGDHelper;

    bool UsingGradientAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD) && (epochNumber >= m_parallelizationStartEpochNum));
    }

    bool UsingModelAggregation(size_t epochNumber) const
    {
        return ((GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
                 GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) &&
                (epochNumber >= m_parallelizationStartEpochNum));
    }
    bool UsingParallelTrain(size_t epochNumber) const

    bool UsingAsyncGradientAggregation(size_t epochNumber)
    {
        return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber);
        return ((GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD) && (epochNumber >= m_parallelizationStartEpochNum));
    }

    bool UsingParallelTrain(size_t epochNumber)
    {
        return UsingGradientAggregation(epochNumber) || UsingModelAggregation(epochNumber) || UsingAsyncGradientAggregation(epochNumber);
    }

    void BarrierWorkers()
    {
        if (m_mpi != nullptr && GetParallelizationMethod() != ParallelizationMethod::dataParallelASGD)
        {
            m_mpi->WaitAll();
        }
        if (m_mpi != nullptr && GetParallelizationMethod() == ParallelizationMethod::dataParallelASGD)
        {
            m_pASGDHelper->WaitAll();
        }
        return;
    }
};
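Two details in the header above deserve a note: each Using* predicate gates its mode on epochNumber >= m_parallelizationStartEpochNum (the 0-based form of the 1-based parallelizationStartEpoch config value), and BarrierWorkers deliberately routes the barrier through m_pASGDHelper->WaitAll() under ASGD so plain MPI calls do not interfere with Multiverso's own background MPI thread. A self-contained sketch of the gating, with Method and GatingSketch as illustrative stand-ins and the combined predicate simplified relative to the real OR of three checks:

#include <cstddef>

enum class Method { none, dataParallelSGD, modelAveragingSGD, blockMomentumSGD, dataParallelASGD };

struct GatingSketch
{
    Method method;
    size_t startEpoch0; // 0-based, i.e. parallelizationStartEpoch - 1

    bool UsingAsyncGradientAggregation(size_t epoch) const
    {
        return method == Method::dataParallelASGD && epoch >= startEpoch0;
    }
    bool UsingParallelTrain(size_t epoch) const
    {
        // simplification: any configured mode counts once its start epoch is reached
        return method != Method::none && epoch >= startEpoch0;
    }
};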
@@ -43,10 +43,12 @@
    <ClCompile>
      <AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\CNTKv2LibraryDll;$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
      <AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_ASGD)'!='false'">$(SolutionDir)Source\multiverso\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'!='false'">ASGD_PARALLEL_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4819</DisableSpecificWarnings>
    </ClCompile>
    <Link>
@@ -101,6 +103,7 @@
    <ClInclude Include="..\Common\Include\BestGpu.h" />
    <ClInclude Include="..\Common\Include\Config.h" />
    <ClInclude Include="..\Common\Include\DataReader.h" />
    <ClInclude Include="..\Common\Include\ASGDHelper.h" />
    <ClInclude Include="..\Common\Include\TensorShape.h" />
    <ClInclude Include="..\Common\Include\DataWriter.h" />
    <ClInclude Include="..\Common\Include\File.h" />
@@ -138,6 +141,7 @@
    <ClInclude Include="V2SimpleDistGradAggregator.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="ASGDHelper.cpp" />
    <ClCompile Include="PostComputingActions.cpp" />
    <ClCompile Include="Profiler.cpp" />
    <ClCompile Include="SGD.cpp" />
@@ -13,6 +13,9 @@
    <ClCompile Include="PostComputingActions.cpp">
      <Filter>Stat</Filter>
    </ClCompile>
    <ClCompile Include="ASGDHelper.cpp">
      <Filter>Parallelization</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Common\Include\fileutil.h">

@@ -135,6 +138,9 @@
    <ClInclude Include="V2SimpleDistGradAggregator.h">
      <Filter>Parallelization</Filter>
    </ClInclude>
    <ClInclude Include="..\Common\Include\ASGDHelper.h">
      <Filter>Parallelization</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="Common">
@@ -177,4 +183,4 @@
      <UniqueIdentifier>{f406217f-5a11-44ca-bb34-52254dbee8af}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
</Project>
</Project>
@@ -130,7 +130,7 @@ public:
    actualMBSize = 0; // (undefined if !wasDataRead)

    if (actualMBSize > 0)
    {
    {

        size_t actualNumSubminibatches = numSubminibatchesNeeded <= 1 ? 1 : smbDispatcher.GetMinibatchIntoCache(*dataReader, *m_net, inputMatrices, numSubminibatchesNeeded);
        for (size_t ismb = 0; ismb < actualNumSubminibatches; ismb++)
0
Tests/EndToEndTests/Examples/Image/Deprecated/CIFAR-10/02_BatchNormConv/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Image/Deprecated/CIFAR-10/05_ConvLocal/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/01_OneHidden_ndl/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/02_Convolution_ndl/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Image/Deprecated/MNIST/03_ConvBatchNorm_ndl/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Speech/TIMIT/CrossValidateSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Speech/TIMIT/EvalSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Speech/TIMIT/TrainSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Examples/Speech/TIMIT/TrainWithPreTrain/run-test
Normal file → Executable file
@@ -0,0 +1,147 @@
# Note: This sample uses the deprecated NdlNetworkBuilder.
# An updated version using BrainScript is coming soon.
# Please find updated samples on Github, https://github.com/Microsoft/CNTK/tree/master/Examples /...
#
makeMode = true
RootDir = "."

configName = "ssgd"
minibatch = 128
epochSize = 5
parallelizationMethod = "DataParallelSGD"
asyncBuffer = "true"

ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output-$configName$"
ModelDir = "$OutputDir$/Models"

ndlMacros = "$ConfigDir$/Macros.ndl"

precision = "float"
DeviceId = "auto"
imageLayout = "cudnn"


# override the above as follows when running on CPU:
# deviceId = -1

# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true

prefetch = "true"
parallelTrain = "false"

command = Train

stderr = "$OutputDir$/03_ResNet"
traceLevel = 1

Proj16to32Filename = "$ConfigDir$/16to32.txt"
Proj32to64Filename = "$ConfigDir$/32to64.txt"

Train = [
    action = "train"
    modelPath = "$ModelDir$/03_ResNet"

    NDLNetworkBuilder = [
        networkDescription = "$ConfigDir$/03_ResNet.ndl"
    ]

    SGD = [
        epochSize = 0
        minibatchSize = $minibatch$
        # Note that learning rates are 10x more than in the paper due to a different
        # momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
        learningRatesPerSample = 0.004*80:0.0004*40:0.00004
        momentumPerMB = 0
        maxEpochs = $epochsize$
        L2RegWeight = 0.0001
        dropoutRate = 0
        perfTraceLevel = 0

        firstMBsToShowResult = 1
        numMBsToShowResult = 10

        ParallelTrain = [
            parallelizationMethod = $parallelizationMethod$
            distributedMBReading = "true"
            parallelizationStartEpoch = 1
            DataParallelSGD = [
                gradientBits = 32
                useBufferedAsyncGradientAggregation = $asyncBuffer$
            ]
            ModelAveragingSGD = [
                blockSizePerWorker = 128
            ]
            DataParallelASGD = [
                syncPeriod = 128
                usePipeline = $asyncBuffer$
            ]
        ]
    ]

    reader = [
        readerType = "ImageReader"
        file = "$DataDir$/train_map.txt"
        randomize = "auto"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "random"
            cropRatio = 0.8
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]

    cvReader = [
        readerType = "ImageReader"
        file = "$DataDir$/test_map.txt"
        randomize = "none"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "center"
            cropRatio = 1
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]
]

Test = [
    action = "test"
    modelPath = "$ModelDir$/03_ResNet"
    # Set minibatch size for testing.
    minibatchSize = 256

    reader = [
        readerType = "ImageReader"
        file = "$DataDir$/cifar-10-batches-py/test_map.txt"
        randomize = "none"
        features = [
            width = 32
            height = 32
            channels = 3
            cropType = "center"
            cropRatio = 1
            jitterType = "uniRatio"
            interpolations = "linear"
            meanFile = "$DataDir$/cifar-10-batches-py/CIFAR-10_mean.xml"
        ]
        labels = [
            labelDim = 10
        ]
    ]
]
@@ -0,0 +1,67 @@
load=LocalMacros
run=DNN

LocalMacros = [
    ImageW = 32
    ImageH = 32
    ImageC = 3
    LabelDim = 10

    features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
    labels = Input(LabelDim, tag = label)

    convWScale = 7.07
    convBValue = 0

    fc1WScale = 0.4
    fc1BValue = 0

    scValue = 1

    # Batch normalization time constant.
    bnTimeConst = 4096

    kW = 3
    kH = 3

    hStride1 = 1
    vStride1 = 1
]

DNN=[
    conv1WScale = 0.26
    cMap1 = 16
    conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hStride1, vStride1, conv1WScale, convBValue, scValue, bnTimeConst)

    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    cMap2 = 32
    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
    rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
    #rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    cMap3 = 64
    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
    rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
    #rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

    # Global average pooling
    poolW = 8
    poolH = 8
    poolhStride = 1
    poolvStride = 1
    pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = $imageLayout$)

    ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue)

    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
    Err = ClassificationError(labels, ol, tag = Eval)
    OutputNodes = ol
]
@@ -0,0 +1,32 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -0,0 +1,64 @@
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
@@ -0,0 +1,148 @@
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
    p = Plus(c, b)
    y = RectifiedLinear(p)
]

ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
    W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
    b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
    c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
    p = Plus(c, b)
    y = RectifiedLinear(p)
]

ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
    W = LearnableParameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]

ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
    c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
    y = RectifiedLinear(c)
]

ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
    b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

    c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
    p = Plus(c2, inp)
    y = RectifiedLinear(p)
]

ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst, Wproj)
[
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)

    # Projection convolution layer.
    c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
    #c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)

    p = Plus(c2, c_proj)
    y = RectifiedLinear(p)
]

ResNetNode2Inc2(inp, inMap, outMap, inWCount, wCount, kW, kH, wScale, w1Scale, bValue, scValue, bnTimeConst)
[
    pool = MaxPooling(inp, 1, 1, 2, 2, imageLayout = $imageLayout$)
    # First convolution layer.
    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
    # Second convolution layer, no ReLU.
    c2 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)
    c3 = ConvBNLayer(c1, inMap, wCount, kW, kH, 1, 1, w1Scale, bValue, scValue, bnTimeConst)

    p = Plus(c2, pool)
    r = RowStack(p, c3)
    y = RectifiedLinear(r)
]

DnnReLULayer(inDim, outDim, x, wScale, bValue)
[
    W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

DNNImageReLULayer(inW, inH, inC, outDim, x, wScale, bValue)
[
    W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
    y = RectifiedLinear(z)
]

DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
    W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
    y = RectifiedLinear(bn)
]

DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeConst)
[
    W = ImageParameter(outDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
    sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
    m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
    t = Times(W, x)
    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
    y = RectifiedLinear(bn)
]

DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
[
    W = LearnableParameter(labelDim, hiddenDim, init = Gaussian, initValueScale = wScale)
    b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]

DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
    W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
    b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
    t = Times(W, x)
    z = Plus(t, b)
]
Diff not shown because the file is too large.
@@ -0,0 +1,23 @@
#!/bin/bash

. $TEST_DIR/run-test-common

#dataDir="."
ConfigDir=$TEST_DIR
LogFileName="ASGDMultiGPU"
Instances=4
NumCPUThreads=$(threadsPerInstance $Instances)
parallelizationMethod="DataParallelASGD"

# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" 03_ResNet-parallel.cntk "numCPUThreads=$NumCPUThreads precision=float DeviceId=\"auto\" parallelTrain=true minibatch=512 epochsize=10 asyncBuffer=\"false\" parallelizationMethod=$parallelizationMethod"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank2
sed 's/^/MPI Rank 3: /' $TEST_RUN_DIR/"$LogFileName"_Train.logrank3

# Delete the test data if copied
[[ "$Copied" -eq "1" ]] && rm -rf "$DataDir"

exit $ExitCode
@@ -0,0 +1,35 @@
#!/bin/bash

. $TEST_ROOT_DIR/run-test-common

export MKL_NUM_THREADS=4
export MKL_CBWR=COMPATIBLE
export OMP_NUM_THREADS=1

ConfigDir=$TEST_DIR


if [[ ! -d $TEST_DATA_DIR || ! -e $TEST_DATA_DIR/Train_cntk_text.txt || ! -e $TEST_DATA_DIR/train_map.txt ]]; then
    # Cannot find test data locally.
    # Try external test data directory (not part of the CNTK repository) as an alternative.
    if [[ -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then
        if [ "$OS" == "Windows_NT" ]; then
            DataSourceDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`/Image/CIFAR/v0
        else
            DataSourceDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY/Image/CIFAR/v0
        fi

        # Copy the test data to the test run directory
        DataDir=$TEST_RUN_DIR/TestData
        mkdir $DataDir
        mkdir $DataDir/cifar-10-batches-py
        cp -R $DataSourceDir/*_cntk_text.txt $DataDir || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/data.zip $DataDir/cifar-10-batches-py || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/CIFAR-10_mean.xml $DataDir || exit $?
        cp -R $DataSourceDir/cifar-10-batches-py/*_map.txt $DataDir || exit $?
        Copied=1
    else
        echo Error: cannot find data. Please see Examples/Image/DataSets/CIFAR10/README.md for instructions to get it.
        exit 1
    fi
fi
@@ -0,0 +1,31 @@
dataDir: .

tags:
  # runs in every BVT job's 'P' (Parallel) leg, on Debug-GPU Linux configurations only:
  # TODO: enable the Windows test when Jenkins is ready
  - bvt-p (build_sku == 'gpu') and (flavor=='debug') and (os == 'linux') and (device == 'gpu')
  # runs unconditionally in every Nightly job's 'P' leg
  - nightly-p (build_sku == 'gpu') and (os == 'linux') and (device == 'gpu')

testCases:
  Must train epochs in exactly same order and parameters for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting Epoch {{integer}}
      - learning rate per sample = {{float}}

  Epochs must be finished with expected results for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Finished Epoch[{{integer}} of {{integer}}]

  Per-minibatch training results must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
      - " * {{integer}}; "

  DataParallelASGD training parameters must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting minibatch loop
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/DiscriminativePreTraining/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/Parallel1BitQuantization/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelBufferedAsyncGradientAggregation/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelNoQuantization/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/ParallelNoQuantizationBufferedAsyncGradientAggregation/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/DNN/WriteCommand/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/LSTM/FullUtterance/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/AdaptLearnRate/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/CrossValidateSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/EvalSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainAutoEncoder/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainLstm/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainMultiInput/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainMultiTask/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainNdlNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainSimpleNetwork/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/TrainWithPreTrain/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/WriteBottleneck/run-test
Normal file → Executable file
0
Tests/EndToEndTests/Speech/HTKDeserializers/TIMIT/WriteScaledLogLike/run-test
Normal file → Executable file
@@ -0,0 +1,115 @@
CPU info:
    CPU Model Name: Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz
    Hardware threads: 40
    Total Memory: 264118516 kB
-------------------------------------------------------------------
Running 8 test cases...
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 2 elapse = 3.25433ms average = 1.62716ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 2 elapse = 0.011775ms average = 0.0058875ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 2 elapse = 0.058559ms average = 0.0292795ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 2 elapse = 0.014456ms average = 0.007228ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 2 elapse = 0.005685ms average = 0.0028425ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 1 elapse = 3.5289ms average = 3.5289ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 1 elapse = 0.110966ms average = 0.110966ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a async server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 4 elapse = 3.26092ms average = 0.81523ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 5 elapse = 0.035872ms average = 0.0071744ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 4 elapse = 0.079631ms average = 0.0199077ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 5 elapse = 0.055307ms average = 0.0110614ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 5 elapse = 0.014141ms average = 0.0028282ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 3 elapse = 3.64047ms average = 1.21349ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 4 elapse = 0.35004ms average = 0.08751ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully
[INFO] [2016-09-30 12:19:35] multiverso MPI-Net is initialized under MPI_THREAD_SERIALIZED mode.
[INFO] [2016-09-30 12:19:35] All nodes registered. System contains 1 nodes. num_worker = 1, num_server = 1
[INFO] [2016-09-30 12:19:35] Create a sync server
[INFO] [2016-09-30 12:19:35] Rank 0: Multiverso start sucessfully
[INFO] [2016-09-30 12:19:35] --------------Show dashboard monitor information--------------
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_ADD] count = 6 elapse = 3.35131ms average = 0.558552ms
[INFO] [2016-09-30 12:19:35] [SERVER_PROCESS_GET] count = 7 elapse = 0.049346ms average = 0.00704943ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_ADD] count = 6 elapse = 0.110051ms average = 0.0183418ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_GET] count = 7 elapse = 0.068958ms average = 0.00985114ms
[INFO] [2016-09-30 12:19:35] [WORKER_PROCESS_REPLY_GET] count = 7 elapse = 0.018843ms average = 0.00269186ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_ADD] count = 4 elapse = 3.83984ms average = 0.959961ms
[INFO] [2016-09-30 12:19:35] [WORKER_TABLE_SYNC_GET] count = 5 elapse = 0.420295ms average = 0.084059ms
[INFO] [2016-09-30 12:19:35] --------------------------------------------------------------
[INFO] [2016-09-30 12:19:35] Multiverso Shutdown successfully

Test module "multiverso" has passed with:
  8 test cases out of 8 passed
  83 assertions out of 83 passed

  Test suite "array_test" has passed with:
    2 test cases out of 2 passed
    34 assertions out of 34 passed

    Test case "array_test/array_access" has passed with:
      20 assertions out of 20 passed

    Test case "array_test/array_partition" has passed with:
      14 assertions out of 14 passed

  Test suite "blob" has passed with:
    2 test cases out of 2 passed
    7 assertions out of 7 passed

    Test case "blob/blob_constructor_test" has passed with:
      3 assertions out of 3 passed

    Test case "blob/blob_access_test" has passed with:
      4 assertions out of 4 passed

  Test suite "test_kv" has passed with:
    1 test case out of 1 passed
    3 assertions out of 3 passed

    Test case "test_kv/access" has passed with:
      3 assertions out of 3 passed

  Test suite "message" has passed with:
    1 test case out of 1 passed
    11 assertions out of 11 passed

    Test case "message/message_access" has passed with:
      11 assertions out of 11 passed

  Test suite "node" has passed with:
    1 test case out of 1 passed
    8 assertions out of 8 passed

    Test case "node/node_role" has passed with:
      8 assertions out of 8 passed

  Test suite "test_sync" has passed with:
    1 test case out of 1 passed
    20 assertions out of 20 passed

    Test case "test_sync/sync" has passed with:
      20 assertions out of 20 passed
@@ -0,0 +1,6 @@
#!/bin/bash

. $TEST_ROOT_DIR/run-test-common
. $TEST_DIR/../run-boost-test-common

boosttestrun multiversotests
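boosttestrun comes from the run-boost-test-common helper sourced above; it drives a Boost.Test binary through the standard Boost.Test command line. A hedged sketch of an equivalent direct invocation (binary name and flag choice assumed for illustration; the helper's exact arguments may differ):

    ./MultiversoTests --run_test=array_test --log_level=test_suite
    ./MultiversoTests --report_level=detailed    # yields the "has passed with:" summaries seen in the baseline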
@@ -0,0 +1,19 @@
dataDir: .

tags:
  # CPU only, at this stage.
  # TODO move from l to separate leg, requires infra changes
  - bvt-l (build_sku == 'cpu') or (build_sku == '1bitsgd')
  - nightly-l (build_sku == 'cpu') or (build_sku == '1bitsgd')

testCases:
  Test cases pass:
    patterns:
      - "Test case"
      - "passed with"

  Test suites pass:
    patterns:
      - "Test suite"
      - "passed with"
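The patterns above drive the test driver's comparison of the run-test output against the baseline. As a rough per-line approximation (an assumption for illustration, not the actual TestDriver.py logic), an output line counts as matched when all patterns of a test case occur on it:

    # count output lines containing both patterns
    grep 'Test case' run-test.output | grep -c 'passed with'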
@@ -43,7 +43,7 @@ checkEmptyStdout \

 checkEmptyStdout \
-    "git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/1BitSGD" \
+    "git ls-tree --full-tree -r HEAD --name-only | git check-attr text --cached --stdin | grep -v 'text: set' | cut -d: -f1 | git check-attr binary --cached --stdin | grep -v 'binary: set' | cut -d: -f1 | grep -v Source/Multiverso | grep -v Source/1BitSGD" \
     "files that are neither marked as binary nor text; should extend .gitattributes"

 # TODO line ending checks
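The added grep -v Source/Multiverso simply exempts the new submodule from the text/binary attribute audit, mirroring the existing 1BitSGD exemption. The alternative named in the error message is to extend .gitattributes; hypothetical entries (not part of this commit) would look like:

    run-test text eol=lf
    *.model binary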
@@ -55,9 +55,10 @@ makebuildinfo()
     local CUDA_PATH=$6
     local CUB_PATH=$7
     local WITH_1BITSGD=$8
-    local BUILDER=$9
-    local BUILDMACHINE=${10}
-    local BUILDPATH=${11}
+    local WITH_ASGD=$9
+    local BUILDER=${10}
+    local BUILDMACHINE=${11}
+    local BUILDPATH=${12}

     (
     printf "#ifndef _BUILDINFO_H\n"
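The braces around ${10} and up matter: bash only recognizes single-digit positional parameters without braces, so a bare $10 expands as ${1} followed by a literal 0. A quick demonstration:

    set -- one two three four five six seven eight nine ten eleven twelve
    echo "$10" "${10}" "${12}"    # prints: one0 ten twelve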
@@ -84,6 +85,11 @@ makebuildinfo()
     else
         printf "#define _WITH_1BITSGD_ \"no\"\n"
     fi
+    if [ ! -z "$WITH_ASGD" ]; then
+        printf "#define _WITH_ASGD_ \"yes\"\n"
+    else
+        printf "#define _WITH_ASGD_ \"no\"\n"
+    fi
     printf "#define _BUILDER_ \"%s\"\n" "$BUILDER"
     printf "#define _BUILDMACHINE_ \"%s\"\n" "$BUILDMACHINE"
     printf "#define _BUILDPATH_ \"%s\"\n" "$BUILDPATH"
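Because [ ! -z "$WITH_ASGD" ] only tests for a non-empty argument, any non-empty value (even "no") would report "yes"; in this build that is safe, since configure either writes CNTK_ENABLE_ASGD=true or leaves it unset (see the configure hunk further down). The generated buildinfo header accordingly carries one of:

    #define _WITH_ASGD_ "yes"
    #define _WITH_ASGD_ "no"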
@@ -152,6 +158,7 @@ makebuildinfo \
     "$CUDAPATH" \
     "$CUBPATH" \
     "$CNTK_ENABLE_1BitSGD" \
+    "$CNTK_ENABLE_ASGD" \
     "$BUILDER" \
     "$BUILDMACHINE" \
     "$BUILDPATH"
@@ -90,6 +90,9 @@ enable_1bitsgd=$default_use_1bitsgd
 default_use_code_coverage=no
 enable_code_coverage=$default_use_code_coverage

+default_use_asgd=yes
+enable_asgd=$default_use_asgd
+
 # List from best to worst choice
 default_path_list="/usr /usr/local /opt /opt/local"
@@ -322,6 +325,7 @@ function show_help ()
     echo "  --with-build-top=directory build directory $(show_default $build_top)"
     echo "  --add directory add directory to library search path"
     echo "  --1bitsgd[=(yes|no)] use 1Bit SGD $(show_default ${default_use_1bitsgd})"
+    echo "  --asgd[=(yes|no)] use ASGD powered by Multiverso $(show_default ${default_use_asgd})"
     echo "  --cuda[=(yes|no)] use cuda GPU $(show_default $(default_use_cuda))"
     echo "  --python[=(yes|no)] with Python bindings $(show_default $(default_use_python))"
     echo "  --with-cuda[=directory] $(show_default $(find_cuda))"
@@ -402,6 +406,17 @@ do
         fi
         ;;

+        --asgd*)
+            if test x$optarg = xyes || test x$optarg = xno
+            then
+                enable_asgd=$optarg
+            else
+                echo "Invalid value for --asgd $optarg"
+                show_help
+                exit
+            fi
+            ;;
+
         --cuda*)
             if test x$optarg = xyes || test x$optarg = xno
             then
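For context, optarg holds the text after '=' in the current option. A minimal, self-contained sketch of this parsing pattern (the shape of the loop is assumed for illustration; it is not the exact configure code):

    enable_asgd=yes
    for arg in "$@"; do
        case $arg in
            --*=*) optarg=${arg#*=} ;;    # --asgd=no  ->  optarg=no
            *)     optarg= ;;
        esac
        case $arg in
            --asgd*)
                if test x$optarg = xyes || test x$optarg = xno
                then enable_asgd=$optarg
                else echo "Invalid value for --asgd $optarg"; exit 1
                fi
                ;;
        esac
    done

With this in place, ./configure --asgd=no turns the Multiverso-backed ASGD build off.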
@@ -1040,6 +1055,10 @@ if test x$protobuf_path != x; then
     echo PROTOBUF_PATH=$protobuf_path >> $config
 fi

+if test $enable_asgd = yes ; then
+    echo CNTK_ENABLE_ASGD=true >> $config
+fi
+
 # If we are not in the configure directory, generate a trampoline Makefile
 makefile=$build_top/Makefile
 if test $(is_hardlinked "$configure" "$build_top/configure") = no
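Since the flag is written only when ASGD is enabled, downstream consumers can key off its mere presence. A hedged spot check of the generated config (the file name is assumed):

    grep CNTK_ENABLE_ASGD Config.make    # expect CNTK_ENABLE_ASGD=true, or no output after --asgd=no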