From 026426ad0847afd12325b7f535d47e71f2061b14 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 1 Jun 2015 23:29:25 -0700 Subject: [PATCH 01/21] In sequencereader move labelOutput to DeviceId and remove useless code in NCE-LSTM, which will move data from GPU to CPU --- DataReader/LMSequenceReader/SequenceReader.cpp | 4 ++++ MachineLearning/CNTK/SimpleNetworkBuilder.cpp | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/DataReader/LMSequenceReader/SequenceReader.cpp b/DataReader/LMSequenceReader/SequenceReader.cpp index 9c61b52a1..9b39b7ab0 100644 --- a/DataReader/LMSequenceReader/SequenceReader.cpp +++ b/DataReader/LMSequenceReader/SequenceReader.cpp @@ -2051,6 +2051,10 @@ void BatchSequenceReader::GetLabelOutput(std::mapTransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false); + } } template class BatchSequenceReader; diff --git a/MachineLearning/CNTK/SimpleNetworkBuilder.cpp b/MachineLearning/CNTK/SimpleNetworkBuilder.cpp index 2c800cf84..461a8b622 100644 --- a/MachineLearning/CNTK/SimpleNetworkBuilder.cpp +++ b/MachineLearning/CNTK/SimpleNetworkBuilder.cpp @@ -1273,8 +1273,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]); m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale); - double val = w->FunctionValues()(0, 0); - /// the label is a dense matrix. each element is the word index label = m_net->CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize); From b488e18a933bde53da3cf877e046c0f267eddb18 Mon Sep 17 00:00:00 2001 From: Yinggong ZHAO Date: Mon, 1 Jun 2015 23:36:56 -0700 Subject: [PATCH 02/21] remove sampleCount in CPUMatrix::AssignNoiseContrastiveEstimation --- Math/Math/CPUMatrix.cpp | 3 +-- Math/Math/CPUMatrix.h | 2 +- Math/Math/Matrix.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 9d5c505f1..4770a11a3 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -3837,7 +3837,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void CPUMatrix::AssignNoiseContrastiveEstimation(const CPUMatrix& a, - const CPUMatrix& b, const CPUMatrix& bias, size_t sampleCount, CPUMatrix& tmp, CPUMatrix& c) + const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c) //this: samples+probs // a: hidden // b: embedding @@ -3852,7 +3852,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::cerr << endl; } */ - sampleCount *= 1; double log_likelihood = 0.0; size_t sample_size = this->GetNumRows() / 2; size_t batch_size = this->GetNumCols(); diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 8a39bda0b..59cce206f 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -216,7 +216,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUMatrix& AssignVectorNorm2Of(CPUMatrix& a, const bool isColWise); void AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, - size_t sampleCount, CPUMatrix& tmp, CPUMatrix& c); + CPUMatrix& tmp, CPUMatrix& c); void AssignNCEUnnormalizedEval(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c); diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index 23e914be6..ec53f4b07 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -3556,7 +3556,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { 
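                // note: sampleCount here is just the number of columns of 'a'
                // (elements / rows); it is still needed to size 'tmp' below, but is
                // no longer passed to the CPUMatrix implementation, which works from
                // the matrix dimensions directly.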
size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows(); tmp.Resize(a.GetNumRows() / 2, sampleCount); - a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, sampleCount, *tmp.m_CPUMatrix, *this->m_CPUMatrix); + a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *this->m_CPUMatrix); } else { From a9c669cc2acfd2fb419e9a44c3636b0e1bbf0554 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Tue, 2 Jun 2015 01:12:33 -0700 Subject: [PATCH 03/21] Adding script for build and test --- Makefile.gpu | 12 ++-- Scripts/build-and-test | 157 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 6 deletions(-) create mode 100755 Scripts/build-and-test diff --git a/Makefile.gpu b/Makefile.gpu index d42fbc8c2..eefb13025 100644 --- a/Makefile.gpu +++ b/Makefile.gpu @@ -32,11 +32,11 @@ DEVICE = gpu BUILDTYPE = debug #BUILDTYPE = release # comment following and uncomment the next one to enable MKL library -#MATHLIB = acml -MATHLIB = mkl +MATHLIB = acml +#MATHLIB = mkl # modify relevant path below for your system MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146 -ACML_PATH = /usr/local/acml5.3.0/gfortran64 +ACML_PATH = /usr/local/acml5.3.1/ifort64 ####### BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB) @@ -48,8 +48,8 @@ ifeq ($(BUILDTYPE),debug) BUILDTYPE_OPT = -g GPU_BUILDTYPE_OPT = -G else - BUILDTYPE_OPT = -O4 - GPU_BUILDTYPE_OPT = + BUILDTYPE_OPT = -O3 -flto + GPU_BUILDTYPE_OPT = -O3 endif ifeq ($(MATHLIB),mkl) @@ -142,7 +142,7 @@ $(OBJDIR)/%.o : %.cu Makefile @echo $(SEPARATOR) @echo creating $@ for $(ARCH) with build type $(BUILDTYPE) @mkdir -p $(dir $@) - $(NVCC) -c $< -o $@ $(BUILDTYPE_OPT) $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC + $(NVCC) -c $< -o $@ $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC $(OBJDIR)/%.o : %.cpp Makefile @echo $(SEPARATOR) diff --git a/Scripts/build-and-test b/Scripts/build-and-test new file mode 100755 index 000000000..41d11d4fc --- /dev/null +++ b/Scripts/build-and-test @@ -0,0 +1,157 @@ +#!/bin/bash + +# Setting some default values +CNTK_CLEANUP=1 +QUIET_MAKE= + +# parsing command line arguments: +while [[ $# > 0 ]] +do +key="$1" + +case $key in + -h|--help) + echo "Usage: build-and-test [options]" + echo "Options:" + echo " -q|--quiet-make - redirect build output to files" + echo " -n|--no-cleanup - leave build binaries intact" + echo "If CNTK root is empty and branch is not specified then master CNTK branch is built" + exit 1 + ;; + -n|--no-cleanup) + CNTK_CLEANUP=0 + ;; + -q|--quiet-make) + QUIET_MAKE=1 + ;; + -*) + echo Unkown option $key + exit 1 + ;; + *) + echo Unkown option $key + exit 1 + ;; +esac +shift # past argument or value +done + +# Step 0 -- Validate all necessary prerequisites +# It is possible to use this script on Windows to build CNTK +# from Cygwin window with Visual C++ environment loaded. +# In that case OS environment variable will be set and we +# can use it to differentiate from Linux. 
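# (Illustrative check of what this detection sees on a given machine: running
#  `echo "OS=$OS OSTYPE=$OSTYPE"` prints OS=Windows_NT OSTYPE=cygwin under
#  Cygwin, and an empty OS with OSTYPE=linux-gnu in a typical Linux shell.)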
+if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then + DEBUG_DIR=Debug + RELEASE_DIR=Release + PREFIX_DIR=x64 + BIN_NAME=CNTK.exe + + if [[ $VCINSTALLDIR == "" ]]; then + echo "============ Visual Studio environment not properly setup ============" + echo "============ Please find and run the appropriate vcvarsall.bat script ============" + exit 1 + fi +elif [[ $OSTYPE == "linux-gnu" ]]; then + DEBUG_DIR=x86_64.gpu.debug.acml + RELEASE_DIR=x86_64.gpu.release.acml + PREFIX_DIR=bin + BIN_NAME=cntk +else + echo "============ ERROR: Unsupported OS ============" + echo "============ Scripts supports only building from Linux and Windows through Cygwin ============" + exit 1 +fi + +# Step 1 -- Prepare temporary folders and files, tweak settings if necessary +TMP_ROOT=`mktemp -d /tmp/cntk.XXXXX || exit $?` +echo "============ Creating CNTK temp directory in $TMP_ROOT ============" + +TMP_CONF_FILE=`mktemp $TMP_ROOT/Simple.conf.XXXXX || exit $?` +TMP_RESULT_FILE=`mktemp $TMP_ROOT/Result.XXXXX || exit $?` + +SCRIPT=`readlink -f $0` +SCRIPT_DIR=`dirname $SCRIPT` +CNTK_ROOT=`dirname $SCRIPT_DIR` + +if ! [[ -d "$CNTK_ROOT/.git" ]]; then + echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============" + error 1 +fi + +cd $CNTK_ROOT +cp Demos/Simple/Simple.config $TMP_CONF_FILE || exit $? +MAKEFILE=Makefile.gpu + +# Our make is too noisy right now and it is difficult to spot +# issues from stdout and stderr. In the quiet mode these are +# redirected to a file where they could be examined after the fact +if [[ $QUIET_MAKE == 1 ]]; then + exec 6>>$TMP_ROOT/stdout || exit $? + exec 7>>$TMP_ROOT/stderr || exit $? +else + exec 6>&1 || exit $? + exec 7>&2 || exit $? +fi + +# Step 2 -- Perform necessary builds +for FLAVOR in debug release +do + echo "============ Building CNTK $FLAVOR ============" + if [[ $OS == "Windows_NT" ]]; then + msbuild.exe /property:Configuration=$FLAVOR /t:Clean || exit $? + msbuild.exe /property:Configuration=$FLAVOR || exit $? + else + make BUILDTYPE=$FLAVOR -f $MAKEFILE clean || exit $? + make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $? + fi +done + +if ! [[ -f "$CNTK_ROOT/$PREFIX_DIR/$DEBUG_DIR/$BIN_NAME" && -f "$CNTK_ROOT/$PREFIX_DIR/$RELEASE_DIR/$BIN_NAME" ]]; then + echo "============ ERROR: CNTK did not build properly ============" + exit 1 +fi + +# Step 3 -- Run the tests to verify that everything works properly +cd $PREFIX_DIR + +for TARGET in CPU GPU +do + # These sed scripts are simply toggling DeviceNumber argument in the config file + # If it is set to Auto, it will pick GPU over CPU. At -1 CPU is selected. + if [[ $TARGET == CPU ]]; then + sed -i -e 's/^DeviceNumber.*/DeviceNumber=-1/g' $TMP_CONF_FILE || exit $? + else + sed -i -e 's/^DeviceNumber.*/DeviceNumber=Auto/g' $TMP_CONF_FILE || exit $? + fi + + for FLAVOR_DIR in $DEBUG_DIR $RELEASE_DIR + do + echo "============ Running CNTK ($FLAVOR_DIR) ($TARGET) ============" + rm -rf models + if [[ $OS == "Windows_NT" ]]; then + # We have to use cygpath on Windows to modify the file paths into the format readable by cntk. + time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $TMP_CONF_FILE`" 2>$TMP_RESULT_FILE || exit $? + else + time ./$FLAVOR_DIR/$BIN_NAME configFile=$TMP_CONF_FILE 2>$TMP_RESULT_FILE || exit $? + fi + grep -q "Using $TARGET" $TMP_RESULT_FILE || exit $? + grep -q "EXCEPTION" $TMP_RESULT_FILE && exit $? 
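        # NOTE: in the EXCEPTION grep above, $? is grep's own exit status, which
        # is 0 (success) when a match is found, so a run whose log contains
        # EXCEPTION still ends this script with status 0; patch 11 below replaces
        # this with an explicit error message and `exit 1`.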
+ done +done + +# Step 4 -- Optionally cleanup after builds and tests +if [[ $CNTK_CLEANUP == 1 ]]; then + rm -rf models + cd $CNTK_ROOT + for FLAVOR in debug release + do + echo "============ Cleaning up CNTK $FLAVOR ============" + if [[ $OS == "Windows_NT" ]]; then + msbuild.exe /property:Configuration=$FLAVOR /t:Clean || exit $? + else + make BUILDTYPE=$FLAVOR -f $MAKEFILE clean || exit $? + fi + done + rm -rf $TMP_ROOT +fi From ac723ceae784a2b7523f78b07bffceff4c40ad22 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Tue, 2 Jun 2015 12:50:32 -0700 Subject: [PATCH 04/21] Add completion message --- Scripts/build-and-test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Scripts/build-and-test b/Scripts/build-and-test index 41d11d4fc..93e68a330 100755 --- a/Scripts/build-and-test +++ b/Scripts/build-and-test @@ -155,3 +155,5 @@ if [[ $CNTK_CLEANUP == 1 ]]; then done rm -rf $TMP_ROOT fi + +echo "============ Build and test of CNTK was successful! ============" From 12b1ab8ca68ecdce94055e5f8b769a82dc2cc8e0 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Wed, 3 Jun 2015 11:36:35 -0700 Subject: [PATCH 05/21] Some minor changes to the Simple Demo config file to allow specifying the path to input files using the RootDir variable --- Demos/Simple/Simple.config | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Demos/Simple/Simple.config b/Demos/Simple/Simple.config index 2176f8414..dd4078505 100644 --- a/Demos/Simple/Simple.config +++ b/Demos/Simple/Simple.config @@ -1,8 +1,9 @@ -# command=Simple_Demo_Output +RootDir=.. command=Simple_Demo:Simple_Demo_Output # deviceId=-1 for CPU, >=0 for GPU devices DeviceNumber=-1 + #stderr=Demo precision=float @@ -13,7 +14,6 @@ deviceId=$DeviceNumber$ outputNodeNames=ScaledLogLikelihood traceLevel=1 - ####################################### # TRAINING CONFIG (Simple, Fixed LR) # ####################################### @@ -52,22 +52,22 @@ Simple_Demo=[ reader=[ # reader to use readerType=UCIFastReader - file=../Demos/Simple/SimpleDataTrain.txt + file=$RootDir$/Demos/Simple/SimpleDataTrain.txt miniBatchMode=Partial randomize=Auto verbosity=1 features=[ - dim=2 # two-dimensional input data + dim=2 # two-dimensional input data start=0 # Start with first element on line ] labels=[ - start=2 # Skip two elements + start=2 # Skip two elements dim=1 # One label dimension labelDim=2 # Two labels possible - labelMappingFile=../Demos/Simple/SimpleMapping.txt + labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt ] ] ] @@ -84,16 +84,16 @@ Simple_Demo_Output=[ reader=[ # reader to use readerType=UCIFastReader - file=../Demos/Simple/SimpleDataTest.txt + file=$RootDir$/Demos/Simple/SimpleDataTest.txt features=[ dim=2 - start=0 + start=0 ] labels=[ - start=2 + start=2 dim=1 labelDim=2 - labelMappingFile=../Demos/Simple/SimpleMapping.txt + labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt ] ] outputPath=SimpleOutput # Dump output as text From c35d51dfe5ba97264aaacae3391df4e2453cf0a3 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Wed, 3 Jun 2015 14:16:28 -0700 Subject: [PATCH 06/21] Fixed a bug that was incorrectly deleting the CPUMatrix external buffer --- Math/Math/CPUMatrix.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 4770a11a3..2c34b52ac 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -632,16 +632,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { // if it's externally managed, then populate the 
structure if (matrixFlags&matrixFlagDontOwnBuffer) { + // free previous array allocation if any before overwriting if (m_pArray != nullptr) delete [] m_pArray; m_pArray = pArray; m_numRows = numRows; m_numCols = numCols; - // free previous array allocation if any before overwriting - if (m_pArray != nullptr) - delete[] m_pArray; - m_pArray = pArray; m_elemSizeAllocated = GetNumElements(); m_externalBuffer = true; } From 4b29673fda9596b8a372a576d575e80dd7a6021e Mon Sep 17 00:00:00 2001 From: Dong Yu Date: Wed, 3 Jun 2015 16:57:02 -0700 Subject: [PATCH 07/21] Fix the error throw bugs in UCIParser. Now if a file cannot be opened the error will be thrown and caught and cntk will exit gracefully. --- DataReader/UCIFastReader/UCIParser.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/DataReader/UCIFastReader/UCIParser.cpp b/DataReader/UCIFastReader/UCIParser.cpp index da8475c43..83244581a 100644 --- a/DataReader/UCIFastReader/UCIParser.cpp +++ b/DataReader/UCIFastReader/UCIParser.cpp @@ -362,10 +362,10 @@ void UCIParser::ParseInit(LPCWSTR fileName, size_t startFeat errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); if (err) - std::runtime_error("UCIParser::ParseInit - error opening file"); + throw std::runtime_error("UCIParser::ParseInit - error opening file"); int rc = _fseeki64(m_pFile, 0, SEEK_END); if (rc) - std::runtime_error("UCIParser::ParseInit - error seeking in file"); + throw std::runtime_error("UCIParser::ParseInit - error seeking in file"); m_fileSize = GetFilePosition(); m_fileBuffer = new BYTE[m_bufferSize]; @@ -379,7 +379,7 @@ int64_t UCIParser::GetFilePosition() { int64_t position = _ftelli64(m_pFile); if (position == -1L) - std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file"); + throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file"); return position; } @@ -392,7 +392,7 @@ void UCIParser::SetFilePosition(int64_t position) { int rc = _fseeki64(m_pFile, position, SEEK_SET); if (rc) - std::runtime_error("UCIParser::SetFilePosition - error seeking in file"); + throw std::runtime_error("UCIParser::SetFilePosition - error seeking in file"); // setup state machine to start at this position PrepareStartPosition(position); @@ -445,7 +445,7 @@ size_t UCIParser::UpdateBuffer() size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes; size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile); if (bytesRead == 0 && ferror(m_pFile)) - std::runtime_error("UCIParser::UpdateBuffer - error reading file"); + throw std::runtime_error("UCIParser::UpdateBuffer - error reading file"); return bytesRead; } From 02080fc0f6d6d5840c2c586443b9eec02f455387 Mon Sep 17 00:00:00 2001 From: Mike Seltzer Date: Thu, 4 Jun 2015 13:12:10 -0700 Subject: [PATCH 08/21] fix bug in multi utterance reader when mlf and feature file are mismatched in duration --- .../HTKMLFReader/utterancesourcemulti.h | 67 +++++++++++-------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/DataReader/HTKMLFReader/utterancesourcemulti.h b/DataReader/HTKMLFReader/utterancesourcemulti.h index 4af5c9ec1..6b4ba1812 100644 --- a/DataReader/HTKMLFReader/utterancesourcemulti.h +++ b/DataReader/HTKMLFReader/utterancesourcemulti.h @@ -382,47 +382,58 @@ public: // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore. 
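                // The rewrite below first verifies that every MLF label stream matches
                // the feature file's duration, and only then commits the utterance
                // (utteranceset.push_back) and expands its class-id sequences; previously
                // the utterance was pushed before the check, so a mismatch detected in a
                // later label stream could leave a partially registered utterance behind.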
// OK, utterance has all we need --remember it - utteranceset.push_back (std::move (utterance)); if (m==0) { - _totalframes += uttframes; - framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference if (!labels.empty() && !lacksmlf) //if (!labels.empty() && labelsiter != labels[0].end()) { - foreach_index (j, labels) + // first verify that all the label files have the proper duration + bool durationmatch = true; + foreach_index(j, labels) { const auto & labseq = labels[j].find(key)->second; // check if durations match; skip if not - size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); + size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes); if (labframes != uttframes) { - fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str()); + fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str()); nomlf++; - continue; // skip this utterance at all + durationmatch = false; + break; // continue; // skip this utterance at all } - // expand classid sequence into flat array - foreach_index (i, labseq) + } + if (durationmatch){ + utteranceset.push_back(std::move(utterance)); + _totalframes += uttframes; + framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference + // then parse each mlf if the durations are consistent + foreach_index(j, labels) { - const auto & e = labseq[i]; - if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) - throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str())); - if (e.classid >= udim[j]) - throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str())); - if (e.classid != (CLASSIDTYPE) e.classid) - throw std::runtime_error ("CLASSIDTYPE has too few bits"); - for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) - classids[j]->push_back ((CLASSIDTYPE) e.classid); - numclasses[j] = max (numclasses[j], 1u + e.classid); - counts[j].resize (numclasses[j], 0); - counts[j][e.classid] += e.numframes; - } - classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker marker for checking + const auto & labseq = labels[j].find(key)->second; + // expand classid sequence into flat array + foreach_index(i, labseq) + { + const auto & e = labseq[i]; + if ((i > 0 && labseq[i - 1].firstframe + labseq[i - 1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) + throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str())); + if (e.classid >= udim[j]) + throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str())); + if (e.classid != (CLASSIDTYPE)e.classid) + throw std::runtime_error("CLASSIDTYPE has too few bits"); + for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) + classids[j]->push_back((CLASSIDTYPE)e.classid); + numclasses[j] = max(numclasses[j], 1u + e.classid); + counts[j].resize(numclasses[j], 0); + counts[j][e.classid] += e.numframes; + } - if 
(!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
-                        throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
-                    assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
+                        classids[j]->push_back((CLASSIDTYPE)-1); // append a boundary marker for checking
+
+                        if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
+                            throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
+                        assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
+                    }
                 }
             }
             else{
@@ -451,7 +462,7 @@ public:
             }
             if (nomlf + nolat > 0)
             {
-                fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
+                fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles[0].size(), nomlf, nolat);
                 if (nomlf + nolat > infiles[m].size() / 2)
                     throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
             }
@@ -1236,4 +1247,4 @@ public:
     };
 };};
- 
+

From e4424d56978a27033f1791e5402573a5149f2016 Mon Sep 17 00:00:00 2001
From: Yu
Date: Sun, 7 Jun 2015 18:19:26 -0400
Subject: [PATCH 09/21] Fix the compile on Linux for the Kaldi reader

---
 DataReader/Kaldi2Reader/HTKMLFReader.cpp |  1 +
 DataReader/Kaldi2Reader/HTKMLFWriter.cpp |  1 +
 Makefile_kaldi2.cpu                      | 16 ++++++++--------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.cpp b/DataReader/Kaldi2Reader/HTKMLFReader.cpp
index ee6bc91d1..7a1d78f78 100644
--- a/DataReader/Kaldi2Reader/HTKMLFReader.cpp
+++ b/DataReader/Kaldi2Reader/HTKMLFReader.cpp
@@ -24,6 +24,7 @@
 #define DATAREADER_EXPORTS // creating the exports here
 #include "DataReader.h"
 #include "HTKMLFReader.h"
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include  // for memory leak detection
 #endif
diff --git a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
index 8c0881c24..1d6f3f480 100644
--- a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
+++ b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
@@ -27,6 +27,7 @@
 #define DATAWRITER_EXPORTS // creating the exports here
 #include "DataWriter.h"
 #include "HTKMLFWriter.h"
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include  // for memory leak detection
 #endif
diff --git a/Makefile_kaldi2.cpu b/Makefile_kaldi2.cpu
index b793130dd..abd4e425d 100644
--- a/Makefile_kaldi2.cpu
+++ b/Makefile_kaldi2.cpu
@@ -31,8 +31,8 @@ DEVICE = cpu
 #BUILDTYPE = debug
 BUILDTYPE = release
 # comment following and uncomment the next one to enable MKL library
-#MATHLIB = acml
-MATHLIB = mkl
+MATHLIB = acml
+#MATHLIB = mkl
 # modify relevant path below for your system
 MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146
 ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64
@@ -61,7 +61,7 @@ endif
 # Add KALDI (you need to add your Kaldi path into this file)
 include kaldi_vars.mk
 
-INCFLAGS = -I Common/Include -I Math/Math -I MachineLearning/cn -I $(MATHLIB_INCLUDE) $(KALDI_INCLUDES)
+INCFLAGS = -I Common/Include -I Math/Math -I MachineLearning/CNTK -I $(MATHLIB_INCLUDE) $(KALDI_INCLUDES)
 
 CFLAGS = -msse3 -std=c++0x -std=c++11 -DCPUONLY -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K $(KALDI_DEFINES) $(MATHLIB_DEFINE) -fopenmp -fpermissive -fPIC
@@ -70,9 +70,9 @@ COMMON_SRC = Common/fileutil.cpp Common/DataWriter.cpp Common/ConfigFile.cpp Com
 Common/Eval.cpp Common/File.cpp Common/BestGpu.cpp Common/TimerUtility.cpp
 
 MATH_SRC = Math/Math/Matrix.cpp Math/Math/CPUMatrix.cpp Math/Math/CPUSparseMatrix.cpp Math/Math/NoGPU.cpp
-CN_SRC = MachineLearning/cn/NetworkDescriptionLanguage.cpp MachineLearning/cn/cn.cpp MachineLearning/cn/ComputationNode.cpp \
-	MachineLearning/cn/ModelEditLanguage.cpp MachineLearning/cn/PTaskGraphBuilder.cpp \
-	MachineLearning/cn/SimpleNetworkBuilder.cpp MachineLearning/cn/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
+CN_SRC = MachineLearning/CNTK/NetworkDescriptionLanguage.cpp MachineLearning/CNTK/CNTK.cpp MachineLearning/CNTK/ComputationNode.cpp \
+	MachineLearning/CNTK/ModelEditLanguage.cpp \
+	MachineLearning/CNTK/SimpleNetworkBuilder.cpp MachineLearning/CNTK/tests.cpp MachineLearning/CNTKEval/CNTKEval.cpp
 BINARYREADER_SRC = DataReader/BinaryReader/BinaryWriter.cpp DataReader/BinaryReader/BinaryReader.cpp DataReader/BinaryReader/BinaryFile.cpp
 HTKMLFREADER_SRC = DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp DataReader/HTKMLFReader_linux/DataWriter.cpp DataReader/HTKMLFReader_linux/DataReader.cpp DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
 KALDIREADER_SRC = DataReader/KaldiReader/HTKMLFWriter.cpp DataReader/KaldiReader/DataWriter.cpp DataReader/KaldiReader/DataReader.cpp DataReader/KaldiReader/HTKMLFReader.cpp
@@ -101,7 +101,7 @@ DEP := $(patsubst %.o, %.d, $(OBJ))
 SEPARATOR = "=-----------------------------------------------------------="
 
 #all: $(BINDIR)/cn.exe $(BINDIR)/UCIFastReader.so $(BINDIR)/SequenceReader.so $(BINDIR)/LUSequenceReader.so $(BINDIR)/HTKMLFReader.so $(BINDIR)/BinaryReader.so
-all: $(BINDIR)/cn.exe $(BINDIR)/UCIFastReader.so $(BINDIR)/LMSequenceReader.so $(BINDIR)/LUSequenceReader.so $(BINDIR)/HTKMLFReader.so $(BINDIR)/Kaldi2Reader.so
+all: $(BINDIR)/cntk $(BINDIR)/UCIFastReader.so $(BINDIR)/LMSequenceReader.so $(BINDIR)/LUSequenceReader.so $(BINDIR)/HTKMLFReader.so $(BINDIR)/Kaldi2Reader.so
 	ln -sf $(CURDIR)/$(BINDIR)/* bin
@@ -135,7 +135,7 @@ $(BINDIR)/Kaldi2Reader.so: $(KALDI2READER_OBJ) $(CORE_OBJ)
 	$(CC) $(BUILDTYPE_OPT) -fPIC -shared -o $@ $^ $(KALDI_LIBS)
 
-$(BINDIR)/cn.exe: $(CORE_OBJ)
+$(BINDIR)/cntk: $(CORE_OBJ)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building output for $(ARCH) with build type $(BUILDTYPE) ...

From 346dc11cde7a7dedbcf2c1700acb8d4ee0010182 Mon Sep 17 00:00:00 2001
From: Dong Yu
Date: Wed, 10 Jun 2015 13:44:54 -0700
Subject: [PATCH 10/21] fix bugs in the binaryReader and UCIFastReader

---
 DataReader/BinaryReader/BinaryWriter.cpp | 4 ++--
 DataReader/UCIFastReader/UCIParser.cpp   | 7 ++++++-
 DataReader/UCIFastReader/UCIParser.h     | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/DataReader/BinaryReader/BinaryWriter.cpp b/DataReader/BinaryReader/BinaryWriter.cpp
index c30a22b32..e77ec2437 100644
--- a/DataReader/BinaryReader/BinaryWriter.cpp
+++ b/DataReader/BinaryReader/BinaryWriter.cpp
@@ -47,8 +47,8 @@ BinaryWriter::~BinaryWriter()
 // miniBatchMode=Partial
 // randomize=None
 // wfile=c:\speech\mnist\mnist_test.bin
-// #wsize - initial size of the file in MB
-// # if calculated size would be bigger, that is used instead
+// #wsize - initial size of the file in MB, defaults to 256
+// # has to be large enough for your dataset; the file will shrink to the actual size when closed.
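// # (illustrative sizing: 60000 records of 784 float features take about
// #  60000 * 784 * 4 bytes = ~180MB, so the 256MB default covers an MNIST-sized set)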
// #wsize=256 // #wrecords - number of records we should allocate space for in the file // # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file diff --git a/DataReader/UCIFastReader/UCIParser.cpp b/DataReader/UCIFastReader/UCIParser.cpp index 83244581a..0a9d30317 100644 --- a/DataReader/UCIFastReader/UCIParser.cpp +++ b/DataReader/UCIFastReader/UCIParser.cpp @@ -11,6 +11,11 @@ #include #include +#if WIN32 +#define ftell64 _ftelli64 +#else +#define ftell64 ftell +#endif // SetState for a particular value template @@ -377,7 +382,7 @@ void UCIParser::ParseInit(LPCWSTR fileName, size_t startFeat template int64_t UCIParser::GetFilePosition() { - int64_t position = _ftelli64(m_pFile); + int64_t position = ftell64(m_pFile); if (position == -1L) throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file"); return position; diff --git a/DataReader/UCIFastReader/UCIParser.h b/DataReader/UCIFastReader/UCIParser.h index 07ba939e8..98447d479 100644 --- a/DataReader/UCIFastReader/UCIParser.h +++ b/DataReader/UCIFastReader/UCIParser.h @@ -90,8 +90,8 @@ private: int m_elementsConvertedThisLine; // global stats - int m_totalNumbersConverted; - int m_totalLabelsConverted; + int64_t m_totalNumbersConverted; + int64_t m_totalLabelsConverted; // file positions/buffer FILE * m_pFile; From 66193e0716c4741697d1daaa9e1bfecdfa550e86 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Mon, 8 Jun 2015 14:19:54 -0700 Subject: [PATCH 11/21] Modify build script to support more options --- Scripts/build-and-test | 271 +++++++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 102 deletions(-) diff --git a/Scripts/build-and-test b/Scripts/build-and-test index 93e68a330..df89a424c 100755 --- a/Scripts/build-and-test +++ b/Scripts/build-and-test @@ -1,8 +1,10 @@ #!/bin/bash # Setting some default values -CNTK_CLEANUP=1 -QUIET_MAKE= +BUILD=1 +RUN=1 +CLEAN_AFTER=0 +CLEAN_BEFORE=0 # parsing command line arguments: while [[ $# > 0 ]] @@ -13,20 +15,32 @@ case $key in -h|--help) echo "Usage: build-and-test [options]" echo "Options:" - echo " -q|--quiet-make - redirect build output to files" - echo " -n|--no-cleanup - leave build binaries intact" - echo "If CNTK root is empty and branch is not specified then master CNTK branch is built" + echo " -q|--quiet-build - redirect build output to files" + echo " -r|--run-only - assume that binaries are already built" + echo " -b|--build-only - just build, do not run" + echo " -cb|--clean-build - clean up the enlistment binaries before build" + echo " -o|--output-directory - specify output directory to use" + echo "Script location in the enlistment is used for finding root directory to build and run" exit 1 ;; - -n|--no-cleanup) - CNTK_CLEANUP=0 + -q|--quiet) + QUIET_BUILD=1 ;; - -q|--quiet-make) - QUIET_MAKE=1 + -r|--run-only) + BUILD=0 + RUN=1 ;; - -*) - echo Unkown option $key - exit 1 + -b|--build-only) + BUILD=1 + RUN=0 + ;; + -cb|--clean-build) + CLEAN_BEFORE=1 + BUILD=1 + ;; + -o|--output-directory) + OUTPUT_DIR="$2" + shift # past argument ;; *) echo Unkown option $key @@ -36,124 +50,177 @@ esac shift # past argument or value done -# Step 0 -- Validate all necessary prerequisites +# Step 0 -- Validate all necessary prerequisites and check for incompatible options # It is possible to use this script on Windows to build CNTK # from Cygwin window with Visual C++ environment loaded. 
# In that case OS environment variable will be set and we # can use it to differentiate from Linux. -if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then - DEBUG_DIR=Debug - RELEASE_DIR=Release - PREFIX_DIR=x64 - BIN_NAME=CNTK.exe - - if [[ $VCINSTALLDIR == "" ]]; then - echo "============ Visual Studio environment not properly setup ============" - echo "============ Please find and run the appropriate vcvarsall.bat script ============" +if [[ $CLEAN_BEFORE == 1 && $RUN == 1 && $BUILD == 0 ]]; then + echo "============ ERROR: Incompatible options RUN and CLEAN_BEFORE set without BUILD ============" exit 1 - fi +fi + +if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then + DEBUG_DIR=Debug + RELEASE_DIR=Release + PREFIX_DIR=x64 + BIN_NAME=CNTK.exe + + if [[ $VS120COMNTOOLS == "" ]]; then + echo "============ Visual Studio 12.0 environment not properly setup or VS not installed ============" + echo "============ Please find and run the appropriate vcvarsall.bat script ============" + exit 1 + fi + + if [[ $ACML_PATH == "" ]]; then + echo "============ ACML path not set ============" + echo "============ ACML libraries are needed to successfully build CNTK ============" + exit 1 + fi elif [[ $OSTYPE == "linux-gnu" ]]; then - DEBUG_DIR=x86_64.gpu.debug.acml - RELEASE_DIR=x86_64.gpu.release.acml - PREFIX_DIR=bin - BIN_NAME=cntk + DEBUG_DIR=x86_64.gpu.debug.acml + RELEASE_DIR=x86_64.gpu.release.acml + PREFIX_DIR=bin + BIN_NAME=cntk + MAKEFILE=Makefile.gpu else - echo "============ ERROR: Unsupported OS ============" - echo "============ Scripts supports only building from Linux and Windows through Cygwin ============" - exit 1 + echo "============ ERROR: Unsupported OS ============" + echo "============ Scripts supports only building from Linux and Windows through Cygwin ============" + exit 1 fi # Step 1 -- Prepare temporary folders and files, tweak settings if necessary -TMP_ROOT=`mktemp -d /tmp/cntk.XXXXX || exit $?` -echo "============ Creating CNTK temp directory in $TMP_ROOT ============" +if [[ $OUTPUT_DIR == "" ]]; then + TMP_ROOT=`mktemp -d /tmp/cntk.XXXXX || exit $?` + echo "============ Creating CNTK temp directory in $TMP_ROOT ============" + OUTPUT_DIR=$TMP_ROOT +fi -TMP_CONF_FILE=`mktemp $TMP_ROOT/Simple.conf.XXXXX || exit $?` -TMP_RESULT_FILE=`mktemp $TMP_ROOT/Result.XXXXX || exit $?` +CONF_FILE="$OUTPUT_DIR/Simple.conf" +BUILD_FILE="$OUTPUT_DIR/Build" +RUN_FILE="$OUTPUT_DIR/Result" +# Get to the root path from which we know how to build and run SCRIPT=`readlink -f $0` SCRIPT_DIR=`dirname $SCRIPT` CNTK_ROOT=`dirname $SCRIPT_DIR` if ! [[ -d "$CNTK_ROOT/.git" ]]; then - echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============" - error 1 + echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============" + error 1 fi cd $CNTK_ROOT -cp Demos/Simple/Simple.config $TMP_CONF_FILE || exit $? -MAKEFILE=Makefile.gpu -# Our make is too noisy right now and it is difficult to spot -# issues from stdout and stderr. In the quiet mode these are -# redirected to a file where they could be examined after the fact -if [[ $QUIET_MAKE == 1 ]]; then - exec 6>>$TMP_ROOT/stdout || exit $? - exec 7>>$TMP_ROOT/stderr || exit $? -else - exec 6>&1 || exit $? - exec 7>&2 || exit $? +if ! [[ -f $CONF_FILE ]]; then + cp Demos/Simple/Simple.config $CONF_FILE || exit $? 
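    # the copy is made world-readable below so that later runs, possibly under
    # a different account, can reuse the same generated config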
+ chmod a+r $CONF_FILE fi -# Step 2 -- Perform necessary builds -for FLAVOR in debug release -do - echo "============ Building CNTK $FLAVOR ============" - if [[ $OS == "Windows_NT" ]]; then - msbuild.exe /property:Configuration=$FLAVOR /t:Clean || exit $? - msbuild.exe /property:Configuration=$FLAVOR || exit $? - else - make BUILDTYPE=$FLAVOR -f $MAKEFILE clean || exit $? - make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $? - fi -done - -if ! [[ -f "$CNTK_ROOT/$PREFIX_DIR/$DEBUG_DIR/$BIN_NAME" && -f "$CNTK_ROOT/$PREFIX_DIR/$RELEASE_DIR/$BIN_NAME" ]]; then - echo "============ ERROR: CNTK did not build properly ============" - exit 1 +if [[ $QUIET_BUILD == 1 ]]; then + echo "============ WARNING: You have selected quiet build. All build output will be placed in ($OUTPUT_DIR) ============" fi -# Step 3 -- Run the tests to verify that everything works properly -cd $PREFIX_DIR +# Step 2 -- Build the project debug and release, if requested +if [[ $BUILD == 1 ]]; then + # Step 2 -- Perform necessary builds + for FLAVOR in debug release + do + # Our make is too noisy right now and it is difficult to spot + # issues from stdout and stderr. In the quiet mode these are + # redirected to a file where they could be examined after the fact + if [[ $QUIET_BUILD == 1 ]]; then + exec 6>$BUILD_FILE.$FLAVOR.out || exit $? + exec 7>$BUILD_FILE.$FLAVOR.err || exit $? + else + exec 6>&1 || exit $? + exec 7>&2 || exit $? + fi -for TARGET in CPU GPU -do - # These sed scripts are simply toggling DeviceNumber argument in the config file - # If it is set to Auto, it will pick GPU over CPU. At -1 CPU is selected. - if [[ $TARGET == CPU ]]; then - sed -i -e 's/^DeviceNumber.*/DeviceNumber=-1/g' $TMP_CONF_FILE || exit $? - else - sed -i -e 's/^DeviceNumber.*/DeviceNumber=Auto/g' $TMP_CONF_FILE || exit $? - fi + echo "============ Building CNTK $FLAVOR (clean=$CLEAN_BEFORE) ============" - for FLAVOR_DIR in $DEBUG_DIR $RELEASE_DIR - do - echo "============ Running CNTK ($FLAVOR_DIR) ($TARGET) ============" + if [[ $OS == "Windows_NT" ]]; then + if [[ $CLEAN_BEFORE == 1 ]]; then + msbuild.exe /property:Configuration=$FLAVOR /t:Clean 1>&6 2>&7 || exit $? + fi + msbuild.exe /property:Configuration=$FLAVOR /m 1>&6 2>&7 || exit $? + else + if [[ $CLEAN_BEFORE == 1 ]]; then + make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $? + fi + make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $? + fi + chmod a+r $BUILD_FILE.* + done +fi + +# Step 3 -- Run the project tests, both debug and release, if requested +if [[ $RUN == 1 ]]; then + if ! [[ -f "$CNTK_ROOT/$PREFIX_DIR/$DEBUG_DIR/$BIN_NAME" && -f "$CNTK_ROOT/$PREFIX_DIR/$RELEASE_DIR/$BIN_NAME" ]]; then + echo "============ ERROR: CNTK did not build properly ============" + exit 1 + fi + + cd $PREFIX_DIR + + for TARGET in CPU GPU + do + # These sed scripts are simply toggling DeviceNumber argument in the config file + # If it is set to Auto, it will pick GPU over CPU. At -1 CPU is selected. + if [[ $TARGET == CPU ]]; then + sed -i -e 's/^DeviceNumber.*/DeviceNumber=-1/g' $CONF_FILE || exit $? + else + sed -i -e 's/^DeviceNumber.*/DeviceNumber=Auto/g' $CONF_FILE || exit $? 
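        # (the Simple demo config consumes this value through its deviceId=$DeviceNumber$ setting)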
+ fi + + for FLAVOR in debug release + do + if [[ FLAVOR == "debug" ]]; then + FLAVOR_DIR="$DEBUG_DIR" + else + FLAVOR_DIR="$RELEASE_DIR" + fi + OUT_FILE="$RUN_FILE.$FLAVOR.out" + + echo "============ Running CNTK for ($FLAVOR) ($TARGET), output in ($RUN_FILE.*) ============" + rm -rf models + if [[ $OS == "Windows_NT" ]]; then + # We have to use cygpath on Windows to modify the file paths into the format readable by cntk. + time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $CONF_FILE`" &>$OUT_FILE || exit $? + else + time ./$FLAVOR_DIR/$BIN_NAME configFile=$CONF_FILE &>$OUT_FILE || exit $? + fi + chmod a+r $RUN_FILE.* + + # Check if execution was successful + grep -q "Using $TARGET" "$OUT_FILE" || { + echo "============ ERROR: Run output (in $OUT_FILE) did not contain information about target device ($TARGET) ============" + exit 1 + } + + grep -q "EXCEPTION" "$OUT_FILE" && { + echo "============ ERROR: Run output in ($OUT_FILE) contains exceptions ============" + grep "EXCEPTION" "$OUT_FILE" + exit 1 + } + done + done +fi + +# Step 5 -- Optionally clean after builds and tests +if [[ $CLEAN_AFTER == 1 ]]; then rm -rf models - if [[ $OS == "Windows_NT" ]]; then - # We have to use cygpath on Windows to modify the file paths into the format readable by cntk. - time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $TMP_CONF_FILE`" 2>$TMP_RESULT_FILE || exit $? - else - time ./$FLAVOR_DIR/$BIN_NAME configFile=$TMP_CONF_FILE 2>$TMP_RESULT_FILE || exit $? - fi - grep -q "Using $TARGET" $TMP_RESULT_FILE || exit $? - grep -q "EXCEPTION" $TMP_RESULT_FILE && exit $? - done -done - -# Step 4 -- Optionally cleanup after builds and tests -if [[ $CNTK_CLEANUP == 1 ]]; then - rm -rf models - cd $CNTK_ROOT - for FLAVOR in debug release - do - echo "============ Cleaning up CNTK $FLAVOR ============" - if [[ $OS == "Windows_NT" ]]; then - msbuild.exe /property:Configuration=$FLAVOR /t:Clean || exit $? - else - make BUILDTYPE=$FLAVOR -f $MAKEFILE clean || exit $? - fi - done - rm -rf $TMP_ROOT + cd $CNTK_ROOT + for FLAVOR in debug release + do + echo "============ Cleaning up CNTK $FLAVOR ============" + if [[ $OS == "Windows_NT" ]]; then + msbuild.exe /property:Configuration=$FLAVOR /t:clean 1>&6 2>&7 || exit $? + else + make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $? + fi + done + rm -rf $OUTPUT_DIR fi echo "============ Build and test of CNTK was successful! 
============"

From c518eb2203b00dc7d3fdf8021413c685db753b95 Mon Sep 17 00:00:00 2001
From: Marko Radmilac
Date: Tue, 9 Jun 2015 12:26:36 -0700
Subject: [PATCH 12/21] Make default output directory under cntk for
 simplicity, and address code review feedback

---
 .gitignore             |  1 +
 Scripts/build-and-test | 34 +++++++++++++++++++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/.gitignore b/.gitignore
index 664cc1b07..0ef78ce74 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ x64/
 build/
 [Bb]in/
 [Oo]bj/
+.run-*
 
 # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
 !packages/*/build/
diff --git a/Scripts/build-and-test b/Scripts/build-and-test
index df89a424c..4b452dd9d 100755
--- a/Scripts/build-and-test
+++ b/Scripts/build-and-test
@@ -15,12 +15,12 @@ case $key in
   -h|--help)
     echo "Usage: build-and-test [options]"
     echo "Options:"
-    echo " -q|--quiet-build - redirect build output to files"
-    echo " -r|--run-only - assume that binaries are already built"
+    echo " -q|--quiet-build - redirect build output to file (by default those will be in .run--*)"
+    echo " -r|--run-only - elides the build step, runs the binaries that have already been built"
     echo " -b|--build-only - just build, do not run"
     echo " -cb|--clean-build - clean up the enlistment binaries before build"
     echo " -o|--output-directory - specify output directory to use"
-    echo "Script location in the enlistment is used for finding root directory to build and run"
+    echo "The root directory used to build and run CNTK is the one that hosts the Scripts directory containing this script"
     exit 1
     ;;
@@ -65,6 +65,7 @@ if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
     RELEASE_DIR=Release
     PREFIX_DIR=x64
     BIN_NAME=CNTK.exe
+    BUILD_OS="windows"
 
     if [[ $VS120COMNTOOLS == "" ]]; then
         echo "============ Visual Studio 12.0 environment not properly setup or VS not installed ============"
@@ -83,6 +84,7 @@ elif [[ $OSTYPE == "linux-gnu" ]]; then
     PREFIX_DIR=bin
     BIN_NAME=cntk
     MAKEFILE=Makefile.gpu
+    BUILD_OS="linux"
 else
     echo "============ ERROR: Unsupported OS ============"
     echo "============ Scripts supports only building from Linux and Windows through Cygwin ============"
@@ -90,31 +92,33 @@ else
 fi
 
 # Step 1 -- Prepare temporary folders and files, tweak settings if necessary
-if [[ $OUTPUT_DIR == "" ]]; then
-    TMP_ROOT=`mktemp -d /tmp/cntk.XXXXX || exit $?`
-    echo "============ Creating CNTK temp directory in $TMP_ROOT ============"
-    OUTPUT_DIR=$TMP_ROOT
-fi
-
-CONF_FILE="$OUTPUT_DIR/Simple.conf"
-BUILD_FILE="$OUTPUT_DIR/Build"
-RUN_FILE="$OUTPUT_DIR/Result"
 
 # Get to the root path from which we know how to build and run
 SCRIPT=`readlink -f $0`
 SCRIPT_DIR=`dirname $SCRIPT`
 CNTK_ROOT=`dirname $SCRIPT_DIR`
 
+# Setup the output directory
+if [[ $OUTPUT_DIR == "" ]]; then
+    OUTPUT_DIR="$CNTK_ROOT/.run-$BUILD_OS-$RANDOM"
+fi
+
+echo "============ Creating CNTK temp directory in $OUTPUT_DIR ============"
+mkdir -p $OUTPUT_DIR || exit $?
+
+CONF_FILE="$OUTPUT_DIR/Simple.conf"
+BUILD_FILE="$OUTPUT_DIR/Build"
+RUN_FILE="$OUTPUT_DIR/Result"
+
-if ! [[ -d "$CNTK_ROOT/.git" ]]; then
+if ! [[ -d "$CNTK_ROOT/MachineLearning" ]]; then
     echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============"
-    error 1
+    exit 1
 fi
 
 cd $CNTK_ROOT
 
 if ! [[ -f $CONF_FILE ]]; then
     cp Demos/Simple/Simple.config $CONF_FILE || exit $?
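    # copy only when the config is missing, so a caller-supplied --output-directory
    # can carry a pre-edited config across runs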
- chmod a+r $CONF_FILE fi if [[ $QUIET_BUILD == 1 ]]; then From 5fec0dcca82635cffd0df4effd4ade98c0f99068 Mon Sep 17 00:00:00 2001 From: thhoens Date: Thu, 11 Jun 2015 11:52:36 -0700 Subject: [PATCH 13/21] Fixed a bug where the GPUSparseMatrix class would claim that the MajorIndexCount was equal to the allocated space, instead of the number of elements. This brings it in line with the CPUSparseMatrix class. --- Math/Math/GPUSparseMatrix.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Math/Math/GPUSparseMatrix.h b/Math/Math/GPUSparseMatrix.h index f032a7f68..e2e3d0070 100644 --- a/Math/Math/GPUSparseMatrix.h +++ b/Math/Math/GPUSparseMatrix.h @@ -79,16 +79,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { size_t MajorIndexCount() const { - return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); + return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format); } - size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const + size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const { if (format == matrixFormatSparseBlockCol) return numCols; else if (format == matrixFormatSparseBlockRow) return numRows; else - return numNZReserved; + return numNZ; } size_t MajorIndexSize() const // actual number of major index bytes in use { From adeab1eb201cb532d1587ad6229069b983501faf Mon Sep 17 00:00:00 2001 From: Dong Yu Date: Thu, 11 Jun 2015 18:43:09 -0700 Subject: [PATCH 14/21] implemented the RowStackNode which supports variable number of inputs. Passed unit tests and simple gradient check on MNIST. This change involves many files. --- MachineLearning/CNTK/ComputationNetwork.h | 78 +- MachineLearning/CNTK/ComputationNode.h | 5 + MachineLearning/CNTK/LinearAlgebraNodes.h | 161 +++ .../CNTK/NetworkDescriptionLanguage.cpp | 2 + MachineLearning/CNTK/NonlinearityNodes.h | 930 +++++++++--------- .../CNTK/SynchronousExecutionEngine.h | 53 +- Math/CNTKMathTest/CPUMatrixUnitTests.cpp | 22 +- Math/CNTKMathTest/GPUMatrixUnitTests.cpp | 23 +- Math/Math/CPUMatrix.cpp | 42 + Math/Math/CPUMatrix.h | 1 + Math/Math/GPUMatrix.cu | 57 ++ Math/Math/GPUMatrix.h | 1 + Math/Math/GPUMatrixCUDAKernels.cu | 21 + Math/Math/Matrix.cpp | 62 ++ Math/Math/Matrix.h | 1 + Math/Math/NoGPU.cpp | 1 + 16 files changed, 941 insertions(+), 519 deletions(-) diff --git a/MachineLearning/CNTK/ComputationNetwork.h b/MachineLearning/CNTK/ComputationNetwork.h index 671ecf62d..3b8c515b6 100644 --- a/MachineLearning/CNTK/ComputationNetwork.h +++ b/MachineLearning/CNTK/ComputationNetwork.h @@ -548,41 +548,38 @@ public: } ComputationNodePtr nodePtr = GetNodeFromName(nodeName); - ComputationNodePtr childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4; - switch (numChildren) + std::vector childrenNodes; + childrenNodes.resize(numChildren); + for (int j = 0; j < numChildren; j++) + childrenNodes[j] = GetNodeFromName(childrenNames[j]); + + if (nodePtr->OperationName() == RowStackNode::TypeName()) //allow for variable input nodes + nodePtr->AttachInputs(childrenNodes); + else //fixed input nodes { - case 1: - childNodePtr0 = GetNodeFromName(childrenNames[0]); - nodePtr->AttachInputs(childNodePtr0); - break; - case 2: - childNodePtr0 = GetNodeFromName(childrenNames[0]); - childNodePtr1 = GetNodeFromName(childrenNames[1]); - nodePtr->AttachInputs(childNodePtr0, childNodePtr1); - break; - case 3: - childNodePtr0 = GetNodeFromName(childrenNames[0]); - 
childNodePtr1 = GetNodeFromName(childrenNames[1]); - childNodePtr2 = GetNodeFromName(childrenNames[2]); - nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2); - break; - case 4: - childNodePtr0 = GetNodeFromName(childrenNames[0]); - childNodePtr1 = GetNodeFromName(childrenNames[1]); - childNodePtr2 = GetNodeFromName(childrenNames[2]); - childNodePtr3 = GetNodeFromName(childrenNames[3]); - nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3); - break; - case 5: - childNodePtr0 = GetNodeFromName(childrenNames[0]); - childNodePtr1 = GetNodeFromName(childrenNames[1]); - childNodePtr2 = GetNodeFromName(childrenNames[2]); - childNodePtr3 = GetNodeFromName(childrenNames[3]); - childNodePtr4 = GetNodeFromName(childrenNames[4]); - nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4); - break; - default: - throw std::logic_error("Invalid number of children."); + switch (numChildren) + { + case 1: + nodePtr->AttachInputs(childrenNodes[0]); + break; + case 2: + nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]); + break; + case 3: + nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2]); + break; + case 4: + nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]); + break; + case 5: + nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]); + break; + case 6: + nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]); + break; + default: + throw std::logic_error("Invalid number of children."); + } } } } @@ -1021,6 +1018,8 @@ public: newNode = new LookupTableNode(fstream, modelVersion, m_deviceId, nodeName); else if (nodeType == RowSliceNode::TypeName()) newNode = new RowSliceNode(fstream, modelVersion, m_deviceId, nodeName); + else if (nodeType == RowStackNode::TypeName()) + newNode = new RowStackNode(fstream, modelVersion, m_deviceId, nodeName); else if (nodeType == GMMLogLikelihoodNode::TypeName()) newNode = new GMMLogLikelihoodNode(fstream, modelVersion, m_deviceId, nodeName); else if (nodeType == CosDistanceWithNegativeSamplesNode::TypeName()) @@ -1190,6 +1189,8 @@ public: newNode = new TimeReverseNode(m_deviceId, nodeName); else if (nodeType == CosDistanceWithNegativeSamplesNode::TypeName()) newNode = new CosDistanceWithNegativeSamplesNode(m_deviceId, nodeName); + else if (nodeType == RowStackNode::TypeName()) + newNode = new RowStackNode(m_deviceId, nodeName); else { fprintf(stderr, "Error creating new ComputationNode of type %ls, with name %ls\n", nodeType.c_str(), nodeName.c_str()); @@ -1529,6 +1530,15 @@ public: return newNode; } + ComputationNodePtr RowStack(const std::vector inputs, const std::wstring nodeName = L"") + { + ComputationNodePtr newNode(new RowStackNode(m_deviceId, nodeName)); + newNode->AttachInputs(inputs); + AddNodeToNet(newNode); + + return newNode; + } + ComputationNodePtr GMMLogLikelihood(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature, const std::wstring nodeName = L"") { ComputationNodePtr newNode(new GMMLogLikelihoodNode(m_deviceId, nodeName)); diff --git a/MachineLearning/CNTK/ComputationNode.h b/MachineLearning/CNTK/ComputationNode.h index 9cd84b91b..55471acd3 100644 --- a/MachineLearning/CNTK/ComputationNode.h +++ b/MachineLearning/CNTK/ComputationNode.h @@ -152,6 +152,11 @@ namespace Microsoft { 
namespace MSR { namespace CNTK { throw std::logic_error("This operation does not support six inputs."); } + virtual void AttachInputs(const std::vector& /*inputs*/) + { + throw std::logic_error("This operation does not support variable-length inputs."); + } + virtual void DetachInputs() { m_children.resize(0); diff --git a/MachineLearning/CNTK/LinearAlgebraNodes.h b/MachineLearning/CNTK/LinearAlgebraNodes.h index eb3ecc8bc..ffbda78ea 100644 --- a/MachineLearning/CNTK/LinearAlgebraNodes.h +++ b/MachineLearning/CNTK/LinearAlgebraNodes.h @@ -429,6 +429,167 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class RowSliceNode; template class RowSliceNode; + //this node is used to extract part of the input by rows as the output + //it has to be continuous segments of rows since each column is treated as one sample + template + class RowStackNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + RowStackNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode(deviceId) + { + m_nodeName = (name == L"" ? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + RowStackNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode(deviceId) + { + m_nodeName = (name == L"" ? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + // copy constructor + RowStackNode(const RowStackNode* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode(node->m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"") ? 
NodeName() : newName; + + ComputationNodePtr node = new RowStackNode(this, name, flags); + return node; + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + RowStackNode* node = (RowStackNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeChildren) + { + node->m_children = m_children; + node->m_startRowIndeces = m_startRowIndeces; + node->m_inputMatrices = m_inputMatrices; + } + } + + virtual const std::wstring OperationName() const { return TypeName(); } + static const std::wstring TypeName() { return L"RowStack"; } + + virtual void ComputeInputPartial(const size_t inputIndex) + { + if (inputIndex >= ChildrenSize()) + throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range."); + + ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex + 1] - m_startRowIndeces[inputIndex]); + } + + virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq) + { + if (inputIndex >= ChildrenSize()) + throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range."); + + Matrix sliceInputGrad = Inputs(inputIndex)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + Matrix sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + + ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]); + } + + static void WINAPI ComputeInputPartialS(Matrix& inputGradientValues, const Matrix& gradientValues, const size_t startIndex, const size_t numRows) + { + inputGradientValues.AddWithRowSliceValuesOf(gradientValues, startIndex, numRows); + } + + virtual void EvaluateThisNode() + { + EvaluateThisNodeS(m_functionValues, m_inputMatrices, 0, Inputs(0)->FunctionValues().GetNumCols()); + } + + virtual void EvaluateThisNode(const size_t timeIdxInSeq) + { + Matrix sliceFunctionValues = FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + + EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep); + } + + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) + { + functionValues.AssignRowStackValuesOf(inputMatrices, sliceStartCol, sliceNumCols); +#if NANCHECK + functionValues.HasNan("RowStack"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + unsigned int numInputs = ChildrenSize(); + if (numInputs < 2) + LogicError("RowStack operation: must have two or more inputs."); + + if (Inputs(0) == nullptr) + LogicError("RowStack operation: the input node is NULL."); + + size_t numCols = Inputs(0)->FunctionValues().GetNumCols(); + m_startRowIndeces.resize(ChildrenSize()+1); + m_inputMatrices.resize(ChildrenSize()); + + size_t totalRows = 0; + m_startRowIndeces[0] = 0; + + for (int i = 0; i < ChildrenSize(); i++) + { + if (Inputs(i) == nullptr) + LogicError("RowStack operation: the input node is NULL."); + + Matrix& childMatrix = Inputs(i)->FunctionValues(); + size_t numRows = childMatrix.GetNumRows(); + if (numRows == 0) + LogicError("RowStack operation: the input node %ls has 0 rows.", Inputs(i)->NodeName().c_str()); + + if 
(childMatrix.GetNumCols() != numCols) + LogicError("RowStack operation: the input node %ls has different number of columns.", Inputs(i)->NodeName().c_str()); + + totalRows += numRows; + m_inputMatrices[i] = &childMatrix; + m_startRowIndeces[i + 1] = m_startRowIndeces[i] + numRows; + } + + FunctionValues().Resize(totalRows, numCols); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(0, true); + m_outputHeight = FunctionValues().GetNumRows(); + + //WARNING: this node will destroy the image size information from the child + if (m_inputWidth * m_inputChannels != 1) + fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n"); + } + + virtual void AttachInputs(const std::vector& inputs) + { + unsigned int numInputs = inputs.size(); + m_children.resize(numInputs); + for (unsigned int i = 0; i < numInputs; i++) + m_children[i] = inputs[i]; + } + + private: + std::vector m_startRowIndeces; //start row number in the stacked matrix of each input (child) + std::vector*> m_inputMatrices; + }; + + template class RowStackNode; + template class RowStackNode; + template class ScaleNode : public ComputationNode { diff --git a/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp b/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp index 4f2f2309f..c4e35c5ad 100644 --- a/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp +++ b/MachineLearning/CNTK/NetworkDescriptionLanguage.cpp @@ -220,6 +220,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) ret = true; else if (EqualInsensitive(nodeType, RowSliceNode::TypeName())) ret = true; + else if (EqualInsensitive(nodeType, RowStackNode::TypeName())) + ret = true; else if (EqualInsensitive(nodeType, LookupTableNode::TypeName())) ret = true; else if (EqualInsensitive(nodeType, GMMLogLikelihoodNode::TypeName(), L"GMMLL")) diff --git a/MachineLearning/CNTK/NonlinearityNodes.h b/MachineLearning/CNTK/NonlinearityNodes.h index 032851e96..015f7d884 100644 --- a/MachineLearning/CNTK/NonlinearityNodes.h +++ b/MachineLearning/CNTK/NonlinearityNodes.h @@ -1149,469 +1149,469 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class LogSoftmaxNode; template class LogSoftmaxNode; - //calculates: the log likelihood of a feature given GMM parameters - template - class GMMLogLikelihoodNode : public ComputationNode - { - UsingComputationNodeMembers; - public: - GMMLogLikelihoodNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_prior(deviceId), m_normedDeviation(deviceId), m_normedDeviationVectors(deviceId), m_stddev(deviceId), m_posterior(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L"" ? CreateUniqNodeName() : name); - m_deviceId = deviceId; - MoveMatricesToDevice(deviceId); - InitRecurrentNode(); - } - - GMMLogLikelihoodNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") - : ComputationNode(deviceId), m_prior(deviceId), m_normedDeviation(deviceId), m_normedDeviationVectors(deviceId), m_stddev(deviceId), m_posterior(deviceId), m_temp(deviceId) - { - m_nodeName = (name == L"" ? 
CreateUniqNodeName() : name); - LoadFromFile(fstream, modelVersion, deviceId); - } - - // copy constructor - GMMLogLikelihoodNode(const GMMLogLikelihoodNode* node, const std::wstring& newName, const CopyNodeFlags flags) - : ComputationNode(node->m_deviceId), m_prior(node->m_deviceId), m_normedDeviation(node->m_deviceId), m_normedDeviationVectors(node->m_deviceId), - m_stddev(node->m_deviceId), m_posterior(node->m_deviceId), m_temp(m_deviceId) - { - node->CopyTo(this, newName, flags); - } - - virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const - { - const std::wstring& name = (newName == L"") ? NodeName() : newName; - - ComputationNodePtr node = new GMMLogLikelihoodNode(this, name, flags); - return node; - } - - virtual const std::wstring OperationName() const { return TypeName(); } - static const std::wstring TypeName() { return L"GMMLogLikelihood"; } - - virtual void ComputeInputPartial(const size_t inputIndex) - { - switch (inputIndex) - { - case 0: - ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), m_gradientValues, m_prior, m_posterior, m_temp); - break; - case 1: - ComputeInputPartialMean(Inputs(1)->GradientValues(), m_gradientValues, m_normedDeviationVectors, m_posterior, m_temp); - break; - case 2: - ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), m_gradientValues, m_normedDeviation, m_posterior, m_temp); - break; - case 3: - ComputeInputPartialFeature(Inputs(3)->GradientValues(), m_gradientValues, m_normedDeviationVectors, m_posterior, m_temp); - break; - default: - throw std::invalid_argument("GMMLogLikelihoodNode only takes four inputs."); - } - } - - virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq) - { - //get the right slice - size_t startIndex = timeIdxInSeq * m_samplesInRecurrentStep; - - size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); - - Matrix sliceGradientValue = m_gradientValues.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix slicePosterior = m_posterior.ColumnSlice(startIndex, m_samplesInRecurrentStep); - - switch (inputIndex) - { - case 0: - { - if (colsPrior == 1) - ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); - else - { - Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix slicePrior = m_prior.ColumnSlice(startIndex, m_samplesInRecurrentStep); - ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); - } - } - break; - case 1: - { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); - if (colsPrior == 1) - ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); - else - { - Matrix sliceMeanGradient = Inputs(1)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); - } - } - break; - case 2: - { - Matrix sliceNormedDeviation = m_normedDeviation.ColumnSlice(startIndex, m_samplesInRecurrentStep); - if (colsPrior == 1) - ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); - else - { - Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - 
ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); - } - } - break; - case 3: - { - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceFeatureGradient = Inputs(3)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); - } - break; - default: - throw std::invalid_argument("GMMLogLikelihoodNode criterion only takes four inputs."); - } - } - - static void WINAPI ComputeInputPartialUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, - const Matrix& prior, const Matrix& posterior, Matrix& temp) - { - temp.AssignDifferenceOf(posterior, prior); - temp.RowElementMultiplyWith(gradientValues); - if (prior.GetNumCols() == posterior.GetNumCols()) - { - unnormedPriorGradientValues += temp; - } - else if (prior.GetNumCols() == 1) - { - Matrix::MultiplyAndAdd(temp, false, ConstOnes(posterior.GetNumCols(), 1, unnormedPriorGradientValues.GetDeviceId()), false, unnormedPriorGradientValues); - } - else - { - throw std::runtime_error("GMMLogLikelihoodNode: UnnormedPrior should either have same number of columns as the features or have only one column."); - } - } - - static void WINAPI ComputeInputPartialMean(Matrix& meanGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, - Matrix& posterior, Matrix& temp) - { - size_t numComponent = posterior.GetNumRows(); - size_t numSamples = posterior.GetNumCols(); - size_t featureSize = normedDeviationVectors.GetNumRows() / numComponent; - - temp.SetValue(normedDeviationVectors); //recall normedDeviationVectors <-- (x-u_c)/(stddev^2) - temp.Reshape(featureSize, numSamples* numComponent); - - posterior.Reshape(1, numSamples* numComponent); - temp.RowElementMultiplyWith(posterior); //temp <-- posterior * (x-u_c)/(stddev^2) - - posterior.Reshape(numComponent, numSamples); //reshape back - temp.Reshape(featureSize * numComponent, numSamples); //reshape back - - temp.RowElementMultiplyWith(gradientValues); - - if (numSamples == meanGradientValues.GetNumCols()) - { - meanGradientValues += temp; - } - else if (meanGradientValues.GetNumCols() == 1) - { - Matrix::MultiplyAndAdd(temp, false, ConstOnes(numSamples, 1, meanGradientValues.GetDeviceId()), false, meanGradientValues); - } - else - { - throw std::runtime_error("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); - } - } - - static void WINAPI ComputeInputPartialLogStddev(Matrix& logStddevGradientValues, const Matrix& gradientValues, const Matrix& normedDeviation, - const Matrix& posterior, Matrix& temp) - { - size_t numComponent = posterior.GetNumRows(); - size_t numSamples = posterior.GetNumCols(); - - temp.AssignDifferenceOf(normedDeviation, (ElemType)numComponent); - temp.ElementMultiplyWith(posterior); - temp.RowElementMultiplyWith(gradientValues); - if (logStddevGradientValues.GetNumCols() == numSamples) - { - logStddevGradientValues += temp; - } - else if (logStddevGradientValues.GetNumCols() == 1) - { - Matrix::MultiplyAndAdd(temp, false, ConstOnes(numSamples, 1, logStddevGradientValues.GetDeviceId()), false, logStddevGradientValues); - } - else - { - throw std::runtime_error("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); - } - } - - static void 
WINAPI ComputeInputPartialFeature(Matrix& featureGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, - Matrix& posterior, Matrix& temp) - { - size_t numComponent = posterior.GetNumRows(); - size_t numSamples = posterior.GetNumCols(); - size_t featureSize = normedDeviationVectors.GetNumRows() / numComponent; - - temp.SetValue(normedDeviationVectors); - temp *= -1; - temp.Reshape(featureSize, numSamples* numComponent); - posterior.Reshape(1, numSamples* numComponent); - temp.RowElementMultiplyWith(posterior); - - posterior.Reshape(numComponent, numSamples); - temp.Reshape(featureSize * numComponent, numSamples); - temp.RowElementMultiplyWith(gradientValues); - - for (int i = 0; i < numComponent; i++) - featureGradientValues.AddWithRowSliceValuesOf(temp, i*featureSize, featureSize); - } - - virtual void SetFunctionAndGradientSize(const int numSamples) - { - ComputationNode::SetFunctionAndGradientSize(numSamples); - - size_t numComponents = Inputs(0)->FunctionValues().GetNumRows(); - size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); - //size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); - size_t featureSize = Inputs(3)->FunctionValues().GetNumRows(); - - m_prior.Resize(numComponents, colsPrior); - m_stddev.Resize(numComponents, colsPrior); - m_normedDeviation.Resize(numComponents, numSamples); - m_normedDeviationVectors.Resize(numComponents * featureSize, numSamples); - m_posterior.Resize(numComponents, numSamples); - } - - //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature - virtual void EvaluateThisNode() - { - // all internal matrices will be automatically resized since all of them are assigned to a value so no resize is needed here. - EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), - m_prior, m_stddev, m_normedDeviationVectors, m_normedDeviation, m_posterior, m_temp); - } - - //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature - virtual void EvaluateThisNode(const size_t timeIdxInSeq) - { - size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); - size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); - - //get the right slice - size_t startIndex = timeIdxInSeq * m_samplesInRecurrentStep; - - Matrix sliceOutputValue = m_functionValues.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceFeature = Inputs(3)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceNormedDeviation = m_normedDeviation.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix slicePosterior = m_posterior.ColumnSlice(startIndex, m_samplesInRecurrentStep); - - if (colsPrior == 1) - { - EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), sliceFeature, - m_prior, m_stddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); - } - else if (colsPrior == numSamples) - { - Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceMean = Inputs(1)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix sliceLogstddev = Inputs(2)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); - - Matrix slicePrior = m_prior.ColumnSlice(startIndex, m_samplesInRecurrentStep); - Matrix 
sliceStddev = m_stddev.ColumnSlice(startIndex, m_samplesInRecurrentStep); - - EvaluateThisNodeS(sliceOutputValue, sliceUnnormedPrior, sliceMean, sliceLogstddev, sliceFeature, - slicePrior, sliceStddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); - } - else //should not reach the code since validation should fail already - { - throw std::runtime_error("GMMLogLikelihoodNode: UnnormedPrior should either have same number of columns as the features or have only one column."); - } - - } - - //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature - //If we want to speed up we need to replace following code with a several specialized GPU functions - static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& unnormedPrior, const Matrix& mean, Matrix& logstddev, - const Matrix& feature, Matrix& prior, Matrix& stddev, Matrix& normedDeviationVectors, - Matrix& normedDeviation, Matrix& posterior, Matrix& temp) - { - int numComponent = unnormedPrior.GetNumRows(); - size_t numSamples = feature.GetNumCols(); - size_t featureDim = feature.GetNumRows(); - - //compute prior which is softmax of unnormedPrior - prior.AssignLogSoftmaxOf(unnormedPrior, true); //log prior - - prior.InplaceExp(); - - //compute stddev - stddev.AssignExpOf(logstddev); - -#if DUMPOUTPUT - unnormedPrior.Print("unnormedPrior", 0, min(5, unnormedPrior.GetNumRows() - 1), 0, min(10, unnormedPrior.GetNumCols() - 1)); - mean.Print("mean", 0, min(5, mean.GetNumRows() - 1), 0, min(10, mean.GetNumCols() - 1)); - logstddev.Print("logstddev", 0, min(5, logstddev.GetNumRows() - 1), 0, min(10, logstddev.GetNumCols() - 1)); - - prior.Print("prior", 0, min(5, prior.GetNumRows() - 1), 0, min(10, prior.GetNumCols() - 1)); - stddev.Print("stddev", 0, min(5, stddev.GetNumRows() - 1), 0, min(10, stddev.GetNumCols() - 1)); -#endif - - //compute normedDeviation <-- ||x-u_c||^2/(stddev^2) - normedDeviationVectors.AssignRepeatOf(feature, numComponent, 1); - normedDeviationVectors -= mean; //each column of the mean has multiple mean components - normedDeviationVectors.Reshape(featureDim, numSamples* numComponent); //now each column is feature-mean_i - - normedDeviation.AssignVectorNorm2Of(normedDeviationVectors, true); - normedDeviation ^= 2; - temp.AssignRepeatOf(stddev, 1, numSamples / stddev.GetNumCols()); //stddev.GetNumCols() is either 1 or =numSamples - temp.Reshape(1, temp.GetNumElements()); //one stddev value for each component for each sample - temp ^= 2; - normedDeviation.ElementDivideBy(temp); //normedDeviation and temp have same dim (1, numSamples* numComponent) - - //compute normedDeviationVectors <-- (x-u_c)/(stddev^2) - normedDeviationVectors.RowElementDivideBy(temp); //divide twice - normedDeviationVectors.Reshape(featureDim*numComponent, numSamples); //reshape back - - //compute per-component likelihood - posterior.AssignProductOf(-0.5f, normedDeviation); //posterior <-- -||x-u_c||^2/(stddev^2)/2 and in (1, numSamples* numComponent) dim - temp.InplaceLog(); - temp *= ((ElemType)numComponent / 2.0f); //temp <-- stddev^c and in (1, numSamples* numComponent) dim - posterior -= temp; // posterior <-- exp[-||x-u_c||^2/(stddev^2)/2]/(stddev^c) - posterior -= (ElemType)(numComponent / 2.0f*log(TWO_PI)); //likelihood for each component and sample is now computed and stored in posterior - posterior.InplaceExp(); //posterior <-- exp(-||x-u_c||^2/(stddev^2)/2) - - normedDeviation.Reshape(numComponent, numSamples); //reshape back - posterior.Reshape(numComponent, numSamples); //reshape back - 
- //compute posterior <-- prior_i * likelihood_i - if (unnormedPrior.GetNumCols() == numSamples) //each sample has different prior - posterior.ElementMultiplyWith(prior); - else //all samples share the same prior - posterior.ColumnElementMultiplyWith(prior); - - //compute GMM log-likelihood - Matrix::Multiply(ConstOnes(1, numComponent, posterior.GetDeviceId()), false, posterior, false, functionValues); //functionValues <-- total likelihood - posterior.RowElementDivideBy(functionValues); //posterior <-- per-comp likelihood / total likelihood - functionValues.InplaceLog(); //log likelihood - -#if DUMPOUTPUT - temp.Print("temp", 0, min(5, temp.GetNumRows() - 1), 0, min(10, temp.GetNumCols() - 1)); - normedDeviation.Print("normedDeviation", 0, min(5, normedDeviation.GetNumRows() - 1), 0, min(10, normedDeviation.GetNumCols() - 1)); - - posterior.Print("posterior", 0, min(5, posterior.GetNumRows() - 1), 0, min(10, posterior.GetNumCols() - 1)); - functionValues.Print("functionValues", 0, min(5, functionValues.GetNumRows() - 1), 0, min(10, functionValues.GetNumCols() - 1)); - - functionValues.Print("GMMLogLikelihoodNode"); -#endif - -#if NANCHECK - functionValues.HasNan("GMMLogLikelihood"); -#endif - } - - virtual void Validate() - { - PrintSelfBeforeValidation(); - - if (m_children.size() != 4) - throw std::logic_error("GMMLogLikelihoodNode requires four inputs."); - - size_t rows[4], cols[4]; - for (int i = 0; i < 4; i++) - { - rows[i] = Inputs(i)->FunctionValues().GetNumRows(); - cols[i] = Inputs(i)->FunctionValues().GetNumCols(); - } - - if (cols[0] != cols[1] || cols[0] != cols[2]) - throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input), mean (second input), and logStddev (third input) should have same number of columns."); - - if (cols[0] != 1 && cols[0] != cols[3]) - throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input) should either have same number of columns as the features (fourth input) or have only one column."); - - if (rows[0] != rows[2]) - throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input) should have same dimension as logStddev (third input), i.e., all dimensions in each Gaussian component share the same stddev."); - - if (rows[1] != rows[0]*rows[3]) - throw std::logic_error("GMMLogLikelihoodNode: the number of rows in mean (second input) should equal rows(unnormedPrior(first input) * rows(feature(fourth input))."); - - FunctionValues().Resize(1, cols[3]); - CopyImageSizeFromInputs(); - } - - virtual void CopyImageSizeFromInputs() - { - CopyImageSizeFromInput(3, false); - - m_outputChannels = 1; - m_outputWidth = 1; - m_outputHeight = 1; - } - - //leftNode should be the empirical - virtual void AttachInputs(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature) - { - m_children.resize(4); - m_children[0] = unnormedPrior; - m_children[1] = mean; - m_children[2] = logStddev; - m_children[3] = feature; - } - - virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) - { - ComputationNode::MoveMatricesToDevice(deviceId); - - if (deviceId != AUTOPLACEMATRIX) - { - if (m_prior.GetDeviceId() != deviceId) - { - m_prior.TransferFromDeviceToDevice(m_prior.GetDeviceId(), deviceId, true); - } - if (m_normedDeviation.GetDeviceId() != deviceId) - { - m_normedDeviation.TransferFromDeviceToDevice(m_normedDeviation.GetDeviceId(), deviceId, true); - } - if (m_normedDeviationVectors.GetDeviceId() != deviceId) - { - 
m_normedDeviationVectors.TransferFromDeviceToDevice(m_normedDeviationVectors.GetDeviceId(), deviceId, true); - } - if (m_stddev.GetDeviceId() != deviceId) - { - m_stddev.TransferFromDeviceToDevice(m_stddev.GetDeviceId(), deviceId, true); - } - if (m_posterior.GetDeviceId() != deviceId) - { - m_posterior.TransferFromDeviceToDevice(m_posterior.GetDeviceId(), deviceId, true); - } - } - } - - virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const - { - ComputationNode::CopyTo(nodeP, newName, flags); - GMMLogLikelihoodNode* node = (GMMLogLikelihoodNode*) nodeP; - - if (flags & CopyNodeFlags::copyNodeValue) - { - node->m_prior = m_prior; - node->m_normedDeviation = m_normedDeviation; - node->m_normedDeviationVectors = m_normedDeviationVectors; - node->m_stddev = m_stddev; - node->m_posterior = m_posterior; - } - } - - protected: - Matrix m_prior; - Matrix m_normedDeviation; - Matrix m_normedDeviationVectors; - Matrix m_stddev; - Matrix m_posterior; - Matrix m_temp; - }; - - template class GMMLogLikelihoodNode; - template class GMMLogLikelihoodNode; - + //calculates: the log likelihood of a feature given GMM parameters + template + class GMMLogLikelihoodNode : public ComputationNode + { + UsingComputationNodeMembers; + public: + GMMLogLikelihoodNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_prior(deviceId), m_normedDeviation(deviceId), m_normedDeviationVectors(deviceId), m_stddev(deviceId), m_posterior(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L"" ? CreateUniqNodeName() : name); + m_deviceId = deviceId; + MoveMatricesToDevice(deviceId); + InitRecurrentNode(); + } + + GMMLogLikelihoodNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") + : ComputationNode(deviceId), m_prior(deviceId), m_normedDeviation(deviceId), m_normedDeviationVectors(deviceId), m_stddev(deviceId), m_posterior(deviceId), m_temp(deviceId) + { + m_nodeName = (name == L"" ? CreateUniqNodeName() : name); + LoadFromFile(fstream, modelVersion, deviceId); + } + + // copy constructor + GMMLogLikelihoodNode(const GMMLogLikelihoodNode* node, const std::wstring& newName, const CopyNodeFlags flags) + : ComputationNode(node->m_deviceId), m_prior(node->m_deviceId), m_normedDeviation(node->m_deviceId), m_normedDeviationVectors(node->m_deviceId), + m_stddev(node->m_deviceId), m_posterior(node->m_deviceId), m_temp(m_deviceId) + { + node->CopyTo(this, newName, flags); + } + + virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const + { + const std::wstring& name = (newName == L"") ? 
NodeName() : newName; + + ComputationNodePtr node = new GMMLogLikelihoodNode(this, name, flags); + return node; + } + + virtual const std::wstring OperationName() const { return TypeName(); } + static const std::wstring TypeName() { return L"GMMLogLikelihood"; } + + virtual void ComputeInputPartial(const size_t inputIndex) + { + switch (inputIndex) + { + case 0: + ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), m_gradientValues, m_prior, m_posterior, m_temp); + break; + case 1: + ComputeInputPartialMean(Inputs(1)->GradientValues(), m_gradientValues, m_normedDeviationVectors, m_posterior, m_temp); + break; + case 2: + ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), m_gradientValues, m_normedDeviation, m_posterior, m_temp); + break; + case 3: + ComputeInputPartialFeature(Inputs(3)->GradientValues(), m_gradientValues, m_normedDeviationVectors, m_posterior, m_temp); + break; + default: + throw std::invalid_argument("GMMLogLikelihoodNode only takes four inputs."); + } + } + + virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq) + { + //get the right slice + size_t startIndex = timeIdxInSeq * m_samplesInRecurrentStep; + + size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); + + Matrix sliceGradientValue = m_gradientValues.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix slicePosterior = m_posterior.ColumnSlice(startIndex, m_samplesInRecurrentStep); + + switch (inputIndex) + { + case 0: + { + if (colsPrior == 1) + ComputeInputPartialUnnormedPrior(Inputs(0)->GradientValues(), sliceGradientValue, m_prior, slicePosterior, m_temp); + else + { + Matrix sliceUnnormedPriorGradient = Inputs(0)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix slicePrior = m_prior.ColumnSlice(startIndex, m_samplesInRecurrentStep); + ComputeInputPartialUnnormedPrior(sliceUnnormedPriorGradient, sliceGradientValue, slicePrior, slicePosterior, m_temp); + } + } + break; + case 1: + { + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); + if (colsPrior == 1) + ComputeInputPartialMean(Inputs(1)->GradientValues(), sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); + else + { + Matrix sliceMeanGradient = Inputs(1)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + ComputeInputPartialMean(sliceMeanGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); + } + } + break; + case 2: + { + Matrix sliceNormedDeviation = m_normedDeviation.ColumnSlice(startIndex, m_samplesInRecurrentStep); + if (colsPrior == 1) + ComputeInputPartialLogStddev(Inputs(2)->GradientValues(), sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); + else + { + Matrix sliceLotStddevGradient = Inputs(2)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + ComputeInputPartialLogStddev(sliceLotStddevGradient, sliceGradientValue, sliceNormedDeviation, slicePosterior, m_temp); + } + } + break; + case 3: + { + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceFeatureGradient = Inputs(3)->GradientValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + ComputeInputPartialFeature(sliceFeatureGradient, sliceGradientValue, sliceNormedDeviationVectors, slicePosterior, m_temp); + } + break; + default: + throw std::invalid_argument("GMMLogLikelihoodNode criterion only takes four inputs."); + } + } + + static void WINAPI 
ComputeInputPartialUnnormedPrior(Matrix& unnormedPriorGradientValues, const Matrix& gradientValues, + const Matrix& prior, const Matrix& posterior, Matrix& temp) + { + temp.AssignDifferenceOf(posterior, prior); + temp.RowElementMultiplyWith(gradientValues); + if (prior.GetNumCols() == posterior.GetNumCols()) + { + unnormedPriorGradientValues += temp; + } + else if (prior.GetNumCols() == 1) + { + Matrix::MultiplyAndAdd(temp, false, ConstOnes(posterior.GetNumCols(), 1, unnormedPriorGradientValues.GetDeviceId()), false, unnormedPriorGradientValues); + } + else + { + throw std::runtime_error("GMMLogLikelihoodNode: UnnormedPrior should either have same number of columns as the features or have only one column."); + } + } + + static void WINAPI ComputeInputPartialMean(Matrix& meanGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, + Matrix& posterior, Matrix& temp) + { + size_t numComponent = posterior.GetNumRows(); + size_t numSamples = posterior.GetNumCols(); + size_t featureSize = normedDeviationVectors.GetNumRows() / numComponent; + + temp.SetValue(normedDeviationVectors); //recall normedDeviationVectors <-- (x-u_c)/(stddev^2) + temp.Reshape(featureSize, numSamples* numComponent); + + posterior.Reshape(1, numSamples* numComponent); + temp.RowElementMultiplyWith(posterior); //temp <-- posterior * (x-u_c)/(stddev^2) + + posterior.Reshape(numComponent, numSamples); //reshape back + temp.Reshape(featureSize * numComponent, numSamples); //reshape back + + temp.RowElementMultiplyWith(gradientValues); + + if (numSamples == meanGradientValues.GetNumCols()) + { + meanGradientValues += temp; + } + else if (meanGradientValues.GetNumCols() == 1) + { + Matrix::MultiplyAndAdd(temp, false, ConstOnes(numSamples, 1, meanGradientValues.GetDeviceId()), false, meanGradientValues); + } + else + { + throw std::runtime_error("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); + } + } + + static void WINAPI ComputeInputPartialLogStddev(Matrix& logStddevGradientValues, const Matrix& gradientValues, const Matrix& normedDeviation, + const Matrix& posterior, Matrix& temp) + { + size_t numComponent = posterior.GetNumRows(); + size_t numSamples = posterior.GetNumCols(); + + temp.AssignDifferenceOf(normedDeviation, (ElemType)numComponent); + temp.ElementMultiplyWith(posterior); + temp.RowElementMultiplyWith(gradientValues); + if (logStddevGradientValues.GetNumCols() == numSamples) + { + logStddevGradientValues += temp; + } + else if (logStddevGradientValues.GetNumCols() == 1) + { + Matrix::MultiplyAndAdd(temp, false, ConstOnes(numSamples, 1, logStddevGradientValues.GetDeviceId()), false, logStddevGradientValues); + } + else + { + throw std::runtime_error("GMMLogLikelihoodNode: stddev should either have same number of columns as the features or have only one column."); + } + } + + static void WINAPI ComputeInputPartialFeature(Matrix& featureGradientValues, const Matrix& gradientValues, const Matrix& normedDeviationVectors, + Matrix& posterior, Matrix& temp) + { + size_t numComponent = posterior.GetNumRows(); + size_t numSamples = posterior.GetNumCols(); + size_t featureSize = normedDeviationVectors.GetNumRows() / numComponent; + + temp.SetValue(normedDeviationVectors); + temp *= -1; + temp.Reshape(featureSize, numSamples* numComponent); + posterior.Reshape(1, numSamples* numComponent); + temp.RowElementMultiplyWith(posterior); + + posterior.Reshape(numComponent, numSamples); + temp.Reshape(featureSize * numComponent, 
numSamples); + temp.RowElementMultiplyWith(gradientValues); + + for (int i = 0; i < numComponent; i++) + featureGradientValues.AddWithRowSliceValuesOf(temp, i*featureSize, featureSize); + } + + virtual void SetFunctionAndGradientSize(const int numSamples) + { + ComputationNode::SetFunctionAndGradientSize(numSamples); + + size_t numComponents = Inputs(0)->FunctionValues().GetNumRows(); + size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); + //size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); + size_t featureSize = Inputs(3)->FunctionValues().GetNumRows(); + + m_prior.Resize(numComponents, colsPrior); + m_stddev.Resize(numComponents, colsPrior); + m_normedDeviation.Resize(numComponents, numSamples); + m_normedDeviationVectors.Resize(numComponents * featureSize, numSamples); + m_posterior.Resize(numComponents, numSamples); + } + + //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature + virtual void EvaluateThisNode() + { + // all internal matrices will be automatically resized since all of them are assigned to a value so no resize is needed here. + EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues(), + m_prior, m_stddev, m_normedDeviationVectors, m_normedDeviation, m_posterior, m_temp); + } + + //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature + virtual void EvaluateThisNode(const size_t timeIdxInSeq) + { + size_t colsPrior = Inputs(0)->FunctionValues().GetNumCols(); + size_t numSamples = Inputs(3)->FunctionValues().GetNumCols(); + + //get the right slice + size_t startIndex = timeIdxInSeq * m_samplesInRecurrentStep; + + Matrix sliceOutputValue = m_functionValues.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceFeature = Inputs(3)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceNormedDeviation = m_normedDeviation.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceNormedDeviationVectors = m_normedDeviationVectors.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix slicePosterior = m_posterior.ColumnSlice(startIndex, m_samplesInRecurrentStep); + + if (colsPrior == 1) + { + EvaluateThisNodeS(sliceOutputValue, Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), sliceFeature, + m_prior, m_stddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); + } + else if (colsPrior == numSamples) + { + Matrix sliceUnnormedPrior = Inputs(0)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceMean = Inputs(1)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceLogstddev = Inputs(2)->FunctionValues().ColumnSlice(startIndex, m_samplesInRecurrentStep); + + Matrix slicePrior = m_prior.ColumnSlice(startIndex, m_samplesInRecurrentStep); + Matrix sliceStddev = m_stddev.ColumnSlice(startIndex, m_samplesInRecurrentStep); + + EvaluateThisNodeS(sliceOutputValue, sliceUnnormedPrior, sliceMean, sliceLogstddev, sliceFeature, + slicePrior, sliceStddev, sliceNormedDeviationVectors, sliceNormedDeviation, slicePosterior, m_temp); + } + else //should not reach the code since validation should fail already + { + throw std::runtime_error("GMMLogLikelihoodNode: UnnormedPrior should either have same number of columns as the features or have only one column."); + } + + } + + //input0=unnormedPrior, input1=mean, input2=logstddev, input3=feature + //If we want to speed up we need to replace 
following code with a several specialized GPU functions + static void WINAPI EvaluateThisNodeS(Matrix& functionValues, const Matrix& unnormedPrior, const Matrix& mean, Matrix& logstddev, + const Matrix& feature, Matrix& prior, Matrix& stddev, Matrix& normedDeviationVectors, + Matrix& normedDeviation, Matrix& posterior, Matrix& temp) + { + int numComponent = unnormedPrior.GetNumRows(); + size_t numSamples = feature.GetNumCols(); + size_t featureDim = feature.GetNumRows(); + + //compute prior which is softmax of unnormedPrior + prior.AssignLogSoftmaxOf(unnormedPrior, true); //log prior + + prior.InplaceExp(); + + //compute stddev + stddev.AssignExpOf(logstddev); + +#if DUMPOUTPUT + unnormedPrior.Print("unnormedPrior", 0, min(5, unnormedPrior.GetNumRows() - 1), 0, min(10, unnormedPrior.GetNumCols() - 1)); + mean.Print("mean", 0, min(5, mean.GetNumRows() - 1), 0, min(10, mean.GetNumCols() - 1)); + logstddev.Print("logstddev", 0, min(5, logstddev.GetNumRows() - 1), 0, min(10, logstddev.GetNumCols() - 1)); + + prior.Print("prior", 0, min(5, prior.GetNumRows() - 1), 0, min(10, prior.GetNumCols() - 1)); + stddev.Print("stddev", 0, min(5, stddev.GetNumRows() - 1), 0, min(10, stddev.GetNumCols() - 1)); +#endif + + //compute normedDeviation <-- ||x-u_c||^2/(stddev^2) + normedDeviationVectors.AssignRepeatOf(feature, numComponent, 1); + normedDeviationVectors -= mean; //each column of the mean has multiple mean components + normedDeviationVectors.Reshape(featureDim, numSamples* numComponent); //now each column is feature-mean_i + + normedDeviation.AssignVectorNorm2Of(normedDeviationVectors, true); + normedDeviation ^= 2; + temp.AssignRepeatOf(stddev, 1, numSamples / stddev.GetNumCols()); //stddev.GetNumCols() is either 1 or =numSamples + temp.Reshape(1, temp.GetNumElements()); //one stddev value for each component for each sample + temp ^= 2; + normedDeviation.ElementDivideBy(temp); //normedDeviation and temp have same dim (1, numSamples* numComponent) + + //compute normedDeviationVectors <-- (x-u_c)/(stddev^2) + normedDeviationVectors.RowElementDivideBy(temp); //divide twice + normedDeviationVectors.Reshape(featureDim*numComponent, numSamples); //reshape back + + //compute per-component likelihood + posterior.AssignProductOf(-0.5f, normedDeviation); //posterior <-- -||x-u_c||^2/(stddev^2)/2 and in (1, numSamples* numComponent) dim + temp.InplaceLog(); + temp *= ((ElemType)numComponent / 2.0f); //temp <-- stddev^c and in (1, numSamples* numComponent) dim + posterior -= temp; // posterior <-- exp[-||x-u_c||^2/(stddev^2)/2]/(stddev^c) + posterior -= (ElemType)(numComponent / 2.0f*log(TWO_PI)); //likelihood for each component and sample is now computed and stored in posterior + posterior.InplaceExp(); //posterior <-- exp(-||x-u_c||^2/(stddev^2)/2) + + normedDeviation.Reshape(numComponent, numSamples); //reshape back + posterior.Reshape(numComponent, numSamples); //reshape back + + //compute posterior <-- prior_i * likelihood_i + if (unnormedPrior.GetNumCols() == numSamples) //each sample has different prior + posterior.ElementMultiplyWith(prior); + else //all samples share the same prior + posterior.ColumnElementMultiplyWith(prior); + + //compute GMM log-likelihood + Matrix::Multiply(ConstOnes(1, numComponent, posterior.GetDeviceId()), false, posterior, false, functionValues); //functionValues <-- total likelihood + posterior.RowElementDivideBy(functionValues); //posterior <-- per-comp likelihood / total likelihood + functionValues.InplaceLog(); //log likelihood + +#if DUMPOUTPUT + temp.Print("temp", 0, 
min(5, temp.GetNumRows() - 1), 0, min(10, temp.GetNumCols() - 1)); + normedDeviation.Print("normedDeviation", 0, min(5, normedDeviation.GetNumRows() - 1), 0, min(10, normedDeviation.GetNumCols() - 1)); + + posterior.Print("posterior", 0, min(5, posterior.GetNumRows() - 1), 0, min(10, posterior.GetNumCols() - 1)); + functionValues.Print("functionValues", 0, min(5, functionValues.GetNumRows() - 1), 0, min(10, functionValues.GetNumCols() - 1)); + + functionValues.Print("GMMLogLikelihoodNode"); +#endif + +#if NANCHECK + functionValues.HasNan("GMMLogLikelihood"); +#endif + } + + virtual void Validate() + { + PrintSelfBeforeValidation(); + + if (m_children.size() != 4) + throw std::logic_error("GMMLogLikelihoodNode requires four inputs."); + + size_t rows[4], cols[4]; + for (int i = 0; i < 4; i++) + { + rows[i] = Inputs(i)->FunctionValues().GetNumRows(); + cols[i] = Inputs(i)->FunctionValues().GetNumCols(); + } + + if (cols[0] != cols[1] || cols[0] != cols[2]) + throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input), mean (second input), and logStddev (third input) should have same number of columns."); + + if (cols[0] != 1 && cols[0] != cols[3]) + throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input) should either have same number of columns as the features (fourth input) or have only one column."); + + if (rows[0] != rows[2]) + throw std::logic_error("GMMLogLikelihoodNode: UnnormedPrior (first input) should have same dimension as logStddev (third input), i.e., all dimensions in each Gaussian component share the same stddev."); + + if (rows[1] != rows[0]*rows[3]) + throw std::logic_error("GMMLogLikelihoodNode: the number of rows in mean (second input) should equal rows(unnormedPrior(first input) * rows(feature(fourth input))."); + + FunctionValues().Resize(1, cols[3]); + CopyImageSizeFromInputs(); + } + + virtual void CopyImageSizeFromInputs() + { + CopyImageSizeFromInput(3, false); + + m_outputChannels = 1; + m_outputWidth = 1; + m_outputHeight = 1; + } + + //leftNode should be the empirical + virtual void AttachInputs(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature) + { + m_children.resize(4); + m_children[0] = unnormedPrior; + m_children[1] = mean; + m_children[2] = logStddev; + m_children[3] = feature; + } + + virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) + { + ComputationNode::MoveMatricesToDevice(deviceId); + + if (deviceId != AUTOPLACEMATRIX) + { + if (m_prior.GetDeviceId() != deviceId) + { + m_prior.TransferFromDeviceToDevice(m_prior.GetDeviceId(), deviceId, true); + } + if (m_normedDeviation.GetDeviceId() != deviceId) + { + m_normedDeviation.TransferFromDeviceToDevice(m_normedDeviation.GetDeviceId(), deviceId, true); + } + if (m_normedDeviationVectors.GetDeviceId() != deviceId) + { + m_normedDeviationVectors.TransferFromDeviceToDevice(m_normedDeviationVectors.GetDeviceId(), deviceId, true); + } + if (m_stddev.GetDeviceId() != deviceId) + { + m_stddev.TransferFromDeviceToDevice(m_stddev.GetDeviceId(), deviceId, true); + } + if (m_posterior.GetDeviceId() != deviceId) + { + m_posterior.TransferFromDeviceToDevice(m_posterior.GetDeviceId(), deviceId, true); + } + } + } + + virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const + { + ComputationNode::CopyTo(nodeP, newName, flags); + GMMLogLikelihoodNode* node = (GMMLogLikelihoodNode*) nodeP; + + if (flags & CopyNodeFlags::copyNodeValue) 
+ { + node->m_prior = m_prior; + node->m_normedDeviation = m_normedDeviation; + node->m_normedDeviationVectors = m_normedDeviationVectors; + node->m_stddev = m_stddev; + node->m_posterior = m_posterior; + } + } + + protected: + Matrix m_prior; + Matrix m_normedDeviation; + Matrix m_normedDeviationVectors; + Matrix m_stddev; + Matrix m_posterior; + Matrix m_temp; + }; + + template class GMMLogLikelihoodNode; + template class GMMLogLikelihoodNode; + }}} diff --git a/MachineLearning/CNTK/SynchronousExecutionEngine.h b/MachineLearning/CNTK/SynchronousExecutionEngine.h index 10eeaeecd..a7d2960ce 100644 --- a/MachineLearning/CNTK/SynchronousExecutionEngine.h +++ b/MachineLearning/CNTK/SynchronousExecutionEngine.h @@ -391,26 +391,43 @@ public: { std::vector inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); - switch (inputs.size()) + if (cnNodeType == RowStackNode::TypeName()) //support variable length inputs { - case 1: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); - break; - case 2: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); - break; - case 3: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); - break; - case 4: - nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); - break; - default: - if (nodeParamCount > 0) - RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); - break; - } + std::vector inputNodes; + inputNodes.resize(inputs.size()); + for (int i = 0; i < inputs.size(); i++) + inputNodes[i] = ComputationNodePtr(inputs[i]); + nodePtr->AttachInputs(inputNodes); + } + else + { + switch (inputs.size()) + { + case 1: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); + break; + case 2: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1])); + break; + case 3: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2])); + break; + case 4: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3])); + break; + case 5: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4])); + break; + case 6: + nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]), ComputationNodePtr(inputs[5])); + break; + default: + if (nodeParamCount > 0) + RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str()); + break; + } + } // process common optional parameters (like "tag"); ProcessOptionalParameters(node); break; diff --git a/Math/CNTKMathTest/CPUMatrixUnitTests.cpp b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp index fadebe55e..0c9a762b9 100644 --- a/Math/CNTKMathTest/CPUMatrixUnitTests.cpp +++ b/Math/CNTKMathTest/CPUMatrixUnitTests.cpp @@ -563,7 +563,7 @@ namespace CNTKMathTest Assert::IsTrue(C.IsEqualTo(D1, 0.0001)); } - TEST_METHOD(CPUMatrixRowSlice) + TEST_METHOD(CPUMatrixRowSliceAndStack) { Matrix M0(5,3); M0(0,0) = 1; M0(0,1) = 6; M0(0,2) = 11; @@ -590,6 +590,26 @@ namespace CNTKMathTest M3 += M0; 
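// Adding all rows of M1 into rows [2,4) of M0 in place should reproduce the expected matrix M3.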
M0.AddToRowSliceValuesOf(M1, 2,2); Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); + + M2.AddWithRowSliceValuesOf(M1, 0, 2); + Matrix M4(2, 3); + M4(0, 0) = 6; M4(0, 1) = 16; M4(0, 2) = 26; + M4(1, 0) = 8; M4(1, 1) = 18; M4(1, 2) = 28; + Assert::IsTrue(M2.IsEqualTo(M4, 0.0001)); + + Matrix M5, M6, M7, M8; + M5.AssignRowSliceValuesOf(M0, 0, 2); + M6.AssignRowSliceValuesOf(M0, 2, 1); + M7.AssignRowSliceValuesOf(M0, 3, 2); + + std::vector inputMatrices; + inputMatrices.resize(3); + inputMatrices[0] = &M5; + inputMatrices[1] = &M6; + inputMatrices[2] = &M7; + M8.AssignRowStackValuesOf(inputMatrices, 0, 3); + + Assert::IsTrue(M8.IsEqualTo(M0, 0.0001)); } TEST_METHOD(CPUAssignRepeatOf) diff --git a/Math/CNTKMathTest/GPUMatrixUnitTests.cpp b/Math/CNTKMathTest/GPUMatrixUnitTests.cpp index 86033e27c..c3d25bbe5 100644 --- a/Math/CNTKMathTest/GPUMatrixUnitTests.cpp +++ b/Math/CNTKMathTest/GPUMatrixUnitTests.cpp @@ -278,7 +278,7 @@ namespace CNTKMathTest Assert::IsTrue(M2.IsEqualTo(M3, 0.0001f)); } - TEST_METHOD(GPUMatrixRowSlice) + TEST_METHOD(GPUMatrixRowSliceAndStack) { float *fArray = new float[15]; fArray[0] = 1; fArray[5] = 6; fArray[10] = 11; @@ -308,6 +308,27 @@ namespace CNTKMathTest M3 += M0; M0.AddToRowSliceValuesOf(M1, 2,2); Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); + + M2.AddWithRowSliceValuesOf(M1, 0, 2); + float *fArray4 = new float[6]; + fArray4[0] = 6; fArray4[2] = 16; fArray4[4] = 26; + fArray4[1] = 8; fArray4[3] = 18; fArray4[5] = 28; + GPUMatrix M4(2, 3, fArray4, matrixFlagNormal); + Assert::IsTrue(M2.IsEqualTo(M4, 0.0001)); + + GPUMatrix M5, M6, M7, M8; + M5.AssignRowSliceValuesOf(M0, 0, 2); + M6.AssignRowSliceValuesOf(M0, 2, 1); + M7.AssignRowSliceValuesOf(M0, 3, 2); + + std::vector *> inputMatrices; + inputMatrices.resize(3); + inputMatrices[0] = &M5; + inputMatrices[1] = &M6; + inputMatrices[2] = &M7; + M8.AssignRowStackValuesOf(inputMatrices, 0, 3); + + Assert::IsTrue(M8.IsEqualTo(M0, 0.0001)); } TEST_METHOD(GPUKhatriRaoProduct) diff --git a/Math/Math/CPUMatrix.cpp b/Math/Math/CPUMatrix.cpp index 2c34b52ac..8052fe718 100644 --- a/Math/Math/CPUMatrix.cpp +++ b/Math/Math/CPUMatrix.cpp @@ -389,6 +389,48 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + //stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object. 
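+ //e.g., AssignRowStackValuesOf({&A, &B, &C}, 0, 3) with a 2x3 A, 1x3 B, and 2x3 C yields a 5x3 result
+ //holding [A; B; C], which is what the RowSliceAndStack unit tests above verify.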
+ template + CPUMatrix& CPUMatrix::AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) + { + if (sliceNumCols == 0) + LogicError("AssignRowStackValuesOf: sliceNumCols should > 0."); + + size_t totalRows = 0; + size_t* startRowIndeces = new size_t[inputMatrices.size()]; + startRowIndeces[0] = 0; + for (int i = 0; i < inputMatrices.size(); i++) + { + const CPUMatrix& a = *inputMatrices[i]; + if (a.IsEmpty()) + LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i); + + if (a.GetNumCols() < sliceStartCol + sliceNumCols) + LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i); + + totalRows += a.GetNumRows(); + if (iGetNumRows() * sizeof(ElemType)); + } + } + + delete [] startRowIndeces; + + return *this; + } + template void CPUMatrix::MinusOneAt(CPUMatrix& c, const size_t position) { diff --git a/Math/Math/CPUMatrix.h b/Math/Math/CPUMatrix.h index 59cce206f..cebe2afc8 100644 --- a/Math/Math/CPUMatrix.h +++ b/Math/Math/CPUMatrix.h @@ -244,6 +244,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CPUMatrix& AssignRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows); CPUMatrix& AddToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows); CPUMatrix& AddWithRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows); + CPUMatrix& AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols); CPUMatrix& AssignRepeatOf(const CPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats); CPUMatrix& AssignPositiveAndShiftedNegSample(const CPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber); diff --git a/Math/Math/GPUMatrix.cu b/Math/Math/GPUMatrix.cu index fd01aa342..792bf24c4 100644 --- a/Math/Math/GPUMatrix.cu +++ b/Math/Math/GPUMatrix.cu @@ -650,6 +650,63 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + //stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object. 
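+ //The GPU implementation below gathers each input's device buffer pointer and cumulative row offset
+ //on the host, copies both arrays to the device, and launches a single _assignRowStackValuesOf kernel
+ //in which every output element looks up the source matrix whose row range contains it.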
+ template + GPUMatrix& GPUMatrix::AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) + { + if (sliceNumCols == 0) + LogicError("AssignRowStackValuesOf: sliceNumCols should > 0."); + + size_t totalRows = 0; + size_t* startRowIndeces = new size_t[inputMatrices.size()+1]; + ElemType ** bufferPointersInInputMatrices = new ElemType*[inputMatrices.size()]; + + startRowIndeces[0] = 0; + + for (int i = 0; i < inputMatrices.size(); i++) + { + const GPUMatrix& a = *inputMatrices[i]; + if (a.IsEmpty()) + LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i); + + if (a.GetNumCols() < sliceStartCol + sliceNumCols) + LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i); + + totalRows += a.GetNumRows(); + startRowIndeces[i + 1] = startRowIndeces[i] + a.GetNumRows(); + + bufferPointersInInputMatrices[i] = a.m_pArray + a.LocateColumn(sliceStartCol); + } + + Resize(totalRows, sliceNumCols); + + PrepareDevice(); + + ElemType** bufferPointersInGPU = NULL; + CUDA_CALL(cudaMalloc((void***)&bufferPointersInGPU, inputMatrices.size()*sizeof(ElemType*))); + CUDA_CALL(cudaMemcpy(bufferPointersInGPU, bufferPointersInInputMatrices, inputMatrices.size()*sizeof(ElemType*), cudaMemcpyHostToDevice)); + delete[] bufferPointersInInputMatrices; + + size_t* startRowIndecesInGPU = NULL; + CUDA_CALL(cudaMalloc((void**)&startRowIndecesInGPU, (1+inputMatrices.size())*sizeof(size_t))); + CUDA_CALL(cudaMemcpy(startRowIndecesInGPU, startRowIndeces, (1+inputMatrices.size())*sizeof(size_t), cudaMemcpyHostToDevice)); + delete[] startRowIndeces; + + LONG64 N = (LONG64)GetNumElements(); + int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock); + cudaEvent_t done = nullptr; + if (do_sync) CUDA_CALL(cudaEventCreate(&done)); + _assignRowStackValuesOf << > >(m_pArray, bufferPointersInGPU, startRowIndecesInGPU, (long) inputMatrices.size(), N, (long)GetNumRows(), (long)GetNumCols()); + if (do_sync) CUDA_CALL(cudaEventRecord(done)); + if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); + if (do_sync) CUDA_CALL(cudaEventDestroy(done)); + + CUDA_CALL(cudaFree(bufferPointersInGPU)); + CUDA_CALL(cudaFree(startRowIndecesInGPU)); + + return *this; + } + /// c = c - 1.0 for a specific position template void GPUMatrix::MinusOneAt(GPUMatrix& c, const size_t position) diff --git a/Math/Math/GPUMatrix.h b/Math/Math/GPUMatrix.h index 7d0328fd4..1e5a780cf 100644 --- a/Math/Math/GPUMatrix.h +++ b/Math/Math/GPUMatrix.h @@ -273,6 +273,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { GPUMatrix& AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows); GPUMatrix& AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows); GPUMatrix& AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows); + GPUMatrix& AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols); GPUMatrix& AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats); GPUMatrix& AssignPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber); diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu index 3a6e6d201..b507dd445 100644 --- a/Math/Math/GPUMatrixCUDAKernels.cu +++ b/Math/Math/GPUMatrixCUDAKernels.cu @@ -364,6 +364,27 @@ __global__ void _addWithRowSliceValuesOf(ElemType * dest, 
ElemType * src, const dest[id] += src[IDX2C(row + startIndex, col, srcRows)]; } +template +__global__ void _assignRowStackValuesOf(ElemType * dest, ElemType ** srces, size_t* startRowIndeces, const LONG64 numSrces, const LONG64 N, const long destRows, const long destCols) +{ + LONG64 id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= N) + return; + + long col = id / destRows; //dest is the full matrix, rowslice is taken from the src + long row = id - (col * destRows); + + //can we replace the for loop with something better? + int srcId = 0; + for (; srcId < numSrces; srcId++) + { + if (startRowIndeces[srcId + 1]>row) + break; + } + + dest[id] = srces[srcId][IDX2C(row - startRowIndeces[srcId], col, startRowIndeces[srcId+1] - startRowIndeces[srcId])]; +} + template __global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows) { diff --git a/Math/Math/Matrix.cpp b/Math/Math/Matrix.cpp index ec53f4b07..29e6e9247 100644 --- a/Math/Math/Matrix.cpp +++ b/Math/Math/Matrix.cpp @@ -1477,6 +1477,68 @@ namespace Microsoft { namespace MSR { namespace CNTK { return *this; } + //stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object. + template + Matrix& Matrix::AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) + { + for (int i = 0; i < inputMatrices.size(); i++) + { + const Matrix& a = *inputMatrices[i]; + DecideAndMoveToRightDevice(*this, a); + + //WARNING: a and this must have same type + if (!(GetMatrixType() == a.GetMatrixType())) + NOT_IMPLEMENTED; + } + + CurrentDataLocation curLocation = GetCurrentMatrixLocation(); + if (curLocation == CurrentDataLocation::GPU || curLocation == CurrentDataLocation::BOTH) + { + if (GetMatrixType() != MatrixType::SPARSE) + { + //GPUDense; + std::vector*> gpuInputMatrices; + gpuInputMatrices.resize(inputMatrices.size()); + for (int i = 0; i < inputMatrices.size(); i++) + gpuInputMatrices[i] = inputMatrices[i]->m_GPUMatrix; + + m_GPUMatrix->AssignRowStackValuesOf(gpuInputMatrices, sliceStartCol, sliceNumCols); + + SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); + } + else + { + NOT_IMPLEMENTED; + } + } + else if (curLocation == CurrentDataLocation::CPU) + { + if (GetMatrixType() != MatrixType::SPARSE) + { + //CPUDense; + std::vector*> cpuInputMatrices; + cpuInputMatrices.resize(inputMatrices.size()); + for (int i = 0; i < inputMatrices.size(); i++) + cpuInputMatrices[i] = inputMatrices[i]->m_CPUMatrix; + + m_CPUMatrix->AssignRowStackValuesOf(cpuInputMatrices, sliceStartCol, sliceNumCols); + + SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); + } + else + { + NOT_IMPLEMENTED; + } + } + else + { + throw std::runtime_error("Matrices do not exist in either CPU or GPU."); + } + + return *this; + } + + template Matrix& Matrix::AssignRepeatOf(const Matrix& a, const size_t numRowRepeats, const size_t numColRepeats) { diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index 033647ac0..aa224cafc 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -255,6 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix& AssignRowSliceValuesOf(const Matrix& a, const size_t startIndex, const size_t numRows); Matrix& AddToRowSliceValuesOf(const Matrix& a, const size_t startIndex, const size_t numRows); Matrix& AddWithRowSliceValuesOf(const Matrix& a, const size_t startIndex, const size_t numRows); + Matrix& 
AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols); Matrix& AssignRepeatOf(const Matrix& a, const size_t numRowRepeats, const size_t numColRepeats); Matrix& AssignPositiveAndShiftedNegSample(const Matrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber); diff --git a/Math/Math/NoGPU.cpp b/Math/Math/NoGPU.cpp index e120c7f08..05c78b5f9 100644 --- a/Math/Math/NoGPU.cpp +++ b/Math/Math/NoGPU.cpp @@ -477,6 +477,7 @@ namespace Microsoft { //for each column of a, we add all rows of a to this starting from startIndex template GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& /*a*/, const size_t startIndex, const size_t numRows) { return *this; } template GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& /*a*/, const size_t startIndex, const size_t numRows) { return *this; } + GPUMatrix& AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) { return *this; } template GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& /*a*/, const size_t numRowRepeats, const size_t numColRepeats) { return *this; } template GPUMatrix& GPUMatrix::AssignPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { return *this; } From a391ea7a7d0aa3040d1fb1c9a2048373ed4472b1 Mon Sep 17 00:00:00 2001 From: Dong Yu Date: Thu, 11 Jun 2015 18:48:35 -0700 Subject: [PATCH 15/21] ignore this. For some reason I have to commit it to be able to merge with the server. --- Scripts/build-and-test | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 Scripts/build-and-test diff --git a/Scripts/build-and-test b/Scripts/build-and-test old mode 100755 new mode 100644 From 8d0a82b5047f59ff1c80f1d64661b3ebfd9b87f3 Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Thu, 11 Jun 2015 17:38:50 -0700 Subject: [PATCH 16/21] Add back chmod for config file, it is needed on Windows/Cygwin --- Scripts/build-and-test | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Scripts/build-and-test b/Scripts/build-and-test index 4b452dd9d..669a28a19 100644 --- a/Scripts/build-and-test +++ b/Scripts/build-and-test @@ -119,6 +119,10 @@ cd $CNTK_ROOT if ! [[ -f $CONF_FILE ]]; then cp Demos/Simple/Simple.config $CONF_FILE || exit $? + + # This chmod is necessary due to restrictive Cygwin interpretation of Windows permissions. + # Cygwin interprets Windows permissions as ----rwx---, which lacks read permissions for user. + chmod a+r $CONF_FILE || exit $? 
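+ # (a+r grants read to user, group, and other, so the copied config stays readable however Cygwin maps the ACL.)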
fi if [[ $QUIET_BUILD == 1 ]]; then From 44b510d9e819e5d11e01dee71260423dfef6b58d Mon Sep 17 00:00:00 2001 From: Marko Radmilac Date: Fri, 12 Jun 2015 11:43:31 -0700 Subject: [PATCH 17/21] Reset build script to proper Linux permissions --- Scripts/build-and-test | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Scripts/build-and-test diff --git a/Scripts/build-and-test b/Scripts/build-and-test old mode 100644 new mode 100755 From 34fdae054ce69a1a30f4ac009d87ff979fa89e5d Mon Sep 17 00:00:00 2001 From: amitaga Date: Fri, 12 Jun 2015 14:10:27 -0700 Subject: [PATCH 18/21] Fixed Linux build --- MachineLearning/CNTK/ComputationNetwork.h | 5 +++-- MachineLearning/CNTK/ComputationNode.h | 4 +++- MachineLearning/CNTK/LinearAlgebraNodes.h | 2 +- MachineLearning/CNTK/SynchronousExecutionEngine.h | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/MachineLearning/CNTK/ComputationNetwork.h b/MachineLearning/CNTK/ComputationNetwork.h index 3b8c515b6..b024d20b9 100644 --- a/MachineLearning/CNTK/ComputationNetwork.h +++ b/MachineLearning/CNTK/ComputationNetwork.h @@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { protected: typedef ComputationNode* ComputationNodePtr; + typedef const ComputationNode* ConstComputationNodePtr; typedef std::pair ComputationArc; typedef struct stRecurrentInfo{ std::vector m_recurrentNodes; @@ -548,7 +549,7 @@ public: } ComputationNodePtr nodePtr = GetNodeFromName(nodeName); - std::vector childrenNodes; + std::vector childrenNodes; childrenNodes.resize(numChildren); for (int j = 0; j < numChildren; j++) childrenNodes[j] = GetNodeFromName(childrenNames[j]); @@ -1530,7 +1531,7 @@ public: return newNode; } - ComputationNodePtr RowStack(const std::vector inputs, const std::wstring nodeName = L"") + ComputationNodePtr RowStack(const std::vector inputs, const std::wstring nodeName = L"") { ComputationNodePtr newNode(new RowStackNode(m_deviceId, nodeName)); newNode->AttachInputs(inputs); diff --git a/MachineLearning/CNTK/ComputationNode.h b/MachineLearning/CNTK/ComputationNode.h index 55471acd3..a78f76a24 100644 --- a/MachineLearning/CNTK/ComputationNode.h +++ b/MachineLearning/CNTK/ComputationNode.h @@ -57,6 +57,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { protected: //std containers such as list and map does not support class reference so we need to use pointer typedef ComputationNode* ComputationNodePtr; + typedef const ComputationNode* ConstComputationNodePtr; typedef std::pair ComputationArc; int m_loopId; size_t m_samplesInRecurrentStep; @@ -152,7 +153,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { throw std::logic_error("This operation does not support six inputs."); } - virtual void AttachInputs(const std::vector& /*inputs*/) + virtual void AttachInputs(const std::vector& /*inputs*/) { throw std::logic_error("This operation does not support variable-length inputs."); } @@ -919,6 +920,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef ComputationNode B; \ protected: \ typedef ComputationNode* ComputationNodePtr; \ + typedef const ComputationNode* ConstComputationNodePtr; \ public: \ using B::AttachInputs; using B::ChildrenNeedGradient; using B::ChildrenSize; using B::ClearGradientForChildren; \ using B::ComputeGradientForChildren; using B::ComputeInputPartial; using B::ConstOnes; using B::CopyImageSizeFromInput; \ diff --git a/MachineLearning/CNTK/LinearAlgebraNodes.h b/MachineLearning/CNTK/LinearAlgebraNodes.h index ffbda78ea..7961766e3 100644 --- 

From 34fdae054ce69a1a30f4ac009d87ff979fa89e5d Mon Sep 17 00:00:00 2001
From: amitaga
Date: Fri, 12 Jun 2015 14:10:27 -0700
Subject: [PATCH 18/21] Fixed Linux build

---
 MachineLearning/CNTK/ComputationNetwork.h         | 5 +++--
 MachineLearning/CNTK/ComputationNode.h            | 4 +++-
 MachineLearning/CNTK/LinearAlgebraNodes.h         | 2 +-
 MachineLearning/CNTK/SynchronousExecutionEngine.h | 3 ++-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/MachineLearning/CNTK/ComputationNetwork.h b/MachineLearning/CNTK/ComputationNetwork.h
index 3b8c515b6..b024d20b9 100644
--- a/MachineLearning/CNTK/ComputationNetwork.h
+++ b/MachineLearning/CNTK/ComputationNetwork.h
@@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 {
 protected:
     typedef ComputationNode<ElemType>* ComputationNodePtr;
+    typedef const ComputationNode<ElemType>* ConstComputationNodePtr;
     typedef std::pair<ComputationNodePtr, ComputationNodePtr> ComputationArc;
     typedef struct stRecurrentInfo{
         std::vector<ComputationNodePtr> m_recurrentNodes;
@@ -548,7 +549,7 @@ public:
         }
 
         ComputationNodePtr nodePtr = GetNodeFromName(nodeName);
-        std::vector<ComputationNodePtr> childrenNodes;
+        std::vector<ConstComputationNodePtr> childrenNodes;
         childrenNodes.resize(numChildren);
         for (int j = 0; j < numChildren; j++)
             childrenNodes[j] = GetNodeFromName(childrenNames[j]);
@@ -1530,7 +1531,7 @@ public:
         return newNode;
     }
 
-    ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> inputs, const std::wstring nodeName = L"")
+    ComputationNodePtr RowStack(const std::vector<ConstComputationNodePtr> inputs, const std::wstring nodeName = L"")
     {
         ComputationNodePtr newNode(new RowStackNode<ElemType>(m_deviceId, nodeName));
         newNode->AttachInputs(inputs);
diff --git a/MachineLearning/CNTK/ComputationNode.h b/MachineLearning/CNTK/ComputationNode.h
index 55471acd3..a78f76a24 100644
--- a/MachineLearning/CNTK/ComputationNode.h
+++ b/MachineLearning/CNTK/ComputationNode.h
@@ -57,6 +57,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 protected:
     //std containers such as list and map does not support class reference so we need to use pointer
     typedef ComputationNode<ElemType>* ComputationNodePtr;
+    typedef const ComputationNode<ElemType>* ConstComputationNodePtr;
     typedef std::pair<ComputationNodePtr, ComputationNodePtr> ComputationArc;
     int m_loopId;
     size_t m_samplesInRecurrentStep;
@@ -152,7 +153,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         throw std::logic_error("This operation does not support six inputs.");
     }
 
-    virtual void AttachInputs(const std::vector<ComputationNodePtr>& /*inputs*/)
+    virtual void AttachInputs(const std::vector<ConstComputationNodePtr>& /*inputs*/)
     {
         throw std::logic_error("This operation does not support variable-length inputs.");
     }
@@ -919,6 +920,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     typedef ComputationNode<ElemType> B; \
 protected: \
     typedef ComputationNode<ElemType>* ComputationNodePtr; \
+    typedef const ComputationNode<ElemType>* ConstComputationNodePtr; \
 public: \
     using B::AttachInputs; using B::ChildrenNeedGradient; using B::ChildrenSize; using B::ClearGradientForChildren; \
     using B::ComputeGradientForChildren; using B::ComputeInputPartial; using B::ConstOnes; using B::CopyImageSizeFromInput; \
diff --git a/MachineLearning/CNTK/LinearAlgebraNodes.h b/MachineLearning/CNTK/LinearAlgebraNodes.h
index ffbda78ea..7961766e3 100644
--- a/MachineLearning/CNTK/LinearAlgebraNodes.h
+++ b/MachineLearning/CNTK/LinearAlgebraNodes.h
@@ -574,7 +574,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n");
         }
 
-        virtual void AttachInputs(const std::vector<ComputationNodePtr>& inputs)
+        virtual void AttachInputs(const std::vector<ConstComputationNodePtr>& inputs)
         {
             unsigned int numInputs = inputs.size();
             m_children.resize(numInputs);
diff --git a/MachineLearning/CNTK/SynchronousExecutionEngine.h b/MachineLearning/CNTK/SynchronousExecutionEngine.h
index a7d2960ce..72491d131 100644
--- a/MachineLearning/CNTK/SynchronousExecutionEngine.h
+++ b/MachineLearning/CNTK/SynchronousExecutionEngine.h
@@ -393,7 +393,7 @@ public:
             if (cnNodeType == RowStackNode<ElemType>::TypeName()) //support variable length inputs
             {
-                std::vector<ComputationNodePtr> inputNodes;
+                std::vector<ConstComputationNodePtr> inputNodes;
                 inputNodes.resize(inputs.size());
                 for (int i = 0; i < inputs.size(); i++)
                     inputNodes[i] = ComputationNodePtr(inputs[i]);
@@ -734,6 +734,7 @@ public:
 private:
     ComputationNetwork<ElemType>& m_net;
     typedef ComputationNode<ElemType>* ComputationNodePtr;
+    typedef const ComputationNode<ElemType>* ConstComputationNodePtr;
     void operator=(const SynchronousNodeEvaluator&);
 };
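
An aside on the ConstComputationNodePtr typedef added above (and reverted in the
next patch), since it touches a classic C++ pitfall that plausibly made this fix
take several attempts: const applied to an already-typedef'd pointer binds to the
pointer itself, not the pointee, so const ComputationNodePtr means
ComputationNode<ElemType>* const rather than const ComputationNode<ElemType>*.
A separate pointer-to-const typedef is the usual way out. A minimal standalone
sketch (Node and NodePtr are illustrative stand-ins, not CNTK types):

    #include <vector>

    struct Node { int value = 0; };

    typedef Node* NodePtr;
    typedef const Node* ConstNodePtr;    // pointer-to-const, as in the patch above

    int main()
    {
        Node n;

        ConstNodePtr p = &n;             // may read, but not write, through p
        // p->value = 1;                 // error: the pointee is const

        const NodePtr q = &n;            // NOT the same thing: this is Node* const
        q->value = 1;                    // fine: the pointee is still mutable

        std::vector<ConstNodePtr> v{p};  // a vector of pointers-to-const is legal
        (void)v;
        return 0;
    }

This is also why std::vector<const ComputationNodePtr> is not a substitute: that
would be a vector of const-qualified pointer elements, which standard containers
do not permit.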
not support six inputs."); } - virtual void AttachInputs(const std::vector& /*inputs*/) + virtual void AttachInputs(const std::vector& /*inputs*/) { throw std::logic_error("This operation does not support variable-length inputs."); } @@ -920,7 +919,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { typedef ComputationNode B; \ protected: \ typedef ComputationNode* ComputationNodePtr; \ - typedef const ComputationNode* ConstComputationNodePtr; \ public: \ using B::AttachInputs; using B::ChildrenNeedGradient; using B::ChildrenSize; using B::ClearGradientForChildren; \ using B::ComputeGradientForChildren; using B::ComputeInputPartial; using B::ConstOnes; using B::CopyImageSizeFromInput; \ diff --git a/MachineLearning/CNTK/LinearAlgebraNodes.h b/MachineLearning/CNTK/LinearAlgebraNodes.h index 7961766e3..ffbda78ea 100644 --- a/MachineLearning/CNTK/LinearAlgebraNodes.h +++ b/MachineLearning/CNTK/LinearAlgebraNodes.h @@ -574,7 +574,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n"); } - virtual void AttachInputs(const std::vector& inputs) + virtual void AttachInputs(const std::vector& inputs) { unsigned int numInputs = inputs.size(); m_children.resize(numInputs); diff --git a/MachineLearning/CNTK/SynchronousExecutionEngine.h b/MachineLearning/CNTK/SynchronousExecutionEngine.h index 72491d131..a7d2960ce 100644 --- a/MachineLearning/CNTK/SynchronousExecutionEngine.h +++ b/MachineLearning/CNTK/SynchronousExecutionEngine.h @@ -393,7 +393,7 @@ public: if (cnNodeType == RowStackNode::TypeName()) //support variable length inputs { - std::vector inputNodes; + std::vector inputNodes; inputNodes.resize(inputs.size()); for (int i = 0; i < inputs.size(); i++) inputNodes[i] = ComputationNodePtr(inputs[i]); @@ -734,7 +734,6 @@ public: private: ComputationNetwork& m_net; typedef ComputationNode* ComputationNodePtr; - typedef const ComputationNode* ConstComputationNodePtr; void operator=(const SynchronousNodeEvaluator&); }; From e1566298d55bec28c1fa28fdd66d1468060b6a93 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Fri, 12 Jun 2015 14:39:06 -0700 Subject: [PATCH 20/21] Fixed Linux build --- MachineLearning/CNTK/ComputationNetwork.h | 4 ++-- MachineLearning/CNTK/ComputationNode.h | 2 +- MachineLearning/CNTK/LinearAlgebraNodes.h | 2 +- MachineLearning/CNTK/SynchronousExecutionEngine.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/MachineLearning/CNTK/ComputationNetwork.h b/MachineLearning/CNTK/ComputationNetwork.h index 3b8c515b6..2ae6a0851 100644 --- a/MachineLearning/CNTK/ComputationNetwork.h +++ b/MachineLearning/CNTK/ComputationNetwork.h @@ -548,7 +548,7 @@ public: } ComputationNodePtr nodePtr = GetNodeFromName(nodeName); - std::vector childrenNodes; + std::vector childrenNodes; childrenNodes.resize(numChildren); for (int j = 0; j < numChildren; j++) childrenNodes[j] = GetNodeFromName(childrenNames[j]); @@ -1530,7 +1530,7 @@ public: return newNode; } - ComputationNodePtr RowStack(const std::vector inputs, const std::wstring nodeName = L"") + ComputationNodePtr RowStack(const std::vector inputs, const std::wstring nodeName = L"") { ComputationNodePtr newNode(new RowStackNode(m_deviceId, nodeName)); newNode->AttachInputs(inputs); diff --git a/MachineLearning/CNTK/ComputationNode.h b/MachineLearning/CNTK/ComputationNode.h index 55471acd3..888060edd 100644 --- a/MachineLearning/CNTK/ComputationNode.h +++ b/MachineLearning/CNTK/ComputationNode.h 

From e1566298d55bec28c1fa28fdd66d1468060b6a93 Mon Sep 17 00:00:00 2001
From: Amit Agarwal
Date: Fri, 12 Jun 2015 14:39:06 -0700
Subject: [PATCH 20/21] Fixed Linux build

---
 MachineLearning/CNTK/ComputationNetwork.h         | 4 ++--
 MachineLearning/CNTK/ComputationNode.h            | 2 +-
 MachineLearning/CNTK/LinearAlgebraNodes.h         | 2 +-
 MachineLearning/CNTK/SynchronousExecutionEngine.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/MachineLearning/CNTK/ComputationNetwork.h b/MachineLearning/CNTK/ComputationNetwork.h
index 3b8c515b6..2ae6a0851 100644
--- a/MachineLearning/CNTK/ComputationNetwork.h
+++ b/MachineLearning/CNTK/ComputationNetwork.h
@@ -548,7 +548,7 @@ public:
         }
 
         ComputationNodePtr nodePtr = GetNodeFromName(nodeName);
-        std::vector<ComputationNodePtr> childrenNodes;
+        std::vector<const ComputationNode<ElemType>*> childrenNodes;
         childrenNodes.resize(numChildren);
         for (int j = 0; j < numChildren; j++)
             childrenNodes[j] = GetNodeFromName(childrenNames[j]);
@@ -1530,7 +1530,7 @@ public:
         return newNode;
     }
 
-    ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> inputs, const std::wstring nodeName = L"")
+    ComputationNodePtr RowStack(const std::vector<const ComputationNode<ElemType>*> inputs, const std::wstring nodeName = L"")
     {
         ComputationNodePtr newNode(new RowStackNode<ElemType>(m_deviceId, nodeName));
         newNode->AttachInputs(inputs);
diff --git a/MachineLearning/CNTK/ComputationNode.h b/MachineLearning/CNTK/ComputationNode.h
index 55471acd3..888060edd 100644
--- a/MachineLearning/CNTK/ComputationNode.h
+++ b/MachineLearning/CNTK/ComputationNode.h
@@ -152,7 +152,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         throw std::logic_error("This operation does not support six inputs.");
     }
 
-    virtual void AttachInputs(const std::vector<ComputationNodePtr>& /*inputs*/)
+    virtual void AttachInputs(const std::vector<const ComputationNode<ElemType>*>& /*inputs*/)
     {
         throw std::logic_error("This operation does not support variable-length inputs.");
     }
diff --git a/MachineLearning/CNTK/LinearAlgebraNodes.h b/MachineLearning/CNTK/LinearAlgebraNodes.h
index ffbda78ea..d2b3d3302 100644
--- a/MachineLearning/CNTK/LinearAlgebraNodes.h
+++ b/MachineLearning/CNTK/LinearAlgebraNodes.h
@@ -574,7 +574,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n");
         }
 
-        virtual void AttachInputs(const std::vector<ComputationNodePtr>& inputs)
+        virtual void AttachInputs(const std::vector<const ComputationNode<ElemType>*>& inputs)
        {
             unsigned int numInputs = inputs.size();
             m_children.resize(numInputs);
diff --git a/MachineLearning/CNTK/SynchronousExecutionEngine.h b/MachineLearning/CNTK/SynchronousExecutionEngine.h
index a7d2960ce..24bf24f08 100644
--- a/MachineLearning/CNTK/SynchronousExecutionEngine.h
+++ b/MachineLearning/CNTK/SynchronousExecutionEngine.h
@@ -393,7 +393,7 @@ public:
             if (cnNodeType == RowStackNode<ElemType>::TypeName()) //support variable length inputs
             {
-                std::vector<ComputationNodePtr> inputNodes;
+                std::vector<const ComputationNode<ElemType>*> inputNodes;
                 inputNodes.resize(inputs.size());
                 for (int i = 0; i < inputs.size(); i++)
                     inputNodes[i] = ComputationNodePtr(inputs[i]);

From faf6925bdc57d9c2ea44db1241a572aac1af5a96 Mon Sep 17 00:00:00 2001
From: Yu Zhang
Date: Tue, 16 Jun 2015 10:10:09 -0700
Subject: [PATCH 21/21] Fix the compile error on windows

---
 MachineLearning/CNTK/RecurrentNodes.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/MachineLearning/CNTK/RecurrentNodes.h b/MachineLearning/CNTK/RecurrentNodes.h
index abb890e78..0d382b164 100644
--- a/MachineLearning/CNTK/RecurrentNodes.h
+++ b/MachineLearning/CNTK/RecurrentNodes.h
@@ -218,10 +218,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             {
                 if (colBegin(i,0) == SENTENCE_MIDDLE)
                 {
-                    Matrix<ElemType> to1 = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1);
-                    Matrix<ElemType> frm1= gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1);
+                    Matrix<ElemType> frm = gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1);
+                    Matrix<ElemType> to = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1);
 
-                    to1 += frm1;
+                    to += frm;
                 }
             }
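
Finally, a note on the indexing in the RecurrentNodes.h hunk above, since it is
terse: recurrent minibatches interleave mNbr parallel sequences column by column,
so column t*mNbr + i holds time step t of stream i, and the delay node's gradient
at time t accumulates into time t - delay of the same stream; the renamed frm/to
slices are single-column views into exactly those positions, which is why to += frm
updates inputGradientValues in place. A standalone sketch of just the index
arithmetic (the constant values are illustrative, not from the patch):

    #include <cstdio>

    int main()
    {
        const int mNbr = 4;    // parallel sequences in the minibatch (illustrative)
        const int delay = 1;   // recurrence delay (illustrative)

        const int timeIdxInSeq = 3;  // time step within the sequence
        const int i = 2;             // which parallel stream

        // Column holding the incoming gradient, and the column it flows into:
        const int frmCol = timeIdxInSeq * mNbr + i;
        const int toCol  = (timeIdxInSeq - delay) * mNbr + i;

        std::printf("gradient flows from column %d to column %d\n", frmCol, toCol);
        return 0;
    }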