Merge with new changes including RowStackNode

This commit is contained in:
kaisheny 2015-06-16 16:01:19 -07:00
Родитель 01468f3fb6 99af4139a5
Коммит f332421b7b
32 изменённых файлов: 810 добавлений и 134 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -15,6 +15,7 @@ x64/
build/
[Bb]in/
[Oo]bj/
.run-*
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/

Просмотреть файл

@ -47,8 +47,8 @@ BinaryWriter<ElemType>::~BinaryWriter()
// miniBatchMode=Partial
// randomize=None
// wfile=c:\speech\mnist\mnist_test.bin
// #wsize - inital size of the file in MB
// # if calculated size would be bigger, that is used instead
// #wsize - initial size of the file in MB, defaults to 256
// # has to be large enough for your dataset. the file will shrink to the actual size when closed.
// #wsize=256
// #wrecords - number of records we should allocate space for in the file
// # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file

Просмотреть файл

@ -980,8 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
@ -1058,8 +1057,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data =*matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
@ -1134,8 +1132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data =
*matrices[iter->first]; // can be features or labels
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{

Просмотреть файл

@ -142,6 +142,15 @@ extern void _CHECKED_ASSERT_error(const char * file, int line, const char * exp)
#endif
#endif
/**
These macros are used for sentence segmentation information.
*/
#define SENTENCE_BEGIN 0
#define SENTENCE_MIDDLE 1
#define NO_LABELS -1
#define EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 0
#define NO_EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 1
// ----------------------------------------------------------------------------
// basic data types
// ----------------------------------------------------------------------------

Просмотреть файл

@ -382,47 +382,58 @@ public:
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
// OK, utterance has all we need --remember it
utteranceset.push_back (std::move (utterance));
if (m==0)
{
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
if (!labels.empty() && !lacksmlf)
//if (!labels.empty() && labelsiter != labels[0].end())
{
foreach_index (j, labels)
// first verify that all the label files have the proper duration
bool durationmatch = true;
foreach_index(j, labels)
{
const auto & labseq = labels[j].find(key)->second;
// check if durations match; skip if not
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes);
if (labframes != uttframes)
{
fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
nomlf++;
continue; // skip this utterance at all
durationmatch = false;
break; // continue; // skip this utterance at all
}
// expand classid sequence into flat array
foreach_index (i, labseq)
}
if (durationmatch){
utteranceset.push_back(std::move(utterance));
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
// then parse each mlf if the durations are consistent
foreach_index(j, labels)
{
const auto & e = labseq[i];
if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
if (e.classid >= udim[j])
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
if (e.classid != (CLASSIDTYPE) e.classid)
throw std::runtime_error ("CLASSIDTYPE has too few bits");
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
classids[j]->push_back ((CLASSIDTYPE) e.classid);
numclasses[j] = max (numclasses[j], 1u + e.classid);
counts[j].resize (numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}
classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker marker for checking
const auto & labseq = labels[j].find(key)->second;
// expand classid sequence into flat array
foreach_index(i, labseq)
{
const auto & e = labseq[i];
if ((i > 0 && labseq[i - 1].firstframe + labseq[i - 1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
if (e.classid >= udim[j])
throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
if (e.classid != (CLASSIDTYPE)e.classid)
throw std::runtime_error("CLASSIDTYPE has too few bits");
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
classids[j]->push_back((CLASSIDTYPE)e.classid);
numclasses[j] = max(numclasses[j], 1u + e.classid);
counts[j].resize(numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}
if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
classids[j]->push_back((CLASSIDTYPE)-1); // append a boundary marker for checking
if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
}
}
}
else{
@ -451,7 +462,7 @@ public:
}
if (nomlf + nolat > 0)
{
fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles[0].size(), nomlf, nolat);
if (nomlf + nolat > infiles[m].size() / 2)
throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
}

Просмотреть файл

@ -24,6 +24,7 @@
#define DATAREADER_EXPORTS // creating the exports here
#include "DataReader.h"
#include "HTKMLFReader.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT
#include <vld.h> // for memory leak detection
#endif

Просмотреть файл

@ -28,6 +28,7 @@
#include "DataWriter.h"
#include "commandArgUtil.h"
#include "HTKMLFWriter.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT
#include <vld.h> // for memory leak detection
#endif

Просмотреть файл

@ -2048,6 +2048,10 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
{
RuntimeError("GetLabelOutput::should use CPU for labels ");
}
if (curDevId != CPUDEVICE)
{
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
}
}
template<class ElemType>

Просмотреть файл

@ -11,6 +11,11 @@
#include <stdexcept>
#include <stdint.h>
#if WIN32
#define ftell64 _ftelli64
#else
#define ftell64 ftell
#endif
// SetState for a particular value
template <typename NumType, typename LabelType>
@ -362,10 +367,10 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" );
if (err)
std::runtime_error("UCIParser::ParseInit - error opening file");
throw std::runtime_error("UCIParser::ParseInit - error opening file");
int rc = _fseeki64(m_pFile, 0, SEEK_END);
if (rc)
std::runtime_error("UCIParser::ParseInit - error seeking in file");
throw std::runtime_error("UCIParser::ParseInit - error seeking in file");
m_fileSize = GetFilePosition();
m_fileBuffer = new BYTE[m_bufferSize];
@ -377,9 +382,9 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
template <typename NumType, typename LabelType>
int64_t UCIParser<NumType, LabelType>::GetFilePosition()
{
int64_t position = _ftelli64(m_pFile);
int64_t position = ftell64(m_pFile);
if (position == -1L)
std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
return position;
}
@ -392,7 +397,7 @@ void UCIParser<NumType, LabelType>::SetFilePosition(int64_t position)
{
int rc = _fseeki64(m_pFile, position, SEEK_SET);
if (rc)
std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
throw std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
// setup state machine to start at this position
PrepareStartPosition(position);
@ -445,7 +450,7 @@ size_t UCIParser<NumType, LabelType>::UpdateBuffer()
size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes;
size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile);
if (bytesRead == 0 && ferror(m_pFile))
std::runtime_error("UCIParser::UpdateBuffer - error reading file");
throw std::runtime_error("UCIParser::UpdateBuffer - error reading file");
return bytesRead;
}

Просмотреть файл

@ -90,8 +90,8 @@ private:
int m_elementsConvertedThisLine;
// global stats
int m_totalNumbersConverted;
int m_totalLabelsConverted;
int64_t m_totalNumbersConverted;
int64_t m_totalLabelsConverted;
// file positions/buffer
FILE * m_pFile;

Просмотреть файл

@ -1,8 +1,9 @@
# command=Simple_Demo_Output
RootDir=..
command=Simple_Demo:Simple_Demo_Output
# deviceId=-1 for CPU, >=0 for GPU devices
DeviceNumber=-1
#stderr=Demo
precision=float
@ -13,7 +14,6 @@ deviceId=$DeviceNumber$
outputNodeNames=ScaledLogLikelihood
traceLevel=1
#######################################
# TRAINING CONFIG (Simple, Fixed LR) #
#######################################
@ -52,22 +52,22 @@ Simple_Demo=[
reader=[
# reader to use
readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTrain.txt
file=$RootDir$/Demos/Simple/SimpleDataTrain.txt
miniBatchMode=Partial
randomize=Auto
verbosity=1
features=[
dim=2 # two-dimensional input data
dim=2 # two-dimensional input data
start=0 # Start with first element on line
]
labels=[
start=2 # Skip two elements
start=2 # Skip two elements
dim=1 # One label dimension
labelDim=2 # Two labels possible
labelMappingFile=../Demos/Simple/SimpleMapping.txt
labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
]
]
]
@ -84,16 +84,16 @@ Simple_Demo_Output=[
reader=[
# reader to use
readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTest.txt
file=$RootDir$/Demos/Simple/SimpleDataTest.txt
features=[
dim=2
start=0
start=0
]
labels=[
start=2
start=2
dim=1
labelDim=2
labelMappingFile=../Demos/Simple/SimpleMapping.txt
labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
]
]
outputPath=SimpleOutput # Dump output as text

Просмотреть файл

@ -550,41 +550,38 @@ public:
}
ComputationNodePtr nodePtr = GetNodeFromName(nodeName);
ComputationNodePtr childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4;
switch (numChildren)
std::vector<ComputationNodePtr> childrenNodes;
childrenNodes.resize(numChildren);
for (int j = 0; j < numChildren; j++)
childrenNodes[j] = GetNodeFromName(childrenNames[j]);
if (nodePtr->OperationName() == RowStackNode<ElemType>::TypeName()) //allow for variable input nodes
nodePtr->AttachInputs(childrenNodes);
else //fixed input nodes
{
case 1:
childNodePtr0 = GetNodeFromName(childrenNames[0]);
nodePtr->AttachInputs(childNodePtr0);
break;
case 2:
childNodePtr0 = GetNodeFromName(childrenNames[0]);
childNodePtr1 = GetNodeFromName(childrenNames[1]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1);
break;
case 3:
childNodePtr0 = GetNodeFromName(childrenNames[0]);
childNodePtr1 = GetNodeFromName(childrenNames[1]);
childNodePtr2 = GetNodeFromName(childrenNames[2]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2);
break;
case 4:
childNodePtr0 = GetNodeFromName(childrenNames[0]);
childNodePtr1 = GetNodeFromName(childrenNames[1]);
childNodePtr2 = GetNodeFromName(childrenNames[2]);
childNodePtr3 = GetNodeFromName(childrenNames[3]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3);
break;
case 5:
childNodePtr0 = GetNodeFromName(childrenNames[0]);
childNodePtr1 = GetNodeFromName(childrenNames[1]);
childNodePtr2 = GetNodeFromName(childrenNames[2]);
childNodePtr3 = GetNodeFromName(childrenNames[3]);
childNodePtr4 = GetNodeFromName(childrenNames[4]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4);
break;
default:
throw std::logic_error("Invalid number of children.");
switch (numChildren)
{
case 1:
nodePtr->AttachInputs(childrenNodes[0]);
break;
case 2:
nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]);
break;
case 3:
nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2]);
break;
case 4:
nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]);
break;
case 5:
nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]);
break;
case 6:
nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]);
break;
default:
throw std::logic_error("Invalid number of children.");
}
}
}
}
@ -1028,6 +1025,8 @@ public:
newNode = new LookupTableNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == RowSliceNode<ElemType>::TypeName())
newNode = new RowSliceNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == RowStackNode<ElemType>::TypeName())
newNode = new RowStackNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == GMMLogLikelihoodNode<ElemType>::TypeName())
newNode = new GMMLogLikelihoodNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == SequenceDecoderNode<ElemType>::TypeName())
@ -1209,6 +1208,8 @@ public:
newNode = new CosDistanceWithNegativeSamplesNode<ElemType>(m_deviceId, nodeName);
else if (nodeType == ParallelNode<ElemType>::TypeName())
newNode = new ParallelNode<ElemType>(m_deviceId, nodeName);
else if (nodeType == RowStackNode<ElemType>::TypeName())
newNode = new RowStackNode<ElemType>(m_deviceId, nodeName);
else
{
fprintf(stderr, "Error creating new ComputationNode of type %ls, with name %ls\n", nodeType.c_str(), nodeName.c_str());
@ -1582,6 +1583,15 @@ public:
return newNode;
}
// Create a RowStack node that vertically concatenates (stacks by rows) the given inputs,
// attach the inputs, register the node with the network, and return it.
// inputs:   nodes to stack; RowStackNode::Validate requires two or more, all with the
//           same number of columns.
// nodeName: optional name; RowStackNode generates a unique name when empty.
// Note: take the vector by const reference — the original by-value parameter copied
// the whole vector of node pointers on every call for no benefit.
ComputationNodePtr RowStack(const std::vector<ComputationNodePtr>& inputs, const std::wstring nodeName = L"")
{
    ComputationNodePtr newNode(new RowStackNode<ElemType>(m_deviceId, nodeName));
    newNode->AttachInputs(inputs);
    AddNodeToNet(newNode);
    return newNode;
}
ComputationNodePtr GMMLogLikelihood(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature, const std::wstring nodeName = L"")
{
ComputationNodePtr newNode(new GMMLogLikelihoodNode<ElemType>(m_deviceId, nodeName));

Просмотреть файл

@ -158,6 +158,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("This operation does not support six inputs.");
}
// Default implementation for nodes that take a fixed number of inputs: reject a
// variable-length input list. Nodes that accept arbitrarily many children
// (e.g. RowStackNode) override this to attach the whole vector.
virtual void AttachInputs(const std::vector<ComputationNodePtr>& /*inputs*/)
{
throw std::logic_error("This operation does not support variable-length inputs.");
}
virtual void DetachInputs()
{
m_children.resize(0);

Просмотреть файл

@ -399,6 +399,167 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class RowSliceNode<float>;
template class RowSliceNode<double>;
//this node stacks the rows of its inputs (vertical concatenation) to form the output
//all inputs must have the same number of columns since each column is treated as one sample
// RowStackNode vertically concatenates (stacks by rows) the function values of two
// or more input nodes. All inputs must have the same number of columns, since each
// column is treated as one sample; the output has sum-of-input-rows rows.
// Row offsets of each input within the stacked output are computed in Validate()
// and kept in m_startRowIndeces.
template<class ElemType>
class RowStackNode : public ComputationNode<ElemType>
{
UsingComputationNodeMembers;
public:
// construct a fresh node on the given device with an optional name
RowStackNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
{
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
m_deviceId = deviceId;
MoveMatricesToDevice(deviceId);
InitRecurrentNode();
}
// construct by deserializing node state from a model file stream
RowStackNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
{
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
LoadFromFile(fstream, modelVersion, deviceId);
}
// copy constructor
RowStackNode(const RowStackNode<ElemType>* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode<ElemType>(node->m_deviceId)
{
node->CopyTo(this, newName, flags);
}
// clone this node; keeps the current name when newName is empty
virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const
{
const std::wstring& name = (newName == L"") ? NodeName() : newName;
ComputationNodePtr node = new RowStackNode<ElemType>(this, name, flags);
return node;
}
// copy base-class state; when children are copied, also copy the per-child
// bookkeeping (row offsets and cached input-matrix pointers)
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
ComputationNode<ElemType>::CopyTo(nodeP, newName, flags);
RowStackNode<ElemType>* node = (RowStackNode<ElemType>*) nodeP;
if (flags & CopyNodeFlags::copyNodeChildren)
{
node->m_children = m_children;
node->m_startRowIndeces = m_startRowIndeces;
node->m_inputMatrices = m_inputMatrices;
}
}
virtual const std::wstring OperationName() const { return TypeName(); }
static const std::wstring TypeName() { return L"RowStack"; }
// gradient w.r.t. child inputIndex over the whole minibatch: add the child's row
// band [m_startRowIndeces[i], m_startRowIndeces[i+1]) of this node's gradient
// into the child's gradient
virtual void ComputeInputPartial(const size_t inputIndex)
{
if (inputIndex >= ChildrenSize())
throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range.");
ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex + 1] - m_startRowIndeces[inputIndex]);
}
// same as above, restricted to the columns of one time step
// (m_samplesInRecurrentStep parallel sequences per step)
virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq)
{
if (inputIndex >= ChildrenSize())
throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range.");
Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]);
}
// accumulate numRows rows of gradientValues, starting at startIndex, into inputGradientValues
static void WINAPI ComputeInputPartialS(Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const size_t startIndex, const size_t numRows)
{
inputGradientValues.AddWithRowSliceValuesOf(gradientValues, startIndex, numRows);
}
// forward pass over the whole minibatch (all columns of the first input)
virtual void EvaluateThisNode()
{
EvaluateThisNodeS(m_functionValues, m_inputMatrices, 0, Inputs(0)->FunctionValues().GetNumCols());
}
// forward pass for one time step's block of columns
virtual void EvaluateThisNode(const size_t timeIdxInSeq)
{
Matrix<ElemType> sliceFunctionValues = FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
}
// stack the selected column range of every input matrix into functionValues
static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
functionValues.AssignRowStackValuesOf(inputMatrices, sliceStartCol, sliceNumCols);
#if NANCHECK
functionValues.HasNan("RowStack");
#endif
}
// check input consistency (>= 2 children, non-null, non-empty, equal column
// counts), compute per-child row offsets, cache input-matrix pointers, and
// size the output to (sum of input rows) x (shared column count).
// NOTE(review): m_inputMatrices caches raw pointers to the children's
// FunctionValues here; this assumes that storage is not reallocated between
// Validate and evaluation — confirm against the evaluation engine.
virtual void Validate()
{
PrintSelfBeforeValidation();
unsigned int numInputs = ChildrenSize();
if (numInputs < 2)
LogicError("RowStack operation: must have two or more inputs.");
if (Inputs(0) == nullptr)
LogicError("RowStack operation: the input node is NULL.");
size_t numCols = Inputs(0)->FunctionValues().GetNumCols();
m_startRowIndeces.resize(ChildrenSize()+1);
m_inputMatrices.resize(ChildrenSize());
size_t totalRows = 0;
m_startRowIndeces[0] = 0;
for (int i = 0; i < ChildrenSize(); i++)
{
if (Inputs(i) == nullptr)
LogicError("RowStack operation: the input node is NULL.");
Matrix<ElemType>& childMatrix = Inputs(i)->FunctionValues();
size_t numRows = childMatrix.GetNumRows();
if (numRows == 0)
LogicError("RowStack operation: the input node %ls has 0 rows.", Inputs(i)->NodeName().c_str());
if (childMatrix.GetNumCols() != numCols)
LogicError("RowStack operation: the input node %ls has different number of columns.", Inputs(i)->NodeName().c_str());
totalRows += numRows;
m_inputMatrices[i] = &childMatrix;
m_startRowIndeces[i + 1] = m_startRowIndeces[i] + numRows;
}
FunctionValues().Resize(totalRows, numCols);
CopyImageSizeFromInputs();
}
// inherit image layout from the first input but override the height with the
// stacked row count; width/channel information of the children cannot be preserved
virtual void CopyImageSizeFromInputs()
{
CopyImageSizeFromInput(0, true);
m_outputHeight = FunctionValues().GetNumRows();
//WARNING: this node will destroy the image size information from the child
if (m_inputWidth * m_inputChannels != 1)
fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n");
}
// variable-length override: accept any number of children (base class throws)
virtual void AttachInputs(const std::vector<ComputationNodePtr>& inputs)
{
unsigned int numInputs = inputs.size();
m_children.resize(numInputs);
for (unsigned int i = 0; i < numInputs; i++)
m_children[i] = inputs[i];
}
private:
std::vector<size_t> m_startRowIndeces; //start row number in the stacked matrix of each input (child)
std::vector<const Matrix<ElemType>*> m_inputMatrices; // cached pointers to each child's FunctionValues, set in Validate()
};
template class RowStackNode<float>;
template class RowStackNode<double>;
template<class ElemType>
class ScaleNode : public ComputationNode<ElemType>
{

Просмотреть файл

@ -222,6 +222,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
ret = true;
else if (EqualInsensitive(nodeType, RowSliceNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, RowStackNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, LookupTableNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, GMMLogLikelihoodNode<ElemType>::TypeName(), L"GMMLL"))

Просмотреть файл

@ -218,10 +218,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (colBegin(i,0) == SENTENCE_MIDDLE)
{
Matrix<ElemType> to1 = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1);
Matrix<ElemType> frm1= gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1);
Matrix<ElemType> frm = gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1);
Matrix<ElemType> to = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1);
to1 += frm1;
to += frm;
}
}

Просмотреть файл

@ -1810,8 +1810,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
double val = w->FunctionValues()(0, 0);
/// the label is a dense matrix. each element is the word index
label = m_net->CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);

Просмотреть файл

@ -391,29 +391,43 @@ public:
{
std::vector<void*> inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass);
switch (inputs.size())
if (cnNodeType == RowStackNode<ElemType>::TypeName()) //support variable length inputs
{
case 1:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]));
break;
case 2:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]));
break;
case 3:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]));
break;
case 4:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]));
break;
case 5:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]));
break;
default:
if (nodeParamCount > 0)
RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str());
break;
}
std::vector<ComputationNodePtr> inputNodes;
inputNodes.resize(inputs.size());
for (int i = 0; i < inputs.size(); i++)
inputNodes[i] = ComputationNodePtr(inputs[i]);
nodePtr->AttachInputs(inputNodes);
}
else
{
switch (inputs.size())
{
case 1:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]));
break;
case 2:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]));
break;
case 3:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]));
break;
case 4:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]));
break;
case 5:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]));
break;
case 6:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]), ComputationNodePtr(inputs[5]));
break;
default:
if (nodeParamCount > 0)
RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str());
break;
}
}
// process common optional parameters (like "tag");
ProcessOptionalParameters(node);
break;

Просмотреть файл

@ -32,11 +32,11 @@ DEVICE = gpu
BUILDTYPE = debug
#BUILDTYPE = release
# comment following and uncomment the next one to enable MKL library
#MATHLIB = acml
MATHLIB = mkl
MATHLIB = acml
#MATHLIB = mkl
# modify relevant path below for your system
MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146
ACML_PATH = /usr/local/acml5.3.0/gfortran64
ACML_PATH = /usr/local/acml5.3.1/ifort64
#######
BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
@ -48,8 +48,8 @@ ifeq ($(BUILDTYPE),debug)
BUILDTYPE_OPT = -g
GPU_BUILDTYPE_OPT = -G
else
BUILDTYPE_OPT = -O4
GPU_BUILDTYPE_OPT =
BUILDTYPE_OPT = -O3 -flto
GPU_BUILDTYPE_OPT = -O3
endif
ifeq ($(MATHLIB),mkl)
@ -142,7 +142,7 @@ $(OBJDIR)/%.o : %.cu Makefile
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(BUILDTYPE_OPT) $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC
$(NVCC) -c $< -o $@ $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC
$(OBJDIR)/%.o : %.cpp Makefile
@echo $(SEPARATOR)

Просмотреть файл

@ -31,8 +31,8 @@ DEVICE = cpu
#BUILDTYPE = debug
BUILDTYPE = release
# comment following and uncomment the next one to enable MKL library
#MATHLIB = acml
MATHLIB = mkl
MATHLIB = acml
#MATHLIB = mkl
# modify relevant path below for your system
MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146
ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64

Просмотреть файл

@ -563,7 +563,7 @@ namespace CNTKMathTest
Assert::IsTrue(C.IsEqualTo(D1, 0.0001));
}
TEST_METHOD(CPUMatrixRowSlice)
TEST_METHOD(CPUMatrixRowSliceAndStack)
{
Matrix M0(5,3);
M0(0,0) = 1; M0(0,1) = 6; M0(0,2) = 11;
@ -590,6 +590,26 @@ namespace CNTKMathTest
M3 += M0;
M0.AddToRowSliceValuesOf(M1, 2,2);
Assert::IsTrue(M3.IsEqualTo(M0, 0.0001));
M2.AddWithRowSliceValuesOf(M1, 0, 2);
Matrix M4(2, 3);
M4(0, 0) = 6; M4(0, 1) = 16; M4(0, 2) = 26;
M4(1, 0) = 8; M4(1, 1) = 18; M4(1, 2) = 28;
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
Matrix M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
M7.AssignRowSliceValuesOf(M0, 3, 2);
std::vector<const Matrix*> inputMatrices;
inputMatrices.resize(3);
inputMatrices[0] = &M5;
inputMatrices[1] = &M6;
inputMatrices[2] = &M7;
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
}
TEST_METHOD(CPUAssignRepeatOf)

Просмотреть файл

@ -278,7 +278,7 @@ namespace CNTKMathTest
Assert::IsTrue(M2.IsEqualTo(M3, 0.0001f));
}
TEST_METHOD(GPUMatrixRowSlice)
TEST_METHOD(GPUMatrixRowSliceAndStack)
{
float *fArray = new float[15];
fArray[0] = 1; fArray[5] = 6; fArray[10] = 11;
@ -308,6 +308,27 @@ namespace CNTKMathTest
M3 += M0;
M0.AddToRowSliceValuesOf(M1, 2,2);
Assert::IsTrue(M3.IsEqualTo(M0, 0.0001));
M2.AddWithRowSliceValuesOf(M1, 0, 2);
float *fArray4 = new float[6];
fArray4[0] = 6; fArray4[2] = 16; fArray4[4] = 26;
fArray4[1] = 8; fArray4[3] = 18; fArray4[5] = 28;
GPUMatrix<float> M4(2, 3, fArray4, matrixFlagNormal);
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
GPUMatrix<float> M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
M7.AssignRowSliceValuesOf(M0, 3, 2);
std::vector<const GPUMatrix<float> *> inputMatrices;
inputMatrices.resize(3);
inputMatrices[0] = &M5;
inputMatrices[1] = &M6;
inputMatrices[2] = &M7;
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
}
TEST_METHOD(GPUKhatriRaoProduct)

Просмотреть файл

@ -429,6 +429,48 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
//stack the rows of the matrices in inputMatrices (taking columns sliceStartCol .. sliceStartCol+sliceNumCols-1 of each) and assign the result to [this] object.
// inputMatrices: non-empty list of non-empty matrices; each must have at least sliceStartCol + sliceNumCols columns.
// Throws via LogicError on invalid input.
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    if (sliceNumCols == 0)
        LogicError("AssignRowStackValuesOf: sliceNumCols should > 0.");
    if (inputMatrices.empty())
        LogicError("AssignRowStackValuesOf: inputMatrices is empty.");

    // Per-input starting row offsets within the stacked result.
    // std::vector (RAII) instead of the original raw new[]: the raw buffer
    // leaked whenever a LogicError was thrown inside the loop below, and
    // writing startRowIndeces[0] on an empty input list was undefined behavior.
    std::vector<size_t> startRowIndeces(inputMatrices.size(), 0);
    size_t totalRows = 0;
    for (size_t i = 0; i < inputMatrices.size(); i++)
    {
        const CPUMatrix<ElemType>& a = *inputMatrices[i];
        if (a.IsEmpty())
            LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", (int)i);
        if (a.GetNumCols() < sliceStartCol + sliceNumCols)
            LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", (int)i);
        totalRows += a.GetNumRows();
        if (i < inputMatrices.size() - 1)
            startRowIndeces[i + 1] = startRowIndeces[i] + a.GetNumRows();
    }

    Resize(totalRows, sliceNumCols);

    auto& us = *this;
#pragma omp parallel for
    for (long j = 0; j < (long)sliceNumCols; j++)
    {
        for (size_t i = 0; i < inputMatrices.size(); i++)
        {
            // each input's rows occupy a contiguous band of the destination column,
            // so one memcpy per (input, column) pair suffices (column-major storage)
            memcpy(&us(startRowIndeces[i], j), &(*inputMatrices[i])(0, sliceStartCol + j), inputMatrices[i]->GetNumRows() * sizeof(ElemType));
        }
    }

    return *this;
}
template<class ElemType>
void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position)
{
@ -672,16 +714,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if it's externally managed, then populate the structure
if (matrixFlags&matrixFlagDontOwnBuffer)
{
// free previous array allocation if any before overwriting
if (m_pArray != nullptr)
delete [] m_pArray;
m_pArray = pArray;
m_numRows = numRows;
m_numCols = numCols;
// free previous array allocation if any before overwriting
if (m_pArray != nullptr)
delete[] m_pArray;
m_pArray = pArray;
m_elemSizeAllocated = GetNumElements();
m_externalBuffer = true;
}
@ -3877,7 +3916,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void CPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a,
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, size_t sampleCount, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c)
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c)
//this: samples+probs
// a: hidden
// b: embedding
@ -3892,7 +3931,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::cerr << endl;
}
*/
sampleCount *= 1;
double log_likelihood = 0.0;
size_t sample_size = this->GetNumRows() / 2;
size_t batch_size = this->GetNumCols();

Просмотреть файл

@ -216,7 +216,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignVectorNorm2Of(CPUMatrix<ElemType>& a, const bool isColWise);
void AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias,
size_t sampleCount, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c);
CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c);
void AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a,
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& c);
@ -244,6 +244,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
CPUMatrix<ElemType>& AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);

Просмотреть файл

@ -678,6 +678,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
//stack the rows of the matrices in inputMatrices (using columns [sliceStartCol, sliceStartCol+sliceNumCols) of each) and assign the result to [this] object.
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    if (sliceNumCols == 0)
        LogicError("AssignRowStackValuesOf: sliceNumCols should > 0.");
    if (inputMatrices.empty())
        LogicError("AssignRowStackValuesOf: inputMatrices is empty.");

    // Host-side staging buffers. std::vector (instead of raw new[]) guarantees
    // they are released even when LogicError or a failing CUDA_CALL throws below.
    size_t totalRows = 0;
    std::vector<size_t> startRowIndeces(inputMatrices.size() + 1);
    std::vector<ElemType*> bufferPointersInInputMatrices(inputMatrices.size());
    startRowIndeces[0] = 0;
    for (int i = 0; i < (int)inputMatrices.size(); i++)
    {
        const GPUMatrix<ElemType>& a = *inputMatrices[i];
        if (a.IsEmpty())
            LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i);
        if (a.GetNumCols() < sliceStartCol + sliceNumCols)
            LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i);
        totalRows += a.GetNumRows();
        startRowIndeces[i + 1] = startRowIndeces[i] + a.GetNumRows();
        // Device pointer to the first element of column sliceStartCol of input i.
        bufferPointersInInputMatrices[i] = a.m_pArray + a.LocateColumn(sliceStartCol);
    }

    Resize(totalRows, sliceNumCols);
    PrepareDevice();

    // Ship the per-input device pointers and cumulative row offsets to the GPU
    // so the kernel can locate the source of every destination element.
    ElemType** bufferPointersInGPU = NULL;
    CUDA_CALL(cudaMalloc((void***)&bufferPointersInGPU, inputMatrices.size()*sizeof(ElemType*)));
    CUDA_CALL(cudaMemcpy(bufferPointersInGPU, bufferPointersInInputMatrices.data(), inputMatrices.size()*sizeof(ElemType*), cudaMemcpyHostToDevice));

    size_t* startRowIndecesInGPU = NULL;
    CUDA_CALL(cudaMalloc((void**)&startRowIndecesInGPU, (1+inputMatrices.size())*sizeof(size_t)));
    CUDA_CALL(cudaMemcpy(startRowIndecesInGPU, startRowIndeces.data(), (1+inputMatrices.size())*sizeof(size_t), cudaMemcpyHostToDevice));

    // One thread per destination element.
    LONG64 N = (LONG64)GetNumElements();
    int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    _assignRowStackValuesOf<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(m_pArray, bufferPointersInGPU, startRowIndecesInGPU, (long) inputMatrices.size(), N, (long)GetNumRows(), (long)GetNumCols());
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));

    CUDA_CALL(cudaFree(bufferPointersInGPU));
    CUDA_CALL(cudaFree(startRowIndecesInGPU));

    return *this;
}
/// c = c - 1.0 for a specific position
template<class ElemType>
void GPUMatrix<ElemType>::MinusOneAt(GPUMatrix<ElemType>& c, const size_t position)

Просмотреть файл

@ -274,6 +274,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>& AssignRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddToRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
GPUMatrix<ElemType>& AssignRepeatOf(const GPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
GPUMatrix<ElemType>& AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

Просмотреть файл

@ -377,6 +377,27 @@ __global__ void _addWithRowSliceValuesOf(ElemType * dest, ElemType * src, const
dest[id] += src[IDX2C(row + startIndex, col, srcRows)];
}
// Kernel backing GPUMatrix::AssignRowStackValuesOf: each thread writes one
// element of the row-stacked destination matrix (column-major layout), reading
// it from the source matrix that owns that destination row.
//   dest            - destination buffer, destRows x destCols
//   srces           - device array of numSrces pointers, each to the first
//                     element of the selected column slice of one input
//   startRowIndeces - device array of numSrces+1 cumulative row offsets;
//                     startRowIndeces[i] is the first destination row covered
//                     by input i, startRowIndeces[0] == 0
//   N               - total number of destination elements (destRows * destCols)
template<class ElemType>
__global__ void _assignRowStackValuesOf(ElemType * dest, ElemType ** srces, size_t* startRowIndeces, const LONG64 numSrces, const LONG64 N, const LONG64 destRows, const long destCols)
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    // Recover this thread's (row, col) from the linear element index.
    long col = id / destRows; //dest is the full matrix, rowslice is taken from the src
    long row = id - (col * destRows);

    // Linear scan over the cumulative offsets: the owning source is the first i
    // with startRowIndeces[i + 1] > row.
    //can we replace the for loop with something better?
    int srcId = 0;
    for (; srcId < numSrces; srcId++)
    {
        if (startRowIndeces[srcId + 1]>row)
            break;
    }

    // Translate the destination row into the source's local row; the source's
    // row count is the difference of consecutive cumulative offsets.
    dest[id] = srces[srcId][IDX2C(row - startRowIndeces[srcId], col, startRowIndeces[srcId+1] - startRowIndeces[srcId])];
}
template<class ElemType>
__global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows)
{

Просмотреть файл

@ -79,16 +79,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t MajorIndexCount() const
{
return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
}
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
{
if (format == matrixFormatSparseBlockCol)
return numCols;
else if (format == matrixFormatSparseBlockRow)
return numRows;
else
return numNZReserved;
return numNZ;
}
size_t MajorIndexSize() const // actual number of major index bytes in use
{

Просмотреть файл

@ -1520,6 +1520,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
// Device-dispatch wrapper: moves every input onto the same device as [this],
// then forwards to the CPU or GPU dense implementation. Sparse matrices are
// not supported (NOT_IMPLEMENTED).
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    // Reconcile device placement input-by-input; each call may migrate data so
    // [this] and input i end up co-located before the type check below.
    for (int i = 0; i < inputMatrices.size(); i++)
    {
        const Matrix<ElemType>& a = *inputMatrices[i];
        DecideAndMoveToRightDevice(*this, a);

        //WARNING: a and this must have same type
        if (!(GetMatrixType() == a.GetMatrixType()))
            NOT_IMPLEMENTED;
    }

    CurrentDataLocation curLocation = GetCurrentMatrixLocation();
    if (curLocation == CurrentDataLocation::GPU || curLocation == CurrentDataLocation::BOTH)
    {
        if (GetMatrixType() != MatrixType::SPARSE)
        {
            //GPUDense;
            // Unwrap the GPU implementation objects and delegate.
            std::vector<const GPUMatrix<ElemType>*> gpuInputMatrices;
            gpuInputMatrices.resize(inputMatrices.size());
            for (int i = 0; i < inputMatrices.size(); i++)
                gpuInputMatrices[i] = inputMatrices[i]->m_GPUMatrix;
            m_GPUMatrix->AssignRowStackValuesOf(gpuInputMatrices, sliceStartCol, sliceNumCols);

            // Result now lives only on the GPU.
            SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE);
        }
        else
        {
            NOT_IMPLEMENTED;
        }
    }
    else if (curLocation == CurrentDataLocation::CPU)
    {
        if (GetMatrixType() != MatrixType::SPARSE)
        {
            //CPUDense;
            // Unwrap the CPU implementation objects and delegate.
            std::vector<const CPUMatrix<ElemType>*> cpuInputMatrices;
            cpuInputMatrices.resize(inputMatrices.size());
            for (int i = 0; i < inputMatrices.size(); i++)
                cpuInputMatrices[i] = inputMatrices[i]->m_CPUMatrix;
            m_CPUMatrix->AssignRowStackValuesOf(cpuInputMatrices, sliceStartCol, sliceNumCols);

            // Result now lives only on the CPU.
            SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE);
        }
        else
        {
            NOT_IMPLEMENTED;
        }
    }
    else
    {
        throw std::runtime_error("Matrices do not exist in either CPU or GPU.");
    }

    return *this;
}
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
{
@ -3600,7 +3662,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows();
tmp.Resize(a.GetNumRows() / 2, sampleCount);
a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, sampleCount, *tmp.m_CPUMatrix, *this->m_CPUMatrix);
a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *this->m_CPUMatrix);
}
else
{

Просмотреть файл

@ -259,6 +259,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

Просмотреть файл

@ -479,6 +479,7 @@ namespace Microsoft {
//for each column of a, we add all rows of a to this starting from startIndex
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddToRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
// CPU-only build stub: no GPU available, so stacking is a no-op that returns *this.
// (Fix: the line was missing the template declaration and GPUMatrix<ElemType>:: qualification
// that every sibling stub in this file carries, so the NoGPU build could not compile it.)
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& /*inputMatrices*/, const size_t /*sliceStartCol*/, const size_t /*sliceNumCols*/) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRepeatOf(const GPUMatrix<ElemType>& /*a*/, const size_t numRowRepeats, const size_t numColRepeats) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { return *this; }

234
Scripts/build-and-test Normal file
Просмотреть файл

@ -0,0 +1,234 @@
#!/bin/bash
# build-and-test: builds CNTK (debug and release flavors) and runs the Simple
# demo on both CPU and GPU targets, checking the run output for the expected
# device banner and for exceptions. Supported hosts: Linux, and Windows via a
# Cygwin shell with the Visual C++ environment loaded.

# Setting some default values
BUILD=1
RUN=1
CLEAN_AFTER=0
CLEAN_BEFORE=0

# parsing command line arguments:
while [[ $# > 0 ]]
do
    key="$1"

    case $key in
        -h|--help)
            echo "Usage: build-and-test [options]"
            echo "Options:"
            echo "  -q|--quiet-build - redirect build output to file (by default those will be in <cntk_root>.run-<operating_system>-*)"
            echo "  -r|--run-only - elides build step, runs the binaries that have already been built"
            echo "  -b|--build-only - just build, do not run"
            echo "  -cb|--clean-build - clean up the enlistment binaries before build"
            echo "  -o|--output-directory <output_dir> - specify output directory to use"
            echo "The root directory used to build and run CNTK is the one hosting the Scripts directory that contains this script"
            exit 1
            ;;
        # accept both spellings: the help text advertises --quiet-build
        -q|--quiet|--quiet-build)
            QUIET_BUILD=1
            ;;
        -r|--run-only)
            BUILD=0
            RUN=1
            ;;
        -b|--build-only)
            BUILD=1
            RUN=0
            ;;
        -cb|--clean-build)
            CLEAN_BEFORE=1
            BUILD=1
            ;;
        -o|--output-directory)
            OUTPUT_DIR="$2"
            shift # past argument
            ;;
        *)
            echo "Unknown option $key"
            exit 1
            ;;
    esac
    shift # past argument or value
done

# Step 0 -- Validate all necessary prerequisites and check for incompatible options

# It is possible to use this script on Windows to build CNTK
# from Cygwin window with Visual C++ environment loaded.
# In that case OS environment variable will be set and we
# can use it to differentiate from Linux.
if [[ $CLEAN_BEFORE == 1 && $RUN == 1 && $BUILD == 0 ]]; then
    echo "============ ERROR: Incompatible options RUN and CLEAN_BEFORE set without BUILD ============"
    exit 1
fi

if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
    DEBUG_DIR=Debug
    RELEASE_DIR=Release
    PREFIX_DIR=x64
    BIN_NAME=CNTK.exe
    BUILD_OS="windows"

    if [[ $VS120COMNTOOLS == "" ]]; then
        echo "============ Visual Studio 12.0 environment not properly setup or VS not installed ============"
        echo "============ Please find and run the appropriate vcvarsall.bat script ============"
        exit 1
    fi

    if [[ $ACML_PATH == "" ]]; then
        echo "============ ACML path not set ============"
        echo "============ ACML libraries are needed to successfully build CNTK ============"
        exit 1
    fi
elif [[ $OSTYPE == "linux-gnu" ]]; then
    DEBUG_DIR=x86_64.gpu.debug.acml
    RELEASE_DIR=x86_64.gpu.release.acml
    PREFIX_DIR=bin
    BIN_NAME=cntk
    MAKEFILE=Makefile.gpu
    BUILD_OS="linux"
else
    echo "============ ERROR: Unsupported OS ============"
    echo "============ Scripts supports only building from Linux and Windows through Cygwin ============"
    exit 1
fi

# Step 1 -- Prepare temporary folders and files, tweak settings if necessary

# Get to the root path from which we know how to build and run
SCRIPT=`readlink -f $0`
SCRIPT_DIR=`dirname $SCRIPT`
CNTK_ROOT=`dirname $SCRIPT_DIR`

# Setup the output directory
if [[ $OUTPUT_DIR == "" ]]; then
    OUTPUT_DIR="$CNTK_ROOT/.run-$BUILD_OS-$RANDOM"
fi

# (fix: this message previously referenced the never-defined $TMP_ROOT)
echo "============ Creating CNTK temp directory in $OUTPUT_DIR ============"
mkdir -p $OUTPUT_DIR || exit $?

CONF_FILE="$OUTPUT_DIR/Simple.conf"
BUILD_FILE="$OUTPUT_DIR/Build"
RUN_FILE="$OUTPUT_DIR/Result"

if ! [[ -d "$CNTK_ROOT/MachineLearning" ]]; then
    echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============"
    exit 1
fi
cd $CNTK_ROOT

if ! [[ -f $CONF_FILE ]]; then
    cp Demos/Simple/Simple.config $CONF_FILE || exit $?

    # This chmod is necessary due to restrictive Cygwin interpretation of Windows permissions.
    # Cygwin interprets Windows permissions as ----rwx---, which lacks read permissions for user.
    chmod a+r $CONF_FILE || exit $?
fi

if [[ $QUIET_BUILD == 1 ]]; then
    echo "============ WARNING: You have selected quiet build. All build output will be placed in ($OUTPUT_DIR) ============"
fi

# Default the build-output file descriptors to stdout/stderr so later steps
# (e.g. cleanup without a preceding build) never write to an unopened fd.
exec 6>&1
exec 7>&2

# Step 2 -- Build the project debug and release, if requested
if [[ $BUILD == 1 ]]; then
    for FLAVOR in debug release
    do
        # Our make is too noisy right now and it is difficult to spot
        # issues from stdout and stderr. In the quiet mode these are
        # redirected to a file where they could be examined after the fact
        if [[ $QUIET_BUILD == 1 ]]; then
            exec 6>$BUILD_FILE.$FLAVOR.out || exit $?
            exec 7>$BUILD_FILE.$FLAVOR.err || exit $?
        else
            exec 6>&1 || exit $?
            exec 7>&2 || exit $?
        fi

        echo "============ Building CNTK $FLAVOR (clean=$CLEAN_BEFORE) ============"
        if [[ $OS == "Windows_NT" ]]; then
            if [[ $CLEAN_BEFORE == 1 ]]; then
                msbuild.exe /property:Configuration=$FLAVOR /t:Clean 1>&6 2>&7 || exit $?
            fi
            msbuild.exe /property:Configuration=$FLAVOR /m 1>&6 2>&7 || exit $?
        else
            if [[ $CLEAN_BEFORE == 1 ]]; then
                make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
            fi
            make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $?
        fi
        chmod a+r $BUILD_FILE.*
    done
fi

# Step 3 -- Run the project tests, both debug and release, if requested
if [[ $RUN == 1 ]]; then
    if ! [[ -f "$CNTK_ROOT/$PREFIX_DIR/$DEBUG_DIR/$BIN_NAME" && -f "$CNTK_ROOT/$PREFIX_DIR/$RELEASE_DIR/$BIN_NAME" ]]; then
        echo "============ ERROR: CNTK did not build properly ============"
        exit 1
    fi
    cd $PREFIX_DIR

    for TARGET in CPU GPU
    do
        # These sed scripts are simply toggling DeviceNumber argument in the config file
        # If it is set to Auto, it will pick GPU over CPU. At -1 CPU is selected.
        if [[ $TARGET == CPU ]]; then
            sed -i -e 's/^DeviceNumber.*/DeviceNumber=-1/g' $CONF_FILE || exit $?
        else
            sed -i -e 's/^DeviceNumber.*/DeviceNumber=Auto/g' $CONF_FILE || exit $?
        fi

        for FLAVOR in debug release
        do
            # (fix: the comparison previously read "FLAVOR" without a $, which
            # never matched, so the debug binary was never actually exercised)
            if [[ $FLAVOR == "debug" ]]; then
                FLAVOR_DIR="$DEBUG_DIR"
            else
                FLAVOR_DIR="$RELEASE_DIR"
            fi
            OUT_FILE="$RUN_FILE.$FLAVOR.out"

            echo "============ Running CNTK for ($FLAVOR) ($TARGET), output in ($RUN_FILE.*) ============"
            rm -rf models
            if [[ $OS == "Windows_NT" ]]; then
                # We have to use cygpath on Windows to modify the file paths into the format readable by cntk.
                time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $CONF_FILE`" &>$OUT_FILE || exit $?
            else
                time ./$FLAVOR_DIR/$BIN_NAME configFile=$CONF_FILE &>$OUT_FILE || exit $?
            fi
            chmod a+r $RUN_FILE.*

            # Check if execution was successful
            grep -q "Using $TARGET" "$OUT_FILE" || {
                echo "============ ERROR: Run output (in $OUT_FILE) did not contain information about target device ($TARGET) ============"
                exit 1
            }
            grep -q "EXCEPTION" "$OUT_FILE" && {
                echo "============ ERROR: Run output in ($OUT_FILE) contains exceptions ============"
                grep "EXCEPTION" "$OUT_FILE"
                exit 1
            }
        done
    done
fi

# Step 4 -- Optionally clean after builds and tests
if [[ $CLEAN_AFTER == 1 ]]; then
    rm -rf models
    cd $CNTK_ROOT
    for FLAVOR in debug release
    do
        echo "============ Cleaning up CNTK $FLAVOR ============"
        if [[ $OS == "Windows_NT" ]]; then
            msbuild.exe /property:Configuration=$FLAVOR /t:clean 1>&6 2>&7 || exit $?
        else
            make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
        fi
    done
    rm -rf $OUTPUT_DIR
fi

echo "============ Build and test of CNTK was successful! ============"