Merge with new changes including RowStackNode

This commit is contained in:
kaisheny 2015-06-16 16:01:19 -07:00
Родитель 01468f3fb6 99af4139a5
Коммит f332421b7b
32 изменённых файлов: 810 добавлений и 134 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -15,6 +15,7 @@ x64/
build/ build/
[Bb]in/ [Bb]in/
[Oo]bj/ [Oo]bj/
.run-*
# Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets # Enable "build/" folder in the NuGet Packages folder since NuGet packages use it for MSBuild targets
!packages/*/build/ !packages/*/build/

Просмотреть файл

@ -47,8 +47,8 @@ BinaryWriter<ElemType>::~BinaryWriter()
// miniBatchMode=Partial // miniBatchMode=Partial
// randomize=None // randomize=None
// wfile=c:\speech\mnist\mnist_test.bin // wfile=c:\speech\mnist\mnist_test.bin
// #wsize - inital size of the file in MB // #wsize - inital size of the file in MB default to 256
// # if calculated size would be bigger, that is used instead // # has to be large enough for your dataset. the file will shrink to the actual size when closed.
// #wsize=256 // #wsize=256
// #wrecords - number of records we should allocate space for in the file // #wrecords - number of records we should allocate space for in the file
// # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file // # files cannot be expanded, so this should be large enough. If known modify this element in config before creating file

Просмотреть файл

@ -980,8 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ {
// dereference matrix that corresponds to key (input/output name) and // dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label // populate based on whether its a feature or a label
//Matrix<ElemType>& data = //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
*matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{ {
@ -1058,8 +1057,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ {
// dereference matrix that corresponds to key (input/output name) and // dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label // populate based on whether its a feature or a label
//Matrix<ElemType>& data = //Matrix<ElemType>& data =*matrices[iter->first]; // can be features or labels
*matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{ {
@ -1134,8 +1132,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ {
// dereference matrix that corresponds to key (input/output name) and // dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label // populate based on whether its a feature or a label
//Matrix<ElemType>& data = //Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
*matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real) if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{ {

Просмотреть файл

@ -142,6 +142,15 @@ extern void _CHECKED_ASSERT_error(const char * file, int line, const char * exp)
#endif #endif
#endif #endif
/**
These macros are used for sentence segmentation information.
*/
#define SENTENCE_BEGIN 0
#define SENTENCE_MIDDLE 1
#define NO_LABELS -1
#define EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 0
#define NO_EXISTS_SENTENCE_BEGIN_OR_NO_LABELS 1
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// basic data types // basic data types
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------

Просмотреть файл

@ -382,47 +382,58 @@ public:
// TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore. // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
// OK, utterance has all we need --remember it // OK, utterance has all we need --remember it
utteranceset.push_back (std::move (utterance));
if (m==0) if (m==0)
{ {
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
if (!labels.empty() && !lacksmlf) if (!labels.empty() && !lacksmlf)
//if (!labels.empty() && labelsiter != labels[0].end()) //if (!labels.empty() && labelsiter != labels[0].end())
{ {
foreach_index (j, labels) // first verify that all the label files have the proper duration
bool durationmatch = true;
foreach_index(j, labels)
{ {
const auto & labseq = labels[j].find(key)->second; const auto & labseq = labels[j].find(key)->second;
// check if durations match; skip if not // check if durations match; skip if not
size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes); size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes);
if (labframes != uttframes) if (labframes != uttframes)
{ {
fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str()); fprintf(stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
nomlf++; nomlf++;
continue; // skip this utterance at all durationmatch = false;
break; // continue; // skip this utterance at all
} }
// expand classid sequence into flat array }
foreach_index (i, labseq) if (durationmatch){
utteranceset.push_back(std::move(utterance));
_totalframes += uttframes;
framesaccum.push_back(uttframes); //track number of frames in each utterance - first feature is the reference
// then parse each mlf if the durations are consistent
foreach_index(j, labels)
{ {
const auto & e = labseq[i]; const auto & labseq = labels[j].find(key)->second;
if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0)) // expand classid sequence into flat array
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str())); foreach_index(i, labseq)
if (e.classid >= udim[j]) {
throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str())); const auto & e = labseq[i];
if (e.classid != (CLASSIDTYPE) e.classid) if ((i > 0 && labseq[i - 1].firstframe + labseq[i - 1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
throw std::runtime_error ("CLASSIDTYPE has too few bits"); throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++) if (e.classid >= udim[j])
classids[j]->push_back ((CLASSIDTYPE) e.classid); throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id %d exceeds model output dimension %d in file %S", e.classid, udim, key.c_str()));
numclasses[j] = max (numclasses[j], 1u + e.classid); if (e.classid != (CLASSIDTYPE)e.classid)
counts[j].resize (numclasses[j], 0); throw std::runtime_error("CLASSIDTYPE has too few bits");
counts[j][e.classid] += e.numframes; for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
} classids[j]->push_back((CLASSIDTYPE)e.classid);
classids[j]->push_back ((CLASSIDTYPE) -1); // append a boundary marker marker for checking numclasses[j] = max(numclasses[j], 1u + e.classid);
counts[j].resize(numclasses[j], 0);
counts[j][e.classid] += e.numframes;
}
if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size()) classids[j]->push_back((CLASSIDTYPE)-1); // append a boundary marker marker for checking
throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size()); if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
}
} }
} }
else{ else{
@ -451,7 +462,7 @@ public:
} }
if (nomlf + nolat > 0) if (nomlf + nolat > 0)
{ {
fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat); fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles[0].size(), nomlf, nolat);
if (nomlf + nolat > infiles[m].size() / 2) if (nomlf + nolat > infiles[m].size() / 2)
throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n"); throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
} }

Просмотреть файл

@ -24,6 +24,7 @@
#define DATAREADER_EXPORTS // creating the exports here #define DATAREADER_EXPORTS // creating the exports here
#include "DataReader.h" #include "DataReader.h"
#include "HTKMLFReader.h" #include "HTKMLFReader.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT #ifdef LEAKDETECT
#include <vld.h> // for memory leak detection #include <vld.h> // for memory leak detection
#endif #endif

Просмотреть файл

@ -28,6 +28,7 @@
#include "DataWriter.h" #include "DataWriter.h"
#include "commandArgUtil.h" #include "commandArgUtil.h"
#include "HTKMLFWriter.h" #include "HTKMLFWriter.h"
#include "commandArgUtil.h"
#ifdef LEAKDETECT #ifdef LEAKDETECT
#include <vld.h> // for memory leak detection #include <vld.h> // for memory leak detection
#endif #endif

Просмотреть файл

@ -2048,6 +2048,10 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
{ {
RuntimeError("GetLabelOutput::should use CPU for labels "); RuntimeError("GetLabelOutput::should use CPU for labels ");
} }
if (curDevId != CPUDEVICE)
{
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
}
} }
template<class ElemType> template<class ElemType>

Просмотреть файл

@ -11,6 +11,11 @@
#include <stdexcept> #include <stdexcept>
#include <stdint.h> #include <stdint.h>
#if WIN32
#define ftell64 _ftelli64
#else
#define ftell64 ftell
#endif
// SetState for a particular value // SetState for a particular value
template <typename NumType, typename LabelType> template <typename NumType, typename LabelType>
@ -362,10 +367,10 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" ); errno_t err = _wfopen_s( &m_pFile, fileName, L"rb" );
if (err) if (err)
std::runtime_error("UCIParser::ParseInit - error opening file"); throw std::runtime_error("UCIParser::ParseInit - error opening file");
int rc = _fseeki64(m_pFile, 0, SEEK_END); int rc = _fseeki64(m_pFile, 0, SEEK_END);
if (rc) if (rc)
std::runtime_error("UCIParser::ParseInit - error seeking in file"); throw std::runtime_error("UCIParser::ParseInit - error seeking in file");
m_fileSize = GetFilePosition(); m_fileSize = GetFilePosition();
m_fileBuffer = new BYTE[m_bufferSize]; m_fileBuffer = new BYTE[m_bufferSize];
@ -377,9 +382,9 @@ void UCIParser<NumType, LabelType>::ParseInit(LPCWSTR fileName, size_t startFeat
template <typename NumType, typename LabelType> template <typename NumType, typename LabelType>
int64_t UCIParser<NumType, LabelType>::GetFilePosition() int64_t UCIParser<NumType, LabelType>::GetFilePosition()
{ {
int64_t position = _ftelli64(m_pFile); int64_t position = ftell64(m_pFile);
if (position == -1L) if (position == -1L)
std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file"); throw std::runtime_error("UCIParser::GetFilePosition - error retrieving file position in file");
return position; return position;
} }
@ -392,7 +397,7 @@ void UCIParser<NumType, LabelType>::SetFilePosition(int64_t position)
{ {
int rc = _fseeki64(m_pFile, position, SEEK_SET); int rc = _fseeki64(m_pFile, position, SEEK_SET);
if (rc) if (rc)
std::runtime_error("UCIParser::SetFilePosition - error seeking in file"); throw std::runtime_error("UCIParser::SetFilePosition - error seeking in file");
// setup state machine to start at this position // setup state machine to start at this position
PrepareStartPosition(position); PrepareStartPosition(position);
@ -445,7 +450,7 @@ size_t UCIParser<NumType, LabelType>::UpdateBuffer()
size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes; size_t bytesToRead = min(m_bufferSize, m_fileSize-m_bufferStart)-saveBytes;
size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile); size_t bytesRead = fread(m_fileBuffer+saveBytes, 1, bytesToRead, m_pFile);
if (bytesRead == 0 && ferror(m_pFile)) if (bytesRead == 0 && ferror(m_pFile))
std::runtime_error("UCIParser::UpdateBuffer - error reading file"); throw std::runtime_error("UCIParser::UpdateBuffer - error reading file");
return bytesRead; return bytesRead;
} }

Просмотреть файл

@ -90,8 +90,8 @@ private:
int m_elementsConvertedThisLine; int m_elementsConvertedThisLine;
// global stats // global stats
int m_totalNumbersConverted; int64_t m_totalNumbersConverted;
int m_totalLabelsConverted; int64_t m_totalLabelsConverted;
// file positions/buffer // file positions/buffer
FILE * m_pFile; FILE * m_pFile;

Просмотреть файл

@ -1,8 +1,9 @@
# command=Simple_Demo_Output RootDir=..
command=Simple_Demo:Simple_Demo_Output command=Simple_Demo:Simple_Demo_Output
# deviceId=-1 for CPU, >=0 for GPU devices # deviceId=-1 for CPU, >=0 for GPU devices
DeviceNumber=-1 DeviceNumber=-1
#stderr=Demo #stderr=Demo
precision=float precision=float
@ -13,7 +14,6 @@ deviceId=$DeviceNumber$
outputNodeNames=ScaledLogLikelihood outputNodeNames=ScaledLogLikelihood
traceLevel=1 traceLevel=1
####################################### #######################################
# TRAINING CONFIG (Simple, Fixed LR) # # TRAINING CONFIG (Simple, Fixed LR) #
####################################### #######################################
@ -52,22 +52,22 @@ Simple_Demo=[
reader=[ reader=[
# reader to use # reader to use
readerType=UCIFastReader readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTrain.txt file=$RootDir$/Demos/Simple/SimpleDataTrain.txt
miniBatchMode=Partial miniBatchMode=Partial
randomize=Auto randomize=Auto
verbosity=1 verbosity=1
features=[ features=[
dim=2 # two-dimensional input data dim=2 # two-dimensional input data
start=0 # Start with first element on line start=0 # Start with first element on line
] ]
labels=[ labels=[
start=2 # Skip two elements start=2 # Skip two elements
dim=1 # One label dimension dim=1 # One label dimension
labelDim=2 # Two labels possible labelDim=2 # Two labels possible
labelMappingFile=../Demos/Simple/SimpleMapping.txt labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
] ]
] ]
] ]
@ -84,16 +84,16 @@ Simple_Demo_Output=[
reader=[ reader=[
# reader to use # reader to use
readerType=UCIFastReader readerType=UCIFastReader
file=../Demos/Simple/SimpleDataTest.txt file=$RootDir$/Demos/Simple/SimpleDataTest.txt
features=[ features=[
dim=2 dim=2
start=0 start=0
] ]
labels=[ labels=[
start=2 start=2
dim=1 dim=1
labelDim=2 labelDim=2
labelMappingFile=../Demos/Simple/SimpleMapping.txt labelMappingFile=$RootDir$/Demos/Simple/SimpleMapping.txt
] ]
] ]
outputPath=SimpleOutput # Dump output as text outputPath=SimpleOutput # Dump output as text

Просмотреть файл

@ -550,41 +550,38 @@ public:
} }
ComputationNodePtr nodePtr = GetNodeFromName(nodeName); ComputationNodePtr nodePtr = GetNodeFromName(nodeName);
ComputationNodePtr childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4; std::vector<ComputationNodePtr> childrenNodes;
switch (numChildren) childrenNodes.resize(numChildren);
for (int j = 0; j < numChildren; j++)
childrenNodes[j] = GetNodeFromName(childrenNames[j]);
if (nodePtr->OperationName() == RowStackNode<ElemType>::TypeName()) //allow for variable input nodes
nodePtr->AttachInputs(childrenNodes);
else //fixed input nodes
{ {
case 1: switch (numChildren)
childNodePtr0 = GetNodeFromName(childrenNames[0]); {
nodePtr->AttachInputs(childNodePtr0); case 1:
break; nodePtr->AttachInputs(childrenNodes[0]);
case 2: break;
childNodePtr0 = GetNodeFromName(childrenNames[0]); case 2:
childNodePtr1 = GetNodeFromName(childrenNames[1]); nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1); break;
break; case 3:
case 3: nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2]);
childNodePtr0 = GetNodeFromName(childrenNames[0]); break;
childNodePtr1 = GetNodeFromName(childrenNames[1]); case 4:
childNodePtr2 = GetNodeFromName(childrenNames[2]); nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2); break;
break; case 5:
case 4: nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]);
childNodePtr0 = GetNodeFromName(childrenNames[0]); break;
childNodePtr1 = GetNodeFromName(childrenNames[1]); case 6:
childNodePtr2 = GetNodeFromName(childrenNames[2]); nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]);
childNodePtr3 = GetNodeFromName(childrenNames[3]); break;
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3); default:
break; throw std::logic_error("Invalid number of children.");
case 5: }
childNodePtr0 = GetNodeFromName(childrenNames[0]);
childNodePtr1 = GetNodeFromName(childrenNames[1]);
childNodePtr2 = GetNodeFromName(childrenNames[2]);
childNodePtr3 = GetNodeFromName(childrenNames[3]);
childNodePtr4 = GetNodeFromName(childrenNames[4]);
nodePtr->AttachInputs(childNodePtr0, childNodePtr1, childNodePtr2, childNodePtr3, childNodePtr4);
break;
default:
throw std::logic_error("Invalid number of children.");
} }
} }
} }
@ -1028,6 +1025,8 @@ public:
newNode = new LookupTableNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName); newNode = new LookupTableNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == RowSliceNode<ElemType>::TypeName()) else if (nodeType == RowSliceNode<ElemType>::TypeName())
newNode = new RowSliceNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName); newNode = new RowSliceNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == RowStackNode<ElemType>::TypeName())
newNode = new RowStackNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == GMMLogLikelihoodNode<ElemType>::TypeName()) else if (nodeType == GMMLogLikelihoodNode<ElemType>::TypeName())
newNode = new GMMLogLikelihoodNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName); newNode = new GMMLogLikelihoodNode<ElemType>(fstream, modelVersion, m_deviceId, nodeName);
else if (nodeType == SequenceDecoderNode<ElemType>::TypeName()) else if (nodeType == SequenceDecoderNode<ElemType>::TypeName())
@ -1209,6 +1208,8 @@ public:
newNode = new CosDistanceWithNegativeSamplesNode<ElemType>(m_deviceId, nodeName); newNode = new CosDistanceWithNegativeSamplesNode<ElemType>(m_deviceId, nodeName);
else if (nodeType == ParallelNode<ElemType>::TypeName()) else if (nodeType == ParallelNode<ElemType>::TypeName())
newNode = new ParallelNode<ElemType>(m_deviceId, nodeName); newNode = new ParallelNode<ElemType>(m_deviceId, nodeName);
else if (nodeType == RowStackNode<ElemType>::TypeName())
newNode = new RowStackNode<ElemType>(m_deviceId, nodeName);
else else
{ {
fprintf(stderr, "Error creating new ComputationNode of type %ls, with name %ls\n", nodeType.c_str(), nodeName.c_str()); fprintf(stderr, "Error creating new ComputationNode of type %ls, with name %ls\n", nodeType.c_str(), nodeName.c_str());
@ -1582,6 +1583,15 @@ public:
return newNode; return newNode;
} }
ComputationNodePtr RowStack(const std::vector<ComputationNodePtr> inputs, const std::wstring nodeName = L"")
{
ComputationNodePtr newNode(new RowStackNode<ElemType>(m_deviceId, nodeName));
newNode->AttachInputs(inputs);
AddNodeToNet(newNode);
return newNode;
}
ComputationNodePtr GMMLogLikelihood(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature, const std::wstring nodeName = L"") ComputationNodePtr GMMLogLikelihood(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature, const std::wstring nodeName = L"")
{ {
ComputationNodePtr newNode(new GMMLogLikelihoodNode<ElemType>(m_deviceId, nodeName)); ComputationNodePtr newNode(new GMMLogLikelihoodNode<ElemType>(m_deviceId, nodeName));

Просмотреть файл

@ -158,6 +158,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
throw std::logic_error("This operation does not support six inputs."); throw std::logic_error("This operation does not support six inputs.");
} }
virtual void AttachInputs(const std::vector<ComputationNodePtr>& /*inputs*/)
{
throw std::logic_error("This operation does not support variable-length inputs.");
}
virtual void DetachInputs() virtual void DetachInputs()
{ {
m_children.resize(0); m_children.resize(0);

Просмотреть файл

@ -399,6 +399,167 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class RowSliceNode<float>; template class RowSliceNode<float>;
template class RowSliceNode<double>; template class RowSliceNode<double>;
//this node is used to extract part of the input by rows as the output
//it has to be continuous segments of rows since each column is treated as one sample
template<class ElemType>
class RowStackNode : public ComputationNode<ElemType>
{
UsingComputationNodeMembers;
public:
RowStackNode(const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
{
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
m_deviceId = deviceId;
MoveMatricesToDevice(deviceId);
InitRecurrentNode();
}
RowStackNode(File& fstream, const size_t modelVersion, const DEVICEID_TYPE deviceId = AUTOPLACEMATRIX, const std::wstring name = L"") : ComputationNode<ElemType>(deviceId)
{
m_nodeName = (name == L"" ? CreateUniqNodeName() : name);
LoadFromFile(fstream, modelVersion, deviceId);
}
// copy constructor
RowStackNode(const RowStackNode<ElemType>* node, const std::wstring& newName, const CopyNodeFlags flags) : ComputationNode<ElemType>(node->m_deviceId)
{
node->CopyTo(this, newName, flags);
}
virtual ComputationNodePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) const
{
const std::wstring& name = (newName == L"") ? NodeName() : newName;
ComputationNodePtr node = new RowStackNode<ElemType>(this, name, flags);
return node;
}
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
ComputationNode<ElemType>::CopyTo(nodeP, newName, flags);
RowStackNode<ElemType>* node = (RowStackNode<ElemType>*) nodeP;
if (flags & CopyNodeFlags::copyNodeChildren)
{
node->m_children = m_children;
node->m_startRowIndeces = m_startRowIndeces;
node->m_inputMatrices = m_inputMatrices;
}
}
virtual const std::wstring OperationName() const { return TypeName(); }
static const std::wstring TypeName() { return L"RowStack"; }
virtual void ComputeInputPartial(const size_t inputIndex)
{
if (inputIndex >= ChildrenSize())
throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range.");
ComputeInputPartialS(Inputs(inputIndex)->GradientValues(), GradientValues(), m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex + 1] - m_startRowIndeces[inputIndex]);
}
virtual void ComputeInputPartial(const size_t inputIndex, const size_t timeIdxInSeq)
{
if (inputIndex >= ChildrenSize())
throw std::invalid_argument("RowStack-ComputeInputPartial: inputIndex out of range.");
Matrix<ElemType> sliceInputGrad = Inputs(inputIndex)->GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
Matrix<ElemType> sliceOutputGrad = GradientValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
ComputeInputPartialS(sliceInputGrad, sliceOutputGrad, m_startRowIndeces[inputIndex], m_startRowIndeces[inputIndex+1] - m_startRowIndeces[inputIndex]);
}
static void WINAPI ComputeInputPartialS(Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues, const size_t startIndex, const size_t numRows)
{
inputGradientValues.AddWithRowSliceValuesOf(gradientValues, startIndex, numRows);
}
virtual void EvaluateThisNode()
{
EvaluateThisNodeS(m_functionValues, m_inputMatrices, 0, Inputs(0)->FunctionValues().GetNumCols());
}
virtual void EvaluateThisNode(const size_t timeIdxInSeq)
{
Matrix<ElemType> sliceFunctionValues = FunctionValues().ColumnSlice(timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
EvaluateThisNodeS(sliceFunctionValues, m_inputMatrices, timeIdxInSeq * m_samplesInRecurrentStep, m_samplesInRecurrentStep);
}
static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
functionValues.AssignRowStackValuesOf(inputMatrices, sliceStartCol, sliceNumCols);
#if NANCHECK
functionValues.HasNan("RowStack");
#endif
}
virtual void Validate()
{
PrintSelfBeforeValidation();
unsigned int numInputs = ChildrenSize();
if (numInputs < 2)
LogicError("RowStack operation: must have two or more inputs.");
if (Inputs(0) == nullptr)
LogicError("RowStack operation: the input node is NULL.");
size_t numCols = Inputs(0)->FunctionValues().GetNumCols();
m_startRowIndeces.resize(ChildrenSize()+1);
m_inputMatrices.resize(ChildrenSize());
size_t totalRows = 0;
m_startRowIndeces[0] = 0;
for (int i = 0; i < ChildrenSize(); i++)
{
if (Inputs(i) == nullptr)
LogicError("RowStack operation: the input node is NULL.");
Matrix<ElemType>& childMatrix = Inputs(i)->FunctionValues();
size_t numRows = childMatrix.GetNumRows();
if (numRows == 0)
LogicError("RowStack operation: the input node %ls has 0 rows.", Inputs(i)->NodeName().c_str());
if (childMatrix.GetNumCols() != numCols)
LogicError("RowStack operation: the input node %ls has different number of columns.", Inputs(i)->NodeName().c_str());
totalRows += numRows;
m_inputMatrices[i] = &childMatrix;
m_startRowIndeces[i + 1] = m_startRowIndeces[i] + numRows;
}
FunctionValues().Resize(totalRows, numCols);
CopyImageSizeFromInputs();
}
virtual void CopyImageSizeFromInputs()
{
CopyImageSizeFromInput(0, true);
m_outputHeight = FunctionValues().GetNumRows();
//WARNING: this node will destroy the image size information from the child
if (m_inputWidth * m_inputChannels != 1)
fprintf(stderr, "WARNING: RowStack operation cannot inherit image size information from its child. Image size info is lost.\n");
}
virtual void AttachInputs(const std::vector<ComputationNodePtr>& inputs)
{
unsigned int numInputs = inputs.size();
m_children.resize(numInputs);
for (unsigned int i = 0; i < numInputs; i++)
m_children[i] = inputs[i];
}
private:
std::vector<size_t> m_startRowIndeces; //start row number in the stacked matrix of each input (child)
std::vector<const Matrix<ElemType>*> m_inputMatrices;
};
template class RowStackNode<float>;
template class RowStackNode<double>;
template<class ElemType> template<class ElemType>
class ScaleNode : public ComputationNode<ElemType> class ScaleNode : public ComputationNode<ElemType>
{ {

Просмотреть файл

@ -222,6 +222,8 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
ret = true; ret = true;
else if (EqualInsensitive(nodeType, RowSliceNode<ElemType>::TypeName())) else if (EqualInsensitive(nodeType, RowSliceNode<ElemType>::TypeName()))
ret = true; ret = true;
else if (EqualInsensitive(nodeType, RowStackNode<ElemType>::TypeName()))
ret = true;
else if (EqualInsensitive(nodeType, LookupTableNode<ElemType>::TypeName())) else if (EqualInsensitive(nodeType, LookupTableNode<ElemType>::TypeName()))
ret = true; ret = true;
else if (EqualInsensitive(nodeType, GMMLogLikelihoodNode<ElemType>::TypeName(), L"GMMLL")) else if (EqualInsensitive(nodeType, GMMLogLikelihoodNode<ElemType>::TypeName(), L"GMMLL"))

Просмотреть файл

@ -218,10 +218,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ {
if (colBegin(i,0) == SENTENCE_MIDDLE) if (colBegin(i,0) == SENTENCE_MIDDLE)
{ {
Matrix<ElemType> to1 = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1); Matrix<ElemType> frm = gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1);
Matrix<ElemType> frm1= gradientValues.ColumnSlice(timeIdxInSeq * mNbr + i, 1); Matrix<ElemType> to = inputGradientValues.ColumnSlice((timeIdxInSeq - delay)*mNbr + i, 1);
to1 += frm1; to += frm;
} }
} }

Просмотреть файл

@ -1810,8 +1810,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]); w = m_net->CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale); m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
double val = w->FunctionValues()(0, 0);
/// the label is a dense matrix. each element is the word index /// the label is a dense matrix. each element is the word index
label = m_net->CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize); label = m_net->CreateInputNode(L"labels", 2 * (this->nce_noises + 1), mbSize);

Просмотреть файл

@ -391,29 +391,43 @@ public:
{ {
std::vector<void*> inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass); std::vector<void*> inputs = EvaluateParameters(node, baseName, nodeParamStart, nodeParamCount, pass);
switch (inputs.size()) if (cnNodeType == RowStackNode<ElemType>::TypeName()) //support variable length inputs
{ {
case 1: std::vector<ComputationNodePtr> inputNodes;
nodePtr->AttachInputs(ComputationNodePtr(inputs[0])); inputNodes.resize(inputs.size());
break; for (int i = 0; i < inputs.size(); i++)
case 2: inputNodes[i] = ComputationNodePtr(inputs[i]);
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]));
break;
case 3:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]));
break;
case 4:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]));
break;
case 5:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]));
break;
default:
if (nodeParamCount > 0)
RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str());
break;
}
nodePtr->AttachInputs(inputNodes);
}
else
{
switch (inputs.size())
{
case 1:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]));
break;
case 2:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]));
break;
case 3:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]));
break;
case 4:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]));
break;
case 5:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]));
break;
case 6:
nodePtr->AttachInputs(ComputationNodePtr(inputs[0]), ComputationNodePtr(inputs[1]), ComputationNodePtr(inputs[2]), ComputationNodePtr(inputs[3]), ComputationNodePtr(inputs[4]), ComputationNodePtr(inputs[5]));
break;
default:
if (nodeParamCount > 0)
RuntimeError("Invalid number of parameters name = '%s' call = '%s'\n", node->GetName().c_str(), node->GetValue().c_str());
break;
}
}
// process common optional parameters (like "tag"); // process common optional parameters (like "tag");
ProcessOptionalParameters(node); ProcessOptionalParameters(node);
break; break;

Просмотреть файл

@ -32,11 +32,11 @@ DEVICE = gpu
BUILDTYPE = debug BUILDTYPE = debug
#BUILDTYPE = release #BUILDTYPE = release
# comment following and uncomment the next one to enable MKL library # comment following and uncomment the next one to enable MKL library
#MATHLIB = acml MATHLIB = acml
MATHLIB = mkl #MATHLIB = mkl
# modify relevant path below for your system # modify relevant path below for your system
MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146 MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146
ACML_PATH = /usr/local/acml5.3.0/gfortran64 ACML_PATH = /usr/local/acml5.3.1/ifort64
####### #######
BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB) BUILDFOR = $(ARCH).$(DEVICE).$(BUILDTYPE).$(MATHLIB)
@ -48,8 +48,8 @@ ifeq ($(BUILDTYPE),debug)
BUILDTYPE_OPT = -g BUILDTYPE_OPT = -g
GPU_BUILDTYPE_OPT = -G GPU_BUILDTYPE_OPT = -G
else else
BUILDTYPE_OPT = -O4 BUILDTYPE_OPT = -O3 -flto
GPU_BUILDTYPE_OPT = GPU_BUILDTYPE_OPT = -O3
endif endif
ifeq ($(MATHLIB),mkl) ifeq ($(MATHLIB),mkl)
@ -142,7 +142,7 @@ $(OBJDIR)/%.o : %.cu Makefile
@echo $(SEPARATOR) @echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE) @echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@) @mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(BUILDTYPE_OPT) $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC $(NVCC) -c $< -o $@ $(GPU_BUILDTYPE_OPT) $(NVCCFLAGS) $(INCFLAGS) -Xcompiler -fPIC
$(OBJDIR)/%.o : %.cpp Makefile $(OBJDIR)/%.o : %.cpp Makefile
@echo $(SEPARATOR) @echo $(SEPARATOR)

Просмотреть файл

@ -31,8 +31,8 @@ DEVICE = cpu
#BUILDTYPE = debug #BUILDTYPE = debug
BUILDTYPE = release BUILDTYPE = release
# comment following and uncomment the next one to enable MKL library # comment following and uncomment the next one to enable MKL library
#MATHLIB = acml MATHLIB = acml
MATHLIB = mkl #MATHLIB = mkl
# modify relevant path below for your system # modify relevant path below for your system
MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146 MKL_PATH = /usr/users/chiaying/intel/composer_xe_2013.2.146
ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64 ACML_PATH = /usr/users/yzhang87/code/acml/gfortran64

Просмотреть файл

@ -563,7 +563,7 @@ namespace CNTKMathTest
Assert::IsTrue(C.IsEqualTo(D1, 0.0001)); Assert::IsTrue(C.IsEqualTo(D1, 0.0001));
} }
TEST_METHOD(CPUMatrixRowSlice) TEST_METHOD(CPUMatrixRowSliceAndStack)
{ {
Matrix M0(5,3); Matrix M0(5,3);
M0(0,0) = 1; M0(0,1) = 6; M0(0,2) = 11; M0(0,0) = 1; M0(0,1) = 6; M0(0,2) = 11;
@ -590,6 +590,26 @@ namespace CNTKMathTest
M3 += M0; M3 += M0;
M0.AddToRowSliceValuesOf(M1, 2,2); M0.AddToRowSliceValuesOf(M1, 2,2);
Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); Assert::IsTrue(M3.IsEqualTo(M0, 0.0001));
M2.AddWithRowSliceValuesOf(M1, 0, 2);
Matrix M4(2, 3);
M4(0, 0) = 6; M4(0, 1) = 16; M4(0, 2) = 26;
M4(1, 0) = 8; M4(1, 1) = 18; M4(1, 2) = 28;
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
Matrix M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
M7.AssignRowSliceValuesOf(M0, 3, 2);
std::vector<const Matrix*> inputMatrices;
inputMatrices.resize(3);
inputMatrices[0] = &M5;
inputMatrices[1] = &M6;
inputMatrices[2] = &M7;
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
} }
TEST_METHOD(CPUAssignRepeatOf) TEST_METHOD(CPUAssignRepeatOf)

Просмотреть файл

@ -278,7 +278,7 @@ namespace CNTKMathTest
Assert::IsTrue(M2.IsEqualTo(M3, 0.0001f)); Assert::IsTrue(M2.IsEqualTo(M3, 0.0001f));
} }
TEST_METHOD(GPUMatrixRowSlice) TEST_METHOD(GPUMatrixRowSliceAndStack)
{ {
float *fArray = new float[15]; float *fArray = new float[15];
fArray[0] = 1; fArray[5] = 6; fArray[10] = 11; fArray[0] = 1; fArray[5] = 6; fArray[10] = 11;
@ -308,6 +308,27 @@ namespace CNTKMathTest
M3 += M0; M3 += M0;
M0.AddToRowSliceValuesOf(M1, 2,2); M0.AddToRowSliceValuesOf(M1, 2,2);
Assert::IsTrue(M3.IsEqualTo(M0, 0.0001)); Assert::IsTrue(M3.IsEqualTo(M0, 0.0001));
M2.AddWithRowSliceValuesOf(M1, 0, 2);
float *fArray4 = new float[6];
fArray4[0] = 6; fArray4[2] = 16; fArray4[4] = 26;
fArray4[1] = 8; fArray4[3] = 18; fArray4[5] = 28;
GPUMatrix<float> M4(2, 3, fArray4, matrixFlagNormal);
Assert::IsTrue(M2.IsEqualTo(M4, 0.0001));
GPUMatrix<float> M5, M6, M7, M8;
M5.AssignRowSliceValuesOf(M0, 0, 2);
M6.AssignRowSliceValuesOf(M0, 2, 1);
M7.AssignRowSliceValuesOf(M0, 3, 2);
std::vector<const GPUMatrix<float> *> inputMatrices;
inputMatrices.resize(3);
inputMatrices[0] = &M5;
inputMatrices[1] = &M6;
inputMatrices[2] = &M7;
M8.AssignRowStackValuesOf(inputMatrices, 0, 3);
Assert::IsTrue(M8.IsEqualTo(M0, 0.0001));
} }
TEST_METHOD(GPUKhatriRaoProduct) TEST_METHOD(GPUKhatriRaoProduct)

Просмотреть файл

@ -429,6 +429,48 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this; return *this;
} }
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
// Inputs are stacked vertically in the order given; each contributes all of its rows.
// Throws LogicError if sliceNumCols is 0, inputMatrices is empty, any input is
// empty, or any input has fewer than sliceStartCol + sliceNumCols columns.
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    if (sliceNumCols == 0)
        LogicError("AssignRowStackValuesOf: sliceNumCols should > 0.");
    if (inputMatrices.empty())
        LogicError("AssignRowStackValuesOf: inputMatrices is empty.");

    // startRowIndices[i] = first destination row of input i. A std::vector replaces
    // the original raw new[]/delete[] pair, which (a) leaked whenever LogicError
    // threw inside the validation loop and (b) wrote startRowIndeces[0] into a
    // zero-length allocation when inputMatrices was empty.
    std::vector<size_t> startRowIndices(inputMatrices.size());
    startRowIndices[0] = 0;
    size_t totalRows = 0;
    for (int i = 0; i < (int)inputMatrices.size(); i++)
    {
        const CPUMatrix<ElemType>& a = *inputMatrices[i];
        if (a.IsEmpty())
            LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i);
        if (a.GetNumCols() < sliceStartCol + sliceNumCols)
            LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i);
        totalRows += a.GetNumRows();
        if (i < (int)inputMatrices.size() - 1)
            startRowIndices[i + 1] = startRowIndices[i] + a.GetNumRows();
    }

    Resize(totalRows, sliceNumCols);
    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < (long)sliceNumCols; j++)
    {
        // Columns are independent, so they parallelize cleanly. The single memcpy
        // per input relies on a column being contiguous in memory (as the original
        // code did): all GetNumRows() elements of column sliceStartCol+j are copied
        // into rows [startRowIndices[i], startRowIndices[i] + rows_i) of column j.
        for (int i = 0; i < (int)inputMatrices.size(); i++)
        {
            memcpy(&us(startRowIndices[i], j), &(*inputMatrices[i])(0, sliceStartCol + j), inputMatrices[i]->GetNumRows() * sizeof(ElemType));
        }
    }

    return *this;
}
template<class ElemType> template<class ElemType>
void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position) void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position)
{ {
@ -672,16 +714,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if it's externally managed, then populate the structure // if it's externally managed, then populate the structure
if (matrixFlags&matrixFlagDontOwnBuffer) if (matrixFlags&matrixFlagDontOwnBuffer)
{ {
// free previous array allocation if any before overwriting
if (m_pArray != nullptr) if (m_pArray != nullptr)
delete [] m_pArray; delete [] m_pArray;
m_pArray = pArray; m_pArray = pArray;
m_numRows = numRows; m_numRows = numRows;
m_numCols = numCols; m_numCols = numCols;
// free previous array allocation if any before overwriting
if (m_pArray != nullptr)
delete[] m_pArray;
m_pArray = pArray;
m_elemSizeAllocated = GetNumElements(); m_elemSizeAllocated = GetNumElements();
m_externalBuffer = true; m_externalBuffer = true;
} }
@ -3877,7 +3916,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType> template<class ElemType>
void CPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, void CPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a,
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, size_t sampleCount, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c) const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c)
//this: samples+probs //this: samples+probs
// a: hidden // a: hidden
// b: embedding // b: embedding
@ -3892,7 +3931,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::cerr << endl; std::cerr << endl;
} }
*/ */
sampleCount *= 1;
double log_likelihood = 0.0; double log_likelihood = 0.0;
size_t sample_size = this->GetNumRows() / 2; size_t sample_size = this->GetNumRows() / 2;
size_t batch_size = this->GetNumCols(); size_t batch_size = this->GetNumCols();

Просмотреть файл

@ -216,7 +216,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignVectorNorm2Of(CPUMatrix<ElemType>& a, const bool isColWise); CPUMatrix<ElemType>& AssignVectorNorm2Of(CPUMatrix<ElemType>& a, const bool isColWise);
void AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, void AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias,
size_t sampleCount, CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c); CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c);
void AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a, void AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a,
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& c); const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& c);
@ -244,6 +244,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUMatrix<ElemType>& AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); CPUMatrix<ElemType>& AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); CPUMatrix<ElemType>& AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); CPUMatrix<ElemType>& AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
CPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
CPUMatrix<ElemType>& AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); CPUMatrix<ElemType>& AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);

Просмотреть файл

@ -678,6 +678,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this; return *this;
} }
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
// Host-side driver: gathers per-input device pointers and row offsets, uploads
// them to the GPU, and launches _assignRowStackValuesOf to fill the stacked result.
// Throws LogicError if sliceNumCols is 0, inputMatrices is empty, any input is
// empty, or any input has fewer than sliceStartCol + sliceNumCols columns.
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    if (sliceNumCols == 0)
        LogicError("AssignRowStackValuesOf: sliceNumCols should > 0.");
    if (inputMatrices.empty())
        LogicError("AssignRowStackValuesOf: inputMatrices is empty.");

    // startRowIndices has one extra slot so that input i owns destination rows
    // [startRowIndices[i], startRowIndices[i+1]). std::vector replaces the raw
    // new[]/delete[] pairs, which leaked whenever LogicError threw below.
    std::vector<size_t> startRowIndices(inputMatrices.size() + 1);
    std::vector<ElemType*> srcColumnPointers(inputMatrices.size());
    startRowIndices[0] = 0;
    size_t totalRows = 0;
    for (int i = 0; i < (int)inputMatrices.size(); i++)
    {
        const GPUMatrix<ElemType>& a = *inputMatrices[i];
        if (a.IsEmpty())
            LogicError("AssignRowStackValuesOf: input matrix (%d) is empty.", i);
        if (a.GetNumCols() < sliceStartCol + sliceNumCols)
            LogicError("AssignRowStackValuesOf: input matrix (%d) GetNumCols() < sliceStartCol + sliceNumCols.", i);
        totalRows += a.GetNumRows();
        startRowIndices[i + 1] = startRowIndices[i] + a.GetNumRows();
        // Device pointer to the first column that will be copied from this input.
        srcColumnPointers[i] = a.m_pArray + a.LocateColumn(sliceStartCol);
    }

    Resize(totalRows, sliceNumCols);
    PrepareDevice();

    // Upload the pointer table and row offsets so the kernel can map each
    // destination element back to its source matrix.
    // NOTE(review): if CUDA_CALL throws, the device buffers below leak — same as
    // the original; a device-side RAII wrapper would fix this. TODO confirm policy.
    ElemType** bufferPointersInGPU = NULL;
    CUDA_CALL(cudaMalloc((void***)&bufferPointersInGPU, inputMatrices.size() * sizeof(ElemType*)));
    CUDA_CALL(cudaMemcpy(bufferPointersInGPU, srcColumnPointers.data(), inputMatrices.size() * sizeof(ElemType*), cudaMemcpyHostToDevice));

    size_t* startRowIndecesInGPU = NULL;
    CUDA_CALL(cudaMalloc((void**)&startRowIndecesInGPU, (1 + inputMatrices.size()) * sizeof(size_t)));
    CUDA_CALL(cudaMemcpy(startRowIndecesInGPU, startRowIndices.data(), (1 + inputMatrices.size()) * sizeof(size_t), cudaMemcpyHostToDevice));

    // One thread per destination element.
    LONG64 N = (LONG64)GetNumElements();
    int blocksPerGrid = (int)ceil(1.0 * N / threadsPerBlock);

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    _assignRowStackValuesOf<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(m_pArray, bufferPointersInGPU, startRowIndecesInGPU, (long)inputMatrices.size(), N, (long)GetNumRows(), (long)GetNumCols());
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));

    CUDA_CALL(cudaFree(bufferPointersInGPU));
    CUDA_CALL(cudaFree(startRowIndecesInGPU));

    return *this;
}
/// c = c - 1.0 for a specific position /// c = c - 1.0 for a specific position
template<class ElemType> template<class ElemType>
void GPUMatrix<ElemType>::MinusOneAt(GPUMatrix<ElemType>& c, const size_t position) void GPUMatrix<ElemType>::MinusOneAt(GPUMatrix<ElemType>& c, const size_t position)

Просмотреть файл

@ -274,6 +274,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GPUMatrix<ElemType>& AssignRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); GPUMatrix<ElemType>& AssignRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddToRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); GPUMatrix<ElemType>& AddToRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows); GPUMatrix<ElemType>& AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
GPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
GPUMatrix<ElemType>& AssignRepeatOf(const GPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats); GPUMatrix<ElemType>& AssignRepeatOf(const GPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
GPUMatrix<ElemType>& AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber); GPUMatrix<ElemType>& AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

Просмотреть файл

@ -377,6 +377,27 @@ __global__ void _addWithRowSliceValuesOf(ElemType * dest, ElemType * src, const
dest[id] += src[IDX2C(row + startIndex, col, srcRows)]; dest[id] += src[IDX2C(row + startIndex, col, srcRows)];
} }
// Kernel for AssignRowStackValuesOf: each thread writes one element of the
// vertically stacked destination matrix by locating which source matrix owns
// its destination row.
//   srces            - device array of per-source pointers, each already offset
//                      to the first copied column (see the host-side driver).
//   startRowIndeces  - numSrces+1 offsets; source s owns destination rows
//                      [startRowIndeces[s], startRowIndeces[s+1]).
//   N                - total number of destination elements (destRows * destCols).
template<class ElemType>
__global__ void _assignRowStackValuesOf(ElemType * dest, ElemType ** srces, size_t* startRowIndeces, const LONG64 numSrces, const LONG64 N, const long destRows, const long destCols)
{
    LONG64 id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    // Recover (row, col) from the linear index; dest is column-major.
    long col = id / destRows; //dest is the full matrix, rowslice is taken from the src
    long row = id - (col * destRows);
    // Linear search for the source whose row range contains 'row'.
    //can we replace the for loop with something better? (e.g. binary search when numSrces is large)
    int srcId = 0;
    for (; srcId < numSrces; srcId++)
    {
        if (startRowIndeces[srcId + 1]>row)
            break;
    }
    // The source's leading dimension is its row count, i.e. the difference of
    // adjacent offsets in startRowIndeces.
    dest[id] = srces[srcId][IDX2C(row - startRowIndeces[srcId], col, startRowIndeces[srcId+1] - startRowIndeces[srcId])];
}
template<class ElemType> template<class ElemType>
__global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows) __global__ void _assignRepeatOf(ElemType * dest, ElemType * src, const LONG64 N, const long srcRows, const long srcCols, const long destRows)
{ {

Просмотреть файл

@ -79,16 +79,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t MajorIndexCount() const size_t MajorIndexCount() const
{ {
return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
} }
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
{ {
if (format == matrixFormatSparseBlockCol) if (format == matrixFormatSparseBlockCol)
return numCols; return numCols;
else if (format == matrixFormatSparseBlockRow) else if (format == matrixFormatSparseBlockRow)
return numRows; return numRows;
else else
return numNZReserved; return numNZ;
} }
size_t MajorIndexSize() const // actual number of major index bytes in use size_t MajorIndexSize() const // actual number of major index bytes in use
{ {

Просмотреть файл

@ -1520,6 +1520,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this; return *this;
} }
//stack the columns in inputMatrices (starting from sliceStartCol for sliceNumCols columns) and assign it to [this] object.
// Facade-level dispatcher: moves all inputs onto this matrix's device, then
// forwards to the CPUMatrix or GPUMatrix dense implementation. Sparse matrices
// are not implemented for this operation.
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols)
{
    // Ensure every input lives on the same device as [this] and has the same
    // matrix type (dense/sparse) before dispatching.
    for (int i = 0; i < inputMatrices.size(); i++)
    {
        const Matrix<ElemType>& a = *inputMatrices[i];
        DecideAndMoveToRightDevice(*this, a);
        //WARNING: a and this must have same type
        if (!(GetMatrixType() == a.GetMatrixType()))
            NOT_IMPLEMENTED;
    }

    CurrentDataLocation curLocation = GetCurrentMatrixLocation();
    if (curLocation == CurrentDataLocation::GPU || curLocation == CurrentDataLocation::BOTH)
    {
        if (GetMatrixType() != MatrixType::SPARSE)
        {
            //GPUDense;
            // Unwrap the facade objects into raw GPUMatrix pointers for the
            // device-specific implementation.
            std::vector<const GPUMatrix<ElemType>*> gpuInputMatrices;
            gpuInputMatrices.resize(inputMatrices.size());
            for (int i = 0; i < inputMatrices.size(); i++)
                gpuInputMatrices[i] = inputMatrices[i]->m_GPUMatrix;
            m_GPUMatrix->AssignRowStackValuesOf(gpuInputMatrices, sliceStartCol, sliceNumCols);
            SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE);
        }
        else
        {
            NOT_IMPLEMENTED;
        }
    }
    else if (curLocation == CurrentDataLocation::CPU)
    {
        if (GetMatrixType() != MatrixType::SPARSE)
        {
            //CPUDense;
            // Same unwrapping for the CPU-side implementation.
            std::vector<const CPUMatrix<ElemType>*> cpuInputMatrices;
            cpuInputMatrices.resize(inputMatrices.size());
            for (int i = 0; i < inputMatrices.size(); i++)
                cpuInputMatrices[i] = inputMatrices[i]->m_CPUMatrix;
            m_CPUMatrix->AssignRowStackValuesOf(cpuInputMatrices, sliceStartCol, sliceNumCols);
            SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE);
        }
        else
        {
            NOT_IMPLEMENTED;
        }
    }
    else
    {
        throw std::runtime_error("Matrices do not exist in either CPU or GPU.");
    }

    return *this;
}
template<class ElemType> template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats) Matrix<ElemType>& Matrix<ElemType>::AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
{ {
@ -3600,7 +3662,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{ {
size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows(); size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows();
tmp.Resize(a.GetNumRows() / 2, sampleCount); tmp.Resize(a.GetNumRows() / 2, sampleCount);
a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, sampleCount, *tmp.m_CPUMatrix, *this->m_CPUMatrix); a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *this->m_CPUMatrix);
} }
else else
{ {

Просмотреть файл

@ -259,6 +259,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows); Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows); Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows); Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
Matrix<ElemType>& AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);
Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats); Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber); Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

Просмотреть файл

@ -479,6 +479,7 @@ namespace Microsoft {
//for each column of a, we add all rows of a to this starting from startIndex //for each column of a, we add all rows of a to this starting from startIndex
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddToRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; } template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddToRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; } template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddWithRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
GPUMatrix<ElemType>& AssignRowStackValuesOf(const std::vector<const GPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRepeatOf(const GPUMatrix<ElemType>& /*a*/, const size_t numRowRepeats, const size_t numColRepeats) { return *this; } template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRepeatOf(const GPUMatrix<ElemType>& /*a*/, const size_t numRowRepeats, const size_t numColRepeats) { return *this; }
template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { return *this; } template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignPositiveAndShiftedNegSample(const GPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { return *this; }

234
Scripts/build-and-test Normal file
Просмотреть файл

@ -0,0 +1,234 @@
#!/bin/bash
# Build-and-test driver for CNTK: builds the debug and release flavors and runs
# the Simple demo on CPU and GPU, checking the run output for errors.

# Setting some default values
BUILD=1
RUN=1
CLEAN_AFTER=0
CLEAN_BEFORE=0

# parsing command line arguments:
# NOTE: fixed `[[ $# > 0 ]]` -- inside [[ ]], `>` is lexical string comparison;
# use the arithmetic operator -gt for a numeric count.
while [[ $# -gt 0 ]]
do
  key="$1"

  case $key in
    -h|--help)
      echo "Usage: build-and-test [options]"
      echo "Options:"
      echo " -q|--quiet-build - redirect build output to file (by default those will be in <cntk_root>.run-<operating_system>-*)"
      echo " -r|--run-only - elides build step, runs the binaries that have already been built"
      echo " -b|--build-only - just build, do not run"
      echo " -cb|--clean-build - clean up the enlistment binaries before build"
      echo " -o|--output-directory <output_dir> - specify output directory to use"
      echo "The root directory used to build and run CNTK is hosts the Scripts directory that contains this script"
      exit 1
      ;;
    -q|--quiet)
      QUIET_BUILD=1
      ;;
    -r|--run-only)
      BUILD=0
      RUN=1
      ;;
    -b|--build-only)
      BUILD=1
      RUN=0
      ;;
    -cb|--clean-build)
      CLEAN_BEFORE=1
      BUILD=1
      ;;
    -o|--output-directory)
      OUTPUT_DIR="$2"
      shift # past argument
      ;;
    *)
      # Fixed typo: "Unkown" -> "Unknown"
      echo Unknown option $key
      exit 1
      ;;
  esac
  shift # past argument or value
done

# Step 0 -- Validate all necessary prerequisites and check for incompatible options
# It is possible to use this script on Windows to build CNTK
# from Cygwin window with Visual C++ environment loaded.
# In that case OS environment variable will be set and we
# can use it to differentiate from Linux.
if [[ $CLEAN_BEFORE == 1 && $RUN == 1 && $BUILD == 0 ]]; then
  echo "============ ERROR: Incompatible options RUN and CLEAN_BEFORE set without BUILD ============"
  exit 1
fi

if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
  DEBUG_DIR=Debug
  RELEASE_DIR=Release
  PREFIX_DIR=x64
  BIN_NAME=CNTK.exe
  BUILD_OS="windows"

  if [[ $VS120COMNTOOLS == "" ]]; then
    echo "============ Visual Studio 12.0 environment not properly setup or VS not installed ============"
    echo "============ Please find and run the appropriate vcvarsall.bat script ============"
    exit 1
  fi

  if [[ $ACML_PATH == "" ]]; then
    echo "============ ACML path not set ============"
    echo "============ ACML libraries are needed to successfully build CNTK ============"
    exit 1
  fi
elif [[ $OSTYPE == "linux-gnu" ]]; then
  DEBUG_DIR=x86_64.gpu.debug.acml
  RELEASE_DIR=x86_64.gpu.release.acml
  PREFIX_DIR=bin
  BIN_NAME=cntk
  MAKEFILE=Makefile.gpu
  BUILD_OS="linux"
else
  echo "============ ERROR: Unsupported OS ============"
  echo "============ Scripts supports only building from Linux and Windows through Cygwin ============"
  exit 1
fi

# Step 1 -- Prepare temporary folders and files, tweak settings if necessary
# Get to the root path from which we know how to build and run
SCRIPT=`readlink -f $0`
SCRIPT_DIR=`dirname $SCRIPT`
CNTK_ROOT=`dirname $SCRIPT_DIR`

# Setup the output directory
if [[ $OUTPUT_DIR == "" ]]; then
  OUTPUT_DIR="$CNTK_ROOT/.run-$BUILD_OS-$RANDOM"
fi

# Fixed: the message referenced $TMP_ROOT, which is never set; the directory
# actually created is $OUTPUT_DIR.
echo "============ Creating CNTK temp directory in $OUTPUT_DIR ============"
mkdir -p $OUTPUT_DIR || exit $?

CONF_FILE="$OUTPUT_DIR/Simple.conf"
BUILD_FILE="$OUTPUT_DIR/Build"
RUN_FILE="$OUTPUT_DIR/Result"

if ! [[ -d "$CNTK_ROOT/MachineLearning" ]]; then
  echo "============ ERROR: Build script located in the wrong directory ($SCRIPT_DIR) ============"
  exit 1
fi
cd $CNTK_ROOT

if ! [[ -f $CONF_FILE ]]; then
  cp Demos/Simple/Simple.config $CONF_FILE || exit $?

  # This chmod is necessary due to restrictive Cygwin interpretation of Windows permissions.
  # Cygwin interprets Windows permissions as ----rwx---, which lacks read permissions for user.
  chmod a+r $CONF_FILE || exit $?
fi

if [[ $QUIET_BUILD == 1 ]]; then
  echo "============ WARNING: You have selected quiet build. All build output will be placed in ($OUTPUT_DIR) ============"
fi

# Step 2 -- Build the project debug and release, if requested
if [[ $BUILD == 1 ]]; then
  # Step 2 -- Perform necessary builds
  for FLAVOR in debug release
  do
    # Our make is too noisy right now and it is difficult to spot
    # issues from stdout and stderr. In the quiet mode these are
    # redirected to a file where they could be examined after the fact
    if [[ $QUIET_BUILD == 1 ]]; then
      exec 6>$BUILD_FILE.$FLAVOR.out || exit $?
      exec 7>$BUILD_FILE.$FLAVOR.err || exit $?
    else
      exec 6>&1 || exit $?
      exec 7>&2 || exit $?
    fi

    echo "============ Building CNTK $FLAVOR (clean=$CLEAN_BEFORE) ============"
    if [[ $OS == "Windows_NT" ]]; then
      if [[ $CLEAN_BEFORE == 1 ]]; then
        msbuild.exe /property:Configuration=$FLAVOR /t:Clean 1>&6 2>&7 || exit $?
      fi
      msbuild.exe /property:Configuration=$FLAVOR /m 1>&6 2>&7 || exit $?
    else
      if [[ $CLEAN_BEFORE == 1 ]]; then
        make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
      fi
      make BUILDTYPE=$FLAVOR -j -f $MAKEFILE 1>&6 2>&7 || exit $?
    fi

    # Build logs only exist in quiet mode; guard so non-quiet builds do not
    # report a spurious chmod error for the missing files.
    if [[ $QUIET_BUILD == 1 ]]; then
      chmod a+r $BUILD_FILE.*
    fi
  done
fi

# Step 3 -- Run the project tests, both debug and release, if requested
if [[ $RUN == 1 ]]; then
  if ! [[ -f "$CNTK_ROOT/$PREFIX_DIR/$DEBUG_DIR/$BIN_NAME" && -f "$CNTK_ROOT/$PREFIX_DIR/$RELEASE_DIR/$BIN_NAME" ]]; then
    echo "============ ERROR: CNTK did not build properly ============"
    exit 1
  fi
  cd $PREFIX_DIR

  for TARGET in CPU GPU
  do
    # These sed scripts are simply toggling DeviceNumber argument in the config file
    # If it is set to Auto, it will pick GPU over CPU. At -1 CPU is selected.
    if [[ $TARGET == CPU ]]; then
      sed -i -e 's/^DeviceNumber.*/DeviceNumber=-1/g' $CONF_FILE || exit $?
    else
      sed -i -e 's/^DeviceNumber.*/DeviceNumber=Auto/g' $CONF_FILE || exit $?
    fi

    for FLAVOR in debug release
    do
      # Fixed: the original tested `[[ FLAVOR == "debug" ]]` (missing `$`),
      # which compares the literal string "FLAVOR" and is always false, so the
      # debug pass silently ran the release binary.
      if [[ $FLAVOR == "debug" ]]; then
        FLAVOR_DIR="$DEBUG_DIR"
      else
        FLAVOR_DIR="$RELEASE_DIR"
      fi
      OUT_FILE="$RUN_FILE.$FLAVOR.out"

      echo "============ Running CNTK for ($FLAVOR) ($TARGET), output in ($RUN_FILE.*) ============"
      rm -rf models
      if [[ $OS == "Windows_NT" ]]; then
        # We have to use cygpath on Windows to modify the file paths into the format readable by cntk.
        time ./$FLAVOR_DIR/$BIN_NAME configFile="`cygpath -w $CONF_FILE`" &>$OUT_FILE || exit $?
      else
        time ./$FLAVOR_DIR/$BIN_NAME configFile=$CONF_FILE &>$OUT_FILE || exit $?
      fi
      chmod a+r $RUN_FILE.*

      # Check if execution was successful
      grep -q "Using $TARGET" "$OUT_FILE" || {
        echo "============ ERROR: Run output (in $OUT_FILE) did not contain information about target device ($TARGET) ============"
        exit 1
      }
      grep -q "EXCEPTION" "$OUT_FILE" && {
        echo "============ ERROR: Run output in ($OUT_FILE) contains exceptions ============"
        grep "EXCEPTION" "$OUT_FILE"
        exit 1
      }
    done
  done
fi

# Step 4 -- Optionally clean after builds and tests
# (comment renumbered: the original jumped from Step 3 to Step 5)
if [[ $CLEAN_AFTER == 1 ]]; then
  rm -rf models
  cd $CNTK_ROOT
  for FLAVOR in debug release
  do
    echo "============ Cleaning up CNTK $FLAVOR ============"
    if [[ $OS == "Windows_NT" ]]; then
      msbuild.exe /property:Configuration=$FLAVOR /t:clean 1>&6 2>&7 || exit $?
    else
      make BUILDTYPE=$FLAVOR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
    fi
  done
  rm -rf $OUTPUT_DIR
fi

echo "============ Build and test of CNTK was successful! ============"