Merge remote-tracking branch 'origin/master' into linux-gcc

Conflicts:
	Common/Include/DataReader.h
	Documentation/CNTK-TechReport/lyx/CNTKBook_CNTK_Chapter.lyx
	MachineLearning/CNTK/TrainingCriterionNodes.h
	Math/Math/Matrix.cpp
This commit is contained in:
Yu Zhang 2015-07-07 17:00:17 -07:00
Parent 2c96d0adaa 142d18058e
Commit 56b9100826
14 changed files with 716 additions and 178 deletions

View file

@@ -29,9 +29,16 @@
namespace Microsoft { namespace MSR { namespace CNTK {
const size_t randomizeAuto = ((size_t)-1)>>2; // randomize range set automatically, parameter value for Init()
const size_t randomizeNone = 0; // don't randomize, parameter value for Init()
const size_t requestDataSize = randomizeAuto; // StartMinibatchLoop default parameter, sets number of requested frames equal to the number of frames in the dataset
// randomize range set automatically, parameter value for Init()
const size_t randomizeAuto = ((size_t) -1) >> 2;
// don't randomize, parameter value for Init()
const size_t randomizeNone = 0;
// StartMinibatchLoop default parameter, sets the number of requested
// frames equal to the constant 0x3fffffffffffffff computed by ((size_t) -1) >> 2 above.
// We use this constant as a stand-in for the total number of frames in the dataset.
const size_t requestDataSize = randomizeAuto;
enum EndDataType
{
@@ -48,7 +55,7 @@ class DATAREADER_API IDataReader
{
public:
typedef std::string LabelType;
typedef unsigned LabelIdType;
typedef unsigned int LabelIdType;
unsigned m_seed;
size_t mBlgSize; /// number of utterances per minibatch
bool mDoRandomize;
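The replacement comment above pins down the value of randomizeAuto. As a quick check, a minimal standalone sketch (assuming a 64-bit size_t, where (size_t)-1 is 0xffffffffffffffff):

#include <cstdio>
#include <cstddef>

int main()
{
    // ((size_t)-1) >> 2 drops the top two bits of the all-ones pattern,
    // giving 0x3fffffffffffffff on a 64-bit platform.
    const size_t randomizeAuto = ((size_t)-1) >> 2;
    std::printf("%zx\n", randomizeAuto); // prints 3fffffffffffffff
    return 0;
}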

View file

@@ -612,6 +612,7 @@ public:
// pop out of content level
contentLevel = false;
}
if (quoteFound)
{
// skip the closing quote
@@ -660,7 +661,7 @@ public:
std::string ReadConfigFiles(const std::string& filePaths);
std::string ReadConfigFiles(const std::wstring& filePaths);
std::string ResolveIncludeStatements(const std::string& configString, std::vector<std::string>& resolvedConfigFiles);
void LoadConfigFile(const std::wstring & filePath);
void LoadConfigFile(const std::wstring& filePath);
void LoadConfigFileAndResolveVariables(const std::wstring& filePath, const ConfigParameters& config);
void LoadConfigFiles(const std::wstring& filePaths, const std::string* configStringToAppend = nullptr);
@@ -873,17 +874,17 @@ public:
}
// Insert - insert an 'name=value' string into the dictionary
void Insert(const std::string &str)
void Insert(const std::string& str)
{
ParseValue(str, 0, str.length());
}
bool Exists(const std::wstring & name) const
bool Exists(const std::wstring& name) const
{
return Exists(msra::strfun::utf8(name));
}
bool Exists(const std::string & name) const
bool Exists(const std::string& name) const
{
if (find(name) != end())
{
@@ -899,42 +900,42 @@ public:
}
// ExistsCurrent - check to see if a key exists in THIS config, don't check parent
bool ExistsCurrent(const std::string & name) const
bool ExistsCurrent(const std::string& name) const
{
return (find(name) != end());
}
// dict(name, default) for strings
ConfigValue operator()(const std::wstring & name,
const wchar_t *defaultvalue) const
ConfigValue operator()(const std::wstring& name,
const wchar_t* defaultvalue) const
{
return operator()(msra::strfun::utf8(name), defaultvalue);
}
// dict(name, default) for strings
ConfigValue operator()(const std::string & name,
const wchar_t *defaultvalue) const
ConfigValue operator()(const std::string& name,
const wchar_t* defaultvalue) const
{
return operator()(name, msra::strfun::utf8(defaultvalue).c_str());
}
// dict(name, default) for strings
ConfigValue operator()(const std::wstring & name,
const char *defaultvalue) const
ConfigValue operator()(const std::wstring& name,
const char* defaultvalue) const
{
return operator()(msra::strfun::utf8(name), defaultvalue);
}
// dict(name, default) for strings
ConfigValue operator()(const std::string & name,
const char *defaultvalue) const
ConfigValue operator()(const std::string& name,
const char* defaultvalue) const
{
ConfigValue value = Find(name, defaultvalue);
return value;
}
ConfigValue Find(const std::string & name,
const char *defaultvalue = NULL) const
ConfigValue Find(const std::string& name,
const char* defaultvalue = NULL) const
{
auto iter = find(name);
ConfigValue result;
@@ -975,10 +976,11 @@ public:
// any whitespace characters. If an opening "$" is found without a closing "$", an exception is thrown.
// configString - the string that you would like to resolve variables in.
// returns: A copy of 'configString' with all the variables resolved.
std::string ResolveVariablesInSingleLine(const std::string &configLine) const
std::string ResolveVariablesInSingleLine(const std::string& configLine) const
{
// ensure that this method was called on a single line (eg, no newline characters exist in 'configLine').
if (configLine.find_first_of("\n") != std::string::npos) {
if (configLine.find_first_of("\n") != std::string::npos)
{
throw std::logic_error(
"\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character");
}
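The guard above enforces the single-line contract before any variable substitution happens. The $name$ resolution it protects can be sketched independently; here a plain std::map stands in for the ConfigParameters dictionary (an assumption for illustration, not the class's real lookup):

#include <map>
#include <stdexcept>
#include <string>

// Minimal sketch of single-line $variable$ resolution: copy characters
// through, and replace each $name$ span with its value from 'vars'.
// An opening "$" without a closing "$" is an error, mirroring the
// behavior documented above.
std::string ResolveLine(const std::string& line,
                        const std::map<std::string, std::string>& vars)
{
    std::string out;
    for (size_t i = 0; i < line.size();)
    {
        if (line[i] != '$') { out += line[i++]; continue; }
        size_t close = line.find('$', i + 1);
        if (close == std::string::npos)
            throw std::logic_error("opening \"$\" without a closing \"$\"");
        out += vars.at(line.substr(i + 1, close - i - 1)); // throws if undefined
        i = close + 1;
    }
    return out;
}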
@@ -1053,7 +1055,7 @@ public:
// we shouldn't insert newlines where they didn't already exist.
// configString - the string that you would like to resolve variables in.
// returns: A copy of 'configString' with all the variables resolved.
std::string ResolveVariables(const std::string &configString) const
std::string ResolveVariables(const std::string& configString) const
{
std::string newConfigString;
if (configString.find_first_of("\n") != std::string::npos)
@@ -1347,14 +1349,14 @@ class argvector: public std::vector<T>
RuntimeError("argvector: invalid arg value");
}
}
static void parse(const std::wstring & in, std::wstring & val)
static void parse(const std::wstring& in, std::wstring& val)
{
val = in;
}
public:
// constructor --construct empty, then assign a wstring from command-line argument
void operator=(const std::wstring & arg)
void operator=(const std::wstring& arg)
{
clear();
// separate the arguments
@@ -1387,7 +1389,7 @@ public:
}
// constructor --use this for setting default values
argvector(const std::wstring & arg)
argvector(const std::wstring& arg)
{
*this = arg;
}
@@ -1438,7 +1440,7 @@ public:
}
// we give full read access to the vector, so we can use it bounded as well
const std::vector<T> & tovector() const
const std::vector<T>& tovector() const
{
return *this;
}
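The operator= overload above splits a command-line argument into per-epoch values. A minimal sketch of the colon-separated splitting it performs (ignoring the real class's type conversion and any repetition syntax):

#include <string>
#include <vector>

// Split an argument such as L"256:1024" on ':' into per-epoch values;
// consumers of argvector read it bounded, reusing the last value for
// later epochs, which is how a short list can cover a long training run.
std::vector<std::wstring> SplitArg(const std::wstring& arg)
{
    std::vector<std::wstring> vals;
    size_t start = 0;
    while (start <= arg.size())
    {
        size_t sep = arg.find(L':', start);
        if (sep == std::wstring::npos) sep = arg.size();
        vals.push_back(arg.substr(start, sep - start));
        start = sep + 1;
    }
    return vals;
}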

View file

@@ -2049,7 +2049,7 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
}
if (curDevId != CPUDEVICE)
{
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
labels->TransferFromDeviceToDevice(CPUDEVICE, curDevId, false, false, false);
}
}

View file

@@ -1725,6 +1725,102 @@ numBestSearchEpoch
\end_layout
\begin_layout Standard
Used in the Adaptive Minibatch Sizing mode.
\end_layout
\begin_layout Itemize
\emph on
numMiniBatch4LRSearch
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
numMiniBatch4LRSearch
\end_layout
\end_inset
: the number of minibatches used to search the minibatch size when in
adaptive minibatch size mode.
Default value is 500.
It is typically set to 10-20% of the total number of minibatches in an
epoch; the setting is shared with the learning-rate search in
SearchBeforeEpoch mode.
\end_layout
\begin_layout Itemize
\emph on
autoAdjustMinibatch
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
autoAdjustMinibatch
\end_layout
\end_inset
: enables or disables adaptive adjustment of the minibatch size.
Default value is false.
Adaptive minibatch sizing begins once the epochs whose minibatch sizes
were explicitly specified by the user are complete. For example, if the
user specified minibatchSize=256:1024, then 256 and 1024 are used in the
first two epochs and adaptive minibatch sizing is used afterwards.
The overall schedule is sketched after this parameter list.
\end_layout
\begin_layout Itemize
\emph on
minibatchSizeTuningFrequency
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
minibatchSizeTuningFrequency
\end_layout
\end_inset
: the number of epochs to skip, on a periodic basis, before dynamically
adjusting the minibatch size; that is, the size is re-tuned once every
minibatchSizeTuningFrequency epochs.
Default value is 1.
\end_layout
\begin_layout Itemize
\emph on
minibatchSizeTuningMax
\emph default
\begin_inset Index idx
status open
\begin_layout Plain Layout
minibatchSizeTuningMax
\end_layout
\end_inset
: the maximum value allowed for an adaptively adjusted minibatch size.
Default value is 1048576.
\end_layout
\end_deeper
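Taken together, the four parameters above define when adaptive minibatch sizing is active and how large the minibatch may grow. A hypothetical sketch of that schedule (illustrative only, not the SGD implementation; parameter names mirror the documentation, and the search itself, rerun every minibatchSizeTuningFrequency epochs, is elided):

#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative schedule: explicitly listed sizes (e.g. 256:1024) cover
// the first epochs; afterwards, if autoAdjustMinibatch is enabled, the
// most recently tuned size is used, capped at minibatchSizeTuningMax.
size_t MinibatchSizeForEpoch(size_t epoch,
                             const std::vector<size_t>& explicitSizes,
                             bool autoAdjustMinibatch,
                             size_t minibatchSizeTuningMax,
                             size_t lastTunedSize)
{
    if (epoch < explicitSizes.size())
        return explicitSizes[epoch];      // user-specified epochs come first
    if (!autoAdjustMinibatch)
        return explicitSizes.back();      // no adaptation requested
    return std::min(lastTunedSize, minibatchSizeTuningMax);
}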
\begin_layout Subsubsection
Gradient control

File diff not shown because of its large size.

View file

@@ -978,18 +978,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void EvaluateThisNode() //-sum(left_i * log(softmax_i(right)))
{
if (m_evalMode == NCEEvalMode::Softmax || Inputs(0)->FunctionValues().GetNumRows() == 1)
int positive = 0, negative = 0;
if (Inputs(0)->FunctionValues().GetNumRows() == 1)
{
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
{
if (Inputs(0)->FunctionValues()(0, i) > 0)
positive++;
else if (Inputs(0)->FunctionValues()(0, i) < 0)
negative++;
}
assert(positive * negative == 0);
}
if (m_evalMode == NCEEvalMode::Softmax || (Inputs(0)->FunctionValues().GetNumRows() == 1 && positive > 0))
{
// evaluation uses softmax
m_logSoftmax.AssignProductOf(Inputs(1)->FunctionValues(), true, Inputs(2)->FunctionValues(), false);
m_logSoftmax += Inputs(3)->FunctionValues();
m_logSoftmax.InplaceLogSoftmax(false);
FunctionValues().Resize(1, 1);
FunctionValues().SetValue(0);
for (int i = 0; i < Inputs(0)->FunctionValues().GetNumCols(); i++)
FunctionValues()(0, 0) -= m_logSoftmax(i, (size_t)Inputs(0)->FunctionValues()(0, i));
FunctionValues().AssignSoftmaxSum(Inputs(0)->FunctionValues(), m_logSoftmax);
}
else if (m_evalMode == NCEEvalMode::Unnormalized)
else if (m_evalMode == NCEEvalMode::Unnormalized || (Inputs(0)->FunctionValues().GetNumRows() == 1 && negative > 0))
{
FunctionValues().AssignNceUnnormalizedEval(Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues(), Inputs(2)->FunctionValues(), Inputs(3)->FunctionValues());
}
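The counting loop added above keys off a sign convention (also noted in the CUDA kernel comments later in this commit): with a single label row, positive entries are true samples and negative entries are noise samples, and a minibatch must not mix the two. A minimal sketch of that dispatch:

#include <cassert>
#include <vector>

enum class EvalPath { Softmax, UnnormalizedNce };

// With one label row, positive values select the softmax evaluation path
// and negative values the unnormalized NCE path; mixing both kinds in a
// single minibatch is a usage error (hence the assert, as above).
EvalPath ChooseEvalPath(const std::vector<double>& labelRow)
{
    int positive = 0, negative = 0;
    for (double v : labelRow)
    {
        if (v > 0) positive++;
        else if (v < 0) negative++;
    }
    assert(positive * negative == 0);
    return positive > 0 ? EvalPath::Softmax : EvalPath::UnnormalizedNce;
}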

View file

@@ -3881,6 +3881,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c);
}
template<class ElemType>
void CPUMatrix<ElemType>::AssignSoftmaxSum(const CPUMatrix<ElemType>& softmax, CPUMatrix<ElemType>& c)
{
ElemType log_likelihood = 0.0;
size_t batch_size = this->GetNumCols();
#pragma omp parallel for reduction(+:log_likelihood)
for (int instance_id = 0; instance_id < (int)batch_size; instance_id++)
{
int sample = (int)(*this)(0, instance_id);
log_likelihood += softmax(instance_id, sample);
}
c(0, 0) = -log_likelihood;
}
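The loop above accumulates the log-softmax value of each instance's target class and negates the sum. A naive single-threaded reference with the same convention (one row per instance, one class index per instance):

#include <vector>

// Reference for the objective computed by AssignSoftmaxSum:
// loss = -sum_i logSoftmax[i][labels[i]], with logSoftmax stored
// row-per-instance. Useful as a check against the OpenMP version above.
double SoftmaxSumReference(const std::vector<std::vector<double>>& logSoftmax,
                           const std::vector<int>& labels)
{
    double logLikelihood = 0.0;
    for (size_t i = 0; i < labels.size(); i++)
        logLikelihood += logSoftmax[i][labels[i]];
    return -logLikelihood;
}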
template<class ElemType>
void CPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a,

View file

@@ -217,6 +217,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias,
CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c);
void AssignSoftmaxSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& softmax);
void AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a,
const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& c);

View file

@@ -1957,7 +1957,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
UNCONST(ElemType, a, my_a);
cudaEvent_t done = nullptr;
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
int p = 512;
int width = a.GetNumRows();
while (p / 2 > width) p = p / 2;
_assignSoftmaxSum<ElemType><<<1, p>>>(
my_a.GetArray(),
width,
GetArray(),
c.GetArray()
);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{

View file

@@ -295,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t sampleCount, GPUMatrix<ElemType>& tmp, GPUMatrix<ElemType>& c);
void AssignNCEDerivative(GPUMatrix<ElemType>& tmp, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, size_t inputIndex, GPUMatrix<ElemType>& c);
void AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c);
void AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& softmax);
void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
void Print(const char* matrixName = NULL) const; //print whole matrix. can be expensive

View file

@@ -2932,6 +2932,59 @@ __global__ void _computeNceOutput(
}
}
template<class ElemType>
__global__ void _assignSoftmaxSum(
const ElemType* softmax,
int sampleCount,
const ElemType* a,
ElemType* c) // run on 512 threads per block
{
// softmax is the log-softmax matrix, stored in column-major format with
// one row per instance and one column per word
// a holds, for each of the sampleCount instances, the (ElemType-encoded)
// index of its target word; in the label matrix, positive values mark true
// samples and negative values mark noise samples, as in the NCE kernels
// c is the 1x1 output that receives the objective: the negated sum of the
// per-instance log-softmax values accumulated below
__shared__ ElemType partials[512];
partials[threadIdx.x] = 0;
int total = sampleCount;
int loadPerThread = (total + blockDim.x - 1) / blockDim.x;
// find out the items this thread is responsible for
int start = loadPerThread * threadIdx.x;
int end = min(total, loadPerThread * (threadIdx.x + 1));
for (int i = start; i < end; i++)
{
int wid = (int)a[i];
partials[threadIdx.x] += softmax[IDX2C(i, wid, sampleCount)];
}
__syncthreads();
// now sum up the objective function
int nTotalThreads = blockDim.x;
while (nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1);
if (threadIdx.x < halfPoint)
partials[threadIdx.x] += partials[threadIdx.x + halfPoint];
__syncthreads();
nTotalThreads = (nTotalThreads >> 1);
}
if (threadIdx.x == 0)
c[0] = -partials[0];
}
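The halving loop above is the standard shared-memory tree reduction. Its CPU analogue, for reference (assuming a non-empty, power-of-two length, matching the 512-thread block used here):

#include <vector>

// CPU analogue of the in-block tree reduction in _assignSoftmaxSum:
// fold the upper half of the partials onto the lower half until a single
// sum remains. Requires a non-empty, power-of-two-sized input.
double TreeReduce(std::vector<double> partials)
{
    for (size_t n = partials.size(); n > 1; n >>= 1)
    {
        size_t half = n >> 1;
        for (size_t i = 0; i < half; i++)
            partials[i] += partials[i + half];
    }
    return partials[0];
}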
template<class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
const ElemType* val,

View file

@@ -747,9 +747,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define NUM_MATRIXTYPE_CHANGED_WARN 20
m_numTimesMatrixTypeChanged++;
if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
fprintf(stderr, "WARNING: The same matrix with dim [%d, %d] has been transferred between different devices for %d times.\n", GetNumRows(), GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
{
fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
}
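The reworked warning also fixes a format-string mismatch: GetNumRows() and GetNumCols() return size_t, which %d does not portably accept. Casting to unsigned long with %lu is the maximally portable fix; where C99/C++11 is available, %zu also works (a small sketch):

#include <cstdio>
#include <cstddef>

int main()
{
    size_t rows = 1024, cols = 2048;
    // Both lines print the same dimensions; the cast form compiles on
    // pre-C99 toolchains, while %zu is the dedicated size_t conversion.
    std::printf("dim [%lu, %lu]\n", (unsigned long)rows, (unsigned long)cols);
    std::printf("dim [%zu, %zu]\n", rows, cols);
    return 0;
}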
if (GetDeviceId()<0) //CPU
{
if (newMatrixType==MatrixType::SPARSE)
@@ -1241,14 +1243,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (numRows != GetNumRows() || numCols != GetNumCols())
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->Reshape(numRows, numCols),
m_GPUMatrix->Reshape(numRows, numCols),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
}
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->Reshape(numRows, numCols),
m_GPUMatrix->Reshape(numRows, numCols),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
}
}
template<class ElemType>
@@ -3667,6 +3669,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return *this;
}
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSoftmaxSum(const Matrix<ElemType>& a, const Matrix<ElemType>& softmax)
{
this->Resize(1, 1);
if (this->GetDeviceId() < 0)
a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *this->m_CPUMatrix);
else
a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *this->m_GPUMatrix);
return *this;
}
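Matrix::AssignSoftmaxSum above follows the codebase's device convention, visible earlier in this file: a negative device id means CPU, anything else GPU. A stripped-down sketch of that dispatch idiom (hypothetical helper, for illustration only):

// A negative device id selects the CPU implementation, anything else the
// GPU one. The real code calls the member functions directly.
template <class CpuFn, class GpuFn>
void DispatchOnDevice(int deviceId, CpuFn cpu, GpuFn gpu)
{
    if (deviceId < 0)
        cpu();   // e.g. m_CPUMatrix->AssignSoftmaxSum(...)
    else
        gpu();   // e.g. m_GPUMatrix->AssignSoftmaxSum(...)
}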
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNceUnnormalizedEval(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias)
{
@@ -4454,7 +4468,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// Matrix<ElemType>& Matrix<ElemType>::Shift(const Matrix<ElemType>& a, size_t shift)
//Matrix<ElemType>& Matrix<ElemType>::Shift(const Matrix<ElemType>& a, size_t shift)
//[this]= (a right shift by n), padded with zeros
// shift left, shift needs to be negative value
// shift right, shift needs to be positive value

View file

@@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType>& AssignNoiseContrastiveEstimation(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias, Matrix<ElemType>& tmp);
Matrix<ElemType>& AssignNCEDerivative(const Matrix<ElemType>& tmp, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, size_t inputIndex);
Matrix<ElemType>& AssignSoftmaxSum(const Matrix<ElemType>& a, const Matrix<ElemType>& softmax);
Matrix<ElemType>& AssignNceUnnormalizedEval(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias);
Matrix<ElemType> Transpose(); // This method doesn't change state of Matrix. It should be a const function

View file

@@ -1070,6 +1070,10 @@ namespace Microsoft {
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& c)
{
}
template<class ElemType>
void GPUMatrix<ElemType>::AssignNCEUnnormalizedEval(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)