Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/mergeHTKMLFReaders

2015-09-01 13:27:19 -07:00 · 2015-09-01 13:27:19 -07:00 · 8e85f07de3
--- a/MachineLearning/CNTK/ConvolutionalNodes.h
+++ b/MachineLearning/CNTK/ConvolutionalNodes.h
--- a/2
+++ b/2
@ -169,6 +169,7 @@ MATH_SRC =\
 	Math/Math/MatrixQuantizerCPU.cpp \
 	Math/Math/QuantizedMatrix.cpp \
 	Math/Math/Matrix.cpp \
+	Math/Math/CUDAPageLockedMemAllocator.cpp \

 ifdef CUDA_PATH
 MATH_SRC +=\
@ -176,7 +177,6 @@ MATH_SRC +=\
 	Math/Math/GPUMatrixCUDAKernels.cu \
 	Math/Math/GPUSparseMatrix.cu \
 	Math/Math/GPUWatcher.cu \
-	Math/Math/CUDAPageLockedMemAllocator.cpp \
 	Math/Math/MatrixQuantizerGPU.cu \

 else
--- a/Math/Math/CNTKMathCUDA.vcxproj
+++ b/Math/Math/CNTKMathCUDA.vcxproj
@ -152,10 +152,6 @@
      <ExcludedFromBuild>true</ExcludedFromBuild>
      <FileType>CppCode</FileType>
    </CudaCompile>
-    <CudaCompile Include="ValueQuantizer.cu">
-      <ExcludedFromBuild>true</ExcludedFromBuild>
-      <FileType>CppCode</FileType>
-    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="GPUMatrix.cu">
--- a/Math/Math/CNTKMathCUDA.vcxproj.filters
+++ b/Math/Math/CNTKMathCUDA.vcxproj.filters
@ -16,9 +16,6 @@
    <CudaCompile Include="MatrixQuantizerGPU.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
-    <CudaCompile Include="ValueQuantizer.cu">
-      <Filter>GPU\1bitSGD</Filter>
-    </CudaCompile>
    <CudaCompile Include="MatrixQuantizer_kernel.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
--- a/Math/Math/CUDAPageLockedMemAllocator.cpp
+++ b/Math/Math/CUDAPageLockedMemAllocator.cpp
@ -1,9 +1,13 @@
 #include "stdafx.h"
 #include "CUDAPageLockedMemAllocator.h"
+#ifndef CPUONLY
 #include <cuda_runtime_api.h>
+#endif // !CPUONLY
+#include "BestGpu.h"

 namespace Microsoft { namespace MSR { namespace CNTK {

+#ifndef CPUONLY
    CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
        : m_deviceID(deviceID)
    {
@ -25,4 +29,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaSetDevice(m_deviceID);
        cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
    }
+
+    // Returns the device ID that was handed to the constructor and stored in m_deviceID.
+    int CUDAPageLockedMemAllocator::GetDeviceID() const
+    {
+        return m_deviceID;
+    }
+#else
+    // Dummy definitions when compiling for CPUONLY: the CUDA runtime header is not included,
+    // so these members never allocate anything and report -1 as the device.
+
+    // The requested device is ignored; m_deviceID is set to the same -1 that GetDeviceID()
+    // reports so that the member is never left uninitialized.
+    CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int)
+        : m_deviceID(-1)
+    {
+    }
+
+    // Always reports -1 in CPUONLY builds.
+    int CUDAPageLockedMemAllocator::GetDeviceID() const
+    {
+        return -1;
+    }
+
+    // Never allocates in CPUONLY builds; always returns NULL.
+    char* CUDAPageLockedMemAllocator::Malloc(size_t)
+    {
+        return NULL;
+    }
+
+    // Nothing to release in CPUONLY builds, since Malloc() never hands out memory.
+    void CUDAPageLockedMemAllocator::Free(char*)
+    {
+    }
+#endif
 }}}
--- a/Math/Math/CUDAPageLockedMemAllocator.h
+++ b/Math/Math/CUDAPageLockedMemAllocator.h
@ -19,11 +19,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    public:
        CUDAPageLockedMemAllocator(int deviceID);

-        int GetDeviceID() const
-        {
-            return m_deviceID;
-        }
-
+        int GetDeviceID() const;
        char* Malloc(size_t size) override;
        void Free(char* p) override;

--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
--- a/Math/Math/MatrixQuantizerGPU.h
+++ b/Math/Math/MatrixQuantizerGPU.h
@ -4,8 +4,10 @@
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
 #include "GPUMatrix.h"
+#ifndef CPUONLY
 #include <cuda_runtime_api.h>
 #include <cuda.h>  
+#endif // !CPUONLY
 #include <vector>
 #include <memory>

@ -34,6 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // Helper function to get a temporary intermediate matrix on the GPU to store quantization results
        QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
        
+#ifndef CPUONLY
        // Record an event to flag the completion of quantization/unquantization kernel on the compute stream
        void RecordQuantizeCompleteEvent(cudaStream_t computestream) const;

@ -68,7 +71,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        mutable cudaEvent_t m_quantizeCompleteEvent;
        mutable cudaEvent_t m_fetchCompleteEvent;
        mutable cudaEvent_t m_assignCompleteEvent;
+#endif // !CPUONLY

+    private:
        bool m_forceSync;
        bool m_quantizeOpIncludedFetch;

--- a/Math/Math/MatrixQuantizer_kernel.cu
+++ b/Math/Math/MatrixQuantizer_kernel.cu
@ -7,7 +7,6 @@
 #include <device_launch_parameters.h>

 #include "ValueQuantizer.h"
-#include "ValueQuantizer.cu"
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"

--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@ -11,6 +11,7 @@

 #include "GPUMatrix.h"
 #include "GPUSparseMatrix.h"
+#include "MatrixQuantizerGPU.h"

 #pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
 #pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
@ -355,6 +356,7 @@ namespace Microsoft {

 #pragma endregion Helper Functions

+            template class GPUSparseMatrix<char>;
            template class GPUSparseMatrix<float>;
            template class GPUSparseMatrix<double>;

@ -477,6 +479,7 @@ namespace Microsoft {

            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }

+            // Dummy for builds without GPU support: ignores all arguments and returns *this unchanged.
+            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }

            //for each column of a, we assign numRows starting from startIndex to this
            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
@ -1082,10 +1085,50 @@ namespace Microsoft {
            }
 #pragma endregion Static BLAS Functions

+#pragma region MatrixQuantizerGPU functions
+            // Dummy MatrixQuantizerGPU members for builds without GPU support; every body is empty.
+
+            // Only forwards inMatrix to the MatrixQuantizer base class; forceSync is ignored.
+            // NOTE(review): m_forceSync and m_quantizeOpIncludedFetch are not set by this dummy
+            // constructor - confirm nothing reads them in CPUONLY builds.
+            template<class ElemType>
+            MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(const Matrix<ElemType>& inMatrix, bool forceSync)
+                : MatrixQuantizer<ElemType>(inMatrix)
+            {
+            }
+
+            // Nothing to clean up in the dummy implementation.
+            template<class ElemType>
+            MatrixQuantizerGPU<ElemType>::~MatrixQuantizerGPU()
+            {
+            }
+
+            // No-op: outQMatrix is left untouched.
+            template<class ElemType>
+            void MatrixQuantizerGPU<ElemType>::QuantizeAsync(QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
+            {
+            }
+
+            // No-op: returns immediately.
+            template<class ElemType>
+            void MatrixQuantizerGPU<ElemType>::WaitQuantizeAsyncDone()
+            {
+            }
+
+            // No-op: outMatrix is left untouched regardless of 'add'.
+            template<class ElemType>
+            void MatrixQuantizerGPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add /*= false*/)
+            {
+            }
+
+            // No-op: returns immediately.
+            template<class ElemType>
+            void MatrixQuantizerGPU<ElemType>::WaitUnquantizeAsyncDone()
+            {
+            }
+#pragma endregion MatrixQuantizerGPU functions
+
+            // Explicit instantiations of the dummy implementations defined in this file.
+            template class GPUMatrix<char>;
            template class GPUMatrix<float>;
            template class GPUMatrix<double>;
            template class DeviceBoundNumber<float>;
            template class DeviceBoundNumber<double>;
+            // For MatrixQuantizerGPU only the constructor, the destructor and QuantizeAsync are instantiated explicitly.
+            // NOTE(review): WaitQuantizeAsyncDone, UnquantizeAsync and WaitUnquantizeAsyncDone have no explicit
+            // instantiation here - confirm they are instantiated some other way, otherwise CPUONLY links may miss them.
+            template MatrixQuantizerGPU<float>::MatrixQuantizerGPU(const Matrix<float>&, bool forceSync);
+            template MatrixQuantizerGPU<double>::MatrixQuantizerGPU(const Matrix<double>&, bool forceSync);
+            template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
+            template MatrixQuantizerGPU<double>::~MatrixQuantizerGPU();
+            template void MatrixQuantizerGPU<float>::QuantizeAsync(QuantizedMatrix<float>&, bool);
+            template void MatrixQuantizerGPU<double>::QuantizeAsync(QuantizedMatrix<double>&, bool);

            template<class ElemType> cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };

--- a/Math/Math/ValueQuantizer.cu
+++ b/Math/Math/ValueQuantizer.cu
@ -1,153 +0,0 @@
-#ifndef __VALLUE_QUANTIZER_CUH__
-#define __VALLUE_QUANTIZER_CUH__
-
-#include "stdafx.h"
-#include "ValueQuantizer.h"
-
-namespace Microsoft { namespace MSR { namespace CNTK {
-
-    template<class ElemType>
-    cudasharedcode
-    ValueQuantizer<ElemType>::ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper) 
-    : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
-    {
-        rangeend = ((QWordVal)1) << Nbits;
-
-        // post-fix for incorrect shift for no-quant hack (Nbits=32): << arg is taken mod 32!
-        // in this case, it's only used as (rangeend-1) which is now correct (before it was 0!)
-        if (Nbits >= (8 * sizeof(rangeend)))
-        {
-            rangeend = 0;
-        }
-
-        // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
-        if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
-        {
-            qfactor = ufactor = (ElemType)0.0;
-        }
-        else
-        {
-            // precompute this for quantize() (see comment there)
-            qfactor = rangeend / (quantimax - quantimin);   
-            // and for unquantize()
-            ufactor = (quantimax - quantimin) / rangeend;   
-        }
-
-        // set the quantization threshold for the special case of 1-bit
-        quantimid = 0.5f * (quantimax + quantimin);
-    }
-
-    // quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
-    template<class ElemType>
-    cudasharedcode ValueQuantizer<ElemType>::QWordVal
-    ValueQuantizer<ElemType>::QuantizeToFullQWord(ElemType u) const
-    {
-        assert(Nbits == QWordNumBits);
-        
-        // we return the bit pattern that encodes the float value
-        return *(QWordVal*)&u;
-    }
-
-    // quantize one value --special version for 1 bit
-    template<class ElemType>
-    template<bool ZeroThresholdFor1Bit>
-    cudasharedcode bool
-    ValueQuantizer<ElemType>::Quantize1(ElemType u) const
-    {
-        assert (Nbits == 1);
-        if (!ZeroThresholdFor1Bit)
-        {
-            return u >= quantimid;
-        }
-        else
-        {
-            return u >= (ElemType)0.0;
-        }
-    }
-
-    // quantize one value
-    // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
-    template<class ElemType>
-    template<bool ZeroThresholdFor1Bit>
-    cudasharedcode ValueQuantizer<ElemType>::QWordVal
-    ValueQuantizer<ElemType>::Quantize(ElemType u) const
-    {
-        if (Nbits == QWordNumBits)
-        {
-            return QuantizeToFullQWord(u);
-        }
-        // TODO: we may need to optimize this by a template arg
-        else if (ldNbits == 0)
-        {
-            return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
-        }
-        else
-        {
-            if (u <= quantimin)
-            {
-                return 0;
-            }
-            else if (u >= quantimax)
-            {
-                return (rangeend - 1);
-            }
-            else
-            {
-                return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
-            }
-        }
-    }
-
-    // unquantize one value
-    template<class ElemType>
-    cudasharedcode  
-    ElemType ValueQuantizer<ElemType>::Unquantize(QWordVal u) const
-    {
-        if (Nbits == QWordNumBits)
-        {
-            return *(ElemType*)&u;
-        }
-        
-        // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
-        return ((u + (ElemType)0.5) * ufactor) + quantimin;
-    }
-
-    // unquantize one value  --special case for 1 bit
-    template<class ElemType>
-    cudasharedcode 
-    ElemType ValueQuantizer<ElemType>::Unquantize1(bool u, ElemType val0, ElemType val1)
-    {
-        return u ? val1 : val0;
-    }
-
-    // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits'
-    template<class ElemType>
-    size_t ValueQuantizer<ElemType>::ld(size_t v)
-    {
-        if (v == 1)
-        {
-            return 0;
-        }
-        else if (v & 1) // not a power of two
-        {
-            throw std::runtime_error("ld: 'bits' must be a power of two");
-        }
-        else
-        {
-            return 1 + ld (v >> 1);
-        }
-    }
-    
-    // Explicit instantiation
-    template class ValueQuantizer<float>;
-    template class ValueQuantizer<double>;
-    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<true>(float u) const;
-    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<false>(float u) const;
-    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<true>(double u) const;
-    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<false>(double u) const;
-    template bool ValueQuantizer<float>::Quantize1<true>(float u) const;
-    template bool ValueQuantizer<float>::Quantize1<false>(float u) const;
-    template bool ValueQuantizer<double>::Quantize1<true>(double u) const;
-    template bool ValueQuantizer<double>::Quantize1<false>(double u) const;
-}}}
-#endif
--- a/Math/Math/ValueQuantizer.h
+++ b/Math/Math/ValueQuantizer.h
@ -2,10 +2,17 @@
 #ifndef __VALLUE_QUANTIZER_H__
 #define __VALLUE_QUANTIZER_H__

+#ifndef CPUONLY
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <device_launch_parameters.h>
+#endif // !CPUONLY
+
+#include <cassert>
+#include <stdexcept>
+
+#pragma warning (disable: 4127) // conditional expression is constant

 namespace Microsoft { namespace MSR { namespace CNTK {
    
@ -55,17 +62,98 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        static const size_t QWordNumBits = 8 * sizeof(QWord);

    public:
-        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper);
+        // Sets up a quantizer for the value range [lower, upper] using Nbits = 2^ldNbits bits per value.
+        // Precomputes rangeend (number of quantization levels, or 0 in the full-width case),
+        // the scale factors qfactor/ufactor, and the 1-bit threshold quantimid.
+        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
+            : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
+        {
+            // No-quant hack (Nbits covers the full width of rangeend): shifting by the full width
+            // or more is undefined behavior (typically the shift count is taken mod 32), so the
+            // width is checked BEFORE shifting. In this case rangeend is 0 and is only used as
+            // (rangeend-1), which is then correct.
+            if (Nbits >= (8 * sizeof(rangeend)))
+            {
+                rangeend = 0;
+            }
+            else
+            {
+                rangeend = ((QWordVal)1) << Nbits;
+            }
+
+            // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
+            if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
+            {
+                qfactor = ufactor = (ElemType)0.0;
+            }
+            else
+            {
+                // precompute this for quantize() (see comment there)
+                qfactor = rangeend / (quantimax - quantimin);
+                // and for unquantize()
+                ufactor = (quantimax - quantimin) / rangeend;
+            }
+
+            // set the quantization threshold for the special case of 1-bit
+            quantimid = 0.5f * (quantimax + quantimin);
+        }
+
+        // Quantizes a single value u to its integer code:
+        //  - full-width case (Nbits == QWordNumBits): delegates to QuantizeToFullQWord()
+        //  - 1-bit case (ldNbits == 0): 1 or 0 as decided by Quantize1()
+        //  - otherwise: u <= quantimin gives 0, u >= quantimax gives rangeend - 1, and anything
+        //    in between is (u - quantimin) * qfactor converted to an integer
+        // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
        template<bool ZeroThresholdFor1Bit>
-        cudasharedcode QWordVal Quantize(ElemType u) const;
+        cudasharedcode QWordVal Quantize(ElemType u) const
+        {
+            // bypass: no actual quantization
+            if (Nbits == QWordNumBits)
+                return QuantizeToFullQWord(u);
+
+            // TODO: we may need to optimize this by a template arg
+            if (ldNbits == 0)
+                return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
+
+            // clamp at both ends of the range before scaling
+            if (u <= quantimin)
+                return 0;
+            if (u >= quantimax)
+                return (rangeend - 1);
+
+            return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
+        }

-        cudasharedcode ElemType Unquantize(QWordVal u) const;
+        // Maps a quantized code u back to an ElemType value.
+        cudasharedcode ElemType Unquantize(QWordVal u) const
+        {
+            // full-width case: reinterpret the bits of u as an ElemType, no scaling involved
+            const bool isFullWidth = (Nbits == QWordNumBits);
+            if (isFullWidth)
+                return *(ElemType*)&u;
+
+            // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
+            return ((u + (ElemType)0.5) * ufactor) + quantimin;
+        }
+
+        // 1-bit quantization of u: true when u is at or above the threshold. The threshold is 0
+        // when ZeroThresholdFor1Bit is set, and quantimid otherwise.
        template<bool ZeroThresholdFor1Bit>
-        cudasharedcode bool Quantize1(ElemType u) const;
+        cudasharedcode bool Quantize1(ElemType u) const
+        {
+            assert(Nbits == 1);
+            if (ZeroThresholdFor1Bit)
+                return u >= (ElemType)0.0;
+            return u >= quantimid;
+        }

-        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1);
+        // 1-bit unquantization: yields val1 when u is true and val0 when u is false
+        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1)
+        {
+            if (u)
+                return val1;
+            return val0;
+        }

        //how many bits we are quantizing to
        cudasharedcode size_t NBits() const
@ -79,10 +167,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            return rangeend;
        } 
        
-        static size_t ld(size_t v);
-        
-    protected:   
-        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const;
+        // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits')
+        // Returns k such that v == 2^k.
+        // Throws std::runtime_error when v is not a power of two. v == 0 is rejected explicitly,
+        // because halving it would never reach 1.
+        static size_t ld(size_t v)
+        {
+            size_t log2v = 0;
+            for (;;)
+            {
+                if (v == 1)
+                {
+                    return log2v;
+                }
+                if ((v == 0) || (v & 1)) // not a power of two
+                {
+                    throw std::runtime_error("ld: 'bits' must be a power of two");
+                }
+                v >>= 1;
+                ++log2v;
+            }
+        }
+
+    protected:
+        // Full-width case (Nbits == QWordNumBits): quantization is bypassed, which is meant for
+        // testing/debugging purposes, and the bit pattern that encodes the value is returned.
+        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const
+        {
+            assert(Nbits == QWordNumBits);
+
+            // reinterpret the storage of u as a QWordVal; no numeric conversion takes place
+            const QWordVal* bitPattern = (const QWordVal*)&u;
+            return *bitPattern;
+        }

    protected:
        // NBits must be power of two
@ -105,4 +215,4 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        ElemType ufactor;
    };
 }}}
-#endif 
+#endif 
--- a/Tests/Speech/LSTM/testcases.yml
+++ b/Tests/Speech/LSTM/testcases.yml
@ -1,27 +0,0 @@
-dataDir: ../Data
-
-testCases:
-  CNTK Run must be completed:
-    patterns:
-      - ^COMPLETED
-
-  Must train epochs in exactly same order and parameters:
-    patterns:
-      - ^Starting Epoch {{integer}}
-      - learning rate per sample = {{float}}
-      - momentum = {{float}}
-
-  Epochs must be finished with expected results:
-    patterns:
-      - ^Finished Epoch[{{integer}}]
-      - TrainLossPerSample = {{float,tolerance=1%}}
-      - EvalErrPerSample = {{float,tolerance=1%}}
-      - Ave LearnRatePerSample = {{float,tolerance=1%}}
-
-  Per-minibatch training results must match:
-    patterns:
-      - ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
-      - SamplesSeen = {{integer}}
-      - TrainLossPerSample = {{float,tolerance=1%}}
-      - EvalErr[0]PerSample = {{float,tolerance=1%}}
-