Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/mergeHTKMLFReaders

2015-09-01 13:27:19 -07:00 · 2015-09-01 13:27:19 -07:00 · 8e85f07de3
--- a/2
+++ b/2
@ -169,6 +169,7 @@ MATH_SRC =\
 	Math/Math/MatrixQuantizerCPU.cpp \
 	Math/Math/QuantizedMatrix.cpp \
 	Math/Math/Matrix.cpp \
 	Math/Math/CUDAPageLockedMemAllocator.cpp \
 ifdef CUDA_PATH
 MATH_SRC +=\
@ -176,7 +177,6 @@ MATH_SRC +=\
 	Math/Math/GPUMatrixCUDAKernels.cu \
 	Math/Math/GPUSparseMatrix.cu \
 	Math/Math/GPUWatcher.cu \
 	Math/Math/CUDAPageLockedMemAllocator.cpp \
 	Math/Math/MatrixQuantizerGPU.cu \
 else
--- a/Math/Math/CNTKMathCUDA.vcxproj
+++ b/Math/Math/CNTKMathCUDA.vcxproj
@ -152,10 +152,6 @@
      <ExcludedFromBuild>true</ExcludedFromBuild>
      <FileType>CppCode</FileType>
    </CudaCompile>
    <CudaCompile Include="ValueQuantizer.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
      <FileType>CppCode</FileType>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="GPUMatrix.cu">
--- a/Math/Math/CNTKMathCUDA.vcxproj.filters
+++ b/Math/Math/CNTKMathCUDA.vcxproj.filters
@ -16,9 +16,6 @@
    <CudaCompile Include="MatrixQuantizerGPU.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
    <CudaCompile Include="ValueQuantizer.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
    <CudaCompile Include="MatrixQuantizer_kernel.cu">
      <Filter>GPU\1bitSGD</Filter>
    </CudaCompile>
--- a/Math/Math/CUDAPageLockedMemAllocator.cpp
+++ b/Math/Math/CUDAPageLockedMemAllocator.cpp
@ -1,9 +1,13 @@
 #include "stdafx.h"
 #include "CUDAPageLockedMemAllocator.h"
 #ifndef CPUONLY
 #include <cuda_runtime_api.h>
 #endif // !CPUONLY
 #include "BestGpu.h"
 namespace Microsoft { namespace MSR { namespace CNTK {
 #ifndef CPUONLY
    CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
        : m_deviceID(deviceID)
    {
@ -25,4 +29,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        cudaSetDevice(m_deviceID);
        cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
    }
    // Returns the device ID held in m_deviceID.
    int CUDAPageLockedMemAllocator::GetDeviceID() const
    {
        return m_deviceID;
    }
 #else
    // Dummy definitions when compiling for CPUONLY
    // Dummy constructor: the device ID argument is unnamed and ignored.
    CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int)
    {
    }
    // Dummy accessor: always returns -1.
    int CUDAPageLockedMemAllocator::GetDeviceID() const
    {
        return -1;
    }
    // Dummy allocation: ignores the requested size and always returns NULL.
    char* CUDAPageLockedMemAllocator::Malloc(size_t)
    {
        return NULL;
    }
    // Dummy release: ignores the pointer and does nothing.
    void CUDAPageLockedMemAllocator::Free(char*)
    {
    }
 #endif
 }}}
--- a/Math/Math/CUDAPageLockedMemAllocator.h
+++ b/Math/Math/CUDAPageLockedMemAllocator.h
@ -19,11 +19,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    public:
        CUDAPageLockedMemAllocator(int deviceID);
-        int GetDeviceID() const
+        int GetDeviceID() const;
        {
            return m_deviceID;
        }
        char* Malloc(size_t size) override;
        void Free(char* p) override;
--- a/Math/Math/Matrix.cpp
+++ b/Math/Math/Matrix.cpp
@ -4749,4 +4749,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template size_t Matrix<char>::GetNumElements() const;
    template Matrix<char> Matrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
    template void Matrix<char>::_transferToDevice(int id_to, bool ismoved, bool emptyTransfer) const;
    template size_t Matrix<char>::GetNumRows() const;
    template size_t Matrix<char>::GetNumCols() const;
 }}}
--- a/Math/Math/MatrixQuantizerGPU.h
+++ b/Math/Math/MatrixQuantizerGPU.h
@ -4,8 +4,10 @@
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
 #include "GPUMatrix.h"
 #ifndef CPUONLY
 #include <cuda_runtime_api.h>
 #include <cuda.h>  
 #endif // !CPUONLY
 #include <vector>
 #include <memory>
@ -34,6 +36,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // Helper function to get a temporary intermediate matrix on the GPU to store quantization results
        QuantizedMatrix<ElemType>& GetTempGPUQuantizedMatrix(size_t nBits, bool& newlyAllocated);
 #ifndef CPUONLY
        // Record an event to flag the completion of quantization/unquantization kernel on the compute stream
        void RecordQuantizeCompleteEvent(cudaStream_t computestream) const;
@ -68,7 +71,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        mutable cudaEvent_t m_quantizeCompleteEvent;
        mutable cudaEvent_t m_fetchCompleteEvent;
        mutable cudaEvent_t m_assignCompleteEvent;
 #endif // !CPUONLY
    private:
        bool m_forceSync;
        bool m_quantizeOpIncludedFetch;
--- a/Math/Math/MatrixQuantizer_kernel.cu
+++ b/Math/Math/MatrixQuantizer_kernel.cu
@ -7,7 +7,6 @@
 #include <device_launch_parameters.h>
 #include "ValueQuantizer.h"
 #include "ValueQuantizer.cu"
 #include "ColumnQuantizer.h"
 #include "QuantizedMatrix.h"
--- a/Math/Math/NoGPU.cpp
+++ b/Math/Math/NoGPU.cpp
@ -11,6 +11,7 @@
 #include "GPUMatrix.h"
 #include "GPUSparseMatrix.h"
 #include "MatrixQuantizerGPU.h"
 #pragma warning (disable: 4100) // unreferenced formal parameter, which is OK since all functions in here are dummies; disabling this allows to copy-paste prototypes here when we add new functions
 #pragma warning (disable: 4702) // unreachable code, which we get from the NOT_IMPLEMENTED macro which is OK
@ -355,6 +356,7 @@ namespace Microsoft {
 #pragma endregion Helper Functions
            template class GPUSparseMatrix<char>;
            template class GPUSparseMatrix<float>;
            template class GPUSparseMatrix<double>;
@ -477,6 +479,7 @@ namespace Microsoft {
            // Dummy (no-GPU) definitions: each of the three functions below ignores its arguments and just returns *this.
            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetColumnSlice(const GPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols) { return *this; }
            // intended operation: for each column of a, we assign numRows starting from startIndex to this
            // (this dummy performs no assignment; it only returns *this)
            template<class ElemType> GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignRowSliceValuesOf(const GPUMatrix<ElemType>& /*a*/, const size_t startIndex, const size_t numRows) { return *this; }
@ -1082,10 +1085,50 @@ namespace Microsoft {
            }
 #pragma endregion Static BLAS Functions
 #pragma region MatrixQuantizerGPU functions
            // Dummy (no-GPU) definitions of the MatrixQuantizerGPU members; every body in this region is empty.
            // Dummy constructor: passes inMatrix to the MatrixQuantizer base class; forceSync is not used.
            template<class ElemType>
            MatrixQuantizerGPU<ElemType>::MatrixQuantizerGPU(const Matrix<ElemType>& inMatrix, bool forceSync)
                : MatrixQuantizer<ElemType>(inMatrix)
            {
            }
            // Dummy destructor: nothing to release here.
            template<class ElemType>
            MatrixQuantizerGPU<ElemType>::~MatrixQuantizerGPU()
            {
            }
            // Dummy: performs no quantization; outQMatrix is left untouched.
            template<class ElemType>
            void MatrixQuantizerGPU<ElemType>::QuantizeAsync(QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
            {
            }
            // Dummy: returns immediately.
            template<class ElemType>
            void MatrixQuantizerGPU<ElemType>::WaitQuantizeAsyncDone()
            {
            }
            // Dummy: performs no unquantization; outMatrix is left untouched.
            template<class ElemType>
            void MatrixQuantizerGPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add /*= false*/)
            {
            }
            // Dummy: returns immediately.
            template<class ElemType>
            void MatrixQuantizerGPU<ElemType>::WaitUnquantizeAsyncDone()
            {
            }
 #pragma endregion MatrixQuantizerGPU functions
            template class GPUMatrix<char>;
            template class GPUMatrix<float>;
            template class GPUMatrix<double>;
            template class DeviceBoundNumber<float>;
            template class DeviceBoundNumber<double>;
            template MatrixQuantizerGPU<float>::MatrixQuantizerGPU(const Matrix<float>&, bool forceSync);
            template MatrixQuantizerGPU<double>::MatrixQuantizerGPU(const Matrix<double>&, bool forceSync);
            template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
            template MatrixQuantizerGPU<double>::~MatrixQuantizerGPU();
            template void MatrixQuantizerGPU<float>::QuantizeAsync(QuantizedMatrix<float>&, bool);
            template void MatrixQuantizerGPU<double>::QuantizeAsync(QuantizedMatrix<double>&, bool);
            template<class ElemType> cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = { 0 };
--- a/Math/Math/ValueQuantizer.cu
+++ b/Math/Math/ValueQuantizer.cu
@ -1,153 +0,0 @@
 #ifndef __VALLUE_QUANTIZER_CUH__
 #define __VALLUE_QUANTIZER_CUH__
 #include "stdafx.h"
 #include "ValueQuantizer.h"
 namespace Microsoft { namespace MSR { namespace CNTK {
    // Builds a quantizer for values in the interval [lower, upper].
    //  ldNbits - binary log of the bit count; Nbits is initialized to (1 << ldNbits)
    //  lower   - stored as quantimin
    //  upper   - stored as quantimax
    // Also precomputes rangeend, qfactor, ufactor and quantimid.
    template<class ElemType>
    cudasharedcode
    ValueQuantizer<ElemType>::ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
    : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
    {
        // rangeend is 1 << Nbits. Shifting by the full bit width (or more) is undefined behavior,
        // so the width test has to come BEFORE the shift; in that full-width (no-quant) case
        // rangeend is simply set to 0.
        if (Nbits >= (8 * sizeof(rangeend)))
        {
            rangeend = 0;
        }
        else
        {
            rangeend = ((QWordVal)1) << Nbits;
        }
        // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
        if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
        {
            qfactor = ufactor = (ElemType)0.0;
        }
        else
        {
            // precompute this for quantize() (see comment there)
            qfactor = rangeend / (quantimax - quantimin);
            // and for unquantize()
            ufactor = (quantimax - quantimin) / rangeend;
        }
        // set the quantization threshold for the special case of 1-bit
        quantimid = 0.5f * (quantimax + quantimin);
    }
    // Full-width bypass (Nbits == QWordNumBits): no real quantization takes place, the value's
    // bit pattern is handed back as a QWordVal. Meant for testing/debugging purposes.
    template<class ElemType>
    cudasharedcode ValueQuantizer<ElemType>::QWordVal
    ValueQuantizer<ElemType>::QuantizeToFullQWord(ElemType u) const
    {
        assert(Nbits == QWordNumBits);
        // view the storage of u as a QWordVal and return what is found there
        QWordVal* bitPattern = (QWordVal*)&u;
        return *bitPattern;
    }
    // 1-bit quantization of a single value: reports whether u reaches the threshold.
    // The threshold is 0 when ZeroThresholdFor1Bit is set, quantimid otherwise.
    template<class ElemType>
    template<bool ZeroThresholdFor1Bit>
    cudasharedcode bool
    ValueQuantizer<ElemType>::Quantize1(ElemType u) const
    {
        assert(Nbits == 1);
        if (ZeroThresholdFor1Bit)
        {
            return u >= (ElemType)0.0;
        }
        return u >= quantimid;
    }
    // Quantizes a single value into a QWordVal.
    // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
    template<class ElemType>
    template<bool ZeroThresholdFor1Bit>
    cudasharedcode ValueQuantizer<ElemType>::QWordVal
    ValueQuantizer<ElemType>::Quantize(ElemType u) const
    {
        // full-width case: hand back the raw bit pattern
        if (Nbits == QWordNumBits)
        {
            return QuantizeToFullQWord(u);
        }
        // 1-bit case
        // TODO: we may need to optimize this by a template arg
        if (ldNbits == 0)
        {
            return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
        }
        // general case: clamp at both ends of the range ...
        if (u <= quantimin)
        {
            return 0;
        }
        if (u >= quantimax)
        {
            return (rangeend - 1);
        }
        // ... and scale everything in between
        return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
    }
    // Converts one quantized word back into an ElemType value.
    template<class ElemType>
    cudasharedcode
    ElemType ValueQuantizer<ElemType>::Unquantize(QWordVal u) const
    {
        if (Nbits != QWordNumBits)
        {
            // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
            return ((u + (ElemType)0.5) * ufactor) + quantimin;
        }
        // full-width case: u carries the value's bit pattern, so read it back as an ElemType
        return *(ElemType*)&u;
    }
    // 1-bit unquantization: picks val1 when the bit u is set and val0 when it is not.
    template<class ElemType>
    cudasharedcode
    ElemType ValueQuantizer<ElemType>::Unquantize1(bool u, ElemType val0, ElemType val1)
    {
        if (u)
        {
            return val1;
        }
        return val0;
    }
    // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits')
    //  v - value to take the log of; must be a non-zero power of two
    // Returns the exponent e such that v == (1 << e).
    // Throws std::runtime_error if v is 0 or is not a power of two.
    template<class ElemType>
    size_t ValueQuantizer<ElemType>::ld(size_t v)
    {
        // 0 is even and never becomes 1 by halving, so it has to be rejected up front
        if (v == 0)
        {
            throw std::runtime_error("ld: 'bits' must be a power of two");
        }
        size_t log2v = 0;
        while (v != 1)
        {
            if (v & 1) // an odd value other than 1: not a power of two
            {
                throw std::runtime_error("ld: 'bits' must be a power of two");
            }
            v >>= 1;
            ++log2v;
        }
        return log2v;
    }
    // Explicit instantiation
    template class ValueQuantizer<float>;
    template class ValueQuantizer<double>;
    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<true>(float u) const;
    template ValueQuantizer<float>::QWordVal ValueQuantizer<float>::Quantize<false>(float u) const;
    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<true>(double u) const;
    template ValueQuantizer<double>::QWordVal ValueQuantizer<double>::Quantize<false>(double u) const;
    template bool ValueQuantizer<float>::Quantize1<true>(float u) const;
    template bool ValueQuantizer<float>::Quantize1<false>(float u) const;
    template bool ValueQuantizer<double>::Quantize1<true>(double u) const;
    template bool ValueQuantizer<double>::Quantize1<false>(double u) const;
 }}}
 #endif
--- a/Math/Math/ValueQuantizer.h
+++ b/Math/Math/ValueQuantizer.h
@ -2,10 +2,17 @@
 #ifndef __VALLUE_QUANTIZER_H__
 #define __VALLUE_QUANTIZER_H__
 #ifndef CPUONLY
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <device_launch_parameters.h>
 #endif // !CPUONLY
 #include <cassert>
 #include <stdexcept>
 #pragma warning (disable: 4127) // conditional expression is constant
 namespace Microsoft { namespace MSR { namespace CNTK {
@ -55,17 +62,98 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        static const size_t QWordNumBits = 8 * sizeof(QWord);
    public:
-        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper);
+        cudasharedcode ValueQuantizer(size_t ldNbits, ElemType lower, ElemType upper)
            : ldNbits(ldNbits), Nbits(1 << ldNbits), quantimin(lower), quantimax(upper)
        {
            // Builds a quantizer for values in the interval [lower, upper]:
            // Nbits is (1 << ldNbits), lower/upper are stored as quantimin/quantimax, and
            // rangeend, qfactor, ufactor and quantimid are precomputed below.
            //
            // rangeend is 1 << Nbits. Shifting by the full bit width (or more) is undefined behavior,
            // so the width test has to come BEFORE the shift; in that full-width (no-quant) case
            // rangeend is simply set to 0.
            if (Nbits >= (8 * sizeof(rangeend)))
            {
                rangeend = 0;
            }
            else
            {
                rangeend = ((QWordVal)1) << Nbits;
            }
            // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
            if (((quantimax - quantimin) < 1e-36f) || (rangeend == 0))
            {
                qfactor = ufactor = (ElemType)0.0;
            }
            else
            {
                // precompute this for quantize() (see comment there)
                qfactor = rangeend / (quantimax - quantimin);
                // and for unquantize()
                ufactor = (quantimax - quantimin) / rangeend;
            }
            // set the quantization threshold for the special case of 1-bit
            quantimid = 0.5f * (quantimax + quantimin);
        }
        // Quantizes one value u into a QWordVal.
        // Full-width case (Nbits == QWordNumBits): delegates to QuantizeToFullQWord.
        // 1-bit case (ldNbits == 0): delegates to Quantize1 and yields 1 or 0.
        // Otherwise: 0 for u <= quantimin, (rangeend - 1) for u >= quantimax, and
        // (u - quantimin) * qfactor truncated to an integer in between.
        // TODO: we can optimize for 1 bit here - very simply use a template arg 'isonebit'
        template<bool ZeroThresholdFor1Bit>
-        cudasharedcode QWordVal Quantize(ElemType u) const;
+        cudasharedcode QWordVal Quantize(ElemType u) const
        {
            if (Nbits == QWordNumBits)
            {
                return QuantizeToFullQWord(u);
            }
            // TODO: we may need to optimize this by a template arg
            else if (ldNbits == 0)
            {
                return Quantize1<ZeroThresholdFor1Bit>(u) ? 1 : 0;
            }
            else
            {
                // clamp values at or below the lower bound to the first level
                if (u <= quantimin)
                {
                    return 0;
                }
                // clamp values at or above the upper bound to the last level
                else if (u >= quantimax)
                {
                    return (rangeend - 1);
                }
                else
                {
                    // scale the offset from quantimin; the cast goes through the signed type first
                    return (QWordVal)((QWordValSigned)((u - quantimin) * qfactor));
                }
            }
        }
-        cudasharedcode ElemType Unquantize(QWordVal u) const;
+        // unquantize one value
        cudasharedcode ElemType Unquantize(QWordVal u) const
        {
            if (Nbits != QWordNumBits)
            {
                // Note: in 1-bit case, we want 0.5 -> mean0, 1.5 -> mean1
                return ((u + (ElemType)0.5) * ufactor) + quantimin;
            }
            // full-width case: u carries the value's bit pattern, so read it back as an ElemType
            return *(ElemType*)&u;
        }
        // quantize one value --special version for 1 bit
        // Returns true when u is at or above the threshold: 0 if ZeroThresholdFor1Bit is set,
        // quantimid otherwise. Asserts that Nbits == 1.
        template<bool ZeroThresholdFor1Bit>
-        cudasharedcode bool Quantize1(ElemType u) const;
+        cudasharedcode bool Quantize1(ElemType u) const
        {
            assert(Nbits == 1);
            // ZeroThresholdFor1Bit is a template argument, so this condition is a compile-time constant
            if (!ZeroThresholdFor1Bit)
            {
                return u >= quantimid;
            }
            else
            {
                return u >= (ElemType)0.0;
            }
        }
-        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1);
+        // unquantize one value  --special case for 1 bit
        static cudasharedcode ElemType Unquantize1(bool u, ElemType val0, ElemType val1)
        {
            // a set bit selects val1, a cleared bit selects val0
            if (u)
            {
                return val1;
            }
            return val0;
        }
        // how many bits we are quantizing to
        cudasharedcode size_t NBits() const
@ -79,10 +167,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            return rangeend;
        } 
-        static size_t ld(size_t v);
+        // helper: compute the binary log of a power of two (utility function to convert 'Nbits' into 'ldNbits'
        // Binary log of a power of two.
        //  v - value to take the log of; must be a non-zero power of two
        // Returns the exponent e such that v == (1 << e).
        // Throws std::runtime_error if v is 0 or is not a power of two.
        static size_t ld(size_t v)
        {
            // 0 is even and never becomes 1 by halving, so it has to be rejected up front
            if (v == 0)
            {
                throw std::runtime_error("ld: 'bits' must be a power of two");
            }
            size_t log2v = 0;
            while (v != 1)
            {
                if (v & 1) // an odd value other than 1: not a power of two
                {
                    throw std::runtime_error("ld: 'bits' must be a power of two");
                }
                v >>= 1;
                ++log2v;
            }
            return log2v;
        }
    protected:
-        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const;
+        // quantize for full ElemType size bits case (special case that allows to bypass quantization, for testing/debugging purposes)
        cudasharedcode QWordVal QuantizeToFullQWord(ElemType u) const
        {
            assert(Nbits == QWordNumBits);
            // view the storage of u as a QWordVal and return the bit pattern found there
            QWordVal* bitPattern = (QWordVal*)&u;
            return *bitPattern;
        }
    protected:
        // NBits must be power of two
--- a/Tests/Speech/LSTM/testcases.yml
+++ b/Tests/Speech/LSTM/testcases.yml
@ -1,27 +0,0 @@
 dataDir: ../Data
 testCases:
  CNTK Run must be completed:
    patterns:
      - ^COMPLETED
  Must train epochs in exactly same order and parameters:
    patterns:
      - ^Starting Epoch {{integer}}
      - learning rate per sample = {{float}}
      - momentum = {{float}}
  Epochs must be finished with expected results:
    patterns:
      - ^Finished Epoch[{{integer}}]
      - TrainLossPerSample = {{float,tolerance=1%}}
      - EvalErrPerSample = {{float,tolerance=1%}}
      - Ave LearnRatePerSample = {{float,tolerance=1%}}
  Per-minibatch training results must match:
    patterns:
      - ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
      - SamplesSeen = {{integer}}
      - TrainLossPerSample = {{float,tolerance=1%}}
      - EvalErr[0]PerSample = {{float,tolerance=1%}}