cudnn: enabled build on Linux with cuDNN.

2015-11-17 12:58:19 -08:00 · 2015-11-17 12:58:19 -08:00 · 3023f11e6e
--- a/11
+++ b/11
@ -23,6 +23,8 @@
 #   If not specified, GPU will not be enabled
 # CUB_PATH= path to NVIDIA CUB installation, so $(CUB_PATH)/cub/cub.cuh exists
 #   defaults to /usr/local/cub-1.4.1
 # CUDNN_PATH= path to NVIDIA cuDNN installation so $(CUDNN_PATH)/cuda/include/cudnn.h exists
 #   If not specified, CNTK will be built without cuDNN.
 # KALDI_PATH= Path to Kaldi
 #   If not specified, Kaldi plugins will not be built
 # OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
@ -102,6 +104,13 @@ ifdef CUDA_PATH
  LIBPATH += $(CUDA_PATH)/lib64
  LIBS += -lcublas -lcudart -lcuda -lcurand -lcusparse -lnvidia-ml
 # Set up cuDNN if needed
  ifdef CUDNN_PATH
    INCLUDEPATH += $(CUDNN_PATH)/cuda/include
    LIBPATH += $(CUDNN_PATH)/cuda/lib64
    LIBS += -lcudnn
    CPPFLAGS +=-DUSE_CUDNN
  endif
 else
  DEVICE = cpu
@ -218,6 +227,7 @@ MATH_SRC =\
 	Math/Math/QuantizedMatrix.cpp \
 	Math/Math/Matrix.cpp \
 	Math/Math/CUDAPageLockedMemAllocator.cpp \
 	Math/Math/ConvolutionEngine.cpp \
 ifdef CUDA_PATH
 MATH_SRC +=\
@ -225,6 +235,7 @@ MATH_SRC +=\
 	Math/Math/GPUSparseMatrix.cu \
 	Math/Math/GPUWatcher.cu \
 	Math/Math/MatrixQuantizerGPU.cu \
 	Math/Math/CuDnnConvolutionEngine.cpp \
 else
 MATH_SRC +=\
--- a/Math/Math/ConvolutionEngine.cpp
+++ b/Math/Math/ConvolutionEngine.cpp
@ -10,63 +10,16 @@
 namespace Microsoft { namespace MSR { namespace CNTK {
    // Factory that hands out the built-in ("default") tensor, filter, descriptor and
    // engine objects. Every Create* method returns a freshly constructed object
    // wrapped via std::make_unique.
    template<class ElemType>
    class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
    {
    public:
        // Forwards deviceId to the ConvolutionEngineFactory<ElemType> base constructor.
        DefaultConvolutionEngineFactory(DEVICEID_TYPE deviceId)
            : ConvolutionEngineFactory<ElemType>(deviceId)
        {
        }
    public:
        // Constructs a ConvolutionTensor4D from the four size arguments, in the order given.
        Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
        {
            return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
        }
        // Constructs a Filter from the four size arguments, in the order given.
        FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
        {
            return std::make_unique<Filter>(w, h, c, k);
        }
        // The input tensor and filter arguments are unused here (their names are commented out);
        // only the strides and the padding flag are passed to ConvDesc.
        ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/, 
            size_t wStride, size_t hStride, bool padding) override
        {
            return std::make_unique<ConvDesc>(wStride, hStride, padding);
        }
        // Passes all arguments through unchanged to the PoolDesc constructor.
        PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
        {
            return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
        }
        // Creates a DefaultConvolutionEngine for m_deviceId with the given temp-memory limit.
        // NOTE(review): m_deviceId is not declared in this class; presumably inherited from the base - confirm.
        ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override
        {
            return std::make_unique<DefaultConvolutionEngine<ElemType>>(m_deviceId, maxTempMemSizeInSamples);
        }
        // Creates a default-constructed DefaultPoolingEngine.
        PoolEnginePtr CreatePoolEngine() override
        {
            return std::make_unique<DefaultPoolingEngine<ElemType>>();
        }
    };
    // Selects the convolution engine factory for a device: the cuDNN factory when
    // deviceId is non-negative and CuDnnConvolutionEngineFactory reports support,
    // the default factory in every other case.
    template<class ElemType>
    std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId)
    {
        // REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
        // IsSupported() is only queried when deviceId >= 0 (short-circuit evaluation).
        const bool useCuDnn = deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported();
        if (!useCuDnn)
            return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>(deviceId);

        return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>(deviceId);
    }

    // Explicit instantiations for the two element types.
    template class ConvolutionEngineFactory<float>;
    template class ConvolutionEngineFactory<double>;
    template<class ElemType>
    class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
    {
    public:
    	using Base = ConvolutionEngine<ElemType>;
        using typename Base::Mat;
        using typename Base::Tensor4D;
        using typename Base::Filter;
        using typename Base::ConvDesc;
    public:
        DefaultConvolutionEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples)
            : m_tempMatrix(deviceId), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
@ -294,6 +247,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class DefaultPoolingEngine : public PoolingEngine<ElemType>
    {
    public:
    	using Base = PoolingEngine<ElemType>;
        using typename Base::Tensor4D;
        using typename Base::PoolDesc;
        using typename Base::Mat;
    public:
        void Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
        {
@ -349,4 +308,75 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template class PoolingEngine<float>;
    template class PoolingEngine<double>;
    // Factory for the default (non-cuDNN) convolution and pooling implementations.
    // Each Create* method returns a newly constructed object via std::make_unique.
    template<class ElemType>
    class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
    {
    public:
        using Base = ConvolutionEngineFactory<ElemType>;

        // Names from the dependent base class are pulled in explicitly so they can be
        // used unqualified inside this template.
        using typename Base::Tensor4D;
        using typename Base::Tensor4DPtr;
        using typename Base::Filter;
        using typename Base::FilterPtr;
        using typename Base::ConvDesc;
        using typename Base::ConvDescPtr;
        using typename Base::PoolDesc;
        using typename Base::PoolDescPtr;
        using typename Base::ConvEnginePtr;
        using typename Base::PoolEnginePtr;
        using Base::m_deviceId;

        // Hands deviceId to the base factory constructor.
        DefaultConvolutionEngineFactory(DEVICEID_TYPE deviceId)
            : Base(deviceId)
        {
        }

        // Builds a ConvolutionTensor4D from the four sizes, passed in the order received.
        Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
        {
            return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
        }

        // Builds a Filter from the four sizes, passed in the order received.
        FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
        {
            return std::make_unique<Filter>(w, h, c, k);
        }

        // The tensor and filter arguments are intentionally unnamed and unused;
        // the descriptor is built from the strides and the padding flag alone.
        ConvDescPtr CreateConvDescriptor(const Tensor4D&, const Filter&,
            size_t wStride, size_t hStride, bool padding) override
        {
            return std::make_unique<ConvDesc>(wStride, hStride, padding);
        }

        // Forwards every argument unchanged to the PoolDesc constructor.
        PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
        {
            return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
        }

        // Creates a DefaultConvolutionEngine bound to this factory's device id.
        ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override
        {
            return std::make_unique<DefaultConvolutionEngine<ElemType>>(m_deviceId, maxTempMemSizeInSamples);
        }

        // Creates a default-constructed DefaultPoolingEngine.
        PoolEnginePtr CreatePoolEngine() override
        {
            return std::make_unique<DefaultPoolingEngine<ElemType>>();
        }
    };
    // Selects the convolution engine factory for a device.
    //
    // Returns the cuDNN factory only when deviceId is non-negative and
    // CuDnnConvolutionEngineFactory<ElemType>::IsSupported() is true; otherwise
    // returns the default factory.
    //
    // The guard and the fallback had been commented out, so the cuDNN factory was
    // returned for every device id (including negative ones) and regardless of
    // IsSupported(), leaving DefaultConvolutionEngineFactory unreachable.
    template<class ElemType>
    std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId)
    {
        // REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
        if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported())
            return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>(deviceId);
        return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>(deviceId);
    }
    template class ConvolutionEngineFactory<float>;
    template class ConvolutionEngineFactory<double>;
 }}}
--- a/Math/Math/CuDnnConvolutionEngine.cpp
+++ b/Math/Math/CuDnnConvolutionEngine.cpp
@ -10,7 +10,7 @@
 #ifdef USE_CUDNN
 #include <cudnn.h>
-template<> static const char* CudaErrString(cudnnStatus_t x)
+template<> const char* CudaErrString(cudnnStatus_t x)
 {
    return cudnnGetErrorString(x);
 }
@ -202,12 +202,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
    {
    public:
-        using Tensor4D = ConvolutionTensor4D;
+    	using Base = ConvolutionEngine<ElemType>;
-        using Tensor4DPtr = std::unique_ptr<Tensor4D>;
+        using typename Base::Mat;
-        using Filter = ConvolutionFilter;
+        using typename Base::Tensor4D;
-        using FilterPtr = std::unique_ptr<ConvolutionFilter>;
+        using typename Base::Filter;
-        using ConvDesc = ConvolutionDescriptor;
+        using typename Base::ConvDesc;
        using ConvDescPtr = std::unique_ptr<ConvolutionDescriptor>;
        CuDnnConvolutionEngine(DEVICEID_TYPE deviceId, size_t maxTempMemSizeInSamples)
            : m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_cudnn(nullptr), m_curMBSize(0), m_tempC(deviceId)
@ -495,6 +494,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class CuDnnPoolingEngine : public PoolingEngine<ElemType>
    {
    public:
    	using Base = PoolingEngine<ElemType>;
        using typename Base::Tensor4D;
        using typename Base::PoolDesc;
        using typename Base::Mat;
    public:
        CuDnnPoolingEngine()
            : m_cudnn(nullptr)
@ -546,7 +551,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
    {
-        static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
+    	// REVIEW alexeyk: assert fires in GCC but not in VC++.
        //static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
    }
    template<>
    typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
@ -562,7 +568,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
    {
-        static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
+    	// REVIEW alexeyk: assert fires in GCC but not in VC++.
        //static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
    }
    template<>
    typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
@ -586,7 +593,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
-        PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
+        typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
    {
        return std::make_unique<CuDnnPoolingDescriptor>(kind, w, h, wStride, hStride, wPad, hPad);
    }
--- a/Math/Math/CuDnnConvolutionEngine.h
+++ b/Math/Math/CuDnnConvolutionEngine.h
@ -13,6 +13,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class CuDnnConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
    {
    public:
    	using Base = ConvolutionEngineFactory<ElemType>;
        using typename Base::Tensor4D;
        using typename Base::Tensor4DPtr;
        using typename Base::Filter;
        using typename Base::FilterPtr;
        using typename Base::ConvDesc;
        using typename Base::ConvDescPtr;
        using typename Base::PoolDesc;
        using typename Base::PoolDescPtr;
        using typename Base::ConvEnginePtr;
        using typename Base::PoolEnginePtr;
        using Base::m_deviceId;
    public:
        CuDnnConvolutionEngineFactory(DEVICEID_TYPE deviceId)
            : ConvolutionEngineFactory<ElemType>(deviceId)
@ -22,9 +38,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    public:
        Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override;
        FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override;
-        ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT, 
+        ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
            size_t wStride, size_t hStride, bool padding) override;
-        PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;
+        PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;
        ConvEnginePtr CreateConvEngine(size_t maxTempMemSizeInSamples) override;
        PoolEnginePtr CreatePoolEngine() override;
--- a/45
+++ b/45
@ -32,6 +32,10 @@ have_cub=no
 cub_path=
 cub_check=cub/cub.cuh
 have_cudnn=no
 cudnn_path=
 cudnn_check=cuda/include/cudnn.h
 have_opencv=no
 opencv_path=
 opencv_check=include/opencv2/opencv.hpp
@ -49,6 +53,7 @@ default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
 default_kaldis="kaldi-trunk"
 default_gdks=". gdk/usr"
 default_cubs="cub-1.4.1"
 default_cudnns="cudnn-4.0"
 default_opencvs="opencv-3.0.0"
 function default_paths ()
@ -115,6 +120,11 @@ function find_cub ()
    find_dir "$default_cubs" "$cub_check"
 }
 # Looks for a cuDNN installation by handing the default directory names
 # ($default_cudnns) and the marker file path ($cudnn_check) to find_dir.
 function find_cudnn ()
 {
    find_dir "$default_cudnns" "$cudnn_check"
 }
 function find_opencv ()
 {
    find_dir "$default_opencvs" "$opencv_check"
@ -162,6 +172,7 @@ function show_help ()
    echo "  --with-cuda[=directory] $(show_default $(find_cuda))"
    echo "  --with-cub[=directory] $(show_default $(find_cub))"
    echo "  --with-gdk[=directory] $(show_default $(find_gdk))"
    echo "  --with-cudnn[=directory] $(show_default $(find_cudnn))"
    echo "  --with-acml[=directory] $(show_default $(find_acml))"
    echo "  --with-mkl[=directory] $(show_default $(find_mkl))"
    echo "  --with-buildtype=(debug|release) $(show_default $default_buildtype)"
@ -278,6 +289,27 @@ do
                fi
            fi
            ;;
        # --with-cudnn[=directory]: record that cuDNN was requested and resolve its path.
        # NOTE(review): optarg is set outside this arm; presumably it holds the text after '=' - confirm.
        --with-cudnn*)
            have_cudnn=yes
            # No directory supplied: fall back to the default search and abort if nothing is found.
            if test x$optarg = x
            then
                cudnn_path=$(find_cudnn)
                if test x$cudnn_path = x
                then
                    echo "Cannot find NVIDIA cuDNN directory."
                    echo "Please specify a value for --with-cudnn"
                    exit 1
                fi
            else
                # Directory supplied: accept it only if check_dir reports "yes" for the cuDNN marker file.
                if test $(check_dir $optarg $cudnn_check) = yes
                then
                    cudnn_path=$optarg
                else
                    echo "Invalid cuDNN directory $optarg"
                    exit 1
                fi
            fi
            ;;
        --with-acml*)
            have_acml=yes
            mathlib=acml
@ -448,6 +480,18 @@ then
        echo Found CUB at $cub_path
    fi
 fi
 # When CUDA is enabled and no cuDNN path has been set yet, try the default
 # locations. Failing to find cuDNN here is not fatal: a message is printed and
 # cudnn_path stays empty.
 if test $enable_cuda = yes && test x$cudnn_path = x
 then
    cudnn_path=$(find_cudnn)
    if test x$cudnn_path = x ; then
        echo Cannot locate NVIDIA cuDNN directory
        echo CNTK will use default convolution engine.
    else
        echo Found cuDNN at $cudnn_path
    fi
 fi
 config=$build_top/Config.make
 echo Generating $config
 echo "#Configuration file for cntk" > $config
@ -465,6 +509,7 @@ if test $enable_cuda = yes ; then
    echo CUDA_PATH=$cuda_path >> $config
    echo GDK_PATH=$gdk_path >> $config
    echo CUB_PATH=$cub_path >> $config
    echo CUDNN_PATH=$cudnn_path >> $config
 fi
 if test x$kaldi_path != x ; then
    echo KALDI_PATH=$kaldi_path >> $config