From 0b35c4646cae48906948b2564c37c71c6b3070b2 Mon Sep 17 00:00:00 2001
From: Marko Radmilac
Date: Thu, 28 Apr 2016 10:21:13 -0700
Subject: [PATCH] Add support for new CUDA

---
 Makefile                               | 21 ++++++++++++++-------
 Source/Math/CuDnnBatchNormalization.cu |  2 +-
 Source/Math/CuDnnConvolutionEngine.cu  |  6 ++++++
 Source/Math/GPUMatrixCUDAKernels.cuh   |  5 +++++
 configure                              |  6 ++++--
 5 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index d9f82b5a9..c096eeb68 100644
--- a/Makefile
+++ b/Makefile
@@ -113,7 +113,7 @@ ifdef CUDA_PATH
 # Set up cuDNN if needed
 ifdef CUDNN_PATH
   INCLUDEPATH += $(CUDNN_PATH)/cuda/include
-  LIBPATH += $(CUDNN_PATH)/cuda/lib64
+  LIBPATH += $(CUDNN_PATH)/cuda/lib64 $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
   LIBS += -lcudnn
   COMMON_FLAGS +=-DUSE_CUDNN
 endif
@@ -159,16 +159,23 @@ endif
 
 # Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
 # In debug mode we will rely on JIT to create code "on the fly" for the underlying architecture
-GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
-GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
-GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
-GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
+
+GENCODE_TEMPLATE := -gencode arch=compute_XX,code=\"sm_XX,compute_XX\"
+
+ifndef CNTK_CUDA_GENCODE
+  ifeq ("$(BUILDTYPE)","debug")
+    CNTK_CUDA_GENCODE := 20 30
+  endif
+  ifeq ("$(BUILDTYPE)","release")
+    CNTK_CUDA_GENCODE := 20 30 35 50
+  endif
+endif
 
 ifeq ("$(BUILDTYPE)","debug")
   ifdef CNTK_CUDA_CODEGEN_DEBUG
     GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_DEBUG)
   else
-    GENCODE_FLAGS := -gencode arch=compute_20,code=\"compute_20\" $(GENCODE_SM30)
+    GENCODE_FLAGS := $(foreach CUDA_ARCH,$(CNTK_CUDA_GENCODE),$(subst XX,$(CUDA_ARCH),$(GENCODE_TEMPLATE)))
   endif
 
   CXXFLAGS += -g
@@ -181,7 +188,7 @@ ifeq ("$(BUILDTYPE)","release")
   ifdef CNTK_CUDA_CODEGEN_RELEASE
     GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_RELEASE)
   else
-    GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50)
+    GENCODE_FLAGS := $(foreach CUDA_ARCH,$(CNTK_CUDA_GENCODE),$(subst XX,$(CUDA_ARCH),$(GENCODE_TEMPLATE)))
   endif
 
   CXXFLAGS += -g -O4
diff --git a/Source/Math/CuDnnBatchNormalization.cu b/Source/Math/CuDnnBatchNormalization.cu
index 4eb8a4acd..1eaa855bc 100644
--- a/Source/Math/CuDnnBatchNormalization.cu
+++ b/Source/Math/CuDnnBatchNormalization.cu
@@ -73,7 +73,7 @@ protected:
         m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
         cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
-#if CUDNN_PATCHLEVEL >= 7
+#if CUDNN_PATCHLEVEL >= 7 || CUDNN_MAJOR >= 5
         CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
                                                    m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
 #else
diff --git a/Source/Math/CuDnnConvolutionEngine.cu b/Source/Math/CuDnnConvolutionEngine.cu
index 7d7990686..c0fc2183c 100644
--- a/Source/Math/CuDnnConvolutionEngine.cu
+++ b/Source/Math/CuDnnConvolutionEngine.cu
@@ -138,9 +138,15 @@ public:
         }
 
         // Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
+#if CUDNN_MAJOR < 5
         CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
                                                kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
                                                (int)dims.size(), dims.data(), pad.data(), stride.data()));
+#else
+        CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
+                                               kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
+                                               (cudnnNanPropagation_t) 0, (int)dims.size(), dims.data(), pad.data(), stride.data()));
+#endif
     }
 
     ~CuDnnPool()
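
Note on the CuDnnConvolutionEngine.cu hunk above: cuDNN v5 inserted a
cudnnNanPropagation_t parameter into cudnnSetPoolingNdDescriptor, between the
pooling mode and the dimension count, which is why the same call has to be
spelled both ways under a CUDNN_MAJOR guard; the (cudnnNanPropagation_t) 0
argument is CUDNN_NOT_PROPAGATE_NAN, matching the pre-v5 behavior. Below is a
minimal standalone sketch of the same guard, not part of the patch: the
function name and the 2x2 window sizes are illustrative only, and error
checking is omitted for brevity.

    #include <cudnn.h>

    // Build a 2-D max-pooling descriptor against either cuDNN 4 or cuDNN 5.
    static cudnnPoolingDescriptor_t MakeMaxPool2x2()
    {
        cudnnPoolingDescriptor_t pool;
        cudnnCreatePoolingDescriptor(&pool);
        const int dims[2]   = {2, 2}; // pooling window
        const int pad[2]    = {0, 0};
        const int stride[2] = {2, 2};
    #if CUDNN_MAJOR < 5
        // cuDNN 4 signature: mode, nbDims, window, padding, stride.
        cudnnSetPoolingNdDescriptor(pool, CUDNN_POOLING_MAX,
                                    2, dims, pad, stride);
    #else
        // cuDNN 5 adds a NaN-propagation flag right after the mode;
        // CUDNN_NOT_PROPAGATE_NAN reproduces the old behavior.
        cudnnSetPoolingNdDescriptor(pool, CUDNN_POOLING_MAX,
                                    CUDNN_NOT_PROPAGATE_NAN,
                                    2, dims, pad, stride);
    #endif
        return pool;
    }
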
diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index 2c86c749f..499389174 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -42,6 +42,10 @@
 
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
+// TODO: This condition seems wrong, it should be:
+// !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
+// NVIDIA should fix their CUDA 8.0 headers
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
@@ -54,6 +58,7 @@ static __inline__ __device__ double atomicAdd(double* address, double val)
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
 
 // TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
 // The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
diff --git a/configure b/configure
index 95f909088..451124fe0 100755
--- a/configure
+++ b/configure
@@ -32,6 +32,7 @@ default_buildtype=release
 have_gdk=no
 gdk_path=
 gdk_check=include/nvidia/gdk/nvml.h
+gdk_check_alternate=include/nvml.h
 
 have_cub=no
 cub_path=
@@ -40,6 +41,7 @@ cub_check=cub/cub.cuh
 have_cudnn=no
 cudnn_path=
 cudnn_check=cuda/include/cudnn.h
+cudnn_check_alternate=include/cudnn.h
 
 have_opencv=no
 opencv_path=
@@ -322,7 +324,7 @@ do
                 exit 1
             fi
         else
-            if test $(check_dir $optarg $gdk_check) = yes
+            if test $(check_dir $optarg $gdk_check) = yes || test $(check_dir $optarg $gdk_check_alternate) = yes
             then
                 gdk_path=$optarg
             else
@@ -343,7 +345,7 @@ do
                 exit 1
             fi
         else
-            if test $(check_dir $optarg $cudnn_check) = yes
+            if test $(check_dir $optarg $cudnn_check) = yes || test $(check_dir $optarg $cudnn_check_alternate) = yes
             then
                 cudnn_path=$optarg
             else
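
Note on the GPUMatrixCUDAKernels.cuh hunk above: sm_60 (Pascal) hardware has a
native double-precision atomicAdd, and the CUDA 8.0 headers declare that
overload when __CUDA_ARCH__ >= 600, so the hand-rolled version must be
compiled out there to avoid a redefinition error. As the TODO in the hunk
notes, the guard arguably should also admit the host compilation pass, where
__CUDA_ARCH__ is undefined. For reference only, a self-contained sketch using
that suggested condition; the CAS loop itself is the standard emulation from
the CUDA programming guide.

    #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
    // Emulated double-precision atomicAdd for pre-Pascal GPUs: reinterpret
    // the bits, add, and compare-and-swap until no other thread intervenes.
    static __inline__ __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*)address;
        unsigned long long int old = *address_as_ull, assumed;
        do
        {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old); // compares bit patterns, so NaN cannot loop forever
        return __longlong_as_double(old);
    }
    #endif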