Add support for new CU
Parent: c9aafd1992
Commit: 0b35c4646c
Makefile | 21
@@ -113,7 +113,7 @@ ifdef CUDA_PATH
 # Set up cuDNN if needed
 ifdef CUDNN_PATH
 INCLUDEPATH += $(CUDNN_PATH)/cuda/include
-LIBPATH += $(CUDNN_PATH)/cuda/lib64
+LIBPATH += $(CUDNN_PATH)/cuda/lib64 $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
 LIBS += -lcudnn
 COMMON_FLAGS +=-DUSE_CUDNN
 endif
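The new LIBPATH entry points at the CUDA driver stub libraries that ship inside the toolkit (libcuda.so and friends under targets/x86_64-linux/lib/stubs). Linking against the stubs lets the cuDNN-enabled build link on hosts that have the CUDA toolkit but no installed display driver; at run time the real driver library is loaded instead.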
@@ -159,16 +159,23 @@ endif
 
 # Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
 # In debug mode we will rely on JIT to create code "on the fly" for the underlying architecture
-GENCODE_SM20 := -gencode arch=compute_20,code=\"sm_20,compute_20\"
-GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
-GENCODE_SM35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
-GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
+GENCODE_TEMPLATE := -gencode arch=compute_XX,code=\"sm_XX,compute_XX\"
+ifndef CNTK_CUDA_GENCODE
+ifeq ("$(BUILDTYPE)","debug")
+CNTK_CUDA_GENCODE := 20 30
+endif
+ifeq ("$(BUILDTYPE)","release")
+CNTK_CUDA_GENCODE := 20 30 35 50
+endif
+endif
 
 ifeq ("$(BUILDTYPE)","debug")
 ifdef CNTK_CUDA_CODEGEN_DEBUG
 GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_DEBUG)
 else
-GENCODE_FLAGS := -gencode arch=compute_20,code=\"compute_20\" $(GENCODE_SM30)
+GENCODE_FLAGS := $(foreach CUDA_ARCH,$(CNTK_CUDA_GENCODE),$(subst XX,$(CUDA_ARCH),$(GENCODE_TEMPLATE)))
 endif
 
 CXXFLAGS += -g
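To make the generator concrete: with the release default CNTK_CUDA_GENCODE := 20 30 35 50, the foreach/subst line above expands GENCODE_FLAGS to

    -gencode arch=compute_20,code=\"sm_20,compute_20\" -gencode arch=compute_30,code=\"sm_30,compute_30\" -gencode arch=compute_35,code=\"sm_35,compute_35\" -gencode arch=compute_50,code=\"sm_50,compute_50\"

which is exactly what the removed GENCODE_SMxx variables spelled out by hand; supporting a new compute capability now means adding one number to a list instead of another variable. The one behavioral change visible here: the debug build previously emitted only compute_20 PTX for sm_20, and now emits sm_20 machine code as well.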
@@ -181,7 +188,7 @@ ifeq ("$(BUILDTYPE)","release")
 ifdef CNTK_CUDA_CODEGEN_RELEASE
 GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_RELEASE)
 else
-GENCODE_FLAGS := $(GENCODE_SM20) $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50)
+GENCODE_FLAGS := $(foreach CUDA_ARCH,$(CNTK_CUDA_GENCODE),$(subst XX,$(CUDA_ARCH),$(GENCODE_TEMPLATE)))
 endif
 
 CXXFLAGS += -g -O4
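Because the defaults sit inside ifndef CNTK_CUDA_GENCODE, the architecture list can be overridden per invocation without touching the Makefile; for example (hypothetical invocation) make BUILDTYPE=release CNTK_CUDA_GENCODE="30 35 50" defines the variable on the command line, so the ifndef block is skipped and only those three architectures are targeted.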
@@ -73,7 +73,7 @@ protected:
     m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
     cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
     // REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
-#if CUDNN_PATCHLEVEL >= 7
+#if CUDNN_PATCHLEVEL >= 7 || CUDNN_MAJOR >= 5
     CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
                                                m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
 #else
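The widened guard matters because cuDNN R5 resets the patch level: 5.0.x releases carry CUDNN_PATCHLEVEL values below 7, so the old test would have routed them to the legacy #else branch even though they support the newer cudnnBatchNormalizationBackward signature. A minimal sketch of an equivalent single-number test, assuming only the CUDNN_MAJOR/CUDNN_MINOR/CUDNN_PATCHLEVEL macros that cudnn.h defines (the 4007 cutoff mirrors this hunk's assumption that the new signature appeared in cuDNN 4.0.7):

    #include <cudnn.h>
    // Sketch, not the commit's code: fold the version macros into one
    // comparable number (recent cudnn.h headers define CUDNN_VERSION
    // the same way).
    #define CNTK_CUDNN_VER (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
    #if CNTK_CUDNN_VER >= 4007 // 4.0.7 and all R5+ releases
        // newer cudnnBatchNormalizationBackward signature
    #else
        // legacy signature
    #endif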
@@ -138,9 +138,15 @@ public:
     }
 
     // Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
+#if CUDNN_MAJOR < 5
     CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
                                            kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
                                            (int)dims.size(), dims.data(), pad.data(), stride.data()));
+#else
+    CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
+                                           kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
+                                           (cudnnNanPropagation_t) 0, (int)dims.size(), dims.data(), pad.data(), stride.data()));
+#endif
     }
 
     ~CuDnnPool()
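cuDNN R5 added a cudnnNanPropagation_t parameter to cudnnSetPoolingNdDescriptor, hence the version split. The literal (cudnnNanPropagation_t) 0 is CUDNN_NOT_PROPAGATE_NAN, i.e. max pooling will not propagate NaN inputs; using the named enumerator would state that intent more plainly than the cast.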
@@ -42,6 +42,10 @@
 
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
 
+// TODO: This condition seems wrong, it should be:
+//       !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
+// NVIDIA should fix their CUDA 8.0 headers
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
 static __inline__ __device__ double atomicAdd(double* address, double val)
 {
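__CUDA_ARCH__ is defined only during nvcc's device-compilation passes and holds the target compute capability, so this guard compiles the fallback out on sm_60 and newer targets, where CUDA 8.0's headers declare a native double-precision atomicAdd that would otherwise collide with this overload. The TODO records the residual wrinkle: on the host pass __CUDA_ARCH__ is undefined, so the condition as written also hides the declaration from the host compiler, while the suggested !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600 would keep it visible there.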
@@ -54,6 +58,7 @@ static __inline__ __device__ double atomicAdd(double* address, double val)
     } while (assumed != old);
     return __longlong_as_double(old);
 }
+#endif
 
 // TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
 // The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
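For reference, the loop body elided between the two hunks above is the standard compare-and-swap implementation; the complete function, as given in the CUDA C Programming Guide (the file's version may differ cosmetically), is:

    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
    // CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
    static __inline__ __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*)address;
        unsigned long long int old = *address_as_ull, assumed;
        do
        {
            assumed = old;
            // Reinterpret bits, add, then publish atomically; if another
            // thread updated the value in the meantime, atomicCAS hands
            // back the new bits and the loop retries.
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    }
    #endif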
@@ -32,6 +32,7 @@ default_buildtype=release
 have_gdk=no
 gdk_path=
 gdk_check=include/nvidia/gdk/nvml.h
+gdk_check_alternate=include/nvml.h
 
 have_cub=no
 cub_path=
@@ -40,6 +41,7 @@ cub_check=cub/cub.cuh
 have_cudnn=no
 cudnn_path=
 cudnn_check=cuda/include/cudnn.h
+cudnn_check_alternate=include/cudnn.h
 
 have_opencv=no
 opencv_path=
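The alternate probes presumably track packaging changes: newer GDK packages install nvml.h directly under include/ rather than include/nvidia/gdk/, and cuDNN 5 era archives and packages place cudnn.h under include/ at the top level instead of under a cuda/ prefix.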
@@ -322,7 +324,7 @@ do
                 exit 1
             fi
         else
-            if test $(check_dir $optarg $gdk_check) = yes
+            if test $(check_dir $optarg $gdk_check) = yes || test $(check_dir $optarg $gdk_check_alternate) = yes
             then
                 gdk_path=$optarg
             else
@@ -343,7 +345,7 @@ do
                 exit 1
             fi
         else
-            if test $(check_dir $optarg $cudnn_check) = yes
+            if test $(check_dir $optarg $cudnn_check) = yes || test $(check_dir $optarg $cudnn_check_alternate) = yes
             then
                 cudnn_path=$optarg
             else
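Since configure compares $(check_dir $optarg $gdk_check) against the string yes, check_dir evidently echoes yes or no for whether the file exists under the given root; the added || test therefore accepts a directory passed to the corresponding option in either layout. For example (hypothetical paths), a cuDNN 5 tree at /usr/local/cudnn-5.0 with include/cudnn.h directly beneath it now passes, where previously only a tree containing cuda/include/cudnn.h did.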