From 5be5a3e71b1b1c924d12201ccc3c765c0649b252 Mon Sep 17 00:00:00 2001
From: Cha Zhang
Date: Fri, 3 Feb 2017 13:06:10 -0800
Subject: [PATCH] Adding robustness to cuda convolution (handling failures).

---
 Source/Math/CuDnnConvolutionEngine.cu | 37 ++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/Source/Math/CuDnnConvolutionEngine.cu b/Source/Math/CuDnnConvolutionEngine.cu
index 6d1b8e4e7..03c17ed68 100644
--- a/Source/Math/CuDnnConvolutionEngine.cu
+++ b/Source/Math/CuDnnConvolutionEngine.cu
@@ -246,7 +246,9 @@ protected:
                                            m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, m_outT, ptr(out));
         // There might be a case where cuDNN fails due to workspace being too small, try using no-workspace algo instead.
         // REVIEW alexeyk: NVIDIA is currently reviewing this issue.
-        if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
+        // chazhang: it seems even the no-workspace algo can fail from time to time, so we give it a second chance here anyway (the second attempt usually succeeds).
+        // NVIDIA should definitely investigate this.
+        if (CUDNN_STATUS_SUCCESS != err)
         {
             if (m_forceDeterministicAlgorithms)
                 RuntimeError("Falling back of the algorithms is not allowed. Please set 'forceDeterministicAlgorithms=false'.");
@@ -260,7 +262,6 @@
         // Only supported in MatrixPool enable
         // NOTE: it's unnecessary to keep the workspace.
         workspace.Resize(0, 0);
-
         CUDNN_CALL(err);
     }
 
@@ -290,9 +291,21 @@ protected:
         if (m_backDataAlgo.Algo.memory > 0)
             workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
         // Compute gradients with respect to the output tensor (data).
-        CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
-                                                ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
+        auto err = cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
+                                                ptr(workspace), m_backDataAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad));
+        // Handle cuDNN failure; this fallback is only safe when accumulateGradient == false, otherwise the gradient buffer may already be partially updated.
+        if (CUDNN_STATUS_SUCCESS != err && !accumulateGradient)
+        {
+            if (m_forceDeterministicAlgorithms)
+                RuntimeError("Falling back of the algorithms is not allowed. Please set 'forceDeterministicAlgorithms=false'.");
+            auto err2 = cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.NoWorkspaceAlgo,
+                                                     nullptr, 0, &C::Zero, m_inT, ptr(grad));
+            // Update original error in case of success.
+            if (CUDNN_STATUS_SUCCESS == err2)
+                err = CUDNN_STATUS_SUCCESS;
+        }
         workspace.Resize(0, 0);
+        CUDNN_CALL(err);
     }
 
     void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool accumulateGradient, bool /*allowReuse*/, Mat& workspace) override
@@ -321,9 +334,21 @@ protected:
         if (m_backFiltAlgo.Algo.memory > 0)
             workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
         // Compute gradients with respect to the output tensor (data).
-        CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
-                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
+        auto err = cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
+                                                  ptr(workspace), m_backFiltAlgo.Algo.memory, accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad));
+        // Handle cuDNN failure; this fallback is only safe when accumulateGradient == false, otherwise the gradient buffer may already be partially updated.
+        if (CUDNN_STATUS_SUCCESS != err && !accumulateGradient)
+        {
+            if (m_forceDeterministicAlgorithms)
+                RuntimeError("Falling back of the algorithms is not allowed. Please set 'forceDeterministicAlgorithms=false'.");
+            auto err2 = cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.NoWorkspaceAlgo,
+                                                       nullptr, 0, &C::Zero, *m_kernelT, ptr(kernelGrad));
+            // Update original error in case of success.
+            if (CUDNN_STATUS_SUCCESS == err2)
+                err = CUDNN_STATUS_SUCCESS;
+        }
         workspace.Resize(0, 0);
+        CUDNN_CALL(err);
     }
 
     void EnsurePoolingInitialized() override
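
All three hunks apply the same pattern: capture the cuDNN status instead of asserting on it, retry once with the no-workspace algorithm when the first call fails (and only when the output buffer is overwritten rather than accumulated into), and raise the error only if both attempts fail. Below is a minimal standalone sketch of that pattern for the forward case; the helper name tryConvolutionForward, the allowFallback flag, and the descriptor/buffer parameters are illustrative and not from the CNTK sources, while cudnnConvolutionForward itself is the real cuDNN API.

    // Sketch of the retry-without-workspace pattern, assuming pre-configured
    // cuDNN descriptors and device buffers. Helper and parameter names are
    // hypothetical; cudnnConvolutionForward is the actual cuDNN call.
    #include <cudnn.h>
    #include <cstddef>

    cudnnStatus_t tryConvolutionForward(cudnnHandle_t handle,
                                        const float* alpha, const float* beta,
                                        cudnnTensorDescriptor_t xDesc, const void* x,
                                        cudnnFilterDescriptor_t wDesc, const void* w,
                                        cudnnConvolutionDescriptor_t convDesc,
                                        cudnnConvolutionFwdAlgo_t preferredAlgo,
                                        cudnnConvolutionFwdAlgo_t noWorkspaceAlgo,
                                        void* workspace, size_t workspaceBytes,
                                        cudnnTensorDescriptor_t yDesc, void* y,
                                        bool allowFallback)
    {
        // First attempt: the tuned algorithm with its workspace.
        cudnnStatus_t err = cudnnConvolutionForward(handle, alpha, xDesc, x, wDesc, w,
                                                    convDesc, preferredAlgo,
                                                    workspace, workspaceBytes,
                                                    beta, yDesc, y);
        // On failure, retry once with an algorithm that needs no workspace.
        // The caller must disallow this when the first call may have partially
        // accumulated into y (the accumulateGradient case in the patch).
        if (err != CUDNN_STATUS_SUCCESS && allowFallback)
        {
            err = cudnnConvolutionForward(handle, alpha, xDesc, x, wDesc, w,
                                          convDesc, noWorkspaceAlgo,
                                          /*workSpace=*/nullptr, /*workSpaceSizeInBytes=*/0,
                                          beta, yDesc, y);
        }
        return err;
    }

The patch inlines this logic at each call site rather than factoring out a helper, which keeps each hunk self-contained at the cost of repeating the fallback block three times.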