CuDNN RNN move to CUDNN5 Release Version
cudnn-7.5-windows10-x64-v5.0-ga. This required the following changes: - CuDnnRNN::SetLength is no longer necessary; the RNN descriptor is length-agnostic. - The sequence length is now a parameter of the forward and backward cuDNN calls, so it is stored as a member variable in the executor object. It is reset on each call of the forward pass. - dims[] and strides[] in cudnnSetTensorNdDescriptor have been re-arranged, but the data layout should be the same. - Other API calls experienced minor changes.
This commit is contained in:
Родитель
8dcfa4337c
Коммит
1360f0fb15
|
@ -43,24 +43,6 @@ public:
|
|||
DISABLE_COPY_AND_MOVE(CuDnnTensorDescriptor);
|
||||
};
|
||||
|
||||
// SetLength: (re)configures the cuDNN RNN descriptor for a new sequence length.
// NOTE(review): per the commit message, this function is removed by this commit —
// with cuDNN v5 GA the RNN descriptor is length-agnostic, and the sequence
// length is instead passed to each forward/backward cuDNN call.
template<class ElemType>
|
||||
void CuDnnRNN<ElemType>::SetLength(size_t len)
|
||||
{
|
||||
// Guard avoids a redundant cudnnSetRNNDescriptor call when the cached
// length is already up to date.
if (m_seqLength != len)
|
||||
{
|
||||
m_seqLength = len;
|
||||
// Rebuild the descriptor: sequence length comes from `len`; all other
// settings (hidden size, layer count, dropout, input mode, direction,
// cell mode, data type) come from the stored members.
CUDNN_CALL(cudnnSetRNNDescriptor(m_rnnDesc,
|
||||
(int)m_rnnParameters.m_hiddenSize,
|
||||
(int)m_seqLength,
|
||||
(int)m_rnnParameters.m_numLayers,
|
||||
m_dropout,
|
||||
CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
|
||||
m_rnnParameters.m_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
|
||||
GetMode(),
|
||||
m_dataType));
|
||||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void CuDnnRNNExecutor<ElemType>::SetDescriptors(size_t dim, const vector<size_t>& numSequencesForFrame, vector<cudnnTensorDescriptor_t>& descriptors)
|
||||
{
|
||||
|
@ -71,8 +53,8 @@ void CuDnnRNNExecutor<ElemType>::SetDescriptors(size_t dim, const vector<size_t>
|
|||
descriptors.push_back(cudnnTensorDescriptor_t());
|
||||
CUDNN_CALL(cudnnCreateTensorDescriptor(&descriptors[i]));
|
||||
}
|
||||
int dims[3] = { (int)dim, (int)numSequencesForFrame[i], 1 };
|
||||
int strides[3] = { 1, dims[0], dims[0] * dims[1] };
|
||||
int dims[3] = { (int)numSequencesForFrame[i], (int)dim, 1 };
|
||||
int strides[3] = { dims[2] * dims[1], dims[2], 1 };
|
||||
CUDNN_CALL(cudnnSetTensorNdDescriptor(descriptors[i], CUDNN_DATA_FLOAT, 3, dims, strides));
|
||||
}
|
||||
}
|
||||
|
@ -93,20 +75,19 @@ void CuDnnRNNExecutor<ElemType>::ForwardCore(
|
|||
if (m_yDim != (m_rnnT->isBidirectional() ? 2 : 1) * m_rnnT->GetNumHidden())
|
||||
InvalidArgument("CuDnn ForwardCore: Output leading dimension must be twice hidden size for bidirectional networks");
|
||||
|
||||
m_rnnT->SetLength(numSequencesForFrame.size());
|
||||
|
||||
// set up the input and output descriptors
|
||||
SetDescriptors(m_xDim, numSequencesForFrame, xDesc);
|
||||
SetDescriptors(m_yDim, numSequencesForFrame, yDesc);
|
||||
|
||||
// ensure workspace and reserve are large enough
|
||||
m_seqLength = numSequencesForFrame.size();
|
||||
size_t workSize;
|
||||
size_t reserveSize;
|
||||
|
||||
// Need for every pass
|
||||
CUDNN_CALL(cudnnGetRNNWorkspaceSize(*m_cudnn, *m_rnnT, xDesc.data(), &workSize));
|
||||
CUDNN_CALL(cudnnGetRNNWorkspaceSize(*m_cudnn, *m_rnnT, (int)m_seqLength, xDesc.data(), &workSize));
|
||||
// Only needed in training, can't be touched between passes.
|
||||
CUDNN_CALL(cudnnGetRNNTrainingReserveSize(*m_cudnn, *m_rnnT, xDesc.data(), &reserveSize));
|
||||
CUDNN_CALL(cudnnGetRNNTrainingReserveSize(*m_cudnn, *m_rnnT, (int)m_seqLength, xDesc.data(), &reserveSize));
|
||||
|
||||
// convert from bytes to ElemType
|
||||
workSize = (workSize + sizeof(ElemType) - 1) / (sizeof(ElemType));
|
||||
|
@ -115,12 +96,13 @@ void CuDnnRNNExecutor<ElemType>::ForwardCore(
|
|||
reserve.Resize(reserveSize, 1);
|
||||
workspace.Resize(workSize, 1);
|
||||
|
||||
wDesc = make_unique<CuDnnFilter<ElemType>>(*m_rnnT, xDesc.data());
|
||||
wDesc = make_unique<CuDnnFilter<ElemType>>(*m_rnnT, xDesc[0]);
|
||||
if (wDesc->GetSize() != weightsW.GetNumElements())
|
||||
InvalidArgument("RNN needs %ld parameters, but %ld were allocated", wDesc->GetSize(), weightsW.GetNumRows());
|
||||
|
||||
CUDNN_CALL(cudnnRNNForwardTraining(
|
||||
*m_cudnn, *m_rnnT,
|
||||
(int)m_seqLength,
|
||||
xDesc.data(), inputX.Data(),
|
||||
0, 0,
|
||||
0, 0,
|
||||
|
@ -148,6 +130,7 @@ void CuDnnRNNExecutor<ElemType>::BackwardDataCore(
|
|||
{
|
||||
CUDNN_CALL(cudnnRNNBackwardData(
|
||||
*m_cudnn, *m_rnnT,
|
||||
(int)m_seqLength,
|
||||
yDesc.data(), outputY.Data(),
|
||||
yDesc.data(), outputDY.Data(),
|
||||
0, 0,
|
||||
|
@ -177,6 +160,7 @@ void CuDnnRNNExecutor<ElemType>::BackwardWeightsCore(const GPUMatrix<ElemType>&
|
|||
LogicError("out of order calling you have been very bad");
|
||||
CUDNN_CALL(cudnnRNNBackwardWeights(
|
||||
*m_cudnn, *m_rnnT,
|
||||
(int)m_seqLength,
|
||||
xDesc.data(), inputX.Data(),
|
||||
0, 0,
|
||||
yDesc.data(), outputY.Data(),
|
||||
|
@ -185,9 +169,6 @@ void CuDnnRNNExecutor<ElemType>::BackwardWeightsCore(const GPUMatrix<ElemType>&
|
|||
reserve.Data(), reserve.GetNumElements()*sizeof(ElemType)));
|
||||
}
|
||||
|
||||
template class CuDnnRNN<double>;
|
||||
template class CuDnnRNN<float>;
|
||||
|
||||
template class CuDnnRNNExecutor<double>;
|
||||
template class CuDnnRNNExecutor<float>;
|
||||
|
||||
|
|
|
@ -60,7 +60,6 @@ private:
|
|||
cudnnDataType_t m_dataType;
|
||||
cudnnRNNDescriptor_t m_rnnDesc;
|
||||
CuDnnDropout m_dropout;
|
||||
size_t m_seqLength;
|
||||
RnnParameters m_rnnParameters;
|
||||
|
||||
cudnnRNNMode_t GetMode()
|
||||
|
@ -77,12 +76,19 @@ private:
|
|||
}
|
||||
|
||||
public:
|
||||
CuDnnRNN(const RnnParameters& rnnParameters, const size_t seqLength)
|
||||
: m_rnnDesc(nullptr), m_dropout(0.0f), m_rnnParameters(rnnParameters), m_seqLength(0),
|
||||
CuDnnRNN(const RnnParameters& rnnParameters)
|
||||
: m_rnnDesc(nullptr), m_dropout(0.0f), m_rnnParameters(rnnParameters),
|
||||
m_dataType(CuDnnTensor::GetDataType<ElemType>())
|
||||
{
|
||||
CUDNN_CALL(cudnnCreateRNNDescriptor(&m_rnnDesc));
|
||||
SetLength(seqLength);
|
||||
CUDNN_CALL(cudnnSetRNNDescriptor(m_rnnDesc,
|
||||
(int)m_rnnParameters.m_hiddenSize,
|
||||
(int)m_rnnParameters.m_numLayers,
|
||||
m_dropout,
|
||||
CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
|
||||
m_rnnParameters.m_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
|
||||
GetMode(),
|
||||
m_dataType));
|
||||
}
|
||||
|
||||
~CuDnnRNN()
|
||||
|
@ -99,8 +105,6 @@ public:
|
|||
return this->m_rnnParameters == rnnParameters;
|
||||
}
|
||||
|
||||
void SetLength(size_t len);
|
||||
|
||||
size_t GetLength()
|
||||
{
|
||||
return m_seqLength;
|
||||
|
@ -126,14 +130,14 @@ class CuDnnFilter
|
|||
CuDnn::ptr_t m_cudnn;
|
||||
size_t m_filterSize;
|
||||
public:
|
||||
CuDnnFilter(const CuDnnRNN<ElemType>& rnn, const cudnnTensorDescriptor_t *xDesc) :
|
||||
CuDnnFilter(const CuDnnRNN<ElemType>& rnn, const cudnnTensorDescriptor_t& xDesc) :
|
||||
m_cudnn(CuDnn::Instance()), m_dataType(CuDnnTensor::GetDataType<ElemType>())
|
||||
{
|
||||
CUDNN_CALL(cudnnCreateFilterDescriptor(&m_filterDesc));
|
||||
try
|
||||
{
|
||||
size_t filterSize;
|
||||
CUDNN_CALL(cudnnGetRNNParamsSize(*m_cudnn, rnn, xDesc, &filterSize));
|
||||
CUDNN_CALL(cudnnGetRNNParamsSize(*m_cudnn, rnn, xDesc, &filterSize, m_dataType));
|
||||
|
||||
size_t dataSize = 2; // CUDNN_DATA_HALF
|
||||
|
||||
|
@ -178,24 +182,20 @@ class CuDnnRNNExecutor
|
|||
cudnnDataType_t m_dataType;
|
||||
size_t m_xDim, m_yDim;
|
||||
public:
|
||||
CuDnnRNNExecutor(size_t xDim, size_t yDim, size_t seqLength, const RnnParameters& rnnParameters ) :
|
||||
CuDnnRNNExecutor(size_t xDim, size_t yDim, const RnnParameters& rnnParameters ) :
|
||||
m_cudnn(CuDnn::Instance()),
|
||||
m_xDim(xDim), m_yDim(yDim),
|
||||
m_seqLength(0),
|
||||
m_dataType(CuDnnTensor::GetDataType<ElemType>()),
|
||||
m_BackwardDataCalledYet(false)
|
||||
{
|
||||
m_rnnT = std::make_unique<CuDnnRNN<ElemType>>(rnnParameters, seqLength);
|
||||
m_rnnT = std::make_unique<CuDnnRNN<ElemType>>(rnnParameters);
|
||||
}
|
||||
|
||||
void ForwardCore(const GPUMatrix<ElemType>& weightsW, const GPUMatrix<ElemType>& inputX, GPUMatrix<ElemType>& outputY, const vector<size_t>& numSequencesForFrame, const RnnParameters& rnnParameters, GPUMatrix<ElemType>& reserve, GPUMatrix<ElemType>& workspace);
|
||||
void BackwardWeightsCore(const GPUMatrix<ElemType>& inputX, const GPUMatrix<ElemType>& outputY, GPUMatrix<ElemType>& dw, const RnnParameters& rnnParameters, GPUMatrix<ElemType>& reserve, GPUMatrix<ElemType>& workspace);
|
||||
void BackwardDataCore(const GPUMatrix<ElemType>& outputY, const GPUMatrix<ElemType>& outputDY, const GPUMatrix<ElemType>& w, GPUMatrix<ElemType>& dx, const RnnParameters& rnnParameters, GPUMatrix<ElemType>& reserve, GPUMatrix<ElemType>& workspace);
|
||||
|
||||
void SetLength(int len)
|
||||
{
|
||||
m_rnnT->SetLength(len);
|
||||
}
|
||||
|
||||
protected:
|
||||
std::unique_ptr<CuDnnFilter<ElemType>> wDesc;
|
||||
vector<cudnnTensorDescriptor_t> xDesc;
|
||||
|
@ -216,6 +216,7 @@ private:
|
|||
private:
|
||||
std::unique_ptr<CuDnnRNN<ElemType>> m_rnnT;
|
||||
bool m_BackwardDataCalledYet;
|
||||
size_t m_seqLength;
|
||||
};
|
||||
|
||||
} } }
|
|
@ -3154,7 +3154,7 @@ void GPUMatrix<ElemType>::RNNForward(const GPUMatrix<ElemType> &inputX, const GP
|
|||
if (!m_RNNWrapper)
|
||||
m_RNNWrapper = std::make_unique<RNNWrapper>();
|
||||
if (!m_RNNWrapper->m_rnnExecutor)
|
||||
m_RNNWrapper->m_rnnExecutor = std::make_unique<CuDnnRNNExecutor<ElemType>>(xDim, yDim, numSequencesForFrame.size(), rnnParameters);
|
||||
m_RNNWrapper->m_rnnExecutor = std::make_unique<CuDnnRNNExecutor<ElemType>>(xDim, yDim, rnnParameters);
|
||||
m_RNNWrapper->m_rnnExecutor->ForwardCore(paramW, inputX, *this, numSequencesForFrame, rnnParameters, reserve, workspace);
|
||||
}
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
<CuDnnIncPath>$(CUDNN_PATH)\include</CuDnnIncPath>
|
||||
<CuDnnLibPath>$(CUDNN_PATH)\lib\x64</CuDnnLibPath>
|
||||
<CuDnnLib>cudnn.lib</CuDnnLib>
|
||||
<CuDnnDll>$(CUDNN_PATH)\bin\cudnn64_4.dll</CuDnnDll>
|
||||
<CuDnnDll>$(CUDNN_PATH)\bin\cudnn64_5.dll</CuDnnDll>
|
||||
</PropertyGroup>
|
||||
</When>
|
||||
<Otherwise>
|
||||
|
|
Загрузка…
Ссылка в новой задаче