Add an overload that takes a reference to a preallocated array instead of std::vector

Clemens Marschner 2016-06-09 10:14:07 +02:00
Parent 6888af15a6
Commit 40a16a45ec
4 changed files: 92 additions and 23 deletions
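For context, a minimal usage sketch of the new overload, assuming the extended EvalDll interface shown in the diffs below (the eval handle, outputLayouts, and the buffer sizes are illustrative, taken from the unit tests rather than prescribed by the API):

// Existing overload: Values<ElemType> owns std::vector storage, which the library may resize.
Values<float> inputs(1);
inputs[0].m_buffer = { 2 };
Values<float> outputs = outputLayouts.CreateBuffers<float>({ 1 });
eval->ForwardPass(inputs, outputs);

// New overload: ValueRefs<ElemType> wraps caller-preallocated arrays via VectorRef::InitFrom,
// so no allocation or deallocation happens across the DLL boundary.
std::vector<float> preallocatedOutput(1);
ValueRefs<float> inputRefs(1), outputRefs(1);
inputRefs[0].m_buffer.InitFrom(inputs[0].m_buffer);
outputRefs[0].m_buffer.InitFrom(preallocatedOutput);
eval->ForwardPass(inputRefs, outputRefs);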

View file

@@ -171,7 +171,7 @@ public:
// from a single input or output.
// This is used for both dense and sparse data.
//
template<typename ElemType>
template<template<typename> class Container, typename ElemType>
struct ValueBuffer
{
//
@@ -181,7 +181,7 @@ struct ValueBuffer
// [2,2] and 12 elements in the buffer, the number of samples is 3.
// For sparse inputs, the number of samples is indicated by the m_colIndices field.
//
std::vector<ElemType> m_buffer;
Container<ElemType> m_buffer;
// In case of sparse data, the following is also used. Otherwise, the
// contents are ignored.
@@ -197,19 +197,44 @@ struct ValueBuffer
// For every element in buffer, an entry in this array gives its position.
// For every vector the entries must be ascending.
//
std::vector<int> m_indices;
Container<int> m_indices;
//
// Contains numberOfSamples + 1 indices into the buffer. The first entry
// is always 0. The last entry points after the last element.
// See http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc
//
std::vector<int> m_colIndices;
Container<int> m_colIndices;
};
template <typename ElemType>
using Vector = std::vector<ElemType, std::allocator<ElemType>>;
template <typename ElemType>
using Values = std::vector<ValueBuffer<ElemType>>;
using Values = std::vector<ValueBuffer<Vector, ElemType>>;
template <typename ElemType>
struct VectorRef
{
ElemType* m_vector;
size_t m_capacity;
size_t m_size;
VectorRef() : m_vector(nullptr), m_capacity(0), m_size(0) {}
void InitFrom(std::vector<ElemType>& src) { m_vector = src.data(); m_capacity = src.capacity(); m_size = src.size(); }
size_t size() const { return m_size; }
size_t capacity() const { return m_capacity; }
ElemType* data() { return m_vector; }
// const ElemType* data() const { return m_vector; }
ElemType* begin() { return m_vector; }
ElemType* end() { return m_vector + m_size; }
void resize(size_t size) { m_size = size; }
ElemType& operator[](size_t idx) { return m_vector[idx]; }
const ElemType& operator[](size_t idx) const { return m_vector[idx]; }
};
template <typename ElemType>
using ValueRefs = std::vector<ValueBuffer<VectorRef, ElemType>>;
//
// Meta data
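To make the sparse layout concrete, a small worked example of the CSC fields described above (the values are illustrative only):

// Three samples of dimension 4:
//   sample 0 has value 1 at row 0 and value 2 at row 2,
//   sample 1 is empty,
//   sample 2 has value 3 at row 1.
ValueBuffer<Vector, float> sparse;
sparse.m_buffer     = { 1, 2, 3 };    // non-zero values, sample by sample
sparse.m_indices    = { 0, 2, 1 };    // row of each value, ascending within a sample
sparse.m_colIndices = { 0, 2, 2, 3 }; // numberOfSamples + 1 entries: first is 0, last equals m_indices.size()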
@@ -290,7 +315,7 @@ public:
virtual VariableSchema GetInputSchema() const = 0;
//
// Evaluate - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// ForwardPass - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// outputs.
// The layout and shape of the data in inputs vector must match the schema returned by GetInputLayouts.
// Output must be preallocated and sized to avoid memory allocation / deallocation across DLL
@@ -300,6 +325,8 @@ public:
// outputs - vector of output buffers. Must be sized to fit output schema.
//
virtual void ForwardPass(const Values<ElemType>& inputs, Values<ElemType>& output) = 0;
virtual void ForwardPass(const ValueRefs<ElemType>& inputs, ValueRefs<ElemType>& output) = 0;
};
template <typename ElemType>
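Note that VectorRef mimics only the slice of the std::vector interface that ForwardPass actually touches, and it never owns or grows its storage: InitFrom records the caller's pointer, size, and capacity, and resize merely updates the recorded size. A minimal sketch of what that means for a caller (the names here are illustrative):

std::vector<float> storage(10);  // caller-owned, preallocated once
VectorRef<float> ref;
ref.InitFrom(storage);           // borrows storage.data(); no copy, no ownership transfer
ref.resize(4);                   // only the recorded size changes; nothing is (re)allocated
ref[0] = 42.0f;                  // writes through to storage[0]
// storage must outlive ref and must have enough capacity for whatever is written through it.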

View file

@@ -301,7 +301,8 @@ VariableSchema CNTKEvalExtended<ElemType>::GetInputSchema() const
}
template<typename ElemType>
void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Values<ElemType>& outputs)
template<template<typename> class ValueContainer>
void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<ValueContainer, ElemType> >& inputs, std::vector<ValueBuffer<ValueContainer, ElemType> >& outputs)
{
if (!m_started)
RuntimeError("ForwardPass() called before StartForwardEvaluation()");
@@ -315,7 +316,9 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
size_t i = 0;
for (auto& input : m_inputMatrices)
{
ValueBuffer<ElemType> buffer = inputs[i];
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
ValueBuffer<ValueContainer, ElemType>& buffer = const_cast<ValueBuffer<ValueContainer, ElemType>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto type = matrix->GetMatrixType();
int numRows = input.second.sampleLayout.GetNumElements();
@@ -333,7 +336,7 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size()-1] != buffer.m_indices.size())
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
RuntimeError("Input %ls: Last element of column indices must be equal to the size of indices (%ld), but was %d", m_inputNodes[i]->GetName().c_str(), buffer.m_indices.size(), buffer.m_colIndices[buffer.m_colIndices.size() - 1]);
}
@@ -341,7 +344,7 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
{
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
@@ -357,14 +360,14 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
}
ComputationNetwork::BumpEvalTimeStamp(m_inputNodes);
for (size_t i = 0; i < m_outputNodes.size(); ++i)
{
auto node = m_outputNodes[i];
m_net->ForwardProp(node);
shared_ptr<Matrix<ElemType>> outputMatrix = dynamic_pointer_cast<Matrix<ElemType>>(node->ValuePtr());
auto pMBLayout = node->GetMBLayout();
if (!pMBLayout)
if (!pMBLayout)
{
pMBLayout = make_shared<MBLayout>();
pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
@@ -376,8 +379,8 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
RuntimeError("Only 1 output sequence supported by this API");
}
std::vector<ElemType>& vec = outputs[i].m_buffer;
ValueContainer<ElemType>& vec = outputs[i].m_buffer;
size_t numElements = outputMatrix->GetNumElements();
if (vec.capacity() < numElements)
@@ -392,6 +395,18 @@ void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Val
}
}
template<typename ElemType>
void CNTKEvalExtended<ElemType>::ForwardPass(const Values<ElemType>& inputs, Values<ElemType>& outputs)
{
ForwardPassT(inputs, outputs);
}
template<typename ElemType>
void CNTKEvalExtended<ElemType>::ForwardPass(const ValueRefs<ElemType>& inputs, ValueRefs<ElemType>& outputs)
{
ForwardPassT(inputs, outputs);
}
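The two public overloads share one implementation: because a virtual member function cannot itself be a template in C++, each ForwardPass simply forwards to the private member template ForwardPassT, which is instantiated once for Vector and once for VectorRef. A stripped-down sketch of the idiom (the real class is CNTKEvalExtended<ElemType>; this is illustration only):

template <typename ElemType>
class EvaluatorSketch
{
public:
    virtual void ForwardPass(const Values<ElemType>& in, Values<ElemType>& out)       { ForwardPassT(in, out); }
    virtual void ForwardPass(const ValueRefs<ElemType>& in, ValueRefs<ElemType>& out) { ForwardPassT(in, out); }
private:
    template <template <typename> class Container>  // Container is Vector or VectorRef
    void ForwardPassT(const std::vector<ValueBuffer<Container, ElemType>>& in,
                      std::vector<ValueBuffer<Container, ElemType>>& out);
};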
template <typename ElemType>
void CNTKEvalExtended<ElemType>::Destroy()
{

View file

@@ -100,6 +100,8 @@ public:
virtual void ForwardPass(const Values<ElemType>& inputs, Values<ElemType>& output) override;
virtual void ForwardPass(const ValueRefs<ElemType>& inputs, ValueRefs<ElemType>& output) override;
virtual void Destroy() override;
virtual void CreateNetwork(const std::string& networkDescription) override
@@ -118,5 +120,9 @@ private:
std::vector<ComputationNodeBasePtr> m_inputNodes;
StreamMinibatchInputs m_inputMatrices;
bool m_started;
template<template<typename> class ValueContainer>
void ForwardPassT(const std::vector < ValueBuffer<ValueContainer, ElemType> >& inputs,
std::vector < ValueBuffer<ValueContainer, ElemType> >& outputs);
};
} } }

View file

@@ -89,8 +89,7 @@ BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)
Values<float> outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
// Allocate the input values layer (empty)
Values<float> inputBuffer;
Values<float> inputBuffer(0);
// We can call the evaluate method and get back the results...
eval->ForwardPass(inputBuffer, outputBuffer);
@@ -161,10 +160,10 @@ BOOST_AUTO_TEST_CASE(EvalScalarTimesDualOutputTest)
eval = SetupNetworkAndGetLayouts(modelDefinition, inputLayouts, outputLayouts);
// Allocate the output values layer
std::vector<ValueBuffer<float>> outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
auto outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
// Allocate the input values layer
std::vector<ValueBuffer<float>> inputBuffer(1);
Values<float> inputBuffer(1);
inputBuffer[0].m_buffer = { 2 };
// We can call the evaluate method and get back the results...
@@ -197,14 +196,14 @@ BOOST_AUTO_TEST_CASE(EvalDenseTimesTest)
eval = SetupNetworkAndGetLayouts(modelDefinition, inputLayouts, outputLayouts);
// Allocate the output values layer
std::vector<ValueBuffer<float>> outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
Values<float> outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
// Number of inputs must adhere to the schema
std::vector<ValueBuffer<float>> inputBuffer1(0);
Values<float> inputBuffer1(0);
BOOST_REQUIRE_THROW(eval->ForwardPass(inputBuffer1, outputBuffer), std::exception); // Not enough inputs
// Number of elements in the input must adhere to the schema
std::vector<ValueBuffer<float>> inputBuffer(1);
Values<float> inputBuffer(1);
inputBuffer[0].m_buffer = { 1, 2, 3 };
BOOST_REQUIRE_THROW(eval->ForwardPass(inputBuffer, outputBuffer), std::exception); // Not enough elements in the sample
@@ -216,6 +215,17 @@ BOOST_AUTO_TEST_CASE(EvalDenseTimesTest)
auto buf = outputBuffer[0].m_buffer;
BOOST_CHECK_EQUAL_COLLECTIONS(buf.begin(), buf.end(), expected.begin(), expected.end());
// Do the same via ValueRefs
ValueRefs<float> inputRefs(1);
inputRefs[0].m_buffer.InitFrom(inputBuffer[0].m_buffer);
inputRefs[0].m_colIndices.InitFrom(inputBuffer[0].m_colIndices);
inputRefs[0].m_indices.InitFrom(inputBuffer[0].m_indices);
ValueRefs<float> outputRefs(1);
std::vector<float> output(1);
outputRefs[0].m_buffer.InitFrom(output);
eval->ForwardPass(inputRefs, outputRefs);
BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), expected.begin(), expected.end());
eval->Destroy();
}
@@ -238,10 +248,10 @@ BOOST_AUTO_TEST_CASE(EvalSparseTimesTest)
eval = SetupNetworkAndGetLayouts(modelDefinition, inputLayouts, outputLayouts);
// Allocate the output values layer
std::vector<ValueBuffer<float>> outputBuffer = outputLayouts.CreateBuffers<float>({ 3 });
Values<float> outputBuffer = outputLayouts.CreateBuffers<float>({ 3 });
// Allocate the input values layer
std::vector<ValueBuffer<float>> inputBuffer(1);
Values<float> inputBuffer(1);
inputBuffer[0].m_buffer = {1, 2, 3, 5, 6};
inputBuffer[0].m_indices = {0, 2, 2, 1, 2};
@@ -267,6 +277,17 @@ BOOST_AUTO_TEST_CASE(EvalSparseTimesTest)
auto buf = outputBuffer[0].m_buffer;
BOOST_CHECK_EQUAL_COLLECTIONS(buf.begin(), buf.end(), expected.begin(), expected.end());
// Do the same via ValueRefs
ValueRefs<float> inputRefs(1);
inputRefs[0].m_buffer.InitFrom(inputBuffer[0].m_buffer);
inputRefs[0].m_colIndices.InitFrom(inputBuffer[0].m_colIndices);
inputRefs[0].m_indices.InitFrom(inputBuffer[0].m_indices);
ValueRefs<float> outputRefs(1);
std::vector<float> output(3);
outputRefs[0].m_buffer.InitFrom(output);
eval->ForwardPass(inputRefs, outputRefs);
BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), expected.begin(), expected.end());
outputBuffer = outputLayouts.CreateBuffers<float>({ 1 });
BOOST_REQUIRE_THROW(eval->ForwardPass(inputBuffer, outputBuffer), std::exception); // Not enough capacity in output.