Add CopyTo for one-hot vectors; refactor the code shared between the CopyTo methods

Fix a bug when using a GPU device: the cpuArrayView must stay alive for the whole function scope. Add CopyTo for OneHot; support Value in both dense and sparse formats.
2016-12-17 22:07:11 +01:00 · 2016-12-17 22:07:11 +01:00 · f1cbc08e94
--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@ -21,6 +21,7 @@
 #include <algorithm>
 #include <mutex>
 #include <future>
+#include <functional>

 #ifdef SWIG
 #define final
@ -956,24 +957,10 @@ namespace CNTK
        }

        template <typename ElementType>
-        void CopyTo(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLens, bool isResizeable = true)
+        void CopyTo(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLengths, bool isResizeable = true)
        {
-            // Check the data type matches
-            if (AsDataType<ElementType>() != GetDataType())
-                InvalidArgument("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(GetDataType()));
-
-            // Todo: convert sparse into dense.
-            if (GetStorageFormat() != StorageFormat::Dense)
-                InvalidArgument("Only the dense storage format is supported now.");
-
-            auto valueRank = Shape().Rank();
-            auto sampleRank = sampleShape.Rank();
-            if ((valueRank < sampleRank + 1) || (valueRank > sampleRank + 2) || (sampleShape != Shape().SubShape(0, sampleRank)))
-                RuntimeError("The variable and the Value does not have the same tensor shape.");
-
-            CheckAndResizeOutputBuffer(sampleRank, sampleShape.TotalSize(), sequences, sequenceLens, isResizeable);
-
-            CopyToImpl(sampleShape, sequences, sequenceLens);
+            CheckAndResizeOutputBuffer(sampleShape.Rank(), sampleShape.TotalSize(), sequences, sequenceLengths, isResizeable);
+            CopyToVector(sampleShape, sequences, sequenceLengths);
        }

        ///
@ -981,38 +968,26 @@ namespace CNTK
        /// The sequence buffer is on CPU.
        /// The Value should have the same axes as variable.
        ///
-        template <typename ElementType>
        void CopyTo(const NDShape& sampleShape, std::vector<std::vector<size_t>>& sequences)
        {
            std::vector<size_t> seqLens;
            CopyTo(sampleShape, sequences, seqLens, true);
        }

-        template <typename ElementType>
-        void CopyTo(const NDShape& sampleShape, std::vector<std::vector<size_t>>& sequences, std::vector<size_t>& sequenceLens, bool isResizeable = true)
+        void CopyTo(const NDShape& sampleShape, std::vector<std::vector<size_t>>& sequences, std::vector<size_t>& sequenceLengths, bool isResizeable = true)
        {
-            // Check the data type matches
-            if (AsDataType<ElementType>() != GetDataType())
-                InvalidArgument("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(GetDataType()));
-
-            // Todo: convert sparse into dense.
-            if (GetStorageFormat() != StorageFormat::Dense)
-                InvalidArgument("Only the dense storage format is supported now.");
-
-            if (sampleShape[0] != sampleShape.TotalSize())
-                InvalidArgument("")
-
-            auto valueRank = Shape().Rank();
-            auto sampleRank = sampleShape.Rank();
-            if ((valueRank < sampleRank + 1) || (valueRank > sampleRank + 2) || (sampleShape != Shape().SubShape(0, sampleRank)))
-                RuntimeError("The variable and the Value does not have the same tensor shape.");
-
            // For OneHot vector, only 1 value is needed for a sample.
-            CheckAndResizeOutputBuffer(sampleRank, 1, sequences, sequenceLens, isResizeable);
-
-            // CopyToImpl(sampleShape, sequences, sequenceLens);
+            CheckAndResizeOutputBuffer(sampleShape.Rank(), 1, sequences, sequenceLengths, isResizeable);
+            auto dataType = GetDataType();
+            if (dataType == DataType::Float)
+            {
+                CopyToVector<float>(sampleShape, sequences, sequenceLengths);
+            } 
+            else if (dataType == DataType::Double)
+            {
+                CopyToVector<double>(sampleShape, sequences, sequenceLengths);
+            }
        }
-

    private:
        template <typename ElementType>
@ -1021,10 +996,16 @@ namespace CNTK
        CNTK_API static ValuePtr Create(const NDShape& sampleShape, const std::vector<NDArrayViewPtr>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly, bool createNewCopy);

        template <typename ElementType>
-        CNTK_API void CopyToImpl(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLens);
+        CNTK_API void CopyToVector(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLengths);

        template <typename ElementType>
-        void CheckAndResizeOutputBuffer(const size_t sampleRank, const size_t sampleSize, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLens, bool isResizeable)
+        CNTK_API void CopyToVector(const NDShape& sampleShape, std::vector<std::vector<size_t>>& sequences, std::vector<size_t>& sequenceLengths);
+
+        template <typename ValueType, typename DestType>
+        void CopyToImpl(const NDShape& sampleShape, std::vector<std::vector<DestType>>& sequences, std::vector<size_t>& sequenceLengths);
+
+        template <typename ElementType>
+        void CheckAndResizeOutputBuffer(const size_t sampleRank, const size_t sampleSize, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLengths, bool isResizeable)
        {
            auto valueRank = Shape().Rank();
            size_t numOfSequences;
@ -1044,15 +1025,19 @@ namespace CNTK
            }

            // resize the sequnce length buffer to reflect the number of sequences in output.
-            if (sequenceLens.size() < numOfSequences)
-                sequenceLens.resize(numOfSequences);
+            if (sequenceLengths.size() < numOfSequences)
+                sequenceLengths.resize(numOfSequences);

            // Check whether the additional space in the sequences output buffer needs to be allocated if it is resizeable.
            if (isResizeable)
            {
                const MaskKind* maskData = nullptr;
-                if (m_mask != nullptr)
-                    maskData = Device() != DeviceDescriptor::CPUDevice() ? m_mask->DeepClone(DeviceDescriptor::CPUDevice())->DataBuffer() : m_mask->DataBuffer();
+                NDMaskPtr cpuMask = nullptr;
+                if (Mask() != nullptr)
+                {
+                    cpuMask = (Device() != DeviceDescriptor::CPUDevice()) ? Mask()->DeepClone(DeviceDescriptor::CPUDevice()) : Mask();
+                    maskData = cpuMask->DataBuffer();
+                }

                size_t sampleCount, seqStart;
                for (auto seqIndex = 0; seqIndex < numOfSequences; seqIndex++)
--- a/Source/CNTKv2LibraryDll/Value.cpp
+++ b/Source/CNTKv2LibraryDll/Value.cpp
@ -359,197 +359,150 @@ namespace CNTK
        }
    }

-    // If outputData.size() is 0, CNTK will alocate stroage for data. Otherwise, the caller is reposible for allocating sufficient stroage space for saving the data.
+    template <typename ElementType, typename DestType>
+    void DirectCopy(const ElementType *source, const size_t sampleCount, const size_t sampleSize, std::vector<DestType>& dest, size_t& destSampleStart);
+
+    template <typename ElementType, typename DestType>
+    void CopyDenseToOneHot(const ElementType *source, const size_t sampleCount, const size_t sampleSize, std::vector<DestType>& dest, size_t& destSampleStart);
+
    template <typename ElementType>
-    void Value::CopyTo(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& outputData)
+    void Value::CopyToVector(const NDShape& sampleShape, std::vector<std::vector<ElementType>>& sequences, std::vector<size_t>& sequenceLengths)
    { 
        // Check the data type matches
        if (AsDataType<ElementType>() != GetDataType())
            InvalidArgument("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(GetDataType()));

-        // Todo: convert sparse into dense.
-        if (GetStorageFormat() != StorageFormat::Dense)
-            InvalidArgument("Only the dense storage format is supported now.");
+        CopyToImpl<ElementType, ElementType>(sampleShape, sequences, sequenceLengths);
+    }

+    template <typename ElementType>
+    CNTK_API void Value::CopyToVector(const NDShape& sampleShape, std::vector<std::vector<size_t>>& sequences, std::vector<size_t>& sequenceLengths)
+    {
+        if (sampleShape[0] != sampleShape.TotalSize())
+            InvalidArgument("");
+
+        CopyToImpl<ElementType, size_t>(sampleShape, sequences, sequenceLengths);
+    }
+
+    template <typename ValueType, typename DestType>
+    void Value::CopyToImpl(const NDShape& sampleShape, 
+                           std::vector<std::vector<DestType>>& sequences, 
+                           std::vector<size_t>& sequenceLengths)
+    {
        auto valueRank = Shape().Rank();
-        if ((valueRank <= 2) || (sampleShape != Shape().SubShape(0, valueRank - 2))
+        auto sampleRank = sampleShape.Rank();
+        if ((valueRank < sampleRank + 1) || (valueRank > sampleRank + 2) || (sampleShape != Shape().SubShape(0, sampleRank)))
            RuntimeError("The variable and the Value does not have the same tensor shape.");

-        // Copy data to the CPU device if required.
-        NDArrayViewPtr cpuArrayView;
-        NDMaskPtr cpuNDMask;
-        if (Device() != DeviceDescriptor::CPUDevice())
+        size_t numOfSequences;
+        size_t maxSequenceLen;
+        if (valueRank == sampleShape.Rank() + 1)
        {
-            cpuArrayView = m_data->DeepClone(DeviceDescriptor::CPUDevice());
-            cpuNDMask = m_mask->DeepClone(DeviceDescriptor::CPUDevice());
+            // no batch axis, only sequence axis
+            numOfSequences = 1;
+            maxSequenceLen = Shape()[valueRank - 1];
        }
        else
        {
-            cpuArrayView = m_data;
-            cpuNDMask = m_mask;
+            assert(valueRank == sampleShape.Rank() + 2);
+            numOfSequences = Shape()[valueRank - 1];
+            maxSequenceLen = Shape()[valueRank - 2];
        }

-        auto maskData = cpuNDMask->DataBuffer();
-        auto valueData = cpuArrayView->DataBuffer<ElementType>();
-        auto numOfSequences = Shape()[valueRank - 1];
-        auto maxSequenceLen = Shape()[valueRank - 2];
+        // Check output buffer size
+        if (sequences.size() < numOfSequences)
+            RuntimeError("The size of output buffer is too small");
+
+        // Check sequenceLengths size.
+        if (sequenceLengths.size() < numOfSequences)
+        {
+            RuntimeError("The size of sequenceLengths does not match.");
+        }
+        else
+        {
+            for (size_t i = numOfSequences; i < sequenceLengths.size(); i++)
+                sequenceLengths[i] = 0;
+        }
+
+        // Copy data to the CPU device if required.
+        const ValueType *valueData;
+        const MaskKind* maskData;
+        NDArrayViewPtr cpuArrayView;
+        NDMaskPtr cpuMask;
+        if (Device() != DeviceDescriptor::CPUDevice())
+        {
+            // Todo: leverage sparse if the original NDArrayView is in sparse format.
+            cpuArrayView = MakeSharedObject<NDArrayView>(GetDataType(), Data()->Shape(), DeviceDescriptor::CPUDevice());
+            cpuArrayView->CopyFrom(*Data());
+            cpuMask = Mask() != nullptr ? Mask()->DeepClone(DeviceDescriptor::CPUDevice()) : nullptr;
+        }
+        else
+        {
+            // Todo: process sparse data directly, without a copy.
+            if (GetStorageFormat() != StorageFormat::Dense)
+            {
+                cpuArrayView = MakeSharedObject<NDArrayView>(GetDataType(), Data()->Shape(), DeviceDescriptor::CPUDevice());
+                cpuArrayView->CopyFrom(*Data());
+            }
+            else
+            {
+                cpuArrayView = Data();
+            }
+            cpuMask = Mask();
+        }
+        valueData = cpuArrayView->DataBuffer<ValueType>();
+        maskData = cpuMask != nullptr ? cpuMask->DataBuffer() : nullptr;
+
        auto sampleSize = sampleShape.TotalSize();
-        bool needStorage = false;
-
-        if (outputData.size() == 0)
-        {
-            needStorage = true;
-        }
-        else if (numOfSequences > outputData.size())
-        {
-            RuntimeError("The size of the output buffer is smaller than the number of sequences.");
-        }
-
-        const ElementType *first, *last;
-        ElementType *dest;
-        std::vector<ElementType> seqBuf;
-        size_t count, current;
        for (auto seqIndex = 0; seqIndex < numOfSequences; seqIndex++)
        {
            size_t seqStart = seqIndex * maxSequenceLen;
-
-            // Check the number of valid elements. 
-            // Not using MaskedCount() to avoid extra data copy.
-            count = 0;
-            for (int i = 0; i < maxSequenceLen; i++)
+            size_t destSampleCount = 0;
+            if (maskData == nullptr)
            {
-                if (maskData[seqStart + i] != MaskKind::Invalid)
-                    count++;
-            }
-
-            if (needStorage)
+                // Todo: if function pointer or lambda could support template, switch to use them.
+                if (typeid(DestType) == typeid(size_t))
                {
-                auto p = new std::vector<ElementType>(count * sampleSize);
-                outputData.push_back(*p);
+                    CopyDenseToOneHot<ValueType, DestType>(valueData + seqStart * sampleSize, maxSequenceLen, sampleSize, sequences[seqIndex], destSampleCount);
                }
-
-            seqBuf = outputData[seqIndex];
-            if (count * sampleSize > seqBuf.size())
+                else
                {
-                RuntimeError("The sequenth %lu contains more data than the size of the provided vector.\n", (unsigned long)seqIndex);
+                    DirectCopy<ValueType, DestType>(valueData + seqStart * sampleSize, maxSequenceLen, sampleSize, sequences[seqIndex], destSampleCount);
                }
-
-            dest = seqBuf.data();
-            current = 0;
-            while (current < maxSequenceLen)
+                sequenceLengths[seqIndex] = destSampleCount;
+            }
+            else
+            {
+                // NDMask is not null
+                size_t current = seqStart;
+                size_t seqEnd = seqStart + maxSequenceLen;
+                while (current < seqEnd)
                {
                    // find first valid mask.
-                while ((maskData[seqStart + current] == MaskKind::Invalid) && (current < maxSequenceLen))
+                    while ((current < seqEnd) && (maskData[current] == MaskKind::Invalid))
                        current++;
-                first = valueData + (seqStart + current) * sampleSize;
+                    auto sampleStart = current;

                    // find the next invalid mask.
-                while ((maskData[seqStart + current] != MaskKind::Invalid) && (current < maxSequenceLen))
+                    while ((current < seqEnd) && (maskData[current] != MaskKind::Invalid))
                        current++;
-                last = valueData + (seqStart + current) * sampleSize;
-
-                if (last > first)
+                    assert(current >= sampleStart);
+                    if (current > sampleStart)
                    {
-                    std::copy(first, last, dest);
-                    dest += last - first;
-                    assert(dest <= seqBuf.data() + count);
-                }
-            }
-            assert(dest == seqBuf.data() + count);
-        }
-    }
-
-    void Value::CopyTo(const NDShape& sampleShape, std::vector<std::vector<size_t>>& outputData)
+                        // Todo: if function pointer or lambda could support template, switch to use them.
+                        if (typeid(DestType) == typeid(size_t))
                        {
-
-        if (sampleShape.Rank() != 1)
-            RuntimeError("Only data of 1-D tensor can be copied to OneHot vector.");
-
-        auto valueRank = Shape().Rank();
-        // Check the shape matches.
-        if (sampleShape != Shape().SubShape(0, valueRank - 2))
-            InvalidArgument("The variable and the value does not have the same tensor shape.");
-
-        //// Todo: convert sparse into dense.
-        //if (GetStorageFormat() != StorageFormat::Dense)
-        //    InvalidArgument("Only the dense storage format is supported now.");
-
-        //// Copy data to the CPU device if required.
-        //NDArrayViewPtr cpuArrayView;
-        //NDMaskPtr cpuNDMask;
-        //if (Device != DeviceDescriptor::CPUDevice())
-        //{
-        //    cpuArrayView = m_data->DeepClone(DeviceDescriptor::CPUDevice());
-        //    cpuNDMask = m_mask->DeepClone(DeviceDescriptor::CPUDevice());
-        //}
-        //else
-        //{
-        //    cpuArrayView = m_data;
-        //    cpuNDMask = m_mask;
-        //}
-
-        //auto maskData = cpuNDMask->DataBuffer();
-        //auto valueData = cpuArrayView->DataBuffer();
-        //auto numOfSequences = Shape[valueRank - 1];
-        //auto maxSequenceLen = Shape[valueRank - 2];
-        //auto sampleSize = variable.Shape().TotalSize();
-
-        //if (outputData == nullptr)
-        //{
-        //    outputData = new std::vector<std::vector<size_t>>(numOfSequences);
-        //}
-        //if (numOfSequences > outputData.size())
-        //{
-        //    RuntimeError("The size of output buffer is smaller than the number of sequences.");
-        //}
-
-        //ElementType *first, *last, *dest;
-        //size_t count;
-        //for (auto seqIndex = 0; seqIndex < numOfSequences; seqIndex++)
-        //{
-        //    size_t seqStart = seqIndex * maxSequenceLen;
-
-        //    // Check the number of valid elements. 
-        //    // Not using MaskedCount() to avoid extra data copy.
-        //    count = 0;
-        //    for (int i = 0; i < maxSequenceLen; i++)
-        //    {
-        //        if (maskData[seqStart + i] != MaskKind::Invalid)
-        //            count++;
-        //    }
-
-        //    auto seqBuf = outputData[seqIndex];
-        //    if (seqBuf == nullptr)
-        //    {
-        //        outputData[seqIndex] = seqBuf = new std::vector<ElementType>(count * sampleSize);
-        //    }
-        //    if (count * sampleSize > seqBuf.size())
-        //    {
-        //        RuntimeError("The sequenth %lu contains more data than the buffer size.\n", (unsigned long)seqIndex);
-        //    }
-        //    dest = seqBuf;
-
-        //    while (current < maxSequenceLen)
-        //    {
-        //        // find first valid mask.
-        //        while ((maskData[seqStart + current] == MaskKind::Invalid) && (current < maxSequenceLen))
-        //            current++;
-        //        first = valueData + (seqStart + current) * sampleSize;
-
-        //        // find the next invalid mask.
-        //        while ((maskData[seqStart + current] != MaskKind::Invalid) && (current < maxSequenceLen))
-        //            current++;
-        //        last = valueData + (seqStart + current) * sampleSize;
-
-        //        if (last > first)
-        //        {
-        //            std::copy(first, last, dest);
-        //            dest += last - first;
-        //            assert(dest <= seqBuf + count);
-        //        }
-        //    }
-        //    assert(dest == seqBuf + count);
-        //}
+                            CopyDenseToOneHot<ValueType, DestType>(valueData + seqStart * sampleSize, current - sampleStart, sampleSize, sequences[seqIndex], destSampleCount);
+                        }
+                        else
+                        {
+                            DirectCopy<ValueType, DestType>(valueData + seqStart * sampleSize, current - sampleStart, sampleSize, sequences[seqIndex], destSampleCount);
+                        }
+                    }
+                }
+                sequenceLengths[seqIndex] = destSampleCount;
+            }
+        }
    }

    void PackedValue::Unpack() const
@ -585,11 +538,69 @@ namespace CNTK
        }
    }

+    template <typename ElementType, typename DestType>
+    void DirectCopy(const ElementType *source, const size_t sampleCount, const size_t sampleSize, std::vector<DestType>& dest, size_t& destSampleStart)
+    {
+        if (typeid(ElementType) != typeid(DestType))
+            RuntimeError("Source and destination must be the same data type.");
+
+        DestType *destData = dest.data();
+        if ((destSampleStart + sampleCount) * sampleSize > dest.size())
+            RuntimeError("The output buffer is too small.");
+        std::copy(source, source + sampleCount * sampleSize, reinterpret_cast<ElementType *>(destData + destSampleStart * sampleSize));
+        destSampleStart += sampleCount;
+    }
+
+    template <typename ElementType, typename DestType>
+    void CopyDenseToOneHot(const ElementType *source, const size_t sampleCount, const size_t sampleSize, std::vector<DestType>& dest, size_t& destSampleStart)
+    {
+        if (typeid(DestType) != typeid(size_t))
+        {
+            RuntimeError("The destination data type must be size_t.");
+        }
+
+        const ElementType *currentp = source;
+        const ElementType *lastp = source + sampleCount * sampleSize;
+        while (currentp < lastp)
+        {
+            auto sampleEndp = currentp + sampleSize;
+            auto indexp = std::find_if(currentp, sampleEndp, [](const ElementType val) {
+                return val != 0;
+            });
+
+            if (indexp == sampleEndp)
+            {
+                RuntimeError("Cannot convert to onehot vector: the sample does not have any non-zero value.");
+            }
+            else
+            {
+                if (std::find_if(indexp + 1, sampleEndp, [](const ElementType val) {
+                    return val != 0;
+                }) != sampleEndp)
+                {
+                    RuntimeError("Cannot convert to onehot vector: more than one non-zero value in the sample.");
+                }
+                else
+                {
+                    if (destSampleStart >= dest.size())
+                        RuntimeError("The output buffer is too small.");
+                    else
+                    {
+                        dest[destSampleStart++] = static_cast<DestType>(indexp - currentp);
+                    }
+                }
+            }
+            currentp += sampleSize;
+        }
+        assert(currentp == lastp);
+    }
+
    // Explicit template instantiations
    template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<float>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
    template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<double>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
    template /*static*/ CNTK_API ValuePtr Value::Create<float>(size_t vocabSize, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
    template /*static*/ CNTK_API ValuePtr Value::Create<double>(size_t vocabSize, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
-    template CNTK_API void Value::CopyTo<float>(const NDShape& sampleShape, std::vector<std::vector<float>>& sequences);
-    template CNTK_API void Value::CopyTo<double>(const NDShape& sampleShape, std::vector<std::vector<double>>& sequences);
+    template CNTK_API void Value::CopyToVector<float>(const NDShape& sampleShape, std::vector<std::vector<float>>& sequences, std::vector<size_t>& sequencesLens);
+    template CNTK_API void Value::CopyToVector<double>(const NDShape& sampleShape, std::vector<std::vector<double>>& sequences, std::vector<size_t>& sequencesLens);
+
 }
--- a/Tests/UnitTests/V2LibraryTests/ValueTests.cpp
+++ b/Tests/UnitTests/V2LibraryTests/ValueTests.cpp
@ -147,6 +147,7 @@ void FillDenseMatrixData(vector<vector<ElementType>>& databuf,  const vector<siz
 template <typename ElementType>
 void ValueCreationNoNDMaskTest(const DeviceDescriptor device, bool readOnly)
 {
+    //Todo: test numberOfSequences == 1: no batch access, need to adapt checkShape(), CheckValue()
    size_t numberOfSequences = 5;
    size_t seqLen = 4;
    vector<size_t> dims{3, 2};
@ -242,24 +243,24 @@ void ValueCreationOneHotWithNDMaskTest(const DeviceDescriptor device, bool readO
 template <typename ElementType>
 void CheckCopyToOutput(const size_t sampleSize, std::vector<std::vector<ElementType>> expected, std::vector<std::vector<ElementType>> actual)
 {
-    std::vector<size_t> seqLens(0);
-    CheckCopyToOutput(sampleSize, expected, actual, seqLens);
+    std::vector<size_t> actualSeqLens(0);
+    CheckCopyToOutput(sampleSize, expected, actual, actualSeqLens);
 }

 template <typename ElementType>
-void CheckCopyToOutput(const size_t sampleSize, std::vector<std::vector<ElementType>>& expected, std::vector<std::vector<ElementType>>& actual, std::vector<size_t>& seqLens)
+void CheckCopyToOutput(const size_t sampleSize, std::vector<std::vector<ElementType>>& expected, std::vector<std::vector<ElementType>>& actual, std::vector<size_t>& actualSeqLens)
 {
    bool useSeqLens;
-    if (seqLens.size() != 0)
+    if (actualSeqLens.size() != 0)
    {
        useSeqLens = true;
-        if (seqLens.size() < expected.size())
-            ReportFailure("The seqLens size does not match. expected: %" PRIu64 " actual: %" PRIu64 "\n", expected.size(), seqLens.size());
+        if (actualSeqLens.size() < expected.size())
+            ReportFailure("The actualSeqLens size does not match. expected: %" PRIu64 " actual: %" PRIu64 "\n", expected.size(), actualSeqLens.size());
        else
        {
-            for (size_t i = expected.size(); i < seqLens.size(); i++)
-                if (seqLens[i] != 0)
-                    ReportFailure("The seqLens contains invalid data.");
+            for (size_t i = expected.size(); i < actualSeqLens.size(); i++)
+                if (actualSeqLens[i] != 0)
+                    ReportFailure("The actualSeqLens contains invalid data.");
        }

        if (actual.size() < expected.size())
@ -274,7 +275,7 @@ void CheckCopyToOutput(const size_t sampleSize, std::vector<std::vector<ElementT

    for (size_t i = 0; i < expected.size(); i++)
    {
-        auto len = useSeqLens ? seqLens[i] * sampleSize : actual[i].size();
+        auto len = useSeqLens ? actualSeqLens[i] * sampleSize : actual[i].size();
        if ((actual[i].size() < len) || (expected[i].size() != len))
        {
            ReportFailure("Seq " PRIu64 " does not match.\n", i);
@ -294,84 +295,249 @@ void ValueCopyToDenseTest(const DeviceDescriptor device)
 {
    NDShape sampleShape{{2, 3}};
    auto sampleSize = sampleShape.TotalSize();
+    std::vector<std::vector<ElementType>> input;
    std::vector<std::vector<ElementType>> output;
-    std::vector<std::vector<ElementType>> sequences;
-    std::vector<size_t> seqLens;
+    std::vector<size_t> expectedSeqLens;
+    std::vector<size_t> actualSeqLens;

+    //Todo: add tests sparse to dense.
    // Check single sample.
-    sequences.clear();
-    sequences.resize(1, std::vector<ElementType>(sampleSize));
-    // Todo: use generateSequences in common.h
-    for (size_t i = 0; i < sampleSize; i++)
-        sequences[0][i] = static_cast<ElementType>(i);
-    auto val = Value::Create(sampleShape, sequences, device);
+    size_t batchCount = 1;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(1);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    auto val = Value::Create(sampleShape, input, device);
+
    val->CopyTo(sampleShape, output);
-    CheckCopyToOutput(sampleSize, sequences, output);
+    CheckCopyToOutput(sampleSize, input, output);

    // Check batch of sample.
-    size_t batchCount = 2;
-    sequences.clear();
-    sequences.resize(2, std::vector<ElementType>(sampleSize));
-    for (size_t s = 0; s < batchCount; s++)
-    {
-        for (size_t i = 0; i < sampleSize; i++)
-        {
-            sequences[s][i] = static_cast<ElementType>(s * 10 + i);
-        }
-    }
-    val = Value::Create(sampleShape, sequences, device);
-    VerifyException([&val, &sampleShape, &output, &seqLens]() {
-        val->CopyTo(sampleShape, output, seqLens, false);
-    }, "The output buffer is too small.");
-    val->CopyTo(sampleShape, output, seqLens);
-    CheckCopyToOutput(sampleSize, sequences, output, seqLens);
+    batchCount = 2;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(1);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    val = Value::Create(sampleShape, input, device);

-    // Check sequence of sample
-    size_t sampleCount = 4;
-    sequences.clear();
-    sequences.resize(1, std::vector<ElementType>(sampleSize * sampleCount));
-    for (size_t i = 0; i < sampleSize * sampleCount; i++)
-    {
-        sequences[0][i] = static_cast<ElementType>(i);
-    }
-    val = Value::Create(sampleShape, sequences, device);
-    VerifyException([&val, &sampleShape, &output, &seqLens]() {
-        val->CopyTo(sampleShape, output, seqLens, false);
+    // The batch axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
    }, "The output buffer is too small.");
-    val->CopyTo(sampleShape, output, seqLens);
-    CheckCopyToOutput(sampleSize, sequences, output, seqLens);
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(sampleSize, input, output);
+
+    // Check sequence of sample, but single batch
+    size_t sampleCount = 4;
+    batchCount = 1;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCount);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    val = Value::Create(sampleShape, input, device);
+
+    // The sequence axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+    
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(sampleSize, input, output, actualSeqLens);

    // Check batch of sequence of the same length, no mask needed.
    batchCount = 4;
    sampleCount = 3;
-    sequences.clear();
-    sequences.resize(batchCount, std::vector<ElementType>(sampleSize * sampleCount));
-    for (size_t s = 0; s < batchCount; s++)
-    {
-        for (size_t i = 0; i < sampleSize * sampleCount; i++)
-        {
-            sequences[s][i] = static_cast<ElementType>(s * 10 + i);
-        }
-    }
-    val = Value::Create(sampleShape, sequences, device);
-    val->CopyTo(sampleShape, output, seqLens);
-    CheckCopyToOutput(sampleSize, sequences, output, seqLens);
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCount);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    val = Value::Create(sampleShape, input, device);
+
+    // The batch axis is too small, the sequence axis is sufficient.
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(sampleSize, input, output, actualSeqLens);

    // Check batch of sequecnes with different length, mask needed.
-    std::vector<size_t> sampleCountList{6, 9, 2, 1, 5, 3, 4};
+    std::vector<size_t> sampleCountList {6, 9, 2};
    batchCount = sampleCountList.size();
-    sequences.clear();
-    sequences.resize(batchCount, std::vector<ElementType>(0));
-    for (size_t s = 0; s < batchCount; s++)
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCountList[i]);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    val = Value::Create(sampleShape, input, device);
+
+    // The batch axis is sufficient, the sequence axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(sampleSize, input, output, actualSeqLens);
+
+    // More batches and sequences
+    sampleCountList = {6, 12, 2, 1, 5, 3, 4};
+    batchCount = sampleCountList.size();
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCountList[i]);
+    input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+    val = Value::Create(sampleShape, input, device);
+
+    // Both the batch and sequence axes are too small.
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(sampleSize, input, output, actualSeqLens);
+
+    // Random batch and sequence
+    int testRun = 4;
+    size_t maxNumOfSequences = 100;
+    size_t maxSequenceLen = 100;
+    // This is only used to generate the number of sequences, so a boost distribution is not needed.
+    std::default_random_engine generator;
+    std::uniform_int_distribution<size_t> distribution(1, maxNumOfSequences);
+    for (int i = 0; i < testRun; i++)
    {
-        for (size_t i = 0; i < sampleSize * sampleCountList[s]; i++)
+        batchCount = distribution(generator);
+
+        expectedSeqLens = GenerateSequenceLengths(batchCount, maxSequenceLen);
+        input = GenerateSequences<ElementType>(expectedSeqLens, sampleShape);
+        val = Value::Create(sampleShape, input, device);
+
+        val->CopyTo(sampleShape, output, actualSeqLens);
+        CheckCopyToOutput(sampleSize, input, output, actualSeqLens);
+    }
+}
+
+template <typename ElementType>
+void ValueCopyToOneHotTest(const DeviceDescriptor device)
+{
+    size_t dim = 100;
+    NDShape sampleShape{{dim}};
+    std::vector<std::vector<size_t>> input;
+    std::vector<std::vector<size_t>> output;
+    std::vector<size_t> expectedSeqLens;
+    std::vector<size_t> actualSeqLens;
+
+    // Todo: add tests dense to sparse
+    // Check single sample.
+    size_t batchCount = 1;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(1);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    auto val = Value::Create<ElementType>(dim, input, device);
+
+    val->CopyTo(sampleShape, output);
+    CheckCopyToOutput(1, input, output);
+
+    // Check batch of sample.
+    batchCount = 2;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(1);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    val = Value::Create<ElementType>(dim, input, device);
+
+    // The batch axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(1, input, output);
+
+    // Check sequence of sample, but single batch
+    size_t sampleCount = 4;
+    batchCount = 1;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCount);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    val = Value::Create<ElementType>(dim, input, device);
+
+    // The sequence axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(1, input, output, actualSeqLens);
+
+    // Check batch of sequence of the same length, no mask needed.
+    batchCount = 4;
+    sampleCount = 3;
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCount);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    val = Value::Create<ElementType>(dim, input, device);
+
+    // The batch axis is too small, the sequence axis is sufficient.
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(1, input, output, actualSeqLens);
+
+    // Check batch of sequences with different lengths, mask needed.
+    std::vector<size_t> sampleCountList{6, 9, 2};
+    batchCount = sampleCountList.size();
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCountList[i]);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    val = Value::Create<ElementType>(dim, input, device);
+
+    // The batch axis is sufficient, the sequence axis is too small
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(1, input, output, actualSeqLens);
+
+    // More batches and sequences
+    sampleCountList = {6, 12, 2, 1, 5, 3, 4};
+    batchCount = sampleCountList.size();
+    expectedSeqLens.clear();
+    for (size_t i = 0; i < batchCount; i++)
+        expectedSeqLens.push_back(sampleCountList[i]);
+    input = GenerateOneHotSequences(expectedSeqLens, dim);
+    val = Value::Create<float>(dim, input, device);
+
+    // Both the batch and sequence axes are too small.
+    VerifyException([&val, &sampleShape, &output, &actualSeqLens]() {
+        val->CopyTo(sampleShape, output, actualSeqLens, false);
+    }, "The output buffer is too small.");
+    val->CopyTo(sampleShape, output, actualSeqLens);
+    CheckCopyToOutput(1, input, output, actualSeqLens);
+
+    // Random batch and sequence
+    int testRun = 4;
+    size_t maxNumOfSequences = 100;
+    size_t maxSequenceLen = 100;
+    // This is only used to generate the number of sequences, so a boost distribution is not needed.
+    std::default_random_engine generator;
+    std::uniform_int_distribution<size_t> distribution(1, maxNumOfSequences);
+    for (int i = 0; i < testRun; i++)
    {
-            sequences[s].push_back(static_cast<ElementType>(s * 10 + i));
+        batchCount = distribution(generator);
+
+        expectedSeqLens = GenerateSequenceLengths(batchCount, maxSequenceLen);
+        input = GenerateOneHotSequences(expectedSeqLens, dim);
+        val = Value::Create<ElementType>(dim, input, device);
+
+        val->CopyTo(sampleShape, output, actualSeqLens);
+        CheckCopyToOutput(1, input, output, actualSeqLens);
    }
-    }
-    val = Value::Create(sampleShape, sequences, device);
-    val->CopyTo(sampleShape, output, seqLens);
-    CheckCopyToOutput(sampleSize, sequences, output, seqLens);
 }

 void TestSettingParameterValuesManually(const DeviceDescriptor& device)
@ -436,6 +602,7 @@ void SparseSequenceBatchValueCreationTest(size_t vocabSize, size_t maxAllowedSeq
 void ValueTests()
 {
    fprintf(stderr, "\nValueTests..\n");
+    srand(1);

    TestSettingParameterValuesManually(DeviceDescriptor::CPUDevice());

@ -447,12 +614,12 @@ void ValueTests()
    ValueCreationOneHotNoNDMaskTest<double>(DeviceDescriptor::CPUDevice(), true);
    ValueCreationOneHotWithNDMaskTest<double>(DeviceDescriptor::CPUDevice(), false);
    ValueCreationOneHotWithNDMaskTest<float>(DeviceDescriptor::CPUDevice(), true);
-<<<<<<< HEAD
    SparseSequenceBatchValueCreationTest(300, 7, DeviceDescriptor::CPUDevice());
    SparseSequenceBatchValueCreationTest(2300, 1, DeviceDescriptor::CPUDevice());
-=======
    ValueCopyToDenseTest<float>(DeviceDescriptor::CPUDevice());
->>>>>>> 0a57cbc... refactor Value::CopyTo to allocate memory in .h file; adapt unit tests; add sequencesLens as parameter for Value::CopyTo
+    ValueCopyToDenseTest<double>(DeviceDescriptor::CPUDevice());
+    ValueCopyToOneHotTest<float>(DeviceDescriptor::CPUDevice());
+    ValueCopyToOneHotTest<double>(DeviceDescriptor::CPUDevice());

    if (IsGPUAvailable())
    {
@ -466,11 +633,11 @@ void ValueTests()
        ValueCreationOneHotNoNDMaskTest<float>(DeviceDescriptor::GPUDevice(0), true);
        ValueCreationOneHotWithNDMaskTest<float>(DeviceDescriptor::GPUDevice(0), false);
        ValueCreationOneHotWithNDMaskTest<double>(DeviceDescriptor::GPUDevice(0), true);
-<<<<<<< HEAD
        SparseSequenceBatchValueCreationTest(50000, 1, DeviceDescriptor::GPUDevice(0));
        SparseSequenceBatchValueCreationTest(6000, 6, DeviceDescriptor::GPUDevice(0));
-=======
        ValueCopyToDenseTest<float>(DeviceDescriptor::GPUDevice(0));
->>>>>>> 0a57cbc... refactor Value::CopyTo to allocate memory in .h file; adapt unit tests; add sequencesLens as parameter for Value::CopyTo
+        ValueCopyToDenseTest<double>(DeviceDescriptor::GPUDevice(0));
+        ValueCopyToOneHotTest<float>(DeviceDescriptor::GPUDevice(0));
+        ValueCopyToOneHotTest<double>(DeviceDescriptor::GPUDevice(0));
    }
 }