Mirror of https://github.com/microsoft/LightGBM.git
[SWIG] Add streaming data support + cpp tests (#3997)
* [feature] Add ChunkedArray to SWIG * Add ChunkedArray * Add ChunkedArray_API_extensions.i * Add SWIG class wrappers * Address some review comments * Fix linting issues * Move test to tests/test_ChunkedArray_manually.cpp * Add test note * Move ChunkedArray to include/LightGBM/utils/ * Declare more explicit types of ChunkedArray in the SWIG API. * Port ChunkedArray tests to googletest * Please C++ linter * Address StrikerRUS' review comments * Update SWIG doc & disable ChunkedArray<int64_t> * Use CHECK_EQ instead of assert * Change include order (linting) * Rename ChunkedArray -> chunked_array files * Change header guards * Address last comments from StrikerRUS
This commit is contained in:
Parent
971b548687
Commit
4ded1342ae
|
@ -64,7 +64,7 @@ if [[ $TASK == "lint" ]]; then
|
|||
echo "Linting R code"
|
||||
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
|
||||
echo "Linting C++ code"
|
||||
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package || exit -1
|
||||
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig || exit -1
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,260 @@
|
|||
/*!
|
||||
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License. See LICENSE file in the project root for license information.
|
||||
*
|
||||
* Author: Alberto Ferreira
|
||||
*/
|
||||
#ifndef LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
|
||||
#define LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
|
||||
|
||||
#include <LightGBM/utils/log.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <new>
|
||||
#include <vector>
|
||||
|
||||
|
||||
namespace LightGBM {
|
||||
|
||||
/**
|
||||
* Container that manages a dynamic array of fixed-length chunks.
|
||||
*
|
||||
* The class also takes care of allocation & release of the underlying
|
||||
* memory. It can be used with either a high or low-level API.
|
||||
*
|
||||
* The high-level API allocates chunks as needed, manages addresses automatically and keeps
|
||||
* track of number of inserted elements, but is not thread-safe (this is ok as usually input is a streaming iterator).
|
||||
* For parallel input sources the low-level API must be used.
|
||||
*
|
||||
* Note: When using this for `LGBM_DatasetCreateFromMats` use a
|
||||
* chunk_size multiple of #num_cols for your dataset, so each chunk
|
||||
* contains "complete" instances.
|
||||
*
|
||||
* === High-level insert API intro ===
|
||||
*
|
||||
* The easiest way to use is:
|
||||
* 0. ChunkedArray(chunk_size) # Choose appropriate size
|
||||
* 1. add(value) # as many times as you want (will generate chunks as needed)
|
||||
* 2. data() or void_data() # retrieves a T** or void** pointer (useful for `LGBM_DatasetCreateFromMats`).
|
||||
*
|
||||
* Useful query methods (all O(1)):
|
||||
* - get_add_count() # total count of added elements.
|
||||
* - get_chunks_count() # how many chunks are currently allocated.
|
||||
* - get_current_chunk_added_count() # for the last add() chunk, how many items there are.
|
||||
* - get_chunk_size() # get constant chunk_size from constructor call.
|
||||
*
|
||||
* With those you can generate int32_t sizes[]. Last chunk can be smaller than chunk_size, so, for any i:
|
||||
* - sizes[i<last] = get_chunk_size()
|
||||
* - sizes[i==last] = get_add_count()
|
||||
*
|
||||
*
|
||||
* === Low-level insert API intro ===
|
||||
*
|
||||
* For advanced usage - useful for inserting in parallel - one can also:
|
||||
* 1. call new_chunk() at any time for as many chunks as needed. (thread-UNsafe)
|
||||
* 2. call setitem(chunk, idx, value) to insert each value. (thread-safe)
|
||||
*
|
||||
*/
|
||||
template <class T>
|
||||
class ChunkedArray {
|
||||
public:
|
||||
explicit ChunkedArray(size_t chunk_size)
|
||||
: _chunk_size(chunk_size), _last_chunk_idx(0), _last_idx_in_last_chunk(0) {
|
||||
if (chunk_size == 0) {
|
||||
Log::Fatal("ChunkedArray chunk size must be larger than 0!");
|
||||
}
|
||||
new_chunk();
|
||||
}
|
||||
|
||||
~ChunkedArray() {
|
||||
release();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a value to the chunks sequentially.
|
||||
* If the last chunk is full it creates a new one and appends to it.
|
||||
*
|
||||
* @param value value to insert.
|
||||
*/
|
||||
void add(T value) {
|
||||
if (!within_bounds(_last_chunk_idx, _last_idx_in_last_chunk)) {
|
||||
new_chunk();
|
||||
++_last_chunk_idx;
|
||||
_last_idx_in_last_chunk = 0;
|
||||
}
|
||||
|
||||
CHECK_EQ(setitem(_last_chunk_idx, _last_idx_in_last_chunk, value), 0);
|
||||
++_last_idx_in_last_chunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Number of add() calls.
|
||||
*/
|
||||
size_t get_add_count() const {
|
||||
return _last_chunk_idx * _chunk_size + _last_idx_in_last_chunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Number of allocated chunks.
|
||||
*/
|
||||
size_t get_chunks_count() const {
|
||||
return _chunks.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Number of elemends add()'ed in the last chunk.
|
||||
*/
|
||||
size_t get_last_chunk_add_count() const {
|
||||
return _last_idx_in_last_chunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for the chunk size set at the constructor.
|
||||
*
|
||||
* @return Return the size of chunks.
|
||||
*/
|
||||
size_t get_chunk_size() const {
|
||||
return _chunk_size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the pointer to the raw chunks data.
|
||||
*
|
||||
* @return T** pointer to raw data.
|
||||
*/
|
||||
T **data() noexcept {
|
||||
return _chunks.data();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the pointer to the raw chunks data, but cast to void**.
|
||||
* This is so ``LGBM_DatasetCreateFromMats`` accepts it.
|
||||
*
|
||||
* @return void** pointer to raw data.
|
||||
*/
|
||||
void **data_as_void() noexcept {
|
||||
return reinterpret_cast<void**>(_chunks.data());
|
||||
}
|
||||
|
||||
/**
|
||||
* Coalesces (copies chunked data) to a contiguous array of the same type.
|
||||
* It assumes that ``other`` has enough space to receive that data.
|
||||
*
|
||||
* @param other array with elements T of size >= this->get_add_count().
|
||||
* @param all_valid_addresses
|
||||
* If true exports values from all valid addresses independently of add() count.
|
||||
* Otherwise, exports only up to `get_add_count()` addresses.
|
||||
*/
|
||||
void coalesce_to(T *other, bool all_valid_addresses = false) const {
|
||||
const size_t full_chunks = this->get_chunks_count() - 1;
|
||||
|
||||
// Copy full chunks:
|
||||
size_t i = 0;
|
||||
for (size_t chunk = 0; chunk < full_chunks; ++chunk) {
|
||||
T* chunk_ptr = _chunks[chunk];
|
||||
for (size_t in_chunk_idx = 0; in_chunk_idx < _chunk_size; ++in_chunk_idx) {
|
||||
other[i++] = chunk_ptr[in_chunk_idx];
|
||||
}
|
||||
}
|
||||
// Copy filled values from last chunk only:
|
||||
const size_t last_chunk_elems_to_copy = all_valid_addresses ? _chunk_size : this->get_last_chunk_add_count();
|
||||
T* chunk_ptr = _chunks[full_chunks];
|
||||
for (size_t in_chunk_idx = 0; in_chunk_idx < last_chunk_elems_to_copy; ++in_chunk_idx) {
|
||||
other[i++] = chunk_ptr[in_chunk_idx];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return value from array of chunks.
|
||||
*
|
||||
* @param chunk_index index of the chunk
|
||||
* @param index_within_chunk index within chunk
|
||||
* @param on_fail_value sentinel value. If out of bounds returns that value.
|
||||
*
|
||||
* @return pointer or nullptr if index is out of bounds.
|
||||
*/
|
||||
T getitem(size_t chunk_index, size_t index_within_chunk, T on_fail_value) const noexcept {
|
||||
if (within_bounds(chunk_index, index_within_chunk))
|
||||
return _chunks[chunk_index][index_within_chunk];
|
||||
else
|
||||
return on_fail_value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the value at a specific address in one of the chunks.
|
||||
*
|
||||
* @param chunk_index index of the chunk
|
||||
* @param index_within_chunk index within chunk
|
||||
* @param value value to store
|
||||
*
|
||||
* @return 0 = success, -1 = out of bounds access.
|
||||
*/
|
||||
int setitem(size_t chunk_index, size_t index_within_chunk, T value) noexcept {
|
||||
if (within_bounds(chunk_index, index_within_chunk)) {
|
||||
_chunks[chunk_index][index_within_chunk] = value;
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* To reset storage call this.
|
||||
* Will release existing resources and prepare for reuse.
|
||||
*/
|
||||
void clear() noexcept {
|
||||
release();
|
||||
new_chunk();
|
||||
}
|
||||
|
||||
/**
|
||||
* Deletes all the allocated chunks.
|
||||
* Do not use container after this! See ``clear()`` instead.
|
||||
*/
|
||||
void release() noexcept {
|
||||
std::for_each(_chunks.begin(), _chunks.end(), [](T* c) { delete[] c; });
|
||||
_chunks.clear();
|
||||
_chunks.shrink_to_fit();
|
||||
_last_chunk_idx = 0;
|
||||
_last_idx_in_last_chunk = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* As the array is dynamic, checks whether a given address is currently within bounds.
|
||||
*
|
||||
* @param chunk_index index of the chunk
|
||||
* @param index_within_chunk index within that chunk
|
||||
* @return true if that chunk is already allocated and index_within_chunk < chunk size.
|
||||
*/
|
||||
inline bool within_bounds(size_t chunk_index, size_t index_within_chunk) const {
|
||||
return (chunk_index < _chunks.size()) && (index_within_chunk < _chunk_size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new chunk to the array of chunks. Not thread-safe.
|
||||
*/
|
||||
void new_chunk() {
|
||||
_chunks.push_back(new (std::nothrow) T[_chunk_size]);
|
||||
|
||||
// Check memory allocation success:
|
||||
if (!_chunks[_chunks.size()-1]) {
|
||||
release();
|
||||
Log::Fatal("Memory exhausted! Cannot allocate new ChunkedArray chunk.");
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const size_t _chunk_size;
|
||||
std::vector<T*> _chunks;
|
||||
|
||||
// For the add() interface & some of the get_*() queries:
|
||||
size_t _last_chunk_idx; //<! Index of chunks
|
||||
size_t _last_idx_in_last_chunk; //<! Index within chunk
|
||||
};
|
||||
|
||||
|
||||
} // namespace LightGBM
|
||||
|
||||
#endif // LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
|
|
@ -0,0 +1,23 @@
|
|||
/**
|
||||
* Wrap chunked_array.hpp class for SWIG usage.
|
||||
*
|
||||
* Author: Alberto Ferreira
|
||||
*/
|
||||
|
||||
%{
|
||||
#include "../include/LightGBM/utils/chunked_array.hpp"
|
||||
%}
|
||||
|
||||
%include "../include/LightGBM/utils/chunked_array.hpp"
|
||||
|
||||
using LightGBM::ChunkedArray;
|
||||
|
||||
%template(int32ChunkedArray) ChunkedArray<int32_t>;
|
||||
/* Unfortunately, for the time being,
|
||||
* SWIG has issues generating the overloads to coalesce_to()
|
||||
* for larger integral types
|
||||
* so we won't support that for now:
|
||||
*/
|
||||
//%template(int64ChunkedArray) ChunkedArray<int64_t>;
|
||||
%template(floatChunkedArray) ChunkedArray<float>;
|
||||
%template(doubleChunkedArray) ChunkedArray<double>;
|
|
@ -1,13 +1,16 @@
|
|||
/*!
|
||||
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License. See LICENSE file in the project root for license information.
|
||||
*
|
||||
* Author: Alberto Ferreira
|
||||
*/
|
||||
#ifndef __STRING_ARRAY_H__
|
||||
#define __STRING_ARRAY_H__
|
||||
#ifndef LIGHTGBM_SWIG_STRING_ARRAY_H_
|
||||
#define LIGHTGBM_SWIG_STRING_ARRAY_H_
|
||||
|
||||
#include <new>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <new>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Container that manages an array of fixed-length strings.
|
||||
|
@ -22,18 +25,15 @@
|
|||
* The class also takes care of allocation of the underlying
|
||||
* char* memory.
|
||||
*/
|
||||
class StringArray
|
||||
{
|
||||
public:
|
||||
class StringArray {
|
||||
public:
|
||||
StringArray(size_t num_elements, size_t string_size)
|
||||
: _string_size(string_size),
|
||||
_array(num_elements + 1, nullptr)
|
||||
{
|
||||
_array(num_elements + 1, nullptr) {
|
||||
_allocate_strings(num_elements, string_size);
|
||||
}
|
||||
|
||||
~StringArray()
|
||||
{
|
||||
~StringArray() {
|
||||
_release_strings();
|
||||
}
|
||||
|
||||
|
@ -43,8 +43,7 @@ class StringArray
|
|||
*
|
||||
* @return char** pointer to raw data (null-terminated).
|
||||
*/
|
||||
char **data() noexcept
|
||||
{
|
||||
char **data() noexcept {
|
||||
return _array.data();
|
||||
}
|
||||
|
||||
|
@ -56,8 +55,7 @@ class StringArray
|
|||
* @param index Index of the element to retrieve.
|
||||
* @return pointer or nullptr if index is out of bounds.
|
||||
*/
|
||||
char *getitem(size_t index) noexcept
|
||||
{
|
||||
char *getitem(size_t index) noexcept {
|
||||
if (_in_bounds(index))
|
||||
return _array[index];
|
||||
else
|
||||
|
@ -77,11 +75,9 @@ class StringArray
|
|||
* into the target string (_string_size), it errors out
|
||||
* and returns -1.
|
||||
*/
|
||||
int setitem(size_t index, std::string content) noexcept
|
||||
{
|
||||
if (_in_bounds(index) && content.size() < _string_size)
|
||||
{
|
||||
std::strcpy(_array[index], content.c_str());
|
||||
int setitem(size_t index, const std::string &content) noexcept {
|
||||
if (_in_bounds(index) && content.size() < _string_size) {
|
||||
std::strcpy(_array[index], content.c_str()); // NOLINT
|
||||
return 0;
|
||||
} else {
|
||||
return -1;
|
||||
|
@ -91,13 +87,11 @@ class StringArray
|
|||
/**
|
||||
* @return number of stored strings.
|
||||
*/
|
||||
size_t get_num_elements() noexcept
|
||||
{
|
||||
size_t get_num_elements() noexcept {
|
||||
return _array.size() - 1;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
private:
|
||||
/**
|
||||
* Returns true if and only if within bounds.
|
||||
* Notice that it excludes the last element of _array (NULL).
|
||||
|
@ -105,8 +99,7 @@ class StringArray
|
|||
* @param index index of the element
|
||||
* @return bool true if within bounds
|
||||
*/
|
||||
bool _in_bounds(size_t index) noexcept
|
||||
{
|
||||
bool _in_bounds(size_t index) noexcept {
|
||||
return index < get_num_elements();
|
||||
}
|
||||
|
||||
|
@ -120,15 +113,13 @@ class StringArray
|
|||
* @param num_elements Number of strings to store in the array.
|
||||
* @param string_size The size of each string in the array.
|
||||
*/
|
||||
void _allocate_strings(size_t num_elements, size_t string_size)
|
||||
{
|
||||
for (size_t i = 0; i < num_elements; ++i)
|
||||
{
|
||||
void _allocate_strings(size_t num_elements, size_t string_size) {
|
||||
for (size_t i = 0; i < num_elements; ++i) {
|
||||
// Leave space for \0 terminator:
|
||||
_array[i] = new (std::nothrow) char[string_size + 1];
|
||||
|
||||
// Check memory allocation:
|
||||
if (! _array[i]) {
|
||||
if (!_array[i]) {
|
||||
_release_strings();
|
||||
throw std::bad_alloc();
|
||||
}
|
||||
|
@ -138,8 +129,7 @@ class StringArray
|
|||
/**
|
||||
* Deletes the allocated strings.
|
||||
*/
|
||||
void _release_strings() noexcept
|
||||
{
|
||||
void _release_strings() noexcept {
|
||||
std::for_each(_array.begin(), _array.end(), [](char* c) { delete[] c; });
|
||||
}
|
||||
|
||||
|
@ -147,4 +137,4 @@ class StringArray
|
|||
std::vector<char*> _array;
|
||||
};
|
||||
|
||||
#endif // __STRING_ARRAY_H__
|
||||
#endif // LIGHTGBM_SWIG_STRING_ARRAY_H_
|
||||
|
|
|
@ -282,3 +282,4 @@
|
|||
|
||||
%include "pointer_manipulation.i"
|
||||
%include "StringArray_API_extensions.i"
|
||||
%include "ChunkedArray_API_extensions.i"
|
||||
|
|
|
@ -0,0 +1,262 @@
|
|||
/*!
|
||||
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
|
||||
* Licensed under the MIT License. See LICENSE file in the project root for license information.
|
||||
*
|
||||
* Author: Alberto Ferreira
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include "../include/LightGBM/utils/chunked_array.hpp"
|
||||
|
||||
using LightGBM::ChunkedArray;
|
||||
|
||||
/*!
|
||||
Helper util to compare two vectors.
|
||||
|
||||
Don't compare floating point vectors this way!
|
||||
*/
|
||||
template <typename T>
|
||||
testing::AssertionResult are_vectors_equal(const std::vector<T> &a, const std::vector<T> &b) {
|
||||
if (a.size() != b.size()) {
|
||||
return testing::AssertionFailure()
|
||||
<< "Vectors differ in size: "
|
||||
<< a.size() << " != " << b.size();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < a.size(); ++i) {
|
||||
if (a[i] != b[i]) {
|
||||
return testing::AssertionFailure()
|
||||
<< "Vectors differ at least at position " << i << ": "
|
||||
<< a[i] << " != " << b[i];
|
||||
}
|
||||
}
|
||||
|
||||
return testing::AssertionSuccess();
|
||||
}
|
||||
|
||||
|
||||
class ChunkedArrayTest : public testing::Test {
|
||||
protected:
|
||||
|
||||
void SetUp() override {
|
||||
|
||||
}
|
||||
|
||||
void add_items_to_array(const std::vector<int> &vec, ChunkedArray<int> &ca) {
|
||||
for (auto v: vec) {
|
||||
ca.add(v);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
Ensures that if coalesce_to() is called upon the ChunkedArray,
|
||||
it would yield the same contents as vec
|
||||
*/
|
||||
testing::AssertionResult coalesced_output_equals_vec(const ChunkedArray<int> &ca, const std::vector<int> &vec,
|
||||
const bool all_addresses=false) {
|
||||
std::vector<int> out(vec.size());
|
||||
ca.coalesce_to(out.data(), all_addresses);
|
||||
return are_vectors_equal(out, vec);
|
||||
}
|
||||
|
||||
// Constants
|
||||
const std::vector<int> REF_VEC = {1, 5, 2, 4, 9, 8, 7};
|
||||
const size_t CHUNK_SIZE = 3;
|
||||
const size_t OUT_OF_BOUNDS_OFFSET = 4;
|
||||
|
||||
ChunkedArray<int> ca_ = ChunkedArray<int>(CHUNK_SIZE); //<! Re-used for many tests.
|
||||
};
|
||||
|
||||
|
||||
/*! ChunkedArray cannot be built from chunks of size 0. */
|
||||
TEST_F(ChunkedArrayTest, constructorWithChunkSize0Throws) {
|
||||
ASSERT_THROW(ChunkedArray<int> ca(0), std::runtime_error);
|
||||
}
|
||||
|
||||
/*! get_chunk_size() should return the size used in the constructor */
|
||||
TEST_F(ChunkedArrayTest, constructorWithChunkSize) {
|
||||
for (size_t chunk_size = 1; chunk_size < 10; ++chunk_size) {
|
||||
ChunkedArray<int> ca(chunk_size);
|
||||
ASSERT_EQ(ca.get_chunk_size(), chunk_size);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
get_chunk_size() should return the size used in the constructor
|
||||
independently of array manipulations.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, getChunkSizeIsConstant) {
|
||||
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
|
||||
ASSERT_EQ(ca_.get_chunk_size(), CHUNK_SIZE);
|
||||
ca_.add(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*!
|
||||
get_add_count() should return the number of add calls,
|
||||
independently of the number of chunks used.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, getChunksCount) {
|
||||
ASSERT_EQ(ca_.get_chunks_count(), 1); // ChunkedArray always starts with 1 chunk.
|
||||
|
||||
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
|
||||
ca_.add(0);
|
||||
int expected_chunks = int(i/CHUNK_SIZE) + 1;
|
||||
ASSERT_EQ(ca_.get_chunks_count(), expected_chunks) << "with " << i << " add() call(s) "
|
||||
<< "and CHUNK_SIZE==" << CHUNK_SIZE << ".";
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
get_add_count() should return the number of add calls,
|
||||
independently of the number of chunks used.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, getAddCount) {
|
||||
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
|
||||
ASSERT_EQ(ca_.get_add_count(), i);
|
||||
ca_.add(0);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
Ensure coalesce_to() works and dumps all the inserted data correctly.
|
||||
|
||||
If the ChunkedArray is created from a sequence of add() calls, coalescing to
|
||||
an output array after multiple add operations should yield the same
|
||||
exact data at both input and output.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, coalesceTo) {
|
||||
std::vector<int> out(REF_VEC.size());
|
||||
add_items_to_array(REF_VEC, ca_);
|
||||
|
||||
ca_.coalesce_to(out.data());
|
||||
|
||||
ASSERT_TRUE(are_vectors_equal(REF_VEC, out));
|
||||
}
|
||||
|
||||
/*!
|
||||
After clear the ChunkedArray() should still be usable.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, clear) {
|
||||
const std::vector<int> ref_vec2 = {1, 2, 5, -1};
|
||||
add_items_to_array(REF_VEC, ca_);
|
||||
// Start with some content:
|
||||
ASSERT_TRUE(coalesced_output_equals_vec(ca_, REF_VEC));
|
||||
|
||||
// Clear & re-use:
|
||||
ca_.clear();
|
||||
add_items_to_array(ref_vec2, ca_);
|
||||
|
||||
// Output should match new content:
|
||||
ASSERT_TRUE(coalesced_output_equals_vec(ca_, ref_vec2));
|
||||
}
|
||||
|
||||
/*!
|
||||
Ensure ChunkedArray is safe against double-frees.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, doubleFreeSafe) {
|
||||
ca_.release(); // Cannot be used any longer from now on.
|
||||
ca_.release(); // Ensure we don't segfault.
|
||||
|
||||
SUCCEED();
|
||||
}
|
||||
|
||||
/*!
|
||||
Ensure size computations in the getters are correct.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, totalArraySizeMatchesLastChunkAddCount) {
|
||||
add_items_to_array(REF_VEC, ca_);
|
||||
|
||||
const size_t first_chunks_add_count = (ca_.get_chunks_count() - 1) * ca_.get_chunk_size();
|
||||
const size_t last_chunk_add_count = ca_.get_last_chunk_add_count();
|
||||
|
||||
EXPECT_EQ(first_chunks_add_count, int(REF_VEC.size()/CHUNK_SIZE) * CHUNK_SIZE);
|
||||
EXPECT_EQ(last_chunk_add_count, REF_VEC.size() % CHUNK_SIZE);
|
||||
EXPECT_EQ(first_chunks_add_count + last_chunk_add_count, ca_.get_add_count());
|
||||
}
|
||||
|
||||
/*!
|
||||
Assert all values are correct and at the expected addresses throughout the
|
||||
several chunks.
|
||||
|
||||
This uses getitem() to reach each individual address of any of the chunks.
|
||||
|
||||
A sentinel value of -1 is used to check for invalid addresses.
|
||||
This would occur if there was an improper data layout with the chunks.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, dataLayoutTestThroughGetitem) {
|
||||
add_items_to_array(REF_VEC, ca_);
|
||||
|
||||
for (size_t i = 0, chunk = 0, in_chunk_idx = 0; i < REF_VEC.size(); ++i) {
|
||||
int value = ca_.getitem(chunk, in_chunk_idx, -1); // -1 works as sentinel value (bad layout found)
|
||||
|
||||
EXPECT_EQ(value, REF_VEC[i]) << " for address (chunk,in_chunk_idx) = (" << chunk << "," << in_chunk_idx << ")";
|
||||
|
||||
if (++in_chunk_idx == ca_.get_chunk_size()) {
|
||||
in_chunk_idx = 0;
|
||||
++chunk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
Perform an array of setitem & getitem at valid and invalid addresses.
|
||||
We use several random addresses and trials to avoid writing much code.
|
||||
|
||||
By testing a random number of addresses many more times than the size of the test space
|
||||
we are almost guaranteed to cover all possible search addresses.
|
||||
|
||||
We also gradually add more chunks to the ChunkedArray and re-run more trials
|
||||
to ensure the valid/invalid addresses are updated.
|
||||
|
||||
With each valid update we add to a "memory" vector the history of all the insertions.
|
||||
This is used at the end to ensure all values were stored properly, including after
|
||||
value overrides.
|
||||
*/
|
||||
TEST_F(ChunkedArrayTest, testDataLayoutWithAdvancedInsertionAPI) {
|
||||
const size_t MAX_CHUNKS_SEARCH = 5;
|
||||
const size_t MAX_IN_CHUNK_SEARCH_IDX = 2 * CHUNK_SIZE;
|
||||
// Number of trials for each new ChunkedArray configuration. Pass 100 times over the search space:
|
||||
const size_t N_TRIALS = MAX_CHUNKS_SEARCH * MAX_IN_CHUNK_SEARCH_IDX * 100;
|
||||
std::vector<int> overriden_trials_values(MAX_CHUNKS_SEARCH * CHUNK_SIZE);
|
||||
std::vector<bool> overriden_trials_mask(MAX_CHUNKS_SEARCH * CHUNK_SIZE, false);
|
||||
|
||||
// Each outer loop iteration changes the test by adding +1 chunk. We start with 1 chunk only:
|
||||
for (size_t chunks = 1; chunks < MAX_CHUNKS_SEARCH; ++chunks) {
|
||||
EXPECT_EQ(ca_.get_chunks_count(), chunks);
|
||||
|
||||
// Sweep valid and invalid addresses with a ChunkedArray with `chunks` chunks:
|
||||
for (size_t trial = 0; trial < N_TRIALS; ++trial) {
|
||||
// Compute a new trial address & value & if it is a valid address:
|
||||
const size_t trial_chunk = std::rand() % MAX_CHUNKS_SEARCH;
|
||||
const size_t trial_in_chunk_idx = std::rand() % MAX_IN_CHUNK_SEARCH_IDX;
|
||||
const int trial_value = std::rand() % 99999;
|
||||
const bool valid_address = (trial_chunk < chunks) & (trial_in_chunk_idx < CHUNK_SIZE);
|
||||
|
||||
// Insert item. If at a valid address, 0 is returned, otherwise, -1 is returned:
|
||||
EXPECT_EQ(ca_.setitem(trial_chunk, trial_in_chunk_idx, trial_value),
|
||||
valid_address ? 0 : -1);
|
||||
// If at valid address, check that the stored value is correct & remember it for the future:
|
||||
if (valid_address) {
|
||||
// Check the just-stored value with getitem():
|
||||
EXPECT_EQ(ca_.getitem(trial_chunk, trial_in_chunk_idx, -1), trial_value); // -1 is the sentinel value.
|
||||
|
||||
// Also store the just-stored value for future tracking:
|
||||
overriden_trials_values[trial_chunk*CHUNK_SIZE + trial_in_chunk_idx] = trial_value;
|
||||
overriden_trials_mask[trial_chunk*CHUNK_SIZE + trial_in_chunk_idx] = true;
|
||||
}
|
||||
}
|
||||
|
||||
ca_.new_chunk(); // Just finished a round of trials. Now add a new chunk. Valid addresses will be expanded.
|
||||
}
|
||||
|
||||
// Final check: ensure even with overrides, all valid insertions store the latest value at that address:
|
||||
std::vector<int> coalesced_out(MAX_CHUNKS_SEARCH * CHUNK_SIZE, -1);
|
||||
ca_.coalesce_to(coalesced_out.data(), true); // Export all valid addresses.
|
||||
for (size_t i = 0; i < overriden_trials_mask.size(); ++i) {
|
||||
if (overriden_trials_mask[i]) {
|
||||
EXPECT_EQ(ca_.getitem(i/CHUNK_SIZE, i % CHUNK_SIZE, -1), overriden_trials_values[i]);
|
||||
EXPECT_EQ(coalesced_out[i], overriden_trials_values[i]);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue