[SWIG] Add streaming data support + cpp tests (#3997)

* [feature] Add ChunkedArray to SWIG

* Add ChunkedArray
* Add ChunkedArray_API_extensions.i
* Add SWIG class wrappers

* Address some review comments

* Fix linting issues

* Move test to tests/test_ChunkedArray_manually.cpp

* Add test note

* Move ChunkedArray to include/LightGBM/utils/

* Declare more explicit types of ChunkedArray in the SWIG API.

* Port ChunkedArray tests to googletest

* Please C++ linter

* Address StrikerRUS' review comments

* Update SWIG doc & disable ChunkedArray<int64_t>

* Use CHECK_EQ instead of assert

* Change include order (linting)

* Rename ChunkedArray -> chunked_array files

* Change header guards

* Address last comments from StrikerRUS
Alberto Ferreira 2021-03-21 12:07:21 +00:00 committed by GitHub
Parent 971b548687
Commit 4ded1342ae
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 571 additions and 35 deletions

View file

@@ -64,7 +64,7 @@ if [[ $TASK == "lint" ]]; then
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
echo "Linting C++ code"
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package || exit -1
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig || exit -1
exit 0
fi

View file

@@ -0,0 +1,260 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*
* Author: Alberto Ferreira
*/
#ifndef LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
#define LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
#include <LightGBM/utils/log.h>
#include <stdint.h>
#include <algorithm>
#include <new>
#include <vector>
namespace LightGBM {
/**
* Container that manages a dynamic array of fixed-length chunks.
*
* The class also takes care of allocation & release of the underlying
* memory. It can be used with either a high or low-level API.
*
* The high-level API allocates chunks as needed, manages addresses automatically, and keeps
* track of the number of inserted elements, but it is not thread-safe
* (this is fine, as the input is usually a streaming iterator).
* For parallel input sources the low-level API must be used.
*
* Note: When using this with `LGBM_DatasetCreateFromMats`, choose a
* chunk_size that is a multiple of your dataset's num_cols, so each chunk
* contains only "complete" instances (rows).
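* For example, with num_cols = 10, a chunk_size of 10000 holds exactly 1000 complete rows per chunk.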
*
* === High-level insert API intro ===
*
* The easiest way to use it is:
* 0. ChunkedArray(chunk_size) # Choose appropriate size
* 1. add(value) # as many times as you want (will generate chunks as needed)
* 2. data() or data_as_void() # retrieves a T** or void** pointer (useful for `LGBM_DatasetCreateFromMats`).
*
* Useful query methods (all O(1)):
* - get_add_count() # total count of added elements.
* - get_chunks_count() # how many chunks are currently allocated.
* - get_last_chunk_add_count() # how many items the last (current) chunk holds.
* - get_chunk_size() # get constant chunk_size from constructor call.
*
* With those you can generate an int32_t sizes[] array. The last chunk can be smaller than chunk_size, so, for any i:
* - sizes[i<last] = get_chunk_size()
* - sizes[i==last] = get_last_chunk_add_count()
*
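* A minimal high-level sketch (illustrative only; `values` is a hypothetical input range):
*
*   ChunkedArray<double> ca(1000);       // 0. chunks of 1000 doubles each
*   for (double v : values) ca.add(v);   // 1. stream the values in
*   double **chunk_ptrs = ca.data();     // 2. per-chunk pointers, e.g. for `LGBM_DatasetCreateFromMats`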
*
* === Low-level insert API intro ===
*
* For advanced usage - useful for inserting in parallel - one can also:
* 1. call new_chunk() at any time for as many chunks as needed. (thread-UNsafe)
* 2. call setitem(chunk, idx, value) to insert each value. (thread-safe)
*
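* A low-level sketch (illustrative only; `chunk_size` and `num_chunks` are hypothetical,
* and the chunk pre-allocation must stay single-threaded):
*
*   ChunkedArray<double> ca(chunk_size);                      // the constructor already allocates chunk 0
*   for (size_t c = 1; c < num_chunks; ++c) ca.new_chunk();   // 1. serial pre-allocation
*   // 2. possibly from multiple threads, each (chunk, idx) written at most once:
*   //    ca.setitem(chunk, idx, value);
*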
*/
template <class T>
class ChunkedArray {
public:
explicit ChunkedArray(size_t chunk_size)
: _chunk_size(chunk_size), _last_chunk_idx(0), _last_idx_in_last_chunk(0) {
if (chunk_size == 0) {
Log::Fatal("ChunkedArray chunk size must be larger than 0!");
}
new_chunk();
}
~ChunkedArray() {
release();
}
/**
* Adds a value to the chunks sequentially.
* If the last chunk is full it creates a new one and appends to it.
*
* @param value value to insert.
*/
void add(T value) {
if (!within_bounds(_last_chunk_idx, _last_idx_in_last_chunk)) {
new_chunk();
++_last_chunk_idx;
_last_idx_in_last_chunk = 0;
}
CHECK_EQ(setitem(_last_chunk_idx, _last_idx_in_last_chunk, value), 0);
++_last_idx_in_last_chunk;
}
/**
* @return Number of add() calls.
*/
size_t get_add_count() const {
return _last_chunk_idx * _chunk_size + _last_idx_in_last_chunk;
}
/**
* @return Number of allocated chunks.
*/
size_t get_chunks_count() const {
return _chunks.size();
}
/**
* @return Number of elements add()'ed in the last chunk.
*/
size_t get_last_chunk_add_count() const {
return _last_idx_in_last_chunk;
}
/**
* Getter for the chunk size set in the constructor.
*
* @return the size of each chunk.
*/
size_t get_chunk_size() const {
return _chunk_size;
}
/**
* Returns the pointer to the raw chunks data.
*
* @return T** pointer to raw data.
*/
T **data() noexcept {
return _chunks.data();
}
/**
* Returns the pointer to the raw chunks data, but cast to void**.
* This is so ``LGBM_DatasetCreateFromMats`` accepts it.
*
* @return void** pointer to raw data.
*/
void **data_as_void() noexcept {
return reinterpret_cast<void**>(_chunks.data());
}
/**
* Coalesces (copies chunked data) to a contiguous array of the same type.
* It assumes that ``other`` has enough space to receive that data.
*
* @param other array with elements T of size >= this->get_add_count().
* @param all_valid_addresses
* If true exports values from all valid addresses independently of add() count.
* Otherwise, exports only up to `get_add_count()` addresses.
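* For example, with chunk_size = 3 and four add() calls, `false` copies the 4 added
* values, while `true` copies all 6 allocated slots (the last 2 holding uninitialized data).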
*/
void coalesce_to(T *other, bool all_valid_addresses = false) const {
const size_t full_chunks = this->get_chunks_count() - 1;
// Copy full chunks:
size_t i = 0;
for (size_t chunk = 0; chunk < full_chunks; ++chunk) {
T* chunk_ptr = _chunks[chunk];
for (size_t in_chunk_idx = 0; in_chunk_idx < _chunk_size; ++in_chunk_idx) {
other[i++] = chunk_ptr[in_chunk_idx];
}
}
// Copy filled values from last chunk only:
const size_t last_chunk_elems_to_copy = all_valid_addresses ? _chunk_size : this->get_last_chunk_add_count();
T* chunk_ptr = _chunks[full_chunks];
for (size_t in_chunk_idx = 0; in_chunk_idx < last_chunk_elems_to_copy; ++in_chunk_idx) {
other[i++] = chunk_ptr[in_chunk_idx];
}
}
/**
* Return value from array of chunks.
*
* @param chunk_index index of the chunk
* @param index_within_chunk index within chunk
* @param on_fail_value sentinel value returned when the address is out of bounds.
*
* @return the stored value, or on_fail_value if the address is out of bounds.
*/
T getitem(size_t chunk_index, size_t index_within_chunk, T on_fail_value) const noexcept {
if (within_bounds(chunk_index, index_within_chunk))
return _chunks[chunk_index][index_within_chunk];
else
return on_fail_value;
}
/**
* Sets the value at a specific address in one of the chunks.
*
* @param chunk_index index of the chunk
* @param index_within_chunk index within chunk
* @param value value to store
*
* @return 0 = success, -1 = out of bounds access.
*/
int setitem(size_t chunk_index, size_t index_within_chunk, T value) noexcept {
if (within_bounds(chunk_index, index_within_chunk)) {
_chunks[chunk_index][index_within_chunk] = value;
return 0;
} else {
return -1;
}
}
/**
* Call this to reset the storage.
* It releases existing resources and prepares the container for reuse.
*/
void clear() noexcept {
release();
new_chunk();
}
/**
* Deletes all the allocated chunks.
* Do not use the container after this! To reset and reuse it, call ``clear()`` instead.
*/
void release() noexcept {
std::for_each(_chunks.begin(), _chunks.end(), [](T* c) { delete[] c; });
_chunks.clear();
_chunks.shrink_to_fit();
_last_chunk_idx = 0;
_last_idx_in_last_chunk = 0;
}
/**
* As the array is dynamic, checks whether a given address is currently within bounds.
*
* @param chunk_index index of the chunk
* @param index_within_chunk index within that chunk
* @return true if that chunk is already allocated and index_within_chunk < chunk size.
*/
inline bool within_bounds(size_t chunk_index, size_t index_within_chunk) const {
return (chunk_index < _chunks.size()) && (index_within_chunk < _chunk_size);
}
/**
* Adds a new chunk to the array of chunks. Not thread-safe.
*/
void new_chunk() {
_chunks.push_back(new (std::nothrow) T[_chunk_size]);
// Check memory allocation success:
if (!_chunks[_chunks.size()-1]) {
release();
Log::Fatal("Memory exhausted! Cannot allocate new ChunkedArray chunk.");
}
}
private:
const size_t _chunk_size;
std::vector<T*> _chunks;
// For the add() interface & some of the get_*() queries:
size_t _last_chunk_idx; //!< Index of the last chunk.
size_t _last_idx_in_last_chunk; //!< Index of the next insertion slot within the last chunk.
};
} // namespace LightGBM
#endif // LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_

View file

@@ -0,0 +1,23 @@
/**
* Wrap the ChunkedArray class (chunked_array.hpp) for SWIG usage.
*
* Author: Alberto Ferreira
*/
%{
#include "../include/LightGBM/utils/chunked_array.hpp"
%}
%include "../include/LightGBM/utils/chunked_array.hpp"
using LightGBM::ChunkedArray;
%template(int32ChunkedArray) ChunkedArray<int32_t>;
/* Unfortunately, for the time being,
* SWIG has issues generating the overloads to coalesce_to()
* for larger integral types
* so we won't support that for now:
*/
//%template(int64ChunkedArray) ChunkedArray<int64_t>;
%template(floatChunkedArray) ChunkedArray<float>;
%template(doubleChunkedArray) ChunkedArray<double>;

View file

@@ -1,13 +1,16 @@
/*!
* Copyright (c) 2020 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*
* Author: Alberto Ferreira
*/
#ifndef __STRING_ARRAY_H__
#define __STRING_ARRAY_H__
#ifndef LIGHTGBM_SWIG_STRING_ARRAY_H_
#define LIGHTGBM_SWIG_STRING_ARRAY_H_
#include <new>
#include <vector>
#include <algorithm>
#include <new>
#include <string>
#include <vector>
/**
* Container that manages an array of fixed-length strings.
@@ -22,18 +25,15 @@
* The class also takes care of allocation of the underlying
* char* memory.
*/
class StringArray
{
public:
class StringArray {
public:
StringArray(size_t num_elements, size_t string_size)
: _string_size(string_size),
_array(num_elements + 1, nullptr)
{
_array(num_elements + 1, nullptr) {
_allocate_strings(num_elements, string_size);
}
~StringArray()
{
~StringArray() {
_release_strings();
}
@@ -43,8 +43,7 @@ class StringArray
*
* @return char** pointer to raw data (null-terminated).
*/
char **data() noexcept
{
char **data() noexcept {
return _array.data();
}
@@ -56,8 +55,7 @@ class StringArray
* @param index Index of the element to retrieve.
* @return pointer or nullptr if index is out of bounds.
*/
char *getitem(size_t index) noexcept
{
char *getitem(size_t index) noexcept {
if (_in_bounds(index))
return _array[index];
else
@@ -77,11 +75,9 @@ class StringArray
* into the target string (_string_size), it errors out
* and returns -1.
*/
int setitem(size_t index, std::string content) noexcept
{
if (_in_bounds(index) && content.size() < _string_size)
{
std::strcpy(_array[index], content.c_str());
int setitem(size_t index, const std::string &content) noexcept {
if (_in_bounds(index) && content.size() < _string_size) {
std::strcpy(_array[index], content.c_str()); // NOLINT
return 0;
} else {
return -1;
@@ -91,13 +87,11 @@ class StringArray
/**
* @return number of stored strings.
*/
size_t get_num_elements() noexcept
{
size_t get_num_elements() noexcept {
return _array.size() - 1;
}
private:
private:
/**
* Returns true if and only if within bounds.
* Notice that it excludes the last element of _array (NULL).
@@ -105,8 +99,7 @@ class StringArray
* @param index index of the element
* @return bool true if within bounds
*/
bool _in_bounds(size_t index) noexcept
{
bool _in_bounds(size_t index) noexcept {
return index < get_num_elements();
}
@@ -120,15 +113,13 @@ class StringArray
* @param num_elements Number of strings to store in the array.
* @param string_size The size of each string in the array.
*/
void _allocate_strings(size_t num_elements, size_t string_size)
{
for (size_t i = 0; i < num_elements; ++i)
{
void _allocate_strings(size_t num_elements, size_t string_size) {
for (size_t i = 0; i < num_elements; ++i) {
// Leave space for \0 terminator:
_array[i] = new (std::nothrow) char[string_size + 1];
// Check memory allocation:
if (! _array[i]) {
if (!_array[i]) {
_release_strings();
throw std::bad_alloc();
}
@@ -138,8 +129,7 @@ class StringArray
/**
* Deletes the allocated strings.
*/
void _release_strings() noexcept
{
void _release_strings() noexcept {
std::for_each(_array.begin(), _array.end(), [](char* c) { delete[] c; });
}
@@ -147,4 +137,4 @@ class StringArray
std::vector<char*> _array;
};
#endif // __STRING_ARRAY_H__
#endif // LIGHTGBM_SWIG_STRING_ARRAY_H_

View file

@@ -282,3 +282,4 @@
%include "pointer_manipulation.i"
%include "StringArray_API_extensions.i"
%include "ChunkedArray_API_extensions.i"

View file

@@ -0,0 +1,262 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*
* Author: Alberto Ferreira
*/
#include <gtest/gtest.h>
#include "../include/LightGBM/utils/chunked_array.hpp"
using LightGBM::ChunkedArray;
/*!
Helper util to compare two vectors.
Don't compare floating point vectors this way!
*/
template <typename T>
testing::AssertionResult are_vectors_equal(const std::vector<T> &a, const std::vector<T> &b) {
if (a.size() != b.size()) {
return testing::AssertionFailure()
<< "Vectors differ in size: "
<< a.size() << " != " << b.size();
}
for (size_t i = 0; i < a.size(); ++i) {
if (a[i] != b[i]) {
return testing::AssertionFailure()
<< "Vectors differ at least at position " << i << ": "
<< a[i] << " != " << b[i];
}
}
return testing::AssertionSuccess();
}
class ChunkedArrayTest : public testing::Test {
protected:
void SetUp() override {
}
void add_items_to_array(const std::vector<int> &vec, ChunkedArray<int> &ca) {
for (auto v: vec) {
ca.add(v);
}
}
/*!
Ensures that calling coalesce_to() on the ChunkedArray
yields the same contents as vec.
*/
testing::AssertionResult coalesced_output_equals_vec(const ChunkedArray<int> &ca, const std::vector<int> &vec,
const bool all_addresses=false) {
std::vector<int> out(vec.size());
ca.coalesce_to(out.data(), all_addresses);
return are_vectors_equal(out, vec);
}
// Constants
const std::vector<int> REF_VEC = {1, 5, 2, 4, 9, 8, 7};
const size_t CHUNK_SIZE = 3;
const size_t OUT_OF_BOUNDS_OFFSET = 4;
ChunkedArray<int> ca_ = ChunkedArray<int>(CHUNK_SIZE); //!< Re-used for many tests.
};
/*! ChunkedArray cannot be built from chunks of size 0. */
TEST_F(ChunkedArrayTest, constructorWithChunkSize0Throws) {
ASSERT_THROW(ChunkedArray<int> ca(0), std::runtime_error);
}
/*! get_chunk_size() should return the size used in the constructor */
TEST_F(ChunkedArrayTest, constructorWithChunkSize) {
for (size_t chunk_size = 1; chunk_size < 10; ++chunk_size) {
ChunkedArray<int> ca(chunk_size);
ASSERT_EQ(ca.get_chunk_size(), chunk_size);
}
}
/*!
get_chunk_size() should return the size used in the constructor
independently of array manipulations.
*/
TEST_F(ChunkedArrayTest, getChunkSizeIsConstant) {
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
ASSERT_EQ(ca_.get_chunk_size(), CHUNK_SIZE);
ca_.add(0);
}
}
/*!
get_chunks_count() should return the number of allocated chunks,
which grows as add() calls fill up the existing chunks.
*/
TEST_F(ChunkedArrayTest, getChunksCount) {
ASSERT_EQ(ca_.get_chunks_count(), 1); // ChunkedArray always starts with 1 chunk.
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
ca_.add(0);
int expected_chunks = int(i/CHUNK_SIZE) + 1;
ASSERT_EQ(ca_.get_chunks_count(), expected_chunks) << "with " << i << " add() call(s) "
<< "and CHUNK_SIZE==" << CHUNK_SIZE << ".";
}
}
/*!
get_add_count() should return the number of add calls,
independently of the number of chunks used.
*/
TEST_F(ChunkedArrayTest, getAddCount) {
for (size_t i = 0; i < 3 * CHUNK_SIZE; ++i) {
ASSERT_EQ(ca_.get_add_count(), i);
ca_.add(0);
}
}
/*!
Ensure coalesce_to() works and dumps all the inserted data correctly.
If the ChunkedArray is created from a sequence of add() calls, coalescing to
an output array should yield exactly the same data as the input sequence.
*/
TEST_F(ChunkedArrayTest, coalesceTo) {
std::vector<int> out(REF_VEC.size());
add_items_to_array(REF_VEC, ca_);
ca_.coalesce_to(out.data());
ASSERT_TRUE(are_vectors_equal(REF_VEC, out));
}
/*!
After clear(), the ChunkedArray should still be usable.
*/
TEST_F(ChunkedArrayTest, clear) {
const std::vector<int> ref_vec2 = {1, 2, 5, -1};
add_items_to_array(REF_VEC, ca_);
// Start with some content:
ASSERT_TRUE(coalesced_output_equals_vec(ca_, REF_VEC));
// Clear & re-use:
ca_.clear();
add_items_to_array(ref_vec2, ca_);
// Output should match new content:
ASSERT_TRUE(coalesced_output_equals_vec(ca_, ref_vec2));
}
/*!
Ensure ChunkedArray is safe against double-frees.
*/
TEST_F(ChunkedArrayTest, doubleFreeSafe) {
ca_.release(); // Cannot be used any longer from now on.
ca_.release(); // Ensure we don't segfault.
SUCCEED();
}
/*!
Ensure size computations in the getters are correct.
*/
TEST_F(ChunkedArrayTest, totalArraySizeMatchesLastChunkAddCount) {
add_items_to_array(REF_VEC, ca_);
const size_t first_chunks_add_count = (ca_.get_chunks_count() - 1) * ca_.get_chunk_size();
const size_t last_chunk_add_count = ca_.get_last_chunk_add_count();
EXPECT_EQ(first_chunks_add_count, int(REF_VEC.size()/CHUNK_SIZE) * CHUNK_SIZE);
EXPECT_EQ(last_chunk_add_count, REF_VEC.size() % CHUNK_SIZE);
EXPECT_EQ(first_chunks_add_count + last_chunk_add_count, ca_.get_add_count());
}
/*!
Assert all values are correct and at the expected addresses throughout the
several chunks.
This uses getitem() to reach each individual address of any of the chunks.
A sentinel value of -1 is used to check for invalid addresses.
This would occur if there was an improper data layout with the chunks.
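For example, with CHUNK_SIZE == 3, REF_VEC[4] must show up at (chunk=1, in_chunk_idx=1).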
*/
TEST_F(ChunkedArrayTest, dataLayoutTestThroughGetitem) {
add_items_to_array(REF_VEC, ca_);
for (size_t i = 0, chunk = 0, in_chunk_idx = 0; i < REF_VEC.size(); ++i) {
int value = ca_.getitem(chunk, in_chunk_idx, -1); // -1 works as sentinel value (bad layout found)
EXPECT_EQ(value, REF_VEC[i]) << " for address (chunk,in_chunk_idx) = (" << chunk << "," << in_chunk_idx << ")";
if (++in_chunk_idx == ca_.get_chunk_size()) {
in_chunk_idx = 0;
++chunk;
}
}
}
/*!
Perform a series of setitem & getitem operations at valid and invalid addresses.
We use several random addresses and trials to avoid writing much code.
By testing random addresses many more times than the size of the search space
we are almost guaranteed to cover all possible addresses.
We also gradually add more chunks to the ChunkedArray and re-run more trials
to ensure the valid/invalid addresses are updated.
With each valid update we add to a "memory" vector the history of all the insertions.
This is used at the end to ensure all values were stored properly, including after
value overrides.
*/
TEST_F(ChunkedArrayTest, testDataLayoutWithAdvancedInsertionAPI) {
const size_t MAX_CHUNKS_SEARCH = 5;
const size_t MAX_IN_CHUNK_SEARCH_IDX = 2 * CHUNK_SIZE;
// Number of trials for each new ChunkedArray configuration. Pass 100 times over the search space:
const size_t N_TRIALS = MAX_CHUNKS_SEARCH * MAX_IN_CHUNK_SEARCH_IDX * 100;
std::vector<int> overriden_trials_values(MAX_CHUNKS_SEARCH * CHUNK_SIZE);
std::vector<bool> overriden_trials_mask(MAX_CHUNKS_SEARCH * CHUNK_SIZE, false);
// Each outer loop iteration changes the test by adding +1 chunk. We start with 1 chunk only:
for (size_t chunks = 1; chunks < MAX_CHUNKS_SEARCH; ++chunks) {
EXPECT_EQ(ca_.get_chunks_count(), chunks);
// Sweep valid and invalid addresses with a ChunkedArray with `chunks` chunks:
for (size_t trial = 0; trial < N_TRIALS; ++trial) {
// Compute a new trial address & value & if it is a valid address:
const size_t trial_chunk = std::rand() % MAX_CHUNKS_SEARCH;
const size_t trial_in_chunk_idx = std::rand() % MAX_IN_CHUNK_SEARCH_IDX;
const int trial_value = std::rand() % 99999;
const bool valid_address = (trial_chunk < chunks) && (trial_in_chunk_idx < CHUNK_SIZE);
// Insert item. If at a valid address, 0 is returned, otherwise, -1 is returned:
EXPECT_EQ(ca_.setitem(trial_chunk, trial_in_chunk_idx, trial_value),
valid_address ? 0 : -1);
// If at valid address, check that the stored value is correct & remember it for the future:
if (valid_address) {
// Check the just-stored value with getitem():
EXPECT_EQ(ca_.getitem(trial_chunk, trial_in_chunk_idx, -1), trial_value); // -1 is the sentinel value.
// Also store the just-stored value for future tracking:
overriden_trials_values[trial_chunk*CHUNK_SIZE + trial_in_chunk_idx] = trial_value;
overriden_trials_mask[trial_chunk*CHUNK_SIZE + trial_in_chunk_idx] = true;
}
}
ca_.new_chunk(); // Just finished a round of trials. Now add a new chunk. Valid addresses will be expanded.
}
// Final check: ensure even with overrides, all valid insertions store the latest value at that address:
std::vector<int> coalesced_out(MAX_CHUNKS_SEARCH * CHUNK_SIZE, -1);
ca_.coalesce_to(coalesced_out.data(), true); // Export all valid addresses.
for (size_t i = 0; i < overriden_trials_mask.size(); ++i) {
if (overriden_trials_mask[i]) {
EXPECT_EQ(ca_.getitem(i/CHUNK_SIZE, i % CHUNK_SIZE, -1), overriden_trials_values[i]);
EXPECT_EQ(coalesced_out[i], overriden_trials_values[i]);
}
}
}