* checkin

* add unordered_set to traits.h

* main part checkin

* update

* fix centos/macos build failure and add config params to yaml

* fix

* resolve comments

* fix MSVC build

* fix

* update

* resolve comments
This commit is contained in:
Ye Wang 2020-02-27 09:55:23 -08:00 коммит произвёл GitHub
Родитель 6fbc92f298
Коммит dc7b42abd3
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
9 изменённых файлов: 1067 добавлений и 23 удалений

Просмотреть файл

@ -7,31 +7,11 @@
#include "Structs.h"
#include "../Traits.h"
#include "Components/InferenceOnlyFeaturizerImpl.h"
#include "../3rdParty/MurmurHash3.h"
namespace Microsoft {
namespace Featurizer {
namespace Featurizers {
namespace {

/// Hashes the characters of `input` with MurmurHash3 (x86, 32-bit variant),
/// seeding the hash with `hashingSeedVal`.
static inline std::uint32_t MurmurHashHelper(std::string const &input, std::uint32_t hashingSeedVal) {
    // Byte length = size of one character * number of characters, computed in `int`
    int const cbInput(static_cast<int>(sizeof(*input.c_str())) * static_cast<int>(input.size()));
    std::uint32_t result;

    MurmurHash3_x86_32(input.c_str(), cbInput, hashingSeedVal, &result);
    return result;
}

/// Hashes the object representation of a POD value with MurmurHash3 (x86, 32-bit variant),
/// seeding the hash with `hashingSeedVal`.
template<typename T>
static inline std::uint32_t MurmurHashHelper(T const &input, std::uint32_t hashingSeedVal) {
    static_assert(std::is_pod<T>::value, "Input must be PODs");

    unsigned char const * const pBytes(reinterpret_cast<unsigned char const*>(&input));
    std::uint32_t result;

    MurmurHash3_x86_32(pBytes, sizeof(input), hashingSeedVal, &result);
    return result;
}

} // anonymous namespace
/////////////////////////////////////////////////////////////////////////
/// \class HashOneHotVectorizerTransformer
/// \brief Convert input to hash and encode to one hot encoded vector
@ -80,7 +60,7 @@ private:
// MSVC has problems when the function is defined outside of the declaration
void execute_impl(typename BaseType::InputType const &input, typename BaseType::CallbackFunction const &callback) override {
std::uint32_t colHashVal = MurmurHashHelper(input, _hashingSeedVal);
std::uint32_t colHashVal = MurmurHashGenerator(input, _hashingSeedVal);
callback(
SingleValueSparseVectorEncoding<std::uint8_t>(

Просмотреть файл

@ -0,0 +1,59 @@
// ----------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// ----------------------------------------------------------------------
#include "ShortGrainDropperFeaturizer.h"
#include "../Archive.h"
namespace Microsoft {
namespace Featurizer {
namespace Featurizers {
// ----------------------------------------------------------------------
// |
// | ShortGrainDropperTransformer
// |
// ----------------------------------------------------------------------
// Constructs a transformer from the set of grains that should be reported as "drop".
// The set is taken by value and moved into the member.
ShortGrainDropperTransformer::ShortGrainDropperTransformer(GrainsSet grainsToDrop) :
// grainsToDrop may be empty; no validation is performed here
_grainsToDrop(std::move(grainsToDrop)) {
}
// Deserializing constructor: reads the (major, minor) version header followed by
// the set of grains to drop, then delegates to the move constructor.
// Throws std::runtime_error when the archive version is anything other than 1.0.
ShortGrainDropperTransformer::ShortGrainDropperTransformer(Archive &ar) :
    ShortGrainDropperTransformer(
        [&ar](void) {
            // Version header: major is read first, then minor
            std::uint16_t const major(Traits<std::uint16_t>::deserialize(ar));
            std::uint16_t const minor(Traits<std::uint16_t>::deserialize(ar));

            if(!(major == 1 && minor == 0))
                throw std::runtime_error("Unsupported archive version");

            // Payload
            return ShortGrainDropperTransformer(Traits<GrainsSet>::deserialize(ar));
        }()
    ) {
}
// Two transformers are equal when they hold the same set of grains to drop.
bool ShortGrainDropperTransformer::operator==(ShortGrainDropperTransformer const &other) const {
    GrainsSet const &mine(_grainsToDrop);
    GrainsSet const &theirs(other._grainsToDrop);

    return mine == theirs;
}
// Serializes the transformer into `ar`: a version header (major 1, minor 0)
// followed by the set of grains to drop. The layout mirrors what the
// Archive-based constructor reads back.
void ShortGrainDropperTransformer::save(Archive &ar) const /*override*/ {
// Version
Traits<std::uint16_t>::serialize(ar, 1); // Major
Traits<std::uint16_t>::serialize(ar, 0); // Minor
// Data
Traits<decltype(_grainsToDrop)>::serialize(ar, _grainsToDrop);
}
// Reports through `callback` whether `input` is one of the grains to drop
// (true when the grain is present in the drop set, false otherwise).
void ShortGrainDropperTransformer::execute_impl(typename BaseType::InputType const &input, typename BaseType::CallbackFunction const &callback) /*override*/ {
    bool const shouldDrop(_grainsToDrop.count(input) != 0);

    callback(shouldDrop);
}
} // namespace Featurizers
} // namespace Featurizer
} // namespace Microsoft

Просмотреть файл

@ -0,0 +1,218 @@
// ----------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// ----------------------------------------------------------------------
#pragma once
#include "../Featurizer.h"
#include "../Archive.h"
#include "../Traits.h"
namespace Microsoft {
namespace Featurizer {
namespace Featurizers {
/////////////////////////////////////////////////////////////////////////
/// \class ShortGrainDropperTransformer
/// \brief Maps a grain (a vector of strings) to a bool: true if the
/// grain needs dropping, i.e. it is a member of the set of
/// grains supplied at construction time.
///
class ShortGrainDropperTransformer : public StandardTransformer<std::vector<std::string>, bool> {
public:
// ----------------------------------------------------------------------
// |
// | Public Types
// |
// ----------------------------------------------------------------------
using BaseType = StandardTransformer<std::vector<std::string>, bool>;
// Set of grains keyed by their full string vector, hashed with ContainerHash
using GrainsSet = std::unordered_set<std::vector<std::string>, Microsoft::Featurizer::ContainerHash<std::vector<std::string>>>;
// ----------------------------------------------------------------------
// |
// | Public Methods
// |
// ----------------------------------------------------------------------
// Constructs the transformer from the grains that should be dropped
explicit ShortGrainDropperTransformer(GrainsSet grainsToDrop);
// Constructs the transformer from a previously saved archive
explicit ShortGrainDropperTransformer(Archive &ar);
~ShortGrainDropperTransformer(void) override = default;
// NOTE(review): presumably declares the move constructor and disables copying -- confirm against the macro definition
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(ShortGrainDropperTransformer);
// Compares the sets of grains to drop
bool operator==(ShortGrainDropperTransformer const &other) const;
// Writes the transformer state to the archive
void save(Archive &ar) const override;
private:
// ----------------------------------------------------------------------
// |
// | Private Data
// |
// ----------------------------------------------------------------------
// Grains for which the transformer reports `true`; fixed after construction
GrainsSet const _grainsToDrop;
// ----------------------------------------------------------------------
// |
// | Private Methods
// |
// ----------------------------------------------------------------------
// Invokes `callback` with the drop decision for `input`
void execute_impl(typename BaseType::InputType const &input, typename BaseType::CallbackFunction const &callback) override;
};
/////////////////////////////////////////////////////////////////////////
///  \class         ShortGrainDropperEstimator
///  \brief         Estimator that counts the training rows seen for each
///                 grain and, when training completes, marks for dropping
///                 every grain whose row count does not exceed the
///                 threshold minPoints calculated from windowSize, lags,
///                 maxHorizon and cv.
///                 todo: more comments will be added here later
///
///                 NOTE(review): MaxNumTrainingItemsV is not referenced by
///                 any code in this class -- confirm whether a limit on the
///                 number of training items is meant to be enforced.
///
template <
    size_t MaxNumTrainingItemsV = std::numeric_limits<size_t>::max()
>
class ShortGrainDropperEstimator : public TransformerEstimator<std::vector<std::string>, bool> {
public:
    // ----------------------------------------------------------------------
    // |
    // |  Public Types
    // |
    // ----------------------------------------------------------------------
    using BaseType                          = TransformerEstimator<std::vector<std::string>, bool>;
    using TransformerType                   = ShortGrainDropperTransformer;

    // ----------------------------------------------------------------------
    // |
    // |  Public Methods
    // |
    // ----------------------------------------------------------------------

    // Throws std::invalid_argument("colIndex") when colIndex is out of range of the
    // column annotations, and std::invalid_argument("lags") when lags is empty.
    ShortGrainDropperEstimator(
        AnnotationMapsPtr pAllColumnAnnotations,
        size_t colIndex,
        std::uint8_t windowSize,
        //todo: possible name change and add comments, after sync with other Timeseries related Featurizers
        std::vector<std::uint8_t> lags,
        //todo: possible name change and add comments, after sync with other Timeseries related Featurizers
        std::uint8_t maxHorizon,
        //todo: possible name change and add comments, after sync with other Timeseries related Featurizers
        nonstd::optional<std::uint8_t> cv
        //todo: possible name change and add comments, after sync with other Timeseries related Featurizers
    );
    ~ShortGrainDropperEstimator(void) override = default;

    FEATURIZER_MOVE_CONSTRUCTOR_ONLY(ShortGrainDropperEstimator);

private:
    // ----------------------------------------------------------------------
    // |
    // |  Private Types
    // |
    // ----------------------------------------------------------------------
    using GrainsSet                         = ShortGrainDropperTransformer::GrainsSet;
    // Grain -> number of training rows observed for that grain
    using GrainsMap                         = std::unordered_map<std::vector<std::string>, std::uint32_t, Microsoft::Featurizer::ContainerHash<std::vector<std::string>>>;

    // ----------------------------------------------------------------------
    // |
    // |  Private Data
    // |
    // ----------------------------------------------------------------------
    size_t const                            _colIndex;
    std::uint16_t const                     _minPoints;

    GrainsSet                               _grainsToDrop;
    GrainsMap                               _groupByGrains;

    // ----------------------------------------------------------------------
    // |
    // |  Private Methods
    // |
    // ----------------------------------------------------------------------
    bool begin_training_impl(void) override;

    // MSVC has problems when the declaration and definition are separated
    FitResult fit_impl(typename BaseType::InputType const *pBuffer, size_t cElements) override {
        InputType const * const             pEndBuffer(pBuffer + cElements);

        // operator[] value-initializes a missing counter to 0, so a single hashed
        // lookup both registers a new grain and increments an existing one.
        for(; pBuffer != pEndBuffer; ++pBuffer)
            ++_groupByGrains[*pBuffer];

        return FitResult::Continue;
    }

    void complete_training_impl(void) override;

    // MSVC has problems when the definition is separate from the declaration
    typename BaseType::TransformerUniquePtr create_transformer_impl(void) override {
        // The accumulated drop set is handed over to the transformer
        return typename BaseType::TransformerUniquePtr(new TransformerType(std::move(_grainsToDrop)));
    }
};
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// |
// | Implementation
// |
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// --------------------------------------------------------------------
// |
// | ShortGrainDropperEstimator
// |
// ----------------------------------------------------------------------
// Validates the arguments and precomputes the minimum-points threshold:
//   without cv: maxHorizon + max(windowSize, max(lags)) + 1
//   with cv:    2*maxHorizon + cv + max(windowSize, max(lags)) + 1
// Throws std::invalid_argument("colIndex") or std::invalid_argument("lags").
template <size_t MaxNumTrainingItemsV>
ShortGrainDropperEstimator<MaxNumTrainingItemsV>::ShortGrainDropperEstimator(
    AnnotationMapsPtr pAllColumnAnnotations,
    size_t colIndex,
    std::uint8_t windowSize,
    std::vector<std::uint8_t> lags,
    std::uint8_t maxHorizon,
    nonstd::optional<std::uint8_t> cv
) :
    BaseType("ShortGrainDropperEstimatorImpl", std::move(pAllColumnAnnotations)),
    _colIndex(
        [this, &colIndex](void) -> size_t {
            if(colIndex >= this->get_column_annotations().size())
                throw std::invalid_argument("colIndex");

            return colIndex;
        }()
    ),
    _minPoints(
        [&windowSize, &lags, &maxHorizon, &cv](void) -> std::uint16_t {
            // Observations from automl tests:
            //   - windowSize can be 0
            //   - lags could contain 0s
            //   - maxHorizon may not be 0, not sure currently
            //   - cv may not be 0, not sure currently
            if(lags.empty())
                throw std::invalid_argument("lags");

            std::uint8_t const              maxLag(*std::max_element(lags.cbegin(), lags.cend()));
            int const                       lookback(std::max(windowSize, maxLag));
            int const                       points(
                cv.has_value()
                    ? 2*maxHorizon + *cv + lookback + 1
                    : maxHorizon + lookback + 1
            );

            return static_cast<std::uint16_t>(points);
        }()
    ) {
}
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// No preparation is done before training; always returns true.
template <size_t MaxNumTrainingItemsV>
bool ShortGrainDropperEstimator<MaxNumTrainingItemsV>::begin_training_impl(void) /*override*/ {
return true;
}
// Finalizes training: every grain whose observed row count is less than or
// equal to _minPoints is added to the set of grains to drop, after which the
// per-grain counters are released.
template <size_t MaxNumTrainingItemsV>
void ShortGrainDropperEstimator<MaxNumTrainingItemsV>::complete_training_impl(void) /*override*/ {
    for (GrainsMap::value_type const & groupByGrainsElement : _groupByGrains) {
        // Map keys are const, so the grain can only be copied into the set;
        // wrapping it in std::move would silently fall back to a copy.
        if (groupByGrainsElement.second <= _minPoints)
            _grainsToDrop.emplace(groupByGrainsElement.first);
    }

    // The counters are no longer needed once the drop set has been built
    _groupByGrains = {};
}
} // namespace Featurizers
} // namespace Featurizer
} // namespace Microsoft

Просмотреть файл

@ -51,6 +51,7 @@ foreach(_test_name IN ITEMS
PCAFeaturizer_UnitTests
RobustScalerFeaturizer_UnitTests
SampleAddFeaturizer_UnitTest
ShortGrainDropperFeaturizer_UnitTests
StandardScaleWrapperFeaturizer_UnitTest
StringFeaturizer_UnitTest
Structs_UnitTest

Просмотреть файл

@ -0,0 +1,605 @@
// ----------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// ----------------------------------------------------------------------
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
#include "../../Featurizers/ShortGrainDropperFeaturizer.h"
#include "../TestHelpers.h"
#include "../../Archive.h"
namespace NS = Microsoft::Featurizer;
// Trains a ShortGrainDropperEstimator on `trainingBatches` with the given
// configuration, runs the resulting transformer over `inferencingInput` and
// checks that the collected drop decisions equal `inferencingOutput`.
void TestImpl(std::vector<std::vector<std::vector<std::string>>> trainingBatches,
              std::vector<std::vector<std::string>> inferencingInput,
              std::vector<bool> inferencingOutput,
              std::uint8_t windowSize,
              std::vector<std::uint8_t> lags,
              std::uint8_t maxHorizon,
              nonstd::optional<std::uint8_t> cv){
    using SGDEstimator = NS::Featurizers::ShortGrainDropperEstimator<std::numeric_limits<size_t>::max()>;

    // Train
    SGDEstimator estimator(NS::CreateTestAnnotationMapsPtr(1), 0, windowSize, lags, maxHorizon, cv);
    NS::TestHelpers::Train<SGDEstimator, std::vector<std::string>>(estimator, trainingBatches);

    // Infer
    SGDEstimator::TransformerUniquePtr pTransformer(estimator.create_transformer());
    std::vector<bool> output;

    auto const callback(
        [&output](bool shouldDrop) {
            // push_back is used as a workaround because C++11 on MacOS and CentOS
            // doesn't support emplace() or emplace_back() for vector<bool>
            output.push_back(shouldDrop);
        }
    );

    for(std::vector<std::string> const &grain : inferencingInput)
        pTransformer->execute(grain, callback);

    pTransformer->flush(callback);

    CHECK(output == inferencingOutput);
}
// The estimator constructor must reject an out-of-range column index and an
// empty lags vector.
TEST_CASE("Invalid Transformer/Estimator") {
    // Parameter setting: lags is intentionally empty
    std::uint8_t const windowSize(0);
    std::vector<std::uint8_t> const lags(NS::TestHelpers::make_vector<std::uint8_t>());
    std::uint8_t const maxHorizon(1);
    nonstd::optional<std::uint8_t> const cv(static_cast<std::uint8_t>(1));

    CHECK_THROWS_WITH(NS::Featurizers::ShortGrainDropperEstimator<std::numeric_limits<size_t>::max()>(NS::CreateTestAnnotationMapsPtr(1), 2, windowSize, lags, maxHorizon, cv), "colIndex");
    CHECK_THROWS_WITH(NS::Featurizers::ShortGrainDropperEstimator<std::numeric_limits<size_t>::max()>(NS::CreateTestAnnotationMapsPtr(1), 0, windowSize, lags, maxHorizon, cv), "lags");
}
// End-to-end check with windowSize=0, lags=[0,0], maxHorizon=1, cv=1.
TEST_CASE("Standard Test") {
    using Grain = std::vector<std::string>;

    // A single training batch; rows per grain: (a,b)=5, (a,c)=4, (a,d)=3, (a,e)=2, (a,f)=1
    std::vector<std::vector<Grain>> const trainingBatches{
        std::vector<Grain>{
            Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"},
            Grain{"a", "c"}, Grain{"a", "c"}, Grain{"a", "c"}, Grain{"a", "c"},
            Grain{"a", "d"}, Grain{"a", "d"}, Grain{"a", "d"},
            Grain{"a", "e"}, Grain{"a", "e"},
            Grain{"a", "f"}
        }
    };

    // (a,g) never appears in the training data
    std::vector<Grain> const inferencingInput{
        Grain{"a", "b"},
        Grain{"a", "c"},
        Grain{"a", "d"},
        Grain{"a", "e"},
        Grain{"a", "f"},
        Grain{"a", "g"}
    };

    std::vector<bool> const inferencingOutput{false, true, true, true, true, false};

    // Parameter setting
    std::uint8_t const windowSize(0);
    std::vector<std::uint8_t> const lags{0, 0};
    std::uint8_t const maxHorizon(1);
    nonstd::optional<std::uint8_t> const cv(static_cast<std::uint8_t>(1));

    TestImpl(
        trainingBatches,
        inferencingInput,
        inferencingOutput,
        windowSize,
        lags,
        maxHorizon,
        cv
    );
}
// Exercises every combination of windowSize in {0,1}, lags in {[0,1],[0,0]},
// maxHorizon in {0,1} and cv in {none,1} against the same training data.
//
// The threshold computed by the estimator is
//   without cv: minPoints = maxHorizon + max(windowSize, max(lags)) + 1
//   with cv:    minPoints = 2*maxHorizon + cv + max(windowSize, max(lags)) + 1
// and a grain is dropped when its training row count is <= minPoints.
//
// The four "windowSize=1/lags=[0,0]" sections previously set windowSize to 0,
// which made them duplicates of the following "windowSize=0" sections; they
// now use windowSize=1 and the expected output derived from the formula above.
TEST_CASE("Standard Test_Parameter Combination") {
    using Grain = std::vector<std::string>;

    // A single training batch; rows per grain: (a,b)=5, (a,c)=4, (a,d)=3, (a,e)=2, (a,f)=1
    std::vector<std::vector<Grain>> const trainingBatches{
        std::vector<Grain>{
            Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"}, Grain{"a", "b"},
            Grain{"a", "c"}, Grain{"a", "c"}, Grain{"a", "c"}, Grain{"a", "c"},
            Grain{"a", "d"}, Grain{"a", "d"}, Grain{"a", "d"},
            Grain{"a", "e"}, Grain{"a", "e"},
            Grain{"a", "f"}
        }
    };

    // (a,g) never appears in the training data, so it is never dropped
    std::vector<Grain> const inferencingInput{
        Grain{"a", "b"},
        Grain{"a", "c"},
        Grain{"a", "d"},
        Grain{"a", "e"},
        Grain{"a", "f"},
        Grain{"a", "g"}
    };

    std::vector<std::uint8_t> const lags01{0, 1};
    std::vector<std::uint8_t> const lags00{0, 0};
    nonstd::optional<std::uint8_t> const noCv;
    nonstd::optional<std::uint8_t> const cv1(static_cast<std::uint8_t>(1));

    // Runs one configuration; `expected` is ordered like `inferencingInput`
    auto const run(
        [&trainingBatches, &inferencingInput](
            std::uint8_t windowSize,
            std::vector<std::uint8_t> const &lags,
            std::uint8_t maxHorizon,
            nonstd::optional<std::uint8_t> const &cv,
            std::vector<bool> expected
        ) {
            TestImpl(trainingBatches, inferencingInput, std::move(expected), windowSize, lags, maxHorizon, cv);
        }
    );

    // ----- no cv -----
    SECTION("windowSize=1/lags=[0,1]/maxHorizon=1/no cv") {
        // minPoints = 1 + max(1, 1) + 1 = 3
        run(1, lags01, 1, noCv, {false, false, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,1]/maxHorizon=1/no cv") {
        // minPoints = 1 + max(0, 1) + 1 = 3
        run(0, lags01, 1, noCv, {false, false, true, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,0]/maxHorizon=1/no cv") {
        // minPoints = 1 + max(1, 0) + 1 = 3
        run(1, lags00, 1, noCv, {false, false, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,0]/maxHorizon=1/no cv") {
        // minPoints = 1 + max(0, 0) + 1 = 2
        run(0, lags00, 1, noCv, {false, false, false, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,1]/maxHorizon=0/no cv") {
        // minPoints = 0 + max(1, 1) + 1 = 2
        run(1, lags01, 0, noCv, {false, false, false, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,1]/maxHorizon=0/no cv") {
        // minPoints = 0 + max(0, 1) + 1 = 2
        run(0, lags01, 0, noCv, {false, false, false, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,0]/maxHorizon=0/no cv") {
        // minPoints = 0 + max(1, 0) + 1 = 2
        run(1, lags00, 0, noCv, {false, false, false, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,0]/maxHorizon=0/no cv") {
        // minPoints = 0 + max(0, 0) + 1 = 1
        run(0, lags00, 0, noCv, {false, false, false, false, true, false});
    }

    // ----- cv = 1 -----
    SECTION("windowSize=1/lags=[0,1]/maxHorizon=1/cv=1") {
        // minPoints = 2*1 + 1 + max(1, 1) + 1 = 5
        run(1, lags01, 1, cv1, {true, true, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,1]/maxHorizon=1/cv=1") {
        // minPoints = 2*1 + 1 + max(0, 1) + 1 = 5
        run(0, lags01, 1, cv1, {true, true, true, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,0]/maxHorizon=1/cv=1") {
        // minPoints = 2*1 + 1 + max(1, 0) + 1 = 5
        run(1, lags00, 1, cv1, {true, true, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,0]/maxHorizon=1/cv=1") {
        // minPoints = 2*1 + 1 + max(0, 0) + 1 = 4
        run(0, lags00, 1, cv1, {false, true, true, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,1]/maxHorizon=0/cv=1") {
        // minPoints = 2*0 + 1 + max(1, 1) + 1 = 3
        run(1, lags01, 0, cv1, {false, false, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,1]/maxHorizon=0/cv=1") {
        // minPoints = 2*0 + 1 + max(0, 1) + 1 = 3
        run(0, lags01, 0, cv1, {false, false, true, true, true, false});
    }
    SECTION("windowSize=1/lags=[0,0]/maxHorizon=0/cv=1") {
        // minPoints = 2*0 + 1 + max(1, 0) + 1 = 3
        run(1, lags00, 0, cv1, {false, false, true, true, true, false});
    }
    SECTION("windowSize=0/lags=[0,0]/maxHorizon=0/cv=1") {
        // minPoints = 2*0 + 1 + max(0, 0) + 1 = 2
        run(0, lags00, 0, cv1, {false, false, false, true, true, false});
    }
}
// Round-trips a transformer through an Archive and checks that the restored
// instance compares equal to the original.
TEST_CASE("Serialization/Deserialization") {
    using TransformerType = NS::Featurizers::ShortGrainDropperTransformer;
    using GrainsSet = std::unordered_set<
        std::vector<std::string>,
        Microsoft::Featurizer::ContainerHash<std::vector<std::string>>
    >;

    GrainsSet grainsToDrop({{"aa"}, {"ab"}});
    TransformerType original(std::move(grainsToDrop));

    // Save
    NS::Archive out;
    original.save(out);

    // Load
    NS::Archive in(out.commit());
    TransformerType other(in);

    CHECK(other == original);
}
// An archive whose version header is 2.0 must be rejected by the
// deserializing constructor.
TEST_CASE("Serialization Version Error") {
    std::uint16_t const unsupportedMajor(2);
    std::uint16_t const minor(0);

    NS::Archive out;

    out.serialize(unsupportedMajor);
    out.serialize(minor);

    NS::Archive in(out.commit());

    CHECK_THROWS_WITH(
        NS::Featurizers::ShortGrainDropperTransformer(in),
        Catch::Contains("Unsupported archive version")
    );
}

Просмотреть файл

@ -37,6 +37,8 @@ function(Impl)
${_this_path}/../RobustScalerFeaturizer.h
${_this_path}/../SampleAddFeaturizer.h
${_this_path}/../SampleAddFeaturizer.cpp
${_this_path}/../ShortGrainDropperFeaturizer.h
${_this_path}/../ShortGrainDropperFeaturizer.cpp
${_this_path}/../StandardScaleWrapperFeaturizer.h
${_this_path}/../StringFeaturizer.h
${_this_path}/../Structs.h

Просмотреть файл

@ -1115,9 +1115,26 @@ featurizers:
todo:
type_mappings:
- input_type: std::vector<std::string>
- input_type: vector<string>
output_type: bool
configuration_params:
- type: uint8
name: windowSize
is_optional: false
- type: vector<uint8>
name: lags
is_optional: false
- type: uint8
name: maxHorizon
is_optional: false
- type: uint8
name: cv
is_optional: true
status: pending
# ----------------------------------------------------------------------

Просмотреть файл

@ -13,6 +13,7 @@
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#if (defined __clang__)
@ -78,6 +79,7 @@
# pragma clang diagnostic pop
#endif
#include "3rdParty/MurmurHash3.h"
#include "3rdParty/optional.h"
namespace Microsoft {
@ -156,6 +158,37 @@ inline bool IsValid(TypeId id) {
|| id == TypeId::Map;
}
//Hash Functions related
/// Hashes the characters of `value` with MurmurHash3 (x86, 32-bit variant),
/// using `seed` as the hash seed.
static inline std::uint32_t MurmurHashGenerator(std::string const & value, std::uint32_t seed) {
    // Byte length = size of one character * number of characters, computed in `int`
    int const cbValue(static_cast<int>(sizeof(*value.c_str())) * static_cast<int>(value.size()));
    std::uint32_t result;

    MurmurHash3_x86_32(value.c_str(), cbValue, seed, &result);
    return result;
}
/// Hashes the object representation of a POD `value` with MurmurHash3
/// (x86, 32-bit variant), using `seed` as the hash seed.
template<typename T>
static inline std::uint32_t MurmurHashGenerator(T const & value, std::uint32_t seed) {
    static_assert(std::is_pod<T>::value, "Input must be PODs");

    unsigned char const * const pBytes(reinterpret_cast<unsigned char const*>(&value));
    std::uint32_t result;

    MurmurHash3_x86_32(pBytes, sizeof(value), seed, &result);
    return result;
}
/////////////////////////////////////////////////////////////////////////
///  \class         ContainerHash
///  \brief         Hash functor for container types. The elements are hashed
///                 in iteration order with MurmurHashGenerator, each element
///                 being seeded with the hash of the elements before it
///                 (the first element is seeded with 0).
///
template <typename Container>
struct ContainerHash {
    std::size_t operator()(Container const& container) const noexcept {
        using ElementType = typename Container::value_type;

        std::uint32_t runningHash(0);

        for (ElementType const & element : container)
            runningHash = MurmurHashGenerator(element, runningHash);

        return static_cast<std::size_t>(runningHash);
    }
};
// This mapper infers the type of the output Matrix
// base on the input matrix type. Note, that it has to be
// either an Eigen::Matrix or Eigen::Map
@ -975,6 +1008,56 @@ MapT DeserializeMap(ArchiveT &ar) {
return result;
}
template <typename SetT>
std::string ToSetString(SetT const &value) {
std::ostringstream out;
out << "{";
for(auto it = value.cbegin(); it != value.end(); ++it) {
out << Traits<typename SetT::key_type>::ToString(*it);
if(std::next(it) != value.end())
out << ",";
}
out << "}";
return out.str();
}
/// Parsing a set from its string form is not supported yet; this always throws
/// std::logic_error.
template <typename SetT>
SetT FromSetString(std::string const &value) {
    // The argument is intentionally unused
    static_cast<void>(value);

    throw std::logic_error("Not Implemented Yet");
}
template <typename ArchiveT, typename SetT>
ArchiveT & SerializeSet(ArchiveT &ar, SetT const &value) {
ar.serialize(static_cast<std::uint32_t>(value.size()));
for(auto const &elem : value) {
Traits<typename SetT::key_type>::serialize(ar, elem);
}
return ar;
}
template <typename SetT, typename ArchiveT>
SetT DeserializeSet(ArchiveT &ar) {
SetT result;
std::uint32_t size(ar.template deserialize<std::uint32_t>());
while(size) {
typename SetT::key_type key(Traits<typename SetT::key_type>::deserialize(ar));
result.emplace(std::move(key));
--size;
}
return result;
}
} // anonymous namespace
template <typename KeyT, typename T, typename CompareT, typename AllocatorT>
@ -1019,6 +1102,27 @@ struct Traits<std::unordered_map<KeyT, T, HashT, KeyEqualT, AllocatorT>> : publi
}
};
// Traits specialization for std::unordered_set: string conversion and archive
// (de)serialization are forwarded to the generic set helpers
// (ToSetString / FromSetString / SerializeSet / DeserializeSet).
template <typename KeyT, typename HashT, typename KeyEqualT, typename AllocatorT>
struct Traits<std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT>> : public TraitsImpl<std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT>> {
// Renders the set via ToSetString
static std::string ToString(std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT> const &value) {
return ToSetString(value);
}
// Forwards to FromSetString, which currently throws std::logic_error ("Not Implemented Yet")
static std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT> FromString(std::string const &value) {
return FromSetString<std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT>>(value);
}
// Writes the set to the archive via SerializeSet
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT> const &value) {
return SerializeSet(ar, value);
}
// Reads a set from the archive via DeserializeSet
template <typename ArchiveT>
static std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT> deserialize(ArchiveT &ar) {
return DeserializeSet<std::unordered_set<KeyT, HashT, KeyEqualT, AllocatorT>>(ar);
}
};
template <typename T>
struct Traits<ColMajMatrix<T>> : public TraitsImpl<ColMajMatrix<T>> {

Просмотреть файл

@ -32,10 +32,51 @@ static_assert(std::is_same<Traits<std::array<char, 4>>::nullable_type, nonstd::o
// Compile-time checks that Traits<T>::nullable_type is the expected optional type
static_assert(std::is_same<Traits<bool>::nullable_type, nonstd::optional<bool>>::value, "Incorrect nullable type for bool");
static_assert(std::is_same<Traits<std::map<int,int>>::nullable_type, nonstd::optional<std::map<int,int>>>::value, "Incorrect nullable type for std::map");
static_assert(std::is_same<Traits<std::unordered_map<int,int>>::nullable_type, nonstd::optional<std::unordered_map<int,int>>>::value, "Incorrect nullable type for std::unordered_map");
// std::unordered_set takes a single key type; its second template parameter is the hasher, so the default is used here
static_assert(std::is_same<Traits<std::unordered_set<int>>::nullable_type, nonstd::optional<std::unordered_set<int>>>::value, "Incorrect nullable type for std::unordered_set");
static_assert(std::is_same<Traits<std::vector<int>>::nullable_type, nonstd::optional<std::vector<int>>>::value, "Incorrect nullable type for std::vector");
// An optional is already nullable, so it is not wrapped a second time
static_assert(std::is_same<Traits<nonstd::optional<int>>::nullable_type, nonstd::optional<int>>::value, "Incorrect nullable type for nonstd::optional");
static_assert(std::is_same<Traits<std::tuple<int>>::nullable_type, nonstd::optional<std::tuple<int>>>::value, "Incorrect nullable type for std::tuple");
// Pins the hash values returned by MurmurHashGenerator with a seed of 1 for
// bool, int, float, double and string-literal inputs, so that any change to
// the hashing of these input types is detected.
TEST_CASE("MurmurHash_Generator") {
CHECK(MurmurHashGenerator(true, 1) == 0x295d376d);
CHECK(MurmurHashGenerator(10, 1) == 0x12ec2126);
CHECK(MurmurHashGenerator(2.5f, 1) == 0x5edbc123);
CHECK(MurmurHashGenerator(2.5, 1) == 0x54263515);
CHECK(MurmurHashGenerator("abcd", 1) == 0x353b7271);
}
// Verifies that ContainerHash can serve as the hasher of an unordered_set whose
// keys are vectors, for several element types: both keys are stored, and a key
// with equal contents built separately is found again.
TEST_CASE("ContainerHash_Test") {
    // There is no test for bool: std::vector<bool> is a space-optimized
    // specialization in which each element occupies a single bit. If a vector
    // of booleans is ever needed, an integer element type can be used instead.
    // Changing ContainerHash to support std::vector<bool> would require copying
    // each element of the container, which would hurt performance, especially
    // when the elements are strings.
    std::unordered_set<
        std::vector<std::int8_t>,
        ContainerHash<std::vector<std::int8_t>>
    > int8VecSet({{-1}, {2}});
    CHECK(int8VecSet.size() == 2u);
    CHECK(int8VecSet.count(std::vector<std::int8_t>{2}) == 1u);

    std::unordered_set<
        std::vector<std::uint8_t>,
        ContainerHash<std::vector<std::uint8_t>>
    > uint8VecSet({{1}, {2}});
    CHECK(uint8VecSet.size() == 2u);
    CHECK(uint8VecSet.count(std::vector<std::uint8_t>{2}) == 1u);

    std::unordered_set<
        std::vector<std::float_t>,
        ContainerHash<std::vector<std::float_t>>
    > floatVecSet({{1.0f}, {2.0f}});
    CHECK(floatVecSet.size() == 2u);
    CHECK(floatVecSet.count(std::vector<std::float_t>{2.0f}) == 1u);

    std::unordered_set<
        std::vector<std::double_t>,
        ContainerHash<std::vector<std::double_t>>
    > doubleVecSet({{1.0}, {2.0}});
    CHECK(doubleVecSet.size() == 2u);
    CHECK(doubleVecSet.count(std::vector<std::double_t>{2.0}) == 1u);

    std::unordered_set<
        std::vector<std::string>,
        ContainerHash<std::vector<std::string>>
    > strVecSet({{"a"}, {"b"}});
    CHECK(strVecSet.size() == 2u);
    CHECK(strVecSet.count(std::vector<std::string>{"b"}) == 1u);
}
TEST_CASE("Transformer_Nullable") {
nonstd::optional<std::int8_t> arg_null;
std::float_t arg_f_ini = std::numeric_limits<std::float_t>::quiet_NaN();
@ -240,13 +281,26 @@ TEST_CASE("Transformer_Maps") {
CHECK_THROWS_WITH((Traits<std::map<std::int16_t, std::double_t>>::FromString(map_res)), "Not Implemented Yet");
}
TEST_CASE("Unordered map") {
// Verifies the string form of a std::unordered_map and that parsing it back
// reports that the operation is not implemented.
TEST_CASE("Transformer_UnorderedMaps") {
    using MapType = std::unordered_map<std::int16_t, std::double_t>;

    MapType m;
    m.insert(std::pair<std::int16_t, std::double_t>(static_cast<std::int16_t>(5), 35.8));
    m.insert(std::pair<std::int16_t, std::double_t>(static_cast<std::int16_t>(93), 0.147));

    std::string const map_res = Traits<MapType>::ToString(m);

    // The iteration order of an unordered container is unspecified and differs
    // between standard library implementations, so either ordering of the two
    // entries is accepted. The extra parentheses are needed because the CHECK
    // macro cannot decompose an expression that contains ||.
    CHECK((map_res == "{93:0.147000,5:35.800000}" || map_res == "{5:35.800000,93:0.147000}"));
    CHECK_THROWS_WITH((Traits<MapType>::FromString(map_res)), "Not Implemented Yet");
}
// Verifies the string form of a std::unordered_set and that parsing it back
// reports that the operation is not implemented.
TEST_CASE("Transformer_UnorderedSets") {
    using SetType = std::unordered_set<std::int16_t>;

    SetType s;
    s.insert(5);
    s.insert(93);

    std::string const set_res = Traits<SetType>::ToString(s);

    // The iteration order of an unordered container is unspecified and differs
    // between standard library implementations, so either ordering of the two
    // elements is accepted. The extra parentheses are needed because the CHECK
    // macro cannot decompose an expression that contains ||.
    CHECK((set_res == "{93,5}" || set_res == "{5,93}"));
    CHECK_THROWS_WITH((Traits<SetType>::FromString(set_res)), "Not Implemented Yet");
}
TEST_CASE("Transformer_EigenMatrix") {
@ -405,6 +459,9 @@ TEST_CASE("Serialization") {
CHECK(SerializationTestImpl(std::unordered_map<int, std::string>{ {10, "ten"}, {20, "twenty"} }));
CHECK(SerializationTestImpl(std::unordered_map<std::string, int>{ {"ten", 10}, {"twenty", 20} }));
CHECK(SerializationTestImpl(std::unordered_set<std::string>()));
CHECK(SerializationTestImpl(std::unordered_set<std::string>{ {"ten"}, {"twenty"} }));
CHECK(SerializationTestImpl(Eigen::MatrixX<float>()));
Eigen::MatrixX<float> matrix(1, 2);
matrix(0, 0) = 1.0f;
@ -439,6 +496,7 @@ TEST_CASE("CreateNullValue") {
CHECK(TestCreateNullValue<std::vector<std::string>>());
CHECK(TestCreateNullValue<std::map<std::string, std::uint32_t>>());
CHECK(TestCreateNullValue<std::unordered_map<std::string, std::uint32_t>>());
CHECK(TestCreateNullValue<std::unordered_set<std::string>>());
CHECK(TestCreateNullValue<Eigen::MatrixX<std::float_t>>());
CHECK(TestCreateNullValue<nonstd::optional<std::int8_t>>());
}