Merged PR 5153: Updates for TimeSeriesImputerFeaturizer

Updates for TimeSeriesImputerFeaturizer:

- Validates input impute strategy
- Moves median/col error validation to the transformer
- Populates empty values in median scenarios when errors are suppressed
- Introduces new Shared-object layer tests
- Ensures chronological order for inputs during transform
- Enumerates tests that need to be written
This commit is contained in:
David Brownell 2019-09-24 17:30:42 +00:00
Родитель 733ca71a36
Коммит 1242c0a9b8
6 изменённых файлов: 281 добавлений и 133 удалений

Просмотреть файл

@ -28,6 +28,7 @@ enum class TimeSeriesImputeStrategy : uint8_t {
// Returns true when `value` equals one of the Forward, Backward, Median or
// Interpolate strategies, and false for every other value.
inline bool IsValid(TimeSeriesImputeStrategy value) {
    if(value == TimeSeriesImputeStrategy::Forward)
        return true;
    if(value == TimeSeriesImputeStrategy::Backward)
        return true;
    if(value == TimeSeriesImputeStrategy::Median)
        return true;

    return value == TimeSeriesImputeStrategy::Interpolate;
}
@ -67,13 +68,13 @@ public:
//std::chrono::system_clock::duration has different specializations of std::chrono::duration
//in Windows and Linux. So for serDe we convert frequency to this specific type.
using SerDeDurationType = std::chrono::duration<int64_t, std::ratio<1,1000000000>>;
class Transformer : public QueuedTransformer<typename BaseType::InputType,typename BaseType::TransformedType> {
public:
using StrTraits = Traits<std::string>;
// ----------------------------------------------------------------------
// |
// | Public Methods
@ -96,23 +97,23 @@ public:
// | Public Data
// |
// ----------------------------------------------------------------------
//Version must be the first value to get (de)serialized, as during deserialization we validate it before deserializing the others.
//Version must be the first value to get (de)serialized, as during deserialization we validate it before deserializing the others.
//Making this a class member so that we won't have to deserialize it in the ctor of the first member (as that would be less clean).
uint8_t const _version;
FrequencyType const _frequency;
std::vector<TypeId> const _colsToImputeDataTypes;
TimeSeriesImputeStrategy const _tsImputeStrategy;
std::map<KeyType,std::vector<double_t>> const _medianValues;
bool const _supressError;
uint8_t const _version;
FrequencyType const _frequency;
std::vector<TypeId> const _colsToImputeDataTypes;
TimeSeriesImputeStrategy const _tsImputeStrategy;
std::map<KeyType,std::vector<double_t>> const _medianValues;
bool const _supressError;
private:
private:
// ----------------------------------------------------------------------
// |
// | Private Data
// |
// ----------------------------------------------------------------------
std::map<KeyType,OutputRowType> _lastRowtracker;
std::map<KeyType,BaseType::TransformedType> _buffer;
std::map<KeyType,OutputRowType> _lastRowtracker;
std::map<KeyType,BaseType::TransformedType> _buffer;
// ----------------------------------------------------------------------
// |
@ -138,6 +139,8 @@ public:
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(TimeSeriesImputerEstimator);
static bool DoesColTypeSupportMedian(TypeId typeId);
private:
// ----------------------------------------------------------------------
// |
@ -177,14 +180,14 @@ private:
throw std::runtime_error("Couldn't retrieve Frequency Annotation.");
Annotation const & freqAnnotation(*tsFreqIter->second[0]);
assert(dynamic_cast<TimeSeriesFrequencyAnnotation const *>(&freqAnnotation));
TimeSeriesFrequencyAnnotation const & tsFreqAnnotation(static_cast<TimeSeriesFrequencyAnnotation const &>(freqAnnotation));
TimeSeriesFrequencyAnnotation const & tsFreqAnnotation(static_cast<TimeSeriesFrequencyAnnotation const &>(freqAnnotation));
AnnotationMap::const_iterator const & tsMedianIter(annotations.find("TimeSeriesMedianEstimator"));
if(tsMedianIter == annotations.end())
throw std::runtime_error("Couldn't retrieve Median Annotation.");
Annotation const & medianAnnotation(*tsMedianIter->second[0]);
assert(dynamic_cast<TimeSeriesMedianAnnotation const *>(&medianAnnotation));
TimeSeriesMedianAnnotation const & tsMedianAnnotation(static_cast<TimeSeriesMedianAnnotation const &>(medianAnnotation));
TimeSeriesMedianAnnotation const & tsMedianAnnotation(static_cast<TimeSeriesMedianAnnotation const &>(medianAnnotation));
return std::make_unique<Transformer>(tsFreqAnnotation.Value, std::move(_colsToImputeDataTypes), std::move(_tsImputeStrategy), std::move(_supressError), tsMedianAnnotation.Value);
}
@ -200,7 +203,6 @@ private:
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// ----------------------------------------------------------------------
// |
// | TimeSeriesImputerEstimator
@ -209,14 +211,39 @@ private:
TimeSeriesImputerEstimator::TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<TypeId> colsToImputeDataTypes,TimeSeriesImputeStrategy tsImputeStrategy,bool supressError) :
BaseType("TimeSeriesImputerEstimator", std::move(pAllColumnAnnotations), true),
_colsToImputeDataTypes(std::move(colsToImputeDataTypes)),
_tsImputeStrategy(std::move(tsImputeStrategy)),
_supressError(std::move(supressError)){
_tsImputeStrategy(
std::move(
[&tsImputeStrategy](void) -> TimeSeriesImputeStrategy & {
if(IsValid(tsImputeStrategy) == false)
throw std::invalid_argument("'tsImputStrategy' is not valid");
return tsImputeStrategy;
}()
)
),
_supressError(std::move(supressError)) {
if(_tsImputeStrategy == TimeSeriesImputeStrategy::Median && _supressError == false) {
// Verify that all col types are double/float
for(auto const & colType : _colsToImputeDataTypes) {
if(DoesColTypeSupportMedian(colType) == false)
throw std::runtime_error("Only Numeric type columns are supported for ImputationStrategy median. (use suppressError flag to skip imputing non-numeric types)");
}
}
}
// Always throws std::runtime_error: per the message below, this class is not used during
// training, so reaching this method is treated as an error.
Estimator::FitResult TimeSeriesImputerEstimator::complete_training_impl(void) {
    char const * const message = "This should never be called as this class will not be used during training";

    throw std::runtime_error(message);
}
// Returns true when `typeId` is one of Float16, Float32, Float64 or BFloat16; these are the
// only column types for which this class reports median support.
/*static*/ bool TimeSeriesImputerEstimator::DoesColTypeSupportMedian(TypeId typeId) {
    bool const isIeeeFloat(
        typeId == TypeId::Float16
        || typeId == TypeId::Float32
        || typeId == TypeId::Float64
    );

    return isIeeeFloat || typeId == TypeId::BFloat16;
}
// ----------------------------------------------------------------------
// |
// | TimeSeriesImputerEstimator::Transformer
@ -246,10 +273,10 @@ TimeSeriesImputerEstimator::Transformer::Transformer(typename BaseType::Transfor
return version;
}()
),
_frequency(Traits<std::chrono::system_clock::duration>::deserialize(ar)),
_colsToImputeDataTypes( [&ar](void)->std::vector<TypeId> {
_colsToImputeDataTypes( [&ar](void)->std::vector<TypeId> {
std::vector<TypeId> colsToImputeDataTypes;
using TypeIdUnderlyingType = std::underlying_type<TypeId>::type;
std::vector<TypeIdUnderlyingType> _colsToImputeDataTypesUnWrapped(Traits<std::vector<TypeIdUnderlyingType>>::deserialize(ar));
@ -260,11 +287,11 @@ TimeSeriesImputerEstimator::Transformer::Transformer(typename BaseType::Transfor
throw std::runtime_error("Invalid TypeId");
colsToImputeDataTypes.push_back(std::move(typeId));
}
return colsToImputeDataTypes;
return colsToImputeDataTypes;
}()
),
_tsImputeStrategy( static_cast<TimeSeriesImputeStrategy>(Traits<uint8_t>::deserialize(ar))),
_medianValues(Traits<
std::map<
KeyType,
@ -275,6 +302,19 @@ TimeSeriesImputerEstimator::Transformer::Transformer(typename BaseType::Transfor
{}
typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputerEstimator::Transformer::execute(typename BaseType::InputType input) {
// Ensure that this row is in chronological order
KeyType const & key(std::get<1>(input));
std::map<KeyType, OutputRowType>::const_iterator const iterLastRow(_lastRowtracker.find(key));
if(iterLastRow != _lastRowtracker.end()) {
std::chrono::system_clock::time_point const & lastRowTimePoint(std::get<1>(iterLastRow->second));
std::chrono::system_clock::time_point const & inputTimePoint(std::get<0>(input));
if(inputTimePoint < lastRowTimePoint)
throw std::runtime_error("Input stream not in chronological order.");
}
// Invoke the specified impute strategy
if(_tsImputeStrategy == TimeSeriesImputeStrategy::Forward || _tsImputeStrategy == TimeSeriesImputeStrategy::Median)
return ffill_or_median(input);
else if(_tsImputeStrategy == TimeSeriesImputeStrategy::Backward)
@ -290,12 +330,16 @@ typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputer
for (auto it = _buffer.begin(); it != _buffer.end(); ++it)
output.insert(output.end(), it->second.begin(), it->second.end());
// Clear the working state
_lastRowtracker.clear();
_buffer.clear();
return output;
}
void TimeSeriesImputerEstimator::Transformer::save(typename TimeSeriesImputerEstimator::BaseType::Transformer::Archive & ar) const {
Traits<std::uint8_t>::serialize(ar, 1); // Current version
//_frequency
Traits<std::chrono::system_clock::duration>::serialize(ar,_frequency);
@ -308,7 +352,7 @@ void TimeSeriesImputerEstimator::Transformer::save(typename TimeSeriesImputerE
//_tsImputeStrategy
Traits<uint8_t>::serialize(ar,static_cast<std::underlying_type<TimeSeriesImputeStrategy>::type>(_tsImputeStrategy));
//_medianValues
Traits<
std::map<
@ -323,16 +367,16 @@ void TimeSeriesImputerEstimator::Transformer::save(typename TimeSeriesImputerE
}
typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputerEstimator::Transformer::generate_rows(typename BaseType::InputType input, typename TimeSeriesImputerEstimator::TimePointType const & lastObservedTP) {
typename TimeSeriesImputerEstimator::BaseType::TransformedType output;
typename TimeSeriesImputerEstimator::BaseType::TransformedType output;
typename TimeSeriesImputerEstimator::TimePointType tempTP = lastObservedTP + _frequency;
typename TimeSeriesImputerEstimator::TimePointType inputTP = std::get<0>(input);
while(tempTP < inputTP) {
output.push_back(std::make_tuple(true, tempTP, std::get<1>(input), ColsToImputeType(std::get<2>(input).size())));
output.push_back(std::make_tuple(true, tempTP, std::get<1>(input), ColsToImputeType(std::get<2>(input).size())));
tempTP = tempTP + _frequency;
}
output.push_back(std::tuple_cat(std::make_tuple(false), input));
return output;
@ -340,14 +384,14 @@ typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputer
void TimeSeriesImputerEstimator::Transformer::impute(typename TimeSeriesImputerEstimator::ColsToImputeType & prev, typename TimeSeriesImputerEstimator::ColsToImputeType & current) {
for(std::size_t i=0; i< current.size(); ++i)
for(std::size_t i=0; i< current.size(); ++i)
if(StrTraits::IsNull(current[i]))
current[i] = prev[i];
current[i] = prev[i];
}
bool TimeSeriesImputerEstimator::Transformer::no_nulls(typename TimeSeriesImputerEstimator::ColsToImputeType const & input) {
for(std::size_t i=0; i< input.size(); ++i)
for(std::size_t i=0; i< input.size(); ++i)
if(StrTraits::IsNull(input[i]))
return false;
@ -379,7 +423,7 @@ typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputer
typename TimeSeriesImputerEstimator::BaseType::TransformedType results;
int count = 0;
for(std::size_t i=0; i< _buffer[key].size(); ++i)
for(std::size_t i=0; i< _buffer[key].size(); ++i)
{
if(no_nulls(std::get<3>(_buffer[key][i]))){
results.push_back(_buffer[key][i]);
@ -388,10 +432,10 @@ typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputer
else
break;
}
if(count > 0)
_buffer[key].erase(_buffer[key].begin(),_buffer[key].begin()+count);
return results;
}
@ -404,19 +448,39 @@ typename TimeSeriesImputerEstimator::BaseType::TransformedType TimeSeriesImputer
typename TimeSeriesImputerEstimator::OutputRowType & lastRow = _lastRowtracker[key];
TimeSeriesImputerEstimator::BaseType::TransformedType addedRowsResultset = generate_rows(input, std::get<1>(lastRow));
for(std::size_t i=0; i< addedRowsResultset.size(); ++i)
{
for(auto &addedRow : addedRowsResultset) {
if(_tsImputeStrategy == TimeSeriesImputeStrategy::Forward)
impute(std::get<3>(lastRow) , std::get<3>(addedRowsResultset[i]));
impute(std::get<3>(lastRow) , std::get<3>(addedRow));
else {
typename TimeSeriesImputerEstimator::ColsToImputeType & current = std::get<3>(addedRowsResultset[i]);
for(std::size_t j=0; j< current.size(); ++j)
if(StrTraits::IsNull(current[j]))
current[j] = nonstd::optional<std::string>(std::to_string(_medianValues.at(key)[j]));
std::map<KeyType,std::vector<double_t>>::const_iterator const iterMedian(_medianValues.find(key));
typename TimeSeriesImputerEstimator::ColsToImputeType & addedRowData(std::get<3>(addedRow));
for(std::size_t addedRowColIndex = 0; addedRowColIndex < addedRowData.size(); ++addedRowColIndex) {
if(StrTraits::IsNull(addedRowData[addedRowColIndex])) {
addedRowData[addedRowColIndex] =
[this, &iterMedian, &addedRowColIndex]() -> nonstd::optional<std::string> {
assert(addedRowColIndex < _colsToImputeDataTypes.size());
if(_supressError && TimeSeriesImputerEstimator::DoesColTypeSupportMedian(_colsToImputeDataTypes[addedRowColIndex]) == false)
return nonstd::optional<std::string>();
if(iterMedian == _medianValues.end()) {
if(_supressError)
return nonstd::optional<std::string>();
throw std::runtime_error("Invalid key");
}
assert(addedRowColIndex < iterMedian->second.size());
return Traits<std::double_t>::ToString(iterMedian->second[addedRowColIndex]);
}();
}
}
}
lastRow = addedRowsResultset[i];
lastRow = addedRow;
}
_lastRowtracker[key] = lastRow;
return addedRowsResultset;

Просмотреть файл

@ -86,7 +86,7 @@ private:
// Annotation is created using _aggregateTracker.
std::map<KeyType,std::vector<double_t>> _aggregateTracker;
std::map<KeyType,std::vector<int64_t>> _countTracker;
// ----------------------------------------------------------------------
// |
// | Private Methods
@ -142,8 +142,11 @@ Estimator::FitResult TimeSeriesMedianEstimator::fit_impl(typename BaseType::FitB
}
for(std::size_t i=0; i< colValues.size(); ++i) {
_aggregateTracker[key][i] += Traits<std::string>::IsNull(colValues[i]) ? 0.0 : Traits<std::double_t>::FromString(colValues[i].value());
_countTracker[key][i] += Traits<std::string>::IsNull(colValues[i]) ? 0 : 1;
if(Traits<std::string>::IsNull(colValues[i]))
continue;
_aggregateTracker[key][i] += Traits<std::double_t>::FromString(Traits<std::string>::GetNullableValue(colValues[i]));
_countTracker[key][i] += 1;
}
}
@ -151,6 +154,8 @@ Estimator::FitResult TimeSeriesMedianEstimator::fit_impl(typename BaseType::FitB
}
Estimator::FitResult TimeSeriesMedianEstimator::complete_training_impl(void) {
// Note that this class reuses _aggregateTracker to calculate median values before
// moving it to the annotation.
for(auto & kvp: _aggregateTracker) {
KeyType const & key = kvp.first;
for(std::size_t i=0; i< kvp.second.size(); ++i) {

Просмотреть файл

@ -41,13 +41,6 @@ public:
TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<TypeId> colsToImputeDataTypes, bool suppresserror = false, Components::TimeSeriesImputeStrategy tsImputeStrategy= Components::TimeSeriesImputeStrategy::Forward);
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(TimeSeriesImputerEstimator);
// ----------------------------------------------------------------------
// |
// | Public Methods
// |
// ----------------------------------------------------------------------
bool IsNumericTypeId(TypeId const & typeId);
};
// ----------------------------------------------------------------------
@ -61,43 +54,12 @@ public:
// ----------------------------------------------------------------------
TimeSeriesImputerEstimator::TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations, std::vector<TypeId> colsToImputeDataTypes, bool suppresserror, Components::TimeSeriesImputeStrategy tsImputeStrategy) :
BaseType("TimeSeriesImputerEstimator",
BaseType("TimeSeriesImputerEstimator",
pAllColumnAnnotations,
[&pAllColumnAnnotations](void) { return Components::TimeSeriesFrequencyEstimator(pAllColumnAnnotations); },
[&pAllColumnAnnotations](void) { return Components::TimeSeriesMedianEstimator(pAllColumnAnnotations); },
[&pAllColumnAnnotations,colsToImputeDataTypes,tsImputeStrategy,suppresserror](void) { return Components::TimeSeriesImputerEstimator(pAllColumnAnnotations,colsToImputeDataTypes,tsImputeStrategy,suppresserror); }
){
for(std::size_t i=0; i< colsToImputeDataTypes.size(); ++i) {
if(
tsImputeStrategy == Components::TimeSeriesImputeStrategy::Median &&
!IsNumericTypeId(colsToImputeDataTypes[i]) &&
suppresserror == false
)
throw std::runtime_error("Only Numeric type columns are supported for ImputationStrategy median. (use suppresserror flag to skip imputing non-numeric types)");
}
}
bool TimeSeriesImputerEstimator::IsNumericTypeId(TypeId const & id) {
if(
!(
id == TypeId::Int8
|| id == TypeId::Int16
|| id == TypeId::Int32
|| id == TypeId::Int64
|| id == TypeId::UInt8
|| id == TypeId::UInt16
|| id == TypeId::UInt32
|| id == TypeId::UInt64
|| id == TypeId::Float16
|| id == TypeId::Float32
|| id == TypeId::Float64
|| id == TypeId::Complex64
|| id == TypeId::Complex128
|| id == TypeId::BFloat16
))
return false;
return true;
[&pAllColumnAnnotations,&colsToImputeDataTypes,&tsImputeStrategy,&suppresserror](void) { return Components::TimeSeriesImputerEstimator(pAllColumnAnnotations,colsToImputeDataTypes,tsImputeStrategy,suppresserror); }
) {
}
} // namespace Featurizers

Просмотреть файл

@ -2,12 +2,13 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// ----------------------------------------------------------------------
#include "../../Traits.h"
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
#include "../Components/TimeSeriesImputerTransformer.h"
#include "../TimeSeriesImputerFeaturizer.h"
#include "../../Traits.h"
namespace NS = Microsoft::Featurizer;
@ -45,7 +46,7 @@ TransformedType Test(std::vector<std::vector<InputType>> const &trainingBatches,
NS::AnnotationMapsPtr const pAllColumnAnnotations(NS::CreateTestAnnotationMapsPtr(1));
TSImputerEstimator estimator(pAllColumnAnnotations,colsToImputeDataTypes,supressError,tsImputeStrategy);
typename InputBatchesType::const_iterator iter(trainingBatches.begin());
while(true) {
@ -77,12 +78,49 @@ TransformedType Test(std::vector<std::vector<InputType>> const &trainingBatches,
}
//TODO: Add tests for more atomic scenarios. For example:
// Validate rows inserted for forward fill
// Validate rows inserted for backward fill
// Validate ffill col imputation
// Validate row imputation:
// For 1 grain:
// - row, row, row [No gaps]
// - row, 1 gap, row
// - row, 2 gaps, row
// For 2 grain (input interleaved)
// - row, row, row [No gaps]
// - row, 1 gap, row
// - row, 2 gaps, row
// Validate ffill col imputation:
// For 1 grain:
// - Valid row, empty row
// - Valid row, empty row, empty row
// - Empty row, valid row
// - Empty row, empty row, valid row
// - Empty row, flush
// - Empty row, empty row, flush
// For 2 grains (input interleaved)
// - Valid row, empty row
// - Valid row, empty row, empty row
// Validate bfill col imputation
// For 1 grain:
// - Valid row, empty row, valid row
// - Valid row, empty row, empty row, valid row
// - Empty row, valid row
// - Empty row, empty row, valid row
// - Empty row, flush
// - Empty row, empty row, flush
// For 2 grains:
// - Valid row, empty row, valid row
// - Valid row, empty row, empty row, valid row
// Validate median col imputation
// For 1 grain:
// - Empty row
// - Empty row, empty row
// For 2 grains:
// - Empty row
// - Empty row, empty row
// Suppress errors:
// - Error when median on unsupported cols
// - Empty when median on unsupported cols with errors suppressed
// - Error when median on unrecognized grain
// - Empty when median on unrecognized grain with errors suppressed
TEST_CASE("FFill- Add Rows and Impute") {
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
@ -164,45 +202,55 @@ TEST_CASE("BFill- Add Rows and Impute") {
TEST_CASE("MedianFill- Add Rows and Impute") {
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
std::vector<std::tuple<bool,std::chrono::system_clock::time_point, std::vector<std::string>, std::vector<nonstd::optional<std::string>>>> output = {
std::make_tuple(false,GetTimePoint(now,-6), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-6), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5","115.000000"}),
std::make_tuple(true,GetTimePoint(now,-5), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","115.000000"}),
std::make_tuple(false,GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","120.5"}),
std::make_tuple(true,GetTimePoint(now,-5), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","20.5"}),
std::make_tuple(false,GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-3), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","115.000000"}),
std::make_tuple(true,GetTimePoint(now,-2), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000", "115.000000"}),
std::make_tuple(false,GetTimePoint(now,-1), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0", "118.8"}),
std::make_tuple(true,GetTimePoint(now,-2), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000", "15.000000"}),
std::make_tuple(false,GetTimePoint(now,-1), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0", "18.8"})
};
CHECK(Test({
{
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5","18"}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{},"12"}),
std::make_tuple(GetTimePoint(now,-2), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0",nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-8), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5","118"}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{},"112"}),
std::make_tuple(GetTimePoint(now,-2), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0",nonstd::optional<std::string>{}})
}
},
{
std::make_tuple(GetTimePoint(now,-6), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5", nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-6), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5", nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, "120.5"}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, "20.5"}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-1), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0","118.8"}),
std::make_tuple(GetTimePoint(now,-1), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0","18.8"})
},{NS::TypeId::Float64, NS::TypeId::String}, true, NS::Featurizers::Components::TimeSeriesImputeStrategy::Median) == output);
}
std::vector<std::tuple<bool,std::chrono::system_clock::time_point, std::vector<std::string>, std::vector<nonstd::optional<std::string>>>>
expected_output = {
std::make_tuple(false,GetTimePoint(now,-6), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-6), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5","115.000000"}),
std::make_tuple(true,GetTimePoint(now,-5), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","115.000000"}),
std::make_tuple(false,GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","120.5"}),
std::make_tuple(true,GetTimePoint(now,-5), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","20.5"}),
std::make_tuple(false,GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000","15.000000"}),
std::make_tuple(false,GetTimePoint(now,-3), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000","115.000000"}),
std::make_tuple(true,GetTimePoint(now,-2), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.750000", "115.000000"}),
std::make_tuple(false,GetTimePoint(now,-1), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0", "118.8"}),
std::make_tuple(true,GetTimePoint(now,-2), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.750000", "15.000000"}),
std::make_tuple(false,GetTimePoint(now,-1), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0", "18.8"})
};
TEST_CASE("One Row input") {
std::vector<std::tuple<bool,std::chrono::system_clock::time_point, std::vector<std::string>, std::vector<nonstd::optional<std::string>>>>
actual_output = Test(
{
{
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5","18"}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{},"12"}),
std::make_tuple(GetTimePoint(now,-2), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0",nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-8), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5","118"}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{},"112"}),
std::make_tuple(GetTimePoint(now,-2), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0",nonstd::optional<std::string>{}})
}
},
{
std::make_tuple(GetTimePoint(now,-6), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5", nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-6), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"114.5", nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, "120.5"}),
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, "20.5"}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-3), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{nonstd::optional<std::string>{}, nonstd::optional<std::string>{}}),
std::make_tuple(GetTimePoint(now,-1), std::vector<std::string>{"b"}, std::vector<nonstd::optional<std::string>>{"115.0","118.8"}),
std::make_tuple(GetTimePoint(now,-1), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"15.0","18.8"})
},
{NS::TypeId::Float64, NS::TypeId::Float64},
true,
NS::Featurizers::Components::TimeSeriesImputeStrategy::Median
);
CHECK(actual_output == expected_output);
}
TEST_CASE("One Row input") {
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
CHECK_THROWS_WITH(Test({
{
std::make_tuple(GetTimePoint(now,-4), std::vector<std::string>{"a"}, std::vector<nonstd::optional<std::string>>{"14.5","18"})
@ -227,7 +275,7 @@ TEST_CASE("MedianFill- Add Rows and Impute") {
frequency = (foo - now);
return frequency;
}(),
std::vector<NS::TypeId>{NS::TypeId::Float64,NS::TypeId::Float64},
std::vector<NS::TypeId>{NS::TypeId::Float64,NS::TypeId::Float64},
NS::Featurizers::Components::TimeSeriesImputeStrategy::Median,
true,
std::map<std::vector<std::string>,std::vector<double>>{
@ -259,3 +307,46 @@ TEST_CASE("MedianFill- Add Rows and Impute") {
#if (defined __clang__)
# pragma clang diagnostic pop
#endif
// Median imputation over a single grain ("a") and a single Float64 column.
// The same five rows (offsets 0, 1, 3, 5 and 7; values "2.000000", null, "2.000000", null, null)
// are used for both training and inference. The expected output holds one row per offset
// 0 through 7: the rows at offsets 2, 4 and 6 carry `true` as their first element, the
// others carry `false`, and every row's column value is "2.000000".
// NOTE(review): the unit of the GetTimePoint offset is not visible here - confirm against the helper.
TEST_CASE("SimpleMedianTest") {
    using TimePoint         = std::chrono::system_clock::time_point;
    using Grain             = std::vector<std::string>;
    using OptionalString    = nonstd::optional<std::string>;
    using Columns           = std::vector<OptionalString>;
    using OutputRows        = std::vector<std::tuple<bool, TimePoint, Grain, Columns>>;

    TimePoint now = std::chrono::system_clock::now();

    Grain const grain{"a"};
    Columns const populated{"2.000000"};
    Columns const missing{OptionalString()};

    OutputRows expected_output =
        {
            std::make_tuple(false, GetTimePoint(now, 0), grain, populated),
            std::make_tuple(false, GetTimePoint(now, 1), grain, populated),
            std::make_tuple(true, GetTimePoint(now, 2), grain, populated),
            std::make_tuple(false, GetTimePoint(now, 3), grain, populated),
            std::make_tuple(true, GetTimePoint(now, 4), grain, populated),
            std::make_tuple(false, GetTimePoint(now, 5), grain, populated),
            std::make_tuple(true, GetTimePoint(now, 6), grain, populated),
            std::make_tuple(false, GetTimePoint(now, 7), grain, populated)
        };

    auto actual_output =
        Test(
            // Training batches
            {
                {
                    std::make_tuple(GetTimePoint(now, 0), grain, populated),
                    std::make_tuple(GetTimePoint(now, 1), grain, missing),
                    std::make_tuple(GetTimePoint(now, 3), grain, populated),
                    std::make_tuple(GetTimePoint(now, 5), grain, missing),
                    std::make_tuple(GetTimePoint(now, 7), grain, missing)
                }
            },
            // Inference input (same rows as the training batch)
            {
                {
                    std::make_tuple(GetTimePoint(now, 0), grain, populated),
                    std::make_tuple(GetTimePoint(now, 1), grain, missing),
                    std::make_tuple(GetTimePoint(now, 3), grain, populated),
                    std::make_tuple(GetTimePoint(now, 5), grain, missing),
                    std::make_tuple(GetTimePoint(now, 7), grain, missing)
                }
            },
            {NS::TypeId::Float64},
            true,
            NS::Featurizers::Components::TimeSeriesImputeStrategy::Median
        );

    CHECK(actual_output == expected_output);
}

Просмотреть файл

@ -17,7 +17,7 @@ using system_clock = std::chrono::system_clock;
TEST_CASE("End-to-end") {
std::vector<TypeId> keyIds{ StringId, StringId };
std::vector<TypeId> dataIds{ Int32Id, Float32Id, UInt32Id};
std::vector<TypeId> dataIds{ Int32Id, Float32Id, UInt32Id };
TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle * estimatorHandle(nullptr);
ErrorInfoHandle * pErrorInfo(nullptr);
bool suppressErrors(false);
@ -68,7 +68,7 @@ TEST_CASE("End-to-end") {
NS::Traits<std::string>::serialize(archive, "Hello");
NS::Traits<std::string>::serialize(archive, "World");
NS::Traits<typename NS::Traits<std::int32_t>::nullable_type>::serialize(archive, 18);
NS::Traits<typename NS::Traits<std::float_t>::nullable_type>::serialize(archive, 2.0f);
NS::Traits<typename NS::Traits<std::float_t>::nullable_type>::serialize(archive, 4.0f);
NS::Traits<typename NS::Traits<std::uint32_t>::nullable_type>::serialize(archive, static_cast<std::uint32_t>(123456));
return archive.commit();
@ -124,13 +124,33 @@ TEST_CASE("End-to-end") {
CHECK(pErrorInfo == nullptr);
// Transform
NS::Archive::ByteArray const bytes3(
[&originalTimePoint](void) {
NS::Archive archive;
NS::Traits<system_clock::time_point>::serialize(archive, originalTimePoint);
NS::Traits<std::string>::serialize(archive, "Hello");
NS::Traits<std::string>::serialize(archive, "World");
NS::Traits<typename NS::Traits<std::int32_t>::nullable_type>::serialize(archive, 18);
NS::Traits<typename NS::Traits<std::float_t>::nullable_type>::serialize(archive, 3.0f);
NS::Traits<typename NS::Traits<std::uint32_t>::nullable_type>::serialize(archive, static_cast<std::uint32_t>(123456));
return archive.commit();
}()
);
BinaryArchiveData bad3;
bad3.pBuffer = bytes3.data();
bad3.cBuffer = bytes3.size();
BinaryArchiveData * pTransformResults(nullptr);
size_t cNumResults(0);
CHECK(
TimeSeriesImputerFeaturizer_BinaryArchive_Transform(
transformerHandle,
bad1,
bad3,
&pTransformResults,
&cNumResults,
&pErrorInfo
@ -166,7 +186,7 @@ TEST_CASE("End-to-end") {
REQUIRE(NS::Traits<decltype(data1)>::IsNull(data1) == false);
CHECK(NS::Traits<decltype(data1)>::GetNullableValue(data1) == 18);
REQUIRE(NS::Traits<decltype(data2)>::IsNull(data2) == false);
CHECK(NS::Traits<decltype(data2)>::GetNullableValue(data2) == 2.0f);
CHECK(NS::Traits<decltype(data2)>::GetNullableValue(data2) == 3.0f);
REQUIRE(NS::Traits<decltype(data3)>::IsNull(data3) == false);
CHECK(NS::Traits<decltype(data3)>::GetNullableValue(data3) == 123456);

Просмотреть файл

@ -989,3 +989,9 @@ struct Traits<nonstd::optional<T>> {
} // namespace Featurizer
} // namespace Microsoft
template <typename T>
std::ostream & operator <<(std::ostream &os, nonstd::optional<T> const &value) {
os << Microsoft::Featurizer::Traits<nonstd::optional<T>>::ToString(value);
return os;
}