Merged PR 5133: Shared Object Interface for TimeSeriesImputer

Initial checkin
This commit is contained in:
David Brownell 2019-09-19 17:59:58 +00:00
Родитель 4181401a98
Коммит dae05ad276
13 изменённых файлов: 1626 добавлений и 96 удалений

Просмотреть файл

@ -160,7 +160,7 @@ public:
/// \brief Result returned by the `fit` method.
///
/// The underlying values start at 1 so that they line up with the
/// `FitResultValue` enumeration exposed by the C shared-library interface
/// (`Complete = 1, Continue, ResetAndContinue`).
///
enum class FitResult: unsigned char {
    Complete = 1,       /// Fitting is complete and there is no need to call `fit` on this `Estimator` any more.
    Continue,           /// Continue providing data to `fit` (if such data is available).
    ResetAndContinue    /// Reset the data back to the beginning and continue training.
};

Просмотреть файл

@ -23,6 +23,12 @@ enum class TimeSeriesImputeStrategy : unsigned char {
NumValues
};
/// Returns true when `value` is one of the three imputation strategies
/// (`Forward`, `Backward`, `Interpolate`); every other value yields false.
inline bool IsValid(TimeSeriesImputeStrategy value) {
    if(value == TimeSeriesImputeStrategy::Forward)
        return true;

    if(value == TimeSeriesImputeStrategy::Backward)
        return true;

    return value == TimeSeriesImputeStrategy::Interpolate;
}
using TimeSeriesImputerEstimatorInputType = std::tuple<
std::chrono::system_clock::time_point,
std::vector<std::string>,
@ -67,7 +73,7 @@ public:
// | Public Methods
// |
// ----------------------------------------------------------------------
Transformer(FrequencyType value, std::vector<DataTypes> colsToImputeDataTypes, TimeSeriesImputeStrategy tsImputeStrategy);
Transformer(FrequencyType value, std::vector<TypeId> colsToImputeDataTypes, TimeSeriesImputeStrategy tsImputeStrategy);
Transformer(typename BaseType::Transformer::Archive & ar);
~Transformer(void) override = default;
@ -86,7 +92,7 @@ public:
// |
// ----------------------------------------------------------------------
FrequencyType const _frequency;
std::vector<DataTypes> const _colsToImputeDataTypes;
std::vector<TypeId> const _colsToImputeDataTypes;
TimeSeriesImputeStrategy const _tsImputeStrategy;
};
@ -97,7 +103,7 @@ public:
// | Public Methods
// |
// ----------------------------------------------------------------------
TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<DataTypes> colsToImputeDataTypes={DataTypes::Float64},TimeSeriesImputeStrategy tsImputeStrategy = TimeSeriesImputeStrategy::Forward);
TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<TypeId> colsToImputeDataTypes={TypeId::Float64},TimeSeriesImputeStrategy tsImputeStrategy = TimeSeriesImputeStrategy::Forward);
~TimeSeriesImputerEstimator(void) override = default;
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(TimeSeriesImputerEstimator);
@ -114,7 +120,7 @@ private:
// | Private Data
// |
// ----------------------------------------------------------------------
std::vector<DataTypes> const _colsToImputeDataTypes;
std::vector<TypeId> const _colsToImputeDataTypes;
TimeSeriesImputeStrategy const _tsImputeStrategy;
// ----------------------------------------------------------------------
@ -161,7 +167,7 @@ private:
// | TimeSeriesImputerEstimator
// |
// ----------------------------------------------------------------------
TimeSeriesImputerEstimator::TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<DataTypes> colsToImputeDataTypes,TimeSeriesImputeStrategy tsImputeStrategy) :
TimeSeriesImputerEstimator::TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<TypeId> colsToImputeDataTypes,TimeSeriesImputeStrategy tsImputeStrategy) :
BaseType("TimeSeriesImputerEstimator", std::move(pAllColumnAnnotations), true),
_colsToImputeDataTypes(std::move(colsToImputeDataTypes)),
_tsImputeStrategy(std::move(tsImputeStrategy)){
@ -176,7 +182,7 @@ Estimator::FitResult TimeSeriesImputerEstimator::complete_training_impl(void) {
// | TimeSeriesImputerEstimator::Transformer
// |
// ----------------------------------------------------------------------
TimeSeriesImputerEstimator::Transformer::Transformer(TimeSeriesImputerEstimator::FrequencyType value, std::vector<DataTypes> colsToImputeDataTypes,TimeSeriesImputeStrategy tsImputeStrategy) :
TimeSeriesImputerEstimator::Transformer::Transformer(TimeSeriesImputerEstimator::FrequencyType value, std::vector<TypeId> colsToImputeDataTypes,TimeSeriesImputeStrategy tsImputeStrategy) :
_frequency(std::move(value)),
_colsToImputeDataTypes(std::move(colsToImputeDataTypes)),
_tsImputeStrategy(std::move(tsImputeStrategy)) {
@ -188,7 +194,7 @@ TimeSeriesImputerEstimator::Transformer::Transformer(TimeSeriesImputerEstimator:
TimeSeriesImputerEstimator::Transformer::Transformer(typename BaseType::Transformer::Archive & ar) :
_frequency(std::chrono::system_clock::duration::max().count()),
_colsToImputeDataTypes({DataTypes::Float64}),
_colsToImputeDataTypes({TypeId::Float64}),
_tsImputeStrategy(TimeSeriesImputeStrategy::Forward) {
if(Traits<std::uint8_t>::deserialize(ar) != 1)
throw std::runtime_error("Invalid transformer version");

Просмотреть файл

@ -14,12 +14,12 @@ namespace Featurizer {
namespace Featurizers {
/////////////////////////////////////////////////////////////////////////
/// \class TimeSeriesImputerFeaturizerEstimator
/// \class TimeSeriesImputerEstimator
/// \brief This class 'chains' TimeSeriesFrequencyEstimator and TimeSeriesImputerEstimator.
/// TimeSeriesFrequencyEstimator generates Frequency Annotation which is consumed by
/// TimeSeriesImputerEstimator to Impute data.
///
class TimeSeriesImputerFeaturizerEstimator :
class TimeSeriesImputerEstimator :
public Components::PipelineExecutionEstimatorImpl<
Components::TimeSeriesFrequencyEstimator,
Components::TimeSeriesImputerEstimator
@ -34,14 +34,14 @@ public:
Components::TimeSeriesFrequencyEstimator,
Components::TimeSeriesImputerEstimator
>;
TimeSeriesImputerFeaturizerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<DataTypes> colsToImputeDataTypes, bool supressError = false, Components::TimeSeriesImputeStrategy tsImputeStrategy= Components::TimeSeriesImputeStrategy::Forward);
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(TimeSeriesImputerFeaturizerEstimator);
TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations,std::vector<TypeId> colsToImputeDataTypes, bool supressError = false, Components::TimeSeriesImputeStrategy tsImputeStrategy= Components::TimeSeriesImputeStrategy::Forward);
FEATURIZER_MOVE_CONSTRUCTOR_ONLY(TimeSeriesImputerEstimator);
};
TimeSeriesImputerFeaturizerEstimator::TimeSeriesImputerFeaturizerEstimator(AnnotationMapsPtr pAllColumnAnnotations, std::vector<DataTypes> colsToImputeDataTypes, bool supressError, Components::TimeSeriesImputeStrategy tsImputeStrategy) :
BaseType("TimeSeriesImputerFeaturizerEstimator", std::move(pAllColumnAnnotations)) {
TimeSeriesImputerEstimator::TimeSeriesImputerEstimator(AnnotationMapsPtr pAllColumnAnnotations, std::vector<TypeId> colsToImputeDataTypes, bool supressError, Components::TimeSeriesImputeStrategy tsImputeStrategy) :
BaseType("TimeSeriesImputerEstimator", std::move(pAllColumnAnnotations)) {
// Once the pipeline executor enables instantiating template types with ctor args, we'll make use of this.
std::ignore = colsToImputeDataTypes;
std::ignore = tsImputeStrategy;

Просмотреть файл

@ -31,11 +31,11 @@ include(${_this_path}/../cmake/FeaturizersCode.cmake)
enable_testing()
# Unit test executables, one per featurizer, listed alphabetically.
# Each name must appear exactly once: the loop body calls add_executable()
# with the name, and CMake rejects a second target with the same name.
foreach(_test_name IN ITEMS
    CatImputerFeaturizer_UnitTests
    DateTimeFeaturizer_UnitTests
    SampleAddFeaturizer_UnitTest
    StringFeaturizer_UnitTest
    TimeSeriesImputerFeaturizer_UnitTest
)
add_executable(${_test_name} ${_test_name}.cpp)

Просмотреть файл

@ -27,12 +27,12 @@ std::vector<std::tuple<bool,std::chrono::system_clock::time_point, std::vector<s
using KeyT = std::vector<std::string>;
using ColsToImputeT = std::vector<nonstd::optional<std::string>>;
using InputBatchesType = std::vector<std::vector<std::tuple<std::chrono::system_clock::time_point, std::vector<std::string>, std::vector<nonstd::optional<std::string>>>>>;
using TSImputerEstimator = NS::Featurizers::TimeSeriesImputerFeaturizerEstimator;
using TSImputerEstimator = NS::Featurizers::TimeSeriesImputerEstimator;
using TransformedType = std::vector<std::tuple<bool,std::chrono::system_clock::time_point, std::vector<std::string>, std::vector<nonstd::optional<std::string>>>>;
NS::AnnotationMapsPtr const pAllColumnAnnotations(NS::CreateTestAnnotationMapsPtr(1));
TSImputerEstimator estimator(pAllColumnAnnotations, {NS::DataTypes::Float64});
TSImputerEstimator estimator(pAllColumnAnnotations, {NS::TypeId::Float64});
typename InputBatchesType::const_iterator iter(trainingBatches.begin());
while(true) {

Просмотреть файл

@ -5,12 +5,22 @@
get_filename_component(_this_path ${CMAKE_CURRENT_LIST_FILE} DIRECTORY)
# Sources of the FeaturizersCode static library.
# Every entry is spelled with its explicit extension (required by CMake
# policy CMP0115) and is listed only once.
# NOTE(review): `TimeSeriesImputerFeaturizer` was listed without an extension;
# `.h` is assumed here - confirm the file name on disk.
add_library(FeaturizersCode STATIC
    ${_this_path}/../CatImputerFeaturizer.h
    ${_this_path}/../DateTimeFeaturizer.h
    ${_this_path}/../DateTimeFeaturizer.cpp
    ${_this_path}/../SampleAddFeaturizer.h
    ${_this_path}/../SampleAddFeaturizer.cpp
    ${_this_path}/../StringFeaturizer.h
    ${_this_path}/../TimeSeriesImputerFeaturizer.h

    ${_this_path}/../Components/Components.h
    ${_this_path}/../Components/InferenceOnlyFeaturizerImpl.h
    ${_this_path}/../Components/PipelineExecutionEstimatorImpl.h
    ${_this_path}/../Components/TimeSeriesFrequencyEstimator.h
    ${_this_path}/../Components/TimeSeriesImputerTransformer.h
    ${_this_path}/../Components/TrainingOnlyEstimatorImpl.h

    ${_this_path}/../Components/Details/PipelineExecutionEstimatorImpl_details.h
)
file(GLOB JSON_DATA "${_this_path}/../DateTimeFeaturizerData/GeneratedCode/*.json")

Просмотреть файл

@ -49,11 +49,48 @@ FEATURIZER_LIBRARY_API bool GetErrorInfoString(/*in*/ ErrorInfoHandle *pHandle,
FEATURIZER_LIBRARY_API bool DestroyErrorInfoString(/*in*/ char const *input_ptr, /*in*/ std::size_t input_items);
FEATURIZER_LIBRARY_API bool DestroyErrorInfo(/*in*/ ErrorInfoHandle *pHandle);
// These values should match the values in Featurizer.h
//
// `FitResultValue` names the possible results; `FitResult` is the fixed-width
// (unsigned char) type used to carry one of those values through the
// interface. The enumerators are the only definitions of `Complete`,
// `Continue` and `ResetAndContinue`: separate `static FitResult const`
// constants with the same names would both conflict with the enumerators and
// disagree with their values.
enum FitResultValue {
    Complete = 1,
    Continue,
    ResetAndContinue
};
typedef unsigned char FitResult;
// These values should match the values in Traits.h
//
// `TypeIdValue` names the type identifiers; `TypeId` is the 32-bit type used
// to carry one of those values through the interface.
//
// The identifiers from `StringId` through `DurationId` are consecutive,
// starting at 1, which makes `LastStaticValueId` equal to 19.
//
// NOTE(review): `+` binds tighter than `|`, so each entry below evaluates as
// `0x1001 | (LastStaticValueId + N)`. Because 0x1001 already has bit 0 set,
// consecutive even/odd values collapse onto the same result:
//   TensorId   == SparseTensorId == 0x1015
//   TabularId  == NullableId     == 0x1017
//   VectorId   == 0x1019, MapId == 0x101B
// The prefix was presumably meant to occupy the high bits (e.g. 0x10010000)
// - confirm against Traits.h, and change both files together.
enum TypeIdValue {
StringId = 0x00000001,
Int8Id,
Int16Id,
Int32Id,
Int64Id,
UInt8Id,
UInt16Id,
UInt32Id,
UInt64Id,
Float16Id,
Float32Id,
Float64Id,
Complex64Id,
Complex128Id,
BFloat16Id,
BoolId,
TimepointId,
DurationId,
LastStaticValueId,
TensorId = 0x1001 | LastStaticValueId + 1,
SparseTensorId = 0x1001 | LastStaticValueId + 2,
TabularId = 0x1001 | LastStaticValueId + 3,
NullableId = 0x1001 | LastStaticValueId + 4,
VectorId = 0x1001 | LastStaticValueId + 5,
MapId = 0x1002 | LastStaticValueId + 6
};
typedef uint32_t TypeId;
} // extern "C"

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,64 @@
// ----------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// ----------------------------------------------------------------------
#pragma once
// Note that most of the shared code is generated for each Featurizer. The
// TimeSeriesImputer is more complex than the other Featurizers, so this one
// is written by hand for now.
#include "SharedLibrary_Common.h"
extern "C" {
/* ---------------------------------------------------------------------- */
/* | TimeSeriesImputerFeaturizer <BinaryArchive> */
// Each "row" in this implementation is the serialized content of the row itself. Clients will serialize the data,
// pass it to these methods where it will be deserialized according to the type enumerations provided when creating
// the Estimator. Output will be serialized and passed back to the caller.
struct TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle {};
struct TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle {};
// A (pointer, length) pair describing one serialized row, as exchanged with
// the functions declared below.
// NOTE(review): the FEATURIZER_LIBRARY_API_PACK_* macros are defined
// elsewhere; presumably they control the struct's packing - confirm.
FEATURIZER_LIBRARY_API_PACK_PREFIX
struct BinaryArchiveData {
// Start of the serialized content.
unsigned char const * pBuffer;
// Size of the content at `pBuffer` (presumably in bytes - confirm).
std::size_t cBuffer;
} FEATURIZER_LIBRARY_API_PACK_INLINE;
FEATURIZER_LIBRARY_API_PACK_SUFFIX
// These values should match those found in Featurizers/Components/TimeSeriesImputerTransformer.h
//
// `ImputationStrategyValue` names the strategies; `ImputationStrategy` is the
// 8-bit type used to pass one of those values through the interface.
enum ImputationStrategyValue {
    Forward = 1,
    Backward = 2,
    Interpolate = 3
};
typedef uint8_t ImputationStrategy;
/* Training Methods */
// Every function returns bool and takes a trailing `ppErrorInfo` out
// parameter. NOTE(review): presumably `true` means success and, on failure,
// `*ppErrorInfo` receives a handle describing the error - confirm against the
// implementation.

// Creates an estimator from the key-column type ids, the data-column type
// ids, an imputation strategy and a suppress-errors flag; the new handle is
// written to `*ppHandle`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_CreateEstimator(/*in*/ TypeId *pKeyColTypes, /*in*/ std::size_t numKeyColTypes, /*in*/ TypeId *pDataColTypes, /*in*/ std::size_t numDataColTypes, /*in*/ ImputationStrategy strategy, /*in*/ bool *pSuppressErrors, /*out*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle **ppHandle, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Destroys an estimator handle.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_DestroyEstimator(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pHandle, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Reports through `*pIsTrainingComplete` whether the estimator has finished training.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_IsTrainingComplete(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pHandle, /*out*/ bool *pIsTrainingComplete, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Fits a single serialized row; the fit result is written to `*pFitResult`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_Fit(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pHandle, /*in*/ BinaryArchiveData data, /*out*/ FitResult *pFitResult, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Fits `numDataElements` serialized rows starting at `pData`; the fit result is written to `*pFitResult`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_FitBuffer(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pHandle, /*in*/ BinaryArchiveData const *pData, /*in*/ std::size_t numDataElements, /*out*/ FitResult *pFitResult, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Signals that no more training data will be provided; the fit result is written to `*pFitResult`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_CompleteTraining(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pHandle, /*out*/ FitResult *pFitResult, /*out*/ ErrorInfoHandle **ppErrorInfo);
/* Inference Methods */
// Each Create*/Transform/Flush function that hands out a handle or buffer has
// a matching Destroy* function below. NOTE(review): presumably the caller
// must pass every such result to its Destroy* counterpart - confirm against
// the implementation.

// Creates a transformer from a (presumably trained) estimator; the new handle is written to `*ppTransformerHandle`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_CreateTransformerFromEstimator(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_EstimatorHandle *pEstimatorHandle, /*out*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle **ppTransformerHandle, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Creates a transformer from previously saved data (`pBuffer`, `cBufferSize`).
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_CreateTransformerFromSavedData(/*in*/ unsigned char const *pBuffer, /*in*/ std::size_t cBufferSize, /*out*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle **ppTransformerHandle, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Destroys a transformer handle.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_DestroyTransformer(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle *pHandle, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Produces the transformer's save data; the buffer and its size are written to `*ppBuffer` / `*pBufferSize`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_CreateTransformerSaveData(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle *pHandle, /*out*/ unsigned char const **ppBuffer, /*out*/ std::size_t *pBufferSize, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Destroys a buffer obtained from CreateTransformerSaveData.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_DestroyTransformerSaveData(/*in*/ unsigned char const *pBuffer, /*in*/ std::size_t cBufferSize, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Transforms one serialized row; the output rows and their count are written to `*ppData` / `*pNumDataElements`.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_Transform(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle *pHandle, /*in*/ BinaryArchiveData data, /*out*/ BinaryArchiveData **ppData, /*out*/ std::size_t *pNumDataElements, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Takes no input row; writes output rows and their count to `*ppData` / `*pNumDataElements`.
// NOTE(review): presumably emits rows the transformer is still holding back - confirm.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_Flush(/*in*/ TimeSeriesImputerFeaturizer_BinaryArchive_TransformerHandle *pHandle, /*out*/ BinaryArchiveData ** ppData, /*out*/ std::size_t *pNumDataElements, /*out*/ ErrorInfoHandle **ppErrorInfo);
// Destroys rows obtained from Transform or Flush.
FEATURIZER_LIBRARY_API bool TimeSeriesImputerFeaturizer_BinaryArchive_DestroyTransformedData(/*in*/ BinaryArchiveData const *pData, /*in*/ std::size_t numDataElements, /*out*/ ErrorInfoHandle **ppErrorInfo);
} // extern "C"

Просмотреть файл

@ -30,6 +30,9 @@ generate_shared_library_attributes(
)
add_library(Featurizers SHARED
${_this_path}/../SharedLibrary_TimeSeriesImputerFeaturizer.h
${_this_path}/../SharedLibrary_TimeSeriesImputerFeaturizer.cpp
${_this_path}/../GeneratedCode/SharedLibrary_CatImputerFeaturizer.h
${_this_path}/../GeneratedCode/SharedLibrary_CatImputerFeaturizer.cpp
${_this_path}/../GeneratedCode/SharedLibrary_Common.h
@ -48,6 +51,7 @@ target_link_libraries(Featurizers PRIVATE
)
target_include_directories(Featurizers PRIVATE
${_this_path}/../GeneratedCode
${_this_path}/../..
${_this_path}/../../Featurizers
${_includes}

Просмотреть файл

@ -246,11 +246,48 @@ def _GenerateCommonFiles(output_dir, output_stream):
FEATURIZER_LIBRARY_API bool DestroyErrorInfoString(/*in*/ char const *input_ptr, /*in*/ std::size_t input_items);
FEATURIZER_LIBRARY_API bool DestroyErrorInfo(/*in*/ ErrorInfoHandle *pHandle);
// These values should match the values in Featurizer.h
enum FitResultValue {
Complete = 1,
Continue,
ResetAndContinue
};
typedef unsigned char FitResult;
static FitResult const Complete = 0;
static FitResult const Continue = 1;
static FitResult const ResetAndContinue = 2;
// These values should match the values in Traits.h
enum TypeIdValue {
StringId = 0x00000001,
Int8Id,
Int16Id,
Int32Id,
Int64Id,
UInt8Id,
UInt16Id,
UInt32Id,
UInt64Id,
Float16Id,
Float32Id,
Float64Id,
Complex64Id,
Complex128Id,
BFloat16Id,
BoolId,
TimepointId,
DurationId,
LastStaticValueId,
TensorId = 0x1001 | LastStaticValueId + 1,
SparseTensorId = 0x1001 | LastStaticValueId + 2,
TabularId = 0x1001 | LastStaticValueId + 3,
NullableId = 0x1001 | LastStaticValueId + 4,
VectorId = 0x1001 | LastStaticValueId + 5,
MapId = 0x1002 | LastStaticValueId + 6
};
typedef uint32_t TypeId;
} // extern "C"
@ -335,7 +372,7 @@ def _GenerateCommonFiles(output_dir, output_stream):
size_t index = reinterpret_cast<size_t>(pHandle);
std::string & str(*sg_pointerTable.Get<std::string>(index));
sg_pointerTable.Remove(index);
delete &str;
@ -712,7 +749,7 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
{method_prefix}
if(pHandle == nullptr) throw std::invalid_argument("'pHandle' is null");
if(pFitResult == nullptr) throw std::invalid_argument("'pFitResult' is null");
{validation}
@ -753,7 +790,7 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
if(pHandle == nullptr) throw std::invalid_argument("'pHandle' is null");
if(pFitResult == nullptr) throw std::invalid_argument("'pFitResult' is null");
{validation}
@ -787,7 +824,7 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
if(pHandle == nullptr) throw std::invalid_argument("'pHandle' is null");
if(pFitResult == nullptr) throw std::invalid_argument("'pFitResult' is null");
Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix} & estimator(*sg_pointerTable.Get<Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}>(reinterpret_cast<size_t>(pHandle)));
@ -808,13 +845,13 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
if(pEstimatorHandle == nullptr) throw std::invalid_argument("'pEstimatorHandle' is null");
if(ppTransformerHandle == nullptr) throw std::invalid_argument("'ppTransformerHandle' is null");
Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix} & estimator(*sg_pointerTable.Get<Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}>(reinterpret_cast<size_t>(pEstimatorHandle)));
Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}::TransformerType * pTransformer = reinterpret_cast<Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}::TransformerType*>(estimator.create_transformer().release());
size_t index = sg_pointerTable.Add(pTransformer);
*ppTransformerHandle = reinterpret_cast<{name}{suffix}TransformerHandle*>(index);
{method_suffix}
@ -837,7 +874,7 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
Microsoft::Featurizer::Archive archive(pBuffer, cBufferSize);
Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}::TransformerType* pTransformer= (std::make_unique<Microsoft::Featurizer::Featurizers::{estimator_name}{cpp_template_suffix}::TransformerType>(archive).release());
size_t index = sg_pointerTable.Add(pTransformer);
*ppTransformerHandle = reinterpret_cast<{name}{suffix}TransformerHandle*>(index);
{method_suffix}
@ -930,7 +967,7 @@ def _GenerateCppFile(output_dir, items, c_data_items, output_stream):
FEATURIZER_LIBRARY_API bool {name}{suffix}Transform(/*in*/ {name}{suffix}TransformerHandle *pHandle, {input_param}, {output_param}, /*out*/ ErrorInfoHandle **ppErrorInfo) {{
{method_prefix}
if(pHandle == nullptr) throw std::invalid_argument("'pHandle' is null");
{input_validation}
{output_validation}

Просмотреть файл

@ -5,6 +5,7 @@
#pragma once
#include <array>
#include <chrono>
#include <cmath>
#include <limits>
#include <map>
@ -12,6 +13,17 @@
#include <string>
#include <vector>
#if (defined __clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wshift-sign-overflow"
#endif
#include "3rdParty/date.h"
#if (defined __clang__)
# pragma clang diagnostic pop
#endif
#include "3rdParty/optional.h"
namespace Microsoft {
@ -23,7 +35,7 @@ namespace Featurizer {
#endif
enum class DataTypes : uint32_t {
enum class TypeId : uint32_t {
// Enumeration values are in the following format:
//
// 0xVTTTXXXX
@ -50,17 +62,46 @@ enum class DataTypes : uint32_t {
Bool,
Timepoint,
Duration,
LastStaticValue,
// The following values have N number of trailing types
Tensor = 0x1001 | Duration,
SparseTensor = 0x1001 | Tensor,
Tabular = 0x1001 | SparseTensor,
Nullable = 0x1001 | Tabular,
Vector = 0x1001 | Nullable,
Map = 0x1002 | Vector
Tensor = 0x1001 | LastStaticValue + 1,
SparseTensor = 0x1001 | LastStaticValue + 2,
Tabular = 0x1001 | LastStaticValue + 3,
Nullable = 0x1001 | LastStaticValue + 4,
Vector = 0x1001 | LastStaticValue + 5,
Map = 0x1002 | LastStaticValue + 6
};
/// Returns true when `id` compares equal to one of the named `TypeId`
/// enumerators listed below (the `LastStaticValue` marker is not among
/// them); returns false otherwise.
inline bool IsValid(TypeId id) {
    static TypeId const knownIds[] = {
        TypeId::String,
        TypeId::Int8,
        TypeId::Int16,
        TypeId::Int32,
        TypeId::Int64,
        TypeId::UInt8,
        TypeId::UInt16,
        TypeId::UInt32,
        TypeId::UInt64,
        TypeId::Float16,
        TypeId::Float32,
        TypeId::Float64,
        TypeId::Complex64,
        TypeId::Complex128,
        TypeId::BFloat16,
        TypeId::Bool,
        TypeId::Timepoint,
        TypeId::Duration,
        TypeId::Tensor,
        TypeId::SparseTensor,
        TypeId::Tabular,
        TypeId::Nullable,
        TypeId::Vector,
        TypeId::Map
    };

    for(TypeId const known : knownIds) {
        if(id == known)
            return true;
    }
    return false;
}
/////////////////////////////////////////////////////////////////////////
/// \struct Traits
/// \brief We have a range of types we are dealing with. Many types
@ -91,11 +132,23 @@ struct Traits {
// - static bool IsNull(nullable_type const &value);
// - static T const & GetNullableValue(nullable_type const &value);
// - static std::string ToString(T const &value);
// - static T FromString(std::string const &value);
// - template <typename ArchiveT> static ArchiveT & serialize(ArchiveT &ar, T const &value);
// - template <typename ArchiveT> static T deserialize(ArchiveT &ar);
//
};
/////////////////////////////////////////////////////////////////////////
/// \class Traits
/// \brief Strips references from types.
///
template <typename T> struct Traits<T &> : public Traits<T> {};
/////////////////////////////////////////////////////////////////////////
/// \class Traits
/// \brief Strips const from types
///
template <typename T> struct Traits<T const> : public Traits<T> {};
/////////////////////////////////////////////////////////////////////////
/// \struct TraitsImpl
/// \brief When using partial template specialization, if the compiler
@ -156,6 +209,10 @@ struct Traits<bool> : public TraitsImpl<bool> {
return value != 0 ? _TRUE_VALUE : _FALSE_VALUE;
}
// Converts text to bool: only the exact, case-sensitive string "True" maps
// to true; every other input (including an empty string) maps to false.
static bool FromString(std::string const &value) {
    return value.compare("True") == 0;
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, bool const &value) {
return ar.serialize(value);
@ -173,6 +230,15 @@ struct Traits<std::int8_t> : public TraitsImpl<std::int8_t> {
return std::to_string(value);
}
// Parses a decimal string into std::int8_t.
// Throws std::invalid_argument("Invalid conversion") when the parsed value
// does not fit in std::int8_t; exceptions raised by std::stoi itself
// propagate unchanged.
static std::int8_t FromString(std::string const &value) {
    using Limits = std::numeric_limits<std::int8_t>;

    int const parsed(std::stoi(value));
    bool const fits(parsed >= Limits::min() && parsed <= Limits::max());

    if(!fits)
        throw std::invalid_argument("Invalid conversion");

    return static_cast<std::int8_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::int8_t const &value) {
return ar.serialize(value);
@ -190,6 +256,15 @@ struct Traits<std::int16_t> : public TraitsImpl<std::int16_t> {
return std::to_string(value);
}
// Parses a decimal string into std::int16_t.
// Throws std::invalid_argument("Invalid conversion") when the parsed value
// does not fit in std::int16_t; exceptions raised by std::stoi itself
// propagate unchanged.
static std::int16_t FromString(std::string const &value) {
    using Limits = std::numeric_limits<std::int16_t>;

    int const parsed(std::stoi(value));
    bool const fits(parsed >= Limits::min() && parsed <= Limits::max());

    if(!fits)
        throw std::invalid_argument("Invalid conversion");

    return static_cast<std::int16_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::int16_t const &value) {
return ar.serialize(value);
@ -207,6 +282,11 @@ struct Traits<std::int32_t> : public TraitsImpl<std::int32_t> {
return std::to_string(value);
}
// Parses a decimal string into std::int32_t by delegating to std::stoi.
// The static_assert guarantees that `int` and std::int32_t have the same
// size, so no additional range check is performed here.
static std::int32_t FromString(std::string const &value) {
    static_assert(sizeof(std::int32_t) == sizeof(int), "This code expects that an int is 32 bits");

    int const parsed(std::stoi(value));
    return parsed;
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::int32_t const &value) {
return ar.serialize(value);
@ -224,6 +304,33 @@ struct Traits<std::int64_t> : public TraitsImpl<std::int64_t> {
return std::to_string(value);
}
// Parses a decimal string into std::int64_t via std::stoll.
// When `long long` is wider than 64 bits, a parsed value outside the
// std::int64_t range raises std::invalid_argument("Invalid conversion").
static std::int64_t FromString(std::string const &value) {
    static_assert(sizeof(std::int64_t) <= sizeof(long long), "This code expects that long long >= 64 bits");

    long long const parsed(std::stoll(value));

    // The size comparison is a compile-time constant and the limit comparison
    // is tautological when the sizes match; silence the resulting diagnostics.
#if (defined __clang__)
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wtautological-type-limit-compare"
#elif (defined _MSC_VER)
#   pragma warning(push)
#   pragma warning(disable: 4127) // Conditional expression is constant
#endif

    bool const needsRangeCheck(sizeof(long long) > sizeof(std::int64_t));

    if(needsRangeCheck && (parsed < std::numeric_limits<std::int64_t>::min() || parsed > std::numeric_limits<std::int64_t>::max()))
        throw std::invalid_argument("Invalid conversion");

#if (defined __clang__)
#   pragma clang diagnostic pop
#elif (defined _MSC_VER)
#   pragma warning(pop)
#endif

    return static_cast<std::int64_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::int64_t const &value) {
return ar.serialize(value);
@ -241,6 +348,15 @@ struct Traits<std::uint8_t> : public TraitsImpl<std::uint8_t> {
return std::to_string(value);
}
// Parses a decimal string into std::uint8_t.
//
// Throws std::invalid_argument("Invalid conversion") when the text carries a
// minus sign or when the parsed value does not fit in std::uint8_t;
// exceptions raised by std::stoul itself propagate unchanged.
static std::uint8_t FromString(std::string const &value) {
    // std::stoul accepts a leading '-' and negates the result in the unsigned
    // type, so a large enough negative number wraps around to a small positive
    // one. Reject the sign up front (after the whitespace std::stoul skips).
    std::size_t const firstChar(value.find_first_not_of(" \t\n\v\f\r"));

    if(firstChar != std::string::npos && value[firstChar] == '-')
        throw std::invalid_argument("Invalid conversion");

    unsigned long const parsed(std::stoul(value));

    if(parsed > std::numeric_limits<std::uint8_t>::max())
        throw std::invalid_argument("Invalid conversion");

    return static_cast<std::uint8_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::uint8_t const &value) {
return ar.serialize(value);
@ -258,6 +374,15 @@ struct Traits<std::uint16_t> : public TraitsImpl<std::uint16_t> {
return std::to_string(value);
}
// Parses a decimal string into std::uint16_t.
//
// Throws std::invalid_argument("Invalid conversion") when the text carries a
// minus sign or when the parsed value does not fit in std::uint16_t;
// exceptions raised by std::stoul itself propagate unchanged.
static std::uint16_t FromString(std::string const &value) {
    // std::stoul accepts a leading '-' and negates the result in the unsigned
    // type, so a large enough negative number wraps around to a small positive
    // one. Reject the sign up front (after the whitespace std::stoul skips).
    std::size_t const firstChar(value.find_first_not_of(" \t\n\v\f\r"));

    if(firstChar != std::string::npos && value[firstChar] == '-')
        throw std::invalid_argument("Invalid conversion");

    unsigned long const parsed(std::stoul(value));

    if(parsed > std::numeric_limits<std::uint16_t>::max())
        throw std::invalid_argument("Invalid conversion");

    return static_cast<std::uint16_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::uint16_t const &value) {
return ar.serialize(value);
@ -275,6 +400,33 @@ struct Traits<std::uint32_t> : public TraitsImpl<std::uint32_t> {
return std::to_string(value);
}
// Parses a decimal string into std::uint32_t.
//
// Throws std::invalid_argument("Invalid conversion") when the text carries a
// minus sign or (where `unsigned long` is wider than 32 bits) when the parsed
// value does not fit in std::uint32_t; exceptions raised by std::stoul itself
// propagate unchanged.
static std::uint32_t FromString(std::string const &value) {
    static_assert(sizeof(std::uint32_t) <= sizeof(unsigned long), "This code assumes that an unsigned long is at least 32 bits");

    // std::stoul accepts a leading '-' and negates the result in the unsigned
    // type; where `unsigned long` is 32 bits wide "-1" would silently become
    // 4294967295. Reject the sign up front (after the whitespace std::stoul
    // skips).
    std::size_t const firstChar(value.find_first_not_of(" \t\n\v\f\r"));

    if(firstChar != std::string::npos && value[firstChar] == '-')
        throw std::invalid_argument("Invalid conversion");

    unsigned long const parsed(std::stoul(value));

    // The size comparison is a compile-time constant and the limit comparison
    // is tautological when the sizes match; silence the resulting diagnostics.
#if (defined __clang__)
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wtautological-type-limit-compare"
#elif (defined _MSC_VER)
#   pragma warning(push)
#   pragma warning(disable: 4127) // Conditional expression is constant
#endif

    if(sizeof(unsigned long) > sizeof(std::uint32_t)) {
        if(parsed > std::numeric_limits<std::uint32_t>::max())
            throw std::invalid_argument("Invalid conversion");
    }

#if (defined __clang__)
#   pragma clang diagnostic pop
#elif (defined _MSC_VER)
#   pragma warning(pop)
#endif

    return static_cast<std::uint32_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::uint32_t const &value) {
return ar.serialize(value);
@ -292,6 +444,33 @@ struct Traits<std::uint64_t> : public TraitsImpl<std::uint64_t> {
return std::to_string(value);
}
// Parses a decimal string into std::uint64_t.
//
// Throws std::invalid_argument("Invalid conversion") when the text carries a
// minus sign or (where `unsigned long long` is wider than 64 bits) when the
// parsed value does not fit in std::uint64_t; exceptions raised by
// std::stoull itself propagate unchanged.
static std::uint64_t FromString(std::string const &value) {
    static_assert(sizeof(std::uint64_t) <= sizeof(unsigned long long), "This code expects that unsigned long long >= 64 bits");

    // std::stoull accepts a leading '-' and negates the result in the unsigned
    // type, so "-1" would silently become 18446744073709551615. Reject the
    // sign up front (after the whitespace std::stoull skips).
    std::size_t const firstChar(value.find_first_not_of(" \t\n\v\f\r"));

    if(firstChar != std::string::npos && value[firstChar] == '-')
        throw std::invalid_argument("Invalid conversion");

    unsigned long long const parsed(std::stoull(value));

    // The size comparison is a compile-time constant and the limit comparison
    // is tautological when the sizes match; silence the resulting diagnostics.
#if (defined __clang__)
#   pragma clang diagnostic push
#   pragma clang diagnostic ignored "-Wtautological-type-limit-compare"
#elif (defined _MSC_VER)
#   pragma warning(push)
#   pragma warning(disable: 4127) // Conditional expression is constant
#endif

    if(sizeof(unsigned long long) > sizeof(std::uint64_t)) {
        if(parsed > std::numeric_limits<std::uint64_t>::max())
            throw std::invalid_argument("Invalid conversion");
    }

#if (defined __clang__)
#   pragma clang diagnostic pop
#elif (defined _MSC_VER)
#   pragma warning(pop)
#endif

    return static_cast<std::uint64_t>(parsed);
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::uint64_t const &value) {
return ar.serialize(value);
@ -308,9 +487,7 @@ struct Traits<std::float_t> {
using nullable_type = std::float_t;
// Returns the value that represents "null" for this type: a quiet NaN.
// A single return statement is kept; a second return placed after it could
// never execute.
static nullable_type CreateNullValue(void) {
    return std::numeric_limits<std::float_t>::quiet_NaN();
}
static bool IsNull(nullable_type const& value) {
@ -333,6 +510,13 @@ struct Traits<std::float_t> {
return std::to_string(value);
}
// Parses text into std::float_t. The exact string "NaN" yields a quiet NaN;
// anything else is handed to std::stof, whose exceptions propagate unchanged.
static std::float_t FromString(std::string const &value) {
    bool const isNaNLiteral(value.compare("NaN") == 0);

    if(!isNaNLiteral)
        return std::stof(value);

    return std::numeric_limits<std::float_t>::quiet_NaN();
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::float_t const &value) {
return ar.serialize(value);
@ -372,6 +556,13 @@ struct Traits<std::double_t> {
return std::to_string(value);
}
// Parses text into std::double_t. The exact string "NaN" yields a quiet NaN;
// anything else is handed to std::stod, whose exceptions propagate unchanged.
static std::double_t FromString(std::string const &value) {
    bool const isNaNLiteral(value.compare("NaN") == 0);

    if(!isNaNLiteral)
        return std::stod(value);

    return std::numeric_limits<std::double_t>::quiet_NaN();
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::double_t const &value) {
return ar.serialize(value);
@ -389,6 +580,10 @@ struct Traits<std::string> : public TraitsImpl<std::string> {
return value;
}
// Converts text to std::string, which is simply a copy of the input.
//
// The result is returned by value (the `static T FromString(...)` shape listed
// for Traits): returning a reference to the parameter would dangle whenever
// the caller passes a temporary.
static std::string FromString(std::string const &value) {
    return value;
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::string const &value) {
ar.serialize(static_cast<std::uint32_t>(value.size()));
@ -447,6 +642,10 @@ struct Traits<std::array<T, ArrayV>> : public TraitsImpl<std::array<T, ArrayV>>
return ToStringImpl(value.data(), value.size());
}
// Parsing a std::array from text is not supported yet: this always throws
// std::logic_error("Not Implemented Yet").
static std::array<T, ArrayV> FromString(std::string const &value) {
    static_cast<void>(value); // intentionally unused
    throw std::logic_error("Not Implemented Yet");
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::array<T, ArrayV> const &value) {
for(auto const &item : value) {
@ -474,6 +673,10 @@ struct Traits<std::vector<T, AllocatorT>> : public TraitsImpl<std::vector<T, All
return ToStringImpl(value.data(), value.size());
}
/// Parsing a vector from a string is not supported yet; always throws `std::logic_error`.
static std::vector<T, AllocatorT> FromString(std::string const &value) {
    static_cast<void>(value); // Unused until parsing is implemented
    throw std::logic_error("Not Implemented Yet");
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::vector<T, AllocatorT> const &value) {
ar.serialize(static_cast<std::uint32_t>(value.size()));
@ -521,6 +724,10 @@ struct Traits<std::map<KeyT, T, CompareT, AllocatorT>> : public TraitsImpl<std::
return streamObj.str();
}
/// Parsing a map from a string is not supported yet; always throws `std::logic_error`.
static std::map<KeyT, T, CompareT, AllocatorT> FromString(std::string const &value) {
    static_cast<void>(value); // Unused until parsing is implemented
    throw std::logic_error("Not Implemented Yet");
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::map<KeyT, T, CompareT, AllocatorT> const &value) {
ar.serialize(static_cast<std::uint32_t>(value.size()));
@ -557,53 +764,6 @@ struct Traits<std::map<KeyT, T, CompareT, AllocatorT>> : public TraitsImpl<std::
}
};
/// Traits specialization for optional values; an empty optional is the "null" value.
template <typename T>
struct Traits<nonstd::optional<T>> {
    using nullable_type = nonstd::optional<T>;

    /// Creates the null value: an optional that holds nothing.
    static nullable_type CreateNullValue(void) {
        return nullable_type();
    }

    /// Returns true when the optional holds no value.
    static bool IsNull(nullable_type const& value) {
        return value.has_value() == false;
    }

    /// Returns the contained value; throws `std::runtime_error` when the optional is empty.
    static T const & GetNullableValue(nullable_type const& value) {
        if (!IsNull(value))
            return *value;

        throw std::runtime_error("GetNullableValue attempt on Optional type null.");
    }

    /// Formats the contained value via `Traits<T>::ToString`, or "NULL" when the optional is empty.
    static std::string ToString(nullable_type const& value) {
        if (!value) {
            return "NULL";
        }
        return Traits<T>::ToString(*value);
    }

    /// Writes a bool presence flag, followed by the contained value when there is one.
    template <typename ArchiveT>
    static ArchiveT & serialize(ArchiveT &ar, nonstd::optional<T> const &value) {
        ar.serialize(static_cast<bool>(value));

        if(value.has_value())
            Traits<T>::serialize(ar, *value);

        return ar;
    }

    /// Reads a bool presence flag and, when it is set, the value that follows it.
    template <typename ArchiveT>
    static nonstd::optional<T> deserialize(ArchiveT &ar) {
        nonstd::optional<T> deserialized;
        bool const hasValue(ar.template deserialize<bool>());

        if(hasValue)
            deserialized = Traits<T>::deserialize(ar);

        return deserialized;
    }
};
template <typename... Types>
struct Traits<std::tuple<Types...>> : public TraitsImpl<std::tuple<Types...>> {
static std::string ToString(std::tuple<Types ...> const& value) {
@ -614,6 +774,10 @@ struct Traits<std::tuple<Types...>> : public TraitsImpl<std::tuple<Types...>> {
return streamObj.str();
}
/// Parsing a tuple from a string is not supported yet; always throws `std::logic_error`.
static std::tuple<Types...> FromString(std::string const &value) {
    static_cast<void>(value); // Unused until parsing is implemented
    throw std::logic_error("Not Implemented Yet");
}
template <typename ArchiveT>
static ArchiveT & serialize(ArchiveT &ar, std::tuple<Types...> const &value) {
SerializeHelper<0>(ar, value);
@ -669,6 +833,114 @@ private:
}
};
/// Traits specialization for `std::chrono::duration`.
template <typename RepT, typename PeriodT>
struct Traits<std::chrono::duration<RepT, PeriodT>> : public TraitsImpl<std::chrono::duration<RepT, PeriodT>> {
    /// Formats the duration by streaming it through `date::operator <<`.
    static std::string ToString(std::chrono::duration<RepT, PeriodT> const &duration) {
        // TODO: This returns an absolutely awful string, but there isn't time to fix it now.
        //       Ideally, this should return something like HH:MM:SS[.Milliseconds]
        std::ostringstream stream;

        date::operator <<(stream, duration);
        stream.flush();

        return stream.str();
    }

    /// Parsing a duration from a string is not supported yet; always throws `std::logic_error`.
    static std::chrono::duration<RepT, PeriodT> FromString(std::string const &value) {
        static_cast<void>(value); // Unused until parsing is implemented
        throw std::logic_error("Not Implemented Yet");
    }

    /// Serializes the duration as its tick count, using the traits of `RepT`.
    template <typename ArchiveT>
    static ArchiveT & serialize(ArchiveT &ar, std::chrono::duration<RepT, PeriodT> const &duration) {
        return Traits<RepT>::serialize(
            ar,
            duration.count()
        );
    }

    /// Rebuilds a duration from the tick count read through the traits of `RepT`.
    template <typename ArchiveT>
    static std::chrono::duration<RepT, PeriodT> deserialize(ArchiveT &ar) {
        using DurationType = std::chrono::duration<RepT, PeriodT>;

        return DurationType(Traits<RepT>::deserialize(ar));
    }
};
/// Traits specialization for `std::chrono::time_point`.
template <typename ClockT, typename DurationT>
struct Traits<std::chrono::time_point<ClockT, DurationT>> : public TraitsImpl<std::chrono::time_point<ClockT, DurationT>> {
    /// Formats the time point by streaming it through `date::operator <<`.
    static std::string ToString(std::chrono::time_point<ClockT, DurationT> const &tp) {
        std::ostringstream stream;

        date::operator <<(stream, tp);
        stream.flush();

        return stream.str();
    }

    /// Parsing a time point from a string is not supported yet; always throws `std::logic_error`.
    static std::chrono::time_point<ClockT, DurationT> FromString(std::string const &value) {
        static_cast<void>(value); // Unused until parsing is implemented
        throw std::logic_error("Not Implemented Yet");
    }

    /// Serializes the time point as its duration since the clock's epoch.
    template <typename ArchiveT>
    static ArchiveT & serialize(ArchiveT &ar, std::chrono::time_point<ClockT, DurationT> const &tp) {
        return Traits<DurationT>::serialize(
            ar,
            tp.time_since_epoch()
        );
    }

    /// Rebuilds a time point from the duration read through the traits of `DurationT`.
    template <typename ArchiveT>
    static std::chrono::time_point<ClockT, DurationT> deserialize(ArchiveT &ar) {
        using TimePointType = std::chrono::time_point<ClockT, DurationT>;

        return TimePointType(Traits<DurationT>::deserialize(ar));
    }
};
/// Traits specialization for optional values; an empty optional is the "null" value.
template <typename T>
struct Traits<nonstd::optional<T>> {
    using nullable_type = nonstd::optional<T>;

    /// Creates the null value: an optional that holds nothing.
    static nullable_type CreateNullValue(void) {
        return nullable_type();
    }

    /// Returns true when the optional holds no value.
    static bool IsNull(nullable_type const& value) {
        return value.has_value() == false;
    }

    /// Returns the contained value; throws `std::runtime_error` when the optional is empty.
    static T const & GetNullableValue(nullable_type const& value) {
        if (!IsNull(value))
            return *value;

        throw std::runtime_error("GetNullableValue attempt on Optional type null.");
    }

    /// Formats the contained value via `Traits<T>::ToString`, or "NULL" when the optional is empty.
    static std::string ToString(nullable_type const& value) {
        if (!value) {
            return "NULL";
        }
        return Traits<T>::ToString(*value);
    }

    /// Maps the text "NULL" to an empty optional; anything else is parsed by `Traits<T>::FromString`.
    static nonstd::optional<T> FromString(std::string const &value) {
        if(value != "NULL")
            return Traits<T>::FromString(value);

        return nonstd::optional<T>();
    }

    /// Writes a bool presence flag, followed by the contained value when there is one.
    template <typename ArchiveT>
    static ArchiveT & serialize(ArchiveT &ar, nonstd::optional<T> const &value) {
        ar.serialize(static_cast<bool>(value));

        if(value.has_value())
            Traits<T>::serialize(ar, *value);

        return ar;
    }

    /// Reads a bool presence flag and, when it is set, the value that follows it.
    template <typename ArchiveT>
    static nonstd::optional<T> deserialize(ArchiveT &ar) {
        nonstd::optional<T> deserialized;
        bool const hasValue(ar.template deserialize<bool>());

        if(hasValue)
            deserialized = Traits<T>::deserialize(ar);

        return deserialized;
    }
};
// TODO: ONNX (Sparse) Tensor
// TODO: Apache Arrow

Просмотреть файл

@ -89,6 +89,40 @@ TEST_CASE("Transformer_Integers") {
CHECK(Traits<std::uint16_t>::ToString(arg_u16) == "250");
CHECK(Traits<std::uint32_t>::ToString(arg_u32) == "480");
CHECK(Traits<std::uint64_t>::ToString(arg_u64) == "7799");
CHECK(Traits<std::int8_t>::FromString("100") == 100);
CHECK(Traits<std::int8_t>::FromString("-100") == -100);
CHECK_THROWS(Traits<std::int8_t>::FromString("this is not valid"));
CHECK_THROWS_WITH(Traits<std::int8_t>::FromString("1000"), "Invalid conversion");
CHECK_THROWS_WITH(Traits<std::int8_t>::FromString("-1000"), "Invalid conversion");
CHECK(Traits<std::int16_t>::FromString("100") == 100);
CHECK(Traits<std::int16_t>::FromString("-100") == -100);
CHECK_THROWS(Traits<std::int16_t>::FromString("this is not valid"));
CHECK_THROWS_WITH(Traits<std::int16_t>::FromString("100000"), "Invalid conversion");
CHECK_THROWS_WITH(Traits<std::int16_t>::FromString("-100000"), "Invalid conversion");
CHECK(Traits<std::int32_t>::FromString("100") == 100);
CHECK(Traits<std::int32_t>::FromString("-100") == -100);
CHECK_THROWS(Traits<std::int32_t>::FromString("this is not valid"));
CHECK(Traits<std::int64_t>::FromString("100") == 100);
CHECK(Traits<std::int64_t>::FromString("-100") == -100);
CHECK_THROWS(Traits<std::int64_t>::FromString("this is not valid"));
CHECK(Traits<std::uint8_t>::FromString("100") == 100);
CHECK_THROWS(Traits<std::uint8_t>::FromString("this is not valid"));
CHECK_THROWS_WITH(Traits<std::uint8_t>::FromString("2000"), "Invalid conversion");
CHECK(Traits<std::uint16_t>::FromString("100") == 100);
CHECK_THROWS(Traits<std::uint16_t>::FromString("this is not valid"));
CHECK_THROWS_WITH(Traits<std::uint16_t>::FromString("200000"), "Invalid conversion");
CHECK(Traits<std::uint32_t>::FromString("100") == 100);
// Exercise the 32-bit specialization here; this line previously re-tested std::uint8_t,
// leaving the invalid-input path of std::uint32_t uncovered.
CHECK_THROWS(Traits<std::uint32_t>::FromString("this is not valid"));
CHECK(Traits<std::uint64_t>::FromString("100") == 100);
CHECK_THROWS(Traits<std::uint64_t>::FromString("this is not valid"));
}
TEST_CASE("Transformer_Numbers") {
@ -103,6 +137,24 @@ TEST_CASE("Transformer_Numbers") {
CHECK(Traits<std::float_t>::ToString(arg_f) == "123.000000");
CHECK(Traits<std::double_t>::ToString(arg_d1) == "123.450000");
CHECK(Traits<std::double_t>::ToString(arg_d2) == "135453984983490.546875");
#if (defined __clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wfloat-equal"
# pragma clang diagnostic ignored "-Wdouble-promotion"
#endif
CHECK(Traits<std::float_t>::FromString("0.12345") == 0.12345f);
CHECK(std::isnan(Traits<std::float_t>::FromString("NaN")));
CHECK_THROWS(Traits<std::float_t>::FromString("this is not valid"));
CHECK(Traits<std::double_t>::FromString("0.12345") == 0.12345);
CHECK(std::isnan(Traits<std::double_t>::FromString("NaN")));
CHECK_THROWS(Traits<std::double_t>::FromString("this is not valid"));
#if (defined __clang__)
# pragma clang diagnostic pop
#endif
}
TEST_CASE("Transformer_Arrays") {
@ -123,6 +175,7 @@ TEST_CASE("Transformer_Arrays") {
std::string vecinarr_s{"[[1.030000,-20.100000,305.800000],[1.030000,-20.100000,305.800000]]"};
CHECK(vecinarr_res == vecinarr_s);
CHECK_THROWS_WITH((Traits<std::array<std::double_t, 4>>::FromString(arr_hasnull_s)), "Not Implemented Yet");
}
TEST_CASE("Transformer_Vectors") {
@ -154,6 +207,8 @@ TEST_CASE("Transformer_Vectors") {
std::string vecwitharr_s{"[[8.800000,0.020000,3643.700000]]"};
std::string vecwitharr_res = Traits<std::vector<std::array<std::double_t, 3>>>::ToString(vecwitharr);
CHECK(vecwitharr_res == vecwitharr_s);
CHECK_THROWS_WITH(Traits<std::vector<int16_t>>::FromString(Rvect_s), "Not Implemented Yet");
}
TEST_CASE("Transformer_Maps") {
@ -163,6 +218,8 @@ TEST_CASE("Transformer_Maps") {
std::string map_res = Traits<std::map<std::int16_t, std::double_t>>::ToString(m);
std::string map_s{ "{5:35.800000,93:0.147000}" };
CHECK(map_res == map_s);
CHECK_THROWS_WITH((Traits<std::map<std::int16_t, std::double_t>>::FromString(map_res)), "Not Implemented Yet");
}
TEST_CASE("Transformer_Tuples") {
@ -170,6 +227,8 @@ TEST_CASE("Transformer_Tuples") {
std::string tu_res = Traits<std::tuple<int, std::string, double>>::ToString(tu);
std::string tu_s{"(42,hi,-3.140000)"};
CHECK(tu_res == tu_s);
CHECK_THROWS_WITH((Traits<std::tuple<int, std::string, double>>::FromString(tu_s)), "Not Implemented Yet");
}
#if (defined __clang__)