Support custom operator: StringToVector. (#84)
Co-authored-by: Zuwei Zhao <zuzhao@microsoft.com> Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
This commit is contained in:
Родитель
7e906179aa
Коммит
794890985e
|
@ -16,7 +16,7 @@
|
|||
|StringConcat | Supported |
|
||||
|StringRegexSplitWithOffsets| Supported |
|
||||
|VectorToString| Supported |
|
||||
|StringToVector| Under development|
|
||||
|StringToVector| Supported|
|
||||
|StringSlice | Under development|
|
||||
### Tokenizer
|
||||
|
||||
|
@ -338,7 +338,7 @@ Mapping result for unmapped string
|
|||
|
||||
***data: tensor(string)***
|
||||
|
||||
Iut tensor
|
||||
Input tensor
|
||||
|
||||
#### Outputs
|
||||
|
||||
|
@ -424,13 +424,13 @@ the result returned when a vector aren't found in the map
|
|||
|
||||
#### Inputs
|
||||
|
||||
***data: tensor(string)***
|
||||
***data: tensor(T)***
|
||||
|
||||
Input tensor
|
||||
|
||||
#### Outputs
|
||||
|
||||
***output: tensor(T)***
|
||||
***output: tensor(string)***
|
||||
|
||||
The mapping result of the input
|
||||
|
||||
|
@ -454,7 +454,7 @@ mapping_table = \
|
|||
"""
|
||||
|
||||
node = onnx.helper.make_node(
|
||||
'StringToVector',
|
||||
'VectorToString',
|
||||
inputs=['x'],
|
||||
outputs=['y'],
|
||||
map=mapping_table,
|
||||
|
@ -463,7 +463,7 @@ node = onnx.helper.make_node(
|
|||
|
||||
|
||||
x = np.array([[0,0,1,2],[0,1,3,4],[0,0,0,0]], dtype=np.int64)
|
||||
y = ["a", "d", "unknown_worde"]
|
||||
y = ["a", "d", "unknown_word"]
|
||||
|
||||
|
||||
expect(node, inputs=[x], outputs=[y],
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
#include <charconv>
|
||||
#include "kernels.h"
|
||||
#include "utils/string_utils.h"
|
||||
#include "string_to_vector.hpp"
|
||||
#include "string_common.h"
|
||||
|
||||
// Builds the string->vector lookup from the serialized `map` attribute and the
// fallback vector from the serialized `unk` attribute.
// Order matters: the mapping table is parsed first because it establishes
// vector_len_, which the unknown-value parser validates its length against.
StringToVectorImpl::StringToVectorImpl(std::string& map, std::string& unk) {
  ParseMappingTable(map);
  ParseUnkownValue(unk);
}
|
||||
|
||||
std::vector<std::vector<int64_t>> StringToVectorImpl::Compute(std::vector<std::string>& str_input, const OrtTensorDimensions& input_dim, OrtTensorDimensions& output_dim) {
|
||||
std::vector<std::vector<int64_t>> result;
|
||||
|
||||
// Set output dimension
|
||||
output_dim = input_dim;
|
||||
output_dim.push_back(vector_len_);
|
||||
|
||||
std::string key;
|
||||
for (size_t i = 0; i < str_input.size(); i++) {
|
||||
key = str_input[i];
|
||||
|
||||
auto it = map_.find(key);
|
||||
if (it != map_.end()) {
|
||||
result.push_back(it->second);
|
||||
} else {
|
||||
result.push_back(unk_value_);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void StringToVectorImpl::ParseMappingTable(std::string& map) {
|
||||
auto lines = SplitString(map, "\n", true);
|
||||
|
||||
if (lines.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
vector_len_ = ParseVectorLen(lines[0]);
|
||||
if (vector_len_ == 0) {
|
||||
throw std::runtime_error(MakeString("The mapped value of string input cannot be empty: ", lines[0]));
|
||||
}
|
||||
|
||||
std::vector<int64_t> values(vector_len_);
|
||||
for (auto& line : lines) {
|
||||
auto kv = SplitString(line, "\t", true);
|
||||
|
||||
if (kv.size() != 2) {
|
||||
throw std::runtime_error(MakeString("Failed to parse mapping_table when processing the line: ", line));
|
||||
}
|
||||
|
||||
ParseValues(kv[1], values);
|
||||
|
||||
// string to vector mapping
|
||||
map_[std::string{kv[0]}] = values;
|
||||
}
|
||||
}
|
||||
|
||||
void StringToVectorImpl::ParseUnkownValue(std::string& unk) {
|
||||
auto unk_strs = SplitString(unk, " ", true);
|
||||
if (unk_strs.size() != vector_len_) {
|
||||
throw std::runtime_error(MakeString("Incompatible dimension: required vector length of unknown_value should be: ", vector_len_));
|
||||
}
|
||||
|
||||
for (auto& str : unk_strs) {
|
||||
int64_t value;
|
||||
auto [end, ec] = std::from_chars(str.data(), str.data() + str.size(), value);
|
||||
if (end != str.data() + str.size()) {
|
||||
throw std::runtime_error(MakeString("Failed to parse unknown_value when processing the number: ", str));
|
||||
}
|
||||
|
||||
unk_value_.push_back(value);
|
||||
}
|
||||
}
|
||||
|
||||
size_t StringToVectorImpl::ParseVectorLen(const std::string_view& line) {
|
||||
auto kv = SplitString(line, "\t", true);
|
||||
|
||||
if (kv.size() != 2) {
|
||||
throw std::runtime_error(MakeString("Failed to parse mapping_table when processing the line: ", line));
|
||||
}
|
||||
|
||||
auto value_strs = SplitString(kv[1], " ", true);
|
||||
return value_strs.size();
|
||||
}
|
||||
|
||||
void StringToVectorImpl::ParseValues(const std::string_view& v, std::vector<int64_t>& values) {
|
||||
std::vector<std::string_view> value_strs = SplitString(v, " ", true);
|
||||
|
||||
int64_t value;
|
||||
for (int i = 0; i < value_strs.size(); i++) {
|
||||
auto [end, ec] = std::from_chars(value_strs[i].data(), value_strs[i].data() + value_strs[i].size(), value);
|
||||
if (end != value_strs[i].data() + value_strs[i].size()) {
|
||||
throw std::runtime_error(MakeString("Failed to parse map when processing the number: ", value_strs[i]));
|
||||
}
|
||||
values[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
// Reads the "map" and "unk" string attributes of the node and builds the
// mapping implementation from them.
KernelStringToVector::KernelStringToVector(OrtApi api, const OrtKernelInfo* info) : BaseKernel(api, info) {
  std::string mapping_table = ort_.KernelInfoGetAttribute<std::string>(info, "map");
  // The unknown value is fetched as a string as well (per the original author:
  // KernelInfoGetAttribute doesn't support returning a vector); it is parsed
  // into integers by StringToVectorImpl.
  std::string unknown_value = ort_.KernelInfoGetAttribute<std::string>(info, "unk");

  impl_ = std::make_shared<StringToVectorImpl>(mapping_table, unknown_value);
}
|
||||
|
||||
// Runs the operator: reads the string tensor at input 0, maps every string to
// its int64 vector and writes the vectors back to back into output 0, whose
// shape is the one reported by StringToVectorImpl::Compute.
void KernelStringToVector::Compute(OrtKernelContext* context) {
  // Setup input
  const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
  std::vector<std::string> input_data;
  GetTensorMutableDataString(api_, ort_, context, input, input_data);
  OrtTensorDimensions input_dim(ort_, input);

  // Get output
  OrtTensorDimensions output_dim;
  auto mapping_result = impl_->Compute(input_data, input_dim, output_dim);

  OrtValue* output = ort_.KernelContext_GetOutput(context, 0, output_dim.data(), output_dim.size());
  auto* output_data = ort_.GetTensorMutableData<int64_t>(output);

  // Set output tensor data. The running index is size_t so that it cannot
  // overflow on outputs with more than INT_MAX elements.
  size_t idx = 0;
  for (const auto& res : mapping_result) {
    for (int64_t value : res) {
      output_data[idx++] = value;
    }
  }
}
|
||||
|
||||
// Allocates a new kernel instance for this operator and returns it to the caller.
void* CustomOpStringToVector::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {
  auto* kernel = new KernelStringToVector(api, info);
  return kernel;
}
|
||||
|
||||
// Name under which the operator is registered.
const char* CustomOpStringToVector::GetName() const {
  return "StringToVector";
}
|
||||
|
||||
// The operator takes a single input.
size_t CustomOpStringToVector::GetInputTypeCount() const { return 1; }
|
||||
|
||||
// Every input (there is only one) is a string tensor; the index is ignored.
ONNXTensorElementDataType CustomOpStringToVector::GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; }
|
||||
|
||||
// The operator produces a single output.
size_t CustomOpStringToVector::GetOutputTypeCount() const { return 1; }
|
||||
|
||||
// Every output (there is only one) is an int64 tensor; the index is ignored.
ONNXTensorElementDataType CustomOpStringToVector::GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; }
|
|
@ -0,0 +1,47 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include "kernels.h"
|
||||
#include "farmhash.h"
|
||||
#include "utils/string_utils.h"
|
||||
|
||||
|
||||
class StringToVectorImpl {
|
||||
public:
|
||||
StringToVectorImpl(std::string& map, std::string& unk);
|
||||
std::vector<std::vector<int64_t>> Compute(std::vector<std::string>& str_input, const OrtTensorDimensions& input_dim, OrtTensorDimensions& output_dim);
|
||||
|
||||
private:
|
||||
void ParseMappingTable(std::string& map);
|
||||
void ParseUnkownValue(std::string& unk);
|
||||
size_t ParseVectorLen(const std::string_view& line);
|
||||
void ParseValues(const std::string_view& v, std::vector<int64_t>& values);
|
||||
|
||||
// mapping of string to vector
|
||||
std::unordered_map<std::string, std::vector<int64_t>> map_;
|
||||
// unkown value is a vector of int
|
||||
std::vector<int64_t> unk_value_;
|
||||
size_t vector_len_;
|
||||
};
|
||||
|
||||
struct KernelStringToVector : BaseKernel {
|
||||
KernelStringToVector(OrtApi api, const OrtKernelInfo* info);
|
||||
void Compute(OrtKernelContext* context);
|
||||
|
||||
private:
|
||||
std::shared_ptr<StringToVectorImpl> impl_;
|
||||
};
|
||||
|
||||
// Operator descriptor for StringToVector: exposes the operator name, its
// input/output counts and element types, and creates its kernel.
struct CustomOpStringToVector : Ort::CustomOpBase<CustomOpStringToVector, KernelStringToVector> {
  // Kernel factory.
  void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;

  // Operator name.
  const char* GetName() const;

  // Input signature.
  size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(size_t index) const;

  // Output signature.
  size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(size_t index) const;
};
|
|
@ -29,7 +29,6 @@ std::vector<std::string> VectorToStringImpl::Compute(const void* input, const Or
|
|||
for (int i = 0; i < input_dim.Size(); i += vector_len_) {
|
||||
//construct key
|
||||
for (int j = 0; j < vector_len_; j++) {
|
||||
;
|
||||
key[j] = ptr[j];
|
||||
}
|
||||
|
||||
|
|
|
@ -65,6 +65,29 @@ class VectorToString(CustomOp):
|
|||
attr_data[k_] = v_
|
||||
return attr_data
|
||||
|
||||
|
||||
class StringToVector(CustomOp):
    # Schema of the StringToVector custom operator: one string input ("text"),
    # one int64 output ("token_ids"), plus serialization of its attributes.

    @classmethod
    def get_inputs(cls):
        text_input = cls.io_def("text", onnx.TensorProto.STRING, [None])
        return [text_input]

    @classmethod
    def get_outputs(cls):
        ids_output = cls.io_def('token_ids', onnx_proto.TensorProto.INT64, [])
        return [ids_output]

    @classmethod
    def serialize_attr(cls, attrs):
        # A dict given as 'map' becomes one "<key>\t<v1> <v2> ..." line per entry;
        # a list given as 'unk' becomes a space-separated string.
        # Every other attribute is passed through unchanged.
        serialized = {}
        for name, value in attrs.items():
            if name == 'map' and isinstance(value, dict):
                rows = []
                for word, vector in value.items():
                    rows.append(word + "\t" + " ".join(str(n) for n in vector))
                serialized[name] = '\n'.join(rows)
            elif name == 'unk' and isinstance(value, list):
                serialized[name] = ' '.join(str(n) for n in value)
            else:
                serialized[name] = value
        return serialized
|
||||
|
||||
|
||||
# TODO: list all custom operators schema here:
|
||||
# ...
|
||||
# ...
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "kernels/string_regex_replace.hpp"
|
||||
#include "kernels/string_regex_split.hpp"
|
||||
#include "kernels/string_split.hpp"
|
||||
#include "kernels/string_to_vector.hpp"
|
||||
#include "kernels/string_upper.hpp"
|
||||
#include "kernels/negpos.hpp"
|
||||
#include "kernels/vector_to_string.hpp"
|
||||
|
@ -49,6 +50,7 @@ CustomOpStringRaggedTensorToDense c_CustomOpStringRaggedTensorToDense;
|
|||
CustomOpStringRegexReplace c_CustomOpStringRegexReplace;
|
||||
CustomOpStringRegexSplitWithOffsets c_CustomOpStringRegexSplitWithOffsets;
|
||||
CustomOpStringSplit c_CustomOpStringSplit;
|
||||
CustomOpStringToVector c_CustomOpStringToVector;
|
||||
CustomOpStringUpper c_CustomOpStringUpper;
|
||||
CustomOpVectorToString c_CustomOpVectorToString;
|
||||
CustomOpStringLength c_CustomOpStringLength;
|
||||
|
@ -78,6 +80,7 @@ OrtCustomOp* operator_lists[] = {
|
|||
&c_CustomOpStringRegexReplace,
|
||||
&c_CustomOpStringRegexSplitWithOffsets,
|
||||
&c_CustomOpStringSplit,
|
||||
&c_CustomOpStringToVector,
|
||||
&c_CustomOpStringUpper,
|
||||
&c_CustomOpVectorToString,
|
||||
&c_CustomOpStringLength,
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
import unittest
|
||||
import numpy as np
|
||||
from onnxruntime_customops.eager_op import EagerOp, StringToVector
|
||||
|
||||
|
||||
def _run_string_to_vector(input, output, map, unk):
    # Build an eager StringToVector op from the given attributes, run it on
    # `input` and require the result to equal `output` element-wise.
    op = EagerOp.from_customop(StringToVector, map=map, unk=unk)
    np.testing.assert_array_equal(op(input), output)
|
||||
|
||||
|
||||
class TestStringToVector(unittest.TestCase):
    # Each case feeds known and unknown words and checks the mapped vectors.

    def test_string_to_vector1(self):
        # Vectors of length 1.
        table = {"a": [0], "b": [2], "c": [3]}
        words = np.array(["a", "b", "c", "unknown_word"])
        expected = np.array([[0], [2], [3], [-1]], dtype=np.int64)
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1])

    def test_string_to_vector2(self):
        # Vectors of length 3.
        table = {"a": [0, 1, 2], "b": [1, 2, 3], "c": [2, 3, 4]}
        words = np.array(["a", "b", "c", "unknown_word"])
        expected = np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4], [-1, -1, -1]], dtype=np.int64)
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1, -1, -1])

    def test_string_to_vector3(self):
        # Non-ASCII keys, including one that is absent from the table.
        table = {"a": [0, 1, 2], "b": [1, 2, 3], "c": [2, 3, 4], "你好": [6, 6, 6], "下午": [7, 8, 9]}
        words = np.array(["a", "b", "c", "unknown_word", "你好", "下午", "测试"])
        expected = np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4], [-1, -1, -1], [6, 6, 6], [7, 8, 9], [-1, -1, -1]], dtype=np.int64)
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1, -1, -1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
Загрузка…
Ссылка в новой задаче