Support custom operator: StringToVector. (#84)

Co-authored-by: Zuwei Zhao <zuzhao@microsoft.com>
Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
This commit is contained in:
Zuwei Zhao 2021-04-22 13:29:43 +08:00 коммит произвёл GitHub
Родитель 7e906179aa
Коммит 794890985e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 267 добавлений и 7 удалений

Просмотреть файл

@ -16,7 +16,7 @@
|StringConcat | Supported |
|StringRegexSplitWithOffsets| Supported |
|VectorToString| Supported |
|StringToVector| Under development|
|StringToVector| Supported|
|StringSlice | Under development|
### Tokenizer
@ -338,7 +338,7 @@ Mapping result for unmapped string
***data: tensor(string)***
Iut tensor
Input tensor
#### Outputs
@ -424,13 +424,13 @@ the result returned when a vector isn't found in the map
#### Inputs
***data: tensor(string)***
***data: tensor(T)***
Input tensor
#### Outputs
***output: tensor(T)***
***output: tensor(string)***
The mapping result of the input
@ -454,7 +454,7 @@ mapping_table = \
"""
node = onnx.helper.make_node(
'StringToVector',
'VectorToString',
inputs=['x'],
outputs=['y'],
map=mapping_table,
@ -463,7 +463,7 @@ node = onnx.helper.make_node(
x = np.array([[0,0,1,2],[0,1,3,4],[0,0,0,0]], type=np.int64)
y = ["a", "d", "unknown_worde"]
y = ["a", "d", "unknown_word"]
expect(node, inputs=[x], outputs=[y],

Просмотреть файл

@ -0,0 +1,154 @@
#include <charconv>
#include "kernels.h"
#include "utils/string_utils.h"
#include "string_to_vector.hpp"
#include "string_common.h"
// Builds the lookup state from the two serialized operator attributes.
//   map: one "key<TAB>v0 v1 ..." entry per line.
//   unk: space-separated integers used as the result for keys not in the table.
// Throws std::runtime_error (from the parse helpers) on malformed input.
StringToVectorImpl::StringToVectorImpl(std::string& map, std::string& unk) {
  // The table must be parsed first: it determines vector_len_, which the
  // unknown-value parser checks its element count against.
  this->ParseMappingTable(map);
  this->ParseUnkownValue(unk);
}
// Looks up every input string in the mapping table.
//   str_input:  flattened string tensor contents.
//   input_dim:  shape of the input tensor.
//   output_dim: receives the input shape with one extra trailing axis of
//               length vector_len_.
// Returns one int64 vector per input string; strings missing from the table
// yield a copy of unk_value_.
std::vector<std::vector<int64_t>> StringToVectorImpl::Compute(std::vector<std::string>& str_input, const OrtTensorDimensions& input_dim, OrtTensorDimensions& output_dim) {
  // Set output dimension
  output_dim = input_dim;
  output_dim.push_back(vector_len_);

  std::vector<std::vector<int64_t>> result;
  // One row per input element, so the final size is known up front.
  result.reserve(str_input.size());

  // Look up by reference; copying each string into a temporary key is unnecessary.
  for (const std::string& key : str_input) {
    const auto it = map_.find(key);
    result.push_back(it != map_.end() ? it->second : unk_value_);
  }
  return result;
}
// Parses the serialized mapping table into map_.
// Each non-empty line has the form "key<TAB>v0 v1 ...". The number of values
// on the first line defines vector_len_.
// Throws std::runtime_error when a line does not split into exactly two
// tab-separated fields, when the first line carries no values, or when
// ParseValues rejects a value.
void StringToVectorImpl::ParseMappingTable(std::string& map) {
  auto lines = SplitString(map, "\n", true);

  if (lines.empty()) {
    // Nothing to map. Pin the length explicitly so that the later reads of
    // vector_len_ (unknown-value validation, output shape) are well-defined.
    vector_len_ = 0;
    return;
  }

  vector_len_ = ParseVectorLen(lines[0]);
  if (vector_len_ == 0) {
    throw std::runtime_error(MakeString("The mapped value of string input cannot be empty: ", lines[0]));
  }

  // Scratch buffer reused for every line; it is copied into the map per entry.
  std::vector<int64_t> values(vector_len_);
  for (const auto& line : lines) {
    auto kv = SplitString(line, "\t", true);
    if (kv.size() != 2) {
      throw std::runtime_error(MakeString("Failed to parse mapping_table when processing the line: ", line));
    }

    ParseValues(kv[1], values);

    // string to vector mapping
    map_[std::string{kv[0]}] = values;
  }
}
// Parses the space-separated "unk" attribute into unk_value_.
// The number of integers must equal vector_len_.
// Throws std::runtime_error on a length mismatch or when any token is not a
// complete, in-range int64 literal.
void StringToVectorImpl::ParseUnkownValue(std::string& unk) {
  auto unk_strs = SplitString(unk, " ", true);
  if (unk_strs.size() != vector_len_) {
    throw std::runtime_error(MakeString("Incompatible dimension: required vector length of unknown_value should be: ", vector_len_));
  }

  unk_value_.reserve(unk_strs.size());
  for (const auto& str : unk_strs) {
    int64_t value = 0;
    const char* const last = str.data() + str.size();
    const auto [end, ec] = std::from_chars(str.data(), last, value);
    // from_chars reports overflow through ec while still consuming every
    // digit, so the end-pointer check alone would accept an out-of-range
    // number and leave `value` unset.
    if (ec != std::errc{} || end != last) {
      throw std::runtime_error(MakeString("Failed to parse unknown_value when processing the number: ", str));
    }
    unk_value_.push_back(value);
  }
}
// Returns how many space-separated values follow the tab on a table line.
// Throws std::runtime_error when the line does not split into exactly two
// tab-separated fields.
size_t StringToVectorImpl::ParseVectorLen(const std::string_view& line) {
  const auto fields = SplitString(line, "\t", true);
  if (fields.size() != 2) {
    throw std::runtime_error(MakeString("Failed to parse mapping_table when processing the line: ", line));
  }

  // fields[0] is the key; only the value list determines the length.
  return SplitString(fields[1], " ", true).size();
}
// Parses the space-separated integers in `v` into `values`.
//   v:      value portion of a mapping-table line, e.g. "0 1 2".
//   values: pre-sized output buffer; its size is the required element count.
// Throws std::runtime_error when the number of tokens differs from
// values.size() or when a token is not a complete, in-range int64 literal.
void StringToVectorImpl::ParseValues(const std::string_view& v, std::vector<int64_t>& values) {
  const std::vector<std::string_view> value_strs = SplitString(v, " ", true);

  // Without this check a longer line would write past the end of `values`,
  // and a shorter line would silently keep entries left over from an earlier call.
  if (value_strs.size() != values.size()) {
    throw std::runtime_error(MakeString("Incompatible dimension: every mapped vector must have the same length, failed on: ", v));
  }

  for (size_t i = 0; i < value_strs.size(); i++) {
    int64_t value = 0;
    const char* const first = value_strs[i].data();
    const char* const last = first + value_strs[i].size();
    const auto [end, ec] = std::from_chars(first, last, value);
    // ec catches out-of-range numbers, which still consume the whole token.
    if (ec != std::errc{} || end != last) {
      throw std::runtime_error(MakeString("Failed to parse map when processing the number: ", value_strs[i]));
    }
    values[i] = value;
  }
}
// Reads the "map" and "unk" string attributes from the kernel info and builds
// the shared implementation object from them.
KernelStringToVector::KernelStringToVector(OrtApi api, const OrtKernelInfo* info) : BaseKernel(api, info) {
  std::string mapping_table = ort_.KernelInfoGetAttribute<std::string>(info, "map");

  // unk_value is string here because KernelInfoGetAttribute doesn't support returning vector
  std::string unknown_value = ort_.KernelInfoGetAttribute<std::string>(info, "unk");

  impl_ = std::make_shared<StringToVectorImpl>(mapping_table, unknown_value);
}
// Runs the operator: reads the string tensor at input 0, maps every element
// through impl_, and writes the resulting int64 rows contiguously into output 0.
void KernelStringToVector::Compute(OrtKernelContext* context) {
  // Setup input
  const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
  std::vector<std::string> input_data;
  GetTensorMutableDataString(api_, ort_, context, input, input_data);
  OrtTensorDimensions input_dim(ort_, input);

  // Get output; output_dim is filled in by the implementation.
  OrtTensorDimensions output_dim;
  auto mapping_result = impl_->Compute(input_data, input_dim, output_dim);

  OrtValue* output = ort_.KernelContext_GetOutput(context, 0, output_dim.data(), output_dim.size());
  auto* output_data = ort_.GetTensorMutableData<int64_t>(output);

  // Set output tensor data. The flat index is size_t so it cannot overflow
  // the way a 32-bit int would on very large outputs.
  size_t idx = 0;
  for (const auto& row : mapping_result) {
    for (const int64_t value : row) {
      output_data[idx++] = value;
    }
  }
}
// Heap-allocates a KernelStringToVector for the given kernel info and returns
// it as an opaque pointer.
void* CustomOpStringToVector::CreateKernel(OrtApi api, const OrtKernelInfo* info) const {
  auto* kernel = new KernelStringToVector(api, info);
  return kernel;
}
// Returns the operator name, "StringToVector".
const char* CustomOpStringToVector::GetName() const {
  return "StringToVector";
}
// The operator takes a single input.
size_t CustomOpStringToVector::GetInputTypeCount() const {
  return 1;
}
// Every input index reports the string element type; the index is ignored.
ONNXTensorElementDataType CustomOpStringToVector::GetInputType(size_t /*index*/) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
}
// The operator produces a single output.
size_t CustomOpStringToVector::GetOutputTypeCount() const {
  return 1;
}
// Every output index reports the int64 element type; the index is ignored.
ONNXTensorElementDataType CustomOpStringToVector::GetOutputType(size_t /*index*/) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}

Просмотреть файл

@ -0,0 +1,47 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "kernels.h"
#include "farmhash.h"
#include "utils/string_utils.h"
// Maps strings to fixed-length int64 vectors using a table parsed from the
// operator's "map" attribute; strings not present in the table map to the
// vector parsed from the "unk" attribute.
class StringToVectorImpl {
 public:
  // map: serialized mapping table; unk: serialized unknown-value vector.
  StringToVectorImpl(std::string& map, std::string& unk);

  // Returns one vector per input string and writes the output shape
  // (input shape plus a trailing axis of vector_len_) into output_dim.
  std::vector<std::vector<int64_t>> Compute(std::vector<std::string>& str_input, const OrtTensorDimensions& input_dim, OrtTensorDimensions& output_dim);

 private:
  void ParseMappingTable(std::string& map);
  void ParseUnkownValue(std::string& unk);
  size_t ParseVectorLen(const std::string_view& line);
  void ParseValues(const std::string_view& v, std::vector<int64_t>& values);

  // mapping of string to vector
  std::unordered_map<std::string, std::vector<int64_t>> map_;
  // unknown value is a vector of int
  std::vector<int64_t> unk_value_;
  // Length of every mapped vector. Defaults to 0 so it is never read
  // uninitialized when the mapping table is empty.
  size_t vector_len_ = 0;
};
// Kernel for the StringToVector operator. Holds the parsed mapping state in
// impl_ and delegates the per-call lookup to it.
struct KernelStringToVector : BaseKernel {
  // Builds impl_ from the kernel's attributes.
  KernelStringToVector(OrtApi api, const OrtKernelInfo* info);

  // Maps the string input tensor to the int64 output tensor.
  void Compute(OrtKernelContext* context);

 private:
  std::shared_ptr<StringToVectorImpl> impl_;
};
// Descriptor for the StringToVector operator: its name, input/output counts
// and element types, and the factory for KernelStringToVector.
struct CustomOpStringToVector : Ort::CustomOpBase<CustomOpStringToVector, KernelStringToVector> {
  // Kernel factory.
  void* CreateKernel(OrtApi api, const OrtKernelInfo* info) const;

  // Operator name.
  const char* GetName() const;

  // Input signature.
  size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(size_t index) const;

  // Output signature.
  size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(size_t index) const;
};

Просмотреть файл

@ -29,7 +29,6 @@ std::vector<std::string> VectorToStringImpl::Compute(const void* input, const Or
for (int i = 0; i < input_dim.Size(); i += vector_len_) {
//construct key
for (int j = 0; j < vector_len_; j++) {
;
key[j] = ptr[j];
}

Просмотреть файл

@ -65,6 +65,29 @@ class VectorToString(CustomOp):
attr_data[k_] = v_
return attr_data
class StringToVector(CustomOp):
    """Schema for the StringToVector custom operator (string input, int64 output)."""

    @classmethod
    def get_inputs(cls):
        """Return the single string input definition, named "text"."""
        text_input = cls.io_def("text", onnx.TensorProto.STRING, [None])
        return [text_input]

    @classmethod
    def get_outputs(cls):
        """Return the single int64 output definition, named "token_ids"."""
        ids_output = cls.io_def('token_ids', onnx_proto.TensorProto.INT64, [])
        return [ids_output]

    @classmethod
    def serialize_attr(cls, attrs):
        """Convert attribute values into the string forms the kernel parses.

        A dict given as ``map`` becomes one ``key<TAB>v0 v1 ...`` line per
        entry, and a list given as ``unk`` becomes a space-separated string.
        Every other attribute is passed through unchanged.
        """
        serialized = {}
        for name, value in attrs.items():
            if name == 'map' and isinstance(value, dict):
                rows = []
                for key, vec in value.items():
                    rows.append(key + "\t" + " ".join([str(i) for i in vec]))
                serialized[name] = '\n'.join(rows)
            elif name == 'unk' and isinstance(value, list):
                serialized[name] = ' '.join(str(i) for i in value)
            else:
                serialized[name] = value
        return serialized
# TODO: list all custom operators schema here:
# ...
# ...

Просмотреть файл

@ -12,6 +12,7 @@
#include "kernels/string_regex_replace.hpp"
#include "kernels/string_regex_split.hpp"
#include "kernels/string_split.hpp"
#include "kernels/string_to_vector.hpp"
#include "kernels/string_upper.hpp"
#include "kernels/negpos.hpp"
#include "kernels/vector_to_string.hpp"
@ -49,6 +50,7 @@ CustomOpStringRaggedTensorToDense c_CustomOpStringRaggedTensorToDense;
CustomOpStringRegexReplace c_CustomOpStringRegexReplace;
CustomOpStringRegexSplitWithOffsets c_CustomOpStringRegexSplitWithOffsets;
CustomOpStringSplit c_CustomOpStringSplit;
CustomOpStringToVector c_CustomOpStringToVector;
CustomOpStringUpper c_CustomOpStringUpper;
CustomOpVectorToString c_CustomOpVectorToString;
CustomOpStringLength c_CustomOpStringLength;
@ -78,6 +80,7 @@ OrtCustomOp* operator_lists[] = {
&c_CustomOpStringRegexReplace,
&c_CustomOpStringRegexSplitWithOffsets,
&c_CustomOpStringSplit,
&c_CustomOpStringToVector,
&c_CustomOpStringUpper,
&c_CustomOpVectorToString,
&c_CustomOpStringLength,

Просмотреть файл

@ -0,0 +1,34 @@
import unittest
import numpy as np
from onnxruntime_customops.eager_op import EagerOp, StringToVector
def _run_string_to_vector(input, output, map, unk):
    """Run StringToVector eagerly on `input` and assert the result equals `output`.

    `map` and `unk` are forwarded as the operator's attributes.
    """
    op = EagerOp.from_customop(StringToVector, map=map, unk=unk)
    actual = op(input)
    np.testing.assert_array_equal(actual, output)
class TestStringToVector(unittest.TestCase):
    """End-to-end checks of the StringToVector operator through EagerOp."""

    def test_string_to_vector1(self):
        # Length-1 vectors; the last input is not in the table.
        words = np.array(["a", "b", "c", "unknown_word"])
        expected = np.array([[0], [2], [3], [-1]], dtype=np.int64)
        table = {"a": [0], "b": [2], "c": [3]}
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1])

    def test_string_to_vector2(self):
        # Length-3 vectors; the last input is not in the table.
        words = np.array(["a", "b", "c", "unknown_word"])
        expected = np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4], [-1, -1, -1]], dtype=np.int64)
        table = {"a": [0, 1, 2], "b": [1, 2, 3], "c": [2, 3, 4]}
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1, -1, -1])

    def test_string_to_vector3(self):
        # Non-ASCII keys, with two inputs missing from the table.
        words = np.array(["a", "b", "c", "unknown_word", "你好", "下午", "测试"])
        expected = np.array([[0, 1, 2], [1, 2, 3], [2, 3, 4], [-1, -1, -1], [6, 6, 6], [7, 8, 9], [-1, -1, -1]], dtype=np.int64)
        table = {"a": [0, 1, 2], "b": [1, 2, 3], "c": [2, 3, 4], "你好": [6, 6, 6], "下午": [7, 8, 9]}
        _run_string_to_vector(input=words, output=expected, map=table, unk=[-1, -1, -1])
# Run the test suite when this module is executed directly.
if __name__ == "__main__":
    unittest.main()