Add operator StringSplit (#24)

2020-11-16 19:05:12 +01:00 · 2020-11-16 19:05:12 +01:00 · db43f413b8
--- a/ocos/kernels/string_split.cc
+++ b/ocos/kernels/string_split.cc
@ -0,0 +1,119 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #include "string_split.hpp"
 // Builds the kernel; the ORT API handle is forwarded unchanged to BaseKernel.
 KernelStringSplit::KernelStringSplit(OrtApi api) : BaseKernel(api) {}
 // Splits every string of a 1-D input tensor and returns the tokens in a
 // sparse layout (coordinates, values, dense shape).
 //
 // Inputs:
 //   0: 1-D string tensor, the strings to split.
 //   1: string tensor with exactly one element, the delimiter. It is handed to
 //      std::string::find_first_of, so each of its characters is a separator.
 //   2: bool tensor with exactly one element; when true, empty tokens are
 //      dropped.
 // Outputs:
 //   0: int64 tensor of shape [token count, 2] holding (row, column) pairs.
 //   1: 1-D string tensor with the tokens, in the same order as output 0.
 //   2: int64 tensor {number of input rows, largest token count of any row}.
 //
 // Throws std::runtime_error when an input does not have the expected shape.
 void KernelStringSplit::Compute(OrtKernelContext* context) {
  // Setup inputs
  const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0);
  const std::string* X = ort_.GetTensorData<std::string>(input_X);
  const OrtValue* input_sep = ort_.KernelContext_GetInput(context, 1);
  const std::string* sep = ort_.GetTensorData<std::string>(input_sep);
  const OrtValue* input_skip_empty = ort_.KernelContext_GetInput(context, 2);
  const bool* skip_empty = ort_.GetTensorData<bool>(input_skip_empty);
  // Validate the shapes before any of the data pointers is dereferenced.
  OrtTensorDimensions dimensions_sep(ort_, input_sep);
  if (dimensions_sep.size() != 1 || dimensions_sep[0] != 1)
    throw std::runtime_error("Input 2 is the delimiter, it has 1 element.");
  OrtTensorDimensions dimensions_skip_empty(ort_, input_skip_empty);
  if (dimensions_skip_empty.size() != 1 || dimensions_skip_empty[0] != 1)
    throw std::runtime_error("Input 3 is skip_empty, it has 1 element.");
  OrtTensorDimensions dimensions(ort_, input_X);
  if (dimensions.size() != 1)
    throw std::runtime_error("Only 1D tensor are supported as input.");
  const std::string& delimiter = *sep;
  const bool keep = !(*skip_empty);
  std::vector<std::string> words;
  std::vector<int64_t> indices;
  int64_t maxc = 0;
  for (int64_t row = 0; row < dimensions[0]; ++row) {
    const std::string& str = X[row];
    // An empty input string produces no token, whatever skip_empty says.
    if (str.empty())
      continue;
    int64_t col = 0;
    std::size_t previous = 0;
    for (;;) {
      const std::size_t current = str.find_first_of(delimiter, previous);
      // The last token ends at the end of the string. Comparing against
      // str.size() rather than npos is what lets skip_empty drop a trailing
      // empty token such as the one after the comma in "a,".
      const std::size_t stop = current == std::string::npos ? str.size() : current;
      if (keep || stop > previous) {
        words.push_back(str.substr(previous, stop - previous));
        indices.push_back(row);
        indices.push_back(col);
        ++col;
      }
      if (current == std::string::npos)
        break;
      previous = current + 1;
    }
    maxc = col > maxc ? col : maxc;
  }
  // Setup outputs
  std::vector<int64_t> shape_indices = {static_cast<int64_t>(indices.size()) / 2, 2};
  OrtValue* out_indices = ort_.KernelContext_GetOutput(context, 0, shape_indices.data(), shape_indices.size());
  std::vector<int64_t> shape_text(1, static_cast<int64_t>(words.size()));
  OrtValue* out_text = ort_.KernelContext_GetOutput(context, 1, shape_text.data(), shape_text.size());
  std::vector<int64_t> shape_shape(1, 2);
  OrtValue* out_shape = ort_.KernelContext_GetOutput(context, 2, shape_shape.data(), shape_shape.size());
  int64_t* p_indices = ort_.GetTensorMutableData<int64_t>(out_indices);
  std::string* p_text = ort_.GetTensorMutableData<std::string>(out_text);
  int64_t* p_shape = ort_.GetTensorMutableData<int64_t>(out_shape);
  // std::copy is well defined for an empty range, whereas memcpy must not be
  // given the possibly-null data() pointer of an empty vector.
  std::copy(indices.begin(), indices.end(), p_indices);
  p_shape[0] = dimensions[0];
  p_shape[1] = maxc;
  std::copy(words.begin(), words.end(), p_text);
 }
 // Creates a StringSplit kernel bound to the given ORT API; the kernel info is
 // not used. The kernel is allocated with `new` and is not released here.
 void* CustomOpStringSplit::CreateKernel(OrtApi api, const OrtKernelInfo* /* info */) {
  KernelStringSplit* kernel = new KernelStringSplit(api);
  return kernel;
 }
 // Name reported for this custom operator.
 const char* CustomOpStringSplit::GetName() const {
  const char* op_name = "StringSplit";
  return op_name;
 }
 // The operator takes three inputs: the text, the delimiter and skip_empty.
 size_t CustomOpStringSplit::GetInputTypeCount() const {
  const size_t input_count = 3;
  return input_count;
 }
 // Element type of each input: inputs 0 and 1 are strings, input 2 is a
 // boolean. Any other index raises std::runtime_error.
 ONNXTensorElementDataType CustomOpStringSplit::GetInputType(size_t index) const {
  if (index == 0 || index == 1)
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
  if (index == 2)
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL;
  throw std::runtime_error(MakeString("Unexpected input index ", index));
 }
 // The operator produces three outputs.
 size_t CustomOpStringSplit::GetOutputTypeCount() const {
  const size_t output_count = 3;
  return output_count;
 }
 // Element type of each output: output 1 is a string tensor, outputs 0 and 2
 // are int64 tensors. Any other index raises std::runtime_error.
 ONNXTensorElementDataType CustomOpStringSplit::GetOutputType(size_t index) const {
  if (index == 1)
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
  if (index == 0 || index == 2)
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
  throw std::runtime_error(MakeString("Unexpected output index ", index));
 }
--- a/ocos/kernels/string_split.hpp
+++ b/ocos/kernels/string_split.hpp
@ -0,0 +1,21 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #pragma once
 #include "kernels.h"
 #include "utils.h"
 // Kernel of the StringSplit custom operator, built on top of BaseKernel.
 struct KernelStringSplit : BaseKernel {
  // Creates the kernel from the ORT API handle.
  KernelStringSplit(OrtApi api);
  // Runs the operator on the inputs and outputs reachable through `context`.
  void Compute(OrtKernelContext* context);
 };
 // Custom operator descriptor pairing the StringSplit operator with its kernel,
 // KernelStringSplit, through Ort::CustomOpBase.
 struct CustomOpStringSplit : Ort::CustomOpBase<CustomOpStringSplit, KernelStringSplit> {
  // Returns a newly created kernel as an untyped pointer.
  void* CreateKernel(OrtApi api, const OrtKernelInfo* info);
  // Returns the operator name.
  const char* GetName() const;
  // Number of inputs and the element type of input `index`.
  size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(size_t index) const;
  // Number of outputs and the element type of output `index`.
  size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(size_t index) const;
 };
--- a/ocos/ortcustomops.cc
+++ b/ocos/ortcustomops.cc
@ -5,6 +5,7 @@
 #include "kernels/string_hash.hpp"
 #include "kernels/string_join.hpp"
 #include "kernels/string_regex_replace.hpp"
 #include "kernels/string_split.hpp"
 #include "kernels/string_upper.hpp"
 #include "kernels/test_output.hpp"
 #include "utils.h"
@ -15,6 +16,7 @@ CustomOpStringHash c_CustomOpStringHash;
 CustomOpStringHashFast c_CustomOpStringHashFast;
 CustomOpStringJoin c_CustomOpStringJoin;
 CustomOpStringRegexReplace c_CustomOpStringRegexReplace;
 CustomOpStringSplit c_CustomOpStringSplit;
 CustomOpStringUpper c_CustomOpStringUpper;
 CustomOpOne c_CustomOpOne;
 CustomOpTwo c_CustomOpTwo;
@ -26,6 +28,7 @@ OrtCustomOp* operator_lists[] = {
    &c_CustomOpStringHashFast,
    &c_CustomOpStringJoin,
    &c_CustomOpStringRegexReplace,
    &c_CustomOpStringSplit,
    &c_CustomOpStringUpper,
    &c_CustomOpOne,
    &c_CustomOpTwo,
--- a/test/test_string_ops.py
+++ b/test/test_string_ops.py
@ -150,6 +150,36 @@ def _create_test_model_string_equal(prefix, domain='ai.onnx.contrib'):
    return model
 def _create_test_model_string_split(prefix, domain='ai.onnx.contrib'):
    """Build a small ONNX model around a single ``<prefix>StringSplit`` node.
    The three graph inputs (``input``, ``delimiter``, ``skip_empty``) each go
    through an ``Identity`` node before reaching the split node, whose three
    outputs (``indices``, ``values``, ``shape``) are the graph outputs.
    """
    def _value_info(name, elem_type):
        return helper.make_tensor_value_info(name, elem_type, [])
    nodes = [
        helper.make_node('Identity', ['input'], ['id1']),
        helper.make_node('Identity', ['delimiter'], ['id2']),
        helper.make_node('Identity', ['skip_empty'], ['id3']),
        helper.make_node(
            '%sStringSplit' % prefix, ['id1', 'id2', 'id3'],
            ['indices', 'values', 'shape'], domain=domain),
    ]
    graph_inputs = [
        _value_info('input', onnx_proto.TensorProto.STRING),
        _value_info('delimiter', onnx_proto.TensorProto.STRING),
        _value_info('skip_empty', onnx_proto.TensorProto.BOOL),
    ]
    graph_outputs = [
        _value_info('indices', onnx_proto.TensorProto.INT64),
        _value_info('values', onnx_proto.TensorProto.STRING),
        _value_info('shape', onnx_proto.TensorProto.INT64),
    ]
    graph = helper.make_graph(nodes, 'test0', graph_inputs, graph_outputs)
    return helper.make_model(
        graph, opset_imports=[helper.make_operatorsetid(domain, 1)])
 class TestPythonOpString(unittest.TestCase):
    _string_join = None
@ -246,6 +276,36 @@ class TestPythonOpString(unittest.TestCase):
        def string_equal(x, y):
            return x == y
        @onnx_op(op_type="PyStringSplit",
                 inputs=[PyCustomOpDef.dt_string, PyCustomOpDef.dt_string,
                         PyCustomOpDef.dt_bool],
                 outputs=[PyCustomOpDef.dt_int64, PyCustomOpDef.dt_string,
                          PyCustomOpDef.dt_int64])
        def string_split(input, delimiter, skip_empty):
            """Python reference implementation of the StringSplit operator.
            :param input: one-dimensional array of strings to split
            :param delimiter: array holding a single string, the separator
                given to ``str.split``
            :param skip_empty: array holding a single boolean; when true,
                empty tokens are dropped
            :return: tuple ``(indices, texts, shape)`` where *indices* is an
                int64 array of shape ``(number of tokens, 2)`` with
                ``(row, column)`` pairs, *texts* holds the tokens in the same
                order, and *shape* is ``[len(input), largest token count]``
            :raises RuntimeError: if an argument has an unexpected shape
            """
            if delimiter.shape != (1, ):
                raise RuntimeError(
                    "delimiter must be a single element tensor.")
            if skip_empty.shape != (1, ):
                raise RuntimeError(
                    "skip_empty must be a single element tensor.")
            if len(input.shape) != 1:
                raise RuntimeError("input must be a one-dimensional tensor.")
            delimiter = delimiter[0]
            skip_empty = skip_empty[0]
            texts = []
            indices = []
            max_split = 0
            for row, text in enumerate(input):
                # An empty string contributes no token at all.
                if not text:
                    continue
                res = text.split(delimiter)
                if skip_empty:
                    res = [t for t in res if t]
                texts.extend(res)
                max_split = max(max_split, len(res))
                indices.extend((row, i) for i in range(len(res)))
            # reshape and dtype=str keep the outputs well formed, shape (0, 2)
            # and a string array, even when no token was produced.
            return (np.array(indices, dtype=np.int64).reshape((-1, 2)),
                    np.array(texts, dtype=str),
                    np.array([len(input), max_split], dtype=np.int64))
        cls._string_join = string_join
        cls._string_to_crc32 = string_to_crc32
@ -576,6 +636,79 @@ class TestPythonOpString(unittest.TestCase):
            txout = sess.run(None, {'x': y, 'y': x})
            self.assertEqual(txout[0].tolist(), (y == x).tolist())
    def test_string_split_python(self):
        """Run the Python implementation (PyStringSplit) through onnxruntime
        and compare its three outputs with hand-written expectations, with
        and without skipping empty tokens."""
        so = _ort.SessionOptions()
        so.register_custom_ops_library(_get_library_path())
        onnx_model = _create_test_model_string_split('Py')
        self.assertIn('op_type: "PyStringSplit"', str(onnx_model))
        sess = _ort.InferenceSession(onnx_model.SerializeToString(), so)
        text_input = np.array(["a,,b", "", "aa,b,c", "dddddd"])
        delimiter = np.array([","])
        # Expected (indices, values) for each value of skip_empty.
        expected = {
            True: ([[0, 0], [0, 1], [2, 0], [2, 1], [2, 2], [3, 0]],
                   ['a', 'b', 'aa', 'b', 'c', 'dddddd']),
            False: ([[0, 0], [0, 1], [0, 2], [2, 0], [2, 1], [2, 2], [3, 0]],
                    ['a', '', 'b', 'aa', 'b', 'c', 'dddddd']),
        }
        exp_shape = [4, 3]
        for skip in (True, False):
            with self.subTest(skip=skip):
                feeds = {'input': text_input, 'delimiter': delimiter,
                         'skip_empty': np.array([skip])}
                txout = sess.run(None, feeds)
                exp_indices, exp_text = expected[skip]
                self.assertEqual(exp_indices, txout[0].tolist())
                self.assertEqual(exp_text, txout[1].tolist())
                self.assertEqual(exp_shape, txout[2].tolist())
    def test_string_split_cc(self):
        """Run the C++ StringSplit operator through onnxruntime and compare
        its three outputs with hand-written expectations and, when
        tensorflow can be imported, with ``tensorflow.raw_ops.StringSplit``
        called with the same delimiter."""
        so = _ort.SessionOptions()
        so.register_custom_ops_library(_get_library_path())
        onnx_model = _create_test_model_string_split('')
        self.assertIn('op_type: "StringSplit"', str(onnx_model))
        sess = _ort.InferenceSession(onnx_model.SerializeToString(), so)
        input = np.array(["a,,b", "", "aa,b,c", "dddddd"])
        delimiter = np.array([","])
        # tensorflow is optional: resolve it once instead of on every
        # iteration and skip the cross-check when it is missing.
        try:
            from tensorflow.raw_ops import StringSplit
        except ImportError:
            StringSplit = None
        for skip in [True, False]:
            with self.subTest(skip=skip):
                skip_empty = np.array([skip])
                txout = sess.run(
                    None, {'input': input, 'delimiter': delimiter,
                           'skip_empty': skip_empty})
                if StringSplit is not None:
                    # Use the very delimiter that was fed to the operator.
                    tfres = StringSplit(
                        input=input, delimiter=str(delimiter[0]),
                        skip_empty=skip)
                    self.assertEqual(
                        [_.decode() for _ in tfres[1].numpy().tolist()],
                        txout[1].tolist())
                    self.assertEqual(
                        tfres[0].numpy().tolist(), txout[0].tolist())
                    self.assertEqual(
                        tfres[2].numpy().tolist(), txout[2].tolist())
                if skip:
                    exp_indices = np.array(
                        [[0, 0], [0, 1], [2, 0], [2, 1], [2, 2], [3, 0]])
                    exp_text = np.array(['a', 'b', 'aa', 'b', 'c', 'dddddd'])
                else:
                    exp_indices = np.array(
                        [[0, 0], [0, 1], [0, 2], [2, 0], [2, 1], [2, 2], [3, 0]])
                    exp_text = np.array(['a', '', 'b', 'aa', 'b', 'c', 'dddddd'])
                exp_shape = np.array([4, 3])
                self.assertEqual(exp_indices.tolist(), txout[0].tolist())
                self.assertEqual(exp_text.tolist(), txout[1].tolist())
                self.assertEqual(exp_shape.tolist(), txout[2].tolist())
 if __name__ == "__main__":
    unittest.main()