Add initial CUDA native UT (#625)

* Add initial CUDA native UT
* fix the build issue
* fix other build error
* add 30 mins to android packaging pipeline timeout due to early timing out
* undo android pipeline timeout change - move to other PR
* revert ifdef for testing ci
* add if def for cuda
* update ci ORT linux package name
* update the package extraction path
* Update ci.yml
* Update ci.yml

---------

Co-authored-by: Sayan Shaw <sayanshaw@microsoft.com>
Co-authored-by: Wenbing Li <wenbingl@outlook.com>
Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
2024-01-13 15:34:16 -08:00 · 2024-01-13 15:34:16 -08:00 · a03eded71e
--- a/.pipelines/ci.yml
+++ b/.pipelines/ci.yml
@ -558,7 +558,7 @@ stages:
      name: 'onnxruntime-extensions-Linux-GPU-A10'
    timeoutInMinutes: 120
    variables:
-      ORT_VERSION: '1.16.2'
+      ORT_VERSION: '1.16.3'
      TORCH_VERSION: 'torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118'
    steps:
      - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
@ -584,7 +584,7 @@ stages:
          userRepository: 'microsoft/onnxruntime'
          defaultVersionType: 'specificTag'
          version: 'v$(ORT_VERSION)'
-          itemPattern: '*-linux-x64-$(ORT_VERSION)*'
+          itemPattern: '*-linux-x64-gpu-$(ORT_VERSION)*'
          downloadPath: '$(Build.SourcesDirectory)'
        displayName: Download the ONNXRuntime prebuilt package.

@ -609,7 +609,7 @@ stages:
          script: |
            docker run --gpus all --rm \
              --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
              -e CUDA_PATH=/usr/local/cuda-11.8 \
              onnxruntime-extensionscuda11build \
              /bin/bash -c "
@ -626,7 +626,7 @@ stages:
          script: |
            docker run --gpus all --rm \
              --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
              -e CUDA_PATH=/usr/local/cuda-11.8 \
              onnxruntime-extensionscuda11build \
              /bin/bash -c "
@ -644,7 +644,7 @@ stages:
          script: |
            docker run --gpus all --rm \
              --volume $(Build.SourcesDirectory):/onnxruntime-extensions \
-              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-$(ORT_VERSION):/onnxruntime \
+              --volume $(Build.SourcesDirectory)/onnxruntime-linux-x64-gpu-$(ORT_VERSION):/onnxruntime \
              -e CUDA_PATH=/usr/local/cuda-11.8 \
              onnxruntime-extensionscuda11build \
              /bin/bash -c "
--- a/test/data/cuda/test_fastgelu.onnx
+++ b/test/data/cuda/test_fastgelu.onnx
--- a/test/data/cuda/test_fastgelu_f16.onnx
+++ b/test/data/cuda/test_fastgelu_f16.onnx
--- a/test/data/cuda/test_negpos.onnx
+++ b/test/data/cuda/test_negpos.onnx
--- a/test/shared_test/test_kernel.hpp
+++ b/test/shared_test/test_kernel.hpp
@ -52,6 +52,7 @@ void RunSession(Ort::Session& session_object,
 void TestInference(Ort::Env& env, const ORTCHAR_T* model_uri,
                   const std::vector<TestValue>& inputs,
                   const std::vector<TestValue>& outputs,
-                   OutputValidator output_validator = nullptr);
+                   OutputValidator output_validator = nullptr,
+                   void* cuda_compute_stream = nullptr);

 void GetTensorMutableDataString(const OrtApi& api, const OrtValue* value, std::vector<std::string>& output);
--- a/test/shared_test/test_ortops.cc
+++ b/test/shared_test/test_ortops.cc
@ -296,11 +296,31 @@ void ValidateOutputEqual(size_t output_idx, Ort::Value& actual, TestValue expect
  }
 }

+OrtCUDAProviderOptions CreateDefaultOrtCudaProviderOptionsWithCustomStream(void* cuda_compute_stream) {  // Builds CUDA provider options for device 0; cuda_compute_stream may be nullptr (no user stream).
+  OrtCUDAProviderOptions cuda_options;
+
+  cuda_options.device_id = 0;  // always targets device index 0
+  cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch::OrtCudnnConvAlgoSearchExhaustive;  // exhaustive cuDNN convolution algorithm search
+  cuda_options.gpu_mem_limit = std::numeric_limits<size_t>::max();  // largest size_t value; presumably means "no cap" - confirm against the ORT docs
+  cuda_options.arena_extend_strategy = 0;  // NOTE(review): 0 is believed to be kNextPowerOfTwo - confirm against the ORT docs
+  cuda_options.do_copy_in_default_stream = true;  // enables the do_copy_in_default_stream option
+  cuda_options.has_user_compute_stream = cuda_compute_stream != nullptr ? 1 : 0;  // flag is 1 only when the caller passed a non-null stream
+  cuda_options.user_compute_stream = cuda_compute_stream;  // stored as given, including nullptr
+  cuda_options.default_memory_arena_cfg = nullptr;  // no custom arena configuration supplied
+
+  return cuda_options;  // returned by value to the caller
+}
+
 void TestInference(Ort::Env& env, const ORTCHAR_T* model_uri,
                   const std::vector<TestValue>& inputs,
                   const std::vector<TestValue>& outputs,
-                   OutputValidator output_validator) {
+                   OutputValidator output_validator,
+                   void* cuda_compute_stream) {
  Ort::SessionOptions session_options;
+#ifdef USE_CUDA
+  auto cuda_options = CreateDefaultOrtCudaProviderOptionsWithCustomStream(cuda_compute_stream);
+  session_options.AppendExecutionProvider_CUDA(cuda_options);
+#endif
  auto library_handle = RegisterExtOps(session_options);

  // if session creation passes, model loads fine
--- a/test/shared_test/test_ortops_cuda.cc
+++ b/test/shared_test/test_ortops_cuda.cc
@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <filesystem>
+#include <locale>
+#include "gtest/gtest.h"
+#include "ocos.h"
+#include "test_kernel.hpp"
+
+#ifdef USE_CUDA
+
+TEST(CudaOp, test_fastgelu) {  // Feeds float inputs "x" and "bias" to data/cuda/test_fastgelu.onnx and passes the expected output "y" to TestInference.
+  auto ort_env = std::make_unique<Ort::Env>(ORT_LOGGING_LEVEL_WARNING, "Default");  // environment logging at warning level
+
+  std::vector<TestValue> inputs(2);  // two model inputs: "x" and "bias"
+  inputs[0].name = "x";
+  inputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  inputs[0].dims = {6};  // 1-D tensor with 6 elements
+  inputs[0].values_float = {0., 1., 2., 3., 4., 5.};
+
+  inputs[1].name = "bias";
+  inputs[1].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  inputs[1].dims = {6};  // same dims as "x"
+  inputs[1].values_float = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5};
+
+  std::vector<TestValue> outputs(1);  // single expected output: "y"
+  outputs[0].name = "y";
+  outputs[0].element_type = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  outputs[0].dims = {6};
+  outputs[0].values_float = {0., 0.9505811, 2.1696784, 3.298689, 4.399991, 5.5};  // expected values; presumably FastGelu(x + bias) - confirm against the model
+
+  std::filesystem::path model_path = "data/cuda";  // relative path, so it depends on the working directory the test runs from
+  model_path /= "test_fastgelu.onnx";
+
+  TestInference(*ort_env, model_path.c_str(), inputs, outputs);  // output_validator and cuda_compute_stream are left at their nullptr defaults
+}
+
+#endif