Enhancing CUDA Support in Python Package Build and Testing (#608)

* initial commit * Add the cuda support for python package * formt the code * refine it a little bit
2023-11-27 15:39:52 -08:00 · 2023-11-27 15:39:52 -08:00 · fb2a8c2841
--- a/.gitignore
+++ b/.gitignore
@ -52,3 +52,4 @@ java/hs_*.log
 *.pyd
 /test/data/ppp_vision/*.updated.onnx
 /test/data/generated/
+/CMakeSettings.json
--- a/.pyproject/backend.py
+++ b/.pyproject/backend.py
@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+###########################################################################
+
+import os
+import sys
+from setuptools import build_meta as _orig
+from setuptools.build_meta import *  # noqa: F403
+
+# add the current directory to the path, so we can import cmdclass.py
+sys.path.append(os.path.dirname(__file__))
+import cmdclass as _cmds  # noqa: E402
+
+
+def build_wheel(wheel_directory, config_settings=None,
+                metadata_directory=None):
+    """Build-backend ``build_wheel`` hook.
+
+    Records ``config_settings`` on ``CommandMixin`` so the custom commands
+    can read it later, then delegates to setuptools' ``build_wheel``.
+    """
+    mixin = _cmds.CommandMixin
+    mixin.config_settings = config_settings
+
+    result = _orig.build_wheel(
+        wheel_directory, config_settings, metadata_directory)
+    return result
+
+
+def build_editable(wheel_directory, config_settings=None,
+                   metadata_directory=None):
+    """Build-backend ``build_editable`` hook.
+
+    Records ``config_settings`` on ``CommandMixin`` so the custom commands
+    can read it later, then delegates to setuptools' ``build_editable``.
+    """
+    mixin = _cmds.CommandMixin
+    mixin.config_settings = config_settings
+
+    result = _orig.build_editable(
+        wheel_directory, config_settings, metadata_directory)
+    return result
--- a/.pyproject/cmdclass.py
+++ b/.pyproject/cmdclass.py
@ -0,0 +1,275 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+###########################################################################
+
+import re
+import os
+import sys
+import pathlib
+import subprocess
+
+from textwrap import dedent
+from setuptools.command.build import build as _build
+from setuptools.command.build_ext import build_ext as _build_ext
+from setuptools.command.develop import develop as _develop
+
+VSINSTALLDIR_NAME = 'VSINSTALLDIR'
+ORTX_USER_OPTION = 'ortx-user-option'
+
+
+def _load_cuda_version():
+    """Return the CUDA version reported by ``nvcc --version`` as ``"major.minor"``.
+
+    Returns ``None`` when ``nvcc`` cannot be started, exits with a non-zero
+    status, or prints no ``V<major>.<minor>.<patch>`` token, so the caller
+    can report a single, readable "nvcc not found" error.
+    """
+    try:
+        raw_output = subprocess.check_output(["nvcc", "--version"])
+    except (OSError, subprocess.CalledProcessError):
+        # nvcc is missing from PATH (OSError) or failed to run.
+        return None
+
+    output = raw_output.decode("utf-8", errors="replace")
+    match = re.search(r"\bV(\d+)\.(\d+)\.\d+\b", output)
+    if match is None:
+        return None
+
+    # only keep the major and minor version.
+    return f"{match.group(1)}.{match.group(2)}"
+
+
+def _load_vsdevcmd(project_root):
+    """Import the Visual Studio developer environment into ``os.environ``.
+
+    When ``VSINSTALLDIR`` is already set, only verify that ``cmake`` is on the
+    executable path. Otherwise run ``tools/get_vsdevcmd.ps1`` under
+    *project_root* and copy every ``NAME=VALUE`` line it prints into
+    ``os.environ``.
+
+    :param project_root: repository root that contains ``tools/get_vsdevcmd.ps1``
+    :raises SystemExit: ``VSINSTALLDIR`` is set but ``cmake`` cannot be found
+    """
+    if os.environ.get(VSINSTALLDIR_NAME) is not None:
+        import shutil
+        if shutil.which('cmake') is None:
+            raise SystemExit(
+                "Cannot find cmake in the executable path, "
+                "please run this script under Developer Command Prompt for VS.")
+        return
+
+    # NOTE(review): the leading space in ' -noprofile' looks unintended; kept as-is - confirm.
+    stdout, _ = subprocess.Popen([
+        'powershell', ' -noprofile', '-executionpolicy',
+        'bypass', '-f', project_root + '/tools/get_vsdevcmd.ps1', '-outputEnv', '1'],
+        stdout=subprocess.PIPE, shell=False, universal_newlines=True).communicate()
+    for line in stdout.splitlines():
+        # Split on the first '=' only, so values that themselves contain '='
+        # are kept instead of being silently dropped.
+        key, sep, value = line.partition('=')
+        if sep and key:
+            os.environ[key] = value
+
+
+def prepare_env(project_root):
+    """Prepare the build environment; only Windows needs any extra setup."""
+    if sys.platform != "win32":
+        return
+    _load_vsdevcmd(project_root)
+
+
+def read_git_refs(project_root):
+    """Return ``(release_branch, HEAD)`` for the git checkout at *project_root*.
+
+    ``HEAD`` is the hash printed by ``git log -1 --format=%H``.
+    ``release_branch`` is True when ``git show-ref --head`` lists, for that
+    hash, a ref whose name starts with ``refs/remotes/origin/rel-``.
+    """
+    def _git_lines(*git_args):
+        # Run git inside the project and return its cleaned-up output lines.
+        out, _ = subprocess.Popen(
+            ['git', *git_args],
+            cwd=project_root,
+            stdout=subprocess.PIPE, universal_newlines=True).communicate()
+        return [dedent(raw).strip('\n\r') for raw in out.splitlines()]
+
+    head = _git_lines('log', '-1', '--format=%H')[0]
+    on_release = False
+    for ref_line in _git_lines('show-ref', '--head'):
+        if not ref_line.startswith(head):
+            continue
+        _, ref_name = ref_line.split(' ')
+        if ref_name.startswith('refs/remotes/origin/rel-'):
+            on_release = True
+    return on_release, head
+
+
+class CommandMixin:
+    """Mixin that adds the ``ortx-user-option`` option to a setuptools command.
+
+    A value given on the command line is stored in the class-level
+    ``config_settings`` dict, which is shared by all command classes and is
+    also assigned directly by the build backend hooks.
+    """
+    user_options = [
+        (ORTX_USER_OPTION + '=', None, "extensions options for kernel building"),
+    ]
+    # Shared by every command; stays None until some source provides options.
+    config_settings = None
+
+    # noinspection PyAttributeOutsideInit
+    def initialize_options(self) -> None:
+        super().initialize_options()
+        self.ortx_user_option = None
+
+    def finalize_options(self) -> None:
+        cli_value = self.ortx_user_option
+        if cli_value is not None:
+            # The option may come from one source only.
+            if CommandMixin.config_settings is not None:
+                raise RuntimeError(
+                    f"Cannot pass {ORTX_USER_OPTION} several times, like as the command args and in backend API.")
+            CommandMixin.config_settings = {ORTX_USER_OPTION: cli_value}
+
+        super().finalize_options()
+
+
+class CmdDevelop(CommandMixin, _develop):
+    """``develop`` command that also accepts the ``ortx-user-option`` option."""
+    user_options = getattr(_develop, 'user_options', []) + CommandMixin.user_options
+
+
+class CmdBuild(CommandMixin, _build):
+    """``build`` command that also accepts the ``ortx-user-option`` option."""
+    user_options = getattr(_build, 'user_options', []) + CommandMixin.user_options
+
+    # noinspection PyAttributeOutsideInit
+    def finalize_options(self) -> None:
+        # There is a bug in setuptools that prevents the build get the right platform name from arguments.
+        # So, it cannot generate the correct wheel with the right arch in Official release pipeline.
+        # Force plat_name to be 'win-amd64' in Windows to fix that,
+        # since extensions cmake is only available on x64 for Windows now, it is not a problem to hardcode it.
+        on_windows = sys.platform == "win32"
+        if on_windows and "arm" not in sys.version.lower():
+            self.plat_name = "win-amd64"
+
+        # OCOS_SCB_DEBUG=1 in the environment switches the build to debug.
+        if os.environ.get('OCOS_SCB_DEBUG') == '1':
+            self.debug = True
+
+        super().finalize_options()
+
+
+class CmdBuildCMakeExt(_build_ext):
+    """``build_ext`` command that configures and builds the extension with CMake.
+
+    The build is tuned by the comma separated ``ortx-user-option`` value held
+    in ``CommandMixin.config_settings``. The options initialised below are
+    ``use-cuda``, ``no-azure``, ``no-opencv`` and ``cc-debug``; dashes and
+    underscores are interchangeable. An option is either a bare name (meaning
+    1) or ``name=value``.
+    """
+
+    # noinspection PyAttributeOutsideInit
+    def initialize_options(self):
+        super().initialize_options()
+        self.use_cuda = None
+        self.no_azure = None
+        self.no_opencv = None
+        self.cc_debug = None
+
+    @staticmethod
+    def _to_flag(value):
+        """Map textual booleans to 1/0; any other value is returned unchanged."""
+        text = str(value).strip().lower()
+        if text in ('1', 'on', 'true', 'yes'):
+            return 1
+        if text in ('0', 'off', 'false', 'no'):
+            return 0
+        return value
+
+    def _parse_options(self, options):
+        """Apply a comma separated option string to this command.
+
+        :param options: e.g. ``"use-cuda,no-azure=0"``
+        :return: ``self``
+        :raises RuntimeError: an option names an attribute this command lacks
+        """
+        for segment in options.split(','):
+            segment = segment.strip()
+            if not segment:
+                continue
+            # Split on the first '=' only. Explicit values are normalised to
+            # 1/0 so 'no-azure=1' and 'use-cuda=0' compare equal to the
+            # integers tested in build_cmake (they used to stay strings).
+            key, sep, value = segment.partition('=')
+            value = self._to_flag(value) if sep else 1
+
+            key = key.strip().replace('-', '_')
+            if not hasattr(self, key):
+                raise RuntimeError(
+                    f"Unknown {ORTX_USER_OPTION} option value: {key}")
+            setattr(self, key, value)
+        return self
+
+    def finalize_options(self) -> None:
+        if CommandMixin.config_settings is not None:
+            options = CommandMixin.config_settings.get(ORTX_USER_OPTION, "")
+            # A build frontend may hand over a list when the key is repeated.
+            if isinstance(options, (list, tuple)):
+                options = ','.join(options)
+            self._parse_options(options or "")
+            if self.cc_debug:
+                self.debug = True
+        super().finalize_options()
+
+    def run(self):
+        """
+        Perform build_cmake before doing the 'normal' stuff
+        """
+        for extension in self.extensions:
+            if extension.name == 'onnxruntime_extensions._extensions_pydll':
+                self.build_cmake(extension)
+
+    def build_cmake(self, extension):
+        """Configure and build *extension* with CMake.
+
+        The CMake source directory is the current working directory and the
+        build tree is ``self.build_temp``.
+
+        :raises RuntimeError: CUDA is enabled but no CUDA version could be
+            obtained from ``nvcc``
+        """
+        project_dir = pathlib.Path().absolute()
+        build_temp = pathlib.Path(self.build_temp)
+        build_temp.mkdir(parents=True, exist_ok=True)
+        ext_fullpath = pathlib.Path(
+            self.get_ext_fullpath(extension.name)).absolute()
+
+        config = 'RelWithDebInfo' if self.debug else 'Release'
+        cmake_args = [
+            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' +
+            str(ext_fullpath.parent.absolute()),
+            '-DOCOS_BUILD_PYTHON=ON',
+            '-DOCOS_PYTHON_MODULE_PATH=' + str(ext_fullpath),
+            '-DCMAKE_BUILD_TYPE=' + config
+        ]
+
+        if self.no_opencv:
+            # Disabling openCV can drastically reduce the build time.
+            cmake_args += [
+                '-DOCOS_ENABLE_OPENCV_CODECS=OFF',
+                '-DOCOS_ENABLE_CV2=OFF',
+                '-DOCOS_ENABLE_VISION=OFF']
+
+        if self.no_azure is not None:
+            azure_flag = "OFF" if self.no_azure == 1 else "ON"
+            cmake_args += ['-DOCOS_ENABLE_AZURE=' + azure_flag]
+            print("=> AzureOp build flag: " + azure_flag)
+
+        if self.use_cuda is not None:
+            cuda_flag = "OFF" if self.use_cuda == 0 else "ON"
+            cmake_args += ['-DOCOS_USE_CUDA=' + cuda_flag]
+            print("=> CUDA build flag: " + cuda_flag)
+            # nvcc is only needed, and _version.py only stamped, when CUDA is
+            # really enabled; 'use-cuda=0' must build without a CUDA toolkit.
+            if cuda_flag == "ON":
+                cuda_ver = _load_cuda_version()
+                if cuda_ver is None:
+                    raise RuntimeError(
+                        "Cannot find nvcc in your env:path, use-cuda doesn't work")
+                f_ver = ext_fullpath.parent / "_version.py"
+                # NOTE(review): the version is written unquoted, so Python
+                # reads it back as a float - confirm that is intended.
+                with f_ver.open('a') as _f:
+                    _f.writelines(["\n",
+                                   f"cuda = {cuda_ver}",
+                                   "\n"])
+
+        # CMake lets you override the generator - we need to check this.
+        # Can be set with Conda-Build, for example.
+        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
+        # Adding CMake arguments set as environment variable
+        # (needed e.g. to build for ARM OSx on conda-forge)
+        if "CMAKE_ARGS" in os.environ:
+            cmake_args += [
+                item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
+
+        if sys.platform != "win32":
+            # Using Ninja-build since it a) is available as a wheel and b)
+            # multithread automatically. MSVC would require all variables be
+            # exported for Ninja to pick it up, which is a little tricky to do.
+            # Users can override the generator with CMAKE_GENERATOR in CMake
+            # 3.15+.
+            if not cmake_generator or cmake_generator == "Ninja":
+                try:
+                    import ninja  # noqa: F401
+
+                    ninja_executable_path = os.path.join(
+                        ninja.BIN_DIR, "ninja")
+                    cmake_args += [
+                        "-GNinja",
+                        f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
+                    ]
+                except ImportError:
+                    # Best effort: fall back to CMake's default generator.
+                    pass
+
+        if sys.platform.startswith("darwin"):
+            # Cross-compile support for macOS - respect ARCHFLAGS if set
+            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
+            if archs:
+                cmake_args += [
+                    "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
+
+        # overwrite the Python module info if the auto-detection doesn't work.
+        # export Python3_INCLUDE_DIRS=/opt/python/cp38-cp38
+        # export Python3_LIBRARIES=/opt/python/cp38-cp38
+        for env in ['Python3_INCLUDE_DIRS', 'Python3_LIBRARIES']:
+            if env in os.environ:
+                cmake_args.append("-D%s=%s" % (env, os.environ[env]))
+
+        if self.debug:
+            cmake_args += ['-DCC_OPTIMIZE=OFF']
+
+        # the parallel build has to be limited on some Linux VM machine.
+        cpu_number = os.environ.get('CPU_NUMBER')
+        build_args = ['--config', config, '--parallel']
+        if cpu_number:
+            # The job count is its own argv entry, not '--parallel N' glued
+            # into one token with an embedded space.
+            build_args.append(cpu_number)
+
+        cmake_exe = 'cmake'
+        # unlike Linux/macOS, cmake pip package on Windows fails to build some 3rd party dependencies.
+        # so we have to use the cmake installed from Visual Studio.
+        vs_install_dir = os.environ.get(VSINSTALLDIR_NAME)
+        if vs_install_dir:
+            # os.path.join copes with VSINSTALLDIR with or without a trailing separator.
+            cmake_exe = os.path.join(
+                vs_install_dir,
+                'Common7\\IDE\\CommonExtensions\\Microsoft\\CMake\\CMake\\bin\\cmake.exe')
+            # Add this cmake directory into PATH to make sure the child-process still find it.
+            os.environ['PATH'] = os.path.dirname(
+                cmake_exe) + os.pathsep + os.environ['PATH']
+
+        self.spawn([cmake_exe, '-S', str(project_dir),
+                    '-B', str(build_temp)] + cmake_args)
+        if not self.dry_run:
+            self.spawn([cmake_exe, '--build', str(build_temp)] + build_args)
+
+
+# Custom command classes handed to setuptools through setup(cmdclass=...).
+ortx_cmdclass = {
+    'build': CmdBuild,
+    'develop': CmdDevelop,
+    'build_ext': CmdBuildCMakeExt,
+}
--- a/docs/development.md
+++ b/docs/development.md
@ -1,44 +1,83 @@
 # Build and Development

 This project supports Python and can be built from source easily, or a simple cmake build without Python dependency.
+
 ## Python package
+
 The package contains all custom operators and some Python scripts to manipulate the ONNX models.
- Install Visual Studio with C++ development tools on Windows, or gcc(>8.0) for Linux or xcode for macOS, and cmake on the unix-like platform. (**hints**: in Windows platform, if cmake bundled in Visual Studio was used, please specify the set _VSDEVCMD=%ProgramFiles(x86)%\Microsoft Visual Studio\<VERSION_YEAR>\<Edition>\Common7\Tools\VsDevCmd.bat_)
- If running on Windows, ensure that long file names are enabled, both for the [operating system](https://docs.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=cmd) and for git: `git config --system core.longpaths true`
+
+- Install Visual Studio with C++ development tools on Windows, or gcc(>8.0) for Linux or xcode for macOS, and cmake on
+  the unix-like platform.
+- If running on Windows, ensure that long file names are enabled, both for
+  the [operating system](https://docs.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=cmd)
+  and for git: `git config --system core.longpaths true`
 - Prepare Python env and install the pip packages in the requirements.txt.
- `pip install .` to build and install the package.<br/> OR `pip install -e .` to install the package in the development mode, which is more friendly for the developer since the Python code change will take effect without having to copy the files to a different location in the disk.(**hints**: debug=1 in setup.cfg wil make C++ code be debuggable in a Python process.)
+- `pip install .` to build and install the package.<br/> OR `pip install -e .` to install the package in the development
+  mode, which is more friendly for the developer since the Python code change will take effect without having to copy
+  the files to a different location on the disk. (**hint**: `debug=1` in setup.cfg will make the C++ code debuggable
+  in a Python process.)
+- Add the following argument `--config-settings "ortx-user-option=use-cuda"` in the pip command line to enable **CUDA**
+  kernels for the package.
+- The flags that can be used in the `ortx-user-option` value of `--config-settings` are:
+  - use-cuda: enable CUDA kernel build in Python package.
+  - no-azure: disable AzureOp kernel build in Python package.
+  - no-opencv: disable operators based on OpenCV in build.
+  - cc_debug: Generate debug info for extensions binaries and disable C/C++ compiler optimization.
+
+  For example: `pip install --config-settings "ortx-user-option=use-cuda,cc_debug" .` builds the CUDA kernels
+  into the package and installs it, with debug information generated for the extension binaries.

 Test:
+
 - 'pip install -r requirements-dev.txt' to install pip packages for development.
 - run `pytest test` in the project root directory.

 For a complete list of verified build configurations see [here](<./ci_matrix.md>)

 ## Java package
+
 `bash ./build.sh -DOCOS_BUILD_JAVA=ON` to build jar package in out/<OS>/Release folder

 ## Android package
+
 - pre-requisites: [Android Studio](https://developer.android.com/studio)

 Use `./tools/android/build_aar.py` to build an Android AAR package.

 ## iOS package
+
 Use `./tools/ios/build_xcframework.py` to build an iOS xcframework package.

 ## NuGet package
-In order to build a local NuGet package for testing, run `nuget.exe pack ./nuget/WinOnlyNuget.nuspec` to build a NuGet package for Windows.

-Note: you might need to update the src paths in the ./nuget/WinOnlyNuget.nuspec file if the appropriate ortextensions.dll files do not exist/are not in the given location.
+In order to build a local NuGet package for testing, run `nuget.exe pack ./nuget/WinOnlyNuget.nuspec` to build a NuGet
+package for Windows.
+
+Note: you might need to update the src paths in the ./nuget/WinOnlyNuget.nuspec file if the appropriate
+ortextensions.dll files do not exist/are not in the given location.

 ## Web-Assembly
-ONNXRuntime-Extensions will be built as a static library and linked with ONNXRuntime due to the lack of a good dynamic linking mechanism in WASM. Here are two additional arguments [–-use_extensions and --extensions_overridden_path](https://github.com/microsoft/onnxruntime/blob/860ba8820b72d13a61f0d08b915cd433b738ffdc/tools/ci_build/build.py#L416) on building onnxruntime to include ONNXRuntime-Extensions footprint in the ONNXRuntime package.
+
+ONNXRuntime-Extensions will be built as a static library and linked with ONNXRuntime due to the lack of a good dynamic
+linking mechanism in WASM. Here are two additional
+arguments [–-use_extensions and --extensions_overridden_path](https://github.com/microsoft/onnxruntime/blob/860ba8820b72d13a61f0d08b915cd433b738ffdc/tools/ci_build/build.py#L416)
+on building onnxruntime to include ONNXRuntime-Extensions footprint in the ONNXRuntime package.

 ## The C++ shared library
-for any other cases, please run `build.bat` or `bash ./build.sh` to build the library. By default, the DLL or the library will be generated in the directory `out/<OS>/<FLAVOR>`. There is a unit test to help verify the build.

+For any alternative scenarios, execute the following commands:
+
+- On Windows: Run `build.bat`.
+- On Unix-based systems: Execute `bash ./build.sh`.
+
+The generated DLL or library is typically located in the `out/<OS>/<FLAVOR>` directory. To validate the build, utilize
+the unit tests available in the `test/test_static_test` and `test/shared_test` directories.

 **VC Runtime static linkage**  
-If you want to build the binary with VC Runtime static linkage, please add a parameter _-DCMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded$<$<CONFIG:Debug>:Debug>"_ on running build.bat
+If you want to build the binary with VC Runtime static linkage, please add a parameter _-DCMAKE_MSVC_RUNTIME_LIBRARY="
+MultiThreaded$<$<CONFIG:Debug>:Debug>"_ on running build.bat

 ## Copyright guidance
-check this link https://docs.opensource.microsoft.com/releasing/general-guidance/copyright-headers/ for source file copyright header.
+
+check this link https://docs.opensource.microsoft.com/releasing/general-guidance/copyright-headers/ for source file
+copyright header.
--- a/onnxruntime_extensions/init.py
+++ b/onnxruntime_extensions/init.py
@ -2,7 +2,6 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 ###############################################################################
-
 """
 The `onnxruntime-extensions` Python package offers an API that allows users to generate models for pre-processing and
 post-processing tasks. In addition, it also provides an API to register custom operations implemented in Python.
@ -36,7 +35,7 @@ from ._ocos import enable_py_op
 from ._ocos import expand_onnx_inputs
 from ._ocos import hook_model_op
 from ._ocos import default_opset_domain
-from ._cuops import *   # noqa
+from ._cuops import *  # noqa
 from ._ortapi2 import OrtPyFunction as PyOrtFunction  # backward compatibility
 from ._ortapi2 import OrtPyFunction, ort_inference, optimize_model, make_onnx_model
 from ._ortapi2 import ONNXRuntimeError, ONNXRuntimeException
--- a/onnxruntime_extensions/_ocos.py
+++ b/onnxruntime_extensions/_ocos.py
@ -5,11 +5,34 @@
 """
 _ocos.py: PythonOp implementation
 """
-
+import os
 import sys
 import copy
+import glob
 import onnx
 from onnx import helper
+
+
+def _search_cuda_dir():
+    """Return the first ``PATH`` directory holding a ``cudart64*.dll``, or ``None``."""
+    for path in os.getenv('PATH', '').split(os.pathsep):
+        # An empty PATH entry would make glob search the current working
+        # directory and yield '' as the "CUDA directory"; skip it.
+        if not path:
+            continue
+        matches = glob.glob(os.path.join(path, 'cudart64*.dll'))
+        if matches:
+            return os.path.dirname(matches[0])
+
+    return None
+
+
+# On Windows, a build whose _version module defines `cuda` needs the CUDA
+# runtime directory registered before the native extension below is imported.
+if sys.platform == 'win32':
+    from . import _version  # noqa: E402
+    # `cuda` is only present in _version when the build stamped it there.
+    if hasattr(_version, 'cuda'):
+        # Look through PATH for a directory containing cudart64*.dll.
+        cuda_path = _search_cuda_dir()
+        if cuda_path is None:
+            raise RuntimeError(
+                "Cannot locate CUDA directory in the environment variable for GPU package")
+
+        # presumably lets the extension's CUDA runtime dependency resolve - confirm
+        os.add_dll_directory(cuda_path)
+
+
 from ._extensions_pydll import (  # noqa
    PyCustomOpDef, enable_py_op, add_custom_op, hash_64, default_opset_domain)

@ -65,7 +88,7 @@ class Opdef:
        if attrs is None:
            attrs = {}
        elif isinstance(attrs, (list, tuple)):
-                attrs = {k: PyCustomOpDef.dt_string for k in attrs}
+            attrs = {k: PyCustomOpDef.dt_string for k in attrs}
        opdef._nativedef.attrs = attrs
        add_custom_op(opdef._nativedef)
        return opdef
@ -115,7 +138,8 @@ def _ensure_opset_domain(model):
            domain_missing = False

    if domain_missing:
-        model.opset_import.extend([helper.make_operatorsetid(op_domain_name, 1)])
+        model.opset_import.extend(
+            [helper.make_operatorsetid(op_domain_name, 1)])

    return model

@ -130,7 +154,8 @@ def expand_onnx_inputs(model, target_input, extra_nodes, new_inputs):
    :return: The ONNX model after modification
    """
    graph = model.graph
-    new_inputs = [n for n in graph.input if n.name != target_input] + new_inputs
+    new_inputs = [n for n in graph.input if n.name !=
+                  target_input] + new_inputs
    new_nodes = list(model.graph.node) + extra_nodes
    new_graph = helper.make_graph(
        new_nodes, graph.name, new_inputs, list(graph.output), list(graph.initializer))
@ -179,7 +204,8 @@ def hook_model_op(model, node_name, hook_func, input_types):
    del hkd_model.graph.node[:]
    hkd_model.graph.node.extend(repacked)

-    Opdef.create(hook_func, op_type=optype_name, inputs=input_types, outputs=input_types)
+    Opdef.create(hook_func, op_type=optype_name,
+                 inputs=input_types, outputs=input_types)
    return _ensure_opset_domain(hkd_model)


--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,8 @@
 [build-system]
 # Minimum requirements for the build system to execute.
 requires = ["setuptools", "wheel", "numpy>=1.18.5", "ninja", "cmake"]  # PEP 508 specifications.
+build-backend = "backend"
+backend-path = [".pyproject"]

 [tool.black]
 line-length = 120
--- a/setup.py
+++ b/setup.py
@ -4,178 +4,24 @@
 # license information.
 ###########################################################################

-from setuptools import setup, find_packages
-from setuptools.command.build import build as _build
-from setuptools.command.build_ext import build_ext as _build_ext
-
-import re
 import os
 import sys
-import setuptools
 import pathlib
-import subprocess
+import setuptools

 from textwrap import dedent
+from setuptools import setup, find_packages

 TOP_DIR = os.path.dirname(__file__) or os.getcwd()
 PACKAGE_NAME = 'onnxruntime_extensions'
-VSINSTALLDIR_NAME = 'VSINSTALLDIR'

+# setup.py cannot be debugged in pip command line, so the command classes are refactored into another file
+cmds_dir = pathlib.Path(TOP_DIR) / '.pyproject'
+sys.path.append(str(cmds_dir))
+# noinspection PyUnresolvedReferences
+import cmdclass as _cmds  # noqa: E402

-def load_vsdevcmd():
-    if os.environ.get(VSINSTALLDIR_NAME) is None:
-        stdout, _ = subprocess.Popen([
-            'powershell', ' -noprofile', '-executionpolicy',
-            'bypass', '-f', TOP_DIR + '/tools/get_vsdevcmd.ps1', '-outputEnv', '1'],
-            stdout=subprocess.PIPE, shell=False, universal_newlines=True).communicate()
-        for line in stdout.splitlines():
-            kv_pair = line.split('=')
-            if len(kv_pair) == 2:
-                os.environ[kv_pair[0]] = kv_pair[1]
-    else:
-        import shutil
-        if shutil.which('cmake') is None:
-            raise SystemExit(
-                "Cannot find cmake in the executable path, "
-                "please run this script under Developer Command Prompt for VS.")
-
-
-def read_git_refs():
-    release_branch = False
-    stdout, _ = subprocess.Popen(
-        ['git'] + ['log', '-1', '--format=%H'],
-        cwd=TOP_DIR,
-        stdout=subprocess.PIPE, universal_newlines=True).communicate()
-    HEAD = dedent(stdout.splitlines()[0]).strip('\n\r')
-    stdout, _ = subprocess.Popen(
-        ['git'] + ['show-ref', '--head'],
-        cwd=TOP_DIR,
-        stdout=subprocess.PIPE, universal_newlines=True).communicate()
-    for _ln in stdout.splitlines():
-        _ln = dedent(_ln).strip('\n\r')
-        if _ln.startswith(HEAD):
-            _, _2 = _ln.split(' ')
-            if _2.startswith('refs/remotes/origin/rel-'):
-                release_branch = True
-    return release_branch, HEAD
-
-
-class BuildCMakeExt(_build_ext):
-
-    def run(self):
-        """
-        Perform build_cmake before doing the 'normal' stuff
-        """
-        for extension in self.extensions:
-            if extension.name == 'onnxruntime_extensions._extensions_pydll':
-                self.build_cmake(extension)
-
-    def build_cmake(self, extension):
-        project_dir = pathlib.Path().absolute()
-        build_temp = pathlib.Path(self.build_temp)
-        build_temp.mkdir(parents=True, exist_ok=True)
-        ext_fullpath = pathlib.Path(self.get_ext_fullpath(extension.name)).absolute()
-
-        config = 'RelWithDebInfo' if self.debug else 'Release'
-        cmake_args = [
-            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + str(ext_fullpath.parent.absolute()),
-            '-DOCOS_BUILD_PYTHON=ON',
-            '-DOCOS_PYTHON_MODULE_PATH=' + str(ext_fullpath),
-            '-DCMAKE_BUILD_TYPE=' + config
-        ]
-
-        if os.environ.get('OCOS_NO_OPENCV') == '1':
-            # Disabling openCV can drastically reduce the build time.
-            cmake_args += [
-                '-DOCOS_ENABLE_OPENCV_CODECS=OFF',
-                '-DOCOS_ENABLE_CV2=OFF',
-                '-DOCOS_ENABLE_VISION=OFF']
-
-        # explicitly set the flag for AzureOp, despite the default value in CMakeLists.txt
-        azure_flag = "ON" if os.environ.get('OCOS_ENABLE_AZURE') == '1' else None
-        if azure_flag is None:
-            # OCOS_NO_AZURE will be ignored if OCOS_ENABLE_AZURE is set.
-            azure_flag = "OFF" if os.environ.get('OCOS_NO_AZURE') == '1' else None
-        if azure_flag is not None:
-            cmake_args += ['-DOCOS_ENABLE_AZURE=' + azure_flag]
-            print("=> AzureOp build flag: " + azure_flag)
-
-        # CMake lets you override the generator - we need to check this.
-        # Can be set with Conda-Build, for example.
-        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
-        # Adding CMake arguments set as environment variable
-        # (needed e.g. to build for ARM OSx on conda-forge)
-        if "CMAKE_ARGS" in os.environ:
-            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-
-        if sys.platform != "win32":
-            # Using Ninja-build since it a) is available as a wheel and b)
-            # multithreads automatically. MSVC would require all variables be
-            # exported for Ninja to pick it up, which is a little tricky to do.
-            # Users can override the generator with CMAKE_GENERATOR in CMake
-            # 3.15+.
-            if not cmake_generator or cmake_generator == "Ninja":
-                try:
-                    import ninja  # noqa: F401
-
-                    ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja")
-                    cmake_args += [
-                        "-GNinja",
-                        f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
-                    ]
-                except ImportError:
-                    pass
-
-        if sys.platform.startswith("darwin"):
-            # Cross-compile support for macOS - respect ARCHFLAGS if set
-            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
-            if archs:
-                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
-
-        # overwrite the Python module info if the auto-detection doesn't work.
-        # export Python3_INCLUDE_DIRS=/opt/python/cp38-cp38
-        # export Python3_LIBRARIES=/opt/python/cp38-cp38
-        for env in ['Python3_INCLUDE_DIRS', 'Python3_LIBRARIES']:
-            if env in os.environ:
-                cmake_args.append("-D%s=%s" % (env, os.environ[env]))
-
-        if self.debug:
-            cmake_args += ['-DCC_OPTIMIZE=OFF']
-
-        # the parallel build has to be limited on some Linux VM machine.
-        cpu_number = os.environ.get('CPU_NUMBER')
-        build_args = [
-            '--config', config,
-            '--parallel' + ('' if cpu_number is None else ' ' + cpu_number)
-        ]
-        cmake_exe = 'cmake'
-        # unlike Linux/macOS, cmake pip package on Windows fails to build some 3rd party dependencies.
-        # so we have to use the cmake installed from Visual Studio.
-        if os.environ.get(VSINSTALLDIR_NAME):
-            cmake_exe = os.environ[VSINSTALLDIR_NAME] + \
-                        'Common7\\IDE\\CommonExtensions\\Microsoft\\CMake\\CMake\\bin\\cmake.exe'
-            # Add this cmake directory into PATH to make sure the child-process still find it.
-            os.environ['PATH'] = os.path.dirname(cmake_exe) + os.pathsep + os.environ['PATH']
-
-        self.spawn([cmake_exe, '-S', str(project_dir), '-B', str(build_temp)] + cmake_args)
-        if not self.dry_run:
-            self.spawn([cmake_exe, '--build', str(build_temp)] + build_args)
-
-
-class Build(_build):
-    def initialize_options(self) -> None:
-        super().initialize_options()
-        if os.environ.get('OCOS_SCB_DEBUG', None) == '1':
-            self.debug = True
-
-    def finalize_options(self) -> None:
-        # There is a bug in setuptools that prevents the build get the right platform name from arguments.
-        # So, it cannot generate the correct wheel with the right arch in Official release pipeline.
-        # Force plat_name to be 'win-amd64' in Windows to fix that.
-        # Since extensions cmake is only available on x64 for Windows now, it is not a problem to hardcode it.
-        if sys.platform == "win32" and "arm" not in sys.version.lower():
-            self.plat_name = "win-amd64"
-        super().finalize_options()
+_cmds.prepare_env(TOP_DIR)


 def read_requirements():
@ -195,7 +41,7 @@ def read_version():
        return version_str

    # is it a dev build or release?
-    rel_br, cid = read_git_refs() if os.path.isdir(
+    rel_br, cid = _cmds.read_git_refs(TOP_DIR) if os.path.isdir(
        os.path.join(TOP_DIR, '.git')) else (True, None)

    if rel_br:
@ -209,16 +55,13 @@ def read_version():
    return version_str


-def write_py_version(ortx_version):
+def write_py_version(ext_version):
    text = ["# Generated by setup.py, DON'T MANUALLY UPDATE IT!\n",
-            "__version__ = \"{}\"\n".format(ortx_version)]
-    with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/_version.py'), "w")) as _f:
-        _f.writelines(text)
+            "__version__ = \"{}\"\n".format(ext_version)]
+    with (open(os.path.join(TOP_DIR, 'onnxruntime_extensions/_version.py'), "w")) as _fver:
+        _fver.writelines(text)


-if sys.platform == "win32":
-    load_vsdevcmd()
-
 ext_modules = [
    setuptools.extension.Extension(
        name=str('onnxruntime_extensions._extensions_pydll'),
@ -255,7 +98,7 @@ setup(
    author_email='onnxruntime@microsoft.com',
    url='https://github.com/microsoft/onnxruntime-extensions',
    ext_modules=ext_modules,
-    cmdclass=dict(build_ext=BuildCMakeExt, build=Build),
+    cmdclass=_cmds.ortx_cmdclass,
    include_package_data=True,
    install_requires=read_requirements(),
    classifiers=[
--- a/test/cuda/test_cudaops.py
+++ b/test/cuda/test_cudaops.py
@ -0,0 +1,47 @@
+import unittest
+import numpy as np
+from numpy.testing import assert_almost_equal
+from onnx import helper, onnx_pb as onnx_proto
+from onnxruntime_extensions import make_onnx_model
+from onnxruntime_extensions import get_library_path as _get_library_path
+
+import onnxruntime as _ort
+
+
+class TestCudaOps(unittest.TestCase):
+    """Runs the custom ``NegPos`` operator through the CUDA execution provider."""
+
+    @staticmethod
+    def _create_test_model(domain='ai.onnx.contrib'):
+        """Build a model: ``x`` -> Identity -> NegPos -> (``neg``, ``pos``)."""
+        float_type = onnx_proto.TensorProto.FLOAT
+        x_info, neg_info, pos_info = (
+            helper.make_tensor_value_info(name, float_type, [])
+            for name in ('x', 'neg', 'pos'))
+
+        graph = helper.make_graph(
+            [
+                helper.make_node('Identity', ['x'], ['identity1']),
+                helper.make_node(
+                    'NegPos', ['identity1'], ['neg', 'pos'],
+                    domain=domain),
+            ],
+            'test0', [x_info], [neg_info, pos_info])
+        return make_onnx_model(graph)
+
+    def test_cuda_negpos(self):
+        """The two outputs of NegPos must add back up to the input."""
+        options = _ort.SessionOptions()
+        options.register_custom_ops_library(_get_library_path())
+        model = self._create_test_model()
+        self.assertIn('op_type: "NegPos"', str(model))
+        session = _ort.InferenceSession(
+            model.SerializeToString(),
+            options,
+            providers=['CUDAExecutionProvider'])
+        x = np.array([[0., 1., 1.5], [7., 8., -5.5]]).astype(np.float32)
+        neg, pos = session.run(None, {'x': x})
+        residual = x - (neg + pos)
+        assert_almost_equal(residual, np.zeros(residual.shape))
+
+
+if __name__ == "__main__":
+    unittest.main()