diff --git a/.pyproject/cmdclass.py b/.pyproject/cmdclass.py index 04a69120..16855a6b 100644 --- a/.pyproject/cmdclass.py +++ b/.pyproject/cmdclass.py @@ -38,9 +38,10 @@ def _load_cuda_version(): def _load_nvidia_smi(): try: - output = subprocess.check_output( + outputs = subprocess.check_output( ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader,nounits"], - stderr=subprocess.STDOUT).decode("utf-8") + stderr=subprocess.STDOUT).decode("utf-8").splitlines() + output = outputs[0] if outputs else "" arch = output.strip().replace('.', '') return arch if arch.isdigit() else None except (subprocess.CalledProcessError, OSError): diff --git a/cmake/ext_cuda.cmake b/cmake/ext_cuda.cmake index ac48dcb8..fddd3272 100644 --- a/cmake/ext_cuda.cmake +++ b/cmake/ext_cuda.cmake @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +find_package(CUDAToolkit) enable_language(CUDA) set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) diff --git a/cmake/ext_tests.cmake b/cmake/ext_tests.cmake index fe710401..95d257dc 100644 --- a/cmake/ext_tests.cmake +++ b/cmake/ext_tests.cmake @@ -60,7 +60,7 @@ function(add_test_target) gtest gmock) if(OCOS_USE_CUDA) - target_link_directories(${ARG_TARGET} PRIVATE $ENV{CUDA_PATH}/lib64) + target_link_directories(${ARG_TARGET} PRIVATE ${CUDAToolkit_LIBRARY_DIR}) endif() set(test_data_destination_root_directory ${onnxruntime_extensions_BINARY_DIR}) diff --git a/docs/development.md b/docs/development.md index 86e8610b..42a4cbd5 100644 --- a/docs/development.md +++ b/docs/development.md @@ -15,9 +15,10 @@ The package contains all custom operators and some Python scripts to manipulate - use-cuda: enable CUDA kernel build in Python package. - no-azure: disable AzureOp kernel build in Python package. - no-opencv: disable operators based on OpenCV in build. - - cc-debug: Generate debug info for extensions binaries and disable C/C++ compiler optimization. 
+ - cc-debug: generate debug info for extensions binaries and disable C/C++ compiler optimization. + - cuda-archs: specify the CUDA architectures (like 70, 85, etc.); multiple values can be combined with semicolons. The default value is the nvidia-smi output for GPU-0. - For example:`pip install . --config-settings "ortx-user-option=use-cuda,cc-debug" `, This command builds CUDA kernels into the package and installs it, accompanied by the generation of debug information. + For example:`pip install . --config-settings "ortx-user-option=use-cuda,cc-debug" `, This command builds CUDA kernels into the package and installs it, accompanied by the generation of debug information. Test: @@ -59,6 +60,9 @@ For any alternative scenarios, execute the following commands: The generated DLL or library is typically located in the `out//` directory. To validate the build, utilize the unit tests available in the `test/test_static_test` and `test/shared_test` directories. +**CUDA Build** +The CUDA build can be enabled with -DOCOS_USE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=<arch> + **VC Runtime static linkage** If you want to build the binary with VC Runtime static linkage, please add a parameter _-DCMAKE_MSVC_RUNTIME_LIBRARY="MultiThreaded$<$:Debug>"_ when running build.bat