[CUDA] consolidate CUDA versions (#5677)

* [ci] speed up if-else, swig, and lint conda setup * add 'source activate' * python constraint * start removing cuda v1 * comment out CI * remove more references * revert some unnecessary changes * revert a few more mistakes * revert another change that ignored params * sigh * remove CUDATreeLearner * fix tests, docs * fix quoting in setup.py * restore all CI * Apply suggestions from code review Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> * Apply suggestions from code review * completely remove cuda_exp, update docs --------- Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
2023-01-31 21:27:52 -06:00 · 2023-01-31 21:27:52 -06:00 · 4f47547c88
--- a/.ci/setup.sh
+++ b/.ci/setup.sh
@ -106,7 +106,7 @@ else  # Linux
            || exit -1
        fi
    fi
-    if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then
+    if [[ $TASK == "cuda" ]]; then
        echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
        apt-get update
        apt-get install --no-install-recommends -y \
--- a/.ci/test.sh
+++ b/.ci/test.sh
@ -201,41 +201,24 @@ if [[ $TASK == "gpu" ]]; then
    elif [[ $METHOD == "source" ]]; then
        cmake -DUSE_GPU=ON ..
    fi
-elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then
-    if [[ $TASK == "cuda" ]]; then
-        sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h
-        grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
-    else
-        sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h
-        grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
-        # by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests
-        sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h
-        grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
-    fi
+elif [[ $TASK == "cuda" ]]; then
+    sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h
+    grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
+    # by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests
+    sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h
+    grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
    if [[ $METHOD == "pip" ]]; then
        cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
-        if [[ $TASK == "cuda" ]]; then
-            pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1
-        else
-            pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1
-        fi
+        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1
        pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
        exit 0
    elif [[ $METHOD == "wheel" ]]; then
-        if [[ $TASK == "cuda" ]]; then
-            cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1
-        else
-            cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1
-        fi
+        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1
        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
        pytest $BUILD_DIRECTORY/tests || exit -1
        exit 0
    elif [[ $METHOD == "source" ]]; then
-        if [[ $TASK == "cuda" ]]; then
-            cmake -DUSE_CUDA=ON ..
-        else
-            cmake -DUSE_CUDA_EXP=ON ..
-        fi
+        cmake -DUSE_CUDA=ON ..
    fi
 elif [[ $TASK == "mpi" ]]; then
    if [[ $METHOD == "pip" ]]; then
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@ -28,31 +28,21 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - method: source
-            compiler: gcc
-            python_version: "3.8"
-            cuda_version: "11.7.1"
-            task: cuda
-          - method: pip
-            compiler: clang
-            python_version: "3.9"
-            cuda_version: "10.0"
-            task: cuda
          - method: wheel
            compiler: gcc
            python_version: "3.10"
-            cuda_version: "9.0"
+            cuda_version: "11.7.1"
            task: cuda
          - method: source
            compiler: gcc
            python_version: "3.8"
-            cuda_version: "11.7.1"
-            task: cuda_exp
+            cuda_version: "10.0"
+            task: cuda
          - method: pip
            compiler: clang
            python_version: "3.9"
-            cuda_version: "10.0"
-            task: cuda_exp
+            cuda_version: "11.7.1"
+            task: cuda
    steps:
      - name: Setup or update software on host machine
        run: |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,8 +4,7 @@ option(USE_GPU "Enable GPU-accelerated training" OFF)
 option(USE_SWIG "Enable SWIG to generate Java API" OFF)
 option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF)
 option(USE_TIMETAG "Set to ON to output time costs" OFF)
-option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF)
-option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" OFF)
+option(USE_CUDA "Enable CUDA-accelerated training " OFF)
 option(USE_DEBUG "Set to ON for Debug mode" OFF)
 option(USE_SANITIZER "Use santizer flags" OFF)
 set(
@ -31,7 +30,7 @@ elseif(USE_SWIG)
  cmake_minimum_required(VERSION 3.8)
 elseif(USE_GPU OR APPLE)
  cmake_minimum_required(VERSION 3.2)
-elseif(USE_CUDA OR USE_CUDA_EXP)
+elseif(USE_CUDA)
  cmake_minimum_required(VERSION 3.16)
 else()
  cmake_minimum_required(VERSION 3.0)
@ -137,7 +136,7 @@ else()
    add_definitions(-DUSE_SOCKET)
 endif()

-if(USE_CUDA OR USE_CUDA_EXP)
+if(USE_CUDA)
    set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
    enable_language(CUDA)
    set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
@ -192,12 +191,8 @@ if(__INTEGRATE_OPENCL)
    endif()
 endif()

-if(USE_CUDA OR USE_CUDA_EXP)
-    if(USE_CUDA)
-      find_package(CUDA 9.0 REQUIRED)
-    else()
-      find_package(CUDA 10.0 REQUIRED)
-    endif()
+if(USE_CUDA)
+    find_package(CUDA 10.0 REQUIRED)
    include_directories(${CUDA_INCLUDE_DIRS})
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@ -224,11 +219,7 @@ if(USE_CUDA OR USE_CUDA_EXP)
    endif()
    message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

-    if(USE_CUDA)
-      add_definitions(-DUSE_CUDA)
-    elseif(USE_CUDA_EXP)
-      add_definitions(-DUSE_CUDA_EXP)
-    endif()
+    add_definitions(-DUSE_CUDA)

    if(NOT DEFINED CMAKE_CUDA_STANDARD)
      set(CMAKE_CUDA_STANDARD 11)
@ -411,10 +402,8 @@ file(
      src/objective/*.cpp
      src/network/*.cpp
      src/treelearner/*.cpp
-if(USE_CUDA OR USE_CUDA_EXP)
+if(USE_CUDA)
      src/treelearner/*.cu
-endif()
-if(USE_CUDA_EXP)
      src/boosting/cuda/*.cpp
      src/boosting/cuda/*.cu
      src/metric/cuda/*.cpp
@ -549,7 +538,7 @@ if(__INTEGRATE_OPENCL)
  target_link_libraries(lightgbm_objs PUBLIC ${INTEGRATED_OPENCL_LIBRARIES} ${CMAKE_DL_LIBS})
 endif()

-if(USE_CUDA OR USE_CUDA_EXP)
+if(USE_CUDA)
  # Disable cmake warning about policy CMP0104. Refer to issue #3754 and PR #4268.
  # Custom target properties does not propagate, thus we need to specify for
  # each target that contains or depends on cuda source.
--- a/docs/Installation-Guide.rst
+++ b/docs/Installation-Guide.rst
@ -605,8 +605,8 @@ Docker

 Refer to `GPU Docker folder <https://github.com/microsoft/LightGBM/tree/master/docker/gpu>`__.

-Build CUDA Version (Experimental)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Build CUDA Version
+~~~~~~~~~~~~~~~~~~

 The `original GPU build <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL.

@ -621,7 +621,7 @@ On Linux a CUDA version of LightGBM can be built using **CUDA**, **CMake** and *

 The following dependencies should be installed before compilation:

-  **CUDA** 9.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers.
+-  **CUDA** 10.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers.

 -  **CMake** 3.16 or later.

@ -636,8 +636,6 @@ To build LightGBM CUDA version, run the following commands:
  cmake -DUSE_CUDA=1 ..
  make -j4

-Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries.
-
 **Note**: glibc >= 2.14 is required.

 **Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this).
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@ -205,9 +205,15 @@ Core Parameters

   -  **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors

-  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device``
+-  ``device_type`` :raw-html:`<a id="device_type" title="Permalink to this parameter" href="#device_type">&#x1F517;&#xFE0E;</a>`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device``

-   -  device for the tree learning, you can use GPU to achieve the faster learning
+   -  device for the tree learning
+
+   -  ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware
+
+   -  ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA
+
+   -  ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA

   -  **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up

@ -215,10 +221,6 @@ Core Parameters

   -  **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support

-   -  **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda``
-
-   -  **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future
-
 -  ``seed`` :raw-html:`<a id="seed" title="Permalink to this parameter" href="#seed">&#x1F517;&#xFE0E;</a>`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state``

   -  this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc.
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@ -480,13 +480,13 @@ class MultiValBin {

  virtual MultiValBin* Clone() = 0;

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  virtual const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const = 0;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };

 inline uint32_t BinMapper::ValueToBin(double value) const {
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@ -223,14 +223,15 @@ struct Config {

  // [doc-only]
  // type = enum
-  // options = cpu, gpu, cuda, cuda_exp
+  // options = cpu, gpu, cuda
  // alias = device
-  // desc = device for the tree learning, you can use GPU to achieve the faster learning
+  // desc = device for the tree learning
+  // desc = ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware
+  // desc = ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA
+  // desc = ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA
  // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up
  // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
  // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
-  // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda``
-  // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future
  std::string device_type = "cpu";

  // [doc-only]
--- a/include/LightGBM/cuda/cuda_algorithms.hpp
+++ b/include/LightGBM/cuda/cuda_algorithms.hpp
@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
 #define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <cuda.h>
 #include <cuda_runtime.h>
@ -577,5 +577,5 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
--- a/include/LightGBM/cuda/cuda_column_data.hpp
+++ b/include/LightGBM/cuda/cuda_column_data.hpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #ifndef LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
 #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
@ -137,4 +137,4 @@ class CUDAColumnData {

 #endif  // LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/include/LightGBM/cuda/cuda_metadata.hpp
+++ b/include/LightGBM/cuda/cuda_metadata.hpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
 #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
@ -55,4 +55,4 @@ class CUDAMetadata {

 #endif  // LIGHTGBM_CUDA_CUDA_METADATA_HPP_

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/include/LightGBM/cuda/cuda_metric.hpp
+++ b/include/LightGBM/cuda/cuda_metric.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_METRIC_HPP_
 #define LIGHTGBM_CUDA_CUDA_METRIC_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/metric.h>

@ -36,6 +36,6 @@ class CUDAMetricInterface: public HOST_METRIC {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_CUDA_CUDA_METRIC_HPP_
--- a/include/LightGBM/cuda/cuda_objective_function.hpp
+++ b/include/LightGBM/cuda/cuda_objective_function.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_
 #define LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_utils.h>
 #include <LightGBM/objective_function.h>
@ -73,6 +73,6 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_
--- a/include/LightGBM/cuda/cuda_random.hpp
+++ b/include/LightGBM/cuda/cuda_random.hpp
@ -5,7 +5,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_
 #define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <cuda.h>
 #include <cuda_runtime.h>
@ -69,6 +69,6 @@ class CUDARandom {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_
--- a/include/LightGBM/cuda/cuda_row_data.hpp
+++ b/include/LightGBM/cuda/cuda_row_data.hpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #ifndef LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
 #define LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_
@ -176,4 +176,4 @@ class CUDARowData {
 }  // namespace LightGBM
 #endif  // LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/include/LightGBM/cuda/cuda_split_info.hpp
+++ b/include/LightGBM/cuda/cuda_split_info.hpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_
 #define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_
@ -102,4 +102,4 @@ class CUDASplitInfo {

 #endif  // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/include/LightGBM/cuda/cuda_tree.hpp
+++ b/include/LightGBM/cuda/cuda_tree.hpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_
 #define LIGHTGBM_CUDA_CUDA_TREE_HPP_
@ -170,4 +170,4 @@ class CUDATree : public Tree {

 #endif  // LIGHTGBM_CUDA_CUDA_TREE_HPP_

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/include/LightGBM/cuda/cuda_utils.h
+++ b/include/LightGBM/cuda/cuda_utils.h
@ -6,20 +6,15 @@
 #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_
 #define LIGHTGBM_CUDA_CUDA_UTILS_H_

-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdio.h>
 #include <LightGBM/utils/log.h>
-#endif  // USE_CUDA || USE_CUDA_EXP
-
-#ifdef USE_CUDA_EXP
 #include <vector>
-#endif  // USE_CUDA_EXP

 namespace LightGBM {

-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
 #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
@ -27,9 +22,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =
    if (abort) exit(code);
  }
 }
-#endif  // USE_CUDA || USE_CUDA_EXP

-#ifdef USE_CUDA_EXP
 #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); }

 void SetCUDADevice(int gpu_device_id, const char* file, int line);
@ -184,8 +177,8 @@ class CUDAVector {
  size_t size_;
 };

-#endif  // USE_CUDA_EXP
-
 }  // namespace LightGBM

+#endif  // USE_CUDA
+
 #endif  // LIGHTGBM_CUDA_CUDA_UTILS_H_
--- a/include/LightGBM/cuda/vector_cudahost.h
+++ b/include/LightGBM/cuda/vector_cudahost.h
@ -7,7 +7,7 @@

 #include <LightGBM/utils/common.h>

-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif
@ -43,7 +43,7 @@ struct CHAllocator {
    T* ptr;
    if (n == 0) return NULL;
    n = SIZE_ALIGNED(n);
-    #if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+    #ifdef USE_CUDA
      if (LGBM_config_::current_device == lgbm_device_cuda) {
        cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable);
        if (ret != cudaSuccess) {
@ -62,7 +62,7 @@ struct CHAllocator {
  void deallocate(T* p, std::size_t n) {
    (void)n;  // UNUSED
    if (p == NULL) return;
-    #if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+    #ifdef USE_CUDA
      if (LGBM_config_::current_device == lgbm_device_cuda) {
        cudaPointerAttributes attributes;
        cudaPointerGetAttributes(&attributes, p);
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@ -277,13 +277,13 @@ class Metadata {
  /*! \brief Disable copy */
  Metadata(const Metadata&) = delete;

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA

  CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); }

  void CreateCUDAMetadata(const int gpu_device_id);

-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  /*! \brief Load wights from file */
@ -329,9 +329,9 @@ class Metadata {
  bool weight_load_from_file_;
  bool query_load_from_file_;
  bool init_score_load_from_file_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::unique_ptr<CUDAMetadata> cuda_metadata_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };


@ -910,13 +910,13 @@ class Dataset {
    return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index);
  }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA

  const CUDAColumnData* cuda_column_data() const {
    return cuda_column_data_.get();
  }

-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  void CreateCUDAColumnData();
@ -968,9 +968,9 @@ class Dataset {
  /*! \brief mutex for threading safe call */
  std::mutex mutex_;

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::unique_ptr<CUDAColumnData> cuda_column_data_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  std::string parser_config_str_;
 };
--- a/include/LightGBM/objective_function.h
+++ b/include/LightGBM/objective_function.h
@ -97,7 +97,7 @@ class ObjectiveFunction {
  */
  virtual bool IsCUDAObjective() const { return false; }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*!
  * \brief Convert output for CUDA version
  */
@ -107,7 +107,7 @@ class ObjectiveFunction {

  virtual bool NeedConvertOutputCUDA () const { return false; }

-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };

 }  // namespace LightGBM
--- a/include/LightGBM/sample_strategy.h
+++ b/include/LightGBM/sample_strategy.h
@ -38,9 +38,9 @@ class SampleStrategy {

  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>>& bag_data_indices() { return bag_data_indices_; }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  CUDAVector<data_size_t>& cuda_bag_data_indices() { return cuda_bag_data_indices_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) {
    objective_function_ = objective_function;
@ -72,10 +72,10 @@ class SampleStrategy {
  /*! \brief whether need to resize the gradient vectors */
  bool need_resize_gradients_;

-  #ifdef USE_CUDA_EXP
-  /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */
+  #ifdef USE_CUDA
+  /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda */
  CUDAVector<data_size_t> cuda_bag_data_indices_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };

 }  // namespace LightGBM
--- a/include/LightGBM/train_share_states.h
+++ b/include/LightGBM/train_share_states.h
@ -126,7 +126,7 @@ class MultiValBinWrapper {
  }


-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(
    uint8_t* bit_type,
    size_t* total_size,
@ -142,7 +142,7 @@ class MultiValBinWrapper {
      return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type);
    }
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  bool is_use_subcol_ = false;
@ -183,9 +183,9 @@ struct TrainingShareStates {

  const std::vector<uint32_t>& feature_hist_offsets() const { return feature_hist_offsets_; }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const std::vector<uint32_t>& column_hist_offsets() const { return column_hist_offsets_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  bool IsSparseRowwise() {
    return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse());
@ -235,7 +235,7 @@ struct TrainingShareStates {
  }


-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
@ -250,13 +250,13 @@ struct TrainingShareStates {
      return nullptr;
    }
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  std::vector<uint32_t> feature_hist_offsets_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::vector<uint32_t> column_hist_offsets_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  int num_hist_total_bin_ = 0;
  std::unique_ptr<MultiValBinWrapper> multi_val_bin_wrapper_;
  std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;
--- a/include/LightGBM/tree.h
+++ b/include/LightGBM/tree.h
@ -319,9 +319,9 @@ class Tree {

  inline bool is_linear() const { return is_linear_; }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  inline bool is_cuda_tree() const { return is_cuda_tree_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
@ -532,10 +532,10 @@ class Tree {
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*! \brief Marks whether this tree is a CUDATree */
  bool is_cuda_tree_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };

 inline void Tree::Split(int leaf, int feature, int real_feature,
--- a/python-package/README.rst
+++ b/python-package/README.rst
@ -121,11 +121,9 @@ Build CUDA Version

 All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well, and `CMake`_ (version 3.16 or higher) is strongly required.

-**CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-cuda-version-experimental>`__.
+**CUDA** library (version 10.0 or higher) is needed: details for installation can be found in `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-cuda-version>`__.

-Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue <https://github.com/microsoft/LightGBM/issues/5318>`__ for discussion). 
-
-To use the regular or experimental CUDA versions within Python, pass ``{"device": "cuda"}`` or ``{"device": "cuda_exp"}`` respectively as parameters.
+To use the CUDA version within Python, pass ``{"device": "cuda"}`` in parameters.

 Build HDFS Version
 ~~~~~~~~~~~~~~~~~~
@ -211,8 +209,6 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr

 Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.

-Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
-
 Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.

 Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.
--- a/python-package/setup.py
+++ b/python-package/setup.py
@ -21,7 +21,6 @@ LIGHTGBM_OPTIONS = [
    ('integrated-opencl', None, 'Compile integrated OpenCL version'),
    ('gpu', 'g', 'Compile GPU version'),
    ('cuda', None, 'Compile CUDA version'),
-    ('cuda-exp', None, 'Compile CUDA Experimental version'),
    ('mpi', None, 'Compile MPI version'),
    ('nomp', None, 'Compile version without OpenMP support'),
    ('hdfs', 'h', 'Compile HDFS version'),
@ -106,7 +105,6 @@ def compile_cpp(
    use_mingw: bool = False,
    use_gpu: bool = False,
    use_cuda: bool = False,
-    use_cuda_exp: bool = False,
    use_mpi: bool = False,
    use_hdfs: bool = False,
    boost_root: Optional[str] = None,
@ -148,8 +146,6 @@ def compile_cpp(
            cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}")
    elif use_cuda:
        cmake_cmd.append("-DUSE_CUDA=ON")
-    elif use_cuda_exp:
-        cmake_cmd.append("-DUSE_CUDA_EXP=ON")
    if use_mpi:
        cmake_cmd.append("-DUSE_MPI=ON")
    if nomp:
@ -171,7 +167,7 @@ def compile_cpp(
        else:
            status = 1
            lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll"
-            if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)):
+            if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)):
                logger.info("Starting to compile with MSBuild from existing solution file.")
                platform_toolsets = ("v143", "v142", "v141", "v140")
                for pt in platform_toolsets:
@ -235,7 +231,6 @@ class CustomInstall(install):
        self.integrated_opencl = False
        self.gpu = False
        self.cuda = False
-        self.cuda_exp = False
        self.boost_root = None
        self.boost_dir = None
        self.boost_include_dir = None
@ -260,7 +255,7 @@ class CustomInstall(install):
        LOG_PATH.touch()
        if not self.precompile:
            copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu)
-            compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi,
+            compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi,
                        use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir,
                        boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir,
                        opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library,
@ -281,7 +276,6 @@ class CustomBdistWheel(bdist_wheel):
        self.integrated_opencl = False
        self.gpu = False
        self.cuda = False
-        self.cuda_exp = False
        self.boost_root = None
        self.boost_dir = None
        self.boost_include_dir = None
@ -304,7 +298,6 @@ class CustomBdistWheel(bdist_wheel):
        install.integrated_opencl = self.integrated_opencl
        install.gpu = self.gpu
        install.cuda = self.cuda
-        install.cuda_exp = self.cuda_exp
        install.boost_root = self.boost_root
        install.boost_dir = self.boost_dir
        install.boost_include_dir = self.boost_include_dir
--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@ -36,7 +36,7 @@ Application::Application(int argc, char** argv) {
    Log::Fatal("No training/prediction data, application quit");
  }

-  if (config_.device_type == std::string("cuda") || config_.device_type == std::string("cuda_exp")) {
+  if (config_.device_type == std::string("cuda")) {
      LGBM_config_::current_device = lgbm_device_cuda;
  }
 }
--- a/src/boosting/bagging.hpp
+++ b/src/boosting/bagging.hpp
@ -47,33 +47,33 @@ class BaggingSampleStrategy : public SampleStrategy {
      Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
      // set bagging data to tree learner
      if (!is_use_subset_) {
-        #ifdef USE_CUDA_EXP
-        if (config_->device_type == std::string("cuda_exp")) {
+        #ifdef USE_CUDA
+        if (config_->device_type == std::string("cuda")) {
          CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
          tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
        } else {
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
          tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
        }
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
      } else {
        // get subset
        tmp_subset_->ReSize(bag_data_cnt_);
        tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
                                bag_data_cnt_, false);
-        #ifdef USE_CUDA_EXP
-        if (config_->device_type == std::string("cuda_exp")) {
+        #ifdef USE_CUDA
+        if (config_->device_type == std::string("cuda")) {
          CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
          tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
                                       bag_data_cnt_);
        } else {
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
          tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
                                       bag_data_cnt_);
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
        }
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
      }
    }
  }
@ -103,11 +103,11 @@ class BaggingSampleStrategy : public SampleStrategy {
        bag_data_cnt_ = static_cast<data_size_t>(config_->bagging_fraction * num_data_);
      }
      bag_data_indices_.resize(num_data_);
-      #ifdef USE_CUDA_EXP
-      if (config_->device_type == std::string("cuda_exp")) {
+      #ifdef USE_CUDA
+      if (config_->device_type == std::string("cuda")) {
        cuda_bag_data_indices_.Resize(num_data_);
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
      bagging_runner_.ReSize(num_data_);
      bagging_rands_.clear();
      for (int i = 0;
@ -118,7 +118,7 @@ class BaggingSampleStrategy : public SampleStrategy {
      double average_bag_rate =
          (static_cast<double>(bag_data_cnt_) / num_data_) / config_->bagging_freq;
      is_use_subset_ = false;
-      if (config_->device_type != std::string("cuda_exp")) {
+      if (config_->device_type != std::string("cuda")) {
        const int group_threshold_usesubset = 100;
        const double average_bag_rate_threshold = 0.5;
        if (average_bag_rate <= average_bag_rate_threshold
@ -141,9 +141,9 @@ class BaggingSampleStrategy : public SampleStrategy {
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      cuda_bag_data_indices_.Clear();
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
      bagging_runner_.ReSize(0);
      is_use_subset_ = false;
    }
--- a/src/boosting/cuda/cuda_score_updater.cpp
+++ b/src/boosting/cuda/cuda_score_updater.cpp
@ -5,7 +5,7 @@

 #include "cuda_score_updater.hpp"

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 namespace LightGBM {

@ -91,4 +91,4 @@ inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/boosting/cuda/cuda_score_updater.cu
+++ b/src/boosting/cuda/cuda_score_updater.cu
@ -5,7 +5,7 @@

 #include "cuda_score_updater.hpp"

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 namespace LightGBM {

@ -42,4 +42,4 @@ void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/boosting/cuda/cuda_score_updater.hpp
+++ b/src/boosting/cuda/cuda_score_updater.hpp
@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
 #define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_utils.h>

@ -60,6 +60,6 @@ class CUDAScoreUpdater: public ScoreUpdater {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@ -68,14 +68,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
  es_first_metric_only_ = config_->first_metric_only;
  shrinkage_rate_ = config_->learning_rate;

-  if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) {
+  if (config_->device_type == std::string("cuda")) {
    LGBM_config_::current_learner = use_cuda_learner;
-    #ifdef USE_CUDA_EXP
-    if (config_->device_type == std::string("cuda_exp")) {
+    #ifdef USE_CUDA
+    if (config_->device_type == std::string("cuda")) {
      const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0;
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id));
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
  }

  // load forced_splits file
@ -116,15 +116,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
  }
  training_metrics_.shrink_to_fit();

-  #ifdef USE_CUDA_EXP
-  if (config_->device_type == std::string("cuda_exp")) {
+  #ifdef USE_CUDA
+  if (config_->device_type == std::string("cuda")) {
    train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_));
  } else {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  num_data_ = train_data_->num_data();

@ -186,11 +186,11 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
  }
  // for a validation dataset, we need its score and metric
  auto new_score_updater =
-    #ifdef USE_CUDA_EXP
-    config_->device_type == std::string("cuda_exp") ?
+    #ifdef USE_CUDA
+    config_->device_type == std::string("cuda") ?
    std::unique_ptr<CUDAScoreUpdater>(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_,
      objective_function_ != nullptr && objective_function_->IsCUDAObjective())) :
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
    std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
  // update score
  for (int i = 0; i < iter_; ++i) {
@ -481,15 +481,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
    const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
    // we need to predict out-of-bag scores of data for boosting
    if (num_data_ - bag_data_cnt > 0) {
-      #ifdef USE_CUDA_EXP
-      if (config_->device_type == std::string("cuda_exp")) {
+      #ifdef USE_CUDA
+      if (config_->device_type == std::string("cuda")) {
        train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    }

  } else {
@ -503,17 +503,17 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
  }
 }

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const {
 #else
 std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t /*num_data*/) const {
-#endif  // USE_CUDA_EXP
-  #ifdef USE_CUDA_EXP
+#endif  // USE_CUDA
+  #ifdef USE_CUDA
  const bool evaluation_on_cuda = metric->IsCUDAMetric();
  if ((boosting_on_gpu_ && evaluation_on_cuda) || (!boosting_on_gpu_ && !evaluation_on_cuda)) {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    return metric->Eval(score, objective_function_);
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  } else if (boosting_on_gpu_ && !evaluation_on_cuda) {
    const size_t total_size = static_cast<size_t>(num_data) * static_cast<size_t>(num_tree_per_iteration_);
    if (total_size > host_score_.size()) {
@ -529,7 +529,7 @@ std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* scor
    CopyFromHostToCUDADevice<double>(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__);
    return metric->Eval(cuda_score_.RawData(), objective_function_);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 std::string GBDT::OutputMetric(int iter) {
@ -660,14 +660,14 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
    num_data = valid_score_updater_[used_idx]->num_data();
    *out_len = static_cast<int64_t>(num_data) * num_class_;
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::vector<double> host_raw_scores;
  if (boosting_on_gpu_) {
    host_raw_scores.resize(static_cast<size_t>(*out_len), 0.0);
    CopyFromCUDADeviceToHost<double>(host_raw_scores.data(), raw_scores, static_cast<size_t>(*out_len), __FILE__, __LINE__);
    raw_scores = host_raw_scores.data();
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  if (objective_function_ != nullptr) {
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_data; ++i) {
@ -730,26 +730,26 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
  }
  training_metrics_.shrink_to_fit();

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() &&
                    !data_sample_strategy_->IsHessianChange();  // for sample strategy with Hessian change, fall back to boosting on CPU
  tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_);
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  if (train_data != train_data_) {
    train_data_ = train_data;
    data_sample_strategy_->UpdateTrainingData(train_data);
    // not same training data, need reset score and others
    // create score tracker
-    #ifdef USE_CUDA_EXP
-    if (config_->device_type == std::string("cuda_exp")) {
+    #ifdef USE_CUDA
+    if (config_->device_type == std::string("cuda")) {
      train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_));
    } else {
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
      train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA

    // update score
    for (int i = 0; i < iter_; ++i) {
@ -827,8 +827,8 @@ void GBDT::ResetGradientBuffers() {
  const bool is_use_subset = data_sample_strategy_->is_use_subset();
  const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
  if (objective_function_ != nullptr) {
-    #ifdef USE_CUDA_EXP
-    if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) {
+    #ifdef USE_CUDA
+    if (config_->device_type == std::string("cuda") && boosting_on_gpu_) {
      if (cuda_gradients_.Size() < total_size) {
        cuda_gradients_.Resize(total_size);
        cuda_hessians_.Resize(total_size);
@ -836,16 +836,16 @@ void GBDT::ResetGradientBuffers() {
      gradients_pointer_ = cuda_gradients_.RawData();
      hessians_pointer_ = cuda_hessians_.RawData();
    } else {
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
      if (gradients_.size() < total_size) {
        gradients_.resize(total_size);
        hessians_.resize(total_size);
      }
      gradients_pointer_ = gradients_.data();
      hessians_pointer_ = hessians_.data();
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
  } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) {
    if (gradients_.size() < total_size) {
      gradients_.resize(total_size);
--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@ -542,7 +542,7 @@ class GBDT : public GBDTBase {
  /*! \brief Parser config file content */
  std::string parser_config_str_ = "";

-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#ifdef USE_CUDA
  /*! \brief First order derivative of training data */
  std::vector<score_t, CHAllocator<score_t>> gradients_;
  /*! \brief Second order derivative of training data */
@ -557,18 +557,18 @@ class GBDT : public GBDTBase {
  score_t* gradients_pointer_;
  /*! \brief Pointer to hessian vector, can be on CPU or GPU */
  score_t* hessians_pointer_;
-  /*! \brief Whether boosting is done on GPU, used for cuda_exp */
+  /*! \brief Whether boosting is done on GPU, used for device_type=cuda */
  bool boosting_on_gpu_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*! \brief Gradient vector on GPU */
  CUDAVector<score_t> cuda_gradients_;
  /*! \brief Hessian vector on GPU */
  CUDAVector<score_t> cuda_hessians_;
-  /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */
+  /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with device_type=cuda */
  mutable std::vector<double> host_score_;
-  /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */
+  /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with device_type=cuda */
  mutable CUDAVector<double> cuda_score_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  /*! \brief Number of training data */
  data_size_t num_data_;
--- a/src/boosting/goss.hpp
+++ b/src/boosting/goss.hpp
@ -43,33 +43,33 @@ class GOSSStrategy : public SampleStrategy {
    bag_data_cnt_ = left_cnt;
    // set bagging data to tree learner
    if (!is_use_subset_) {
-      #ifdef USE_CUDA_EXP
-      if (config_->device_type == std::string("cuda_exp")) {
+      #ifdef USE_CUDA
+      if (config_->device_type == std::string("cuda")) {
        CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
        tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    } else {
      // get subset
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
                              bag_data_cnt_, false);
-      #ifdef USE_CUDA_EXP
-      if (config_->device_type == std::string("cuda_exp")) {
+      #ifdef USE_CUDA
+      if (config_->device_type == std::string("cuda")) {
        CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
        tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
                                      bag_data_cnt_);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
                                     bag_data_cnt_);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    }
  }

--- a/src/cuda/cuda_algorithms.cu
+++ b/src/cuda/cuda_algorithms.cu
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_algorithms.hpp>

@ -509,4 +509,4 @@ template __device__ double PercentileDevice<double, data_size_t, label_t, double

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/cuda/cuda_utils.cpp
+++ b/src/cuda/cuda_utils.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_utils.h>

@ -28,4 +28,4 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@ -886,7 +886,7 @@ namespace LightGBM {
    return nullptr;
  }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  template <>
  const void* MultiValDenseBin<uint8_t>::GetRowWiseData(uint8_t* bit_type,
      size_t* total_size,
@ -1081,6 +1081,6 @@ namespace LightGBM {
    return to_return;
  }

-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 }  // namespace LightGBM
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@ -177,8 +177,6 @@ void GetDeviceType(const std::unordered_map<std::string, std::string>& params, s
      *device_type = "gpu";
    } else if (value == std::string("cuda")) {
      *device_type = "cuda";
-    } else if (value == std::string("cuda_exp")) {
-      *device_type = "cuda_exp";
    } else {
      Log::Fatal("Unknown device type %s", value.c_str());
    }
@ -260,7 +258,7 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
  GetObjectiveType(params, &objective);
  GetMetricType(params, objective, &metric);
  GetDeviceType(params, &device_type);
-  if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) {
+  if (device_type == std::string("cuda")) {
    LGBM_config_::current_device = lgbm_device_cuda;
  }
  GetTreeLearnerType(params, &tree_learner);
@ -373,26 +371,21 @@ void Config::CheckParamConflict() {
      num_leaves = static_cast<int>(full_num_leaves);
    }
  }
-  if (device_type == std::string("gpu") || device_type == std::string("cuda")) {
+  if (device_type == std::string("gpu")) {
    // force col-wise for gpu, and cuda version
    force_col_wise = true;
    force_row_wise = false;
    if (deterministic) {
      Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
    }
-  } else if (device_type == std::string("cuda_exp")) {
-    // force row-wise for cuda_exp version
+  } else if (device_type == std::string("cuda")) {
+    // force row-wise for cuda version
    force_col_wise = false;
    force_row_wise = true;
    if (deterministic) {
      Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
    }
  }
-  // force gpu_use_dp for CUDA
-  if (device_type == std::string("cuda") && !gpu_use_dp) {
-    Log::Warning("CUDA currently requires double precision calculations.");
-    gpu_use_dp = true;
-  }
  // linear tree learner must be serial type and run on CPU device
  if (linear_tree) {
    if (device_type != std::string("cpu")) {
--- a/src/io/cuda/cuda_column_data.cpp
+++ b/src/io/cuda/cuda_column_data.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_column_data.hpp>

@ -308,4 +308,4 @@ void CUDAColumnData::InitColumnMetaInfo() {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_column_data.cu
+++ b/src/io/cuda/cuda_column_data.cu
@ -4,7 +4,7 @@
 */


-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_column_data.hpp>

@ -58,4 +58,4 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column)

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_metadata.cpp
+++ b/src/io/cuda/cuda_metadata.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_metadata.hpp>

@ -89,4 +89,4 @@ void CUDAMetadata::SetInitScore(const double* init_score, data_size_t len) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_row_data.cpp
+++ b/src/io/cuda/cuda_row_data.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_row_data.hpp>

@ -474,4 +474,4 @@ template const uint64_t* CUDARowData::GetPartitionPtr<uint64_t>() const;

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_tree.cpp
+++ b/src/io/cuda/cuda_tree.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_tree.hpp>

@ -337,4 +337,4 @@ void CUDATree::AsConstantTree(double val) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_tree.cu
+++ b/src/io/cuda/cuda_tree.cu
@ -4,7 +4,7 @@
 */


-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_tree.hpp>

@ -456,4 +456,4 @@ void CUDATree::LaunchAddPredictionToScoreKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@ -345,9 +345,9 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
  auto features_in_group = OneFeaturePerGroup(used_features);

  auto is_sparse = io_config.is_enable_sparse;
-  if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) {
+  if (io_config.device_type == std::string("cuda")) {
      LGBM_config_::current_device = lgbm_device_cuda;
-      if ((io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) && is_sparse) {
+      if ((io_config.device_type == std::string("cuda")) && is_sparse) {
        Log::Warning("Using sparse features with CUDA is currently not supported.");
        is_sparse = false;
      }
@ -355,8 +355,7 @@ void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,

  std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
  if (io_config.enable_bundle && !used_features.empty()) {
-    bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda")
-      || io_config.device_type == std::string("cuda_exp");
+    bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda");
    features_in_group = FastFeatureBundling(
        *bin_mappers, sample_non_zero_indices, sample_values, num_per_col,
        num_sample_col, static_cast<data_size_t>(total_sample_cnt),
@ -447,14 +446,14 @@ void Dataset::FinishLoad() {
  }
  metadata_.FinishLoad();

-  #ifdef USE_CUDA_EXP
-  if (device_type_ == std::string("cuda_exp")) {
+  #ifdef USE_CUDA
+  if (device_type_ == std::string("cuda")) {
    CreateCUDAColumnData();
    metadata_.CreateCUDAMetadata(gpu_device_id_);
  } else {
    cuda_column_data_.reset(nullptr);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  is_finish_load_ = true;
 }

@ -862,15 +861,15 @@ void Dataset::CopySubrow(const Dataset* fullset,
  device_type_ = fullset->device_type_;
  gpu_device_id_ = fullset->gpu_device_id_;

-  #ifdef USE_CUDA_EXP
-  if (device_type_ == std::string("cuda_exp")) {
+  #ifdef USE_CUDA
+  if (device_type_ == std::string("cuda")) {
    if (cuda_column_data_ == nullptr) {
      cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_));
      metadata_.CreateCUDAMetadata(gpu_device_id_);
    }
    cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 bool Dataset::SetFloatField(const char* field_name, const float* field_data,
@ -1508,13 +1507,13 @@ void Dataset::AddFeaturesFrom(Dataset* other) {
      raw_data_.push_back(other->raw_data_[i]);
    }
  }
-  #ifdef USE_CUDA_EXP
-  if (device_type_ == std::string("cuda_exp")) {
+  #ifdef USE_CUDA
+  if (device_type_ == std::string("cuda")) {
    CreateCUDAColumnData();
  } else {
    cuda_column_data_ = nullptr;
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 const void* Dataset::GetColWiseData(
@ -1536,7 +1535,7 @@ const void* Dataset::GetColWiseData(
  return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator);
 }

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 void Dataset::CreateCUDAColumnData() {
  cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_));
  int num_columns = 0;
@ -1671,6 +1670,6 @@ void Dataset::CreateCUDAColumnData() {
                          feature_to_column);
 }

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 }  // namespace LightGBM
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@ -279,14 +279,14 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac

    dataset->device_type_ = config_.device_type;
    dataset->gpu_device_id_ = config_.gpu_device_id;
-    #ifdef USE_CUDA_EXP
-    if (config_.device_type == std::string("cuda_exp")) {
+    #ifdef USE_CUDA
+    if (config_.device_type == std::string("cuda")) {
      dataset->CreateCUDAColumnData();
      dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_);
    } else {
      dataset->cuda_column_data_ = nullptr;
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
  }
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
--- a/src/io/dense_bin.hpp
+++ b/src/io/dense_bin.hpp
@ -467,7 +467,7 @@ class DenseBin : public Bin {

 private:
  data_size_t num_data_;
-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#ifdef USE_CUDA
  std::vector<VAL_T, CHAllocator<VAL_T>> data_;
 #else
  std::vector<VAL_T, Common::AlignmentAllocator<VAL_T, kAlignedSize>> data_;
--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@ -18,9 +18,9 @@ Metadata::Metadata() {
  weight_load_from_file_ = false;
  query_load_from_file_ = false;
  init_score_load_from_file_ = false;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  cuda_metadata_ = nullptr;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void Metadata::Init(const char* data_filename) {
@ -344,11 +344,11 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) {
    init_score_[i] = Common::AvoidInf(init_score[i]);
  }
  init_score_load_from_file_ = false;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  if (cuda_metadata_ != nullptr) {
    cuda_metadata_->SetInitScore(init_score_.data(), len);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) {
@ -387,11 +387,11 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) {
  for (data_size_t i = 0; i < num_data_; ++i) {
    label_[i] = Common::AvoidInf(label[i]);
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  if (cuda_metadata_ != nullptr) {
    cuda_metadata_->SetLabel(label_.data(), len);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len) {
@ -428,11 +428,11 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) {
  }
  CalculateQueryWeights();
  weight_load_from_file_ = false;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  if (cuda_metadata_ != nullptr) {
    cuda_metadata_->SetWeights(weights_.data(), len);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len) {
@ -477,7 +477,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
  }
  CalculateQueryWeights();
  query_load_from_file_ = false;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  if (cuda_metadata_ != nullptr) {
    if (query_weights_.size() > 0) {
      CHECK_EQ(query_weights_.size(), static_cast<size_t>(num_queries_));
@ -486,7 +486,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
      cuda_metadata_->SetQuery(query_boundaries_.data(), nullptr, num_queries_);
    }
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) {
@ -635,12 +635,12 @@ void Metadata::FinishLoad() {
  CalculateQueryBoundaries();
 }

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 void Metadata::CreateCUDAMetadata(const int gpu_device_id) {
  cuda_metadata_.reset(new CUDAMetadata(gpu_device_id));
  cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_);
 }
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 void Metadata::LoadFromMemory(const void* memory) {
  const char* mem_ptr = reinterpret_cast<const char*>(memory);
--- a/src/io/multi_val_dense_bin.hpp
+++ b/src/io/multi_val_dense_bin.hpp
@ -211,13 +211,13 @@ class MultiValDenseBin : public MultiValBin {

  MultiValDenseBin<VAL_T>* Clone() override;

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const override;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  data_size_t num_data_;
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@ -292,13 +292,13 @@ class MultiValSparseBin : public MultiValBin {
  MultiValSparseBin<INDEX_T, VAL_T>* Clone() override;


-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
    const void** out_data_ptr,
    uint8_t* data_ptr_bit_type) const override;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

 private:
  data_size_t num_data_;
--- a/src/io/train_share_states.cpp
+++ b/src/io/train_share_states.cpp
@ -382,9 +382,9 @@ void TrainingShareStates::CalcBinOffsets(const std::vector<std::unique_ptr<Featu
    }
    num_hist_total_bin_ = static_cast<int>(feature_hist_offsets_.back());
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  column_hist_offsets_ = *offsets;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data,
--- a/src/io/tree.cpp
+++ b/src/io/tree.cpp
@ -53,9 +53,9 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear)
    leaf_features_.resize(max_leaves_);
    leaf_features_inner_.resize(max_leaves_);
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  is_cuda_tree_ = false;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }

 int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin,
@ -731,9 +731,9 @@ Tree::Tree(const char* str, size_t* used_len) {
    is_linear_ = false;
  }

-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  is_cuda_tree_ = false;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA

  if ((num_leaves_ <= 1) && !is_linear_) {
    return;
--- a/src/metric/cuda/cuda_binary_metric.cpp
+++ b/src/metric/cuda/cuda_binary_metric.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_binary_metric.hpp"

@ -28,4 +28,4 @@ std::vector<double> CUDABinaryMetricInterface<HOST_METRIC, CUDA_METRIC>::Eval(co

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/metric/cuda/cuda_binary_metric.hpp
+++ b/src/metric/cuda/cuda_binary_metric.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_
 #define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_metric.hpp>
 #include <LightGBM/cuda/cuda_utils.h>
@ -52,6 +52,6 @@ class CUDABinaryLoglossMetric: public CUDABinaryMetricInterface<BinaryLoglossMet

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_
--- a/src/metric/cuda/cuda_pointwise_metric.cpp
+++ b/src/metric/cuda/cuda_pointwise_metric.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_binary_metric.hpp"
 #include "cuda_pointwise_metric.hpp"
@ -35,4 +35,4 @@ template void CUDAPointwiseMetricInterface<BinaryLoglossMetric, CUDABinaryLoglos

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/metric/cuda/cuda_pointwise_metric.cu
+++ b/src/metric/cuda/cuda_pointwise_metric.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_algorithms.hpp>

@ -66,4 +66,4 @@ template void CUDAPointwiseMetricInterface<BinaryLoglossMetric, CUDABinaryLoglos

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/metric/cuda/cuda_pointwise_metric.hpp
+++ b/src/metric/cuda/cuda_pointwise_metric.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_
 #define LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_metric.hpp>
 #include <LightGBM/cuda/cuda_utils.h>
@ -38,6 +38,6 @@ class CUDAPointwiseMetricInterface: public CUDAMetricInterface<HOST_METRIC> {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_
--- a/src/metric/cuda/cuda_regression_metric.cpp
+++ b/src/metric/cuda/cuda_regression_metric.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <vector>

@ -31,4 +31,4 @@ CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetricInterface<

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/metric/cuda/cuda_regression_metric.hpp
+++ b/src/metric/cuda/cuda_regression_metric.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_
 #define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_metric.hpp>
 #include <LightGBM/cuda/cuda_utils.h>
@ -54,6 +54,6 @@ class CUDAL2Metric : public CUDARegressionMetricInterface<L2Metric, CUDAL2Metric

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_
--- a/src/metric/metric.cpp
+++ b/src/metric/metric.cpp
@ -17,77 +17,77 @@
 namespace LightGBM {

 Metric* Metric::CreateMetric(const std::string& type, const Config& config) {
-  #ifdef USE_CUDA_EXP
-  if (config.device_type == std::string("cuda_exp") && config.boosting == std::string("gbdt")) {
+  #ifdef USE_CUDA
+  if (config.device_type == std::string("cuda") && config.boosting == std::string("gbdt")) {
    if (type == std::string("l2")) {
      return new CUDAL2Metric(config);
    } else if (type == std::string("rmse")) {
      return new CUDARMSEMetric(config);
    } else if (type == std::string("l1")) {
-      Log::Warning("Metric l1 is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric l1 is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new L1Metric(config);
    } else if (type == std::string("quantile")) {
-      Log::Warning("Metric quantile is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric quantile is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new QuantileMetric(config);
    } else if (type == std::string("huber")) {
-      Log::Warning("Metric huber is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric huber is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new HuberLossMetric(config);
    } else if (type == std::string("fair")) {
-      Log::Warning("Metric fair is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric fair is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new FairLossMetric(config);
    } else if (type == std::string("poisson")) {
-      Log::Warning("Metric poisson is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric poisson is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new PoissonMetric(config);
    } else if (type == std::string("binary_logloss")) {
      return new CUDABinaryLoglossMetric(config);
    } else if (type == std::string("binary_error")) {
-      Log::Warning("Metric binary_error is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric binary_error is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new BinaryErrorMetric(config);
    } else if (type == std::string("auc")) {
-      Log::Warning("Metric auc is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new AUCMetric(config);
    } else if (type == std::string("average_precision")) {
-      Log::Warning("Metric average_precision is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric average_precision is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new AveragePrecisionMetric(config);
    } else if (type == std::string("auc_mu")) {
-      Log::Warning("Metric auc_mu is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric auc_mu is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new AucMuMetric(config);
    } else if (type == std::string("ndcg")) {
-      Log::Warning("Metric ndcg is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric ndcg is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new NDCGMetric(config);
    } else if (type == std::string("map")) {
-      Log::Warning("Metric map is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric map is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new MapMetric(config);
    } else if (type == std::string("multi_logloss")) {
-      Log::Warning("Metric multi_logloss is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric multi_logloss is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new MultiSoftmaxLoglossMetric(config);
    } else if (type == std::string("multi_error")) {
-      Log::Warning("Metric multi_error is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric multi_error is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new MultiErrorMetric(config);
    } else if (type == std::string("cross_entropy")) {
-      Log::Warning("Metric cross_entropy is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric cross_entropy is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new CrossEntropyMetric(config);
    } else if (type == std::string("cross_entropy_lambda")) {
-      Log::Warning("Metric cross_entropy_lambda is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric cross_entropy_lambda is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new CrossEntropyLambdaMetric(config);
    } else if (type == std::string("kullback_leibler")) {
-      Log::Warning("Metric kullback_leibler is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric kullback_leibler is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new KullbackLeiblerDivergence(config);
    } else if (type == std::string("mape")) {
-      Log::Warning("Metric mape is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric mape is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new MAPEMetric(config);
    } else if (type == std::string("gamma")) {
-      Log::Warning("Metric gamma is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric gamma is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new GammaMetric(config);
    } else if (type == std::string("gamma_deviance")) {
-      Log::Warning("Metric gamma_deviance is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric gamma_deviance is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new GammaDevianceMetric(config);
    } else if (type == std::string("tweedie")) {
-      Log::Warning("Metric tweedie is not implemented in cuda_exp version. Fall back to evaluation on CPU.");
+      Log::Warning("Metric tweedie is not implemented in cuda version. Fall back to evaluation on CPU.");
      return new TweedieMetric(config);
    }
  } else {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    if (type == std::string("l2")) {
      return new L2Metric(config);
    } else if (type == std::string("rmse")) {
@ -135,9 +135,9 @@ Metric* Metric::CreateMetric(const std::string& type, const Config& config) {
    } else if (type == std::string("tweedie")) {
      return new TweedieMetric(config);
    }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  return nullptr;
 }

--- a/src/objective/cuda/cuda_binary_objective.cpp
+++ b/src/objective/cuda/cuda_binary_objective.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_binary_objective.hpp"

@ -61,4 +61,4 @@ void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_binary_objective.cu
+++ b/src/objective/cuda/cuda_binary_objective.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <algorithm>

@ -206,4 +206,4 @@ void CUDABinaryLogloss::LaunchResetOVACUDALabelKernel() const {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_binary_objective.hpp
+++ b/src/objective/cuda/cuda_binary_objective.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_
 #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024)
 #define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024)
@ -58,6 +58,6 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface<BinaryLogloss> {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA

 #endif  // LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_
--- a/src/objective/cuda/cuda_multiclass_objective.cpp
+++ b/src/objective/cuda/cuda_multiclass_objective.cpp
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_multiclass_objective.hpp"

@ -59,4 +59,4 @@ const double* CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, c

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_multiclass_objective.cu
+++ b/src/objective/cuda/cuda_multiclass_objective.cu
@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <algorithm>

@ -105,4 +105,4 @@ const double* CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_multiclass_objective.hpp
+++ b/src/objective/cuda/cuda_multiclass_objective.hpp
@ -5,7 +5,7 @@
 #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_
 #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_objective_function.hpp>

@ -74,5 +74,5 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface<MulticlassOVA> {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_
--- a/src/objective/cuda/cuda_rank_objective.cpp
+++ b/src/objective/cuda/cuda_rank_objective.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <string>
 #include <vector>
@ -64,4 +64,4 @@ void CUDARankXENDCG::GenerateItemRands() const {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_rank_objective.cu
+++ b/src/objective/cuda/cuda_rank_objective.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_rank_objective.hpp"

@ -658,4 +658,4 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_rank_objective.hpp
+++ b/src/objective/cuda/cuda_rank_objective.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_
 #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #define NUM_QUERY_PER_BLOCK (10)

@ -118,5 +118,5 @@ class CUDARankXENDCG : public CUDALambdaRankObjectiveInterface<RankXENDCG> {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_
--- a/src/objective/cuda/cuda_regression_objective.cpp
+++ b/src/objective/cuda/cuda_regression_objective.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_regression_objective.hpp"

@ -85,4 +85,4 @@ double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel(const int class_id)

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_regression_objective.cu
+++ b/src/objective/cuda/cuda_regression_objective.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_regression_objective.hpp"
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@ -353,4 +353,4 @@ const double* CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const dat

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/objective/cuda/cuda_regression_objective.hpp
+++ b/src/objective/cuda/cuda_regression_objective.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_
 #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024)

@ -135,5 +135,5 @@ class CUDARegressionPoissonLoss : public CUDARegressionObjectiveInterface<Regres

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_
--- a/src/objective/objective_function.cpp
+++ b/src/objective/objective_function.cpp
@ -18,8 +18,8 @@
 namespace LightGBM {

 ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) {
-  #ifdef USE_CUDA_EXP
-  if (config.device_type == std::string("cuda_exp") &&
+  #ifdef USE_CUDA
+  if (config.device_type == std::string("cuda") &&
      config.data_sample_strategy != std::string("goss") &&
      config.boosting != std::string("rf")) {
    if (type == std::string("regression")) {
@ -27,7 +27,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
    } else if (type == std::string("regression_l1")) {
      return new CUDARegressionL1loss(config);
    } else if (type == std::string("quantile")) {
-      Log::Warning("Objective quantile is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective quantile is not implemented in cuda version. Fall back to boosting on CPU.");
      return new RegressionQuantileloss(config);
    } else if (type == std::string("huber")) {
      return new CUDARegressionHuberLoss(config);
@ -46,26 +46,26 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
    } else if (type == std::string("multiclassova")) {
      return new CUDAMulticlassOVA(config);
    } else if (type == std::string("cross_entropy")) {
-      Log::Warning("Objective cross_entropy is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU.");
      return new CrossEntropy(config);
    } else if (type == std::string("cross_entropy_lambda")) {
-      Log::Warning("Objective cross_entropy_lambda is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU.");
      return new CrossEntropyLambda(config);
    } else if (type == std::string("mape")) {
-      Log::Warning("Objective mape is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective mape is not implemented in cuda version. Fall back to boosting on CPU.");
      return new RegressionMAPELOSS(config);
    } else if (type == std::string("gamma")) {
-      Log::Warning("Objective gamma is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU.");
      return new RegressionGammaLoss(config);
    } else if (type == std::string("tweedie")) {
-      Log::Warning("Objective tweedie is not implemented in cuda_exp version. Fall back to boosting on CPU.");
+      Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU.");
      return new RegressionTweedieLoss(config);
    } else if (type == std::string("custom")) {
-      Log::Warning("Using customized objective with cuda_exp. This requires copying gradients from CPU to GPU, which can be slow.");
+      Log::Warning("Using customized objective with cuda. This requires copying gradients from CPU to GPU, which can be slow.");
      return nullptr;
    }
  } else {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    if (type == std::string("regression")) {
      return new RegressionL2loss(config);
    } else if (type == std::string("regression_l1")) {
@ -101,9 +101,9 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
    } else if (type == std::string("custom")) {
      return nullptr;
    }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  Log::Fatal("Unknown objective type name: %s", type.c_str());
  return nullptr;
 }
--- a/src/treelearner/cuda/cuda_best_split_finder.cpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <algorithm>

@ -383,4 +383,4 @@ void CUDABestSplitFinder::SetUsedFeatureByNode(const std::vector<int8_t>& is_fea

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_best_split_finder.cu
+++ b/src/treelearner/cuda/cuda_best_split_finder.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <algorithm>

@ -1802,4 +1802,4 @@ void CUDABestSplitFinder::LaunchInitCUDARandomKernel() {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_best_split_finder.hpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.hpp
@ -7,7 +7,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/bin.h>
 #include <LightGBM/dataset.h>
@ -211,5 +211,5 @@ class CUDABestSplitFinder {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_
--- a/src/treelearner/cuda/cuda_data_partition.cpp
+++ b/src/treelearner/cuda/cuda_data_partition.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <algorithm>
 #include <memory>
@ -370,4 +370,4 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_data_partition.cu
+++ b/src/treelearner/cuda/cuda_data_partition.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_data_partition.hpp"

@ -1071,4 +1071,4 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_data_partition.hpp
+++ b/src/treelearner/cuda/cuda_data_partition.hpp
@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/bin.h>
 #include <LightGBM/meta.h>
@ -384,5 +384,5 @@ class CUDADataPartition {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_
--- a/src/treelearner/cuda/cuda_histogram_constructor.cpp
+++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_histogram_constructor.hpp"

@ -193,4 +193,4 @@ void CUDAHistogramConstructor::ResetConfig(const Config* config) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_histogram_constructor.cu
+++ b/src/treelearner/cuda/cuda_histogram_constructor.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_histogram_constructor.hpp"

@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_histogram_constructor.hpp
+++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp
@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_row_data.hpp>
 #include <LightGBM/feature_group.h>
@ -165,5 +165,5 @@ class CUDAHistogramConstructor {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
--- a/src/treelearner/cuda/cuda_leaf_splits.cpp
+++ b/src/treelearner/cuda/cuda_leaf_splits.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"

@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_leaf_splits.cu
+++ b/src/treelearner/cuda/cuda_leaf_splits.cu
@ -5,7 +5,7 @@
 */


-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_leaf_splits.hpp
+++ b/src/treelearner/cuda/cuda_leaf_splits.hpp
@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_utils.h>
 #include <LightGBM/bin.h>
@ -156,5 +156,5 @@ class CUDALeafSplits {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_single_gpu_tree_learner.hpp"

@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu
@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_algorithms.hpp>

@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
@ -9,7 +9,7 @@
 #include <memory>
 #include <vector>

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"
 #include "cuda_histogram_constructor.hpp"
@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {

 }  // namespace LightGBM

-#else  // USE_CUDA_EXP
+#else  // USE_CUDA

 // When GPU support is not compiled in, quit with an error message

@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
 public:
    #pragma warning(disable : 4702)
    explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
-      Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n"
-                 "Please recompile with CMake option -DUSE_CUDA_EXP=1");
+      Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
+                 "Please recompile with CMake option -DUSE_CUDA=1");
    }
 };

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_
--- a/src/treelearner/cuda_kernel_launcher.cu
+++ b/src/treelearner/cuda_kernel_launcher.cu
@ -1,171 +0,0 @@
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifdef USE_CUDA
-
-#include "cuda_kernel_launcher.h"
-
-#include <LightGBM/utils/log.h>
-
-#include <cuda_runtime.h>
-
-#include <cstdio>
-
-namespace LightGBM {
-
-void cuda_histogram(
-                int             histogram_size,
-                data_size_t     leaf_num_data,
-                data_size_t     num_data,
-                bool            use_all_features,
-                bool            is_constant_hessian,
-                int             num_workgroups,
-                cudaStream_t    stream,
-                uint8_t*        arg0,
-                uint8_t*        arg1,
-                data_size_t     arg2,
-                data_size_t*    arg3,
-                data_size_t     arg4,
-                score_t*        arg5,
-                score_t*        arg6,
-                score_t         arg6_const,
-                char*           arg7,
-                volatile int*   arg8,
-                void*           arg9,
-                size_t          exp_workgroups_per_feature) {
-  if (histogram_size == 16) {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-           histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-           histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-           histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  } else if (histogram_size == 64) {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  } else {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  }
-}
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
--- a/src/treelearner/cuda_kernel_launcher.h
+++ b/src/treelearner/cuda_kernel_launcher.h
@ -1,70 +0,0 @@
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
-#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
-
-#ifdef USE_CUDA
-#include <chrono>
-#include "kernels/histogram_16_64_256.hu"  // kernel, acc_type, data_size_t, uchar, score_t
-
-namespace LightGBM {
-
-struct ThreadData {
-          // device id
-          int             device_id;
-          // parameters for cuda_histogram
-          int             histogram_size;
-          data_size_t     leaf_num_data;
-          data_size_t     num_data;
-          bool            use_all_features;
-          bool            is_constant_hessian;
-          int             num_workgroups;
-          cudaStream_t    stream;
-          uint8_t*        device_features;
-          uint8_t*        device_feature_masks;
-          data_size_t*    device_data_indices;
-          score_t*        device_gradients;
-          score_t*        device_hessians;
-          score_t         hessians_const;
-          char*           device_subhistograms;
-          volatile int*   sync_counters;
-          void*           device_histogram_outputs;
-          size_t          exp_workgroups_per_feature;
-          // cuda events
-          cudaEvent_t*    kernel_start;
-          cudaEvent_t*    kernel_wait_obj;
-          std::chrono::duration<double, std::milli>* kernel_input_wait_time;
-          // copy histogram
-          size_t        output_size;
-          char*                 host_histogram_output;
-          cudaEvent_t*          histograms_wait_obj;
-};
-
-
-void cuda_histogram(
-                int             histogram_size,
-                data_size_t     leaf_num_data,
-                data_size_t     num_data,
-                bool            use_all_features,
-                bool            is_constant_hessian,
-                int             num_workgroups,
-                cudaStream_t    stream,
-                uint8_t*        arg0,
-                uint8_t*        arg1,
-                data_size_t     arg2,
-                data_size_t*    arg3,
-                data_size_t     arg4,
-                score_t*        arg5,
-                score_t*        arg6,
-                score_t         arg6_const,
-                char*           arg7,
-                volatile int*   arg8,
-                void*           arg9,
-                size_t          exp_workgroups_per_feature);
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
-#endif  // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
--- a/src/treelearner/cuda_tree_learner.cpp
+++ b/src/treelearner/cuda_tree_learner.cpp
--- a/src/treelearner/cuda_tree_learner.h
+++ b/src/treelearner/cuda_tree_learner.h
@ -1,261 +0,0 @@
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
-#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
-
-#include <LightGBM/utils/random.h>
-#include <LightGBM/utils/array_args.h>
-#include <LightGBM/dataset.h>
-#include <LightGBM/feature_group.h>
-#include <LightGBM/tree.h>
-
-#include <string>
-#include <cmath>
-#include <cstdio>
-#include <memory>
-#include <random>
-#include <vector>
-#ifdef USE_CUDA
-#include <cuda_runtime.h>
-#endif
-
-#include "feature_histogram.hpp"
-#include "serial_tree_learner.h"
-#include "data_partition.hpp"
-#include "split_info.hpp"
-#include "leaf_splits.hpp"
-
-#ifdef USE_CUDA
-#include <LightGBM/cuda/vector_cudahost.h>
-#include "cuda_kernel_launcher.h"
-
-
-using json11::Json;
-
-namespace LightGBM {
-
-/*!
-* \brief CUDA-based parallel learning algorithm.
-*/
-class CUDATreeLearner: public SerialTreeLearner {
- public:
-    explicit CUDATreeLearner(const Config* tree_config);
-    ~CUDATreeLearner();
-    void Init(const Dataset* train_data, bool is_constant_hessian) override;
-    void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
-    Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override;
-    void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
-      SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
-      if (subset == nullptr && used_indices != nullptr) {
-        if (num_data != num_data_) {
-          use_bagging_ = true;
-          return;
-        }
-      }
-      use_bagging_ = false;
-    }
-
- protected:
-    void BeforeTrain() override;
-    bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
-    void FindBestSplits(const Tree* tree) override;
-    void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
-    void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
-
- private:
-    typedef float gpu_hist_t;
-
-    /*!
-     * \brief Find the best number of workgroups processing one feature for maximizing efficiency
-     * \param leaf_num_data The number of data examples on the current leaf being processed
-     * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
-     */
-    int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);
-
-    /*!
-     * \brief Initialize GPU device
-     * \param num_gpu: number of maximum gpus
-     */
-    void InitGPU(int num_gpu);
-
-    /*!
-     * \brief Allocate memory for GPU computation // alloc only
-     */
-    void CountDenseFeatureGroups();  // compute num_dense_feature_group
-    void prevAllocateGPUMemory();  // compute CPU-side param calculation & Pin HostMemory
-    void AllocateGPUMemory();
-
-    /*!
-     * \ ResetGPUMemory
-     */
-    void ResetGPUMemory();
-
-    /*!
-     * \ copy dense feature from CPU to GPU
-     */
-    void copyDenseFeature();
-
-    /*! 
-     * \brief Compute GPU feature histogram for the current leaf.
-     *        Indices, gradients and Hessians have been copied to the device.
-     * \param leaf_num_data Number of data on current leaf
-     * \param use_all_features Set to true to not use feature masks, with a faster kernel
-     */
-    void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);
-
-    void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size,
-                int leaf_num_data, bool use_all_features,
-                int num_workgroups, int exp_workgroups_per_feature) {
-      ThreadData* td = &thread_data[device_id];
-      td->device_id             = device_id;
-      td->histogram_size        = histogram_size;
-      td->leaf_num_data         = leaf_num_data;
-      td->num_data              = num_data_;
-      td->use_all_features      = use_all_features;
-      td->is_constant_hessian   = share_state_->is_constant_hessian;
-      td->num_workgroups        = num_workgroups;
-      td->stream                = stream_[device_id];
-      td->device_features       = device_features_[device_id];
-      td->device_feature_masks  = reinterpret_cast<uint8_t *>(device_feature_masks_[device_id]);
-      td->device_data_indices   = device_data_indices_[device_id];
-      td->device_gradients      = device_gradients_[device_id];
-      td->device_hessians       = device_hessians_[device_id];
-      td->hessians_const        = hessians_[0];
-      td->device_subhistograms  = device_subhistograms_[device_id];
-      td->sync_counters         = sync_counters_[device_id];
-      td->device_histogram_outputs   = device_histogram_outputs_[device_id];
-      td->exp_workgroups_per_feature = exp_workgroups_per_feature;
-
-      td->kernel_start           = &(kernel_start_[device_id]);
-      td->kernel_wait_obj        = &(kernel_wait_obj_[device_id]);
-      td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]);
-
-      size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-      size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-      td->output_size           = output_size;
-      td->host_histogram_output = reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset;
-      td->histograms_wait_obj   = &(histograms_wait_obj_[device_id]);
-    }
-
-    /*!
-     * \brief Wait for GPU kernel execution and read histogram
-     * \param histograms Destination of histogram results from GPU.
-     */
-    template <typename HistType>
-    void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array);
-
-    /*!
-     * \brief Construct GPU histogram asynchronously. 
-     *        Interface is similar to Dataset::ConstructHistograms().
-     * \param is_feature_used A predicate vector for enabling each feature
-     * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
-     *                     Set to nullptr to skip copy to GPU.
-     * \param num_data Number of data examples to be included in histogram
-     * \return true if GPU kernel is launched, false if GPU is not used
-    */
-    bool ConstructGPUHistogramsAsync(
-      const std::vector<int8_t>& is_feature_used,
-      const data_size_t* data_indices, data_size_t num_data);
-
-    /*! brief Log2 of max number of workgroups per feature*/
-    const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
-    /*! brief Max total number of workgroups with preallocated workspace.
-     *        If we use more than this number of workgroups, we have to reallocate subhistograms */
-    std::vector<int> preallocd_max_num_wg_;
-
-    /*! \brief True if bagging is used */
-    bool use_bagging_;
-
-    /*! \brief GPU command queue object */
-    std::vector<cudaStream_t> stream_;
-
-    /*! \brief total number of feature-groups */
-    int num_feature_groups_;
-    /*! \brief total number of dense feature-groups, which will be processed on GPU */
-    int num_dense_feature_groups_;
-    std::vector<int> num_gpu_feature_groups_;
-    std::vector<int> offset_gpu_feature_groups_;
-    /*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
-     *  With bin size > 16, there are 4 features per DWORD.
-     *  With bin size <=16, there are 8 features per DWORD.
-     */
-    int dword_features_;
-    /*! \brief Max number of bins of training data, used to determine 
-     * which GPU kernel to use */
-    int max_num_bin_;
-    /*! \brief Used GPU kernel bin size (64, 256) */
-    int histogram_size_;
-    int device_bin_size_;
-    /*! \brief Size of histogram bin entry, depending if single or double precision is used */
-    size_t hist_bin_entry_sz_;
-    /*! \brief Indices of all dense feature-groups */
-    std::vector<int> dense_feature_group_map_;
-    /*! \brief Indices of all sparse feature-groups */
-    std::vector<int> sparse_feature_group_map_;
-    /*! \brief GPU memory object holding the training data */
-    std::vector<uint8_t*> device_features_;
-    /*! \brief GPU memory object holding the ordered gradient */
-    std::vector<score_t*> device_gradients_;
-    /*! \brief GPU memory object holding the ordered hessian */
-    std::vector<score_t*> device_hessians_;
-    /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
-    std::vector<char> feature_masks_;
-    /*! \brief GPU memory object holding the feature masks */
-    std::vector<char*> device_feature_masks_;
-    /*! \brief Pointer to pinned memory of feature masks */
-    char* ptr_pinned_feature_masks_ = nullptr;
-    /*! \brief GPU memory object holding indices of the leaf being processed */
-    std::vector<data_size_t*> device_data_indices_;
-    /*! \brief GPU memory object holding counters for workgroup coordination */
-    std::vector<int*> sync_counters_;
-    /*! \brief GPU memory object holding temporary sub-histograms per workgroup */
-    std::vector<char*> device_subhistograms_;
-    /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
-    std::vector<void*> device_histogram_outputs_;
-    /*! \brief Host memory pointer for histogram outputs */
-    void *host_histogram_outputs_;
-    /*! CUDA waitlist object for waiting for data transfer before kernel execution */
-    std::vector<cudaEvent_t> kernel_wait_obj_;
-    /*! CUDA waitlist object for reading output histograms after kernel execution */
-    std::vector<cudaEvent_t> histograms_wait_obj_;
-    /*! CUDA Asynchronous waiting object for copying indices */
-    std::vector<cudaEvent_t> indices_future_;
-    /*! Asynchronous waiting object for copying gradients */
-    std::vector<cudaEvent_t> gradients_future_;
-    /*! Asynchronous waiting object for copying Hessians */
-    std::vector<cudaEvent_t> hessians_future_;
-    /*! Asynchronous waiting object for copying dense features */
-    std::vector<cudaEvent_t> features_future_;
-
-    // host-side buffer for converting feature data into featre4 data
-    int nthreads_;  // number of Feature4* vector on host4_vecs_
-    std::vector<cudaEvent_t> kernel_start_;
-    std::vector<float> kernel_time_;  // measure histogram kernel time
-    std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
-    int num_gpu_;
-    int allocated_num_data_;  // allocated data instances
-    pthread_t **cpu_threads_;  // pthread, 1 cpu thread / gpu
-};
-
-}  // namespace LightGBM
-#else  // USE_CUDA
-
-// When GPU support is not compiled in, quit with an error message
-
-namespace LightGBM {
-
-class CUDATreeLearner: public SerialTreeLearner {
- public:
-    #pragma warning(disable : 4702)
-    explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
-      Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
-                 "Please recompile with CMake option -DUSE_CUDA=1");
-    }
-};
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
-#endif  // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@ -276,7 +276,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, in
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class DataParallelTreeLearner<CUDATreeLearner>;
 template class DataParallelTreeLearner<GPUTreeLearner>;
 template class DataParallelTreeLearner<SerialTreeLearner>;

--- a/src/treelearner/feature_parallel_tree_learner.cpp
+++ b/src/treelearner/feature_parallel_tree_learner.cpp
@ -77,7 +77,6 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class FeatureParallelTreeLearner<CUDATreeLearner>;
 template class FeatureParallelTreeLearner<GPUTreeLearner>;
 template class FeatureParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@ -12,7 +12,6 @@
 #include <memory>
 #include <vector>

-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "serial_tree_learner.h"

--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set<int>* fo
  }
  bool use_subtract = parent_leaf_histogram_array_ != nullptr;

-#ifdef USE_CUDA
-  if (LGBM_config_::current_learner == use_cpu_learner) {
-    SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
-  } else {
-    ConstructHistograms(is_feature_used, use_subtract);
-  }
-#else
  ConstructHistograms(is_feature_used, use_subtract);
-#endif
  FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }

--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner {
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
  /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
-#elif defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#elif defined(USE_CUDA)
  /*! \brief gradients of current iteration, ordered for cache optimized */
  std::vector<score_t, CHAllocator<score_t>> ordered_gradients_;
  /*! \brief hessians of current iteration, ordered for cache optimized */
--- a/src/treelearner/tree_learner.cpp
+++ b/src/treelearner/tree_learner.cpp
@ -4,7 +4,6 @@
 */
 #include <LightGBM/tree_learner.h>

-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "linear_tree_learner.h"
 #include "parallel_tree_learner.h"
@ -40,24 +39,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con
      return new VotingParallelTreeLearner<GPUTreeLearner>(config);
    }
  } else if (device_type == std::string("cuda")) {
-    if (learner_type == std::string("serial")) {
-      return new CUDATreeLearner(config);
-    } else if (learner_type == std::string("feature")) {
-      return new FeatureParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("data")) {
-      return new DataParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("voting")) {
-      return new VotingParallelTreeLearner<CUDATreeLearner>(config);
-    }
-  } else if (device_type == std::string("cuda_exp")) {
    if (learner_type == std::string("serial")) {
      if (config->num_gpu == 1) {
        return new CUDASingleGPUTreeLearner(config, boosting_on_cuda);
      } else {
-        Log::Fatal("cuda_exp only supports training on a single GPU.");
+        Log::Fatal("Currently cuda version only supports training on a single GPU.");
      }
    } else {
-      Log::Fatal("cuda_exp only supports training on a single machine.");
+      Log::Fatal("Currently cuda version only supports training on a single machine.");
    }
  }
  return nullptr;
--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@ -501,7 +501,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class VotingParallelTreeLearner<CUDATreeLearner>;
 template class VotingParallelTreeLearner<GPUTreeLearner>;
 template class VotingParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@ -48,7 +48,7 @@ def test_basic(tmp_path):
    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
-    if getenv('TASK', '') != 'cuda_exp':
+    if getenv('TASK', '') != 'cuda':
        assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
        assert bst.upper_bound() == pytest.approx(3.3182142872462883)

--- a/Показать больше
+++ b/Показать больше