[R-package] Promote number of threads to top-level argument in `lightgbm()` and change default to number of cores (#4972)

2022-04-01 03:58:19 +03:00 · 2022-04-01 03:58:19 +03:00 · 33eb03767d
--- a/.ci/test_r_package.sh
+++ b/.ci/test_r_package.sh
@ -105,13 +105,13 @@ if [[ $OS_NAME == "macos" ]]; then
    fi
 fi

-# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat'
+# Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat'
 # to avoid a CI-time dependency on devtools (for devtools::install_deps())
 # NOTE: testthat is not required when running rchk
 if [[ "${TASK}" == "r-rchk" ]]; then
-    packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')"
+    packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')"
 else
-    packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')"
+    packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')"
 fi
 compile_from_source="both"
 if [[ $OS_NAME == "macos" ]]; then
--- a/.ci/test_r_package_solaris.sh
+++ b/.ci/test_r_package_solaris.sh
@ -7,7 +7,7 @@ apt-get install --no-install-recommends -y \

 # installation of dependencies needs to happen before building the package,
 # since `R CMD build` needs to install the package to build vignettes
-Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
+Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1

 sh build-cran-package.sh || exit -1

--- a/.ci/test_r_package_valgrind.sh
+++ b/.ci/test_r_package_valgrind.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
+RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
 sh build-cran-package.sh \
  --r-executable=RDvalgrind \
  || exit -1
--- a/.ci/test_r_package_windows.ps1
+++ b/.ci/test_r_package_windows.ps1
@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT
 Write-Output "Done installing Rtools"

 Write-Output "Installing dependencies"
-$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
+$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
 Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $?

 # MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't
--- a/.github/workflows/r_package.yml
+++ b/.github/workflows/r_package.yml
@ -188,7 +188,7 @@ jobs:
      - name: Install packages
        shell: bash
        run: |
-          RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
+          RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
          sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }}
          RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1
      - name: Run tests with sanitizers
@ -219,7 +219,7 @@ jobs:
        shell: bash
        run: |
          export PATH=/opt/R-devel/bin/:${PATH}
-          Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
+          Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
          sh build-cran-package.sh
          R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1
          if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@ -313,7 +313,7 @@ jobs:
      R_LIB_PATH=~/Rlib
      export R_LIBS=${R_LIB_PATH}
      mkdir -p ${R_LIB_PATH}
-      RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'),  lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
+      RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown'),  lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
      sh build-cran-package.sh --r-executable=RD || exit -1
      mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz
    displayName: 'Build CRAN R-package'
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@ -50,6 +50,7 @@ VignetteBuilder: knitr
 Suggests:
    knitr,
    processx,
+    RhpcBLASctl,
    rmarkdown,
    testthat
 Depends:
@ -61,6 +62,7 @@ Imports:
    jsonlite (>= 1.0),
    Matrix (>= 1.1-0),
    methods,
+    parallel,
    utils
 SystemRequirements:
    C++11
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@ -52,6 +52,7 @@ importFrom(graphics,barplot)
 importFrom(graphics,par)
 importFrom(jsonlite,fromJSON)
 importFrom(methods,is)
+importFrom(parallel,detectCores)
 importFrom(stats,quantile)
 importFrom(utils,modifyList)
 importFrom(utils,read.delim)
--- a/R-package/R/lgb.restore_handle.R
+++ b/R-package/R/lgb.restore_handle.R
@ -15,7 +15,7 @@
 #' model <- lightgbm(
 #'   agaricus.train$data
 #'   , agaricus.train$label
-#'   , params = list(objective = "binary", nthreads = 1L)
+#'   , params = list(objective = "binary")
 #'   , nrounds = 5L
 #'   , verbose = 0)
 #' fname <- tempfile(fileext="rds")
--- a/R-package/R/lightgbm.R
+++ b/R-package/R/lightgbm.R
@ -98,6 +98,22 @@ NULL
 #'                  \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{
 #'                  the "objective" item of the "Parameters" section of the documentation}.
 #' @param init_score initial score is the base prediction lightgbm will boost from
+#' @param num_threads Number of parallel threads to use. For best speed, this should be set to the number of
+#'                    physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
+#'                    number of maximum threads.
+#'
+#'                    Be aware that using too many threads can result in speed degradation in smaller datasets
+#'                    (see the parameters documentation for more details).
+#'
+#'                    If passing zero, will use the default number of threads configured for OpenMP
+#'                    (typically controlled through an environment variable \code{OMP_NUM_THREADS}).
+#'
+#'                    If passing \code{NULL} (the default), will try to use the number of physical cores in the
+#'                    system, but be aware that getting the number of cores detected correctly requires package
+#'                    \code{RhpcBLASctl} to be installed.
+#'
+#'                    This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
+#'                    if passed there.
 #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
 #'     \itemize{
 #'        \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@ -129,6 +145,7 @@ lightgbm <- function(data,
                     serializable = TRUE,
                     objective = "regression",
                     init_score = NULL,
+                     num_threads = NULL,
                     ...) {

  # validate inputs early to avoid unnecessary computation
@ -136,6 +153,15 @@ lightgbm <- function(data,
    stop("nrounds should be greater than zero")
  }

+  if (is.null(num_threads)) {
+    num_threads <- lgb.get.default.num.threads()
+  }
+  params <- lgb.check.wrapper_param(
+    main_param_name = "num_threads"
+    , params = params
+    , alternative_kwarg_value = num_threads
+  )
+
  # Set data to a temporary variable
  dtrain <- data

--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@ -217,3 +217,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v
  params[[main_param_name]] <- alternative_kwarg_value
  return(params)
 }
+
+#' @importFrom parallel detectCores
+lgb.get.default.num.threads <- function() {
+  if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint
+    return(RhpcBLASctl::get_num_cores())
+  } else {
+    msg <- "Optional package 'RhpcBLASctl' not found."
+    cores <- 0L
+    if (Sys.info()["sysname"] != "Linux") {
+      cores <- parallel::detectCores(logical = FALSE)
+      if (is.na(cores) || cores < 0L) {
+        cores <- 0L
+      }
+    }
+    if (cores == 0L) {
+      msg <- paste(msg, "Will use default number of OpenMP threads.", sep = " ")
+    } else {
+      msg <- paste(msg, "Detection of CPU cores might not be accurate.", sep = " ")
+    }
+    warning(msg)
+    return(cores)
+  }
+}
--- a/R-package/man/lgb.restore_handle.Rd
+++ b/R-package/man/lgb.restore_handle.Rd
@ -25,7 +25,7 @@ data("agaricus.train")
 model <- lightgbm(
  agaricus.train$data
  , agaricus.train$label
-  , params = list(objective = "binary", nthreads = 1L)
+  , params = list(objective = "binary")
  , nrounds = 5L
  , verbose = 0)
 fname <- tempfile(fileext="rds")
--- a/R-package/man/lightgbm.Rd
+++ b/R-package/man/lightgbm.Rd
@ -18,6 +18,7 @@ lightgbm(
  serializable = TRUE,
  objective = "regression",
  init_score = NULL,
+  num_threads = NULL,
  ...
 )
 }
@ -60,6 +61,23 @@ the "objective" item of the "Parameters" section of the documentation}.}

 \item{init_score}{initial score is the base prediction lightgbm will boost from}

+\item{num_threads}{Number of parallel threads to use. For best speed, this should be set to the number of
+                   physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
+                   number of maximum threads.
+
+                   Be aware that using too many threads can result in speed degradation in smaller datasets
+                   (see the parameters documentation for more details).
+
+                   If passing zero, will use the default number of threads configured for OpenMP
+                   (typically controlled through an environment variable \code{OMP_NUM_THREADS}).
+
+                   If passing \code{NULL} (the default), will try to use the number of physical cores in the
+                   system, but be aware that getting the number of cores detected correctly requires package
+                   \code{RhpcBLASctl} to be installed.
+
+                   This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
+                   if passed there.}
+
 \item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
 \itemize{
   \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@ -2928,6 +2928,51 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw
  expect_false(any(model_txt_lines == "objective=regression_l1"))
 })

+test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", {
+  bst <- lightgbm(
+    data = train$data
+    , label = train$label
+    , nrounds = 5L
+    , verbose = VERBOSITY
+    , num_threads = 1L
+  )
+  expect_equal(bst$params$num_threads, 1L)
+  model_txt_lines <- strsplit(
+    x = bst$save_model_to_string()
+    , split = "\n"
+  )[[1L]]
+  expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines)))
+
+  bst <- lightgbm(
+    data = train$data
+    , label = train$label
+    , nrounds = 5L
+    , verbose = VERBOSITY
+    , params = list(num_threads = 1L)
+  )
+  expect_equal(bst$params$num_threads, 1L)
+  model_txt_lines <- strsplit(
+    x = bst$save_model_to_string()
+    , split = "\n"
+  )[[1L]]
+  expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines)))
+
+  bst <- lightgbm(
+    data = train$data
+    , label = train$label
+    , nrounds = 5L
+    , verbose = VERBOSITY
+    , num_threads = 10L
+    , params = list(num_threads = 1L)
+  )
+  expect_equal(bst$params$num_threads, 1L)
+  model_txt_lines <- strsplit(
+    x = bst$save_model_to_string()
+    , split = "\n"
+  )[[1L]]
+  expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines)))
+})
+
 test_that("lightgbm() accepts 'weight' and 'weights'", {
  data(mtcars)
  X <- as.matrix(mtcars[, -1L])