From 33eb03767de025ec71d5ac9db35f0936637c1b27 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Fri, 1 Apr 2022 03:58:19 +0300 Subject: [PATCH] [R-package] Promote number of threads to top-level argument in `lightgbm()` and change default to number of cores (#4972) --- .ci/test_r_package.sh | 6 ++-- .ci/test_r_package_solaris.sh | 2 +- .ci/test_r_package_valgrind.sh | 2 +- .ci/test_r_package_windows.ps1 | 2 +- .github/workflows/r_package.yml | 4 +-- .vsts-ci.yml | 2 +- R-package/DESCRIPTION | 2 ++ R-package/NAMESPACE | 1 + R-package/R/lgb.restore_handle.R | 2 +- R-package/R/lightgbm.R | 26 ++++++++++++++++ R-package/R/utils.R | 23 ++++++++++++++ R-package/man/lgb.restore_handle.Rd | 2 +- R-package/man/lightgbm.Rd | 18 +++++++++++ R-package/tests/testthat/test_basic.R | 45 +++++++++++++++++++++++++++ 14 files changed, 126 insertions(+), 11 deletions(-) diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index 80ac4f141..1a96d4b4c 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -105,13 +105,13 @@ if [[ $OS_NAME == "macos" ]]; then fi fi -# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat' +# Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat' # to avoid a CI-time dependency on devtools (for devtools::install_deps()) # NOTE: testthat is not required when running rchk if [[ "${TASK}" == "r-rchk" ]]; then - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')" else - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')" + packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')" fi compile_from_source="both" if [[ $OS_NAME == "macos" ]]; then diff --git a/.ci/test_r_package_solaris.sh b/.ci/test_r_package_solaris.sh index 18ed6cb2f..298037f45 100755 --- 
a/.ci/test_r_package_solaris.sh +++ b/.ci/test_r_package_solaris.sh @@ -7,7 +7,7 @@ apt-get install --no-install-recommends -y \ # installation of dependencies needs to happen before building the package, # since `R CMD build` needs to install the package to build vignettes -Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 +Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh || exit -1 diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index e7a6cb027..4185b9450 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -1,6 +1,6 @@ #!/bin/bash -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh \ --r-executable=RDvalgrind \ || exit -1 diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index ffbfeca89..4a911ca77 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT Write-Output "Done installing Rtools" Write-Output "Installing dependencies" -$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), 
dependencies = c('Imports', 'Depends', 'LinkingTo')" +$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $? # MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 1021d138d..87c05401c 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -188,7 +188,7 @@ jobs: - name: Install packages shell: bash run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - name: Run tests with sanitizers @@ -219,7 +219,7 @@ jobs: shell: bash run: | export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" sh build-cran-package.sh R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 if grep -q -E "NOTE|WARNING|ERROR" 
lightgbm.Rcheck/00check.log; then diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 5d81c61f2..d4b31e0ed 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -313,7 +313,7 @@ jobs: R_LIB_PATH=~/Rlib export R_LIBS=${R_LIB_PATH} mkdir -p ${R_LIB_PATH} - RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 + RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh --r-executable=RD || exit -1 mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz displayName: 'Build CRAN R-package' diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index d9287584d..7efb865f3 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -50,6 +50,7 @@ VignetteBuilder: knitr Suggests: knitr, processx, + RhpcBLASctl, rmarkdown, testthat Depends: @@ -61,6 +62,7 @@ Imports: jsonlite (>= 1.0), Matrix (>= 1.1-0), methods, + parallel, utils SystemRequirements: C++11 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 02e886bbc..33152e33b 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -52,6 +52,7 @@ importFrom(graphics,barplot) importFrom(graphics,par) importFrom(jsonlite,fromJSON) importFrom(methods,is) +importFrom(parallel,detectCores) importFrom(stats,quantile) importFrom(utils,modifyList) importFrom(utils,read.delim) diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 020f3a145..3d7076351 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -15,7 +15,7 @@ #' model <- lightgbm( #' agaricus.train$data #' , 
agaricus.train$label -#' , params = list(objective = "binary", nthreads = 1L) +#' , params = list(objective = "binary") #' , nrounds = 5L #' , verbose = 0) #' fname <- tempfile(fileext="rds") diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index b76eb819c..3e0c55c9d 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -98,6 +98,22 @@ NULL #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{ #' the "objective" item of the "Parameters" section of the documentation}. #' @param init_score initial score is the base prediction lightgbm will boost from +#' @param num_threads Number of parallel threads to use. For best speed, this should be set to the number of +#' physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the +#' number of maximum threads. +#' +#' Be aware that using too many threads can result in speed degradation in smaller datasets +#' (see the parameters documentation for more details). +#' +#' If passing zero, will use the default number of threads configured for OpenMP +#' (typically controlled through an environment variable \code{OMP_NUM_THREADS}). +#' +#' If passing \code{NULL} (the default), will try to use the number of physical cores in the +#' system, but be aware that getting the number of cores detected correctly requires package +#' \code{RhpcBLASctl} to be installed. +#' +#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params} +#' if passed there. #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' \itemize{ #' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} @@ -129,6 +145,7 @@ lightgbm <- function(data, serializable = TRUE, objective = "regression", init_score = NULL, + num_threads = NULL, ...) 
{ # validate inputs early to avoid unnecessary computation @@ -136,6 +153,15 @@ lightgbm <- function(data, stop("nrounds should be greater than zero") } + if (is.null(num_threads)) { + num_threads <- lgb.get.default.num.threads() + } + params <- lgb.check.wrapper_param( + main_param_name = "num_threads" + , params = params + , alternative_kwarg_value = num_threads + ) + # Set data to a temporary variable dtrain <- data diff --git a/R-package/R/utils.R b/R-package/R/utils.R index c89bfe9fb..7cbe36fe2 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -217,3 +217,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v params[[main_param_name]] <- alternative_kwarg_value return(params) } + +#' @importFrom parallel detectCores +lgb.get.default.num.threads <- function() { + if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint + return(RhpcBLASctl::get_num_cores()) + } else { + msg <- "Optional package 'RhpcBLASctl' not found." + cores <- 0L + if (Sys.info()["sysname"] != "Linux") { + cores <- parallel::detectCores(logical = FALSE) + if (is.na(cores) || cores < 0L) { + cores <- 0L + } + } + if (cores == 0L) { + msg <- paste(msg, "Will use default number of OpenMP threads.", sep = " ") + } else { + msg <- paste(msg, "Detection of CPU cores might not be accurate.", sep = " ") + } + warning(msg) + return(cores) + } +} diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index e07ec4b6d..31a0fcf9c 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -25,7 +25,7 @@ data("agaricus.train") model <- lightgbm( agaricus.train$data , agaricus.train$label - , params = list(objective = "binary", nthreads = 1L) + , params = list(objective = "binary") , nrounds = 5L , verbose = 0) fname <- tempfile(fileext="rds") diff --git a/R-package/man/lightgbm.Rd b/R-package/man/lightgbm.Rd index 4ea8cc3d6..62a041b5c 100644 --- a/R-package/man/lightgbm.Rd +++ 
b/R-package/man/lightgbm.Rd @@ -18,6 +18,7 @@ lightgbm( serializable = TRUE, objective = "regression", init_score = NULL, + num_threads = NULL, ... ) } @@ -60,6 +61,23 @@ the "objective" item of the "Parameters" section of the documentation}.} \item{init_score}{initial score is the base prediction lightgbm will boost from} +\item{num_threads}{Number of parallel threads to use. For best speed, this should be set to the number of + physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the + number of maximum threads. + + Be aware that using too many threads can result in speed degradation in smaller datasets + (see the parameters documentation for more details). + + If passing zero, will use the default number of threads configured for OpenMP + (typically controlled through an environment variable \code{OMP_NUM_THREADS}). + + If passing \code{NULL} (the default), will try to use the number of physical cores in the + system, but be aware that getting the number of cores detected correctly requires package + \code{RhpcBLASctl} to be installed. + + This parameter gets overridden by \code{num_threads} and its aliases under \code{params} + if passed there.} + \item{...}{Additional arguments passed to \code{\link{lgb.train}}. 
For example \itemize{ \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 637e17b6f..ade559a0e 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -2928,6 +2928,51 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw expect_false(any(model_txt_lines == "objective=regression_l1")) }) +test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", { + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + , num_threads = 1L + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) + + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + , params = list(num_threads = 1L) + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) + + bst <- lightgbm( + data = train$data + , label = train$label + , nrounds = 5L + , verbose = VERBOSITY + , num_threads = 10L + , params = list(num_threads = 1L) + ) + expect_equal(bst$params$num_threads, 1L) + model_txt_lines <- strsplit( + x = bst$save_model_to_string() + , split = "\n" + )[[1L]] + expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines))) +}) + test_that("lightgbm() accepts 'weight' and 'weights'", { data(mtcars) X <- as.matrix(mtcars[, -1L])