[R-package] Promote number of threads to top-level argument in `lightgbm()` and change default to number of cores (#4972)

This commit is contained in:
david-cortes 2022-04-01 03:58:19 +03:00 коммит произвёл GitHub
Родитель 4ae3d1387d
Коммит 33eb03767d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
14 изменённых файлов: 126 добавлений и 11 удалений

Просмотреть файл

@ -105,13 +105,13 @@ if [[ $OS_NAME == "macos" ]]; then
fi
fi
# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat'
# Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat'
# to avoid a CI-time dependency on devtools (for devtools::install_deps())
# NOTE: testthat is not required when running rchk
if [[ "${TASK}" == "r-rchk" ]]; then
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')"
else
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')"
fi
compile_from_source="both"
if [[ $OS_NAME == "macos" ]]; then

Просмотреть файл

@ -7,7 +7,7 @@ apt-get install --no-install-recommends -y \
# installation of dependencies needs to happen before building the package,
# since `R CMD build` needs to install the package to build vignettes
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh || exit -1

Просмотреть файл

@ -1,6 +1,6 @@
#!/bin/bash
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh \
--r-executable=RDvalgrind \
|| exit -1

Просмотреть файл

@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT
Write-Output "Done installing Rtools"
Write-Output "Installing dependencies"
$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $?
# MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't

4
.github/workflows/r_package.yml поставляемый
Просмотреть файл

@ -188,7 +188,7 @@ jobs:
- name: Install packages
shell: bash
run: |
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }}
RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1
- name: Run tests with sanitizers
@ -219,7 +219,7 @@ jobs:
shell: bash
run: |
export PATH=/opt/R-devel/bin/:${PATH}
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh
R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1
if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then

Просмотреть файл

@ -313,7 +313,7 @@ jobs:
R_LIB_PATH=~/Rlib
export R_LIBS=${R_LIB_PATH}
mkdir -p ${R_LIB_PATH}
RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh --r-executable=RD || exit -1
mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz
displayName: 'Build CRAN R-package'

Просмотреть файл

@ -50,6 +50,7 @@ VignetteBuilder: knitr
Suggests:
knitr,
processx,
RhpcBLASctl,
rmarkdown,
testthat
Depends:
@ -61,6 +62,7 @@ Imports:
jsonlite (>= 1.0),
Matrix (>= 1.1-0),
methods,
parallel,
utils
SystemRequirements:
C++11

Просмотреть файл

@ -52,6 +52,7 @@ importFrom(graphics,barplot)
importFrom(graphics,par)
importFrom(jsonlite,fromJSON)
importFrom(methods,is)
importFrom(parallel,detectCores)
importFrom(stats,quantile)
importFrom(utils,modifyList)
importFrom(utils,read.delim)

Просмотреть файл

@ -15,7 +15,7 @@
#' model <- lightgbm(
#' agaricus.train$data
#' , agaricus.train$label
#' , params = list(objective = "binary", nthreads = 1L)
#' , params = list(objective = "binary")
#' , nrounds = 5L
#' , verbose = 0)
#' fname <- tempfile(fileext="rds")

Просмотреть файл

@ -98,6 +98,22 @@ NULL
#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{
#' the "objective" item of the "Parameters" section of the documentation}.
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param num_threads Number of parallel threads to use. For best speed, this should be set to the number of
#' physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
#' number of maximum threads.
#'
#' Be aware that using too many threads can result in speed degradation in smaller datasets
#' (see the parameters documentation for more details).
#'
#' If passing zero, will use the default number of threads configured for OpenMP
#' (typically controlled through an environment variable \code{OMP_NUM_THREADS}).
#'
#' If passing \code{NULL} (the default), will try to use the number of physical cores in the
#' system, but be aware that getting the number of cores detected correctly requires package
#' \code{RhpcBLASctl} to be installed.
#'
#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
#' if passed there.
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
@ -129,6 +145,7 @@ lightgbm <- function(data,
serializable = TRUE,
objective = "regression",
init_score = NULL,
num_threads = NULL,
...) {
# validate inputs early to avoid unnecessary computation
@ -136,6 +153,15 @@ lightgbm <- function(data,
stop("nrounds should be greater than zero")
}
if (is.null(num_threads)) {
num_threads <- lgb.get.default.num.threads()
}
params <- lgb.check.wrapper_param(
main_param_name = "num_threads"
, params = params
, alternative_kwarg_value = num_threads
)
# Set data to a temporary variable
dtrain <- data

Просмотреть файл

@ -217,3 +217,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v
params[[main_param_name]] <- alternative_kwarg_value
return(params)
}
# Pick the default number of threads used when the caller did not supply one.
#
# Returns:
#   * the value of RhpcBLASctl::get_num_cores() when the optional package
#     'RhpcBLASctl' can be loaded (no warning is raised in that case);
#   * otherwise, after raising a warning, either the result of
#     parallel::detectCores(logical = FALSE) (only attempted when the system
#     name is known and is not "Linux") or 0L.
#
# NOTE(review): per the warning text below, 0 is meant to signal "use the
# default number of OpenMP threads" - confirm against the consumer of this value.
#' @importFrom parallel detectCores
lgb.get.default.num.threads <- function() {
    if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint
        return(RhpcBLASctl::get_num_cores())
    }

    msg <- "Optional package 'RhpcBLASctl' not found."
    cores <- 0L

    # Sys.info() is documented to return NULL on platforms where it is not
    # implemented. Comparing NULL["sysname"] (or a missing / NA field) directly
    # inside 'if' would abort with "argument is of length zero" (or an NA
    # condition), so the comparison is wrapped in isTRUE(): when the system name
    # cannot be determined, the 0L fallback is kept.
    sys_name <- unname(Sys.info()["sysname"])

    # NOTE(review): Linux is skipped presumably because
    # detectCores(logical = FALSE) is not reliable there - confirm.
    if (isTRUE(sys_name != "Linux")) {
        cores <- parallel::detectCores(logical = FALSE)
        # detectCores() may return NA when the count cannot be determined
        if (is.na(cores) || cores < 0L) {
            cores <- 0L
        }
    }

    if (cores == 0L) {
        msg <- paste(msg, "Will use default number of OpenMP threads.", sep = " ")
    } else {
        msg <- paste(msg, "Detection of CPU cores might not be accurate.", sep = " ")
    }
    warning(msg)
    return(cores)
}

Просмотреть файл

@ -25,7 +25,7 @@ data("agaricus.train")
model <- lightgbm(
agaricus.train$data
, agaricus.train$label
, params = list(objective = "binary", nthreads = 1L)
, params = list(objective = "binary")
, nrounds = 5L
, verbose = 0)
fname <- tempfile(fileext="rds")

Просмотреть файл

@ -18,6 +18,7 @@ lightgbm(
serializable = TRUE,
objective = "regression",
init_score = NULL,
num_threads = NULL,
...
)
}
@ -60,6 +61,23 @@ the "objective" item of the "Parameters" section of the documentation}.}
\item{init_score}{initial score is the base prediction lightgbm will boost from}
\item{num_threads}{Number of parallel threads to use. For best speed, this should be set to the number of
physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
number of maximum threads.
Be aware that using too many threads can result in speed degradation in smaller datasets
(see the parameters documentation for more details).
If passing zero, will use the default number of threads configured for OpenMP
(typically controlled through an environment variable \code{OMP_NUM_THREADS}).
If passing \code{NULL} (the default), will try to use the number of physical cores in the
system, but be aware that getting the number of cores detected correctly requires package
\code{RhpcBLASctl} to be installed.
This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
if passed there.}
\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
\itemize{
\item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}

Просмотреть файл

@ -2928,6 +2928,51 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw
expect_false(any(model_txt_lines == "objective=regression_l1"))
})
test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", {
    # Checks that a fitted booster reports a single thread both in its R-side
    # params and in the parameter dump of its serialized model string.
    .expect_single_thread <- function(booster) {
        expect_equal(booster$params$num_threads, 1L)
        serialized_lines <- strsplit(
            x = booster$save_model_to_string()
            , split = "\n"
        )[[1L]]
        expect_true(any(grepl("\\[num_threads: 1\\]", serialized_lines)))
    }

    # case 1: passed as a top-level keyword argument
    booster <- lightgbm(
        data = train$data
        , label = train$label
        , nrounds = 5L
        , verbose = VERBOSITY
        , num_threads = 1L
    )
    .expect_single_thread(booster)

    # case 2: passed through 'params'
    booster <- lightgbm(
        data = train$data
        , label = train$label
        , nrounds = 5L
        , verbose = VERBOSITY
        , params = list(num_threads = 1L)
    )
    .expect_single_thread(booster)

    # case 3: passed both ways; the value under 'params' is the one expected to win
    booster <- lightgbm(
        data = train$data
        , label = train$label
        , nrounds = 5L
        , verbose = VERBOSITY
        , num_threads = 10L
        , params = list(num_threads = 1L)
    )
    .expect_single_thread(booster)
})
test_that("lightgbm() accepts 'weight' and 'weights'", {
data(mtcars)
X <- as.matrix(mtcars[, -1L])