* Fixed typos in docs

* Fixed inconsistencies in documentation

* Updated strategy for registering routines

* Fixed issues caused by smashing multiple functions into one Rd

* Fixed issues with documentation

* Removed VignetteBuilder and updated Rbuildignore

* Added R build artefacts to gitignore

* Added namespacing on data.table set function. Updated handling of CMakeLists file to get around CRAN check.

* Updated build instructions

* Added R build script

* Removed build_r.sh script and updated R-package install instructions
James Lamb 2018-08-28 23:31:42 -05:00, committed by Qiwei Ye
Parent 80a9a9419c
Commit eded794efb
35 changed files with 871 additions and 663 deletions

.gitignore

@ -382,3 +382,11 @@ lightgbm.model
# duplicate version file
python-package/lightgbm/VERSION.txt
.Rproj.user
# R build artefacts
R-package/src/CMakeLists.txt
R-package/src/lib_lightgbm.so.dSYM/
R-package/src/src/
lightgbm_r/*
lightgbm*.tar.gz
lightgbm.Rcheck/


@ -1 +1,12 @@
^build_package.R$
\.gitkeep$
# Objects created by compilation
\.o$
\.so$
\.dll$
\.out$
\.bin$
# Code copied in at build time
^src/CMakeLists.txt$


@ -7,7 +7,7 @@ Authors@R: c(
person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")),
person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")),
person("Yachen", "Yan", role = c("ctb")),
person("James", "Lamb", role = c("ctb"))
person("James", "Lamb", email="james.lamb@uptake.com", role = c("ctb"))
)
Description: Tree based algorithms can be improved by introducing boosting frameworks. LightGBM is one such framework, and this package offers an R interface to work with it.
It is designed to be distributed and efficient with the following advantages:
@ -21,7 +21,6 @@ Description: Tree based algorithms can be improved by introducing boosting frame
License: MIT + file LICENSE
URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
VignetteBuilder: knitr
Suggests:
Ckmeans.1d.dp (>= 3.3.1),
DiagrammeR (>= 0.8.1),
@ -33,7 +32,7 @@ Suggests:
testthat,
vcd (>= 1.3)
Depends:
R (>= 3.0),
R (>= 3.4),
R6 (>= 2.0)
Imports:
data.table (>= 1.9.6),


@ -49,4 +49,4 @@ importFrom(magrittr,"%T>%")
importFrom(magrittr,extract)
importFrom(magrittr,inset)
importFrom(methods,is)
useDynLib(lib_lightgbm)
useDynLib(lib_lightgbm , .registration = TRUE)
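Aside (not part of the diff): the `.registration = TRUE` flag makes R resolve the package's native routines through the registered routine table instead of by name lookup at call time. Assuming this NAMESPACE is roxygen2-generated, the directive above comes from a `@useDynLib` tag in the R source, which mirrors the change to `R/lightgbm.R` shown further down in this commit. A minimal sketch of that tag:

```r
# Sketch: the roxygen2 tag that regenerates the NAMESPACE directive above
# when the package docs are rebuilt (e.g. with roxygen2::roxygenise()).
#' @useDynLib lib_lightgbm , .registration = TRUE
NULL
```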


@ -1,4 +1,5 @@
CB_ENV <- R6Class(
#' @importFrom R6 R6Class
CB_ENV <- R6::R6Class(
"lgb.cb_env",
cloneable = FALSE,
public = list(


@ -1,4 +1,5 @@
Booster <- R6Class(
#' @importFrom R6 R6Class
Booster <- R6::R6Class(
classname = "lgb.Booster",
cloneable = FALSE,
public = list(
@ -654,13 +655,15 @@ Booster <- R6Class(
#'
#' @rdname predict.lgb.Booster
#' @export
predict.lgb.Booster <- function(object, data,
num_iteration = NULL,
rawscore = FALSE,
predleaf = FALSE,
predcontrib = FALSE,
header = FALSE,
reshape = FALSE, ...) {
predict.lgb.Booster <- function(object,
data,
num_iteration = NULL,
rawscore = FALSE,
predleaf = FALSE,
predcontrib = FALSE,
header = FALSE,
reshape = FALSE,
...) {
# Check booster existence
if (!lgb.is.Booster(object)) {


@ -1,6 +1,8 @@
#' @importFrom methods is
Dataset <- R6Class(
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(
classname = "lgb.Dataset",
cloneable = FALSE,
public = list(
@ -854,8 +856,8 @@ dimnames.lgb.Dataset <- function(x) {
#' Slice a dataset
#'
#' Get a new \code{lgb.Dataset} containing the specified rows of
#' orginal lgb.Dataset object
#'
#' original lgb.Dataset object
#'
#' @param dataset Object of class "lgb.Dataset"
#' @param idxset a integer vector of indices of rows needed
#' @param ... other parameters (currently not used)


@ -1,6 +1,8 @@
#' @importFrom methods is
Predictor <- R6Class(
#' @importFrom R6 R6Class
Predictor <- R6::R6Class(
classname = "lgb.Predictor",
cloneable = FALSE,
public = list(


@ -1,4 +1,5 @@
CVBooster <- R6Class(
#' @importFrom R6 R6Class
CVBooster <- R6::R6Class(
classname = "lgb.CVBooster",
cloneable = FALSE,
public = list(
@ -17,46 +18,39 @@ CVBooster <- R6Class(
)
#' @title Main CV logic for LightGBM
#' @description Cross validation logic used by LightGBM
#' @name lgb.cv
#' @param params List of parameters
#' @param data a \code{lgb.Dataset} object, used for CV
#' @param nrounds number of CV rounds
#' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param boosting boosting type. \code{gbdt}, \code{dart}
#' @param num_leaves number of leaves in one tree. defaults to 127
#' @param max_depth Limit the max depth for tree model. This is used to deal with overfit when #data is small.
#' Tree still grow by leaf-wise.
#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param verbose verbosity for output, if <= 0, also will disable the print of evalutaion during training
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param eval_freq evalutaion output frequence, only effect when verbose > 0
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param init_model path of model file of \code{lgb.Booster} object, will continue train from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param early_stopping_rounds int
#' Activates early stopping.
#' CV score needs to improve at least every early_stopping_rounds round(s) to continue.
#' Requires at least one metric.
#' If there's more than one, will check all of them.
#' Returns the model with (best_iter + early_stopping_rounds).
#' If early stopping occurs, the model will have 'best_iter' field
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param ... other parameters, see Parameters.rst for more informations
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
#' \item{num_leaves}{number of leaves in one tree. defaults to 127}
#' \item{max_depth}{Limit the max depth for tree model. This is used to deal with
#' overfit when #data is small. Tree still grow by leaf-wise.}
#' \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#'
#' @return a trained model \code{lgb.CVBooster}.
#'
@ -75,7 +69,6 @@ CVBooster <- R6Class(
#' learning_rate = 1,
#' early_stopping_rounds = 10)
#' }
#' @rdname lgb.train
#' @export
lgb.cv <- function(params = list(),
data,


@ -20,7 +20,7 @@
#' \item \code{leaf_index}: ID of a leaf in a tree (integer)
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
#' \item \code{split_gain}: Split gain of a node
#' \item \code{threshold}: Spliting threshold value of a node
#' \item \code{threshold}: Splitting threshold value of a node
#' \item \code{decision_type}: Decision type of a node
#' \item \code{default_left}: Determine how to handle NA value, TRUE -> Left, FALSE -> Right
#' \item \code{internal_value}: Node value
@ -47,7 +47,7 @@
#' }
#'
#' @importFrom magrittr %>%
#' @importFrom data.table := data.table
#' @importFrom data.table := data.table rbindlist
#' @importFrom jsonlite fromJSON
#' @export
lgb.model.dt.tree <- function(model, num_iteration = NULL) {
@ -78,6 +78,7 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
}
#' @importFrom data.table data.table rbindlist
single.tree.parse <- function(lgb_tree) {


@ -1,199 +1,200 @@
#' Data preparator for LightGBM datasets with rules (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(iris)
#'
#' str(iris)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
#'
#' new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
#' str(new_iris$data)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
#' # Warning message:
#' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
#' # invalid factor level, NA generated
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#' # 1 5.1 3.5 1.4 0.2 0
#'
#' newer_iris$data[1, 5] <- 1 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#' # [1] TRUE
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3,
#' "versicolor" = 2,
#' "virginica" = 1))
#' newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : num 3 3 3 3 3 3 3 3 3 3 ...
#'
#' }
#'
#' @export
lgb.prepare_rules <- function(data, rules = NULL) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
set(data, j = i, value = unname(rules[[i]][data[[i]]]))
data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
}
} else {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- numeric(length(mini_unique))
mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
set(data, j = i, value = unname(rules[[indexed]][mini_data]))
}
}
}
} else {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
}
} else {
# Default routine (data.frame)
if (inherits(data, "data.frame")) {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- numeric(length(mini_unique))
mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])
}
}
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
}
}
}
return(list(data = data, rules = rules))
}
#' Data preparator for LightGBM datasets with rules (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(iris)
#'
#' str(iris)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
#'
#' new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
#' str(new_iris$data)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
#' # Warning message:
#' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
#' # invalid factor level, NA generated
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#' # 1 5.1 3.5 1.4 0.2 0
#'
#' newer_iris$data[1, 5] <- 1 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#' # [1] TRUE
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3,
#' "versicolor" = 2,
#' "virginica" = 1))
#' newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : num 3 3 3 3 3 3 3 3 3 3 ...
#'
#' }
#'
#' @importFrom data.table set
#' @export
lgb.prepare_rules <- function(data, rules = NULL) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
}
} else {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- numeric(length(mini_unique))
mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
}
}
}
} else {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
}
} else {
# Default routine (data.frame)
if (inherits(data, "data.frame")) {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- numeric(length(mini_unique))
mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])
}
}
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
}
}
}
return(list(data = data, rules = rules))
}
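Aside: the substantive change in this file is importing `set` from data.table (`@importFrom data.table set`) and calling it as `data.table::set()`, so the call resolves even when data.table is not attached and `R CMD check` does not complain. A minimal sketch of the same pattern, with a hypothetical helper name:

```r
# Hypothetical helper illustrating the namespaced data.table::set() pattern
#' @importFrom data.table set
double_column <- function(dt, col) {
  # set() modifies the data.table by reference; the data.table:: prefix
  # works even if the package is only imported, not attached
  data.table::set(dt, j = col, value = dt[[col]] * 2)
  dt
}

# dt <- data.table::data.table(x = 1:3)
# double_column(dt, "x")   # x is now 2, 4, 6
```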


@ -1,197 +1,198 @@
#' Data preparator for LightGBM datasets with rules (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(iris)
#'
#' str(iris)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
#'
#' new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
#' str(new_iris$data)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
#' # Warning message:
#' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
#' # invalid factor level, NA generated
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#' # 1 5.1 3.5 1.4 0.2 0
#'
#' newer_iris$data[1, 5] <- 1 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#' # [1] TRUE
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3L,
#' "versicolor" = 2L,
#' "virginica" = 1L))
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : int 3 3 3 3 3 3 3 3 3 3 ...
#'
#' }
#'
#' @export
lgb.prepare_rules2 <- function(data, rules = NULL) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
set(data, j = i, value = unname(rules[[i]][data[[i]]]))
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
}
} else {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
set(data, j = i, value = unname(rules[[indexed]][mini_data]))
}
}
}
} else {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
}
} else {
# Default routine (data.frame)
if (inherits(data, "data.frame")) {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])
}
}
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
}
}
}
return(list(data = data, rules = rules))
}
#' Data preparator for LightGBM datasets with rules (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(iris)
#'
#' str(iris)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
#'
#' new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
#' str(new_iris$data)
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
#' # Warning message:
#' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
#' # invalid factor level, NA generated
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
#' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#' # 1 5.1 3.5 1.4 0.2 0
#'
#' newer_iris$data[1, 5] <- 1 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#' # [1] TRUE
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3L,
#' "versicolor" = 2L,
#' "virginica" = 1L))
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : int 3 3 3 3 3 3 3 3 3 3 ...
#'
#' }
#'
#' @importFrom data.table set
#' @export
lgb.prepare_rules2 <- function(data, rules = NULL) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
}
} else {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
}
}
}
} else {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
}
} else {
# Default routine (data.frame)
if (inherits(data, "data.frame")) {
# Get data classes
list_classes <- vapply(data, class, character(1))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])
}
}
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
}
}
}
return(list(data = data, rules = rules))
}


@ -1,39 +1,28 @@
#' @title Main training logic for LightGBM
#' @name lgb.train
#' @param params List of parameters
#' @param data a \code{lgb.Dataset} object, used for training
#' @param nrounds number of training rounds
#' @description Logic to train with LightGBM
#' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param boosting boosting type. \code{gbdt}, \code{dart}
#' @param num_leaves number of leaves in one tree. defaults to 127
#' @param max_depth Limit the max depth for tree model. This is used to deal with overfit when #data is small.
#' Tree still grow by leaf-wise.
#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).
#' @param eval evaluation function, can be (a list of) character or custom eval function
#' @param verbose verbosity for output, if <= 0, also will disable the print of evalutaion during training
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param eval_freq evalutaion output frequency, only effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param early_stopping_rounds int
#' Activates early stopping.
#' The model will train until the validation score stops improving.
#' Validation score needs to improve at least every early_stopping_rounds round(s) to continue training.
#' Requires at least one validation data and one metric.
#' If there's more than one, will check all of them. But the training data is ignored anyway.
#' Returns the model with (best_iter + early_stopping_rounds).
#' If early stopping occurs, the model will have 'best_iter' field
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param ... other parameters, see Parameters.rst for more information
#'
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
#' \item{num_leaves}{number of leaves in one tree. defaults to 127}
#' \item{max_depth}{Limit the max depth for tree model. This is used to deal with
#' overfit when #data is small. Tree still grow by leaf-wise.}
#' \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#' @return a trained booster model \code{lgb.Booster}.
#'
#' @examples
@ -56,8 +45,6 @@
#' early_stopping_rounds = 10)
#' }
#'
#' @rdname lgb.train
#'
#' @export
lgb.train <- function(params = list(),
data,


@ -1,7 +1,51 @@
#' Simple interface for training an lightgbm model.
#' Its documentation is combined with lgb.train.
#'
#' @rdname lgb.train
#' @name lgb_shared_params
#' @title Shared parameter docs
#' @description Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param data a \code{lgb.Dataset} object, used for training
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
#' If there's more than one, will check all of them except the training data
#' Returns the model with (best_iter + early_stopping_rounds)
#' If early stopping occurs, the model will have 'best_iter' field
#' @param eval_freq evaluation output frequency, only effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds
#' @param params List of parameters
#' @param verbose verbosity for output, if <= 0, also will disable the print of evaluation during training
NULL
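Aside: `lgb_shared_params` is a documentation-only object; roxygen2 attaches the shared `@param` entries to it, and `lgb.train`, `lgb.cv`, and `lightgbm` pull them in with `@inheritParams lgb_shared_params`, as the hunks above show. A stripped-down sketch of the same pattern with hypothetical names:

```r
#' @name shared_args
#' @title Shared parameter docs (illustrative only)
#' @param nrounds number of training rounds
#' @param verbose verbosity for output
NULL

#' @title Toy trainer (illustrative only)
#' @description Inherits its parameter docs from the block above.
#' @inheritParams shared_args
#' @export
toy_train <- function(nrounds = 10, verbose = 1) {
  invisible(NULL)
}
```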
#' @title Train a LightGBM model
#' @name lightgbm
#' @description Simple interface for training an LightGBM model.
#' @inheritParams lgb_shared_params
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param save_name File name to use when writing the trained model to disk. Should end in ".model".
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{
#' \item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
#' \item{obj}{objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
#' \item{eval}{evaluation function, can be (a list of) character or custom eval function}
#' \item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
#' \item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
#' \item{categorical_feature}{list of str or int. type int represents index, type str represents feature names}
#' \item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets}
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
#' \item{num_leaves}{number of leaves in one tree. defaults to 127}
#' \item{max_depth}{Limit the max depth for tree model. This is used to deal with
#' overfit when #data is small. Tree still grow by leaf-wise.}
#' \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#' @export
lightgbm <- function(data,
label = NULL,
@ -122,7 +166,7 @@ NULL
# Various imports
#' @import methods
#' @importFrom R6 R6Class
#' @useDynLib lib_lightgbm
#' @useDynLib lib_lightgbm , .registration = TRUE
NULL
# Suppress false positive warnings from R CMD CHECK about


@ -1,6 +1,6 @@
#' readRDS for lgb.Booster models
#'
#' Attemps to load a model using RDS.
#' Attempts to load a model using RDS.
#'
#' @param file a connection or the name of the file where the R object is saved to or read from.
#' @param refhook a hook function for handling reference objects.


@ -1,6 +1,6 @@
#' saveRDS for lgb.Booster models
#'
#' Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
#'
#' @param object R object to serialize.
#' @param file a connection or the name of the file where the R object is saved to or read from.


@ -22,30 +22,28 @@ For users who wants to install online with GPU or want to choose a specific comp
**Warning for Windows users**: it is recommended to use *Visual Studio* for its better multi-threading efficiency in Windows for many core systems. For very simple systems (dual core computers or worse), MinGW64 is recommended for maximum performance. If you do not know what to choose, it is recommended to use [Visual Studio](https://visualstudio.microsoft.com/downloads/), the default compiler. **Do not try using MinGW in Windows on many core systems. It may result in 10x slower results than Visual Studio.**
#### macOS Preparation
#### Mac OS Preparation
You can perform installation either with **Apple Clang** or **gcc**. In case you prefer **Apple Clang**, you should install **OpenMP** (details for installation can be found in [Installation Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#apple-clang)) first and **CMake** version 3.12 or higher is required. In case you prefer **gcc**, you need to install it (details for installation can be found in [Installation Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#gcc)) and specify compilers by running ``export CXX=g++-7 CC=gcc-7`` (replace "7" with version of **gcc** installed on your machine) first.
### Install
Mac users may need to set some environment variables to tell R to use `gcc` and `g++`. If you install these from Homebrew, your versions of `g++` and `gcc` are most likely in `/usr/local/bin`, as shown below.
Install LightGBM R-package with the following command:
```sh
git clone --recursive https://github.com/Microsoft/LightGBM
cd LightGBM/R-package
# export CXX=g++-7 CC=gcc-7 # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine)
R CMD INSTALL --build . --no-multiarch
```
# replace 8 with version of gcc installed on your machine
export CXX=/usr/local/bin/g++-8 CC=/usr/local/bin/gcc-8
```
Or build a self-contained R-package which can be installed afterwards:
### Install
Build and install R-package with the following commands:
```sh
git clone --recursive https://github.com/Microsoft/LightGBM
cd LightGBM/R-package
Rscript build_package.R
# export CXX=g++-7 CC=gcc-7 # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine)
R CMD INSTALL lightgbm_2.1.1.tar.gz --no-multiarch
```
cd LightGBM
Rscript build_r.R
```
The `build_r.R` script builds the package in a temporary directory called `lightgbm_r`. It will destroy and recreate that directory each time you run the script.
Note: for the build with Visual Studio/MSBuild in Windows, you should use the Windows CMD or Powershell.
@ -53,15 +51,7 @@ Windows users may need to run with administrator rights (either R or the command
Set `use_gpu` to `TRUE` in `R-package/src/install.libs.R` to enable the build with GPU support. You will need to install Boost and OpenCL first: details for installation can be found in [Installation-Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-gpu-version).
You can also install directly from R using the repository with `devtools`:
```r
library(devtools)
options(devtools.install.args = "--no-multiarch") # if you have 64-bit R only, you can skip this
install_github("Microsoft/LightGBM", subdir = "R-package")
```
If you are using a precompiled dll/lib locally, you can move the dll/lib into LightGBM root folder, modify `LightGBM/R-package/src/install.libs.R`'s 2nd line (change `use_precompile <- FALSE` to `use_precompile <- TRUE`), and install R-package as usual. **NOTE: If your R version is not smaller than 3.5.0, you should set `DUSE_R35=ON` in CMake options when build precompiled dll/lib**.
If you are using a precompiled dll/lib locally, you can move the dll/lib into LightGBM root folder, modify `LightGBM/R-package/src/install.libs.R`'s 2nd line (change `use_precompile <- FALSE` to `use_precompile <- TRUE`), and install R-package as usual. **NOTE: If your R version is not smaller than 3.5.0, you should set `DUSE_R35=ON` in cmake options when build precompiled dll/lib**.
When your package installation is done, you can check quickly if your LightGBM R-package is working by running the following:
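The diff cuts off before the README's actual snippet here; as an illustrative stand-in (not the README's verbatim example), a quick sanity check based on the bundled agaricus data and the `lgb.train` example elsewhere in this commit could look like:

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.train(params, dtrain, 10, min_data = 1, learning_rate = 1)
```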


@ -1,22 +0,0 @@
unlink("./src/include", recursive = TRUE)
unlink("./src/src", recursive = TRUE)
unlink("./src/compute", recursive = TRUE)
unlink("./src/build", recursive = TRUE)
unlink("./src/Release", recursive = TRUE)
if (!file.copy("./../include", "./src/", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find folder LightGBM/include")
}
if (!file.copy("./../src", "./src/", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find folder LightGBM/src")
}
if (!file.copy("./../compute", "./src/", overwrite = TRUE, recursive = TRUE)) {
print("Cannot find folder LightGBM/compute, will disable GPU build")
}
if (!file.copy("./../CMakeLists.txt", "./src/", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find file LightGBM/CMakeLists.txt")
}
if (!file.exists("./src/_IS_FULL_PACKAGE")) {
file.create("./src/_IS_FULL_PACKAGE")
}
system("R CMD build --no-build-vignettes .")
file.remove("./src/_IS_FULL_PACKAGE")


@ -1,10 +1,11 @@
basic_walkthrough Basic feature walkthrough
boost_from_prediction Boosting from existing prediction
categorical_feature_prepare Categorical Feature Preparation
categorical_feature_rules Categorical Feature Preparation with Rules
categorical_features_prepare Categorical Feature Preparation
categorical_features_rules Categorical Feature Preparation with Rules
cross_validation Cross Validation
early_stopping Early Stop in training
efficient_many_training Efficiency for Many Model Trainings
multiclass Multiclass training/prediction
multiclass_custom_objective Multiclass with Custom Objective Function
leaf_stability Leaf (in)Stability example
weight_param Weight-Parameter adjustment relationship


R-package/man/lgb.cv.Rd (new file)

@ -0,0 +1,97 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.cv.R
\name{lgb.cv}
\alias{lgb.cv}
\title{Main CV logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
}
\arguments{
\item{params}{List of parameters}
\item{data}{a \code{lgb.Dataset} object, used for training}
\item{nrounds}{number of training rounds}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
\item{weight}{vector of response values. If not NULL, will set to dataset}
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (list of) character or custom eval function}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
by the values of outcome labels.}
\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
(each element must be a vector of test fold's indices). When folds are supplied,
the \code{nfold} and \code{stratified} parameters are ignored.}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
\item{categorical_feature}{list of str or int
type int represents index,
type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{
\item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
\item{num_leaves}{number of leaves in one tree. defaults to 127}
\item{max_depth}{Limit the max depth for tree model. This is used to deal with
overfit when #data is small. Tree still grow by leaf-wise.}
\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
the number of real CPU cores, not the number of threads (most
CPU using hyper-threading to generate 2 threads per CPU core).}
}}
}
\value{
a trained model \code{lgb.CVBooster}.
}
\description{
Cross validation logic used by LightGBM
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
dtrain,
10,
nfold = 5,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
}
}


@ -9,7 +9,7 @@ lgb.model.dt.tree(model, num_iteration = NULL)
\arguments{
\item{model}{object of class \code{lgb.Booster}}
\item{num_iteration}{number of iterations you want to predict with. NULL or
\item{num_iteration}{number of iterations you want to predict with. NULL or
<= 0 means use best iteration}
}
\value{
@ -26,7 +26,7 @@ The columns of the \code{data.table} are:
\item \code{leaf_index}: ID of a leaf in a tree (integer)
\item \code{leaf_parent}: ID of the parent node for current leaf (integer)
\item \code{split_gain}: Split gain of a node
\item \code{threshold}: Spliting threshold value of a node
\item \code{threshold}: Splitting threshold value of a node
\item \code{decision_type}: Decision type of a node
\item \code{default_left}: Determine how to handle NA value, TRUE -> Left, FALSE -> Right
\item \code{internal_value}: Node value


@ -1,122 +1,35 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.cv.R, R/lgb.train.R, R/lightgbm.R
\name{lgb.cv}
\alias{lgb.cv}
% Please edit documentation in R/lgb.train.R
\name{lgb.train}
\alias{lgb.train}
\alias{lightgbm}
\title{Main CV logic for LightGBM}
\title{Main training logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(), ...)
lgb.train(params = list(), data, nrounds = 10, valids = list(),
obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
init_model = NULL, colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE,
...)
lightgbm(data, label = NULL, weight = NULL, params = list(),
nrounds = 10, verbose = 1, eval_freq = 1L,
early_stopping_rounds = NULL, save_name = "lightgbm.model",
init_model = NULL, callbacks = list(), ...)
}
\arguments{
\item{params}{List of parameters}
\item{data}{a \code{lgb.Dataset} object, used for CV}
\item{nrounds}{number of CV rounds}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
\item{weight}{vector of response values. If not NULL, will set to dataset}
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (list of) character or custom eval function}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evalutaion during training}
\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
\item{eval_freq}{evalutaion output frequence, only effect when verbose > 0}
\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
by the values of outcome labels.}
\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
(each element must be a vector of test fold's indices). When folds are supplied,
the \code{nfold} and \code{stratified} parameters are ignored.}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue train from this model}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
\item{categorical_feature}{list of str or int
type int represents index,
type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
CV score needs to improve at least every early_stopping_rounds round(s) to continue.
Requires at least one metric.
If there's more than one, will check all of them.
Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{...}{other parameters, see Parameters.rst for more informations}
\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{boosting}{boosting type. \code{gbdt}, \code{dart}}
\item{num_leaves}{number of leaves in one tree. defaults to 127}
\item{max_depth}{Limit the max depth for tree model. This is used to deal with overfit when #data is small.
Tree still grow by leaf-wise.}
\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).}
\item{params}{List of parameters}
\item{data}{a \code{lgb.Dataset} object, used for training}
\item{nrounds}{number of training rounds}
\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{boosting}{boosting type. \code{gbdt}, \code{dart}}
\item{num_leaves}{number of leaves in one tree. defaults to 127}
\item{max_depth}{Limit the max depth for tree model. This is used to deal with overfit when #data is small.
Tree still grow by leaf-wise.}
\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).}
\item{eval}{evaluation function, can be (a list of) character or custom eval function}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evalutaion during training}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
\item{eval_freq}{evalutaion output frequency, only effect when verbose > 0}
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}
@ -128,26 +41,32 @@ type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
The model will train until the validation score stops improving.
Validation score needs to improve at least every early_stopping_rounds round(s) to continue training.
Requires at least one validation data and one metric.
If there's more than one, will check all of them. But the training data is ignored anyway.
Returns the model with (best_iter + early_stopping_rounds).
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{...}{other parameters, see Parameters.rst for more informations}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{
\item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
\item{num_leaves}{number of leaves in one tree. Defaults to 127}
\item{max_depth}{Limit the max depth of the tree model. This is used to deal with
overfitting when #data is small. The tree still grows leaf-wise.}
\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
the number of real CPU cores, not the number of threads (most
CPUs use hyper-threading to generate 2 threads per CPU core).}
}}
}
\value{
a trained model \code{lgb.CVBooster}.
a trained booster model \code{lgb.Booster}.
}
\description{
Simple interface for training an lightgbm model.
Its documentation is combined with lgb.train.
Logic to train with LightGBM
}
\examples{
\dontrun{
@ -155,20 +74,6 @@ library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
dtrain,
10,
nfold = 5,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
}
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
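
A minimal sketch (not part of this commit) of how such an example typically continues, training with the validation set created above and early stopping; parameter values are illustrative:

params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
                   dtrain,
                   10,
                   valids = valids,
                   min_data = 1,
                   learning_rate = 1,
                   early_stopping_rounds = 10)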


@ -0,0 +1,31 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lightgbm.R
\name{lgb_shared_params}
\alias{lgb_shared_params}
\title{Shared parameter docs}
\arguments{
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{data}{a \code{lgb.Dataset} object, used for training}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{eval_freq}{evaluation output frequency; only has an effect when verbose > 0}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}
\item{nrounds}{number of training rounds}
\item{params}{List of parameters}
\item{verbose}{verbosity for output; if <= 0, printing of evaluation during training is also disabled}
}
\description{
Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
}
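
A rough sketch of how a shared documentation topic like this is typically wired up with roxygen2; the tags and the stub function below are illustrative, not copied from the package source:

# documentation-only block in R/lightgbm.R
#' Shared parameter docs
#' @name lgb_shared_params
#' @param nrounds number of training rounds
#' @param params List of parameters
NULL

# other functions then inherit those entries
#' Train a LightGBM model
#' @inheritParams lgb_shared_params
#' @export
lightgbm <- function(data, params = list(), nrounds = 10, ...) {
  NULL  # body omitted in this sketch
}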

R-package/man/lightgbm.Rd Normal file

@ -0,0 +1,64 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lightgbm.R
\name{lightgbm}
\alias{lightgbm}
\title{Train a LightGBM model}
\usage{
lightgbm(data, label = NULL, weight = NULL, params = list(),
nrounds = 10, verbose = 1, eval_freq = 1L,
early_stopping_rounds = NULL, save_name = "lightgbm.model",
init_model = NULL, callbacks = list(), ...)
}
\arguments{
\item{data}{a \code{lgb.Dataset} object, used for training}
\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}}
\item{weight}{vector of sample weights. If not NULL, will be set on the dataset}
\item{params}{List of parameters}
\item{nrounds}{number of training rounds}
\item{verbose}{verbosity for output; if <= 0, printing of evaluation during training is also disabled}
\item{eval_freq}{evaluation output frequency; only has an effect when verbose > 0}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
\itemize{
\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}}
\item{eval}{evaluation function, can be (a list of) character or custom eval function}
\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
\item{categorical_feature}{list of str or int. type int represents index, type str represents feature names}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
into a predictor model which frees up memory and the original datasets}
\item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
\item{num_leaves}{number of leaves in one tree. Defaults to 127}
\item{max_depth}{Limit the max depth of the tree model. This is used to deal with
overfitting when #data is small. The tree still grows leaf-wise.}
\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
the number of real CPU cores, not the number of threads (most
CPUs use hyper-threading to generate 2 threads per CPU core).}
}}
}
\description{
Simple interface for training a LightGBM model.
}
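
A minimal usage sketch for this interface, using the bundled agaricus data; parameter values are illustrative, not taken from this commit:

library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
bst <- lightgbm(data = train$data,
                label = train$label,
                params = list(objective = "binary", metric = "binary_logloss"),
                nrounds = 10,
                min_data = 1)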


@ -5,8 +5,8 @@
\title{Predict method for LightGBM model}
\usage{
\method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE,
...)
rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE,
header = FALSE, reshape = FALSE, ...)
}
\arguments{
\item{object}{Object of class \code{lgb.Booster}}
@ -21,6 +21,8 @@ logistic regression would result in predictions for log-odds instead of probabil
\item{predleaf}{whether predict leaf index instead.}
\item{predcontrib}{return per-feature contributions for each record.}
\item{header}{only used for prediction for text file. True if text file has header}
\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
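
As a usage note for the new predcontrib argument: with reshape = TRUE it returns one column per feature plus a final bias column, and the rows should add up to the raw scores. A hedged sketch, assuming a fitted booster bst and a feature matrix x:

# per-feature contributions; the last column is the bias term
contrib <- predict(bst, x, predcontrib = TRUE, reshape = TRUE)
raw <- predict(bst, x, rawscore = TRUE)
# contributions (including the bias column) should reproduce the raw prediction
all.equal(rowSums(contrib), as.numeric(raw))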


@ -15,7 +15,7 @@ readRDS.lgb.Booster(file = "", refhook = NULL)
lgb.Booster.
}
\description{
Attemps to load a model using RDS.
Attempts to load a model using RDS.
}
\examples{
\dontrun{


@ -26,7 +26,7 @@ saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
NULL invisibly.
}
\description{
Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
}
\examples{
\dontrun{


@ -21,7 +21,7 @@ constructed sub dataset
}
\description{
Get a new \code{lgb.Dataset} containing the specified rows of
orginal lgb.Dataset object
original lgb.Dataset object
}
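
A short usage sketch of the slicing described here, assuming the slice() helper exported by the package and a constructed dtrain; illustrative, not taken from this commit:

dsub <- slice(dtrain, seq_len(42))
lgb.Dataset.construct(dsub)
dim(dsub)  # 42 rows, same columns as dtrain
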
\examples{
\dontrun{

R-package/src/R_init.c Normal file

@ -0,0 +1,12 @@
// Register Dynamic Symbols
#include <R.h>
#include <Rinternals.h>
#include <R_ext/Rdynload.h>
#include "R_init.h"
void R_init_lightgbm(DllInfo* info) {
R_registerRoutines(info, NULL, NULL, NULL, NULL);
R_useDynamicSymbols(info, TRUE);
}

R-package/src/R_init.h Normal file

@ -0,0 +1,7 @@
// Register Dynamic Symbols
#ifndef R_INIT_LIGHTGBM_H
#define R_INIT_LIGHTGBM_H

#include <R_ext/Rdynload.h>

void R_init_lightgbm(DllInfo* info);
#endif // R_INIT_LIGHTGBM_H


@ -14,6 +14,12 @@ if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
|| R_int_UUID == "2fdf6c18-697a-4ba7-b8ef-11c0d92f1327")){
print("Warning: unmatched R_INTERNALS_UUID, may cannot run normally.")
}
# Move in CMakeLists.txt
if (!file.copy("../inst/bin/CMakeLists.txt", "CMakeLists.txt", overwrite = TRUE)){
stop("Copying CMakeLists failed")
}
# Check for precompilation
if (!use_precompile) {
@ -21,26 +27,6 @@ if (!use_precompile) {
source_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/")
setwd(source_dir)
if (!file.exists("_IS_FULL_PACKAGE")) {
unlink("./include", recursive = TRUE)
unlink("./src", recursive = TRUE)
unlink("./compute", recursive = TRUE)
unlink("./build", recursive = TRUE)
if (!file.copy("./../../include", "./", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find folder LightGBM/include")
}
if (!file.copy("./../../src", "./", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find folder LightGBM/src")
}
if (!file.copy("./../../compute", "./", overwrite = TRUE, recursive = TRUE)) {
print("Cannot find folder LightGBM/compute, disabling GPU build.")
use_gpu <- FALSE
}
if (!file.copy("./../../CMakeLists.txt", "./", overwrite = TRUE, recursive = TRUE)) {
stop("Cannot find file LightGBM/CMakeLists.txt")
}
}
# Prepare building package
build_dir <- file.path(source_dir, "build", fsep = "/")
dir.create(build_dir, recursive = TRUE, showWarnings = FALSE)


@ -62,7 +62,7 @@ test_that("lgb.Dataset: colnames", {
test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
nr <- 1000
x <- rsparsematrix(nr, 100, density=0.0005)
x <- Matrix::rsparsematrix(nr, 100, density=0.0005)
# we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr)
dtest <- lgb.Dataset(x)

build_r.R Normal file

@ -0,0 +1,82 @@
# for macOS (replace 8 with the version of gcc installed on your machine)
# NOTE: your gcc / g++ from Homebrew is probably in /usr/local/bin
#export CXX=/usr/local/bin/g++-8 CC=/usr/local/bin/gcc-8
# Sys.setenv("CXX" = "/usr/local/bin/g++-8")
# Sys.setenv("CC" = "/usr/local/bin/gcc-8")
# R returns FALSE (not a non-zero exit code) if a file copy operation
# breaks. Let's fix that
.handle_result <- function(res){
if (!res){
stop("Copying files failed!")
}
}
# Make a new temporary folder to work in
unlink(x = "lightgbm_r", recursive = TRUE)
dir.create("lightgbm_r")
# copy in the relevant files
result <- file.copy(
from = "R-package/./"
, to = "lightgbm_r/"
, recursive = TRUE
, overwrite = TRUE
)
.handle_result(result)
result <- file.copy(
from = "include/"
, to = file.path("lightgbm_r", "src/")
, recursive = TRUE
, overwrite = TRUE
)
.handle_result(result)
result <- file.copy(
from = "src/"
, to = file.path("lightgbm_r", "src/")
, recursive = TRUE
, overwrite = TRUE
)
.handle_result(result)
result <- file.copy(
from = "CMakeLists.txt"
, to = file.path("lightgbm_r", "inst", "bin/")
, recursive = TRUE
, overwrite = TRUE
)
.handle_result(result)
# rebuild documentation
devtools::document(
pkg = "lightgbm_r/"
)
# Build the package
# NOTE: --keep-empty-dirs is necessary to keep the deep paths expected
# by CMake while also meeting the CRAN req to create object files
# on demand
devtools::build(
pkg = "lightgbm_r"
, args = c("--keep-empty-dirs")
)
# Install the package
version <- gsub(
"Version: "
, ""
, grep(
"Version: "
, readLines(con = file.path("lightgbm_r", "DESCRIPTION"))
, value = TRUE
)
)
tarball <- file.path(getwd(), sprintf("lightgbm_%s.tar.gz", version))
system(sprintf("R CMD INSTALL %s --no-multi-arch", tarball))
# Run R CMD CHECK
#R CMD CHECK lightgbm_2.1.2.tar.gz --as-cran | tee check.log | cat
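
The commented-out check step can also be run from inside R with base tools only; a sketch using the tarball path built above (flags mirror the comment, output captured to check.log):

check_log <- system2(
  "R"
  , args = c("CMD", "check", tarball, "--as-cran")
  , stdout = TRUE
  , stderr = TRUE
)
writeLines(check_log, "check.log")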


@ -1096,7 +1096,7 @@ void GPUTreeLearner::FindBestSplits() {
void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
#if GPU_DEBUG >= 2
printf("Spliting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
#endif
SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
if (Network::num_machines() == 1) {