Mirror of https://github.com/microsoft/LightGBM.git
[R-package] CRAN fixes (#1499)
* Fixed typos in docs
* Fixed inconsistencies in documentation
* Updated strategy for registering routines
* Fixed issues caused by smashing multiple functions into one Rd
* Fixed issues with documentation
* Removed VignetteBuilder and updated Rbuildignore
* Added R build artefacts to gitignore
* Added namespacing on data.table set function. Updated handling of CMakeLists file to get around CRAN check.
* Updated build instructions
* Added R build script
* Removed build_r.sh script and updated R-package install instructions
This commit is contained in:
Parent: 80a9a9419c
Commit: eded794efb

.gitignore
@@ -382,3 +382,11 @@ lightgbm.model
 # duplicate version file
 python-package/lightgbm/VERSION.txt
 .Rproj.user
+
+# R build artefacts
+R-package/src/CMakeLists.txt
+R-package/src/lib_lightgbm.so.dSYM/
+R-package/src/src/
+lightgbm_r/*
+lightgbm*.tar.gz
+lightgbm.Rcheck/

R-package/.Rbuildignore
@@ -1 +1,12 @@
 ^build_package.R$
+\.gitkeep$
+
+# Objects created by compilation
+\.o$
+\.so$
+\.dll$
+\.out$
+\.bin$
+
+# Code copied in at build time
+^src/CMakeLists.txt$

R-package/DESCRIPTION
@@ -7,7 +7,7 @@ Authors@R: c(
     person("Guolin", "Ke", email = "guolin.ke@microsoft.com", role = c("aut", "cre")),
     person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("ctb")),
     person("Yachen", "Yan", role = c("ctb")),
-    person("James", "Lamb", role = c("ctb"))
+    person("James", "Lamb", email="james.lamb@uptake.com", role = c("ctb"))
     )
 Description: Tree based algorithms can be improved by introducing boosting frameworks. LightGBM is one such framework, and this package offers an R interface to work with it.
     It is designed to be distributed and efficient with the following advantages:

@@ -21,7 +21,6 @@ Description: Tree based algorithms can be improved by introducing boosting frame
 License: MIT + file LICENSE
 URL: https://github.com/Microsoft/LightGBM
 BugReports: https://github.com/Microsoft/LightGBM/issues
-VignetteBuilder: knitr
 Suggests:
     Ckmeans.1d.dp (>= 3.3.1),
     DiagrammeR (>= 0.8.1),

@@ -33,7 +32,7 @@ Suggests:
     testthat,
     vcd (>= 1.3)
 Depends:
-    R (>= 3.0),
+    R (>= 3.4),
     R6 (>= 2.0)
 Imports:
     data.table (>= 1.9.6),

R-package/NAMESPACE
@@ -49,4 +49,4 @@ importFrom(magrittr,"%T>%")
 importFrom(magrittr,extract)
 importFrom(magrittr,inset)
 importFrom(methods,is)
-useDynLib(lib_lightgbm)
+useDynLib(lib_lightgbm , .registration = TRUE)

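Registering native routines (the `.registration = TRUE` above) is what recent CRAN checks look for. A hedged sketch of how to confirm the registration from R after installing the package; the DLL name `lib_lightgbm` is taken from the NAMESPACE line above, and the output will vary by version:

```r
# Sketch: list the .Call entry points registered by the lib_lightgbm DLL
# (assumes the package is installed, which loads the DLL).
library(lightgbm)
routines <- getDLLRegisteredRoutines("lib_lightgbm")
head(names(routines$.Call))
```
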
R-package/R/callback.R
@@ -1,4 +1,5 @@
-CB_ENV <- R6Class(
+#' @importFrom R6 R6Class
+CB_ENV <- R6::R6Class(
   "lgb.cb_env",
   cloneable = FALSE,
   public = list(

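The same change repeats for every class below: qualifying `R6Class()` with its namespace (plus the `@importFrom R6 R6Class` tag) lets the package keep R6 in `Imports` without attaching it. A minimal self-contained sketch of the pattern; `Counter` is a made-up class, not part of LightGBM:

```r
# Hypothetical example class using the namespace-qualified call
# (requires the R6 package to be installed, but not attached).
Counter <- R6::R6Class(
  classname = "Counter",
  cloneable = FALSE,
  public = list(
    n = 0,
    add = function(x = 1) {
      self$n <- self$n + x  # R6 objects have reference semantics
      invisible(self)       # return self so calls can be chained
    }
  )
)

Counter$new()$add(2)$n  # 2
```
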
R-package/R/lgb.Booster.R
@@ -1,4 +1,5 @@
-Booster <- R6Class(
+#' @importFrom R6 R6Class
+Booster <- R6::R6Class(
   classname = "lgb.Booster",
   cloneable = FALSE,
   public = list(

@@ -654,13 +655,15 @@ Booster <- R6Class(
 #'
 #' @rdname predict.lgb.Booster
 #' @export
-predict.lgb.Booster <- function(object, data,
-                                num_iteration = NULL,
-                                rawscore = FALSE,
-                                predleaf = FALSE,
-                                predcontrib = FALSE,
-                                header = FALSE,
-                                reshape = FALSE, ...) {
+predict.lgb.Booster <- function(object,
+                                data,
+                                num_iteration = NULL,
+                                rawscore = FALSE,
+                                predleaf = FALSE,
+                                predcontrib = FALSE,
+                                header = FALSE,
+                                reshape = FALSE,
+                                ...) {
 
   # Check booster existence
   if (!lgb.is.Booster(object)) {

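A hedged usage sketch for the reformatted signature; `model` and `x_test` are placeholder names for a fitted booster and a test matrix:

```r
# Each flag below is an argument from the signature above.
preds  <- predict(model, x_test)                   # transformed scores
raw    <- predict(model, x_test, rawscore = TRUE)  # raw margin scores
leaves <- predict(model, x_test, predleaf = TRUE)  # per-tree leaf indices
```
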
R-package/R/lgb.Dataset.R
@@ -1,6 +1,8 @@
 
 #' @importFrom methods is
-Dataset <- R6Class(
+#' @importFrom R6 R6Class
+Dataset <- R6::R6Class(
+
   classname = "lgb.Dataset",
   cloneable = FALSE,
   public = list(

@@ -854,8 +856,8 @@ dimnames.lgb.Dataset <- function(x) {
 #' Slice a dataset
 #'
 #' Get a new \code{lgb.Dataset} containing the specified rows of
-#' orginal lgb.Dataset object
+#' original lgb.Dataset object
 #'
 #' @param dataset Object of class "lgb.Dataset"
 #' @param idxset a integer vector of indices of rows needed
 #' @param ... other parameters (currently not used)

R-package/R/lgb.Predictor.R
@@ -1,6 +1,8 @@
 
 #' @importFrom methods is
-Predictor <- R6Class(
+#' @importFrom R6 R6Class
+Predictor <- R6::R6Class(
+
   classname = "lgb.Predictor",
   cloneable = FALSE,
   public = list(

R-package/R/lgb.cv.R
@@ -1,4 +1,5 @@
-CVBooster <- R6Class(
+#' @importFrom R6 R6Class
+CVBooster <- R6::R6Class(
   classname = "lgb.CVBooster",
   cloneable = FALSE,
   public = list(

@@ -17,46 +18,39 @@ CVBooster <- R6Class(
 )
 
 #' @title Main CV logic for LightGBM
+#' @description Cross validation logic used by LightGBM
 #' @name lgb.cv
-#' @param params List of parameters
-#' @param data a \code{lgb.Dataset} object, used for CV
-#' @param nrounds number of CV rounds
+#' @inheritParams lgb_shared_params
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
 #' @param label vector of response values. Should be provided only when data is an R-matrix.
 #' @param weight vector of response values. If not NULL, will set to dataset
 #' @param obj objective function, can be character or custom objective function. Examples include
 #'            \code{regression}, \code{regression_l1}, \code{huber},
 #'            \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
-#' @param boosting boosting type. \code{gbdt}, \code{dart}
-#' @param num_leaves number of leaves in one tree. defaults to 127
-#' @param max_depth Limit the max depth for tree model. This is used to deal with overfit when #data is small.
-#'                  Tree still grow by leaf-wise.
-#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).
 #' @param eval evaluation function, can be (list of) character or custom eval function
-#' @param verbose verbosity for output, if <= 0, also will disable the print of evalutaion during training
 #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
-#' @param eval_freq evalutaion output frequence, only effect when verbose > 0
 #' @param showsd \code{boolean}, whether to show standard deviation of cross validation
 #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
 #'                   by the values of outcome labels.
 #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
 #'              (each element must be a vector of test fold's indices). When folds are supplied,
 #'              the \code{nfold} and \code{stratified} parameters are ignored.
-#' @param init_model path of model file of \code{lgb.Booster} object, will continue train from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
 #' @param categorical_feature list of str or int
 #'                            type int represents index,
 #'                            type str represents feature names
-#' @param early_stopping_rounds int
-#'                              Activates early stopping.
-#'                              CV score needs to improve at least every early_stopping_rounds round(s) to continue.
-#'                              Requires at least one metric.
-#'                              If there's more than one, will check all of them.
-#'                              Returns the model with (best_iter + early_stopping_rounds).
-#'                              If early stopping occurs, the model will have 'best_iter' field
 #' @param callbacks list of callback functions
 #'                  List of callback functions that are applied at each iteration.
-#' @param ... other parameters, see Parameters.rst for more informations
+#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
+#'            \itemize{
+#'                \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
+#'                \item{num_leaves}{number of leaves in one tree. defaults to 127}
+#'                \item{max_depth}{Limit the max depth for tree model. This is used to deal with
+#'                                 overfit when #data is small. Tree still grow by leaf-wise.}
+#'                \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
+#'                                   the number of real CPU cores, not the number of threads (most
+#'                                   CPU using hyper-threading to generate 2 threads per CPU core).}
+#'            }
 #'
 #' @return a trained model \code{lgb.CVBooster}.
 #'

@@ -75,7 +69,6 @@ CVBooster <- R6Class(
 #'                 learning_rate = 1,
 #'                 early_stopping_rounds = 10)
 #' }
-#' @rdname lgb.train
 #' @export
 lgb.cv <- function(params = list(),
                    data,

R-package/R/lgb.model.dt.tree.R
@@ -20,7 +20,7 @@
 #'   \item \code{leaf_index}: ID of a leaf in a tree (integer)
 #'   \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
 #'   \item \code{split_gain}: Split gain of a node
-#'   \item \code{threshold}: Spliting threshold value of a node
+#'   \item \code{threshold}: Splitting threshold value of a node
 #'   \item \code{decision_type}: Decision type of a node
 #'   \item \code{default_left}: Determine how to handle NA value, TRUE -> Left, FALSE -> Right
 #'   \item \code{internal_value}: Node value

@@ -47,7 +47,7 @@
 #' }
 #'
 #' @importFrom magrittr %>%
-#' @importFrom data.table := data.table
+#' @importFrom data.table := data.table rbindlist
 #' @importFrom jsonlite fromJSON
 #' @export
 lgb.model.dt.tree <- function(model, num_iteration = NULL) {

@@ -78,6 +78,7 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
 
 }
 
 
+#' @importFrom data.table data.table rbindlist
 single.tree.parse <- function(lgb_tree) {
 

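A hedged sketch of calling the parser whose documentation is fixed above; `model` is a placeholder for a fitted `lgb.Booster`, and the selected columns come from the documented column list:

```r
tree_dt <- lgb.model.dt.tree(model)
# inspect a few of the per-node columns documented above
tree_dt[, c("split_gain", "threshold", "decision_type")]
```
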
R-package/R/lgb.prepare_rules.R
@@ -1,199 +1,200 @@
 #' Data preparator for LightGBM datasets with rules (numeric)
 #'
 #' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
 #'
 #' @param data A data.frame or data.table to prepare.
 #' @param rules A set of rules from the data preparator, if already used.
 #'
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #'
 #' @examples
 #' \dontrun{
 #' library(lightgbm)
 #' data(iris)
 #'
 #' str(iris)
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'
 #' new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
 #' str(new_iris$data)
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
 #'
 #' data(iris) # Erase iris dataset
 #' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
 #' # Warning message:
 #' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
 #' # invalid factor level, NA generated
 #'
 #' # Use conversion using known rules
 #' # Unknown factors become 0, excellent for sparse datasets
 #' newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
 #'
 #' # Unknown factor is now zero, perfect for sparse datasets
 #' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
 #' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
 #' # 1 5.1 3.5 1.4 0.2 0
 #'
 #' newer_iris$data[1, 5] <- 1 # Put back real initial value
 #'
 #' # Is the newly created dataset equal? YES!
 #' all.equal(new_iris$data, newer_iris$data)
 #' # [1] TRUE
 #'
 #' # Can we test our own rules?
 #' data(iris) # Erase iris dataset
 #'
 #' # We remapped values differently
 #' personal_rules <- list(Species = c("setosa" = 3,
 #'                                    "versicolor" = 2,
 #'                                    "virginica" = 1))
 #' newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
 #' str(newest_iris$data) # SUCCESS!
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : num 3 3 3 3 3 3 3 3 3 3 ...
 #'
 #' }
 #'
+#' @importFrom data.table set
 #' @export
 lgb.prepare_rules <- function(data, rules = NULL) {
 
   # data.table not behaving like data.frame
   if (inherits(data, "data.table")) {
 
     # Must use existing rules
     if (!is.null(rules)) {
 
       # Loop through rules
       for (i in names(rules)) {
 
-        set(data, j = i, value = unname(rules[[i]][data[[i]]]))
+        data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
         data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
 
       }
 
     } else {
 
       # Get data classes
       list_classes <- vapply(data, class, character(1))
 
       # Map characters/factors
       is_fix <- which(list_classes %in% c("character", "factor"))
       rules <- list()
 
       # Need to create rules?
       if (length(is_fix) > 0) {
 
         # Go through all characters/factors
         for (i in is_fix) {
 
           # Store column elsewhere
           mini_data <- data[[i]]
 
           # Get unique values
           if (is.factor(mini_data)) {
             mini_unique <- levels(mini_data) # Factor
             mini_numeric <- numeric(length(mini_unique))
             mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
           } else {
             mini_unique <- as.factor(unique(mini_data)) # Character
             mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
           }
 
           # Create rules
           indexed <- colnames(data)[i] # Index value
           rules[[indexed]] <- mini_numeric # Numeric content
           names(rules[[indexed]]) <- mini_unique # Character equivalent
 
           # Apply to real data column
-          set(data, j = i, value = unname(rules[[indexed]][mini_data]))
+          data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
 
         }
 
       }
 
     }
 
   } else {
 
     # Must use existing rules
     if (!is.null(rules)) {
 
       # Loop through rules
       for (i in names(rules)) {
 
         data[[i]] <- unname(rules[[i]][data[[i]]])
         data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
 
       }
 
     } else {
 
       # Default routine (data.frame)
       if (inherits(data, "data.frame")) {
 
         # Get data classes
         list_classes <- vapply(data, class, character(1))
 
         # Map characters/factors
         is_fix <- which(list_classes %in% c("character", "factor"))
         rules <- list()
 
         # Need to create rules?
         if (length(is_fix) > 0) {
 
           # Go through all characters/factors
           for (i in is_fix) {
 
             # Store column elsewhere
             mini_data <- data[[i]]
 
             # Get unique values
             if (is.factor(mini_data)) {
               mini_unique <- levels(mini_data) # Factor
               mini_numeric <- numeric(length(mini_unique))
               mini_numeric[seq_along(mini_unique)] <- seq_along(mini_unique) # Respect ordinal if needed
             } else {
               mini_unique <- as.factor(unique(mini_data)) # Character
               mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
             }
 
             # Create rules
             indexed <- colnames(data)[i] # Index value
             rules[[indexed]] <- mini_numeric # Numeric content
             names(rules[[indexed]]) <- mini_unique # Character equivalent
 
             # Apply to real data column
             data[[i]] <- unname(rules[[indexed]][mini_data])
 
           }
 
         }
 
       } else {
 
         # What do you think you are doing here? Throw error.
         stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
 
       }
 
     }
 
   }
 
   return(list(data = data, rules = rules))
 
 }

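The only functional change in this file is the namespace-qualified `data.table::set()`, which writes a column by reference. A small self-contained sketch of the same call pattern used above:

```r
library(data.table)

dt <- data.table::data.table(Species = c("setosa", "virginica", "setosa"))
rules <- list(Species = c("setosa" = 1, "virginica" = 3))

# Replace the character column with its numeric mapping, by reference,
# exactly as in the loop above.
data.table::set(dt, j = "Species", value = unname(rules[["Species"]][dt[["Species"]]]))
dt$Species  # 1 3 1
```
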
R-package/R/lgb.prepare_rules2.R
@@ -1,197 +1,198 @@
 #' Data preparator for LightGBM datasets with rules (integer)
 #'
 #' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
 #'
 #' @param data A data.frame or data.table to prepare.
 #' @param rules A set of rules from the data preparator, if already used.
 #'
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #'
 #' @examples
 #' \dontrun{
 #' library(lightgbm)
 #' data(iris)
 #'
 #' str(iris)
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'
 #' new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
 #' str(new_iris$data)
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
 #'
 #' data(iris) # Erase iris dataset
 #' iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
 #' # Warning message:
 #' # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L, :
 #' # invalid factor level, NA generated
 #'
 #' # Use conversion using known rules
 #' # Unknown factors become 0, excellent for sparse datasets
 #' newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
 #'
 #' # Unknown factor is now zero, perfect for sparse datasets
 #' newer_iris$data[1, ] # Species became 0 as it is an unknown factor
 #' # Sepal.Length Sepal.Width Petal.Length Petal.Width Species
 #' # 1 5.1 3.5 1.4 0.2 0
 #'
 #' newer_iris$data[1, 5] <- 1 # Put back real initial value
 #'
 #' # Is the newly created dataset equal? YES!
 #' all.equal(new_iris$data, newer_iris$data)
 #' # [1] TRUE
 #'
 #' # Can we test our own rules?
 #' data(iris) # Erase iris dataset
 #'
 #' # We remapped values differently
 #' personal_rules <- list(Species = c("setosa" = 3L,
 #'                                    "versicolor" = 2L,
 #'                                    "virginica" = 1L))
 #' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
 #' str(newest_iris$data) # SUCCESS!
 #' # 'data.frame': 150 obs. of 5 variables:
 #' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #' # $ Species : int 3 3 3 3 3 3 3 3 3 3 ...
 #'
 #' }
 #'
+#' @importFrom data.table set
 #' @export
 lgb.prepare_rules2 <- function(data, rules = NULL) {
 
   # data.table not behaving like data.frame
   if (inherits(data, "data.table")) {
 
     # Must use existing rules
     if (!is.null(rules)) {
 
       # Loop through rules
       for (i in names(rules)) {
 
-        set(data, j = i, value = unname(rules[[i]][data[[i]]]))
+        data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
         data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
 
       }
 
     } else {
 
       # Get data classes
       list_classes <- vapply(data, class, character(1))
 
       # Map characters/factors
       is_fix <- which(list_classes %in% c("character", "factor"))
       rules <- list()
 
       # Need to create rules?
       if (length(is_fix) > 0) {
 
         # Go through all characters/factors
         for (i in is_fix) {
 
           # Store column elsewhere
           mini_data <- data[[i]]
 
           # Get unique values
           if (is.factor(mini_data)) {
             mini_unique <- levels(mini_data) # Factor
             mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
           } else {
             mini_unique <- as.factor(unique(mini_data)) # Character
             mini_numeric <- as.integer(mini_unique) # No respect of ordinality
           }
 
           # Create rules
           indexed <- colnames(data)[i] # Index value
           rules[[indexed]] <- mini_numeric # Numeric content
           names(rules[[indexed]]) <- mini_unique # Character equivalent
 
           # Apply to real data column
-          set(data, j = i, value = unname(rules[[indexed]][mini_data]))
+          data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
 
         }
 
       }
 
     }
 
   } else {
 
     # Must use existing rules
     if (!is.null(rules)) {
 
       # Loop through rules
       for (i in names(rules)) {
 
         data[[i]] <- unname(rules[[i]][data[[i]]])
         data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
 
       }
 
     } else {
 
       # Default routine (data.frame)
       if (inherits(data, "data.frame")) {
 
         # Get data classes
         list_classes <- vapply(data, class, character(1))
 
         # Map characters/factors
         is_fix <- which(list_classes %in% c("character", "factor"))
         rules <- list()
 
         # Need to create rules?
         if (length(is_fix) > 0) {
 
           # Go through all characters/factors
           for (i in is_fix) {
 
             # Store column elsewhere
             mini_data <- data[[i]]
 
             # Get unique values
             if (is.factor(mini_data)) {
               mini_unique <- levels(mini_data) # Factor
               mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
             } else {
               mini_unique <- as.factor(unique(mini_data)) # Character
               mini_numeric <- as.integer(mini_unique) # No respect of ordinality
             }
 
             # Create rules
             indexed <- colnames(data)[i] # Index value
             rules[[indexed]] <- mini_numeric # Numeric content
             names(rules[[indexed]]) <- mini_unique # Character equivalent
 
             # Apply to real data column
             data[[i]] <- unname(rules[[indexed]][mini_data])
 
           }
 
         }
 
       } else {
 
         # What do you think you are doing here? Throw error.
         stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
 
       }
 
     }
 
   }
 
   return(list(data = data, rules = rules))
 
 }

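`lgb.prepare_rules2()` differs from the numeric version only in storage type: mappings are built with `seq_along()`/`as.integer()` and NAs are overwritten with `0L`. A short sketch of the integer rules contract (note the `L` suffix, as in the examples above):

```r
rules2 <- list(Species = c("setosa" = 1L, "virginica" = 3L))
storage.mode(rules2$Species)  # "integer"
```
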
R-package/R/lgb.train.R
@@ -1,39 +1,28 @@
 #' @title Main training logic for LightGBM
 #' @name lgb.train
-#' @param params List of parameters
-#' @param data a \code{lgb.Dataset} object, used for training
-#' @param nrounds number of training rounds
+#' @description Logic to train with LightGBM
+#' @inheritParams lgb_shared_params
 #' @param valids a list of \code{lgb.Dataset} objects, used for validation
 #' @param obj objective function, can be character or custom objective function. Examples include
 #'            \code{regression}, \code{regression_l1}, \code{huber},
 #'            \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
-#' @param boosting boosting type. \code{gbdt}, \code{dart}
-#' @param num_leaves number of leaves in one tree. defaults to 127
-#' @param max_depth Limit the max depth for tree model. This is used to deal with overfit when #data is small.
-#'                  Tree still grow by leaf-wise.
-#' @param num_threads Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).
 #' @param eval evaluation function, can be (a list of) character or custom eval function
-#' @param verbose verbosity for output, if <= 0, also will disable the print of evalutaion during training
 #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
-#' @param eval_freq evalutaion output frequency, only effect when verbose > 0
-#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
 #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
 #' @param categorical_feature list of str or int
 #'                            type int represents index,
 #'                            type str represents feature names
-#' @param early_stopping_rounds int
-#'                              Activates early stopping.
-#'                              The model will train until the validation score stops improving.
-#'                              Validation score needs to improve at least every early_stopping_rounds round(s) to continue training.
-#'                              Requires at least one validation data and one metric.
-#'                              If there's more than one, will check all of them. But the training data is ignored anyway.
-#'                              Returns the model with (best_iter + early_stopping_rounds).
-#'                              If early stopping occurs, the model will have 'best_iter' field
 #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
-#' @param callbacks list of callback functions
-#'                  List of callback functions that are applied at each iteration.
-#' @param ... other parameters, see Parameters.rst for more information
-#'
+#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
+#'            \itemize{
+#'                \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
+#'                \item{num_leaves}{number of leaves in one tree. defaults to 127}
+#'                \item{max_depth}{Limit the max depth for tree model. This is used to deal with
+#'                                 overfit when #data is small. Tree still grow by leaf-wise.}
+#'                \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
+#'                                   the number of real CPU cores, not the number of threads (most
+#'                                   CPU using hyper-threading to generate 2 threads per CPU core).}
+#'            }
 #' @return a trained booster model \code{lgb.Booster}.
 #'
 #' @examples

@@ -56,8 +45,6 @@
 #'                    early_stopping_rounds = 10)
 #' }
 #'
-#' @rdname lgb.train
-#'
 #' @export
 lgb.train <- function(params = list(),
                       data,

R-package/R/lightgbm.R
@@ -1,7 +1,51 @@
-#' Simple interface for training an lightgbm model.
-#' Its documentation is combined with lgb.train.
-#'
-#' @rdname lgb.train
+
+#' @name lgb_shared_params
+#' @title Shared parameter docs
+#' @description Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
+#' @param callbacks list of callback functions
+#'        List of callback functions that are applied at each iteration.
+#' @param data a \code{lgb.Dataset} object, used for training
+#' @param early_stopping_rounds int
+#'        Activates early stopping.
+#'        Requires at least one validation data and one metric
+#'        If there's more than one, will check all of them except the training data
+#'        Returns the model with (best_iter + early_stopping_rounds)
+#'        If early stopping occurs, the model will have 'best_iter' field
+#' @param eval_freq evaluation output frequency, only effect when verbose > 0
+#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
+#' @param nrounds number of training rounds
+#' @param params List of parameters
+#' @param verbose verbosity for output, if <= 0, also will disable the print of evaluation during training
+NULL
+
+
+#' @title Train a LightGBM model
+#' @name lightgbm
+#' @description Simple interface for training an LightGBM model.
+#' @inheritParams lgb_shared_params
+#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
+#' @param weight vector of response values. If not NULL, will set to dataset
+#' @param save_name File name to use when writing the trained model to disk. Should end in ".model".
+#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
+#'     \itemize{
+#'        \item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
+#'        \item{obj}{objective function, can be character or custom objective function. Examples include
+#'                   \code{regression}, \code{regression_l1}, \code{huber},
+#'                   \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
+#'        \item{eval}{evaluation function, can be (a list of) character or custom eval function}
+#'        \item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}
+#'        \item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}
+#'        \item{categorical_feature}{list of str or int. type int represents index, type str represents feature names}
+#'        \item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
+#'                          into a predictor model which frees up memory and the original datasets}
+#'        \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
+#'        \item{num_leaves}{number of leaves in one tree. defaults to 127}
+#'        \item{max_depth}{Limit the max depth for tree model. This is used to deal with
+#'                         overfit when #data is small. Tree still grow by leaf-wise.}
+#'        \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
+#'                           the number of real CPU cores, not the number of threads (most
+#'                           CPU using hyper-threading to generate 2 threads per CPU core).}
+#'     }
 #' @export
 lightgbm <- function(data,
                      label = NULL,

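A hedged usage sketch for the simple interface documented above, using the bundled agaricus data as in the package's own examples; it assumes `objective` is forwarded through `...` into the training parameters:

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")

model <- lightgbm(data = agaricus.train$data,
                  label = agaricus.train$label,
                  nrounds = 10,
                  objective = "binary")  # assumed to pass through ... to lgb.train
```
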
@@ -122,7 +166,7 @@ NULL
 # Various imports
 #' @import methods
 #' @importFrom R6 R6Class
-#' @useDynLib lib_lightgbm
+#' @useDynLib lib_lightgbm , .registration = TRUE
 NULL
 
 # Suppress false positive warnings from R CMD CHECK about

R-package/R/readRDS.lgb.Booster.R
@@ -1,6 +1,6 @@
 #' readRDS for lgb.Booster models
 #'
-#' Attemps to load a model using RDS.
+#' Attempts to load a model using RDS.
 #'
 #' @param file a connection or the name of the file where the R object is saved to or read from.
 #' @param refhook a hook function for handling reference objects.

R-package/R/saveRDS.lgb.Booster.R
@@ -1,6 +1,6 @@
 #' saveRDS for lgb.Booster models
 #'
-#' Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
+#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
 #'
 #' @param object R object to serialize.
 #' @param file a connection or the name of the file where the R object is saved to or read from.

R-package/README.md
@@ -22,30 +22,28 @@ For users who wants to install online with GPU or want to choose a specific comp
 
 **Warning for Windows users**: it is recommended to use *Visual Studio* for its better multi-threading efficiency in Windows for many core systems. For very simple systems (dual core computers or worse), MinGW64 is recommended for maximum performance. If you do not know what to choose, it is recommended to use [Visual Studio](https://visualstudio.microsoft.com/downloads/), the default compiler. **Do not try using MinGW in Windows on many core systems. It may result in 10x slower results than Visual Studio.**
 
-#### macOS Preparation
+#### Mac OS Preparation
 
 You can perform installation either with **Apple Clang** or **gcc**. In case you prefer **Apple Clang**, you should install **OpenMP** (details for installation can be found in [Installation Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#apple-clang)) first and **CMake** version 3.12 or higher is required. In case you prefer **gcc**, you need to install it (details for installation can be found in [Installation Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#gcc)) and specify compilers by running ``export CXX=g++-7 CC=gcc-7`` (replace "7" with version of **gcc** installed on your machine) first.
 
-### Install
-
-Install LightGBM R-package with the following command:
-
-```sh
-git clone --recursive https://github.com/Microsoft/LightGBM
-cd LightGBM/R-package
-# export CXX=g++-7 CC=gcc-7 # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine)
-R CMD INSTALL --build . --no-multiarch
-```
-
-Or build a self-contained R-package which can be installed afterwards:
-
-```sh
-git clone --recursive https://github.com/Microsoft/LightGBM
-cd LightGBM/R-package
-Rscript build_package.R
-# export CXX=g++-7 CC=gcc-7 # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine)
-R CMD INSTALL lightgbm_2.1.1.tar.gz --no-multiarch
-```
+Mac users may need to set some environment variables to tell R to use `gcc` and `g++`. If you install these from Homebrew, your versions of `g++` and `gcc` are most likely in `/usr/local/bin`, as shown below.
+
+```
+# replace 8 with version of gcc installed on your machine
+export CXX=/usr/local/bin/g++-8 CC=/usr/local/bin/gcc-8
+```
+
+### Install
+
+Build and install R-package with the following commands:
+
+```sh
+git clone --recursive https://github.com/Microsoft/LightGBM
+cd LightGBM
+Rscript build_r.R
+```
 
+The `build_r.R` script builds the package in a temporary directory called `lightgbm_r`. It will destroy and recreate that directory each time you run the script.
 
 Note: for the build with Visual Studio/MSBuild in Windows, you should use the Windows CMD or Powershell.
 

@@ -53,15 +51,7 @@ Windows users may need to run with administrator rights (either R or the command
 
 Set `use_gpu` to `TRUE` in `R-package/src/install.libs.R` to enable the build with GPU support. You will need to install Boost and OpenCL first: details for installation can be found in [Installation-Guide](https://github.com/Microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-gpu-version).
 
-You can also install directly from R using the repository with `devtools`:
-
-```r
-library(devtools)
-options(devtools.install.args = "--no-multiarch") # if you have 64-bit R only, you can skip this
-install_github("Microsoft/LightGBM", subdir = "R-package")
-```
-
-If you are using a precompiled dll/lib locally, you can move the dll/lib into LightGBM root folder, modify `LightGBM/R-package/src/install.libs.R`'s 2nd line (change `use_precompile <- FALSE` to `use_precompile <- TRUE`), and install R-package as usual. **NOTE: If your R version is not smaller than 3.5.0, you should set `DUSE_R35=ON` in CMake options when build precompiled dll/lib**.
+If you are using a precompiled dll/lib locally, you can move the dll/lib into LightGBM root folder, modify `LightGBM/R-package/src/install.libs.R`'s 2nd line (change `use_precompile <- FALSE` to `use_precompile <- TRUE`), and install R-package as usual. **NOTE: If your R version is not smaller than 3.5.0, you should set `DUSE_R35=ON` in cmake options when build precompiled dll/lib**.
 
 When your package installation is done, you can check quickly if your LightGBM R-package is working by running the following:
 

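The quick-check snippet referenced in the last line sits outside this hunk; a hedged sketch consistent with the package's own examples (the new lgb.cv.Rd below uses the same data):

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.train(params, dtrain, 10)
```
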
R-package/build_package.R (deleted)
@@ -1,22 +0,0 @@
-unlink("./src/include", recursive = TRUE)
-unlink("./src/src", recursive = TRUE)
-unlink("./src/compute", recursive = TRUE)
-unlink("./src/build", recursive = TRUE)
-unlink("./src/Release", recursive = TRUE)
-if (!file.copy("./../include", "./src/", overwrite = TRUE, recursive = TRUE)) {
-  stop("Cannot find folder LightGBM/include")
-}
-if (!file.copy("./../src", "./src/", overwrite = TRUE, recursive = TRUE)) {
-  stop("Cannot find folder LightGBM/src")
-}
-if (!file.copy("./../compute", "./src/", overwrite = TRUE, recursive = TRUE)) {
-  print("Cannot find folder LightGBM/compute, will disable GPU build")
-}
-if (!file.copy("./../CMakeLists.txt", "./src/", overwrite = TRUE, recursive = TRUE)) {
-  stop("Cannot find file LightGBM/CMakeLists.txt")
-}
-if (!file.exists("./src/_IS_FULL_PACKAGE")) {
-  file.create("./src/_IS_FULL_PACKAGE")
-}
-system("R CMD build --no-build-vignettes .")
-file.remove("./src/_IS_FULL_PACKAGE")

R-package/demo/00Index
@@ -1,10 +1,11 @@
 basic_walkthrough            Basic feature walkthrough
 boost_from_prediction        Boosting from existing prediction
-categorical_feature_prepare  Categorical Feature Preparation
-categorical_feature_rules    Categorical Feature Preparation with Rules
+categorical_features_prepare Categorical Feature Preparation
+categorical_features_rules   Categorical Feature Preparation with Rules
 cross_validation             Cross Validation
 early_stopping               Early Stop in training
 efficient_many_training      Efficiency for Many Model Trainings
 multiclass                   Multiclass training/prediction
+multiclass_custom_objective  Multiclass with Custom Objective Function
 leaf_stability               Leaf (in)Stability example
 weight_param                 Weight-Parameter adjustment relationship

@@ -0,0 +1,97 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.cv.R
\name{lgb.cv}
\alias{lgb.cv}
\title{Main CV logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
  weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
  eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
  init_model = NULL, colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(), ...)
}
\arguments{
\item{params}{List of parameters}

\item{data}{a \code{lgb.Dataset} object, used for training}

\item{nrounds}{number of training rounds}

\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal-size subsamples}

\item{label}{vector of response values. Should be provided only when \code{data} is an R matrix}

\item{weight}{vector of weights. If not NULL, it will be set on the dataset}

\item{obj}{objective function, can be a character string or a custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}}

\item{eval}{evaluation function, can be a (list of) character string(s) or a custom eval function}

\item{verbose}{verbosity for output; if <= 0, printing of evaluation results during training is also disabled}

\item{record}{Boolean; TRUE will record iteration messages to \code{booster$record_evals}}

\item{eval_freq}{evaluation output frequency; only has an effect when verbose > 0}

\item{showsd}{\code{boolean}, whether to show the standard deviation of cross validation}

\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
by the values of the outcome labels}

\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
(each element must be a vector of the test fold's indices). When folds are supplied,
the \code{nfold} and \code{stratified} parameters are ignored.}

\item{init_model}{path of a model file or a \code{lgb.Booster} object; training will continue from this model}

\item{colnames}{feature names; if not NULL, will be used to overwrite the names in the dataset}

\item{categorical_feature}{list of str or int;
type int represents index,
type str represents feature names}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation dataset and one metric.
If there is more than one, all of them will be checked except the training data.
Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have a 'best_iter' field}

\item{callbacks}{list of callback functions
that are applied at each iteration}

\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
  \itemize{
    \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
    \item{num_leaves}{number of leaves in one tree. Defaults to 127.}
    \item{max_depth}{Limit the max depth of the tree model. This is used to deal with
                     overfitting when #data is small. The tree still grows leaf-wise.}
    \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
                       the number of real CPU cores, not the number of threads (most
                       CPUs use hyper-threading to generate 2 threads per CPU core).}
  }}
}
\value{
a trained model \code{lgb.CVBooster}.
}
\description{
Cross validation logic used by LightGBM
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
                dtrain,
                10,
                nfold = 5,
                min_data = 1,
                learning_rate = 1,
                early_stopping_rounds = 10)
}
}
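As a supplement to the generated example above, a hedged sketch (not from the commit) of the `folds` argument, which per the docs overrides `nfold` and `stratified` when supplied:

# Illustrative only: 5 pre-defined CV folds, each element a vector of
# test-fold row indices.
library(lightgbm)
data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
n <- nrow(agaricus.train$data)
folds <- split(seq_len(n), rep(1:5, length.out = n))
cv_model <- lgb.cv(
  params = list(objective = "binary", metric = "auc")
  , data = dtrain
  , folds = folds
)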
@@ -9,7 +9,7 @@ lgb.model.dt.tree(model, num_iteration = NULL)
\arguments{
\item{model}{object of class \code{lgb.Booster}}

\item{num_iteration}{number of iterations you want to predict with. NULL or
\item{num_iteration}{number of iterations you want to predict with. NULL or
<= 0 means use best iteration}
}
\value{

@@ -26,7 +26,7 @@ The columns of the \code{data.table} are:
\item \code{leaf_index}: ID of a leaf in a tree (integer)
\item \code{leaf_parent}: ID of the parent node for current leaf (integer)
\item \code{split_gain}: Split gain of a node
\item \code{threshold}: Spliting threshold value of a node
\item \code{threshold}: Splitting threshold value of a node
\item \code{decision_type}: Decision type of a node
\item \code{default_left}: Determine how to handle NA value, TRUE -> Left, FALSE -> Right
\item \code{internal_value}: Node value
@@ -1,122 +1,35 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.cv.R, R/lgb.train.R, R/lightgbm.R
\name{lgb.cv}
\alias{lgb.cv}
% Please edit documentation in R/lgb.train.R
\name{lgb.train}
\alias{lgb.train}
\alias{lightgbm}
\title{Main CV logic for LightGBM}
\title{Main training logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
  weight = NULL, obj = NULL, eval = NULL, verbose = 1, record = TRUE,
  eval_freq = 1L, showsd = TRUE, stratified = TRUE, folds = NULL,
  init_model = NULL, colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(), ...)

lgb.train(params = list(), data, nrounds = 10, valids = list(),
  obj = NULL, eval = NULL, verbose = 1, record = TRUE, eval_freq = 1L,
  init_model = NULL, colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(), reset_data = FALSE,
  ...)

lightgbm(data, label = NULL, weight = NULL, params = list(),
  nrounds = 10, verbose = 1, eval_freq = 1L,
  early_stopping_rounds = NULL, save_name = "lightgbm.model",
  init_model = NULL, callbacks = list(), ...)
}
\arguments{
\item{params}{List of parameters}

\item{data}{a \code{lgb.Dataset} object, used for CV}

\item{nrounds}{number of CV rounds}

\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}

\item{label}{vector of response values. Should be provided only when data is an R-matrix.}

\item{weight}{vector of response values. If not NULL, will set to dataset}

\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}

\item{eval}{evaluation function, can be (list of) character or custom eval function}

\item{verbose}{verbosity for output, if <= 0, also will disable the print of evalutaion during training}

\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}

\item{eval_freq}{evalutaion output frequence, only effect when verbose > 0}

\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}

\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
by the values of outcome labels.}

\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
(each element must be a vector of test fold's indices). When folds are supplied,
the \code{nfold} and \code{stratified} parameters are ignored.}

\item{init_model}{path of model file of \code{lgb.Booster} object, will continue train from this model}

\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset}

\item{categorical_feature}{list of str or int
type int represents index,
type str represents feature names}

\item{early_stopping_rounds}{int
Activates early stopping.
CV score needs to improve at least every early_stopping_rounds round(s) to continue.
Requires at least one metric.
If there's more than one, will check all of them.
Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field}

\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}

\item{...}{other parameters, see Parameters.rst for more informations}

\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}

\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}

\item{boosting}{boosting type. \code{gbdt}, \code{dart}}

\item{num_leaves}{number of leaves in one tree. defaults to 127}

\item{max_depth}{Limit the max depth for tree model. This is used to deal with overfit when #data is small.
Tree still grow by leaf-wise.}

\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).}

\item{params}{List of parameters}

\item{data}{a \code{lgb.Dataset} object, used for training}

\item{nrounds}{number of training rounds}

\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}

\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}

\item{boosting}{boosting type. \code{gbdt}, \code{dart}}

\item{num_leaves}{number of leaves in one tree. defaults to 127}

\item{max_depth}{Limit the max depth for tree model. This is used to deal with overfit when #data is small.
Tree still grow by leaf-wise.}

\item{num_threads}{Number of threads for LightGBM. For the best speed, set this to the number of real CPU cores, not the number of threads (most CPU using hyper-threading to generate 2 threads per CPU core).}

\item{eval}{evaluation function, can be (a list of) character or custom eval function}

\item{verbose}{verbosity for output, if <= 0, also will disable the print of evalutaion during training}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}

\item{record}{Boolean, TRUE will record iteration message to \code{booster$record_evals}}

\item{eval_freq}{evalutaion output frequency, only effect when verbose > 0}
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}

\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}

@@ -128,26 +41,32 @@ type str represents feature names}

\item{early_stopping_rounds}{int
Activates early stopping.
The model will train until the validation score stops improving.
Validation score needs to improve at least every early_stopping_rounds round(s) to continue training.
Requires at least one validation data and one metric.
If there's more than one, will check all of them. But the training data is ignored anyway.
Returns the model with (best_iter + early_stopping_rounds).
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}

\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}

\item{...}{other parameters, see Parameters.rst for more informations}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}

\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
  \itemize{
    \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
    \item{num_leaves}{number of leaves in one tree. defaults to 127}
    \item{max_depth}{Limit the max depth for tree model. This is used to deal with
                     overfit when #data is small. Tree still grow by leaf-wise.}
    \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
                       the number of real CPU cores, not the number of threads (most
                       CPU using hyper-threading to generate 2 threads per CPU core).}
  }}
}
\value{
a trained model \code{lgb.CVBooster}.
a trained booster model \code{lgb.Booster}.
}
\description{
Simple interface for training an lightgbm model.
Its documentation is combined with lgb.train.
Logic to train with LightGBM
}
\examples{
\dontrun{

@@ -155,20 +74,6 @@ library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
                dtrain,
                10,
                nfold = 5,
                min_data = 1,
                learning_rate = 1,
                early_stopping_rounds = 10)
}
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
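To round out the truncated example above, a hedged sketch (assuming the agaricus data shipped with the package, and the dtrain/dtest objects just created) of lgb.train() with a validation set and early stopping:

# Illustrative only: train with a named validation set so early stopping
# can monitor it; training stops if the l2 metric fails to improve for
# 10 consecutive rounds.
model <- lgb.train(
  params = list(objective = "regression", metric = "l2")
  , data = dtrain
  , nrounds = 100
  , valids = list(test = dtest)
  , early_stopping_rounds = 10
)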
@@ -0,0 +1,31 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lightgbm.R
\name{lgb_shared_params}
\alias{lgb_shared_params}
\title{Shared parameter docs}
\arguments{
\item{callbacks}{list of callback functions
that are applied at each iteration}

\item{data}{a \code{lgb.Dataset} object, used for training}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation dataset and one metric.
If there is more than one, all of them will be checked except the training data.
Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have a 'best_iter' field}

\item{eval_freq}{evaluation output frequency; only has an effect when verbose > 0}

\item{init_model}{path of a model file or a \code{lgb.Booster} object; training will continue from this model}

\item{nrounds}{number of training rounds}

\item{params}{List of parameters}

\item{verbose}{verbosity for output; if <= 0, printing of evaluation results during training is also disabled}
}
\description{
Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
}
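A hedged sketch of how a shared-docs page like this is typically produced and consumed with roxygen2 (an assumption; the commit shows only the generated Rd). A documented NULL defines the parameters once, and other functions inherit them; my_train_fn below is a hypothetical name for illustration:

#' Shared parameter docs
#' @param nrounds number of training rounds
#' @param verbose verbosity for output
#' @name lgb_shared_params
NULL

#' Train a model
#' @inheritParams lgb_shared_params
my_train_fn <- function(data, nrounds = 10, verbose = 1) {
  # body elided; only the documentation pattern matters here
}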
@@ -0,0 +1,64 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lightgbm.R
\name{lightgbm}
\alias{lightgbm}
\title{Train a LightGBM model}
\usage{
lightgbm(data, label = NULL, weight = NULL, params = list(),
  nrounds = 10, verbose = 1, eval_freq = 1L,
  early_stopping_rounds = NULL, save_name = "lightgbm.model",
  init_model = NULL, callbacks = list(), ...)
}
\arguments{
\item{data}{a \code{lgb.Dataset} object, used for training}

\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}}

\item{weight}{vector of weights. If not NULL, it will be set on the dataset}

\item{params}{List of parameters}

\item{nrounds}{number of training rounds}

\item{verbose}{verbosity for output; if <= 0, printing of evaluation results during training is also disabled}

\item{eval_freq}{evaluation output frequency; only has an effect when verbose > 0}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation dataset and one metric.
If there is more than one, all of them will be checked except the training data.
Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have a 'best_iter' field}

\item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".}

\item{init_model}{path of a model file or a \code{lgb.Booster} object; training will continue from this model}

\item{callbacks}{list of callback functions
that are applied at each iteration}

\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
  \itemize{
    \item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
    \item{obj}{objective function, can be a character string or a custom objective function. Examples include
               \code{regression}, \code{regression_l1}, \code{huber},
               \code{binary}, \code{lambdarank}, \code{multiclass}}
    \item{eval}{evaluation function, can be a (list of) character string(s) or a custom eval function}
    \item{record}{Boolean; TRUE will record iteration messages to \code{booster$record_evals}}
    \item{colnames}{feature names; if not NULL, will be used to overwrite the names in the dataset}
    \item{categorical_feature}{list of str or int; type int represents index, type str represents feature names}
    \item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
                      into a predictor model which frees up memory and the original datasets}
    \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
    \item{num_leaves}{number of leaves in one tree. Defaults to 127.}
    \item{max_depth}{Limit the max depth of the tree model. This is used to deal with
                     overfitting when #data is small. The tree still grows leaf-wise.}
    \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
                       the number of real CPU cores, not the number of threads (most
                       CPUs use hyper-threading to generate 2 threads per CPU core).}
  }}
}
\description{
Simple interface for training a LightGBM model.
}
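A hedged usage sketch of the simple interface documented above: per the `label` argument, a raw feature matrix can be passed directly and the lgb.Dataset is built internally.

# Illustrative only: matrix-plus-label shortcut instead of an explicit
# lgb.Dataset.
library(lightgbm)
data(agaricus.train, package = "lightgbm")
bst <- lightgbm(
  data = agaricus.train$data
  , label = agaricus.train$label
  , params = list(objective = "binary")
  , nrounds = 10
)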
@@ -5,8 +5,8 @@
\title{Predict method for LightGBM model}
\usage{
\method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
  rawscore = FALSE, predleaf = FALSE, header = FALSE, reshape = FALSE,
  ...)
  rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE,
  header = FALSE, reshape = FALSE, ...)
}
\arguments{
\item{object}{Object of class \code{lgb.Booster}}

@@ -21,6 +21,8 @@ logistic regression would result in predictions for log-odds instead of probabilities.
\item{predleaf}{whether predict leaf index instead.}

\item{predcontrib}{return per-feature contributions for each record.}

\item{header}{only used for prediction for text file. True if text file has header}

\item{reshape}{whether to reshape the vector of predictions to a matrix form when there are several
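A hedged sketch of the newly documented `predcontrib` flag (assuming `bst` and the agaricus data from the earlier sketch):

# Illustrative only: per-feature contribution values for each record,
# instead of ordinary predictions.
contribs <- predict(bst, agaricus.train$data, predcontrib = TRUE)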
@@ -15,7 +15,7 @@ readRDS.lgb.Booster(file = "", refhook = NULL)
lgb.Booster.
}
\description{
Attemps to load a model using RDS.
Attempts to load a model using RDS.
}
\examples{
\dontrun{
@@ -26,7 +26,7 @@ saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
NULL invisibly.
}
\description{
Attemps to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
}
\examples{
\dontrun{
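A hedged round-trip sketch of the two RDS helpers documented above (assuming `bst` from the earlier sketch):

# Illustrative only: save a booster with the package's RDS wrapper,
# then load it back.
saveRDS.lgb.Booster(bst, file = "lightgbm.rds")
bst2 <- readRDS.lgb.Booster(file = "lightgbm.rds")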
@@ -21,7 +21,7 @@ constructed sub dataset
}
\description{
Get a new \code{lgb.Dataset} containing the specified rows of
orginal lgb.Dataset object
original lgb.Dataset object
}
\examples{
\dontrun{
@@ -0,0 +1,12 @@
// Register Dynamic Symbols

#include <R.h>
#include <Rinternals.h>
#include <R_ext/Rdynload.h>
#include "R_init.h"

void R_init_lightgbm(DllInfo* info) {
  R_registerRoutines(info, NULL, NULL, NULL, NULL);
  R_useDynamicSymbols(info, TRUE);
}
@@ -0,0 +1,7 @@
// Register Dynamic Symbols
#ifndef R_INIT_LIGHTGBM_H
#define R_INIT_LIGHTGBM_H

void R_init_lightgbm(DllInfo* info);

#endif // R_INIT_LIGHTGBM_H
@@ -14,6 +14,12 @@ if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
    || R_int_UUID == "2fdf6c18-697a-4ba7-b8ef-11c0d92f1327")){
  print("Warning: unmatched R_INTERNALS_UUID, may cannot run normally.")
}

# Move in CMakeLists.txt
if (!file.copy("../inst/bin/CMakeLists.txt", "CMakeLists.txt", overwrite = TRUE)){
  stop("Copying CMakeLists failed")
}

# Check for precompilation
if (!use_precompile) {

@@ -21,26 +27,6 @@ if (!use_precompile) {
  source_dir <- file.path(R_PACKAGE_SOURCE, "src", fsep = "/")
  setwd(source_dir)

  if (!file.exists("_IS_FULL_PACKAGE")) {
    unlink("./include", recursive = TRUE)
    unlink("./src", recursive = TRUE)
    unlink("./compute", recursive = TRUE)
    unlink("./build", recursive = TRUE)
    if (!file.copy("./../../include", "./", overwrite = TRUE, recursive = TRUE)) {
      stop("Cannot find folder LightGBM/include")
    }
    if (!file.copy("./../../src", "./", overwrite = TRUE, recursive = TRUE)) {
      stop("Cannot find folder LightGBM/src")
    }
    if (!file.copy("./../../compute", "./", overwrite = TRUE, recursive = TRUE)) {
      print("Cannot find folder LightGBM/compute, disabling GPU build.")
      use_gpu <- FALSE
    }
    if (!file.copy("./../../CMakeLists.txt", "./", overwrite = TRUE, recursive = TRUE)) {
      stop("Cannot find file LightGBM/CMakeLists.txt")
    }
  }

  # Prepare building package
  build_dir <- file.path(source_dir, "build", fsep = "/")
  dir.create(build_dir, recursive = TRUE, showWarnings = FALSE)
@@ -62,7 +62,7 @@ test_that("lgb.Dataset: colnames", {

test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
  nr <- 1000
  x <- rsparsematrix(nr, 100, density=0.0005)
  x <- Matrix::rsparsematrix(nr, 100, density=0.0005)
  # we want it very sparse, so that last rows are empty
  expect_lt(max(x@i), nr)
  dtest <- lgb.Dataset(x)
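A hedged standalone version of what this test guards against (the explicit construct call is an assumption about when dataset dimensions become available):

# Illustrative only: trailing all-zero rows of a sparse matrix must not
# shrink the reported row count.
x <- Matrix::rsparsematrix(1000, 100, density = 0.0005)
dtest <- lightgbm::lgb.Dataset(x)
lightgbm::lgb.Dataset.construct(dtest)
stopifnot(nrow(dtest) == 1000)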
@@ -0,0 +1,82 @@

# for macOS (replace 8 with the version of gcc installed on your machine)
# NOTE: your gcc / g++ from Homebrew is probably in /usr/local/bin
# export CXX=/usr/local/bin/g++-8 CC=/usr/local/bin/gcc-8
# Sys.setenv("CXX" = "/usr/local/bin/g++-8")
# Sys.setenv("CC" = "/usr/local/bin/gcc-8")

# R returns FALSE (not a non-zero exit code) if a file copy operation
# breaks. Let's fix that
.handle_result <- function(res){
  if (!res){
    stop("Copying files failed!")
  }
}

# Make a new temporary folder to work in
unlink(x = "lightgbm_r", recursive = TRUE)
dir.create("lightgbm_r")

# copy in the relevant files
result <- file.copy(
  from = "R-package/./"
  , to = "lightgbm_r/"
  , recursive = TRUE
  , overwrite = TRUE
)
.handle_result(result)

result <- file.copy(
  from = "include/"
  , to = file.path("lightgbm_r", "src/")
  , recursive = TRUE
  , overwrite = TRUE
)
.handle_result(result)

result <- file.copy(
  from = "src/"
  , to = file.path("lightgbm_r", "src/")
  , recursive = TRUE
  , overwrite = TRUE
)
.handle_result(result)

result <- file.copy(
  from = "CMakeLists.txt"
  , to = file.path("lightgbm_r", "inst", "bin/")
  , recursive = TRUE
  , overwrite = TRUE
)
.handle_result(result)

# rebuild documentation
devtools::document(
  pkg = "lightgbm_r/"
)

# Build the package
# NOTE: --keep-empty-dirs is necessary to keep the deep paths expected
#       by CMake while also meeting the CRAN req to create object files
#       on demand
devtools::build(
  pkg = "lightgbm_r"
  , args = c("--keep-empty-dirs")
)

# Install the package
version <- gsub(
  "Version: "
  , ""
  , grep(
    "Version: "
    , readLines(con = file.path("lightgbm_r", "DESCRIPTION"))
    , value = TRUE
  )
)
tarball <- file.path(getwd(), sprintf("lightgbm_%s.tar.gz", version))

system(sprintf("R CMD INSTALL %s --no-multi-arch", tarball))

# Run R CMD CHECK
# R CMD CHECK lightgbm_2.1.2.tar.gz --as-cran | tee check.log | cat
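The version-extraction idiom above also works as a standalone helper; a hedged sketch (assuming a DESCRIPTION file with a "Version: x.y.z" line exists in pkg_dir; read_pkg_version is a hypothetical name):

# Illustrative only: read the package version out of DESCRIPTION with the
# same grep/gsub idiom the script uses to name the tarball.
read_pkg_version <- function(pkg_dir) {
  desc_line <- grep(
    "Version: "
    , readLines(con = file.path(pkg_dir, "DESCRIPTION"))
    , value = TRUE
  )
  gsub("Version: ", "", desc_line)
}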
@@ -1096,7 +1096,7 @@ void GPUTreeLearner::FindBestSplits() {
void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
#if GPU_DEBUG >= 2
  printf("Spliting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
  printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
#endif
  SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
  if (Network::num_machines() == 1) {