Mirror of https://github.com/microsoft/LightGBM.git
This commit is contained in:
Parent: b4bb38d926
Commit: fc991c9d7e
@@ -0,0 +1,62 @@
library(lintr)

args <- commandArgs(
trailingOnly = TRUE
)
SOURCE_DIR <- args[[1]]

FILES_TO_LINT <- list.files(
path = SOURCE_DIR
, pattern = "\\.r$"
, all.files = TRUE
, ignore.case = TRUE
, full.names = TRUE
, recursive = TRUE
, include.dirs = FALSE
)

# Some linters from the lintr package have not made it to CRAN yet
# We build lintr from source to address that.
LINTERS_TO_USE <- list(
"closed_curly" = lintr::closed_curly_linter
, "infix_spaces" = lintr::infix_spaces_linter
, "long_lines" = lintr::line_length_linter(length = 120)
, "tabs" = lintr::no_tab_linter
, "open_curly" = lintr::open_curly_linter
, "spaces_inside" = lintr::spaces_inside_linter
, "spaces_left_parens" = lintr::spaces_left_parentheses_linter
, "trailing_blank" = lintr::trailing_blank_lines_linter
, "trailing_white" = lintr::trailing_whitespace_linter
)

cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT)))

results <- c()

for (r_file in FILES_TO_LINT){

this_result <- lintr::lint(
filename = r_file
, linters = LINTERS_TO_USE
, cache = FALSE
)

cat(sprintf(
"Found %i linting errors in %s\n"
, length(this_result)
, r_file
))

results <- c(results, this_result)

}

issues_found <- length(results)

if (issues_found > 0){
cat("\n")
print(results)
}

quit(save = "no", status = issues_found)
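The script above reports each file's results with cat() and then exits with the number of lint issues found, which is what lets the CI step below fail on any non-zero status. A minimal sketch of a local run, assuming the file is saved as .ci/lint_r_code.R at the repository root and that Rscript is available on the PATH (neither assumption is part of the commit itself):

# Illustrative sketch only, not part of the commit: invoke the linting script
# on the current checkout, mirroring the Rscript call in the CI script below.
# system2() returns the exit status, i.e. the number of lint issues found.
status <- system2(
    "Rscript"
    , args = c(".ci/lint_r_code.R", shQuote(getwd()))
)
cat(sprintf("lint issues found: %i\n", status))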
@@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then
fi

if [[ $TASK == "lint" ]]; then
conda install -q -y -n $CONDA_ENV pycodestyle pydocstyle
conda install -q -y -n $CONDA_ENV \
pycodestyle \
pydocstyle \
r-lintr
pip install --user cpplint
echo "Linting Python code"
pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1
pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
echo "Linting C++ code"
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0
exit 0
fi
@@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) {
# since changing them would simply wreck some chaos
not_allowed <- c("num_class", "metric", "boosting_type")
if (any(pnames %in% not_allowed)) {
stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting")
stop(
"Parameters "
, paste0(pnames[pnames %in% not_allowed], collapse = ", ")
, " cannot be changed during boosting"
)
}

# Check parameter names
@@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) {
i <- env$iteration

# Check if iteration matches moduo
if ((i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration ))) {
if ( (i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration))) {

# Merge evaluation string
msg <- merge.eval.string(env)
@@ -244,8 +248,14 @@ cb.record.evaluation <- function() {
name <- eval_res$name

# Store evaluation data
env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err)
env$model$record_evals[[data_name]][[name]]$eval <- c(
env$model$record_evals[[data_name]][[name]]$eval
, eval_res$value
)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(
env$model$record_evals[[data_name]][[name]]$eval_err
, eval_err
)

}

@@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
}

# Extract callback names from the list of callbacks
callback.names <- function(cb_list) { unlist(lapply(cb_list, attr, "name")) }
callback.names <- function(cb_list) {
unlist(lapply(cb_list, attr, "name"))
}

add.cb <- function(cb_list, cb) {

@ -46,7 +46,12 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Store booster handle
|
||||
handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str)
|
||||
handle <- lgb.call(
|
||||
"LGBM_BoosterCreate_R"
|
||||
, ret = handle
|
||||
, train_set$.__enclos_env__$private$get_handle()
|
||||
, params_str
|
||||
)
|
||||
|
||||
# Create private booster information
|
||||
private$train_set <- train_set
|
||||
|
@ -57,10 +62,12 @@ Booster <- R6::R6Class(
|
|||
if (!is.null(private$init_predictor)) {
|
||||
|
||||
# Merge booster
|
||||
lgb.call("LGBM_BoosterMerge_R",
|
||||
ret = NULL,
|
||||
handle,
|
||||
private$init_predictor$.__enclos_env__$private$handle)
|
||||
lgb.call(
|
||||
"LGBM_BoosterMerge_R"
|
||||
, ret = NULL
|
||||
, handle
|
||||
, private$init_predictor$.__enclos_env__$private$handle
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -75,9 +82,11 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Create booster from model
|
||||
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R",
|
||||
ret = handle,
|
||||
lgb.c_str(modelfile))
|
||||
handle <- lgb.call(
|
||||
"LGBM_BoosterCreateFromModelfile_R"
|
||||
, ret = handle
|
||||
, lgb.c_str(modelfile)
|
||||
)
|
||||
|
||||
} else if (!is.null(model_str)) {
|
||||
|
||||
|
@ -87,14 +96,19 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Create booster from model
|
||||
handle <- lgb.call("LGBM_BoosterLoadModelFromString_R",
|
||||
ret = handle,
|
||||
lgb.c_str(model_str))
|
||||
handle <- lgb.call(
|
||||
"LGBM_BoosterLoadModelFromString_R"
|
||||
, ret = handle
|
||||
, lgb.c_str(model_str)
|
||||
)
|
||||
|
||||
} else {
|
||||
|
||||
# Booster non existent
|
||||
stop("lgb.Booster: Need at least either training dataset, model file, or model_str to create booster instance")
|
||||
stop(
|
||||
"lgb.Booster: Need at least either training dataset, "
|
||||
, "model file, or model_str to create booster instance"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -111,9 +125,11 @@ Booster <- R6::R6Class(
|
|||
class(handle) <- "lgb.Booster.handle"
|
||||
private$handle <- handle
|
||||
private$num_class <- 1L
|
||||
private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R",
|
||||
ret = private$num_class,
|
||||
private$handle)
|
||||
private$num_class <- lgb.call(
|
||||
"LGBM_BoosterGetNumClasses_R"
|
||||
, ret = private$num_class
|
||||
, private$handle
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -138,7 +154,10 @@ Booster <- R6::R6Class(
|
|||
|
||||
# Check if predictors are identical
|
||||
if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) {
|
||||
stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data")
|
||||
stop(
|
||||
"lgb.Booster.add_valid: Failed to add validation data; "
|
||||
, "you should use the same predictor for these data"
|
||||
)
|
||||
}
|
||||
|
||||
# Check if names are character
|
||||
|
@ -147,10 +166,12 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Add validation data to booster
|
||||
lgb.call("LGBM_BoosterAddValidData_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
data$.__enclos_env__$private$get_handle())
|
||||
lgb.call(
|
||||
"LGBM_BoosterAddValidData_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, data$.__enclos_env__$private$get_handle()
|
||||
)
|
||||
|
||||
# Store private information
|
||||
private$valid_sets <- c(private$valid_sets, data)
|
||||
|
@ -171,10 +192,12 @@ Booster <- R6::R6Class(
|
|||
params_str <- lgb.params2str(params)
|
||||
|
||||
# Reset parameters
|
||||
lgb.call("LGBM_BoosterResetParameter_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
params_str)
|
||||
lgb.call(
|
||||
"LGBM_BoosterResetParameter_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, params_str
|
||||
)
|
||||
|
||||
# Return self
|
||||
return(invisible(self))
|
||||
|
@ -198,10 +221,12 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Reset training data on booster
|
||||
lgb.call("LGBM_BoosterResetTrainingData_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
train_set$.__enclos_env__$private$get_handle())
|
||||
lgb.call(
|
||||
"LGBM_BoosterResetTrainingData_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, train_set$.__enclos_env__$private$get_handle()
|
||||
)
|
||||
|
||||
# Store private train set
|
||||
private$train_set = train_set
|
||||
|
@ -230,18 +255,20 @@ Booster <- R6::R6Class(
|
|||
gpair <- fobj(private$inner_predict(1), private$train_set)
|
||||
|
||||
# Check for gradient and hessian as list
|
||||
if(is.null(gpair$grad) || is.null(gpair$hess)){
|
||||
if (is.null(gpair$grad) || is.null(gpair$hess)){
|
||||
stop("lgb.Booster.update: custom objective should
|
||||
return a list with attributes (hess, grad)")
|
||||
}
|
||||
|
||||
# Return custom boosting gradient/hessian
|
||||
ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
gpair$grad,
|
||||
gpair$hess,
|
||||
length(gpair$grad))
|
||||
ret <- lgb.call(
|
||||
"LGBM_BoosterUpdateOneIterCustom_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, gpair$grad
|
||||
, gpair$hess
|
||||
, length(gpair$grad)
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -258,9 +285,11 @@ Booster <- R6::R6Class(
|
|||
rollback_one_iter = function() {
|
||||
|
||||
# Return one iteration behind
|
||||
lgb.call("LGBM_BoosterRollbackOneIter_R",
|
||||
ret = NULL,
|
||||
private$handle)
|
||||
lgb.call(
|
||||
"LGBM_BoosterRollbackOneIter_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
)
|
||||
|
||||
# Loop through each iteration
|
||||
for (i in seq_along(private$is_predicted_cur_iter)) {
|
||||
|
@ -276,9 +305,11 @@ Booster <- R6::R6Class(
|
|||
current_iter = function() {
|
||||
|
||||
cur_iter <- 0L
|
||||
lgb.call("LGBM_BoosterGetCurrentIteration_R",
|
||||
ret = cur_iter,
|
||||
private$handle)
|
||||
lgb.call(
|
||||
"LGBM_BoosterGetCurrentIteration_R"
|
||||
, ret = cur_iter
|
||||
, private$handle
|
||||
)
|
||||
|
||||
},
|
||||
|
||||
|
@ -349,7 +380,10 @@ Booster <- R6::R6Class(
|
|||
|
||||
# Loop through each validation set
|
||||
for (i in seq_along(private$valid_sets)) {
|
||||
ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval))
|
||||
ret <- append(
|
||||
x = ret
|
||||
, values = private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)
|
||||
)
|
||||
}
|
||||
|
||||
# Return ret
|
||||
|
@ -366,11 +400,13 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Save booster model
|
||||
lgb.call("LGBM_BoosterSaveModel_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
as.integer(num_iteration),
|
||||
lgb.c_str(filename))
|
||||
lgb.call(
|
||||
"LGBM_BoosterSaveModel_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, as.integer(num_iteration)
|
||||
, lgb.c_str(filename)
|
||||
)
|
||||
|
||||
# Return self
|
||||
return(invisible(self))
|
||||
|
@ -385,9 +421,11 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Return model string
|
||||
return(lgb.call.return.str("LGBM_BoosterSaveModelToString_R",
|
||||
private$handle,
|
||||
as.integer(num_iteration)))
|
||||
return(lgb.call.return.str(
|
||||
"LGBM_BoosterSaveModelToString_R"
|
||||
, private$handle
|
||||
, as.integer(num_iteration)
|
||||
))
|
||||
|
||||
},
|
||||
|
||||
|
@ -400,9 +438,11 @@ Booster <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Return dumped model
|
||||
lgb.call.return.str("LGBM_BoosterDumpModel_R",
|
||||
private$handle,
|
||||
as.integer(num_iteration))
|
||||
lgb.call.return.str(
|
||||
"LGBM_BoosterDumpModel_R"
|
||||
, private$handle
|
||||
, as.integer(num_iteration)
|
||||
)
|
||||
|
||||
},
|
||||
|
||||
|
@ -478,10 +518,12 @@ Booster <- R6::R6Class(
|
|||
|
||||
# Store predictions
|
||||
npred <- 0L
|
||||
npred <- lgb.call("LGBM_BoosterGetNumPredict_R",
|
||||
ret = npred,
|
||||
private$handle,
|
||||
as.integer(idx - 1))
|
||||
npred <- lgb.call(
|
||||
"LGBM_BoosterGetNumPredict_R"
|
||||
, ret = npred
|
||||
, private$handle
|
||||
, as.integer(idx - 1)
|
||||
)
|
||||
private$predict_buffer[[data_name]] <- numeric(npred)
|
||||
|
||||
}
|
||||
|
@ -490,10 +532,12 @@ Booster <- R6::R6Class(
|
|||
if (!private$is_predicted_cur_iter[[idx]]) {
|
||||
|
||||
# Use buffer
|
||||
private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R",
|
||||
ret = private$predict_buffer[[data_name]],
|
||||
private$handle,
|
||||
as.integer(idx - 1))
|
||||
private$predict_buffer[[data_name]] <- lgb.call(
|
||||
"LGBM_BoosterGetPredict_R"
|
||||
, ret = private$predict_buffer[[data_name]]
|
||||
, private$handle
|
||||
, as.integer(idx - 1)
|
||||
)
|
||||
private$is_predicted_cur_iter[[idx]] <- TRUE
|
||||
}
|
||||
|
||||
|
@ -508,8 +552,10 @@ Booster <- R6::R6Class(
|
|||
if (is.null(private$eval_names)) {
|
||||
|
||||
# Get evaluation names
|
||||
names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R",
|
||||
private$handle)
|
||||
names <- lgb.call.return.str(
|
||||
"LGBM_BoosterGetEvalNames_R"
|
||||
, private$handle
|
||||
)
|
||||
|
||||
# Check names' length
|
||||
if (nchar(names) > 0) {
|
||||
|
@ -547,10 +593,12 @@ Booster <- R6::R6Class(
|
|||
|
||||
# Create evaluation values
|
||||
tmp_vals <- numeric(length(private$eval_names))
|
||||
tmp_vals <- lgb.call("LGBM_BoosterGetEval_R",
|
||||
ret = tmp_vals,
|
||||
private$handle,
|
||||
as.integer(data_idx - 1))
|
||||
tmp_vals <- lgb.call(
|
||||
"LGBM_BoosterGetEval_R"
|
||||
, ret = tmp_vals
|
||||
, private$handle
|
||||
, as.integer(data_idx - 1)
|
||||
)
|
||||
|
||||
# Loop through all evaluation names
|
||||
for (i in seq_along(private$eval_names)) {
|
||||
|
@ -587,7 +635,7 @@ Booster <- R6::R6Class(
|
|||
res <- feval(private$inner_predict(data_idx), data)
|
||||
|
||||
# Check for name correctness
|
||||
if(is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
|
||||
if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
|
||||
stop("lgb.Booster.eval: custom eval function should return a
|
||||
list with attribute (name, value, higher_better)");
|
||||
}
|
||||
|
@ -614,13 +662,13 @@ Booster <- R6::R6Class(
|
|||
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
|
||||
#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
|
||||
#' @param rawscore whether the prediction should be returned in the for of original untransformed
|
||||
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
|
||||
#' logistic regression would result in predictions for log-odds instead of probabilities.
|
||||
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
|
||||
#' for logistic regression would result in predictions for log-odds instead of probabilities.
|
||||
#' @param predleaf whether predict leaf index instead.
|
||||
#' @param predcontrib return per-feature contributions for each record.
|
||||
#' @param header only used for prediction for text file. True if text file has header
|
||||
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
|
||||
#' prediction outputs per case.
|
||||
#' prediction outputs per case.
|
||||
#' @param ... Additional named arguments passed to the \code{predict()} method of
|
||||
#' the \code{lgb.Booster} object passed to \code{object}.
|
||||
#' @return
|
||||
|
@ -642,13 +690,15 @@ Booster <- R6::R6Class(
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' preds <- predict(model, test$data)
|
||||
#'
|
||||
#' @rdname predict.lgb.Booster
|
||||
|
@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object,
|
|||
}
|
||||
|
||||
# Return booster predictions
|
||||
object$predict(data,
|
||||
num_iteration,
|
||||
rawscore,
|
||||
predleaf,
|
||||
predcontrib,
|
||||
header,
|
||||
reshape, ...)
|
||||
object$predict(
|
||||
data
|
||||
, num_iteration
|
||||
, rawscore
|
||||
, predleaf
|
||||
, predcontrib
|
||||
, header
|
||||
, reshape
|
||||
, ...
|
||||
)
|
||||
}
|
||||
|
||||
#' Load LightGBM model
|
||||
|
@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object,
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' lgb.save(model, "model.txt")
|
||||
#' load_booster <- lgb.load(filename = "model.txt")
|
||||
#' model_string <- model$save_model_to_string(NULL) # saves best iteration
|
||||
|
@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' lgb.save(model, "model.txt")
|
||||
#'
|
||||
#' @rdname lgb.save
|
||||
|
@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' json_model <- lgb.dump(model)
|
||||
#'
|
||||
#' @rdname lgb.dump
|
||||
|
@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' lgb.get.eval.result(model, "test", "l2")
|
||||
#' @rdname lgb.get.eval.result
|
||||
#' @export
|
||||
|
|
|
@ -97,16 +97,18 @@ Dataset <- R6::R6Class(
|
|||
...) {
|
||||
|
||||
# Create new dataset
|
||||
ret <- Dataset$new(data,
|
||||
private$params,
|
||||
self,
|
||||
private$colnames,
|
||||
private$categorical_feature,
|
||||
private$predictor,
|
||||
private$free_raw_data,
|
||||
NULL,
|
||||
info,
|
||||
...)
|
||||
ret <- Dataset$new(
|
||||
data = data
|
||||
, params = private$params
|
||||
, reference = self
|
||||
, colnames = private$colnames
|
||||
, categorical_feature = private$categorical_feature
|
||||
, predictor = private$predictor
|
||||
, free_raw_data = private$free_raw_data
|
||||
, used_indices = NULL
|
||||
, info = info
|
||||
, ...
|
||||
)
|
||||
|
||||
# Return ret
|
||||
return(invisible(ret))
|
||||
|
@ -142,14 +144,23 @@ Dataset <- R6::R6Class(
|
|||
|
||||
# Provided indices, but some indices are not existing?
|
||||
if (sum(is.na(cate_indices)) > 0) {
|
||||
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
|
||||
stop(
|
||||
"lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
|
||||
, sQuote(private$categorical_feature[is.na(cate_indices)])
|
||||
)
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
# Check if more categorical features were output over the feature space
|
||||
if (max(private$categorical_feature) > length(private$colnames)) {
|
||||
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
|
||||
stop(
|
||||
"lgb.self.get.handle: supplied a too large value in categorical_feature: "
|
||||
, max(private$categorical_feature)
|
||||
, " but only "
|
||||
, length(private$colnames)
|
||||
, " features"
|
||||
)
|
||||
}
|
||||
|
||||
# Store indices as [0, n-1] indexed instead of [1, n] indexed
|
||||
|
@ -165,7 +176,9 @@ Dataset <- R6::R6Class(
|
|||
# Check has header or not
|
||||
has_header <- FALSE
|
||||
if (!is.null(private$params$has_header) || !is.null(private$params$header)) {
|
||||
if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") {
|
||||
params_has_header <- tolower(as.character(private$params$has_header)) == "true"
|
||||
params_header <- tolower(as.character(private$params$header)) == "true"
|
||||
if (params_has_header || params_header) {
|
||||
has_header <- TRUE
|
||||
}
|
||||
}
|
||||
|
@ -186,43 +199,52 @@ Dataset <- R6::R6Class(
|
|||
# Are we using a data file?
|
||||
if (is.character(private$raw_data)) {
|
||||
|
||||
handle <- lgb.call("LGBM_DatasetCreateFromFile_R",
|
||||
ret = handle,
|
||||
lgb.c_str(private$raw_data),
|
||||
params_str,
|
||||
ref_handle)
|
||||
handle <- lgb.call(
|
||||
"LGBM_DatasetCreateFromFile_R"
|
||||
, ret = handle
|
||||
, lgb.c_str(private$raw_data)
|
||||
, params_str
|
||||
, ref_handle
|
||||
)
|
||||
|
||||
} else if (is.matrix(private$raw_data)) {
|
||||
|
||||
# Are we using a matrix?
|
||||
handle <- lgb.call("LGBM_DatasetCreateFromMat_R",
|
||||
ret = handle,
|
||||
private$raw_data,
|
||||
nrow(private$raw_data),
|
||||
ncol(private$raw_data),
|
||||
params_str,
|
||||
ref_handle)
|
||||
handle <- lgb.call(
|
||||
"LGBM_DatasetCreateFromMat_R"
|
||||
, ret = handle
|
||||
, private$raw_data
|
||||
, nrow(private$raw_data)
|
||||
, ncol(private$raw_data)
|
||||
, params_str
|
||||
, ref_handle
|
||||
)
|
||||
|
||||
} else if (methods::is(private$raw_data, "dgCMatrix")) {
|
||||
if (length(private$raw_data@p) > 2147483647) {
|
||||
stop("Cannot support large CSC matrix")
|
||||
}
|
||||
# Are we using a dgCMatrix (sparsed matrix column compressed)
|
||||
handle <- lgb.call("LGBM_DatasetCreateFromCSC_R",
|
||||
ret = handle,
|
||||
private$raw_data@p,
|
||||
private$raw_data@i,
|
||||
private$raw_data@x,
|
||||
length(private$raw_data@p),
|
||||
length(private$raw_data@x),
|
||||
nrow(private$raw_data),
|
||||
params_str,
|
||||
ref_handle)
|
||||
handle <- lgb.call(
|
||||
"LGBM_DatasetCreateFromCSC_R"
|
||||
, ret = handle
|
||||
, private$raw_data@p
|
||||
, private$raw_data@i
|
||||
, private$raw_data@x
|
||||
, length(private$raw_data@p)
|
||||
, length(private$raw_data@x)
|
||||
, nrow(private$raw_data)
|
||||
, params_str
|
||||
, ref_handle
|
||||
)
|
||||
|
||||
} else {
|
||||
|
||||
# Unknown data type
|
||||
stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data)))
|
||||
stop(
|
||||
"lgb.Dataset.construct: does not support constructing from "
|
||||
, sQuote(class(private$raw_data))
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -234,12 +256,14 @@ Dataset <- R6::R6Class(
|
|||
}
|
||||
|
||||
# Construct subset
|
||||
handle <- lgb.call("LGBM_DatasetGetSubset_R",
|
||||
ret = handle,
|
||||
ref_handle,
|
||||
c(private$used_indices), # Adding c() fixes issue in R v3.5
|
||||
length(private$used_indices),
|
||||
params_str)
|
||||
handle <- lgb.call(
|
||||
"LGBM_DatasetGetSubset_R"
|
||||
, ret = handle
|
||||
, ref_handle
|
||||
, c(private$used_indices) # Adding c() fixes issue in R v3.5
|
||||
, length(private$used_indices)
|
||||
, params_str
|
||||
)
|
||||
|
||||
}
|
||||
if (lgb.is.null.handle(handle)) {
|
||||
|
@ -258,7 +282,11 @@ Dataset <- R6::R6Class(
|
|||
if (!is.null(private$predictor) && is.null(private$used_indices)) {
|
||||
|
||||
# Setup initial scores
|
||||
init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE)
|
||||
init_score <- private$predictor$predict(
|
||||
private$raw_data
|
||||
, rawscore = TRUE
|
||||
, reshape = TRUE
|
||||
)
|
||||
|
||||
# Not needed to transpose, for is col_marjor
|
||||
init_score <- as.vector(init_score)
|
||||
|
@ -316,7 +344,10 @@ Dataset <- R6::R6Class(
|
|||
} else {
|
||||
|
||||
# Trying to work with unknown dimensions is not possible
|
||||
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
|
||||
stop(
|
||||
"dim: cannot get dimensions before dataset has been constructed, "
|
||||
, "please call lgb.Dataset.construct explicitly"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -341,7 +372,10 @@ Dataset <- R6::R6Class(
|
|||
} else {
|
||||
|
||||
# Trying to work with unknown dimensions is not possible
|
||||
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
|
||||
stop(
|
||||
"dim: cannot get dimensions before dataset has been constructed, please call "
|
||||
, "lgb.Dataset.construct explicitly"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -367,10 +401,12 @@ Dataset <- R6::R6Class(
|
|||
|
||||
# Merge names with tab separation
|
||||
merged_name <- paste0(as.list(private$colnames), collapse = "\t")
|
||||
lgb.call("LGBM_DatasetSetFeatureNames_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
lgb.c_str(merged_name))
|
||||
lgb.call(
|
||||
"LGBM_DatasetSetFeatureNames_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, lgb.c_str(merged_name)
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -399,10 +435,12 @@ Dataset <- R6::R6Class(
|
|||
|
||||
# Get field size of info
|
||||
info_len <- 0L
|
||||
info_len <- lgb.call("LGBM_DatasetGetFieldSize_R",
|
||||
ret = info_len,
|
||||
private$handle,
|
||||
lgb.c_str(name))
|
||||
info_len <- lgb.call(
|
||||
"LGBM_DatasetGetFieldSize_R"
|
||||
, ret = info_len
|
||||
, private$handle
|
||||
, lgb.c_str(name)
|
||||
)
|
||||
|
||||
# Check if info is not empty
|
||||
if (info_len > 0) {
|
||||
|
@ -415,10 +453,12 @@ Dataset <- R6::R6Class(
|
|||
numeric(info_len) # Numeric
|
||||
}
|
||||
|
||||
ret <- lgb.call("LGBM_DatasetGetField_R",
|
||||
ret = ret,
|
||||
private$handle,
|
||||
lgb.c_str(name))
|
||||
ret <- lgb.call(
|
||||
"LGBM_DatasetGetField_R"
|
||||
, ret = ret
|
||||
, private$handle
|
||||
, lgb.c_str(name)
|
||||
)
|
||||
|
||||
private$info[[name]] <- ret
|
||||
|
||||
|
@ -454,12 +494,14 @@ Dataset <- R6::R6Class(
|
|||
|
||||
if (length(info) > 0) {
|
||||
|
||||
lgb.call("LGBM_DatasetSetField_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
lgb.c_str(name),
|
||||
info,
|
||||
length(info))
|
||||
lgb.call(
|
||||
"LGBM_DatasetSetField_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, lgb.c_str(name)
|
||||
, info
|
||||
, length(info)
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -474,16 +516,18 @@ Dataset <- R6::R6Class(
|
|||
slice = function(idxset, ...) {
|
||||
|
||||
# Perform slicing
|
||||
Dataset$new(NULL,
|
||||
private$params,
|
||||
self,
|
||||
private$colnames,
|
||||
private$categorical_feature,
|
||||
private$predictor,
|
||||
private$free_raw_data,
|
||||
sort(idxset, decreasing = FALSE),
|
||||
NULL,
|
||||
...)
|
||||
Dataset$new(
|
||||
data = NULL
|
||||
, params = private$params
|
||||
, reference = self
|
||||
, colnames = private$colnames
|
||||
, categorical_feature = private$categorical_feature
|
||||
, predictor = private$predictor
|
||||
, free_raw_data = private$free_raw_data
|
||||
, used_indices = sort(idxset, decreasing = FALSE)
|
||||
, info = NULL
|
||||
, ...
|
||||
)
|
||||
|
||||
},
|
||||
|
||||
|
@ -492,7 +536,12 @@ Dataset <- R6::R6Class(
|
|||
|
||||
# Parameter updating
|
||||
if (!lgb.is.null.handle(private$handle)) {
|
||||
lgb.call("LGBM_DatasetUpdateParam_R", ret = NULL, private$handle, lgb.params2str(params))
|
||||
lgb.call(
|
||||
"LGBM_DatasetUpdateParam_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, lgb.params2str(params)
|
||||
)
|
||||
return(invisible(self))
|
||||
}
|
||||
private$params <- modifyList(private$params, params)
|
||||
|
@ -568,10 +617,12 @@ Dataset <- R6::R6Class(
|
|||
|
||||
# Store binary data
|
||||
self$construct()
|
||||
lgb.call("LGBM_DatasetSaveBinary_R",
|
||||
ret = NULL,
|
||||
private$handle,
|
||||
lgb.c_str(fname))
|
||||
lgb.call(
|
||||
"LGBM_DatasetSaveBinary_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, lgb.c_str(fname)
|
||||
)
|
||||
return(invisible(self))
|
||||
}
|
||||
|
||||
|
@ -671,16 +722,18 @@ lgb.Dataset <- function(data,
|
|||
...) {
|
||||
|
||||
# Create new dataset
|
||||
invisible(Dataset$new(data,
|
||||
params,
|
||||
reference,
|
||||
colnames,
|
||||
categorical_feature,
|
||||
NULL,
|
||||
free_raw_data,
|
||||
NULL,
|
||||
info,
|
||||
...))
|
||||
invisible(Dataset$new(
|
||||
data = data
|
||||
, params = params
|
||||
, reference = reference
|
||||
, colnames = colnames
|
||||
, categorical_feature = categorical_feature
|
||||
, predictor = NULL
|
||||
, free_raw_data = free_raw_data
|
||||
, used_indices = NULL
|
||||
, info = info
|
||||
, ...
|
||||
))
|
||||
|
||||
}
|
||||
|
||||
|
@ -784,7 +837,7 @@ dim.lgb.Dataset <- function(x, ...) {
|
|||
#'
|
||||
#' @param x object of class \code{lgb.Dataset}
|
||||
#' @param value a list of two elements: the first one is ignored
|
||||
#' and the second one is column names
|
||||
#' and the second one is column names
|
||||
#'
|
||||
#' @details
|
||||
#' Generic \code{dimnames} methods are used by \code{colnames}.
|
||||
|
@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) {
|
|||
|
||||
# Check for unmatching column size
|
||||
if (ncol(x) != length(value[[2]])) {
|
||||
stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns")
|
||||
stop(
|
||||
"can't assign "
|
||||
, sQuote(length(value[[2]]))
|
||||
, " colnames to an lgb.Dataset with "
|
||||
, sQuote(ncol(x))
|
||||
, " columns"
|
||||
)
|
||||
}
|
||||
|
||||
# Set column names properly, and return
|
||||
|
|
|
@ -13,7 +13,11 @@ Predictor <- R6::R6Class(
|
|||
if (private$need_free_handle && !lgb.is.null.handle(private$handle)) {
|
||||
|
||||
# Freeing up handle
|
||||
lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
|
||||
lgb.call(
|
||||
"LGBM_BoosterFree_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
)
|
||||
private$handle <- NULL
|
||||
|
||||
}
|
||||
|
@ -31,7 +35,11 @@ Predictor <- R6::R6Class(
|
|||
if (is.character(modelfile)) {
|
||||
|
||||
# Create handle on it
|
||||
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile))
|
||||
handle <- lgb.call(
|
||||
"LGBM_BoosterCreateFromModelfile_R"
|
||||
, ret = handle
|
||||
, lgb.c_str(modelfile)
|
||||
)
|
||||
private$need_free_handle <- TRUE
|
||||
|
||||
} else if (methods::is(modelfile, "lgb.Booster.handle")) {
|
||||
|
@ -57,7 +65,11 @@ Predictor <- R6::R6Class(
|
|||
current_iter = function() {
|
||||
|
||||
cur_iter <- 0L
|
||||
lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle)
|
||||
lgb.call(
|
||||
"LGBM_BoosterGetCurrentIteration_R"
|
||||
, ret = cur_iter
|
||||
, private$handle
|
||||
)
|
||||
|
||||
},
|
||||
|
||||
|
@ -86,14 +98,19 @@ Predictor <- R6::R6Class(
|
|||
on.exit(unlink(tmp_filename), add = TRUE)
|
||||
|
||||
# Predict from temporary file
|
||||
lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data,
|
||||
as.integer(header),
|
||||
as.integer(rawscore),
|
||||
as.integer(predleaf),
|
||||
as.integer(predcontrib),
|
||||
as.integer(num_iteration),
|
||||
private$params,
|
||||
lgb.c_str(tmp_filename))
|
||||
lgb.call(
|
||||
"LGBM_BoosterPredictForFile_R"
|
||||
, ret = NULL
|
||||
, private$handle
|
||||
, data
|
||||
, as.integer(header)
|
||||
, as.integer(rawscore)
|
||||
, as.integer(predleaf)
|
||||
, as.integer(predcontrib)
|
||||
, as.integer(num_iteration)
|
||||
, private$params
|
||||
, lgb.c_str(tmp_filename)
|
||||
)
|
||||
|
||||
# Get predictions from file
|
||||
preds <- read.delim(tmp_filename, header = FALSE, sep = "\t")
|
||||
|
@ -108,51 +125,57 @@ Predictor <- R6::R6Class(
|
|||
npred <- 0L
|
||||
|
||||
# Check number of predictions to do
|
||||
npred <- lgb.call("LGBM_BoosterCalcNumPredict_R",
|
||||
ret = npred,
|
||||
private$handle,
|
||||
as.integer(num_row),
|
||||
as.integer(rawscore),
|
||||
as.integer(predleaf),
|
||||
as.integer(predcontrib),
|
||||
as.integer(num_iteration))
|
||||
npred <- lgb.call(
|
||||
"LGBM_BoosterCalcNumPredict_R"
|
||||
, ret = npred
|
||||
, private$handle
|
||||
, as.integer(num_row)
|
||||
, as.integer(rawscore)
|
||||
, as.integer(predleaf)
|
||||
, as.integer(predcontrib)
|
||||
, as.integer(num_iteration)
|
||||
)
|
||||
|
||||
# Pre-allocate empty vector
|
||||
preds <- numeric(npred)
|
||||
|
||||
# Check if data is a matrix
|
||||
if (is.matrix(data)) {
|
||||
preds <- lgb.call("LGBM_BoosterPredictForMat_R",
|
||||
ret = preds,
|
||||
private$handle,
|
||||
data,
|
||||
as.integer(nrow(data)),
|
||||
as.integer(ncol(data)),
|
||||
as.integer(rawscore),
|
||||
as.integer(predleaf),
|
||||
as.integer(predcontrib),
|
||||
as.integer(num_iteration),
|
||||
private$params)
|
||||
preds <- lgb.call(
|
||||
"LGBM_BoosterPredictForMat_R"
|
||||
, ret = preds
|
||||
, private$handle
|
||||
, data
|
||||
, as.integer(nrow(data))
|
||||
, as.integer(ncol(data))
|
||||
, as.integer(rawscore)
|
||||
, as.integer(predleaf)
|
||||
, as.integer(predcontrib)
|
||||
, as.integer(num_iteration)
|
||||
, private$params
|
||||
)
|
||||
|
||||
} else if (methods::is(data, "dgCMatrix")) {
|
||||
if (length(data@p) > 2147483647) {
|
||||
stop("Cannot support large CSC matrix")
|
||||
}
|
||||
# Check if data is a dgCMatrix (sparse matrix, column compressed format)
|
||||
preds <- lgb.call("LGBM_BoosterPredictForCSC_R",
|
||||
ret = preds,
|
||||
private$handle,
|
||||
data@p,
|
||||
data@i,
|
||||
data@x,
|
||||
length(data@p),
|
||||
length(data@x),
|
||||
nrow(data),
|
||||
as.integer(rawscore),
|
||||
as.integer(predleaf),
|
||||
as.integer(predcontrib),
|
||||
as.integer(num_iteration),
|
||||
private$params)
|
||||
preds <- lgb.call(
|
||||
"LGBM_BoosterPredictForCSC_R"
|
||||
, ret = preds
|
||||
, private$handle
|
||||
, data@p
|
||||
, data@i
|
||||
, data@x
|
||||
, length(data@p)
|
||||
, length(data@x)
|
||||
, nrow(data)
|
||||
, as.integer(rawscore)
|
||||
, as.integer(predleaf)
|
||||
, as.integer(predcontrib)
|
||||
, as.integer(num_iteration)
|
||||
, private$params
|
||||
)
|
||||
|
||||
} else {
|
||||
|
||||
|
@ -165,7 +188,12 @@ Predictor <- R6::R6Class(
|
|||
|
||||
# Check if number of rows is strange (not a multiple of the dataset rows)
|
||||
if (length(preds) %% num_row != 0) {
|
||||
stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row))
|
||||
stop(
|
||||
"predict: prediction length "
|
||||
, sQuote(length(preds))
|
||||
," is not a multiple of nrows(data): "
|
||||
, sQuote(num_row)
|
||||
)
|
||||
}
|
||||
|
||||
# Get number of cases per row
|
||||
|
@ -192,7 +220,9 @@ Predictor <- R6::R6Class(
|
|||
}
|
||||
|
||||
),
|
||||
private = list(handle = NULL,
|
||||
need_free_handle = FALSE,
|
||||
params = "")
|
||||
private = list(
|
||||
handle = NULL
|
||||
, need_free_handle = FALSE
|
||||
, params = ""
|
||||
)
|
||||
)
|
||||
|
|
|
@ -25,23 +25,23 @@ CVBooster <- R6::R6Class(
|
|||
#' @param label vector of response values. Should be provided only when data is an R-matrix.
|
||||
#' @param weight vector of response values. If not NULL, will set to dataset
|
||||
#' @param obj objective function, can be character or custom objective function. Examples include
|
||||
#' \code{regression}, \code{regression_l1}, \code{huber},
|
||||
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
|
||||
#' \code{regression}, \code{regression_l1}, \code{huber},
|
||||
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
|
||||
#' @param eval evaluation function, can be (list of) character or custom eval function
|
||||
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
|
||||
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
|
||||
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
|
||||
#' by the values of outcome labels.
|
||||
#' by the values of outcome labels.
|
||||
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
|
||||
#' (each element must be a vector of test fold's indices). When folds are supplied,
|
||||
#' the \code{nfold} and \code{stratified} parameters are ignored.
|
||||
#' (each element must be a vector of test fold's indices). When folds are supplied,
|
||||
#' the \code{nfold} and \code{stratified} parameters are ignored.
|
||||
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
|
||||
#' @param categorical_feature list of str or int
|
||||
#' type int represents index,
|
||||
#' type str represents feature names
|
||||
#' @param callbacks list of callback functions
|
||||
#' List of callback functions that are applied at each iteration.
|
||||
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
|
||||
#' type int represents index,
|
||||
#' type str represents feature names
|
||||
#' @param callbacks List of callback functions that are applied at each iteration.
|
||||
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
|
||||
#' into a predictor model which frees up memory and the original datasets
|
||||
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
|
||||
#' \itemize{
|
||||
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
|
||||
|
@ -61,13 +61,15 @@ CVBooster <- R6::R6Class(
|
|||
#' train <- agaricus.train
|
||||
#' dtrain <- lgb.Dataset(train$data, label = train$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' model <- lgb.cv(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' nfold = 3,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.cv(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , nfold = 3
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' @export
|
||||
lgb.cv <- function(params = list(),
|
||||
data,
|
||||
|
@ -134,7 +136,17 @@ lgb.cv <- function(params = list(),
|
|||
begin_iteration <- predictor$current_iter() + 1
|
||||
}
|
||||
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
|
||||
n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
|
||||
n_trees <- c(
|
||||
"num_iterations"
|
||||
, "num_iteration"
|
||||
, "n_iter"
|
||||
, "num_tree"
|
||||
, "num_trees"
|
||||
, "num_round"
|
||||
, "num_rounds"
|
||||
, "num_boost_round"
|
||||
, "n_estimators"
|
||||
)
|
||||
if (any(names(params) %in% n_trees)) {
|
||||
end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1
|
||||
} else {
|
||||
|
@ -192,12 +204,14 @@ lgb.cv <- function(params = list(),
|
|||
}
|
||||
|
||||
# Create folds
|
||||
folds <- generate.cv.folds(nfold,
|
||||
nrow(data),
|
||||
stratified,
|
||||
getinfo(data, "label"),
|
||||
getinfo(data, "group"),
|
||||
params)
|
||||
folds <- generate.cv.folds(
|
||||
nfold
|
||||
, nrow(data)
|
||||
, stratified
|
||||
, getinfo(data, "label")
|
||||
, getinfo(data, "group")
|
||||
, params
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -215,12 +229,24 @@ lgb.cv <- function(params = list(),
|
|||
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
|
||||
if (any(names(params) %in% early_stop)) {
|
||||
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
|
||||
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
|
||||
callbacks <- add.cb(
|
||||
callbacks
|
||||
, cb.early.stop(
|
||||
params[[which(names(params) %in% early_stop)[1]]]
|
||||
, verbose = verbose
|
||||
)
|
||||
)
|
||||
}
|
||||
} else {
|
||||
if (!is.null(early_stopping_rounds)) {
|
||||
if (early_stopping_rounds > 0) {
|
||||
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
|
||||
callbacks <- add.cb(
|
||||
callbacks
|
||||
, cb.early.stop(
|
||||
early_stopping_rounds
|
||||
, verbose = verbose
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -292,7 +318,7 @@ lgb.cv <- function(params = list(),
|
|||
env$eval_list <- merged_msg$eval_list
|
||||
|
||||
# Check for standard deviation requirement
|
||||
if(showsd) {
|
||||
if (showsd) {
|
||||
env$eval_err_list <- merged_msg$eval_err_list
|
||||
}
|
||||
|
||||
|
@ -319,9 +345,11 @@ lgb.cv <- function(params = list(),
|
|||
if (reset_data) {
|
||||
lapply(cv_booster$boosters, function(fd) {
|
||||
# Store temporarily model data elsewhere
|
||||
booster_old <- list(best_iter = fd$booster$best_iter,
|
||||
best_score = fd$booster$best_score,
|
||||
record_evals = fd$booster$record_evals)
|
||||
booster_old <- list(
|
||||
best_iter = fd$booster$best_iter
|
||||
, best_score = fd$booster$best_score,
|
||||
, record_evals = fd$booster$record_evals
|
||||
)
|
||||
# Reload model
|
||||
fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
|
||||
fd$booster$best_iter <- booster_old$best_iter
|
||||
|
@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
|
|||
# Loop through each fold
|
||||
for (i in seq_len(nfold)) {
|
||||
kstep <- length(rnd_idx) %/% (nfold - i + 1)
|
||||
folds[[i]] <- list(fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]),
|
||||
group = rnd_idx[seq_len(kstep)])
|
||||
folds[[i]] <- list(
|
||||
fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
|
||||
, group = rnd_idx[seq_len(kstep)]
|
||||
)
|
||||
rnd_idx <- rnd_idx[-seq_len(kstep)]
|
||||
}
|
||||
|
||||
|
@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) {
|
|||
if (is.numeric(y)) {
|
||||
|
||||
cuts <- length(y) %/% k
|
||||
if (cuts < 2) { cuts <- 2 }
|
||||
if (cuts > 5) { cuts <- 5 }
|
||||
y <- cut(y,
|
||||
unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))),
|
||||
include.lowest = TRUE)
|
||||
if (cuts < 2) {
|
||||
cuts <- 2
|
||||
}
|
||||
if (cuts > 5) {
|
||||
cuts <- 5
|
||||
}
|
||||
y <- cut(
|
||||
y
|
||||
, unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts)))
|
||||
, include.lowest = TRUE
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
|
|||
|
||||
# Parse standard deviation
|
||||
for (j in seq_len(eval_len)) {
|
||||
ret_eval_err <- c(ret_eval_err,
|
||||
sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2))
|
||||
ret_eval_err <- c(
|
||||
ret_eval_err
|
||||
, sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)
|
||||
)
|
||||
}
|
||||
|
||||
# Convert to list
|
||||
|
@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
|
|||
}
|
||||
|
||||
# Return errors
|
||||
list(eval_list = ret_eval,
|
||||
eval_err_list = ret_eval_err)
|
||||
list(
|
||||
eval_list = ret_eval
|
||||
, eval_err_list = ret_eval_err
|
||||
)
|
||||
|
||||
}
|
||||
|
|
|
@ -21,9 +21,14 @@
|
|||
#' train <- agaricus.train
|
||||
#' dtrain <- lgb.Dataset(train$data, label = train$label)
|
||||
#'
|
||||
#' params <- list(objective = "binary",
|
||||
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
|
||||
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
|
||||
#' params <- list(
|
||||
#' objective = "binary"
|
||||
#' , learning_rate = 0.01
|
||||
#' , num_leaves = 63
|
||||
#' , max_depth = -1
|
||||
#' , min_data_in_leaf = 1
|
||||
#' , min_sum_hessian_in_leaf = 1
|
||||
#' )
|
||||
#' model <- lgb.train(params, dtrain, 10)
|
||||
#'
|
||||
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
|
||||
|
@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) {
|
|||
|
||||
# Check if relative values are requested
|
||||
if (percentage) {
|
||||
tree_imp_dt[, ":="(Gain = Gain / sum(Gain),
|
||||
Cover = Cover / sum(Cover),
|
||||
Frequency = Frequency / sum(Frequency))]
|
||||
tree_imp_dt[, `:=`(
|
||||
Gain = Gain / sum(Gain)
|
||||
, Cover = Cover / sum(Cover)
|
||||
, Frequency = Frequency / sum(Frequency)
|
||||
)]
|
||||
}
|
||||
|
||||
# Return importance table
|
||||
|
|
|
@ -69,12 +69,21 @@ lgb.interprete <- function(model,
|
|||
)
|
||||
|
||||
# Get list of trees
|
||||
tree_index_mat_list <- lapply(leaf_index_mat_list,
|
||||
FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE))
|
||||
tree_index_mat_list <- lapply(
|
||||
X = leaf_index_mat_list
|
||||
, FUN = function(x){
|
||||
matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)
|
||||
}
|
||||
)
|
||||
|
||||
# Sequence over idxset
|
||||
for (i in seq_along(idxset)) {
|
||||
tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]])
|
||||
tree_interpretation_dt_list[[i]] <- single.row.interprete(
|
||||
tree_dt
|
||||
, num_class
|
||||
, tree_index_mat_list[[i]]
|
||||
, leaf_index_mat_list[[i]]
|
||||
)
|
||||
}
|
||||
|
||||
# Return interpretation list
|
||||
|
@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt,
|
|||
leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])
|
||||
|
||||
# Return formatted data.table
|
||||
data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq))
|
||||
data.table::data.table(
|
||||
Feature = feature_seq
|
||||
, Contribution = diff.default(value_seq)
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index
|
|||
} else {
|
||||
|
||||
# Full interpretation elements
|
||||
tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE),
|
||||
x = tree_interpretation)
|
||||
tree_interpretation_dt <- Reduce(
|
||||
f = function(x, y){
|
||||
merge(x, y, by = "Feature", all = TRUE)
|
||||
}
|
||||
, x = tree_interpretation
|
||||
)
|
||||
|
||||
# Loop throughout each tree
|
||||
for (j in 2:ncol(tree_interpretation_dt)) {
|
||||
|
||||
data.table::set(tree_interpretation_dt,
|
||||
i = which(is.na(tree_interpretation_dt[[j]])),
|
||||
j = j,
|
||||
value = 0)
|
||||
data.table::set(
|
||||
tree_interpretation_dt
|
||||
, i = which(is.na(tree_interpretation_dt[[j]]))
|
||||
, j = j
|
||||
, value = 0
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -35,9 +35,14 @@
|
|||
#' train <- agaricus.train
|
||||
#' dtrain <- lgb.Dataset(train$data, label = train$label)
|
||||
#'
|
||||
#' params <- list(objective = "binary",
|
||||
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
|
||||
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
|
||||
#' params <- list(
|
||||
#' objective = "binary"
|
||||
#' , learning_rate = 0.01
|
||||
#' , num_leaves = 63
|
||||
#' , max_depth = -1
|
||||
#' , min_data_in_leaf = 1
|
||||
#' , min_sum_hessian_in_leaf = 1
|
||||
#' )
|
||||
#' model <- lgb.train(params, dtrain, 10)
|
||||
#'
|
||||
#' tree_dt <- lgb.model.dt.tree(model)
|
||||
|
@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
|
|||
json_model <- lgb.dump(model, num_iteration = num_iteration)
|
||||
|
||||
# Parse json model second
|
||||
parsed_json_model <- jsonlite::fromJSON(json_model,
|
||||
simplifyVector = TRUE,
|
||||
simplifyDataFrame = FALSE,
|
||||
simplifyMatrix = FALSE,
|
||||
flatten = FALSE)
|
||||
parsed_json_model <- jsonlite::fromJSON(
|
||||
json_model
|
||||
, simplifyVector = TRUE
|
||||
, simplifyDataFrame = FALSE
|
||||
, simplifyMatrix = FALSE
|
||||
, flatten = FALSE
|
||||
)
|
||||
|
||||
# Parse tree model third
|
||||
tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
|
||||
|
@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) {
|
|||
if (is.null(env)) {
|
||||
# Setup initial default data.table with default types
|
||||
env <- new.env(parent = emptyenv())
|
||||
env$single_tree_dt <- data.table::data.table(tree_index = integer(0),
|
||||
depth = integer(0),
|
||||
split_index = integer(0),
|
||||
split_feature = integer(0),
|
||||
node_parent = integer(0),
|
||||
leaf_index = integer(0),
|
||||
leaf_parent = integer(0),
|
||||
split_gain = numeric(0),
|
||||
threshold = numeric(0),
|
||||
decision_type = character(0),
|
||||
default_left = character(0),
|
||||
internal_value = integer(0),
|
||||
internal_count = integer(0),
|
||||
leaf_value = integer(0),
|
||||
leaf_count = integer(0))
|
||||
env$single_tree_dt <- data.table::data.table(
|
||||
tree_index = integer(0)
|
||||
, depth = integer(0)
|
||||
, split_index = integer(0)
|
||||
, split_feature = integer(0)
|
||||
, node_parent = integer(0)
|
||||
, leaf_index = integer(0)
|
||||
, leaf_parent = integer(0)
|
||||
, split_gain = numeric(0)
|
||||
, threshold = numeric(0)
|
||||
, decision_type = character(0)
|
||||
, default_left = character(0)
|
||||
, internal_value = integer(0)
|
||||
, internal_count = integer(0)
|
||||
, leaf_value = integer(0)
|
||||
, leaf_count = integer(0)
|
||||
)
|
||||
# start tree traversal
|
||||
pre_order_traversal(env, tree_node_leaf, current_depth, parent_index)
|
||||
} else {
|
||||
|
@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) {
|
|||
fill = TRUE)
|
||||
|
||||
# Traverse tree again both left and right
|
||||
pre_order_traversal(env,
|
||||
tree_node_leaf$left_child,
|
||||
current_depth = current_depth + 1L,
|
||||
parent_index = tree_node_leaf$split_index)
|
||||
pre_order_traversal(env,
|
||||
tree_node_leaf$right_child,
|
||||
current_depth = current_depth + 1L,
|
||||
parent_index = tree_node_leaf$split_index)
|
||||
pre_order_traversal(
|
||||
env
|
||||
, tree_node_leaf$left_child
|
||||
, current_depth = current_depth + 1L
|
||||
, parent_index = tree_node_leaf$split_index
|
||||
)
|
||||
pre_order_traversal(
|
||||
env
|
||||
, tree_node_leaf$right_child
|
||||
, current_depth = current_depth + 1L
|
||||
, parent_index = tree_node_leaf$split_index
|
||||
)
|
||||
|
||||
} else if (!is.null(tree_node_leaf$leaf_index)) {
|
||||
|
||||
|
|
|
@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp,
|
|||
cex = NULL) {
|
||||
|
||||
# Check for measurement (column names) correctness
|
||||
measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE)
|
||||
measure <- match.arg(
|
||||
measure
|
||||
, choices = c("Gain", "Cover", "Frequency")
|
||||
, several.ok = FALSE
|
||||
)
|
||||
|
||||
# Get top N importance (defaults to 10)
|
||||
top_n <- min(top_n, nrow(tree_imp))
|
||||
|
@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp,
|
|||
# Do plot
|
||||
tree_imp[.N:1,
|
||||
graphics::barplot(
|
||||
height = get(measure),
|
||||
names.arg = Feature,
|
||||
horiz = TRUE,
|
||||
border = NA,
|
||||
main = "Feature Importance",
|
||||
xlab = measure,
|
||||
cex.names = cex,
|
||||
las = 1
|
||||
height = get(measure)
|
||||
, names.arg = Feature
|
||||
, horiz = TRUE
|
||||
, border = NA
|
||||
, main = "Feature Importance"
|
||||
, xlab = measure
|
||||
, cex.names = cex
|
||||
, las = 1
|
||||
)]
|
||||
|
||||
# Return invisibly
|
||||
|
|
|
@ -9,8 +9,8 @@
|
|||
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
|
||||
#'
|
||||
#' @details
|
||||
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
|
||||
#' Features are shown ranked in a decreasing contribution order.
|
||||
#' The graph represents each feature as a horizontal bar of length proportional to the defined
|
||||
#' contribution of a feature. Features are shown ranked in a decreasing contribution order.
|
||||
#'
|
||||
#' @return
|
||||
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}.
|
||||
|
@ -26,9 +26,14 @@
|
|||
#' data(agaricus.test, package = "lightgbm")
|
||||
#' test <- agaricus.test
|
||||
#'
|
||||
#' params <- list(objective = "binary",
|
||||
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
|
||||
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
|
||||
#' params <- list(
|
||||
#' objective = "binary"
|
||||
#' , learning_rate = 0.01
|
||||
#' , num_leaves = 63
|
||||
#' , max_depth = -1
|
||||
#' , min_data_in_leaf = 1
|
||||
#' , min_sum_hessian_in_leaf = 1
|
||||
#' )
|
||||
#' model <- lgb.train(params, dtrain, 10)
|
||||
#'
|
||||
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
|
||||
|
@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt,
|
|||
if (num_class == 1) {
|
||||
|
||||
# Only one class, plot straight away
|
||||
multiple.tree.plot.interpretation(tree_interpretation_dt,
|
||||
top_n = top_n,
|
||||
title = NULL,
|
||||
cex = cex)
|
||||
multiple.tree.plot.interpretation(
|
||||
tree_interpretation_dt
|
||||
, top_n = top_n
|
||||
, title = NULL
|
||||
, cex = cex
|
||||
)
|
||||
|
||||
} else {
|
||||
|
||||
# More than one class, shape data first
|
||||
layout_mat <- matrix(seq.int(to = cols * ceiling(num_class / cols)),
|
||||
ncol = cols, nrow = ceiling(num_class / cols))
|
||||
layout_mat <- matrix(
|
||||
seq.int(to = cols * ceiling(num_class / cols))
|
||||
, ncol = cols
|
||||
, nrow = ceiling(num_class / cols)
|
||||
)
|
||||
|
||||
# Shape output
|
||||
graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
|
||||
|
@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation,
|
|||
# Do plot
|
||||
tree_interpretation[.N:1,
|
||||
graphics::barplot(
|
||||
height = Contribution,
|
||||
names.arg = Feature,
|
||||
horiz = TRUE,
|
||||
col = ifelse(Contribution > 0, "firebrick", "steelblue"),
|
||||
border = NA,
|
||||
main = title,
|
||||
cex.names = cex,
|
||||
las = 1
|
||||
height = Contribution
|
||||
, names.arg = Feature
|
||||
, horiz = TRUE
|
||||
, col = ifelse(Contribution > 0, "firebrick", "steelblue")
|
||||
, border = NA
|
||||
, main = title
|
||||
, cex.names = cex
|
||||
, las = 1
|
||||
)]
|
||||
|
||||
# Return invisibly
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
#' Data preparator for LightGBM datasets (numeric)
|
||||
#'
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
|
||||
#' Factors and characters are converted to numeric without integers. Please use
|
||||
#' \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
|
||||
#'
|
||||
#' @param data A data.frame or data.table to prepare.
|
||||
#'
|
||||
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
|
||||
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
|
||||
#' for input in \code{lgb.Dataset}.
|
||||
#'
|
||||
#' @examples
|
||||
#' library(lightgbm)
|
||||
|
@ -71,7 +74,11 @@ lgb.prepare <- function(data) {
|
|||
} else {
|
||||
|
||||
# What do you think you are doing here? Throw error.
|
||||
stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
|
||||
stop(
|
||||
"lgb.prepare2: you provided "
|
||||
, paste(class(data), collapse = " & ")
|
||||
, " but data should have class data.frame"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
#' Data preparator for LightGBM datasets (integer)
|
||||
#'
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
|
||||
#' Factors and characters are converted to numeric (specifically: integer).
|
||||
#' Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
|
||||
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
|
||||
#' Note that there are programs which do not support integer-only input. Consider this as a half
|
||||
#' memory technique which is dangerous, especially for LightGBM.
|
||||
#'
|
||||
#' @param data A data.frame or data.table to prepare.
|
||||
#'
|
||||
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
|
||||
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
|
||||
#' for input in \code{lgb.Dataset}.
|
||||
#'
|
||||
#' @examples
|
||||
#' library(lightgbm)
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
#' Data preparator for LightGBM datasets with rules (numeric)
|
||||
#'
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
|
||||
#' Factors and characters are converted to numeric. In addition, keeps rules created
|
||||
#' so you can convert other datasets using this converter.
|
||||
#'
|
||||
#' @param data A data.frame or data.table to prepare.
|
||||
#' @param rules A set of rules from the data preparator, if already used.
|
||||
#'
|
||||
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
|
||||
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
|
||||
#' The data must be converted to a matrix format (\code{as.matrix}) for input
|
||||
#' in \code{lgb.Dataset}.
|
||||
#'
|
||||
#' @examples
|
||||
#' library(lightgbm)
|
||||
|
@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) {
|
|||
} else {
|
||||
|
||||
# What do you think you are doing here? Throw error.
|
||||
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
|
||||
stop(
|
||||
"lgb.prepare: you provided "
|
||||
, paste(class(data), collapse = " & ")
|
||||
, " but data should have class data.frame"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
#' Data preparator for LightGBM datasets with rules (integer)
|
||||
#'
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
|
||||
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
|
||||
#' Factors and characters are converted to numeric (specifically: integer).
|
||||
#' In addition, keeps rules created so you can convert other datasets using this converter.
|
||||
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
|
||||
#' Note that there are programs which do not support integer-only input.
|
||||
#' Consider this as a half memory technique which is dangerous, especially for LightGBM.
|
||||
#'
|
||||
#' @param data A data.frame or data.table to prepare.
|
||||
#' @param rules A set of rules from the data preparator, if already used.
|
||||
#'
|
||||
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
|
||||
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
|
||||
#' The data must be converted to a matrix format (\code{as.matrix}) for input in
|
||||
#' \code{lgb.Dataset}.
|
||||
#'
|
||||
#' @examples
|
||||
#' library(lightgbm)
|
||||
|
@ -35,9 +42,13 @@
|
|||
#' data(iris) # Erase iris dataset
|
||||
#'
|
||||
#' # We remapped values differently
|
||||
#' personal_rules <- list(Species = c("setosa" = 3L,
|
||||
#' "versicolor" = 2L,
|
||||
#' "virginica" = 1L))
|
||||
#' personal_rules <- list(
|
||||
#' Species = c(
|
||||
#' "setosa" = 3L
|
||||
#' , "versicolor" = 2L
|
||||
#' , "virginica" = 1L
|
||||
#' )
|
||||
#' )
|
||||
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
|
||||
#' str(newest_iris$data) # SUCCESS!
|
||||
#'
|
||||
|
@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) {
|
|||
} else {
|
||||
|
||||
# What do you think you are doing here? Throw error.
|
||||
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
|
||||
stop(
|
||||
"lgb.prepare: you provided "
|
||||
, paste(class(data), collapse = " & ")
|
||||
, " but data should have class data.frame"
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -4,17 +4,18 @@
|
|||
#' @inheritParams lgb_shared_params
|
||||
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
|
||||
#' @param obj objective function, can be character or custom objective function. Examples include
|
||||
#' \code{regression}, \code{regression_l1}, \code{huber},
|
||||
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
|
||||
#' \code{regression}, \code{regression_l1}, \code{huber},
|
||||
#' \code{binary}, \code{lambdarank}, \code{multiclass}
|
||||
#' @param eval evaluation function, can be (a list of) character or custom eval function
|
||||
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
|
||||
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
|
||||
#' @param categorical_feature list of str or int
|
||||
#' type int represents index,
|
||||
#' type str represents feature names
|
||||
#' @param callbacks list of callback functions
|
||||
#' List of callback functions that are applied at each iteration.
|
||||
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
|
||||
#' type int represents index,
|
||||
#' type str represents feature names
|
||||
#' @param callbacks List of callback functions that are applied at each iteration.
|
||||
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
|
||||
#' booster model into a predictor model which frees up memory and the
|
||||
#' original datasets
|
||||
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
|
||||
#' \itemize{
|
||||
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
|
||||
|
@ -37,13 +38,15 @@
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' @export
|
||||
lgb.train <- function(params = list(),
|
||||
data,
|
||||
|
@ -105,7 +108,17 @@ lgb.train <- function(params = list(),
|
|||
begin_iteration <- predictor$current_iter() + 1
|
||||
}
|
||||
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
|
||||
n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
|
||||
n_rounds <- c(
|
||||
"num_iterations"
|
||||
, "num_iteration"
|
||||
, "n_iter"
|
||||
, "num_tree"
|
||||
, "num_trees"
|
||||
, "num_round"
|
||||
, "num_rounds"
|
||||
, "num_boost_round"
|
||||
, "n_estimators"
|
||||
)
|
||||
if (any(names(params) %in% n_rounds)) {
|
||||
end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1
|
||||
} else {
|
||||
|
@ -198,12 +211,24 @@ lgb.train <- function(params = list(),
|
|||
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
|
||||
if (any(names(params) %in% early_stop)) {
|
||||
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
|
||||
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
|
||||
callbacks <- add.cb(
|
||||
callbacks
|
||||
, cb.early.stop(
|
||||
params[[which(names(params) %in% early_stop)[1]]]
|
||||
, verbose = verbose
|
||||
)
|
||||
)
|
||||
}
|
||||
} else {
|
||||
if (!is.null(early_stopping_rounds)) {
|
||||
if (early_stopping_rounds > 0) {
|
||||
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
|
||||
callbacks <- add.cb(
|
||||
callbacks
|
||||
, cb.early.stop(
|
||||
early_stopping_rounds
|
||||
, verbose = verbose
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -267,7 +292,8 @@ lgb.train <- function(params = list(),
|
|||
|
||||
}
|
||||
|
||||
# When early stopping is not activated, we compute the best iteration / score ourselves by selecting the first metric and the first dataset
|
||||
# When early stopping is not activated, we compute the best iteration / score ourselves by
|
||||
# selecting the first metric and the first dataset
|
||||
if (record && length(valids) > 0 && is.na(env$best_score)) {
|
||||
if (env$eval_list[[1]]$higher_better[1] == TRUE) {
|
||||
booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]])))
|
||||
|
@ -282,9 +308,11 @@ lgb.train <- function(params = list(),
|
|||
if (reset_data) {
|
||||
|
||||
# Store temporarily model data elsewhere
|
||||
booster_old <- list(best_iter = booster$best_iter,
|
||||
best_score = booster$best_score,
|
||||
record_evals = booster$record_evals)
|
||||
booster_old <- list(
|
||||
best_iter = booster$best_iter
|
||||
, best_score = booster$best_score
|
||||
, record_evals = booster$record_evals
|
||||
)
|
||||
|
||||
# Reload model
|
||||
booster <- lgb.load(model_str = booster$save_model_to_string())
|
||||
|
|
|
@ -2,9 +2,13 @@
|
|||
#'
|
||||
#' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
|
||||
#'
|
||||
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.
|
||||
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
|
||||
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
|
||||
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R.
|
||||
#' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
|
||||
#' unloading is performed.
|
||||
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
|
||||
#' environment. Defaults to \code{FALSE} which means to not remove them.
|
||||
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
|
||||
#' \code{.GlobalEnv} which is the global environment.
|
||||
#'
|
||||
#' @return NULL invisibly.
|
||||
#'
|
||||
|
@ -18,13 +22,15 @@
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#'
|
||||
#' \dontrun{
|
||||
#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
|
||||
|
@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
|
|||
|
||||
# Should we wipe variables? (lgb.Booster, lgb.Dataset)
|
||||
if (wipe) {
|
||||
boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir))
|
||||
datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir))
|
||||
boosters <- Filter(
|
||||
f = function(x){
|
||||
inherits(get(x, envir = envir), "lgb.Booster")
|
||||
}
|
||||
, x = ls(envir = envir)
|
||||
)
|
||||
datasets <- Filter(
|
||||
f = function(x){
|
||||
inherits(get(x, envir = envir), "lgb.Dataset")
|
||||
}
|
||||
, x = ls(envir = envir)
|
||||
)
|
||||
rm(list = c(boosters, datasets), envir = envir)
|
||||
gc(verbose = FALSE)
|
||||
}
|
||||
|
|
|
@ -4,12 +4,10 @@
|
|||
#' @param callbacks list of callback functions
|
||||
#' List of callback functions that are applied at each iteration.
|
||||
#' @param data a \code{lgb.Dataset} object, used for training
|
||||
#' @param early_stopping_rounds int
|
||||
#' Activates early stopping.
|
||||
#' Requires at least one validation data and one metric
|
||||
#' If there's more than one, will check all of them except the training data
|
||||
#' Returns the model with (best_iter + early_stopping_rounds)
|
||||
#' If early stopping occurs, the model will have 'best_iter' field
|
||||
#' @param early_stopping_rounds int. Activates early stopping. Requires at least one validation data
|
||||
#' and one metric. If there's more than one, will check all of them
|
||||
#' except the training data. Returns the model with (best_iter + early_stopping_rounds).
|
||||
#' If early stopping occurs, the model will have 'best_iter' field.
|
||||
#' @param eval_freq evaluation output frequency, only effect when verbose > 0
|
||||
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
|
||||
#' @param nrounds number of training rounds
|
||||
|
@ -76,9 +74,18 @@ lightgbm <- function(data,
|
|||
}
|
||||
|
||||
# Train a model using the regular way
|
||||
bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq,
|
||||
early_stopping_rounds = early_stopping_rounds,
|
||||
init_model = init_model, callbacks = callbacks, ...)
|
||||
bst <- lgb.train(
|
||||
params = params
|
||||
, data = dtrain
|
||||
, nrounds = nrounds
|
||||
, valids = valids
|
||||
, verbose = verbose
|
||||
, eval_freq = eval_freq
|
||||
, early_stopping_rounds = early_stopping_rounds
|
||||
, init_model = init_model
|
||||
, callbacks = callbacks
|
||||
, ...
|
||||
)
|
||||
|
||||
# Store model under a specific name
|
||||
bst$save_model(save_name)
|
||||
|
|
|
@ -17,13 +17,15 @@
|
|||
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(params,
|
||||
#' dtrain,
|
||||
#' 10,
|
||||
#' valids,
|
||||
#' min_data = 1,
|
||||
#' learning_rate = 1,
|
||||
#' early_stopping_rounds = 5)
|
||||
#' model <- lgb.train(
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
#' )
|
||||
#' saveRDS.lgb.Booster(model, "model.rds")
|
||||
#' new_model <- readRDS.lgb.Booster("model.rds")
|
||||
#'
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
#' saveRDS for \code{lgb.Booster} models
|
||||
#'
|
||||
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
|
||||
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides
|
||||
#' whether to save the raw model or not.
|
||||
#'
|
||||
#' @param object R object to serialize.
|
||||
#' @param file a connection or the name of the file where the R object is saved to or read from.
|
||||
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.
|
||||
#' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.
|
||||
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.
|
||||
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default),
|
||||
#' a binary one is used. See the comments in the help for save.
|
||||
#' @param version the workspace format version to use. \code{NULL} specifies the current default
|
||||
#' version (2). Versions prior to 2 are not supported, so this will only be relevant
|
||||
#' when there are later versions.
|
||||
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression,
|
||||
#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of
|
||||
#' compression to be used. Ignored if file is a connection.
|
||||
#' @param refhook a hook function for handling reference objects.
|
||||
#' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.
|
||||
#'
|
||||
|
@ -23,10 +29,10 @@
|
|||
#' params <- list(objective = "regression", metric = "l2")
|
||||
#' valids <- list(test = dtest)
|
||||
#' model <- lgb.train(
|
||||
#' params
|
||||
#' , dtrain
|
||||
#' , 10
|
||||
#' , valids
|
||||
#' params = params
|
||||
#' , data = dtrain
|
||||
#' , nrounds = 10
|
||||
#' , valids = valids
|
||||
#' , min_data = 1
|
||||
#' , learning_rate = 1
|
||||
#' , early_stopping_rounds = 5
|
||||
|
@ -48,12 +54,14 @@ saveRDS.lgb.Booster <- function(object,
|
|||
object$save()
|
||||
|
||||
# Save RDS
|
||||
saveRDS(object,
|
||||
file = file,
|
||||
ascii = ascii,
|
||||
version = version,
|
||||
compress = compress,
|
||||
refhook = refhook)
|
||||
saveRDS(
|
||||
object
|
||||
, file = file
|
||||
, ascii = ascii
|
||||
, version = version
|
||||
, compress = compress
|
||||
, refhook = refhook
|
||||
)
|
||||
|
||||
# Free model from memory
|
||||
object$raw <- NA
|
||||
|
@ -61,12 +69,14 @@ saveRDS.lgb.Booster <- function(object,
|
|||
} else {
|
||||
|
||||
# Save as usual
|
||||
saveRDS(object,
|
||||
file = file,
|
||||
ascii = ascii,
|
||||
version = version,
|
||||
compress = compress,
|
||||
refhook = refhook)
|
||||
saveRDS(
|
||||
object
|
||||
, file = file
|
||||
, ascii = ascii
|
||||
, version = version
|
||||
, compress = compress
|
||||
, refhook = refhook
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -25,9 +25,20 @@ lgb.call <- function(fun_name, ret, ...) {
|
|||
|
||||
# Check for a ret call
|
||||
if (!is.null(ret)) {
|
||||
call_state <- .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm") # Call with ret
|
||||
call_state <- .Call(
|
||||
fun_name
|
||||
, ...
|
||||
, ret
|
||||
, call_state
|
||||
, PACKAGE = "lib_lightgbm"
|
||||
)
|
||||
} else {
|
||||
call_state <- .Call(fun_name, ..., call_state, PACKAGE = "lib_lightgbm") # Call without ret
|
||||
call_state <- .Call(
|
||||
fun_name
|
||||
, ...
|
||||
, call_state
|
||||
, PACKAGE = "lib_lightgbm"
|
||||
)
|
||||
}
|
||||
call_state <- as.integer(call_state)
|
||||
# Check for call state value post call
|
||||
|
@ -37,17 +48,25 @@ lgb.call <- function(fun_name, ret, ...) {
|
|||
buf_len <- 200L
|
||||
act_len <- 0L
|
||||
err_msg <- raw(buf_len)
|
||||
err_msg <- .Call("LGBM_GetLastError_R", buf_len, act_len, err_msg, PACKAGE = "lib_lightgbm")
|
||||
err_msg <- .Call(
|
||||
"LGBM_GetLastError_R"
|
||||
, buf_len
|
||||
, act_len
|
||||
, err_msg
|
||||
, PACKAGE = "lib_lightgbm"
|
||||
)
|
||||
|
||||
# Check error buffer
|
||||
if (act_len > buf_len) {
|
||||
buf_len <- act_len
|
||||
err_msg <- raw(buf_len)
|
||||
err_msg <- .Call("LGBM_GetLastError_R",
|
||||
buf_len,
|
||||
act_len,
|
||||
err_msg,
|
||||
PACKAGE = "lib_lightgbm")
|
||||
err_msg <- .Call(
|
||||
"LGBM_GetLastError_R"
|
||||
, buf_len
|
||||
, act_len
|
||||
, err_msg
|
||||
, PACKAGE = "lib_lightgbm"
|
||||
)
|
||||
}
|
||||
|
||||
# Return error
|
||||
|
@ -97,7 +116,13 @@ lgb.params2str <- function(params, ...) {
|
|||
|
||||
# Check for identical parameters
|
||||
if (length(intersect(names(params), names(dot_params))) > 0) {
|
||||
stop("Same parameters in ", sQuote("params"), " and in the call are not allowed. Please check your ", sQuote("params"), " list")
|
||||
stop(
|
||||
"Same parameters in "
|
||||
, sQuote("params")
|
||||
, " and in the call are not allowed. Please check your "
|
||||
, sQuote("params")
|
||||
, " list"
|
||||
)
|
||||
}
|
||||
|
||||
# Merge parameters
|
||||
|
@ -160,15 +185,43 @@ lgb.check.params <- function(params) {
|
|||
lgb.check.obj <- function(params, obj) {
|
||||
|
||||
# List known objectives in a vector
|
||||
OBJECTIVES <- c("regression", "regression_l1", "regression_l2", "mean_squared_error", "mse", "l2_root", "root_mean_squared_error", "rmse",
|
||||
"mean_absolute_error", "mae", "quantile",
|
||||
"huber", "fair", "poisson", "binary", "lambdarank",
|
||||
"multiclass", "softmax", "multiclassova", "multiclass_ova", "ova", "ovr",
|
||||
"xentropy", "cross_entropy", "xentlambda", "cross_entropy_lambda", "mean_absolute_percentage_error", "mape",
|
||||
"gamma", "tweedie")
|
||||
OBJECTIVES <- c(
|
||||
"regression"
|
||||
, "regression_l1"
|
||||
, "regression_l2"
|
||||
, "mean_squared_error"
|
||||
, "mse"
|
||||
, "l2_root"
|
||||
, "root_mean_squared_error"
|
||||
, "rmse"
|
||||
, "mean_absolute_error"
|
||||
, "mae"
|
||||
, "quantile"
|
||||
, "huber"
|
||||
, "fair"
|
||||
, "poisson"
|
||||
, "binary"
|
||||
, "lambdarank"
|
||||
, "multiclass"
|
||||
, "softmax"
|
||||
, "multiclassova"
|
||||
, "multiclass_ova"
|
||||
, "ova"
|
||||
, "ovr"
|
||||
, "xentropy"
|
||||
, "cross_entropy"
|
||||
, "xentlambda"
|
||||
, "cross_entropy_lambda"
|
||||
, "mean_absolute_percentage_error"
|
||||
, "mape"
|
||||
, "gamma"
|
||||
, "tweedie"
|
||||
)
|
||||
|
||||
# Check whether the objective is empty or not, and take it from params if needed
|
||||
if (!is.null(obj)) { params$objective <- obj }
|
||||
if (!is.null(obj)) {
|
||||
params$objective <- obj
|
||||
}
|
||||
|
||||
# Check whether the objective is a character
|
||||
if (is.character(params$objective)) {
|
||||
|
|
|
@ -17,62 +17,82 @@ class(train$data)
|
|||
# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input
|
||||
# Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
|
||||
print("Training lightgbm with sparseMatrix")
|
||||
bst <- lightgbm(data = train$data,
|
||||
label = train$label,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
objective = "binary")
|
||||
bst <- lightgbm(
|
||||
data = train$data
|
||||
, label = train$label
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# Alternatively, you can put in dense matrix, i.e. basic R-matrix
|
||||
print("Training lightgbm with Matrix")
|
||||
bst <- lightgbm(data = as.matrix(train$data),
|
||||
label = train$label,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
objective = "binary")
|
||||
bst <- lightgbm(
|
||||
data = as.matrix(train$data)
|
||||
, label = train$label
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# You can also put in an lgb.Dataset object, which stores label, data, and other metadata needed for advanced features
|
||||
print("Training lightgbm with lgb.Dataset")
|
||||
dtrain <- lgb.Dataset(data = train$data,
|
||||
label = train$label)
|
||||
bst <- lightgbm(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
objective = "binary")
|
||||
dtrain <- lgb.Dataset(
|
||||
data = train$data
|
||||
, label = train$label
|
||||
)
|
||||
bst <- lightgbm(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# Verbose = 0,1,2
|
||||
print("Train lightgbm with verbose 0, no message")
|
||||
bst <- lightgbm(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
objective = "binary",
|
||||
verbose = 0)
|
||||
bst <- lightgbm(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, objective = "binary"
|
||||
, verbose = 0
|
||||
)
|
||||
|
||||
print("Train lightgbm with verbose 1, print evaluation metric")
|
||||
bst <- lightgbm(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
nthread = 2,
|
||||
objective = "binary",
|
||||
verbose = 1)
|
||||
bst <- lightgbm(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
, verbose = 1
|
||||
)
|
||||
|
||||
print("Train lightgbm with verbose 2, also print information about tree")
|
||||
bst <- lightgbm(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
nthread = 2,
|
||||
objective = "binary",
|
||||
verbose = 2)
|
||||
bst <- lightgbm(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
, verbose = 2
|
||||
)
|
||||
|
||||
# You can also specify data as a file path to a LibSVM/TSV/CSV format input
|
||||
# Since we do not have this file with us, the following line is just for illustration
|
||||
# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
|
||||
# bst <- lightgbm(
|
||||
# data = "agaricus.train.svm"
|
||||
# , num_leaves = 4
|
||||
# , learning_rate = 1
|
||||
# , nrounds = 2
|
||||
# , objective = "binary"
|
||||
# )
|
||||
|
||||
#--------------------Basic prediction using lightgbm--------------
|
||||
# You can do prediction using the following line
|
||||
|
@ -104,37 +124,43 @@ valids <- list(train = dtrain, test = dtest)
|
|||
# To train with valids, use lgb.train, which contains more advanced features
|
||||
# valids allows us to monitor the evaluation result on all data in the list
|
||||
print("Train lightgbm using lgb.train with valids")
|
||||
bst <- lgb.train(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
valids = valids,
|
||||
nthread = 2,
|
||||
objective = "binary")
|
||||
bst <- lgb.train(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, valids = valids
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# We can change evaluation metrics, or use multiple evaluation metrics
|
||||
print("Train lightgbm using lgb.train with valids, watch logloss and error")
|
||||
bst <- lgb.train(data = dtrain,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
valids = valids,
|
||||
eval = c("binary_error", "binary_logloss"),
|
||||
nthread = 2,
|
||||
objective = "binary")
|
||||
bst <- lgb.train(
|
||||
data = dtrain
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, valids = valids
|
||||
, eval = c("binary_error", "binary_logloss")
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# lgb.Dataset can also be saved using lgb.Dataset.save
|
||||
lgb.Dataset.save(dtrain, "dtrain.buffer")
|
||||
|
||||
# To load it in, simply call lgb.Dataset
|
||||
dtrain2 <- lgb.Dataset("dtrain.buffer")
|
||||
bst <- lgb.train(data = dtrain2,
|
||||
num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nrounds = 2,
|
||||
valids = valids,
|
||||
nthread = 2,
|
||||
objective = "binary")
|
||||
bst <- lgb.train(
|
||||
data = dtrain2
|
||||
, num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nrounds = 2
|
||||
, valids = valids
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
# information can be extracted from lgb.Dataset using getinfo
|
||||
label = getinfo(dtest, "label")
|
||||
|
|
|
@ -13,10 +13,12 @@ valids <- list(eval = dtest, train = dtrain)
|
|||
print("Start running example to start from an initial prediction")
|
||||
|
||||
# Train lightgbm for 1 round
|
||||
param <- list(num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
nthread = 2,
|
||||
objective = "binary")
|
||||
param <- list(
|
||||
num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, nthread = 2
|
||||
, objective = "binary"
|
||||
)
|
||||
bst <- lgb.train(param, dtrain, 1, valids = valids)
|
||||
|
||||
# Note: we need the margin value instead of transformed prediction in set_init_score
|
||||
|
@ -29,7 +31,9 @@ setinfo(dtrain, "init_score", ptrain)
|
|||
setinfo(dtest, "init_score", ptest)
|
||||
|
||||
print("This is result of boost from initial prediction")
|
||||
bst <- lgb.train(params = param,
|
||||
data = dtrain,
|
||||
nrounds = 5,
|
||||
valids = valids)
|
||||
bst <- lgb.train(
|
||||
params = param
|
||||
, data = dtrain
|
||||
, nrounds = 5
|
||||
, valids = valids
|
||||
)
|
||||
|
|
|
@ -60,21 +60,28 @@ my_data <- as.matrix(bank[, 1:16, with = FALSE])
|
|||
|
||||
# Creating the LightGBM dataset with categorical features
|
||||
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
|
||||
lgb_data <- lgb.Dataset(data = my_data,
|
||||
label = bank$y,
|
||||
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
|
||||
lgb_data <- lgb.Dataset(
|
||||
data = my_data
|
||||
, label = bank$y
|
||||
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
|
||||
)
|
||||
|
||||
# We can now train a model
|
||||
model <- lgb.train(list(objective = "binary",
|
||||
metric = "l2",
|
||||
min_data = 1,
|
||||
learning_rate = 0.1,
|
||||
min_data = 0,
|
||||
min_hessian = 1,
|
||||
max_depth = 2),
|
||||
lgb_data,
|
||||
100,
|
||||
valids = list(train = lgb_data))
|
||||
params <- list(
|
||||
objective = "binary"
|
||||
, metric = "l2"
|
||||
, min_data = 1
|
||||
, learning_rate = 0.1
|
||||
, min_data = 0
|
||||
, min_hessian = 1
|
||||
, max_depth = 2
|
||||
)
|
||||
model <- lgb.train(
|
||||
params = params
|
||||
, data = lgb_data
|
||||
, nrounds = 100
|
||||
, valids = list(train = lgb_data)
|
||||
)
|
||||
|
||||
# Try to find split_feature: 2
|
||||
# If you find it, it means it used a categorical feature in the first tree
|
||||
|
|
|
@ -68,24 +68,33 @@ my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
|
|||
|
||||
# Creating the LightGBM dataset with categorical features
|
||||
# The categorical features can be passed to lgb.train to not copy and paste a lot
|
||||
dtrain <- lgb.Dataset(data = my_data_train,
|
||||
label = bank_train$y,
|
||||
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
|
||||
dtest <- lgb.Dataset.create.valid(dtrain,
|
||||
data = my_data_test,
|
||||
label = bank_test$y)
|
||||
dtrain <- lgb.Dataset(
|
||||
data = my_data_train
|
||||
, label = bank_train$y
|
||||
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
|
||||
)
|
||||
dtest <- lgb.Dataset.create.valid(
|
||||
dtrain
|
||||
, data = my_data_test
|
||||
, label = bank_test$y
|
||||
)
|
||||
|
||||
# We can now train a model
|
||||
model <- lgb.train(list(objective = "binary",
|
||||
metric = "l2",
|
||||
min_data = 1,
|
||||
learning_rate = 0.1,
|
||||
min_data = 0,
|
||||
min_hessian = 1,
|
||||
max_depth = 2),
|
||||
dtrain,
|
||||
100,
|
||||
valids = list(train = dtrain, valid = dtest))
|
||||
params <- list(
|
||||
objective = "binary"
|
||||
, metric = "l2"
|
||||
, min_data = 1
|
||||
, learning_rate = 0.1
|
||||
, min_data = 0
|
||||
, min_hessian = 1
|
||||
, max_depth = 2
|
||||
)
|
||||
model <- lgb.train(
|
||||
params = params
|
||||
, data = dtrain
|
||||
, nrounds = 100
|
||||
, valids = list(train = dtrain, valid = dtest)
|
||||
)
|
||||
|
||||
# Try to find split_feature: 11
|
||||
# If you find it, it means it used a categorical feature in the first tree
|
||||
|
|
|
@ -6,30 +6,36 @@ dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
|
|||
dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label)
|
||||
|
||||
nrounds <- 2
|
||||
param <- list(num_leaves = 4,
|
||||
learning_rate = 1,
|
||||
objective = "binary")
|
||||
param <- list(
|
||||
num_leaves = 4
|
||||
, learning_rate = 1
|
||||
, objective = "binary"
|
||||
)
|
||||
|
||||
print("Running cross validation")
|
||||
# Do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
lgb.cv(param,
|
||||
dtrain,
|
||||
nrounds,
|
||||
nfold = 5,
|
||||
eval = "binary_error")
|
||||
lgb.cv(
|
||||
param
|
||||
, dtrain
|
||||
, nrounds
|
||||
, nfold = 5
|
||||
, eval = "binary_error"
|
||||
)
|
||||
|
||||
print("Running cross validation, disable standard deviation display")
|
||||
# do cross validation, this will print result out as
|
||||
# [iteration] metric_name:mean_value+std_value
|
||||
# std_value is standard deviation of the metric
|
||||
lgb.cv(param,
|
||||
dtrain,
|
||||
nrounds,
|
||||
nfold = 5,
|
||||
eval = "binary_error",
|
||||
showsd = FALSE)
|
||||
lgb.cv(
|
||||
param
|
||||
, dtrain
|
||||
, nrounds
|
||||
, nfold = 5
|
||||
, eval = "binary_error"
|
||||
, showsd = FALSE
|
||||
)
|
||||
|
||||
# You can also do cross validation with customized loss function
|
||||
print("Running cross validation, with cutomsized loss function")
|
||||
|
@ -48,9 +54,11 @@ evalerror <- function(preds, dtrain) {
|
|||
}
|
||||
|
||||
# train with customized objective
|
||||
lgb.cv(params = param,
|
||||
data = dtrain,
|
||||
nrounds = nrounds,
|
||||
obj = logregobj,
|
||||
eval = evalerror,
|
||||
nfold = 5)
|
||||
lgb.cv(
|
||||
params = param
|
||||
, data = dtrain
|
||||
, nrounds = nrounds
|
||||
, obj = logregobj
|
||||
, eval = evalerror
|
||||
, nfold = 5
|
||||
)
|
||||
|
|
|
@ -11,8 +11,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga
|
|||
# Note: for customized objective function, we leave objective as default
|
||||
# Note: what we are getting is margin value in prediction
|
||||
# You must know what you are doing
|
||||
param <- list(num_leaves = 4,
|
||||
learning_rate = 1)
|
||||
param <- list(
|
||||
num_leaves = 4
|
||||
, learning_rate = 1
|
||||
)
|
||||
valids <- list(eval = dtest)
|
||||
num_round <- 20
|
||||
|
||||
|
@ -39,10 +41,12 @@ evalerror <- function(preds, dtrain) {
|
|||
}
|
||||
print("Start training with early Stopping setting")
|
||||
|
||||
bst <- lgb.train(param,
|
||||
dtrain,
|
||||
num_round,
|
||||
valids,
|
||||
objective = logregobj,
|
||||
eval = evalerror,
|
||||
early_stopping_round = 3)
|
||||
bst <- lgb.train(
|
||||
param
|
||||
, dtrain
|
||||
, num_round
|
||||
, valids
|
||||
, objective = logregobj
|
||||
, eval = evalerror
|
||||
, early_stopping_round = 3
|
||||
)
|
||||
|
|
|
@ -26,9 +26,11 @@ gbm <- list()
|
|||
|
||||
for (i in 1:1000) {
|
||||
print(i)
|
||||
gbm[[i]] <- lgb.train(params = list(objective = "regression"),
|
||||
data = data,
|
||||
1,
|
||||
reset_data = TRUE)
|
||||
gbm[[i]] <- lgb.train(
|
||||
params = list(objective = "regression")
|
||||
, data = data
|
||||
, 1
|
||||
, reset_data = TRUE
|
||||
)
|
||||
gc(verbose = FALSE)
|
||||
}
|
||||
|
|
|
@ -17,32 +17,47 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
|||
# Third, we setup parameters and we train a model
|
||||
params <- list(objective = "regression", metric = "l2")
|
||||
valids <- list(test = dtest)
|
||||
model <- lgb.train(params,
|
||||
dtrain,
|
||||
50,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 0.1,
|
||||
bagging_fraction = 0.1,
|
||||
bagging_freq = 1,
|
||||
bagging_seed = 1)
|
||||
model <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 50
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 0.1
|
||||
, bagging_fraction = 0.1
|
||||
, bagging_freq = 1
|
||||
, bagging_seed = 1
|
||||
)
|
||||
|
||||
# We create a data.frame with the following structure:
|
||||
# X = average leaf of the observation throughout all trees
|
||||
# Y = prediction probability (clamped to [1e-15, 1-1e-15])
|
||||
# Z = logloss
|
||||
# binned = binned quantile of average leaf
|
||||
new_data <- data.frame(X = rowMeans(predict(model,
|
||||
agaricus.test$data,
|
||||
predleaf = TRUE)),
|
||||
Y = pmin(pmax(predict(model,
|
||||
agaricus.test$data), 1e-15), 1 - 1e-15))
|
||||
new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
|
||||
new_data$binned <- .bincode(x = new_data$X,
|
||||
breaks = quantile(x = new_data$X,
|
||||
probs = (1:9)/10),
|
||||
right = TRUE,
|
||||
include.lowest = TRUE)
|
||||
new_data <- data.frame(
|
||||
X = rowMeans(predict(
|
||||
model
|
||||
, agaricus.test$data
|
||||
, predleaf = TRUE
|
||||
))
|
||||
, Y = pmin(
|
||||
pmax(
|
||||
predict(model, agaricus.test$data)
|
||||
, 1e-15
|
||||
)
|
||||
, 1 - 1e-15
|
||||
)
|
||||
)
|
||||
new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
|
||||
new_data$binned <- .bincode(
|
||||
x = new_data$X
|
||||
, breaks = quantile(
|
||||
x = new_data$X
|
||||
, probs = (1:9) / 10
|
||||
)
|
||||
, right = TRUE
|
||||
, include.lowest = TRUE
|
||||
)
|
||||
new_data$binned[is.na(new_data$binned)] <- 0
|
||||
new_data$binned <- as.factor(new_data$binned)
|
||||
|
||||
|
@ -52,31 +67,64 @@ table(new_data$binned)
|
|||
# We can plot the binned content
|
||||
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
|
||||
# On the third plot, it is smooth!
|
||||
ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = X, y = Y, color = binned)
|
||||
) + geom_point() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
|
||||
) + geom_boxplot() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
||||
|
||||
# Now, let's show with other parameters
|
||||
model2 <- lgb.train(params,
|
||||
dtrain,
|
||||
100,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1)
|
||||
model2 <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 100
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
)
|
||||
|
||||
# We create the data structure, but for model2
|
||||
new_data2 <- data.frame(X = rowMeans(predict(model2,
|
||||
agaricus.test$data,
|
||||
predleaf = TRUE)),
|
||||
Y = pmin(pmax(predict(model2,
|
||||
agaricus.test$data), 1e-15), 1 - 1e-15))
|
||||
new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
|
||||
new_data2$binned <- .bincode(x = new_data2$X,
|
||||
breaks = quantile(x = new_data2$X,
|
||||
probs = (1:9)/10),
|
||||
right = TRUE,
|
||||
include.lowest = TRUE)
|
||||
new_data2 <- data.frame(
|
||||
X = rowMeans(predict(
|
||||
model2
|
||||
, agaricus.test$data
|
||||
, predleaf = TRUE
|
||||
))
|
||||
, Y = pmin(
|
||||
pmax(
|
||||
predict(
|
||||
model2
|
||||
, agaricus.test$data
|
||||
)
|
||||
, 1e-15
|
||||
)
|
||||
, 1 - 1e-15
|
||||
)
|
||||
)
|
||||
new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
|
||||
new_data2$binned <- .bincode(
|
||||
x = new_data2$X
|
||||
, breaks = quantile(
|
||||
x = new_data2$X
|
||||
, probs = (1:9) / 10
|
||||
)
|
||||
, right = TRUE
|
||||
, include.lowest = TRUE
|
||||
)
|
||||
new_data2$binned[is.na(new_data2$binned)] <- 0
|
||||
new_data2$binned <- as.factor(new_data2$binned)
|
||||
|
||||
|
@ -87,31 +135,64 @@ table(new_data2$binned)
|
|||
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
|
||||
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue
|
||||
# However, if the rules were not true, the loss would explode.
|
||||
ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = X, y = Y, color = binned)
|
||||
) + geom_point() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
|
||||
) + geom_boxplot() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
||||
|
||||
# Now, try with very severe overfitting
|
||||
model3 <- lgb.train(params,
|
||||
dtrain,
|
||||
1000,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1)
|
||||
model3 <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 1000
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
)
|
||||
|
||||
# We create the data structure, but for model3
|
||||
new_data3 <- data.frame(X = rowMeans(predict(model3,
|
||||
agaricus.test$data,
|
||||
predleaf = TRUE)),
|
||||
Y = pmin(pmax(predict(model3,
|
||||
agaricus.test$data), 1e-15), 1 - 1e-15))
|
||||
new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
|
||||
new_data3$binned <- .bincode(x = new_data3$X,
|
||||
breaks = quantile(x = new_data3$X,
|
||||
probs = (1:9)/10),
|
||||
right = TRUE,
|
||||
include.lowest = TRUE)
|
||||
new_data3 <- data.frame(
|
||||
X = rowMeans(predict(
|
||||
model3
|
||||
, agaricus.test$data
|
||||
, predleaf = TRUE
|
||||
))
|
||||
, Y = pmin(
|
||||
pmax(
|
||||
predict(
|
||||
model3
|
||||
, agaricus.test$data
|
||||
)
|
||||
, 1e-15
|
||||
)
|
||||
, 1 - 1e-15
|
||||
)
|
||||
)
|
||||
new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
|
||||
new_data3$binned <- .bincode(
|
||||
x = new_data3$X
|
||||
, breaks = quantile(
|
||||
x = new_data3$X
|
||||
, probs = (1:9) / 10
|
||||
)
|
||||
, right = TRUE
|
||||
, include.lowest = TRUE
|
||||
)
|
||||
new_data3$binned[is.na(new_data3$binned)] <- 0
|
||||
new_data3$binned <- as.factor(new_data3$binned)
|
||||
|
||||
|
@ -119,9 +200,21 @@ new_data3$binned <- as.factor(new_data3$binned)
|
|||
table(new_data3$binned)
|
||||
|
||||
# We can plot the binned content
|
||||
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue.
|
||||
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
|
||||
# are real thus it is not an issue.
|
||||
# However, if the rules were not true, the loss would explode. See the sudden spikes?
|
||||
ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
ggplot(
|
||||
data = new_data3
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) +
|
||||
geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
||||
# Compare with our second model, the difference is severe. This is smooth.
|
||||
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
|
|
@ -19,29 +19,33 @@ valids <- list(test = dtest)
|
|||
|
||||
# Method 1 of training
|
||||
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
|
||||
model <- lgb.train(params,
|
||||
dtrain,
|
||||
100,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10)
|
||||
model <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 100
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
)
|
||||
|
||||
# We can predict on test data, outputs a 90-length vector
|
||||
# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
|
||||
my_preds <- predict(model, test[, 1:4])
|
||||
|
||||
# Method 2 of training, identical
|
||||
model <- lgb.train(list(),
|
||||
dtrain,
|
||||
100,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10,
|
||||
objective = "multiclass",
|
||||
metric = "multi_error",
|
||||
num_class = 3)
|
||||
model <- lgb.train(
|
||||
list()
|
||||
, dtrain
|
||||
, 100
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
, objective = "multiclass"
|
||||
, metric = "multi_error"
|
||||
, num_class = 3
|
||||
)
|
||||
|
||||
# We can predict on test data, identical
|
||||
my_preds <- predict(model, test[, 1:4])
|
||||
|
|
|
@ -20,17 +20,19 @@ valids <- list(train = dtrain, test = dtest)
|
|||
# Method 1 of training with built-in multiclass objective
|
||||
# Note: need to turn off boost from average to match custom objective
|
||||
# (https://github.com/microsoft/LightGBM/issues/1846)
|
||||
model_builtin <- lgb.train(list(),
|
||||
dtrain,
|
||||
boost_from_average = FALSE,
|
||||
100,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10,
|
||||
objective = "multiclass",
|
||||
metric = "multi_logloss",
|
||||
num_class = 3)
|
||||
model_builtin <- lgb.train(
|
||||
list()
|
||||
, dtrain
|
||||
, boost_from_average = FALSE
|
||||
, 100
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
, objective = "multiclass"
|
||||
, metric = "multi_logloss"
|
||||
, num_class = 3
|
||||
)
|
||||
|
||||
preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE)
|
||||
probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
|
||||
|
@ -65,21 +67,25 @@ custom_multiclass_metric = function(preds, dtrain) {
|
|||
preds = preds - apply(preds, 1, max)
|
||||
prob = exp(preds) / rowSums(exp(preds))
|
||||
|
||||
return(list(name = "error",
|
||||
value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
|
||||
higher_better = FALSE))
|
||||
return(list(
|
||||
name = "error"
|
||||
, value = -mean(log(prob[cbind(1:length(labels), labels + 1)]))
|
||||
, higher_better = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
model_custom <- lgb.train(list(),
|
||||
dtrain,
|
||||
100,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10,
|
||||
objective = custom_multiclass_obj,
|
||||
eval = custom_multiclass_metric,
|
||||
num_class = 3)
|
||||
model_custom <- lgb.train(
|
||||
list()
|
||||
, dtrain
|
||||
, 100
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
, objective = custom_multiclass_obj
|
||||
, eval = custom_multiclass_metric
|
||||
, num_class = 3
|
||||
)
|
||||
|
||||
preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE)
|
||||
probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
|
||||
|
@ -87,4 +93,3 @@ probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
|
|||
# compare predictions
|
||||
stopifnot(identical(probs_builtin, probs_custom))
|
||||
stopifnot(identical(preds_builtin, preds_custom))
|
||||
|
||||
|
|
|
@ -11,8 +11,8 @@ library(lightgbm)
|
|||
# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
|
||||
|
||||
# Setup small weights
|
||||
weights1 <- rep(1/100000, 6513)
|
||||
weights2 <- rep(1/100000, 1611)
|
||||
weights1 <- rep(1 / 100000, 6513)
|
||||
weights2 <- rep(1 / 100000, 1611)
|
||||
|
||||
# Load data and create datasets
|
||||
data(agaricus.train, package = "lightgbm")
|
||||
|
@ -26,40 +26,48 @@ valids <- list(test = dtest)
|
|||
# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
|
||||
# It cannot learn because regularization is too large!
|
||||
# min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
|
||||
params <- list(objective = "regression",
|
||||
metric = "l2",
|
||||
device = "cpu",
|
||||
min_sum_hessian = 10,
|
||||
num_leaves = 7,
|
||||
max_depth = 3,
|
||||
nthread = 1)
|
||||
model <- lgb.train(params,
|
||||
dtrain,
|
||||
50,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10)
|
||||
params <- list(
|
||||
objective = "regression"
|
||||
, metric = "l2"
|
||||
, device = "cpu"
|
||||
, min_sum_hessian = 10
|
||||
, num_leaves = 7
|
||||
, max_depth = 3
|
||||
, nthread = 1
|
||||
)
|
||||
model <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 50
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
)
|
||||
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
|
||||
plot(weight_loss) # Shows how poor the learning was: a straight line!
|
||||
|
||||
# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
|
||||
# Adjusted regularization just consists of multiplying results by 1e4 (x10000)
|
||||
# Notice how it learns, there is no issue as we adjusted regularization ourselves
|
||||
params <- list(objective = "regression",
|
||||
metric = "l2",
|
||||
device = "cpu",
|
||||
min_sum_hessian = 1e-4,
|
||||
num_leaves = 7,
|
||||
max_depth = 3,
|
||||
nthread = 1)
|
||||
model <- lgb.train(params,
|
||||
dtrain,
|
||||
50,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10)
|
||||
params <- list(
|
||||
objective = "regression"
|
||||
, metric = "l2"
|
||||
, device = "cpu"
|
||||
, min_sum_hessian = 1e-4
|
||||
, num_leaves = 7
|
||||
, max_depth = 3
|
||||
, nthread = 1
|
||||
)
|
||||
model <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 50
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
)
|
||||
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
|
||||
plot(small_weight_loss) # It learns!
|
||||
|
||||
|
@ -78,24 +86,28 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
|||
valids <- list(test = dtest)
|
||||
|
||||
# Setup parameters and run model...
|
||||
params <- list(objective = "regression",
|
||||
metric = "l2",
|
||||
device = "cpu",
|
||||
min_sum_hessian = 10,
|
||||
num_leaves = 7,
|
||||
max_depth = 3,
|
||||
nthread = 1)
|
||||
model <- lgb.train(params,
|
||||
dtrain,
|
||||
50,
|
||||
valids,
|
||||
min_data = 1,
|
||||
learning_rate = 1,
|
||||
early_stopping_rounds = 10)
|
||||
params <- list(
|
||||
objective = "regression"
|
||||
, metric = "l2"
|
||||
, device = "cpu"
|
||||
, min_sum_hessian = 10
|
||||
, num_leaves = 7
|
||||
, max_depth = 3
|
||||
, nthread = 1
|
||||
)
|
||||
model <- lgb.train(
|
||||
params
|
||||
, dtrain
|
||||
, 50
|
||||
, valids
|
||||
, min_data = 1
|
||||
, learning_rate = 1
|
||||
, early_stopping_rounds = 10
|
||||
)
|
||||
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
|
||||
plot(large_weight_loss) # It learns!
|
||||
|
||||
|
||||
# Do you want to compare the learning? They both converge.
|
||||
plot(small_weight_loss, large_weight_loss)
|
||||
curve(1*x, from = 0, to = 0.02, add = TRUE)
|
||||
curve(1 * x, from = 0, to = 0.02, add = TRUE)
|
||||
|
|
|
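For context on the demo above: with an L2 objective every observation contributes a hessian equal to its weight, so when the weights sum to roughly 0.06513 any min_sum_hessian above that total rules out every split. A minimal sketch of that check, using an assumed toy weight vector rather than the demo's actual data:

# Assumed toy weights standing in for the ones used to build dtrain in this demo
weights <- rep(6.513e-5, 1000L)
sum_hessian <- sum(weights)        # for squared-error loss the per-row hessian equals the row weight
min_sum_hessian <- 10
sum_hessian >= min_sum_hessian     # FALSE: no leaf can ever satisfy the constraint, so nothing is learned
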
@ -12,9 +12,9 @@ getinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}

\item{...}{other parameters}

\item{name}{the name of the information field to get (see details)}

\item{...}{other parameters}
}
\value{
info data

@ -4,9 +4,16 @@
\alias{lgb.Dataset}
\title{Construct \code{lgb.Dataset} object}
\usage{
lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
  categorical_feature = NULL, free_raw_data = TRUE, info = list(),
  ...)
lgb.Dataset(
  data,
  params = list(),
  reference = NULL,
  colnames = NULL,
  categorical_feature = NULL,
  free_raw_data = TRUE,
  info = list(),
  ...
)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}

@ -4,13 +4,29 @@
\alias{lgb.cv}
\title{Main CV logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3,
  label = NULL, weight = NULL, obj = NULL, eval = NULL,
  verbose = 1, record = TRUE, eval_freq = 1L, showsd = TRUE,
  stratified = TRUE, folds = NULL, init_model = NULL,
  colnames = NULL, categorical_feature = NULL,
  early_stopping_rounds = NULL, callbacks = list(),
  reset_data = FALSE, ...)
lgb.cv(
  params = list(),
  data,
  nrounds = 10,
  nfold = 3,
  label = NULL,
  weight = NULL,
  obj = NULL,
  eval = NULL,
  verbose = 1,
  record = TRUE,
  eval_freq = 1L,
  showsd = TRUE,
  stratified = TRUE,
  folds = NULL,
  init_model = NULL,
  colnames = NULL,
  categorical_feature = NULL,
  early_stopping_rounds = NULL,
  callbacks = list(),
  reset_data = FALSE,
  ...
)
}
\arguments{
\item{params}{List of parameters}

@ -27,7 +43,7 @@ lgb.cv(params = list(), data, nrounds = 10, nfold = 3,

\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}

\item{eval}{evaluation function, can be (list of) character or custom eval function}

@ -54,17 +70,15 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
type int represents index,
type str represents feature names}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}

\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{callbacks}{List of callback functions that are applied at each iteration.}

\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
into a predictor model which frees up memory and the original datasets}

\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{

@ -89,11 +103,13 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
    dtrain,
    10,
    nfold = 3,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.cv(
    params = params
    , data = dtrain
    , nrounds = 10
    , nfold = 3
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
}

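A brief usage sketch of the early stopping behaviour documented above, assuming the params and dtrain objects from this example; best_iter is the field named in the parameter description:

cv_model <- lgb.cv(
    params = params
    , data = dtrain
    , nrounds = 100
    , nfold = 3
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
cv_model$best_iter          # iteration kept once early stopping fired
str(cv_model$record_evals)  # recorded metric history, per data name
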
@ -27,13 +27,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
json_model <- lgb.dump(model)

}

@ -4,8 +4,13 @@
\alias{lgb.get.eval.result}
\title{Get record evaluation result from booster}
\usage{
lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
  is_err = FALSE)
lgb.get.eval.result(
  booster,
  data_name,
  eval_name,
  iters = NULL,
  is_err = FALSE
)
}
\arguments{
\item{booster}{Object of class \code{lgb.Booster}}

@ -34,12 +39,14 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
lgb.get.eval.result(model, "test", "l2")
}

@ -29,9 +29,14 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)

params <- list(objective = "binary",
    learning_rate = 0.01, num_leaves = 63, max_depth = -1,
    min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
    objective = "binary"
    , learning_rate = 0.01
    , num_leaves = 63
    , max_depth = -1
    , min_data_in_leaf = 1
    , min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)

tree_imp1 <- lgb.importance(model, percentage = TRUE)

@ -11,7 +11,7 @@ lgb.interprete(model, data, idxset, num_iteration = NULL)

\item{data}{a matrix object or a dgCMatrix object.}

\item{idxset}{a integer vector of indices of rows needed.}
\item{idxset}{an integer vector of indices of rows needed.}

\item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration.}
}

@ -29,13 +29,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
lgb.save(model, "model.txt")
load_booster <- lgb.load(filename = "model.txt")
model_string <- model$save_model_to_string(NULL) # saves best iteration

@ -44,9 +44,14 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)

params <- list(objective = "binary",
    learning_rate = 0.01, num_leaves = 63, max_depth = -1,
    min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
    objective = "binary"
    , learning_rate = 0.01
    , num_leaves = 63
    , max_depth = -1
    , min_data_in_leaf = 1
    , min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)

tree_dt <- lgb.model.dt.tree(model)

@ -4,8 +4,13 @@
\alias{lgb.plot.importance}
\title{Plot feature importance as a bar graph}
\usage{
lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain",
  left_margin = 10, cex = NULL)
lgb.plot.importance(
  tree_imp,
  top_n = 10,
  measure = "Gain",
  left_margin = 10,
  cex = NULL
)
}
\arguments{
\item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.}

@ -4,8 +4,13 @@
\alias{lgb.plot.interpretation}
\title{Plot feature contribution as a bar graph}
\usage{
lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1,
  left_margin = 10, cex = NULL)
lgb.plot.interpretation(
  tree_interpretation_dt,
  top_n = 10,
  cols = 1,
  left_margin = 10,
  cex = NULL
)
}
\arguments{
\item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.}

@ -25,8 +30,8 @@ The \code{lgb.plot.interpretation} function creates a \code{barplot}.
Plot previously calculated feature contribution as a bar graph.
}
\details{
The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
Features are shown ranked in a decreasing contribution order.
The graph represents each feature as a horizontal bar of length proportional to the defined
contribution of a feature. Features are shown ranked in a decreasing contribution order.
}
\examples{
library(lightgbm)

@ -39,9 +44,14 @@ setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))
data(agaricus.test, package = "lightgbm")
test <- agaricus.test

params <- list(objective = "binary",
    learning_rate = 0.01, num_leaves = 63, max_depth = -1,
    min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
    objective = "binary"
    , learning_rate = 0.01
    , num_leaves = 63
    , max_depth = -1
    , min_data_in_leaf = 1
    , min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)

tree_interpretation <- lgb.interprete(model, test$data, 1:5)

@ -10,10 +10,13 @@ lgb.prepare(data)
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric without integers. Please use
\code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
}
\examples{
library(lightgbm)

@ -10,10 +10,16 @@ lgb.prepare2(data)
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
Note that there are programs which do not support integer-only input. Consider this as a half
memory technique which is dangerous, especially for LightGBM.
}
\examples{
library(lightgbm)

@ -12,10 +12,14 @@ lgb.prepare_rules(data, rules = NULL)
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input
in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric. In addition, keeps rules created
so you can convert other datasets using this converter.
}
\examples{
library(lightgbm)

@ -12,43 +12,15 @@ lgb.prepare_rules2(data, rules = NULL)
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input in
\code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
}
\examples{
library(lightgbm)
data(iris)

str(iris)

new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
str(new_iris$data)

data(iris) # Erase iris dataset
iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)

# Use conversion using known rules
# Unknown factors become 0, excellent for sparse datasets
newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)

# Unknown factor is now zero, perfect for sparse datasets
newer_iris$data[1, ] # Species became 0 as it is an unknown factor

newer_iris$data[1, 5] <- 1 # Put back real initial value

# Is the newly created dataset equal? YES!
all.equal(new_iris$data, newer_iris$data)

# Can we test our own rules?
data(iris) # Erase iris dataset

# We remapped values differently
personal_rules <- list(Species = c("setosa" = 3L,
    "versicolor" = 2L,
    "virginica" = 1L))
newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
str(newest_iris$data) # SUCCESS!

Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
In addition, keeps rules created so you can convert other datasets using this converter.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
Note that there are programs which do not support integer-only input.
Consider this as a half memory technique which is dangerous, especially for LightGBM.
}

@ -29,13 +29,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
lgb.save(model, "model.txt")

}

@ -4,11 +4,24 @@
\alias{lgb.train}
\title{Main training logic for LightGBM}
\usage{
lgb.train(params = list(), data, nrounds = 10, valids = list(),
  obj = NULL, eval = NULL, verbose = 1, record = TRUE,
  eval_freq = 1L, init_model = NULL, colnames = NULL,
  categorical_feature = NULL, early_stopping_rounds = NULL,
  callbacks = list(), reset_data = FALSE, ...)
lgb.train(
  params = list(),
  data,
  nrounds = 10,
  valids = list(),
  obj = NULL,
  eval = NULL,
  verbose = 1,
  record = TRUE,
  eval_freq = 1L,
  init_model = NULL,
  colnames = NULL,
  categorical_feature = NULL,
  early_stopping_rounds = NULL,
  callbacks = list(),
  reset_data = FALSE,
  ...
)
}
\arguments{
\item{params}{List of parameters}

@ -39,17 +52,16 @@ lgb.train(params = list(), data, nrounds = 10, valids = list(),
type int represents index,
type str represents feature names}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}

\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{callbacks}{List of callback functions that are applied at each iteration.}

\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the
booster model into a predictor model which frees up memory and the
original datasets}

\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{

@ -78,11 +90,13 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
}

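To illustrate the reset_data argument described above, a minimal sketch assuming the params, dtrain, valids and test objects from this example; setting it to TRUE converts the trained booster into a predictor and releases the stored datasets:

model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , reset_data = TRUE  # free the raw training data once boosting finishes
)
preds <- predict(model, test$data)  # prediction on new data still works
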
@ -7,11 +7,15 @@
lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv)
}
\arguments{
\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.}
\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R.
Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
unloading is performed.}

\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.}
\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
environment. Defaults to \code{FALSE} which means to not remove them.}

\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.}
\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
\code{.GlobalEnv} which is the global environment.}
}
\value{
NULL invisibly.

@ -29,13 +33,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)

\dontrun{
lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)

@ -9,12 +9,10 @@ List of callback functions that are applied at each iteration.}

\item{data}{a \code{lgb.Dataset} object, used for training}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}

\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}

@ -4,10 +4,20 @@
\alias{lightgbm}
\title{Train a LightGBM model}
\usage{
lightgbm(data, label = NULL, weight = NULL, params = list(),
  nrounds = 10, verbose = 1, eval_freq = 1L,
  early_stopping_rounds = NULL, save_name = "lightgbm.model",
  init_model = NULL, callbacks = list(), ...)
lightgbm(
  data,
  label = NULL,
  weight = NULL,
  params = list(),
  nrounds = 10,
  verbose = 1,
  eval_freq = 1L,
  early_stopping_rounds = NULL,
  save_name = "lightgbm.model",
  init_model = NULL,
  callbacks = list(),
  ...
)
}
\arguments{
\item{data}{a \code{lgb.Dataset} object, used for training}

@ -24,12 +34,10 @@ lightgbm(data, label = NULL, weight = NULL, params = list(),

\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}

\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}

\item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".}

@ -4,9 +4,17 @@
\alias{predict.lgb.Booster}
\title{Predict method for LightGBM model}
\usage{
\method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
  rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE,
  header = FALSE, reshape = FALSE, ...)
\method{predict}{lgb.Booster}(
  object,
  data,
  num_iteration = NULL,
  rawscore = FALSE,
  predleaf = FALSE,
  predcontrib = FALSE,
  header = FALSE,
  reshape = FALSE,
  ...
)
}
\arguments{
\item{object}{Object of class \code{lgb.Booster}}

@ -16,8 +24,8 @@
\item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration}

\item{rawscore}{whether the prediction should be returned in the for of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}
sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
for logistic regression would result in predictions for log-odds instead of probabilities.}

\item{predleaf}{whether predict leaf index instead.}

@ -53,13 +61,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
preds <- predict(model, test$data)

}

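As a short illustration of the rawscore behaviour described above (a sketch assuming a model trained with a binary objective rather than the regression one used in this example): raw predictions are log-odds, and applying the logistic transform recovers probabilities.

raw_preds <- predict(model, test$data, rawscore = TRUE)  # untransformed sums over boosting iterations
prob_preds <- 1 / (1 + exp(-raw_preds))                  # manual sigmoid
summary(prob_preds - predict(model, test$data))          # close to zero for a binary objective
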
@ -27,13 +27,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
    dtrain,
    10,
    valids,
    min_data = 1,
    learning_rate = 1,
    early_stopping_rounds = 5)
model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5
)
saveRDS.lgb.Booster(model, "model.rds")
new_model <- readRDS.lgb.Booster("model.rds")

@ -4,19 +4,31 @@
\alias{saveRDS.lgb.Booster}
\title{saveRDS for \code{lgb.Booster} models}
\usage{
saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
  compress = TRUE, refhook = NULL, raw = TRUE)
saveRDS.lgb.Booster(
  object,
  file = "",
  ascii = FALSE,
  version = NULL,
  compress = TRUE,
  refhook = NULL,
  raw = TRUE
)
}
\arguments{
\item{object}{R object to serialize.}

\item{file}{a connection or the name of the file where the R object is saved to or read from.}

\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.}
\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default),
a binary one is used. See the comments in the help for save.}

\item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.}
\item{version}{the workspace format version to use. \code{NULL} specifies the current default
version (2). Versions prior to 2 are not supported, so this will only be relevant
when there are later versions.}

\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.}
\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression,
or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of
compression to be used. Ignored if file is a connection.}

\item{refhook}{a hook function for handling reference objects.}

@ -26,7 +38,8 @@ saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
NULL invisibly.
}
\description{
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides
whether to save the raw model or not.
}
\examples{
library(lightgbm)

@ -39,10 +52,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(
    params
    , dtrain
    , 10
    , valids
    params = params
    , data = dtrain
    , nrounds = 10
    , valids = valids
    , min_data = 1
    , learning_rate = 1
    , early_stopping_rounds = 5

@ -12,11 +12,11 @@ setinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}

\item{...}{other parameters}

\item{name}{the name of the field to get}

\item{info}{the specific field of information to set}

\item{...}{other parameters}
}
\value{
passed object

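A small round-trip sketch of these accessors, assuming dtrain is an already constructed lgb.Dataset built from train$data and train$label as in the other examples; "weight" and "label" are standard information field names:

setinfo(dtrain, "weight", rep(1, length(train$label)))            # attach per-observation weights
all.equal(getinfo(dtrain, "weight"), rep(1, length(train$label)))  # read them back
head(getinfo(dtrain, "label"))                                     # labels supplied at construction time
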
@ -12,9 +12,9 @@ slice(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}

\item{...}{other parameters (currently not used)}
\item{idxset}{an integer vector of indices of rows needed}

\item{idxset}{a integer vector of indices of rows needed}
\item{...}{other parameters (currently not used)}
}
\value{
constructed sub dataset

@ -8,7 +8,7 @@ if (.Machine$sizeof.pointer != 8){
}

R_int_UUID <- .Internal(internalsID())
R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor)/10
R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor) / 10

if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
    || R_int_UUID == "2fdf6c18-697a-4ba7-b8ef-11c0d92f1327")){

@ -74,7 +74,7 @@ if (!use_precompile) {
try_vs <- 0
local_vs_def <- ""
vs_versions <- c("Visual Studio 16 2019", "Visual Studio 15 2017", "Visual Studio 14 2015")
for(vs in vs_versions){
for (vs in vs_versions){
    vs_def <- paste0(" -G \"", vs, "\" -A x64")
    tmp_cmake_cmd <- paste0(cmake_cmd, vs_def)
    try_vs <- system(paste0(tmp_cmake_cmd, " .."))

@ -106,14 +106,29 @@ if (!use_precompile) {

# Has precompiled package
lib_folder <- file.path(R_PACKAGE_SOURCE, "../", fsep = "/")
if (file.exists(file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/"))) {
    src <- file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/")
} else if (file.exists(file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/"))) {
    src <- file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/")
shared_object_file <- file.path(
    lib_folder
    , paste0("lib_lightgbm", SHLIB_EXT)
    , fsep = "/"
)
release_file <- file.path(
    lib_folder
    , paste0("Release/lib_lightgbm", SHLIB_EXT)
    , fsep = "/"
)
windows_shared_object_file <- file.path(
    lib_folder
    , paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT)
    , fsep = "/"
)
if (file.exists(shared_object_file)) {
    src <- shared_object_file
} else if (file.exists(release_file)) {
    src <- release_file
} else {
    src <- file.path(lib_folder, paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT), fsep = "/") # Expected result: installation will fail if it is not here or any other
    # Expected result: installation will fail if it is not here or any other
    src <- windows_shared_object_file
}

}

# Check installation correctness

@ -1,7 +1,7 @@
context("basic functions")

data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
train <- agaricus.train
test <- agaricus.test

@ -9,8 +9,14 @@ windows_flag = grepl('Windows', Sys.info()[['sysname']])

test_that("train and predict binary classification", {
    nrounds = 10
    bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5,
        nrounds = nrounds, objective = "binary", metric="binary_error")
    bst <- lightgbm(
        data = train$data
        , label = train$label
        , num_leaves = 5
        , nrounds = nrounds
        , objective = "binary"
        , metric = "binary_error"
    )
    expect_false(is.null(bst$record_evals))
    record_results <- lgb.get.eval.result(bst, "train", "binary_error")
    expect_lt(min(record_results), 0.02)

@ -20,7 +26,7 @@ test_that("train and predict binary classification", {

    pred1 <- predict(bst, train$data, num_iteration = 1)
    expect_equal(length(pred1), 6513)
    err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
    err_pred1 <- sum( (pred1 > 0.5) != train$label) / length(train$label)
    err_log <- record_results[1]
    expect_lt(abs(err_pred1 - err_log), 10e-6)
})

@ -29,9 +35,18 @@ test_that("train and predict binary classification", {
test_that("train and predict softmax", {
    lb <- as.numeric(iris$Species) - 1

    bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb,
        num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20,
        objective = "multiclass", metric="multi_error", num_class=3)
    bst <- lightgbm(
        data = as.matrix(iris[, -5])
        , label = lb
        , num_leaves = 4
        , learning_rate = 0.1
        , nrounds = 20
        , min_data = 20
        , min_hess = 20
        , objective = "multiclass"
        , metric = "multi_error"
        , num_class = 3
    )

    expect_false(is.null(bst$record_evals))
    record_results <- lgb.get.eval.result(bst, "train", "multi_error")

@ -43,18 +58,33 @@ test_that("train and predict softmax", {


test_that("use of multiple eval metrics works", {
    bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4,
        learning_rate=1, nrounds = 10, objective = "binary",
        metric = list("binary_error","auc","binary_logloss") )
    bst <- lightgbm(
        data = train$data
        , label = train$label
        , num_leaves = 4
        , learning_rate = 1
        , nrounds = 10
        , objective = "binary"
        , metric = list("binary_error","auc","binary_logloss")
    )
    expect_false(is.null(bst$record_evals))
})


test_that("training continuation works", {
    testthat::skip("This test is currently broken. See issue #2468 for details.")
    dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE)
    watchlist = list(train=dtrain)
    param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1)
    dtrain <- lgb.Dataset(
        train$data
        , label = train$label
        , free_raw_data = FALSE
    )
    watchlist = list(train = dtrain)
    param <- list(
        objective = "binary"
        , metric = "binary_logloss"
        , num_leaves = 5
        , learning_rate = 1
    )

    # for the reference, use 10 iterations at once:
    bst <- lgb.train(param, dtrain, nrounds = 10, watchlist)

@ -75,8 +105,16 @@ test_that("training continuation works", {


test_that("cv works", {
    dtrain <- lgb.Dataset(train$data, label=train$label)
    params <- list(objective="regression", metric="l2,l1")
    bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
    dtrain <- lgb.Dataset(train$data, label = train$label)
    params <- list(objective = "regression", metric = "l2,l1")
    bst <- lgb.cv(
        params
        , dtrain
        , 10
        , nfold = 5
        , min_data = 1
        , learning_rate = 1
        , early_stopping_rounds = 10
    )
    expect_false(is.null(bst$record_evals))
})

@ -1,7 +1,7 @@
context('Test models with custom objective')

data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)

@ -17,11 +17,19 @@ logregobj <- function(preds, dtrain) {
evalerror <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
    return(list(name = "error", value = err, higher_better=FALSE))
    return(list(
        name = "error"
        , value = err
        , higher_better = FALSE
    ))
}

param <- list(num_leaves=8, learning_rate=1,
    objective=logregobj, metric="auc")
param <- list(
    num_leaves = 8
    , learning_rate = 1
    , objective = logregobj
    , metric = "auc"
)
num_round <- 10

test_that("custom objective works", {

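For reference next to the custom evaluation function above, a typical custom binary objective has the following shape; this is the standard logistic-loss gradient and hessian, sketched here because the body of logregobj falls outside this hunk and may differ in detail:

logregobj_sketch <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    probs <- 1 / (1 + exp(-preds))   # preds arrive as raw scores for a custom objective
    grad <- probs - labels           # first derivative of the logistic loss
    hess <- probs * (1 - probs)      # second derivative
    return(list(grad = grad, hess = hess))
}
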
@ -3,15 +3,15 @@ require(Matrix)

context("testing lgb.Dataset functionality")

data(agaricus.test, package='lightgbm')
data(agaricus.test, package = 'lightgbm')
test_data <- agaricus.test$data[1:100,]
test_label <- agaricus.test$label[1:100]

test_that("lgb.Dataset: basic construction, saving, loading", {
    # from sparse matrix
    dtest1 <- lgb.Dataset(test_data, label=test_label)
    dtest1 <- lgb.Dataset(test_data, label = test_label)
    # from dense matrix
    dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label)
    dtest2 <- lgb.Dataset(as.matrix(test_data), label = test_label)
    expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))

    # save to a local file

@ -40,7 +40,7 @@ test_that("lgb.Dataset: getinfo & setinfo", {
})

test_that("lgb.Dataset: slice, dim", {
    dtest <- lgb.Dataset(test_data, label=test_label)
    dtest <- lgb.Dataset(test_data, label = test_label)
    lgb.Dataset.construct(dtest)
    expect_equal(dim(dtest), dim(test_data))
    dsub1 <- slice(dtest, 1:42)

@ -50,7 +50,7 @@ test_that("lgb.Dataset: slice, dim", {
})

test_that("lgb.Dataset: colnames", {
    dtest <- lgb.Dataset(test_data, label=test_label)
    dtest <- lgb.Dataset(test_data, label = test_label)
    expect_equal(colnames(dtest), colnames(test_data))
    lgb.Dataset.construct(dtest)
    expect_equal(colnames(dtest), colnames(test_data))

@ -62,7 +62,7 @@ test_that("lgb.Dataset: colnames", {

test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
    nr <- 1000
    x <- Matrix::rsparsematrix(nr, 100, density=0.0005)
    x <- Matrix::rsparsematrix(nr, 100, density = 0.0005)
    # we want it very sparse, so that last rows are empty
    expect_lt(max(x@i), nr)
    dtest <- lgb.Dataset(x)

@ -70,15 +70,17 @@ test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
})

test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", {
    rawData <- matrix(runif(1000),ncol=10)
    rawData <- matrix(runif(1000), ncol = 10)
    handle <- NA_real_
    ref_handle <- NULL
    handle <- lightgbm:::lgb.call("LGBM_DatasetCreateFromMat_R"
        , ret = handle
        , rawData
        , nrow(rawData)
        , ncol(rawData)
        , lightgbm:::lgb.params2str(params=list())
        , ref_handle)
    handle <- lightgbm:::lgb.call(
        "LGBM_DatasetCreateFromMat_R"
        , ret = handle
        , rawData
        , nrow(rawData)
        , ncol(rawData)
        , lightgbm:::lgb.params2str(params = list())
        , ref_handle
    )
    expect_false(is.na(handle))
})

@ -1,5 +1,8 @@
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')

context("feature penalties")

data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
train <- agaricus.train
test <- agaricus.test

@ -12,15 +15,15 @@ test_that("Feature penalties work properly", {
    feature_penalties <- rep(1, ncol(train$data))
    feature_penalties[var_index] <- x
    lightgbm(
        data = train$data,
        label = train$label,
        num_leaves = 5,
        learning_rate = 0.05,
        nrounds = 20,
        objective = "binary",
        feature_penalty = paste0(feature_penalties, collapse = ","),
        metric="binary_error",
        verbose = -1
        data = train$data
        , label = train$label
        , num_leaves = 5
        , learning_rate = 0.05
        , nrounds = 20
        , objective = "binary"
        , feature_penalty = paste0(feature_penalties, collapse = ",")
        , metric = "binary_error"
        , verbose = -1
    )
})
