[R-package] added R linting and changed R code to comma-first (fixes #2373) (#2437)

James Lamb 2019-10-24 13:47:26 -07:00 (committed by GitHub)
Parent b4bb38d926
Commit fc991c9d7e
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
63 changed files with 1819 additions and 1007 deletions
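
As a quick illustration of the comma-first style mentioned in the title, the hypothetical snippet below contrasts the old and new argument layout; the names are made up and only the comma placement matters. A commonly cited benefit is that a missing or duplicated comma is easier to spot when arguments are added or reordered.

# Hypothetical illustration only, not code from this commit
# Before: comma-last style
params <- list(objective = "regression",
               learning_rate = 1,
               min_data = 1)
# After: comma-first style, as applied throughout the R package in this commit
params <- list(
  objective = "regression"
  , learning_rate = 1
  , min_data = 1
)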

.ci/lint_r_code.R (new executable file, 62 changed lines)

@ -0,0 +1,62 @@
library(lintr)
args <- commandArgs(
trailingOnly = TRUE
)
SOURCE_DIR <- args[[1]]
FILES_TO_LINT <- list.files(
path = SOURCE_DIR
, pattern = "\\.r$"
, all.files = TRUE
, ignore.case = TRUE
, full.names = TRUE
, recursive = TRUE
, include.dirs = FALSE
)
# Some linters from the lintr package have not made it to CRAN yet
# We build lintr from source to address that.
LINTERS_TO_USE <- list(
"closed_curly" = lintr::closed_curly_linter
, "infix_spaces" = lintr::infix_spaces_linter
, "long_lines" = lintr::line_length_linter(length = 120)
, "tabs" = lintr::no_tab_linter
, "open_curly" = lintr::open_curly_linter
, "spaces_inside" = lintr::spaces_inside_linter
, "spaces_left_parens" = lintr::spaces_left_parentheses_linter
, "trailing_blank" = lintr::trailing_blank_lines_linter
, "trailing_white" = lintr::trailing_whitespace_linter
)
cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT)))
results <- c()
for (r_file in FILES_TO_LINT){
this_result <- lintr::lint(
filename = r_file
, linters = LINTERS_TO_USE
, cache = FALSE
)
cat(sprintf(
"Found %i linting errors in %s\n"
, length(this_result)
, r_file
))
results <- c(results, this_result)
}
issues_found <- length(results)
if (issues_found > 0){
cat("\n")
print(results)
}
quit(save = "no", status = issues_found)
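
As a usage note (not part of the commit), the same checks can be run interactively while editing a single file; the sketch below assumes a hypothetical file path and configures only a subset of the linters defined above.

# Sketch: interactive lint of one file with a subset of the linters above
library(lintr)
lintr::lint(
  filename = "R-package/R/lgb.train.R"  # hypothetical path, adjust as needed
  , linters = list(
    "long_lines" = lintr::line_length_linter(length = 120)
    , "trailing_white" = lintr::trailing_whitespace_linter
  )
  , cache = FALSE
)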


@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then
fi
if [[ $TASK == "lint" ]]; then
conda install -q -y -n $CONDA_ENV pycodestyle pydocstyle
conda install -q -y -n $CONDA_ENV \
pycodestyle \
pydocstyle \
r-lintr
pip install --user cpplint
echo "Linting Python code"
pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1
pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
echo "Linting C++ code"
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0
exit 0
fi


@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) {
# since changing them would simply wreak havoc
not_allowed <- c("num_class", "metric", "boosting_type")
if (any(pnames %in% not_allowed)) {
stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting")
stop(
"Parameters "
, paste0(pnames[pnames %in% not_allowed], collapse = ", ")
, " cannot be changed during boosting"
)
}
# Check parameter names
@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) {
i <- env$iteration
# Check if iteration matches modulo
if ((i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration ))) {
if ( (i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration))) {
# Merge evaluation string
msg <- merge.eval.string(env)
@ -244,8 +248,14 @@ cb.record.evaluation <- function() {
name <- eval_res$name
# Store evaluation data
env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err)
env$model$record_evals[[data_name]][[name]]$eval <- c(
env$model$record_evals[[data_name]][[name]]$eval
, eval_res$value
)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(
env$model$record_evals[[data_name]][[name]]$eval_err
, eval_err
)
}
@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
}
# Extract callback names from the list of callbacks
callback.names <- function(cb_list) { unlist(lapply(cb_list, attr, "name")) }
callback.names <- function(cb_list) {
unlist(lapply(cb_list, attr, "name"))
}
add.cb <- function(cb_list, cb) {


@ -46,7 +46,12 @@ Booster <- R6::R6Class(
}
# Store booster handle
handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str)
handle <- lgb.call(
"LGBM_BoosterCreate_R"
, ret = handle
, train_set$.__enclos_env__$private$get_handle()
, params_str
)
# Create private booster information
private$train_set <- train_set
@ -57,10 +62,12 @@ Booster <- R6::R6Class(
if (!is.null(private$init_predictor)) {
# Merge booster
lgb.call("LGBM_BoosterMerge_R",
ret = NULL,
handle,
private$init_predictor$.__enclos_env__$private$handle)
lgb.call(
"LGBM_BoosterMerge_R"
, ret = NULL
, handle
, private$init_predictor$.__enclos_env__$private$handle
)
}
@ -75,9 +82,11 @@ Booster <- R6::R6Class(
}
# Create booster from model
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R",
ret = handle,
lgb.c_str(modelfile))
handle <- lgb.call(
"LGBM_BoosterCreateFromModelfile_R"
, ret = handle
, lgb.c_str(modelfile)
)
} else if (!is.null(model_str)) {
@ -87,14 +96,19 @@ Booster <- R6::R6Class(
}
# Create booster from model
handle <- lgb.call("LGBM_BoosterLoadModelFromString_R",
ret = handle,
lgb.c_str(model_str))
handle <- lgb.call(
"LGBM_BoosterLoadModelFromString_R"
, ret = handle
, lgb.c_str(model_str)
)
} else {
# Booster non existent
stop("lgb.Booster: Need at least either training dataset, model file, or model_str to create booster instance")
stop(
"lgb.Booster: Need at least either training dataset, "
, "model file, or model_str to create booster instance"
)
}
@ -111,9 +125,11 @@ Booster <- R6::R6Class(
class(handle) <- "lgb.Booster.handle"
private$handle <- handle
private$num_class <- 1L
private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R",
ret = private$num_class,
private$handle)
private$num_class <- lgb.call(
"LGBM_BoosterGetNumClasses_R"
, ret = private$num_class
, private$handle
)
}
@ -138,7 +154,10 @@ Booster <- R6::R6Class(
# Check if predictors are identical
if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) {
stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data")
stop(
"lgb.Booster.add_valid: Failed to add validation data; "
, "you should use the same predictor for these data"
)
}
# Check if names are character
@ -147,10 +166,12 @@ Booster <- R6::R6Class(
}
# Add validation data to booster
lgb.call("LGBM_BoosterAddValidData_R",
ret = NULL,
private$handle,
data$.__enclos_env__$private$get_handle())
lgb.call(
"LGBM_BoosterAddValidData_R"
, ret = NULL
, private$handle
, data$.__enclos_env__$private$get_handle()
)
# Store private information
private$valid_sets <- c(private$valid_sets, data)
@ -171,10 +192,12 @@ Booster <- R6::R6Class(
params_str <- lgb.params2str(params)
# Reset parameters
lgb.call("LGBM_BoosterResetParameter_R",
ret = NULL,
private$handle,
params_str)
lgb.call(
"LGBM_BoosterResetParameter_R"
, ret = NULL
, private$handle
, params_str
)
# Return self
return(invisible(self))
@ -198,10 +221,12 @@ Booster <- R6::R6Class(
}
# Reset training data on booster
lgb.call("LGBM_BoosterResetTrainingData_R",
ret = NULL,
private$handle,
train_set$.__enclos_env__$private$get_handle())
lgb.call(
"LGBM_BoosterResetTrainingData_R"
, ret = NULL
, private$handle
, train_set$.__enclos_env__$private$get_handle()
)
# Store private train set
private$train_set = train_set
@ -230,18 +255,20 @@ Booster <- R6::R6Class(
gpair <- fobj(private$inner_predict(1), private$train_set)
# Check for gradient and hessian as list
if(is.null(gpair$grad) || is.null(gpair$hess)){
if (is.null(gpair$grad) || is.null(gpair$hess)){
stop("lgb.Booster.update: custom objective should
return a list with attributes (hess, grad)")
}
# Return custom boosting gradient/hessian
ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R",
ret = NULL,
private$handle,
gpair$grad,
gpair$hess,
length(gpair$grad))
ret <- lgb.call(
"LGBM_BoosterUpdateOneIterCustom_R"
, ret = NULL
, private$handle
, gpair$grad
, gpair$hess
, length(gpair$grad)
)
}
@ -258,9 +285,11 @@ Booster <- R6::R6Class(
rollback_one_iter = function() {
# Return one iteration behind
lgb.call("LGBM_BoosterRollbackOneIter_R",
ret = NULL,
private$handle)
lgb.call(
"LGBM_BoosterRollbackOneIter_R"
, ret = NULL
, private$handle
)
# Loop through each iteration
for (i in seq_along(private$is_predicted_cur_iter)) {
@ -276,9 +305,11 @@ Booster <- R6::R6Class(
current_iter = function() {
cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R",
ret = cur_iter,
private$handle)
lgb.call(
"LGBM_BoosterGetCurrentIteration_R"
, ret = cur_iter
, private$handle
)
},
@ -349,7 +380,10 @@ Booster <- R6::R6Class(
# Loop through each validation set
for (i in seq_along(private$valid_sets)) {
ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval))
ret <- append(
x = ret
, values = private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)
)
}
# Return ret
@ -366,11 +400,13 @@ Booster <- R6::R6Class(
}
# Save booster model
lgb.call("LGBM_BoosterSaveModel_R",
ret = NULL,
private$handle,
as.integer(num_iteration),
lgb.c_str(filename))
lgb.call(
"LGBM_BoosterSaveModel_R"
, ret = NULL
, private$handle
, as.integer(num_iteration)
, lgb.c_str(filename)
)
# Return self
return(invisible(self))
@ -385,9 +421,11 @@ Booster <- R6::R6Class(
}
# Return model string
return(lgb.call.return.str("LGBM_BoosterSaveModelToString_R",
private$handle,
as.integer(num_iteration)))
return(lgb.call.return.str(
"LGBM_BoosterSaveModelToString_R"
, private$handle
, as.integer(num_iteration)
))
},
@ -400,9 +438,11 @@ Booster <- R6::R6Class(
}
# Return dumped model
lgb.call.return.str("LGBM_BoosterDumpModel_R",
private$handle,
as.integer(num_iteration))
lgb.call.return.str(
"LGBM_BoosterDumpModel_R"
, private$handle
, as.integer(num_iteration)
)
},
@ -478,10 +518,12 @@ Booster <- R6::R6Class(
# Store predictions
npred <- 0L
npred <- lgb.call("LGBM_BoosterGetNumPredict_R",
ret = npred,
private$handle,
as.integer(idx - 1))
npred <- lgb.call(
"LGBM_BoosterGetNumPredict_R"
, ret = npred
, private$handle
, as.integer(idx - 1)
)
private$predict_buffer[[data_name]] <- numeric(npred)
}
@ -490,10 +532,12 @@ Booster <- R6::R6Class(
if (!private$is_predicted_cur_iter[[idx]]) {
# Use buffer
private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R",
ret = private$predict_buffer[[data_name]],
private$handle,
as.integer(idx - 1))
private$predict_buffer[[data_name]] <- lgb.call(
"LGBM_BoosterGetPredict_R"
, ret = private$predict_buffer[[data_name]]
, private$handle
, as.integer(idx - 1)
)
private$is_predicted_cur_iter[[idx]] <- TRUE
}
@ -508,8 +552,10 @@ Booster <- R6::R6Class(
if (is.null(private$eval_names)) {
# Get evaluation names
names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R",
private$handle)
names <- lgb.call.return.str(
"LGBM_BoosterGetEvalNames_R"
, private$handle
)
# Check names' length
if (nchar(names) > 0) {
@ -547,10 +593,12 @@ Booster <- R6::R6Class(
# Create evaluation values
tmp_vals <- numeric(length(private$eval_names))
tmp_vals <- lgb.call("LGBM_BoosterGetEval_R",
ret = tmp_vals,
private$handle,
as.integer(data_idx - 1))
tmp_vals <- lgb.call(
"LGBM_BoosterGetEval_R"
, ret = tmp_vals
, private$handle
, as.integer(data_idx - 1)
)
# Loop through all evaluation names
for (i in seq_along(private$eval_names)) {
@ -587,7 +635,7 @@ Booster <- R6::R6Class(
res <- feval(private$inner_predict(data_idx), data)
# Check for name correctness
if(is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
stop("lgb.Booster.eval: custom eval function should return a
list with attribute (name, value, higher_better)");
}
@ -614,13 +662,13 @@ Booster <- R6::R6Class(
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param num_iteration number of iterations to predict with, NULL or <= 0 means use best iteration
#' @param rawscore whether the prediction should be returned in the form of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
#' logistic regression would result in predictions for log-odds instead of probabilities.
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
#' for logistic regression would result in predictions for log-odds instead of probabilities.
#' @param predleaf whether to predict leaf index instead.
#' @param predcontrib return per-feature contributions for each record.
#' @param header only used for prediction for text file. True if text file has header
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
#' prediction outputs per case.
#' prediction outputs per case.
#' @param ... Additional named arguments passed to the \code{predict()} method of
#' the \code{lgb.Booster} object passed to \code{object}.
#' @return
@ -642,13 +690,15 @@ Booster <- R6::R6Class(
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' preds <- predict(model, test$data)
#'
#' @rdname predict.lgb.Booster
@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object,
}
# Return booster predictions
object$predict(data,
num_iteration,
rawscore,
predleaf,
predcontrib,
header,
reshape, ...)
object$predict(
data
, num_iteration
, rawscore
, predleaf
, predcontrib
, header
, reshape
, ...
)
}
#' Load LightGBM model
@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object,
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt")
#' load_booster <- lgb.load(filename = "model.txt")
#' model_string <- model$save_model_to_string(NULL) # saves best iteration
@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt")
#'
#' @rdname lgb.save
@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' json_model <- lgb.dump(model)
#'
#' @rdname lgb.dump
@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.get.eval.result(model, "test", "l2")
#' @rdname lgb.get.eval.result
#' @export


@ -97,16 +97,18 @@ Dataset <- R6::R6Class(
...) {
# Create new dataset
ret <- Dataset$new(data,
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
NULL,
info,
...)
ret <- Dataset$new(
data = data
, params = private$params
, reference = self
, colnames = private$colnames
, categorical_feature = private$categorical_feature
, predictor = private$predictor
, free_raw_data = private$free_raw_data
, used_indices = NULL
, info = info
, ...
)
# Return ret
return(invisible(ret))
@ -142,14 +144,23 @@ Dataset <- R6::R6Class(
# Provided indices, but some indices are not existing?
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
stop(
"lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
, sQuote(private$categorical_feature[is.na(cate_indices)])
)
}
} else {
# Check if more categorical features were output over the feature space
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
stop(
"lgb.self.get.handle: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
, " but only "
, length(private$colnames)
, " features"
)
}
# Store indices as [0, n-1] indexed instead of [1, n] indexed
@ -165,7 +176,9 @@ Dataset <- R6::R6Class(
# Check has header or not
has_header <- FALSE
if (!is.null(private$params$has_header) || !is.null(private$params$header)) {
if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") {
params_has_header <- tolower(as.character(private$params$has_header)) == "true"
params_header <- tolower(as.character(private$params$header)) == "true"
if (params_has_header || params_header) {
has_header <- TRUE
}
}
@ -186,43 +199,52 @@ Dataset <- R6::R6Class(
# Are we using a data file?
if (is.character(private$raw_data)) {
handle <- lgb.call("LGBM_DatasetCreateFromFile_R",
ret = handle,
lgb.c_str(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromFile_R"
, ret = handle
, lgb.c_str(private$raw_data)
, params_str
, ref_handle
)
} else if (is.matrix(private$raw_data)) {
# Are we using a matrix?
handle <- lgb.call("LGBM_DatasetCreateFromMat_R",
ret = handle,
private$raw_data,
nrow(private$raw_data),
ncol(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromMat_R"
, ret = handle
, private$raw_data
, nrow(private$raw_data)
, ncol(private$raw_data)
, params_str
, ref_handle
)
} else if (methods::is(private$raw_data, "dgCMatrix")) {
if (length(private$raw_data@p) > 2147483647) {
stop("Cannot support large CSC matrix")
}
# Are we using a dgCMatrix (sparsed matrix column compressed)
handle <- lgb.call("LGBM_DatasetCreateFromCSC_R",
ret = handle,
private$raw_data@p,
private$raw_data@i,
private$raw_data@x,
length(private$raw_data@p),
length(private$raw_data@x),
nrow(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromCSC_R"
, ret = handle
, private$raw_data@p
, private$raw_data@i
, private$raw_data@x
, length(private$raw_data@p)
, length(private$raw_data@x)
, nrow(private$raw_data)
, params_str
, ref_handle
)
} else {
# Unknown data type
stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data)))
stop(
"lgb.Dataset.construct: does not support constructing from "
, sQuote(class(private$raw_data))
)
}
@ -234,12 +256,14 @@ Dataset <- R6::R6Class(
}
# Construct subset
handle <- lgb.call("LGBM_DatasetGetSubset_R",
ret = handle,
ref_handle,
c(private$used_indices), # Adding c() fixes issue in R v3.5
length(private$used_indices),
params_str)
handle <- lgb.call(
"LGBM_DatasetGetSubset_R"
, ret = handle
, ref_handle
, c(private$used_indices) # Adding c() fixes issue in R v3.5
, length(private$used_indices)
, params_str
)
}
if (lgb.is.null.handle(handle)) {
@ -258,7 +282,11 @@ Dataset <- R6::R6Class(
if (!is.null(private$predictor) && is.null(private$used_indices)) {
# Setup initial scores
init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE)
init_score <- private$predictor$predict(
private$raw_data
, rawscore = TRUE
, reshape = TRUE
)
# No need to transpose, since it is col_major
init_score <- as.vector(init_score)
@ -316,7 +344,10 @@ Dataset <- R6::R6Class(
} else {
# Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
stop(
"dim: cannot get dimensions before dataset has been constructed, "
, "please call lgb.Dataset.construct explicitly"
)
}
@ -341,7 +372,10 @@ Dataset <- R6::R6Class(
} else {
# Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
stop(
"dim: cannot get dimensions before dataset has been constructed, please call "
, "lgb.Dataset.construct explicitly"
)
}
@ -367,10 +401,12 @@ Dataset <- R6::R6Class(
# Merge names with tab separation
merged_name <- paste0(as.list(private$colnames), collapse = "\t")
lgb.call("LGBM_DatasetSetFeatureNames_R",
ret = NULL,
private$handle,
lgb.c_str(merged_name))
lgb.call(
"LGBM_DatasetSetFeatureNames_R"
, ret = NULL
, private$handle
, lgb.c_str(merged_name)
)
}
@ -399,10 +435,12 @@ Dataset <- R6::R6Class(
# Get field size of info
info_len <- 0L
info_len <- lgb.call("LGBM_DatasetGetFieldSize_R",
ret = info_len,
private$handle,
lgb.c_str(name))
info_len <- lgb.call(
"LGBM_DatasetGetFieldSize_R"
, ret = info_len
, private$handle
, lgb.c_str(name)
)
# Check if info is not empty
if (info_len > 0) {
@ -415,10 +453,12 @@ Dataset <- R6::R6Class(
numeric(info_len) # Numeric
}
ret <- lgb.call("LGBM_DatasetGetField_R",
ret = ret,
private$handle,
lgb.c_str(name))
ret <- lgb.call(
"LGBM_DatasetGetField_R"
, ret = ret
, private$handle
, lgb.c_str(name)
)
private$info[[name]] <- ret
@ -454,12 +494,14 @@ Dataset <- R6::R6Class(
if (length(info) > 0) {
lgb.call("LGBM_DatasetSetField_R",
ret = NULL,
private$handle,
lgb.c_str(name),
info,
length(info))
lgb.call(
"LGBM_DatasetSetField_R"
, ret = NULL
, private$handle
, lgb.c_str(name)
, info
, length(info)
)
}
@ -474,16 +516,18 @@ Dataset <- R6::R6Class(
slice = function(idxset, ...) {
# Perform slicing
Dataset$new(NULL,
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
sort(idxset, decreasing = FALSE),
NULL,
...)
Dataset$new(
data = NULL
, params = private$params
, reference = self
, colnames = private$colnames
, categorical_feature = private$categorical_feature
, predictor = private$predictor
, free_raw_data = private$free_raw_data
, used_indices = sort(idxset, decreasing = FALSE)
, info = NULL
, ...
)
},
@ -492,7 +536,12 @@ Dataset <- R6::R6Class(
# Parameter updating
if (!lgb.is.null.handle(private$handle)) {
lgb.call("LGBM_DatasetUpdateParam_R", ret = NULL, private$handle, lgb.params2str(params))
lgb.call(
"LGBM_DatasetUpdateParam_R"
, ret = NULL
, private$handle
, lgb.params2str(params)
)
return(invisible(self))
}
private$params <- modifyList(private$params, params)
@ -568,10 +617,12 @@ Dataset <- R6::R6Class(
# Store binary data
self$construct()
lgb.call("LGBM_DatasetSaveBinary_R",
ret = NULL,
private$handle,
lgb.c_str(fname))
lgb.call(
"LGBM_DatasetSaveBinary_R"
, ret = NULL
, private$handle
, lgb.c_str(fname)
)
return(invisible(self))
}
@ -671,16 +722,18 @@ lgb.Dataset <- function(data,
...) {
# Create new dataset
invisible(Dataset$new(data,
params,
reference,
colnames,
categorical_feature,
NULL,
free_raw_data,
NULL,
info,
...))
invisible(Dataset$new(
data = data
, params = params
, reference = reference
, colnames = colnames
, categorical_feature = categorical_feature
, predictor = NULL
, free_raw_data = free_raw_data
, used_indices = NULL
, info = info
, ...
))
}
@ -784,7 +837,7 @@ dim.lgb.Dataset <- function(x, ...) {
#'
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#' and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) {
# Check for unmatching column size
if (ncol(x) != length(value[[2]])) {
stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns")
stop(
"can't assign "
, sQuote(length(value[[2]]))
, " colnames to an lgb.Dataset with "
, sQuote(ncol(x))
, " columns"
)
}
# Set column names properly, and return


@ -13,7 +13,11 @@ Predictor <- R6::R6Class(
if (private$need_free_handle && !lgb.is.null.handle(private$handle)) {
# Freeing up handle
lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
lgb.call(
"LGBM_BoosterFree_R"
, ret = NULL
, private$handle
)
private$handle <- NULL
}
@ -31,7 +35,11 @@ Predictor <- R6::R6Class(
if (is.character(modelfile)) {
# Create handle on it
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile))
handle <- lgb.call(
"LGBM_BoosterCreateFromModelfile_R"
, ret = handle
, lgb.c_str(modelfile)
)
private$need_free_handle <- TRUE
} else if (methods::is(modelfile, "lgb.Booster.handle")) {
@ -57,7 +65,11 @@ Predictor <- R6::R6Class(
current_iter = function() {
cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle)
lgb.call(
"LGBM_BoosterGetCurrentIteration_R"
, ret = cur_iter
, private$handle
)
},
@ -86,14 +98,19 @@ Predictor <- R6::R6Class(
on.exit(unlink(tmp_filename), add = TRUE)
# Predict from temporary file
lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data,
as.integer(header),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params,
lgb.c_str(tmp_filename))
lgb.call(
"LGBM_BoosterPredictForFile_R"
, ret = NULL
, private$handle
, data
, as.integer(header)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
, lgb.c_str(tmp_filename)
)
# Get predictions from file
preds <- read.delim(tmp_filename, header = FALSE, sep = "\t")
@ -108,51 +125,57 @@ Predictor <- R6::R6Class(
npred <- 0L
# Check number of predictions to do
npred <- lgb.call("LGBM_BoosterCalcNumPredict_R",
ret = npred,
private$handle,
as.integer(num_row),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration))
npred <- lgb.call(
"LGBM_BoosterCalcNumPredict_R"
, ret = npred
, private$handle
, as.integer(num_row)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
)
# Pre-allocate empty vector
preds <- numeric(npred)
# Check if data is a matrix
if (is.matrix(data)) {
preds <- lgb.call("LGBM_BoosterPredictForMat_R",
ret = preds,
private$handle,
data,
as.integer(nrow(data)),
as.integer(ncol(data)),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params)
preds <- lgb.call(
"LGBM_BoosterPredictForMat_R"
, ret = preds
, private$handle
, data
, as.integer(nrow(data))
, as.integer(ncol(data))
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
)
} else if (methods::is(data, "dgCMatrix")) {
if (length(data@p) > 2147483647) {
stop("Cannot support large CSC matrix")
}
# Check if data is a dgCMatrix (sparse matrix, column compressed format)
preds <- lgb.call("LGBM_BoosterPredictForCSC_R",
ret = preds,
private$handle,
data@p,
data@i,
data@x,
length(data@p),
length(data@x),
nrow(data),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params)
preds <- lgb.call(
"LGBM_BoosterPredictForCSC_R"
, ret = preds
, private$handle
, data@p
, data@i
, data@x
, length(data@p)
, length(data@x)
, nrow(data)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
)
} else {
@ -165,7 +188,12 @@ Predictor <- R6::R6Class(
# Check if number of rows is strange (not a multiple of the dataset rows)
if (length(preds) %% num_row != 0) {
stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row))
stop(
"predict: prediction length "
, sQuote(length(preds))
, " is not a multiple of nrows(data): "
, sQuote(num_row)
)
}
# Get number of cases per row
@ -192,7 +220,9 @@ Predictor <- R6::R6Class(
}
),
private = list(handle = NULL,
need_free_handle = FALSE,
params = "")
private = list(
handle = NULL
, need_free_handle = FALSE
, params = ""
)
)


@ -25,23 +25,23 @@ CVBooster <- R6::R6Class(
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels.
#' by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
#' type int represents index,
#' type str represents feature names
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
@ -61,13 +61,15 @@ CVBooster <- R6::R6Class(
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(params,
#' dtrain,
#' 10,
#' nfold = 3,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.cv(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , nfold = 3
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export
lgb.cv <- function(params = list(),
data,
@ -134,7 +136,17 @@ lgb.cv <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1
}
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
n_trees <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_trees)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1
} else {
@ -192,12 +204,14 @@ lgb.cv <- function(params = list(),
}
# Create folds
folds <- generate.cv.folds(nfold,
nrow(data),
stratified,
getinfo(data, "label"),
getinfo(data, "group"),
params)
folds <- generate.cv.folds(
nfold
, nrow(data)
, stratified
, getinfo(data, "label")
, getinfo(data, "group")
, params
)
}
@ -215,12 +229,24 @@ lgb.cv <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
}
} else {
if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
}
}
}
@ -292,7 +318,7 @@ lgb.cv <- function(params = list(),
env$eval_list <- merged_msg$eval_list
# Check for standard deviation requirement
if(showsd) {
if (showsd) {
env$eval_err_list <- merged_msg$eval_err_list
}
@ -319,9 +345,11 @@ lgb.cv <- function(params = list(),
if (reset_data) {
lapply(cv_booster$boosters, function(fd) {
# Store temporarily model data elsewhere
booster_old <- list(best_iter = fd$booster$best_iter,
best_score = fd$booster$best_score,
record_evals = fd$booster$record_evals)
booster_old <- list(
best_iter = fd$booster$best_iter
, best_score = fd$booster$best_score
, record_evals = fd$booster$record_evals
)
# Reload model
fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
fd$booster$best_iter <- booster_old$best_iter
@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
# Loop through each fold
for (i in seq_len(nfold)) {
kstep <- length(rnd_idx) %/% (nfold - i + 1)
folds[[i]] <- list(fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]),
group = rnd_idx[seq_len(kstep)])
folds[[i]] <- list(
fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
, group = rnd_idx[seq_len(kstep)]
)
rnd_idx <- rnd_idx[-seq_len(kstep)]
}
@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) {
if (is.numeric(y)) {
cuts <- length(y) %/% k
if (cuts < 2) { cuts <- 2 }
if (cuts > 5) { cuts <- 5 }
y <- cut(y,
unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))),
include.lowest = TRUE)
if (cuts < 2) {
cuts <- 2
}
if (cuts > 5) {
cuts <- 5
}
y <- cut(
y
, unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts)))
, include.lowest = TRUE
)
}
@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
# Parse standard deviation
for (j in seq_len(eval_len)) {
ret_eval_err <- c(ret_eval_err,
sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2))
ret_eval_err <- c(
ret_eval_err
, sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)
)
}
# Convert to list
@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
}
# Return errors
list(eval_list = ret_eval,
eval_err_list = ret_eval_err)
list(
eval_list = ret_eval
, eval_err_list = ret_eval_err
)
}


@ -21,9 +21,14 @@
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) {
# Check if relative values are requested
if (percentage) {
tree_imp_dt[, ":="(Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency))]
tree_imp_dt[, `:=`(
Gain = Gain / sum(Gain)
, Cover = Cover / sum(Cover)
, Frequency = Frequency / sum(Frequency)
)]
}
# Return importance table


@ -69,12 +69,21 @@ lgb.interprete <- function(model,
)
# Get list of trees
tree_index_mat_list <- lapply(leaf_index_mat_list,
FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE))
tree_index_mat_list <- lapply(
X = leaf_index_mat_list
, FUN = function(x){
matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)
}
)
# Sequence over idxset
for (i in seq_along(idxset)) {
tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]])
tree_interpretation_dt_list[[i]] <- single.row.interprete(
tree_dt
, num_class
, tree_index_mat_list[[i]]
, leaf_index_mat_list[[i]]
)
}
# Return interpretation list
@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt,
leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])
# Return formatted data.table
data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq))
data.table::data.table(
Feature = feature_seq
, Contribution = diff.default(value_seq)
)
}
@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index
} else {
# Full interpretation elements
tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE),
x = tree_interpretation)
tree_interpretation_dt <- Reduce(
f = function(x, y){
merge(x, y, by = "Feature", all = TRUE)
}
, x = tree_interpretation
)
# Loop throughout each tree
for (j in 2:ncol(tree_interpretation_dt)) {
data.table::set(tree_interpretation_dt,
i = which(is.na(tree_interpretation_dt[[j]])),
j = j,
value = 0)
data.table::set(
tree_interpretation_dt
, i = which(is.na(tree_interpretation_dt[[j]]))
, j = j
, value = 0
)
}


@ -35,9 +35,14 @@
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_dt <- lgb.model.dt.tree(model)
@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
json_model <- lgb.dump(model, num_iteration = num_iteration)
# Parse json model second
parsed_json_model <- jsonlite::fromJSON(json_model,
simplifyVector = TRUE,
simplifyDataFrame = FALSE,
simplifyMatrix = FALSE,
flatten = FALSE)
parsed_json_model <- jsonlite::fromJSON(
json_model
, simplifyVector = TRUE
, simplifyDataFrame = FALSE
, simplifyMatrix = FALSE
, flatten = FALSE
)
# Parse tree model third
tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) {
if (is.null(env)) {
# Setup initial default data.table with default types
env <- new.env(parent = emptyenv())
env$single_tree_dt <- data.table::data.table(tree_index = integer(0),
depth = integer(0),
split_index = integer(0),
split_feature = integer(0),
node_parent = integer(0),
leaf_index = integer(0),
leaf_parent = integer(0),
split_gain = numeric(0),
threshold = numeric(0),
decision_type = character(0),
default_left = character(0),
internal_value = integer(0),
internal_count = integer(0),
leaf_value = integer(0),
leaf_count = integer(0))
env$single_tree_dt <- data.table::data.table(
tree_index = integer(0)
, depth = integer(0)
, split_index = integer(0)
, split_feature = integer(0)
, node_parent = integer(0)
, leaf_index = integer(0)
, leaf_parent = integer(0)
, split_gain = numeric(0)
, threshold = numeric(0)
, decision_type = character(0)
, default_left = character(0)
, internal_value = integer(0)
, internal_count = integer(0)
, leaf_value = integer(0)
, leaf_count = integer(0)
)
# start tree traversal
pre_order_traversal(env, tree_node_leaf, current_depth, parent_index)
} else {
@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) {
fill = TRUE)
# Traverse tree again both left and right
pre_order_traversal(env,
tree_node_leaf$left_child,
current_depth = current_depth + 1L,
parent_index = tree_node_leaf$split_index)
pre_order_traversal(env,
tree_node_leaf$right_child,
current_depth = current_depth + 1L,
parent_index = tree_node_leaf$split_index)
pre_order_traversal(
env
, tree_node_leaf$left_child
, current_depth = current_depth + 1L
, parent_index = tree_node_leaf$split_index
)
pre_order_traversal(
env
, tree_node_leaf$right_child
, current_depth = current_depth + 1L
, parent_index = tree_node_leaf$split_index
)
} else if (!is.null(tree_node_leaf$leaf_index)) {


@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp,
cex = NULL) {
# Check for measurement (column names) correctness
measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE)
measure <- match.arg(
measure
, choices = c("Gain", "Cover", "Frequency")
, several.ok = FALSE
)
# Get top N importance (defaults to 10)
top_n <- min(top_n, nrow(tree_imp))
@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp,
# Do plot
tree_imp[.N:1,
graphics::barplot(
height = get(measure),
names.arg = Feature,
horiz = TRUE,
border = NA,
main = "Feature Importance",
xlab = measure,
cex.names = cex,
las = 1
height = get(measure)
, names.arg = Feature
, horiz = TRUE
, border = NA
, main = "Feature Importance"
, xlab = measure
, cex.names = cex
, las = 1
)]
# Return invisibly


@ -9,8 +9,8 @@
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
#' Features are shown ranked in a decreasing contribution order.
#' The graph represents each feature as a horizontal bar of length proportional to the defined
#' contribution of a feature. Features are shown ranked in a decreasing contribution order.
#'
#' @return
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}.
@ -26,9 +26,14 @@
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt,
if (num_class == 1) {
# Only one class, plot straight away
multiple.tree.plot.interpretation(tree_interpretation_dt,
top_n = top_n,
title = NULL,
cex = cex)
multiple.tree.plot.interpretation(
tree_interpretation_dt
, top_n = top_n
, title = NULL
, cex = cex
)
} else {
# More than one class, shape data first
layout_mat <- matrix(seq.int(to = cols * ceiling(num_class / cols)),
ncol = cols, nrow = ceiling(num_class / cols))
layout_mat <- matrix(
seq.int(to = cols * ceiling(num_class / cols))
, ncol = cols
, nrow = ceiling(num_class / cols)
)
# Shape output
graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation,
# Do plot
tree_interpretation[.N:1,
graphics::barplot(
height = Contribution,
names.arg = Feature,
horiz = TRUE,
col = ifelse(Contribution > 0, "firebrick", "steelblue"),
border = NA,
main = title,
cex.names = cex,
las = 1
height = Contribution
, names.arg = Feature
, horiz = TRUE
, col = ifelse(Contribution > 0, "firebrick", "steelblue")
, border = NA
, main = title
, cex.names = cex
, las = 1
)]
# Return invisibly


@ -1,10 +1,13 @@
#' Data preparator for LightGBM datasets (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric without integers. Please use
#' \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
#'
#' @param data A data.frame or data.table to prepare.
#'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
@ -71,7 +74,11 @@ lgb.prepare <- function(data) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare2: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}


@ -1,10 +1,16 @@
#' Data preparator for LightGBM datasets (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input. Consider this as a half
#' memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)


@ -1,11 +1,15 @@
#' Data preparator for LightGBM datasets with rules (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric. In addition, keeps rules created
#' so you can convert other datasets using this converter.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input
#' in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}


@ -1,11 +1,18 @@
#' Data preparator for LightGBM datasets with rules (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' In addition, keeps rules created so you can convert other datasets using this converter.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input.
#' Consider this as a half memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
@ -35,9 +42,13 @@
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3L,
#' "versicolor" = 2L,
#' "virginica" = 1L))
#' personal_rules <- list(
#' Species = c(
#' "setosa" = 3L
#' , "versicolor" = 2L
#' , "virginica" = 1L
#' )
#' )
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#'
@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}


@ -4,17 +4,18 @@
#' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (a list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
#' type int represents index,
#' type str represents feature names
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' booster model into a predictor model which frees up memory and the
#' original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
@ -37,13 +38,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export
lgb.train <- function(params = list(),
data,
@ -105,7 +108,17 @@ lgb.train <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1
}
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
n_rounds <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_rounds)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1
} else {
@ -198,12 +211,24 @@ lgb.train <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
}
} else {
if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
}
}
}
@ -267,7 +292,8 @@ lgb.train <- function(params = list(),
}
# When early stopping is not activated, we compute the best iteration / score ourselves by selecting the first metric and the first dataset
# When early stopping is not activated, we compute the best iteration / score ourselves by
# selecting the first metric and the first dataset
if (record && length(valids) > 0 && is.na(env$best_score)) {
if (env$eval_list[[1]]$higher_better[1] == TRUE) {
booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]])))
@ -282,9 +308,11 @@ lgb.train <- function(params = list(),
if (reset_data) {
# Store temporarily model data elsewhere
booster_old <- list(best_iter = booster$best_iter,
best_score = booster$best_score,
record_evals = booster$record_evals)
booster_old <- list(
best_iter = booster$best_iter
, best_score = booster$best_score
, record_evals = booster$record_evals
)
# Reload model
booster <- lgb.load(model_str = booster$save_model_to_string())


@ -2,9 +2,13 @@
#'
#' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
#'
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R.
#' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
#' unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
#' environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
#' \code{.GlobalEnv} which is the global environment.
#'
#' @return NULL invisibly.
#'
@ -18,13 +22,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#'
#' \dontrun{
#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
# Should we wipe variables? (lgb.Booster, lgb.Dataset)
if (wipe) {
boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir))
datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir))
boosters <- Filter(
f = function(x){
inherits(get(x, envir = envir), "lgb.Booster")
}
, x = ls(envir = envir)
)
datasets <- Filter(
f = function(x){
inherits(get(x, envir = envir), "lgb.Dataset")
}
, x = ls(envir = envir)
)
rm(list = c(boosters, datasets), envir = envir)
gc(verbose = FALSE)
}


@ -4,12 +4,10 @@
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param data a \code{lgb.Dataset} object, used for training
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
#' If there's more than one, will check all of them except the training data
#' Returns the model with (best_iter + early_stopping_rounds)
#' If early stopping occurs, the model will have 'best_iter' field
#' @param early_stopping_rounds int. Activates early stopping. Requires at least one validation data
#' and one metric. If there's more than one, will check all of them
#' except the training data. Returns the model with (best_iter + early_stopping_rounds).
#' If early stopping occurs, the model will have 'best_iter' field.
#' @param eval_freq evaluation output frequency, only effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds
@ -76,9 +74,18 @@ lightgbm <- function(data,
}
# Train a model using the regular way
bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq,
early_stopping_rounds = early_stopping_rounds,
init_model = init_model, callbacks = callbacks, ...)
bst <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
, valids = valids
, verbose = verbose
, eval_freq = eval_freq
, early_stopping_rounds = early_stopping_rounds
, init_model = init_model
, callbacks = callbacks
, ...
)
# Store model under a specific name
bst$save_model(save_name)


@ -17,13 +17,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds")
#'


@ -1,12 +1,18 @@
#' saveRDS for \code{lgb.Booster} models
#'
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides
#' whether to save the raw model or not.
#'
#' @param object R object to serialize.
#' @param file a connection or the name of the file where the R object is saved to or read from.
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.
#' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default),
#' a binary one is used. See the comments in the help for save.
#' @param version the workspace format version to use. \code{NULL} specifies the current default
#' version (2). Versions prior to 2 are not supported, so this will only be relevant
#' when there are later versions.
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression,
#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of
#' compression to be used. Ignored if file is a connection.
#' @param refhook a hook function for handling reference objects.
#' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.
#'
@ -23,10 +29,10 @@
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(
#' params
#' , dtrain
#' , 10
#' , valids
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
@ -48,12 +54,14 @@ saveRDS.lgb.Booster <- function(object,
object$save()
# Save RDS
saveRDS(object,
file = file,
ascii = ascii,
version = version,
compress = compress,
refhook = refhook)
saveRDS(
object
, file = file
, ascii = ascii
, version = version
, compress = compress
, refhook = refhook
)
# Free model from memory
object$raw <- NA
@ -61,12 +69,14 @@ saveRDS.lgb.Booster <- function(object,
} else {
# Save as usual
saveRDS(object,
file = file,
ascii = ascii,
version = version,
compress = compress,
refhook = refhook)
saveRDS(
object
, file = file
, ascii = ascii
, version = version
, compress = compress
, refhook = refhook
)
}


@ -25,9 +25,20 @@ lgb.call <- function(fun_name, ret, ...) {
# Check for a ret call
if (!is.null(ret)) {
call_state <- .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm") # Call with ret
call_state <- .Call(
fun_name
, ...
, ret
, call_state
, PACKAGE = "lib_lightgbm"
)
} else {
call_state <- .Call(fun_name, ..., call_state, PACKAGE = "lib_lightgbm") # Call without ret
call_state <- .Call(
fun_name
, ...
, call_state
, PACKAGE = "lib_lightgbm"
)
}
call_state <- as.integer(call_state)
# Check for call state value post call
@ -37,17 +48,25 @@ lgb.call <- function(fun_name, ret, ...) {
buf_len <- 200L
act_len <- 0L
err_msg <- raw(buf_len)
err_msg <- .Call("LGBM_GetLastError_R", buf_len, act_len, err_msg, PACKAGE = "lib_lightgbm")
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
# Check error buffer
if (act_len > buf_len) {
buf_len <- act_len
err_msg <- raw(buf_len)
err_msg <- .Call("LGBM_GetLastError_R",
buf_len,
act_len,
err_msg,
PACKAGE = "lib_lightgbm")
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
}
# Return error
@ -97,7 +116,13 @@ lgb.params2str <- function(params, ...) {
# Check for identical parameters
if (length(intersect(names(params), names(dot_params))) > 0) {
stop("Same parameters in ", sQuote("params"), " and in the call are not allowed. Please check your ", sQuote("params"), " list")
stop(
"Same parameters in "
, sQuote("params")
, " and in the call are not allowed. Please check your "
, sQuote("params")
, " list"
)
}
# Merge parameters
@ -160,15 +185,43 @@ lgb.check.params <- function(params) {
lgb.check.obj <- function(params, obj) {
# List known objectives in a vector
OBJECTIVES <- c("regression", "regression_l1", "regression_l2", "mean_squared_error", "mse", "l2_root", "root_mean_squared_error", "rmse",
"mean_absolute_error", "mae", "quantile",
"huber", "fair", "poisson", "binary", "lambdarank",
"multiclass", "softmax", "multiclassova", "multiclass_ova", "ova", "ovr",
"xentropy", "cross_entropy", "xentlambda", "cross_entropy_lambda", "mean_absolute_percentage_error", "mape",
"gamma", "tweedie")
OBJECTIVES <- c(
"regression"
, "regression_l1"
, "regression_l2"
, "mean_squared_error"
, "mse"
, "l2_root"
, "root_mean_squared_error"
, "rmse"
, "mean_absolute_error"
, "mae"
, "quantile"
, "huber"
, "fair"
, "poisson"
, "binary"
, "lambdarank"
, "multiclass"
, "softmax"
, "multiclassova"
, "multiclass_ova"
, "ova"
, "ovr"
, "xentropy"
, "cross_entropy"
, "xentlambda"
, "cross_entropy_lambda"
, "mean_absolute_percentage_error"
, "mape"
, "gamma"
, "tweedie"
)
# Check whether the objective is empty or not, and take it from params if needed
if (!is.null(obj)) { params$objective <- obj }
if (!is.null(obj)) {
params$objective <- obj
}
# Check whether the objective is a character
if (is.character(params$objective)) {


@ -17,62 +17,82 @@ class(train$data)
# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input
# Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
print("Training lightgbm with sparseMatrix")
bst <- lightgbm(data = train$data,
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
bst <- lightgbm(
data = train$data
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# Alternatively, you can put in dense matrix, i.e. basic R-matrix
print("Training lightgbm with Matrix")
bst <- lightgbm(data = as.matrix(train$data),
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
bst <- lightgbm(
data = as.matrix(train$data)
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# You can also put in an lgb.Dataset object, which stores label, data, and other metadata needed for advanced features
print("Training lightgbm with lgb.Dataset")
dtrain <- lgb.Dataset(data = train$data,
label = train$label)
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
dtrain <- lgb.Dataset(
data = train$data
, label = train$label
)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# Verbose = 0,1,2
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary",
verbose = 0)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
, verbose = 0
)
print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 1)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, objective = "binary"
, verbose = 1
)
print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 2)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, objective = "binary"
, verbose = 2
)
# You can also specify data as a file path to a LibSVM/TSV/CSV format input
# Since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
# bst <- lightgbm(
# data = "agaricus.train.svm"
# , num_leaves = 4
# , learning_rate = 1
# , nrounds = 2
# , objective = "binary"
# )
#--------------------Basic prediction using lightgbm--------------
# You can do prediction using the following line
@ -104,37 +124,43 @@ valids <- list(train = dtrain, test = dtest)
# To train with valids, use lgb.train, which contains more advanced features
# valids allows us to monitor the evaluation result on all data in the list
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, nthread = 2
, objective = "binary"
)
# We can change evaluation metrics, or use multiple evaluation metrics
print("Train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
eval = c("binary_error", "binary_logloss"),
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, eval = c("binary_error", "binary_logloss")
, nthread = 2
, objective = "binary"
)
# lgb.Dataset can also be saved using lgb.Dataset.save
lgb.Dataset.save(dtrain, "dtrain.buffer")
# To load it in, simply call lgb.Dataset
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(data = dtrain2,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain2
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, nthread = 2
, objective = "binary"
)
# information can be extracted from lgb.Dataset using getinfo
label = getinfo(dtest, "label")


@ -13,10 +13,12 @@ valids <- list(eval = dtest, train = dtrain)
print("Start running example to start from an initial prediction")
# Train lightgbm for 1 round
param <- list(num_leaves = 4,
learning_rate = 1,
nthread = 2,
objective = "binary")
param <- list(
num_leaves = 4
, learning_rate = 1
, nthread = 2
, objective = "binary"
)
bst <- lgb.train(param, dtrain, 1, valids = valids)
# Note: we need the margin value instead of transformed prediction in set_init_score
@ -29,7 +31,9 @@ setinfo(dtrain, "init_score", ptrain)
setinfo(dtest, "init_score", ptest)
print("This is result of boost from initial prediction")
bst <- lgb.train(params = param,
data = dtrain,
nrounds = 5,
valids = valids)
bst <- lgb.train(
params = param
, data = dtrain
, nrounds = 5
, valids = valids
)
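# Illustration (not taken from this demo, names below are hypothetical): raw margins, as
# opposed to transformed predictions, can be obtained from a fitted booster with the
# rawscore argument of predict.lgb.Booster:
# margin_train <- predict(bst, agaricus.train$data, rawscore = TRUE)
# margin_test <- predict(bst, agaricus.test$data, rawscore = TRUE)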


@ -60,21 +60,28 @@ my_data <- as.matrix(bank[, 1:16, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(data = my_data,
label = bank$y,
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
lgb_data <- lgb.Dataset(
data = my_data
, label = bank$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
)
# We can now train a model
model <- lgb.train(list(objective = "binary",
metric = "l2",
min_data = 1,
learning_rate = 0.1,
min_data = 0,
min_hessian = 1,
max_depth = 2),
lgb_data,
100,
valids = list(train = lgb_data))
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
)
model <- lgb.train(
params = params
, data = lgb_data
, nrounds = 100
, valids = list(train = lgb_data)
)
# Try to find split_feature: 2
# If you find it, it means it used a categorical feature in the first tree
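# Illustration (not part of the original demo): one way to inspect the split features,
# assuming lgb.model.dt.tree() from this package, whose output has a split_feature column
tree_dt <- lgb.model.dt.tree(model)
print(head(tree_dt$split_feature, 10))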


@ -68,24 +68,33 @@ my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features can be passed to lgb.train to not copy and paste a lot
dtrain <- lgb.Dataset(data = my_data_train,
label = bank_train$y,
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
dtest <- lgb.Dataset.create.valid(dtrain,
data = my_data_test,
label = bank_test$y)
dtrain <- lgb.Dataset(
data = my_data_train
, label = bank_train$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
)
dtest <- lgb.Dataset.create.valid(
dtrain
, data = my_data_test
, label = bank_test$y
)
# We can now train a model
model <- lgb.train(list(objective = "binary",
metric = "l2",
min_data = 1,
learning_rate = 0.1,
min_data = 0,
min_hessian = 1,
max_depth = 2),
dtrain,
100,
valids = list(train = dtrain, valid = dtest))
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 100
, valids = list(train = dtrain, valid = dtest)
)
# Try to find split_feature: 11
# If you find it, it means it used a categorical feature in the first tree


@ -6,30 +6,36 @@ dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label)
nrounds <- 2
param <- list(num_leaves = 4,
learning_rate = 1,
objective = "binary")
param <- list(
num_leaves = 4
, learning_rate = 1
, objective = "binary"
)
print("Running cross validation")
# Do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
lgb.cv(param,
dtrain,
nrounds,
nfold = 5,
eval = "binary_error")
lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, eval = "binary_error"
)
print("Running cross validation, disable standard deviation display")
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
lgb.cv(param,
dtrain,
nrounds,
nfold = 5,
eval = "binary_error",
showsd = FALSE)
lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, eval = "binary_error"
, showsd = FALSE
)
# You can also do cross validation with a customized loss function
print("Running cross validation, with customized loss function")
@ -48,9 +54,11 @@ evalerror <- function(preds, dtrain) {
}
# train with customized objective
lgb.cv(params = param,
data = dtrain,
nrounds = nrounds,
obj = logregobj,
eval = evalerror,
nfold = 5)
lgb.cv(
params = param
, data = dtrain
, nrounds = nrounds
, obj = logregobj
, eval = evalerror
, nfold = 5
)
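# For reference, a sketch of the shape such custom functions take (hypothetical names,
# not the demo's exact definitions): an objective returns gradients and hessians, and an
# evaluation function returns a named value with a higher_better flag
my_objective <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}
my_eval <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(labels != (preds > 0.5))
  return(list(name = "error", value = err, higher_better = FALSE))
}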


@ -11,8 +11,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga
# Note: for customized objective function, we leave objective as default
# Note: what we are getting is margin value in prediction
# You must know what you are doing
param <- list(num_leaves = 4,
learning_rate = 1)
param <- list(
num_leaves = 4
, learning_rate = 1
)
valids <- list(eval = dtest)
num_round <- 20
@ -39,10 +41,12 @@ evalerror <- function(preds, dtrain) {
}
print("Start training with early Stopping setting")
bst <- lgb.train(param,
dtrain,
num_round,
valids,
objective = logregobj,
eval = evalerror,
early_stopping_round = 3)
bst <- lgb.train(
param
, dtrain
, num_round
, valids
, objective = logregobj
, eval = evalerror
, early_stopping_round = 3
)
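# When early stopping triggers, the chosen iteration and score are stored on the booster
# (see the early_stopping_rounds documentation); for example:
print(bst$best_iter)
print(bst$best_score)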


@ -26,9 +26,11 @@ gbm <- list()
for (i in 1:1000) {
print(i)
gbm[[i]] <- lgb.train(params = list(objective = "regression"),
data = data,
1,
reset_data = TRUE)
gbm[[i]] <- lgb.train(
params = list(objective = "regression")
, data = data
, 1
, reset_data = TRUE
)
gc(verbose = FALSE)
}


@ -17,32 +17,47 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
# Third, we setup parameters and we train a model
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 0.1,
bagging_fraction = 0.1,
bagging_freq = 1,
bagging_seed = 1)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 0.1
, bagging_fraction = 0.1
, bagging_freq = 1
, bagging_seed = 1
)
# We create a data.frame with the following structure:
# X = average leaf of the observation throughout all trees
# Y = prediction probability (clamped to [1e-15, 1-1e-15])
# Z = logloss
# binned = binned quantile of average leaf
new_data <- data.frame(X = rowMeans(predict(model,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$binned <- .bincode(x = new_data$X,
breaks = quantile(x = new_data$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data <- data.frame(
X = rowMeans(predict(
model
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(model, agaricus.test$data)
, 1e-15
)
, 1 - 1e-15
)
)
new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$binned <- .bincode(
x = new_data$X
, breaks = quantile(
x = new_data$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data$binned[is.na(new_data$binned)] <- 0
new_data$binned <- as.factor(new_data$binned)
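# Why the NA replacement above is needed: the quantile() breaks do not span the full range
# of X, so .bincode() returns NA for values outside the outermost breaks. A tiny check:
x <- 1:10
print(.bincode(x, breaks = quantile(x, probs = (1:9) / 10), right = TRUE, include.lowest = TRUE))
# The smallest and largest values fall outside the breaks and come back as NA,
# which the code above then maps to bin 0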
@ -52,31 +67,64 @@ table(new_data$binned)
# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is smooth!
ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Now, let's show with other parameters
model2 <- lgb.train(params,
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1)
model2 <- lgb.train(
params
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
)
# We create the data structure, but for model2
new_data2 <- data.frame(X = rowMeans(predict(model2,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model2,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$binned <- .bincode(x = new_data2$X,
breaks = quantile(x = new_data2$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data2 <- data.frame(
X = rowMeans(predict(
model2
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(
model2
, agaricus.test$data
)
, 1e-15
)
, 1 - 1e-15
)
)
new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$binned <- .bincode(
x = new_data2$X
, breaks = quantile(
x = new_data2$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data2$binned[is.na(new_data2$binned)] <- 0
new_data2$binned <- as.factor(new_data2$binned)
@ -87,31 +135,64 @@ table(new_data2$binned)
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue
# However, if the rules were not true, the loss would explode.
ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data2
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data2
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Now, try with very severe overfitting
model3 <- lgb.train(params,
dtrain,
1000,
valids,
min_data = 1,
learning_rate = 1)
model3 <- lgb.train(
params
, dtrain
, 1000
, valids
, min_data = 1
, learning_rate = 1
)
# We create the data structure, but for model3
new_data3 <- data.frame(X = rowMeans(predict(model3,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model3,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$binned <- .bincode(x = new_data3$X,
breaks = quantile(x = new_data3$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data3 <- data.frame(
X = rowMeans(predict(
model3
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(
model3
, agaricus.test$data
)
, 1e-15
)
, 1 - 1e-15
)
)
new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$binned <- .bincode(
x = new_data3$X
, breaks = quantile(
x = new_data3$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data3$binned[is.na(new_data3$binned)] <- 0
new_data3$binned <- as.factor(new_data3$binned)
@ -119,9 +200,21 @@ new_data3$binned <- as.factor(new_data3$binned)
table(new_data3$binned)
# We can plot the binned content
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue.
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
# are real thus it is not an issue.
# However, if the rules were not true, the loss would explode. See the sudden spikes?
ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data3
, mapping = aes(x = Y, y = ..count.., fill = binned)
) +
geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Compare with our second model, the difference is severe. This is smooth.
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")

Просмотреть файл

@ -19,29 +19,33 @@ valids <- list(test = dtest)
# Method 1 of training
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
model <- lgb.train(params,
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
model <- lgb.train(
params
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
# We can predict on test data, outputs a 90-length vector
# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
my_preds <- predict(model, test[, 1:4])
# Method 2 of training, identical
model <- lgb.train(list(),
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = "multiclass",
metric = "multi_error",
num_class = 3)
model <- lgb.train(
list()
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = "multiclass"
, metric = "multi_error"
, num_class = 3
)
# We can predict on test data, identical
my_preds <- predict(model, test[, 1:4])
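# Illustration (assumption: reshape = TRUE fills the matrix row by row, matching the
# ordering described above): the same predictions as a 30 x 3 matrix, one column per class
my_preds_mat <- predict(model, test[, 1:4], reshape = TRUE)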

Просмотреть файл

@ -20,17 +20,19 @@ valids <- list(train = dtrain, test = dtest)
# Method 1 of training with built-in multiclass objective
# Note: need to turn off boost from average to match custom objective
# (https://github.com/microsoft/LightGBM/issues/1846)
model_builtin <- lgb.train(list(),
dtrain,
boost_from_average = FALSE,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = "multiclass",
metric = "multi_logloss",
num_class = 3)
model_builtin <- lgb.train(
list()
, dtrain
, boost_from_average = FALSE
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = "multiclass"
, metric = "multi_logloss"
, num_class = 3
)
preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE)
probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
@ -65,21 +67,25 @@ custom_multiclass_metric = function(preds, dtrain) {
preds = preds - apply(preds, 1, max)
prob = exp(preds) / rowSums(exp(preds))
return(list(name = "error",
value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
higher_better = FALSE))
return(list(
name = "error"
, value = -mean(log(prob[cbind(1:length(labels), labels + 1)]))
, higher_better = FALSE
))
}
model_custom <- lgb.train(list(),
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = custom_multiclass_obj,
eval = custom_multiclass_metric,
num_class = 3)
model_custom <- lgb.train(
list()
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = custom_multiclass_obj
, eval = custom_multiclass_metric
, num_class = 3
)
preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE)
probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
@ -87,4 +93,3 @@ probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
# compare predictions
stopifnot(identical(probs_builtin, probs_custom))
stopifnot(identical(preds_builtin, preds_custom))
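# Why the row max is subtracted in the custom objective and metric above: softmax is
# invariant to shifting all scores by a constant, so the subtraction only improves
# numerical stability. A quick check:
p <- c(2, 5, 1)
stopifnot(isTRUE(all.equal(exp(p) / sum(exp(p)), exp(p - max(p)) / sum(exp(p - max(p))))))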


@ -11,8 +11,8 @@ library(lightgbm)
# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
# Setup small weights
weights1 <- rep(1/100000, 6513)
weights2 <- rep(1/100000, 1611)
weights1 <- rep(1 / 100000, 6513)
weights2 <- rep(1 / 100000, 1611)
# Load data and create datasets
data(agaricus.train, package = "lightgbm")
@ -26,40 +26,48 @@ valids <- list(test = dtest)
# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
# It cannot learn because regularization is too large!
# min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 10,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(weight_loss) # Shows how poor the learning was: a straight line!
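# The arithmetic behind the comment above: the total hessian available cannot exceed the
# sum of the weights, which is far below min_sum_hessian = 10, so no split is accepted
print(sum(weights1))  # 6513 * 1e-05 = 0.06513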
# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
# Adjusted regularization simply consists of multiplying results by 1e4 (x10000)
# Notice how it learns, there is no issue as we adjusted regularization ourselves
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 1e-4,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 1e-4
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(small_weight_loss) # It learns!
@ -78,24 +86,28 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
valids <- list(test = dtest)
# Setup parameters and run model...
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 10,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(large_weight_loss) # It learns!
# Do you want to compare the learning? They both converge.
plot(small_weight_loss, large_weight_loss)
curve(1*x, from = 0, to = 0.02, add = TRUE)
curve(1 * x, from = 0, to = 0.02, add = TRUE)


@ -12,9 +12,9 @@ getinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}
\item{...}{other parameters}
\item{name}{the name of the information field to get (see details)}
\item{...}{other parameters}
}
\value{
info data


@ -4,9 +4,16 @@
\alias{lgb.Dataset}
\title{Construct \code{lgb.Dataset} object}
\usage{
lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
categorical_feature = NULL, free_raw_data = TRUE, info = list(),
...)
lgb.Dataset(
data,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
free_raw_data = TRUE,
info = list(),
...
)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}


@ -4,13 +4,29 @@
\alias{lgb.cv}
\title{Main CV logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3,
label = NULL, weight = NULL, obj = NULL, eval = NULL,
verbose = 1, record = TRUE, eval_freq = 1L, showsd = TRUE,
stratified = TRUE, folds = NULL, init_model = NULL,
colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(),
reset_data = FALSE, ...)
lgb.cv(
params = list(),
data,
nrounds = 10,
nfold = 3,
label = NULL,
weight = NULL,
obj = NULL,
eval = NULL,
verbose = 1,
record = TRUE,
eval_freq = 1L,
showsd = TRUE,
stratified = TRUE,
folds = NULL,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(),
reset_data = FALSE,
...
)
}
\arguments{
\item{params}{List of parameters}
@ -27,7 +43,7 @@ lgb.cv(params = list(), data, nrounds = 10, nfold = 3,
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (list of) character or custom eval function}
@ -54,17 +70,15 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
type int represents index,
type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{callbacks}{List of callback functions that are applied at each iteration.}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
into a predictor model which frees up memory and the original datasets}
\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{
@ -89,11 +103,13 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
dtrain,
10,
nfold = 3,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.cv(
params = params
, data = dtrain
, nrounds = 10
, nfold = 3
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
}


@ -27,13 +27,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
json_model <- lgb.dump(model)
}


@ -4,8 +4,13 @@
\alias{lgb.get.eval.result}
\title{Get record evaluation result from booster}
\usage{
lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
is_err = FALSE)
lgb.get.eval.result(
booster,
data_name,
eval_name,
iters = NULL,
is_err = FALSE
)
}
\arguments{
\item{booster}{Object of class \code{lgb.Booster}}
@ -34,12 +39,14 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
lgb.get.eval.result(model, "test", "l2")
}


@ -29,9 +29,14 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "binary",
learning_rate = 0.01, num_leaves = 63, max_depth = -1,
min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
objective = "binary"
, learning_rate = 0.01
, num_leaves = 63
, max_depth = -1
, min_data_in_leaf = 1
, min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)
tree_imp1 <- lgb.importance(model, percentage = TRUE)
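# The importance table can be plotted directly (a minimal sketch using
# lgb.plot.importance from this package):
lgb.plot.importance(tree_imp1, top_n = 5, measure = "Gain")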


@ -11,7 +11,7 @@ lgb.interprete(model, data, idxset, num_iteration = NULL)
\item{data}{a matrix object or a dgCMatrix object.}
\item{idxset}{a integer vector of indices of rows needed.}
\item{idxset}{an integer vector of indices of rows needed.}
\item{num_iteration}{number of iteration want to predict with, NULL or <= 0 means use best iteration.}
}


@ -29,13 +29,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
lgb.save(model, "model.txt")
load_booster <- lgb.load(filename = "model.txt")
model_string <- model$save_model_to_string(NULL) # saves best iteration


@ -44,9 +44,14 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "binary",
learning_rate = 0.01, num_leaves = 63, max_depth = -1,
min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
objective = "binary"
, learning_rate = 0.01
, num_leaves = 63
, max_depth = -1
, min_data_in_leaf = 1
, min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)
tree_dt <- lgb.model.dt.tree(model)


@ -4,8 +4,13 @@
\alias{lgb.plot.importance}
\title{Plot feature importance as a bar graph}
\usage{
lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain",
left_margin = 10, cex = NULL)
lgb.plot.importance(
tree_imp,
top_n = 10,
measure = "Gain",
left_margin = 10,
cex = NULL
)
}
\arguments{
\item{tree_imp}{a \code{data.table} returned by \code{\link{lgb.importance}}.}


@ -4,8 +4,13 @@
\alias{lgb.plot.interpretation}
\title{Plot feature contribution as a bar graph}
\usage{
lgb.plot.interpretation(tree_interpretation_dt, top_n = 10, cols = 1,
left_margin = 10, cex = NULL)
lgb.plot.interpretation(
tree_interpretation_dt,
top_n = 10,
cols = 1,
left_margin = 10,
cex = NULL
)
}
\arguments{
\item{tree_interpretation_dt}{a \code{data.table} returned by \code{\link{lgb.interprete}}.}
@ -25,8 +30,8 @@ The \code{lgb.plot.interpretation} function creates a \code{barplot}.
Plot previously calculated feature contribution as a bar graph.
}
\details{
The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
Features are shown ranked in a decreasing contribution order.
The graph represents each feature as a horizontal bar of length proportional to the defined
contribution of a feature. Features are shown ranked in a decreasing contribution order.
}
\examples{
library(lightgbm)
@ -39,9 +44,14 @@ setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label))
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
params <- list(objective = "binary",
learning_rate = 0.01, num_leaves = 63, max_depth = -1,
min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
objective = "binary"
, learning_rate = 0.01
, num_leaves = 63
, max_depth = -1
, min_data_in_leaf = 1
, min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)
tree_interpretation <- lgb.interprete(model, test$data, 1:5)


@ -10,10 +10,13 @@ lgb.prepare(data)
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric without integers. Please use
\code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
}
\examples{
library(lightgbm)


@ -10,10 +10,16 @@ lgb.prepare2(data)
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
Note that there are programs which do not support integer-only input. Consider this as a half
memory technique which is dangerous, especially for LightGBM.
}
\examples{
library(lightgbm)


@ -12,10 +12,14 @@ lgb.prepare_rules(data, rules = NULL)
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input
in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric. In addition, keeps rules created
so you can convert other datasets using this converter.
}
\examples{
library(lightgbm)


@ -12,43 +12,15 @@ lgb.prepare_rules2(data, rules = NULL)
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input in
\code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
}
\examples{
library(lightgbm)
data(iris)
str(iris)
new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
str(new_iris$data)
data(iris) # Erase iris dataset
iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
# Use conversion using known rules
# Unknown factors become 0, excellent for sparse datasets
newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
# Unknown factor is now zero, perfect for sparse datasets
newer_iris$data[1, ] # Species became 0 as it is an unknown factor
newer_iris$data[1, 5] <- 1 # Put back real initial value
# Is the newly created dataset equal? YES!
all.equal(new_iris$data, newer_iris$data)
# Can we test our own rules?
data(iris) # Erase iris dataset
# We remapped values differently
personal_rules <- list(Species = c("setosa" = 3L,
"versicolor" = 2L,
"virginica" = 1L))
newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
str(newest_iris$data) # SUCCESS!
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
In addition, keeps rules created so you can convert other datasets using this converter.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
Note that there are programs which do not support integer-only input.
Consider this as a half memory technique which is dangerous, especially for LightGBM.
}


@ -29,13 +29,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
lgb.save(model, "model.txt")
}


@ -4,11 +4,24 @@
\alias{lgb.train}
\title{Main training logic for LightGBM}
\usage{
lgb.train(params = list(), data, nrounds = 10, valids = list(),
obj = NULL, eval = NULL, verbose = 1, record = TRUE,
eval_freq = 1L, init_model = NULL, colnames = NULL,
categorical_feature = NULL, early_stopping_rounds = NULL,
callbacks = list(), reset_data = FALSE, ...)
lgb.train(
params = list(),
data,
nrounds = 10,
valids = list(),
obj = NULL,
eval = NULL,
verbose = 1,
record = TRUE,
eval_freq = 1L,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(),
reset_data = FALSE,
...
)
}
\arguments{
\item{params}{List of parameters}
@ -39,17 +52,16 @@ lgb.train(params = list(), data, nrounds = 10, valids = list(),
type int represents index,
type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{callbacks}{List of callback functions that are applied at each iteration.}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the
booster model into a predictor model which frees up memory and the
original datasets}
\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{
@ -78,11 +90,13 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
}


@ -7,11 +7,15 @@
lgb.unloader(restore = TRUE, wipe = FALSE, envir = .GlobalEnv)
}
\arguments{
\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.}
\item{restore}{Whether to reload \code{LightGBM} immediately after detaching from R.
Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
unloading is performed.}
\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.}
\item{wipe}{Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
environment. Defaults to \code{FALSE} which means to not remove them.}
\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.}
\item{envir}{The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
\code{.GlobalEnv} which is the global environment.}
}
\value{
NULL invisibly.
@ -29,13 +33,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
\dontrun{
lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)


@ -9,12 +9,10 @@ List of callback functions that are applied at each iteration.}
\item{data}{a \code{lgb.Dataset} object, used for training}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}


@ -4,10 +4,20 @@
\alias{lightgbm}
\title{Train a LightGBM model}
\usage{
lightgbm(data, label = NULL, weight = NULL, params = list(),
nrounds = 10, verbose = 1, eval_freq = 1L,
early_stopping_rounds = NULL, save_name = "lightgbm.model",
init_model = NULL, callbacks = list(), ...)
lightgbm(
data,
label = NULL,
weight = NULL,
params = list(),
nrounds = 10,
verbose = 1,
eval_freq = 1L,
early_stopping_rounds = NULL,
save_name = "lightgbm.model",
init_model = NULL,
callbacks = list(),
...
)
}
\arguments{
\item{data}{a \code{lgb.Dataset} object, used for training}
@ -24,12 +34,10 @@ lightgbm(data, label = NULL, weight = NULL, params = list(),
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{save_name}{File name to use when writing the trained model to disk. Should end in ".model".}


@ -4,9 +4,17 @@
\alias{predict.lgb.Booster}
\title{Predict method for LightGBM model}
\usage{
\method{predict}{lgb.Booster}(object, data, num_iteration = NULL,
rawscore = FALSE, predleaf = FALSE, predcontrib = FALSE,
header = FALSE, reshape = FALSE, ...)
\method{predict}{lgb.Booster}(
object,
data,
num_iteration = NULL,
rawscore = FALSE,
predleaf = FALSE,
predcontrib = FALSE,
header = FALSE,
reshape = FALSE,
...
)
}
\arguments{
\item{object}{Object of class \code{lgb.Booster}}
@ -16,8 +24,8 @@
\item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration}
\item{rawscore}{whether the prediction should be returned in the form of original untransformed
sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
logistic regression would result in predictions for log-odds instead of probabilities.}
sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
for logistic regression would result in predictions for log-odds instead of probabilities.}
\item{predleaf}{whether predict leaf index instead.}
@ -53,13 +61,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
preds <- predict(model, test$data)
}
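To make the rawscore note above concrete, here is a sketch assuming a booster trained with a binary (logistic) objective, call it bst, rather than the regression model in this example: the raw scores are log-odds, so the logistic transform should recover the default probability predictions.

preds_prob <- predict(bst, test$data)                   # probabilities (default)
preds_raw  <- predict(bst, test$data, rawscore = TRUE)  # untransformed log-odds
# for a binary objective, sigmoid(raw score) should match the probabilities
all.equal(preds_prob, 1 / (1 + exp(-preds_raw)))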

Просмотреть файл

@ -27,13 +27,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
saveRDS.lgb.Booster(model, "model.rds")
new_model <- readRDS.lgb.Booster("model.rds")

Просмотреть файл

@ -4,19 +4,31 @@
\alias{saveRDS.lgb.Booster}
\title{saveRDS for \code{lgb.Booster} models}
\usage{
saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
compress = TRUE, refhook = NULL, raw = TRUE)
saveRDS.lgb.Booster(
object,
file = "",
ascii = FALSE,
version = NULL,
compress = TRUE,
refhook = NULL,
raw = TRUE
)
}
\arguments{
\item{object}{R object to serialize.}
\item{file}{a connection or the name of the file where the R object is saved to or read from.}
\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.}
\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default),
a binary one is used. See the comments in the help for save.}
\item{version}{the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.}
\item{version}{the workspace format version to use. \code{NULL} specifies the current default
version (2). Versions prior to 2 are not supported, so this will only be relevant
when there are later versions.}
\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.}
\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression,
or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of
compression to be used. Ignored if file is a connection.}
\item{refhook}{a hook function for handling reference objects.}
@ -26,7 +38,8 @@ saveRDS.lgb.Booster(object, file = "", ascii = FALSE, version = NULL,
NULL invisibly.
}
\description{
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides
whether to save the raw model or not.
}
\examples{
library(lightgbm)
@ -39,10 +52,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(
params
, dtrain
, 10
, valids
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
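A short sketch of how the arguments documented above might be combined when saving and reloading a booster (the compress value here is purely illustrative):

saveRDS.lgb.Booster(
    object = model
    , file = "model.rds"
    , compress = "xz"   # any of TRUE, "gzip", "bzip2", "xz", per the argument above
    , raw = TRUE        # also store the raw model inside the RDS file
)
model2 <- readRDS.lgb.Booster("model.rds")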

Просмотреть файл

@ -12,11 +12,11 @@ setinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}
\item{...}{other parameters}
\item{name}{the name of the field to get}
\item{info}{the specific field of information to set}
\item{...}{other parameters}
}
\value{
passed object
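As a sketch of the setinfo() generic documented here (assuming the agaricus data and that "label" is a settable field, as the package tests suggest):

library(lightgbm)
data(agaricus.train, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
lgb.Dataset.construct(dtrain)
# overwrite the label field, then read it back
setinfo(dtrain, "label", 1 - agaricus.train$label)
all.equal(getinfo(dtrain, "label"), 1 - agaricus.train$label)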

Просмотреть файл

@ -12,9 +12,9 @@ slice(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}
\item{...}{other parameters (currently not used)}
\item{idxset}{an integer vector of indices of rows needed}
\item{idxset}{an integer vector of indices of rows needed}
\item{...}{other parameters (currently not used)}
}
\value{
constructed sub dataset
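And a matching sketch for slice(), reusing the dtrain object from the setinfo sketch above and mirroring the call already made in the package tests:

dsub <- slice(dtrain, seq_len(42))   # keep only the first 42 rows
lgb.Dataset.construct(dsub)
dim(dsub)   # expected: 42 rows, same number of columns as dtrain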

Просмотреть файл

@ -8,7 +8,7 @@ if (.Machine$sizeof.pointer != 8){
}
R_int_UUID <- .Internal(internalsID())
R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor)/10
R_ver <- as.double(R.Version()$major) + as.double(R.Version()$minor) / 10
if (!(R_int_UUID == "0310d4b8-ccb1-4bb8-ba94-d36a55f60262"
|| R_int_UUID == "2fdf6c18-697a-4ba7-b8ef-11c0d92f1327")){
@ -74,7 +74,7 @@ if (!use_precompile) {
try_vs <- 0
local_vs_def <- ""
vs_versions <- c("Visual Studio 16 2019", "Visual Studio 15 2017", "Visual Studio 14 2015")
for(vs in vs_versions){
for (vs in vs_versions){
vs_def <- paste0(" -G \"", vs, "\" -A x64")
tmp_cmake_cmd <- paste0(cmake_cmd, vs_def)
try_vs <- system(paste0(tmp_cmake_cmd, " .."))
@ -106,14 +106,29 @@ if (!use_precompile) {
# Has precompiled package
lib_folder <- file.path(R_PACKAGE_SOURCE, "../", fsep = "/")
if (file.exists(file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/"))) {
src <- file.path(lib_folder, paste0("lib_lightgbm", SHLIB_EXT), fsep = "/")
} else if (file.exists(file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/"))) {
src <- file.path(lib_folder, paste0("Release/lib_lightgbm", SHLIB_EXT), fsep = "/")
shared_object_file <- file.path(
lib_folder
, paste0("lib_lightgbm", SHLIB_EXT)
, fsep = "/"
)
release_file <- file.path(
lib_folder
, paste0("Release/lib_lightgbm", SHLIB_EXT)
, fsep = "/"
)
windows_shared_object_file <- file.path(
lib_folder
, paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT)
, fsep = "/"
)
if (file.exists(shared_object_file)) {
src <- shared_object_file
} else if (file.exists(release_file)) {
src <- release_file
} else {
src <- file.path(lib_folder, paste0("/windows/x64/DLL/lib_lightgbm", SHLIB_EXT), fsep = "/") # Expected result: installation will fail if it is not here or any other
# Expected result: installation will fail if the library is not found here or in any other expected location
src <- windows_shared_object_file
}
}
# Check installation correctness

Просмотреть файл

@ -1,7 +1,7 @@
context("basic functions")
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
train <- agaricus.train
test <- agaricus.test
@ -9,8 +9,14 @@ windows_flag = grepl('Windows', Sys.info()[['sysname']])
test_that("train and predict binary classification", {
nrounds = 10
bst <- lightgbm(data = train$data, label = train$label, num_leaves = 5,
nrounds = nrounds, objective = "binary", metric="binary_error")
bst <- lightgbm(
data = train$data
, label = train$label
, num_leaves = 5
, nrounds = nrounds
, objective = "binary"
, metric = "binary_error"
)
expect_false(is.null(bst$record_evals))
record_results <- lgb.get.eval.result(bst, "train", "binary_error")
expect_lt(min(record_results), 0.02)
@ -20,7 +26,7 @@ test_that("train and predict binary classification", {
pred1 <- predict(bst, train$data, num_iteration = 1)
expect_equal(length(pred1), 6513)
err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
err_pred1 <- sum( (pred1 > 0.5) != train$label) / length(train$label)
err_log <- record_results[1]
expect_lt(abs(err_pred1 - err_log), 10e-6)
})
@ -29,9 +35,18 @@ test_that("train and predict binary classification", {
test_that("train and predict softmax", {
lb <- as.numeric(iris$Species) - 1
bst <- lightgbm(data = as.matrix(iris[, -5]), label = lb,
num_leaves = 4, learning_rate = 0.1, nrounds = 20, min_data=20, min_hess=20,
objective = "multiclass", metric="multi_error", num_class=3)
bst <- lightgbm(
data = as.matrix(iris[, -5])
, label = lb
, num_leaves = 4
, learning_rate = 0.1
, nrounds = 20
, min_data = 20
, min_hess = 20
, objective = "multiclass"
, metric = "multi_error"
, num_class = 3
)
expect_false(is.null(bst$record_evals))
record_results <- lgb.get.eval.result(bst, "train", "multi_error")
@ -43,18 +58,33 @@ test_that("train and predict softmax", {
test_that("use of multiple eval metrics works", {
bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4,
learning_rate=1, nrounds = 10, objective = "binary",
metric = list("binary_error","auc","binary_logloss") )
bst <- lightgbm(
data = train$data
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 10
, objective = "binary"
, metric = list("binary_error","auc","binary_logloss")
)
expect_false(is.null(bst$record_evals))
})
test_that("training continuation works", {
testthat::skip("This test is currently broken. See issue #2468 for details.")
dtrain <- lgb.Dataset(train$data, label = train$label, free_raw_data=FALSE)
watchlist = list(train=dtrain)
param <- list(objective = "binary", metric="binary_logloss", num_leaves = 5, learning_rate = 1)
dtrain <- lgb.Dataset(
train$data
, label = train$label
, free_raw_data = FALSE
)
watchlist = list(train = dtrain)
param <- list(
objective = "binary"
, metric = "binary_logloss"
, num_leaves = 5
, learning_rate = 1
)
# for the reference, use 10 iterations at once:
bst <- lgb.train(param, dtrain, nrounds = 10, watchlist)
@ -75,8 +105,16 @@ test_that("training continuation works", {
test_that("cv works", {
dtrain <- lgb.Dataset(train$data, label=train$label)
params <- list(objective="regression", metric="l2,l1")
bst <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2,l1")
bst <- lgb.cv(
params
, dtrain
, 10
, nfold = 5
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
expect_false(is.null(bst$record_evals))
})

Просмотреть файл

@ -1,7 +1,7 @@
context('Test models with custom objective')
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
@ -17,11 +17,19 @@ logregobj <- function(preds, dtrain) {
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(name = "error", value = err, higher_better=FALSE))
return(list(
name = "error"
, value = err
, higher_better = FALSE
))
}
param <- list(num_leaves=8, learning_rate=1,
objective=logregobj, metric="auc")
param <- list(
num_leaves = 8
, learning_rate = 1
, objective = logregobj
, metric = "auc"
)
num_round <- 10
test_that("custom objective works", {

Просмотреть файл

@ -3,15 +3,15 @@ require(Matrix)
context("testing lgb.Dataset functionality")
data(agaricus.test, package='lightgbm')
data(agaricus.test, package = 'lightgbm')
test_data <- agaricus.test$data[1:100,]
test_label <- agaricus.test$label[1:100]
test_that("lgb.Dataset: basic construction, saving, loading", {
# from sparse matrix
dtest1 <- lgb.Dataset(test_data, label=test_label)
dtest1 <- lgb.Dataset(test_data, label = test_label)
# from dense matrix
dtest2 <- lgb.Dataset(as.matrix(test_data), label=test_label)
dtest2 <- lgb.Dataset(as.matrix(test_data), label = test_label)
expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label'))
# save to a local file
@ -40,7 +40,7 @@ test_that("lgb.Dataset: getinfo & setinfo", {
})
test_that("lgb.Dataset: slice, dim", {
dtest <- lgb.Dataset(test_data, label=test_label)
dtest <- lgb.Dataset(test_data, label = test_label)
lgb.Dataset.construct(dtest)
expect_equal(dim(dtest), dim(test_data))
dsub1 <- slice(dtest, 1:42)
@ -50,7 +50,7 @@ test_that("lgb.Dataset: slice, dim", {
})
test_that("lgb.Dataset: colnames", {
dtest <- lgb.Dataset(test_data, label=test_label)
dtest <- lgb.Dataset(test_data, label = test_label)
expect_equal(colnames(dtest), colnames(test_data))
lgb.Dataset.construct(dtest)
expect_equal(colnames(dtest), colnames(test_data))
@ -62,7 +62,7 @@ test_that("lgb.Dataset: colnames", {
test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
nr <- 1000
x <- Matrix::rsparsematrix(nr, 100, density=0.0005)
x <- Matrix::rsparsematrix(nr, 100, density = 0.0005)
# we want it very sparse, so that last rows are empty
expect_lt(max(x@i), nr)
dtest <- lgb.Dataset(x)
@ -70,15 +70,17 @@ test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
})
test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", {
rawData <- matrix(runif(1000),ncol=10)
rawData <- matrix(runif(1000), ncol = 10)
handle <- NA_real_
ref_handle <- NULL
handle <- lightgbm:::lgb.call("LGBM_DatasetCreateFromMat_R"
, ret = handle
, rawData
, nrow(rawData)
, ncol(rawData)
, lightgbm:::lgb.params2str(params=list())
, ref_handle)
handle <- lightgbm:::lgb.call(
"LGBM_DatasetCreateFromMat_R"
, ret = handle
, rawData
, nrow(rawData)
, ncol(rawData)
, lightgbm:::lgb.params2str(params = list())
, ref_handle
)
expect_false(is.na(handle))
})

Просмотреть файл

@ -1,5 +1,8 @@
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
context("feature penalties")
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
train <- agaricus.train
test <- agaricus.test
@ -12,15 +15,15 @@ test_that("Feature penalties work properly", {
feature_penalties <- rep(1, ncol(train$data))
feature_penalties[var_index] <- x
lightgbm(
data = train$data,
label = train$label,
num_leaves = 5,
learning_rate = 0.05,
nrounds = 20,
objective = "binary",
feature_penalty = paste0(feature_penalties, collapse = ","),
metric="binary_error",
verbose = -1
data = train$data
, label = train$label
, num_leaves = 5
, learning_rate = 0.05
, nrounds = 20
, objective = "binary"
, feature_penalty = paste0(feature_penalties, collapse = ",")
, metric = "binary_error"
, verbose = -1
)
})