[R-package] added support for first_metric_only (fixes #2368) (#2912)

* [R-package] started implementing first_metric_only

* trying stuff

* more changes

* fixed handling of multiple metrics

* fixed tests

* remove duplicate tests

* get training tests

* fixes for lgb.cv()

* fixes for lgb.cv()

* fix linting
James Lamb authored 2020-09-06 02:41:37 +01:00, committed by GitHub
Parent 636e4eee0f
Commit d4325c5a44
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 870 additions and 39 deletions

View File

@@ -108,3 +108,17 @@
)
return(c(learning_params, .DATASET_PARAMETERS()))
}
# [description]
# Per https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#metric,
# a few different strings can be used to indicate "no metrics".
# [returns]
# A character vector
.NO_METRIC_STRINGS <- function() {
return(c(
"na"
, "None"
, "null"
, "custom"
))
}
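
As an illustration (not part of this diff), the snippet below shows how a metric list could be screened against these strings with base R alone; the variable names are hypothetical.
no_metric_strings <- c("na", "None", "null", "custom")
requested_metrics <- list("None", "rmse", "l1")
kept_metrics <- Filter(
  f = function(metric) !(metric %in% no_metric_strings)
  , x = requested_metrics
)
print(kept_metrics)  # only "rmse" and "l1" remain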

View File

@@ -268,7 +268,7 @@ cb.record.evaluation <- function() {
}
cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
cb.early.stop <- function(stopping_rounds, first_metric_only = FALSE, verbose = TRUE) {
# Initialize variables
factor_to_bigger_better <- NULL
@@ -325,8 +325,16 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
# Store iteration
cur_iter <- env$iteration
# By default, any metric can trigger early stopping. This can be disabled
# with 'first_metric_only = TRUE'
if (isTRUE(first_metric_only)) {
evals_to_check <- 1L
} else {
evals_to_check <- seq_len(eval_len)
}
# Loop through evaluation
for (i in seq_len(eval_len)) {
for (i in evals_to_check) {
# Store score
score <- env$eval_list[[i]]$value * factor_to_bigger_better[i]
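
Standalone sketch of the new behavior (illustrative only, not LightGBM internals): with three metrics recorded per iteration, first_metric_only = TRUE narrows the indices the callback loops over to just the first metric.
eval_len <- 3L
for (first_metric_only in c(FALSE, TRUE)) {
  if (isTRUE(first_metric_only)) {
    evals_to_check <- 1L
  } else {
    evals_to_check <- seq_len(eval_len)
  }
  # FALSE -> 1 2 3 (any metric can trigger early stopping)
  # TRUE  -> 1     (only the first metric is checked)
  cat("first_metric_only =", first_metric_only, "->", evals_to_check, "\n")
}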

View File

@@ -24,10 +24,6 @@ CVBooster <- R6::R6Class(
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight vector of response values. If not NULL, will set to dataset
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
@@ -52,7 +48,7 @@ CVBooster <- R6::R6Class(
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#'
#' @inheritSection lgb_shared_params Early Stopping
#' @return a trained model \code{lgb.CVBooster}.
#'
#' @examples
@@ -114,7 +110,7 @@ lgb.cv <- function(params = list()
params <- lgb.check.obj(params, obj)
params <- lgb.check.eval(params, eval)
fobj <- NULL
feval <- NULL
eval_functions <- list(NULL)
# Check for objective (function or not)
if (is.function(params$objective)) {
@@ -122,9 +118,17 @@ lgb.cv <- function(params = list()
params$objective <- "NONE"
}
# Check for loss (function or not)
# If loss is a single function, store it as a 1-element list
# (for backwards compatibility). If it is a list of functions, store
# all of them
if (is.function(eval)) {
feval <- eval
eval_functions <- list(eval)
}
if (methods::is(eval, "list")) {
eval_functions <- Filter(
f = is.function
, x = eval
)
}
# Init predictor to empty
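
A hedged sketch of the normalization above, with my_metric as a hypothetical custom metric (not defined in this PR): a bare function and a mixed list both reduce to a plain list of functions, while character entries are handled separately through params$metric.
my_metric <- function(preds, dtrain) {
  list(name = "my_metric", value = 0.0, higher_better = FALSE)
}
normalize_eval <- function(eval) {
  if (is.function(eval)) {
    return(list(eval))
  }
  if (methods::is(eval, "list")) {
    return(Filter(f = is.function, x = eval))
  }
  return(list(NULL))
}
length(normalize_eval(my_metric))                # 1
length(normalize_eval(list("rmse", my_metric)))  # 1 (the string goes through lgb.check.eval instead)
length(normalize_eval("rmse"))                   # 1, a list holding NULL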
@@ -266,6 +270,7 @@ lgb.cv <- function(params = list()
callbacks
, cb.early.stop(
stopping_rounds = early_stopping_rounds
, first_metric_only = isTRUE(params[["first_metric_only"]])
, verbose = verbose
)
)
@@ -357,7 +362,11 @@ lgb.cv <- function(params = list()
# Update one boosting iteration
msg <- lapply(cv_booster$boosters, function(fd) {
fd$booster$update(fobj = fobj)
fd$booster$eval_valid(feval = feval)
out <- list()
for (eval_function in eval_functions) {
out <- append(out, fd$booster$eval_valid(feval = eval_function))
}
return(out)
})
# Prepare collection of evaluation results
@@ -384,7 +393,13 @@ lgb.cv <- function(params = list()
# When early stopping is not activated, we compute the best iteration / score ourselves
# based on the first metric
if (record && is.na(env$best_score)) {
first_metric <- cv_booster$boosters[[1L]][[1L]]$.__enclos_env__$private$eval_names[1L]
# when using a custom eval function, the metric name is returned from the
# function, so figure it out from record_evals
if (!is.null(eval_functions[1L])) {
first_metric <- names(cv_booster$record_evals[["valid"]])[1L]
} else {
first_metric <- cv_booster$.__enclos_env__$private$eval_names[1L]
}
.find_best <- which.min
if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) {
.find_best <- which.max
@@ -576,7 +591,8 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
msg[[i]][[j]]$value }))
})
# Get evaluation
# Get evaluation. Just taking the first element here to
# get structure (name, higher_better, data_name)
ret_eval <- msg[[1L]]
# Go through evaluation length items
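
For orientation (an editor's mock, not the code from this PR): merging cross-validation results boils down to averaging each metric's value across the per-fold messages, which is also why the expected values in the new lgb.cv() tests below are fold-averaged.
fold_msgs <- list(
  list(list(name = "l2", value = 1.0), list(name = "l1", value = 0.5))
  , list(list(name = "l2", value = 2.0), list(name = "l1", value = 0.7))
)
merged <- lapply(seq_along(fold_msgs[[1L]]), function(j) {
  vals <- vapply(fold_msgs, function(msg) msg[[j]]$value, numeric(1L))
  list(name = fold_msgs[[1L]][[j]]$name, value = mean(vals))
})
merged[[1L]]$value  # 1.5
merged[[2L]]$value  # 0.6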

View File

@@ -3,10 +3,6 @@
#' @description Logic to train with LightGBM
#' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (a list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
@@ -26,6 +22,7 @@
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#' @inheritSection lgb_shared_params Early Stopping
#' @return a trained booster model \code{lgb.Booster}.
#'
#' @examples
@@ -90,7 +87,7 @@ lgb.train <- function(params = list(),
params <- lgb.check.obj(params, obj)
params <- lgb.check.eval(params, eval)
fobj <- NULL
feval <- NULL
eval_functions <- list(NULL)
# Check for objective (function or not)
if (is.function(params$objective)) {
@@ -98,9 +95,17 @@ lgb.train <- function(params = list(),
params$objective <- "NONE"
}
# Check for loss (function or not)
# If loss is a single function, store it as a 1-element list
# (for backwards compatibility). If it is a list of functions, store
# all of them
if (is.function(eval)) {
feval <- eval
eval_functions <- list(eval)
}
if (methods::is(eval, "list")) {
eval_functions <- Filter(
f = is.function
, x = eval
)
}
# Init predictor to empty
@@ -235,6 +240,7 @@ lgb.train <- function(params = list(),
callbacks
, cb.early.stop(
stopping_rounds = early_stopping_rounds
, first_metric_only = isTRUE(params[["first_metric_only"]])
, verbose = verbose
)
)
@@ -280,13 +286,28 @@ lgb.train <- function(params = list(),
# Collection: Has validation dataset?
if (length(valids) > 0L) {
# Validation has training dataset?
if (valid_contain_train) {
eval_list <- append(eval_list, booster$eval_train(feval = feval))
# Get evaluation results with passed-in functions
for (eval_function in eval_functions) {
# Validation has training dataset?
if (valid_contain_train) {
eval_list <- append(eval_list, booster$eval_train(feval = eval_function))
}
eval_list <- append(eval_list, booster$eval_valid(feval = eval_function))
}
# Calling booster$eval_valid() will get
# evaluation results with the metrics in params$metric by calling LGBM_BoosterGetEval_R,
# so we need to be sure that gets called, which it wouldn't be above if no functions
# were passed in
if (length(eval_functions) == 0L) {
if (valid_contain_train) {
eval_list <- append(eval_list, booster$eval_train(feval = eval_function))
}
eval_list <- append(eval_list, booster$eval_valid(feval = eval_function))
}
# Has no validation dataset
eval_list <- append(eval_list, booster$eval_valid(feval = feval))
}
# Write evaluation result in environment
@@ -312,7 +333,7 @@ lgb.train <- function(params = list(),
# when using a custom eval function, the metric name is returned from the
# function, so figure it out from record_evals
if (!is.null(feval)) {
if (!is.null(eval_functions[1L])) {
first_metric <- names(booster$record_evals[[first_valid_name]])[1L]
} else {
first_metric <- booster$.__enclos_env__$private$eval_names[1L]
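
For illustration (an editor's mock, not code from this PR), this is the record_evals shape that the lookup above relies on when a custom eval function supplies the metric name:
record_evals <- list(
  valid1 = list(
    constant_metric = list(eval = list(0.2, 0.2, 0.2))
    , increasing_metric = list(eval = list(0.1, 0.2, 0.3))
  )
)
first_valid_name <- names(record_evals)[1L]
first_metric <- names(record_evals[[first_valid_name]])[1L]
print(first_metric)  # "constant_metric"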

View File

@@ -10,11 +10,61 @@
#' and one metric. If there's more than one, will check all of them
#' except the training data. Returns the model with (best_iter + early_stopping_rounds).
#' If early stopping occurs, the model will have 'best_iter' field.
#' @param eval evaluation function(s). This can be a character vector, function, or list with a mixture of
#' strings and functions.
#'
#' \itemize{
#' \item{\bold{a. character vector}:
#' If you provide a character vector to this argument, it should contain strings with valid
#' evaluation metrics.
#' See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{
#' The "metric" section of the documentation}
#' for a list of valid metrics.
#' }
#' \item{\bold{b. function}:
#' You can provide a custom evaluation function. This
#' should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named
#' list with three elements:
#' \itemize{
#' \item{\code{name}: A string with the name of the metric, used for printing
#' and storing results.
#' }
#' \item{\code{value}: A single number indicating the value of the metric for the
#' given predictions and true values
#' }
#' \item{
#' \code{higher_better}: A boolean indicating whether higher values indicate a better fit.
#' For example, this would be \code{FALSE} for metrics like MAE or RMSE.
#' }
#' }
#' }
#' \item{\bold{c. list}:
#' If a list is given, it should only contain character vectors and functions.
#' These should follow the requirements from the descriptions above.
#' }
#' }
#' @param eval_freq evaluation output frequency, only effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds
#' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param params List of parameters
#' @param verbose verbosity for output, if <= 0, also will disable the print of evaluation during training
#' @section Early Stopping:
#'
#' "early stopping" refers to stopping the training process if the model's performance on a given
#' validation set does not improve for several consecutive iterations.
#'
#' If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
#' early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
#' metrics will be considered for early stopping.
#'
#' If you want to only consider the first metric for early stopping, pass
#' \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
#' in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
#' a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
#' or \code{objective} (passed into \code{params}).
#' @keywords internal
NULL
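
A usage sketch of the behavior documented in the Early Stopping section above (hedged: the dataset setup mirrors the package examples, and no particular metric values are implied). Because "binary_logloss" comes from params$metric, it is treated as the first metric, so with first_metric_only = TRUE only it can trigger early stopping, while "auc" passed through eval is still recorded.
library(lightgbm)
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dvalid <- lgb.Dataset.create.valid(dtrain, agaricus.test$data, label = agaricus.test$label)
bst <- lgb.train(
  params = list(
    objective = "binary"
    , metric = "binary_logloss"
    , early_stopping_rounds = 3L
    , first_metric_only = TRUE
  )
  , data = dtrain
  , nrounds = 10L
  , valids = list(valid1 = dvalid)
  , eval = "auc"
)
print(names(bst$record_evals[["valid1"]]))  # both metrics are recorded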
@@ -47,6 +97,7 @@ NULL
#' the number of real CPU cores, not the number of threads (most
#' CPU using hyper-threading to generate 2 threads per CPU core).}
#' }
#' @inheritSection lgb_shared_params Early Stopping
#' @export
lightgbm <- function(data,
label = NULL,

View File

@@ -318,10 +318,10 @@ lgb.check.obj <- function(params, obj) {
}
# [description]
# make sure that "metric" is populated on params,
# and add any eval values to it
# [return]
# params, where "metric" is a list
# Take any character values from eval and store them in params$metric.
# This has to account for the fact that `eval` could be a character vector,
# a function, a list of functions, or a list with a mix of strings and
# functions
lgb.check.eval <- function(params, eval) {
if (is.null(params$metric)) {
@@ -330,13 +330,30 @@ lgb.check.eval <- function(params, eval) {
params$metric <- as.list(params$metric)
}
if (is.character(eval)) {
params$metric <- append(params$metric, eval)
# if 'eval' is a character vector or list, find the character
# elements and add them to 'metric'
if (!is.function(eval)) {
for (i in seq_along(eval)) {
element <- eval[[i]]
if (is.character(element)) {
params$metric <- append(params$metric, element)
}
}
}
if (identical(class(eval), "list")) {
params$metric <- append(params$metric, unlist(eval))
# If more than one character metric was given, then "None" should
# not be included
if (length(params$metric) > 1L) {
params$metric <- Filter(
f = function(metric) {
!(metric %in% .NO_METRIC_STRINGS())
}
, x = params$metric
)
}
# duplicate metrics should be filtered out
params$metric <- as.list(unique(unlist(params$metric)))
return(params)
}
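
A hedged sketch of the resulting bookkeeping; lgb.check.eval is an internal helper, so it is reached here via ::: purely for illustration, and my_metric is a hypothetical custom metric.
my_metric <- function(preds, dtrain) {
  list(name = "my_metric", value = 0.0, higher_better = FALSE)
}
params <- lightgbm:::lgb.check.eval(
  params = list(metric = "None")
  , eval = list("rmse", my_metric, "l1", "rmse")
)
# character metrics are kept and de-duplicated, "None" is dropped because real
# metrics were supplied, and the function is left for the 'eval' handling above
print(params[["metric"]])  # list("rmse", "l1")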

View File

@@ -45,9 +45,41 @@ may allow you to pass other types of data like \code{matrix} and then separately
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (list of) character or custom eval function}
\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of
strings and functions.
\itemize{
\item{\bold{a. character vector}:
If you provide a character vector to this argument, it should contain strings with valid
evaluation metrics.
See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{
The "metric" section of the documentation}
for a list of valid metrics.
}
\item{\bold{b. function}:
You can provide a custom evaluation function. This
should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named
list with three elements:
\itemize{
\item{\code{name}: A string with the name of the metric, used for printing
and storing results.
}
\item{\code{value}: A single number indicating the value of the metric for the
given predictions and true values
}
\item{
\code{higher_better}: A boolean indicating whether higher values indicate a better fit.
For example, this would be \code{FALSE} for metrics like MAE or RMSE.
}
}
}
\item{\bold{c. list}:
If a list is given, it should only contain character vectors and functions.
These should follow the requirements from the descriptions above.
}
}}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
@@ -99,6 +131,23 @@ a trained model \code{lgb.CVBooster}.
\description{
Cross validation logic used by LightGBM
}
\section{Early Stopping}{
"early stopping" refers to stopping the training process if the model's performance on a given
validation set does not improve for several consecutive iterations.
If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
metrics will be considered for early stopping.
If you want to only consider the first metric for early stopping, pass
\code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
or \code{objective} (passed into \code{params}).
}
\examples{
\dontrun{
data(agaricus.train, package = "lightgbm")

View File

@@ -38,7 +38,39 @@ may allow you to pass other types of data like \code{matrix} and then separately
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (a list of) character or custom eval function}
\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of
strings and functions.
\itemize{
\item{\bold{a. character vector}:
If you provide a character vector to this argument, it should contain strings with valid
evaluation metrics.
See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{
The "metric" section of the documentation}
for a list of valid metrics.
}
\item{\bold{b. function}:
You can provide a custom evaluation function. This
should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named
list with three elements:
\itemize{
\item{\code{name}: A string with the name of the metric, used for printing
and storing results.
}
\item{\code{value}: A single number indicating the value of the metric for the
given predictions and true values
}
\item{
\code{higher_better}: A boolean indicating whether higher values indicate a better fit.
For example, this would be \code{FALSE} for metrics like MAE or RMSE.
}
}
}
\item{\bold{c. list}:
If a list is given, it should only contain character vectors and functions.
These should follow the requirements from the descriptions above.
}
}}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
@@ -82,6 +114,23 @@ a trained booster model \code{lgb.Booster}.
\description{
Logic to train with LightGBM
}
\section{Early Stopping}{
"early stopping" refers to stopping the training process if the model's performance on a given
validation set does not improve for several consecutive iterations.
If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
metrics will be considered for early stopping.
If you want to only consider the first metric for early stopping, pass
\code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
or \code{objective} (passed into \code{params}).
}
\examples{
\dontrun{
data(agaricus.train, package = "lightgbm")

View File

@@ -16,12 +16,50 @@ and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{eval}{evaluation function(s). This can be a character vector, function, or list with a mixture of
strings and functions.
\itemize{
\item{\bold{a. character vector}:
If you provide a character vector to this argument, it should contain strings with valid
evaluation metrics.
See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{
The "metric" section of the documentation}
for a list of valid metrics.
}
\item{\bold{b. function}:
You can provide a custom evaluation function. This
should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named
list with three elements:
\itemize{
\item{\code{name}: A string with the name of the metric, used for printing
and storing results.
}
\item{\code{value}: A single number indicating the value of the metric for the
given predictions and true values
}
\item{
\code{higher_better}: A boolean indicating whether higher values indicate a better fit.
For example, this would be \code{FALSE} for metrics like MAE or RMSE.
}
}
}
\item{\bold{c. list}:
If a list is given, it should only contain character vectors and functions.
These should follow the requirements from the descriptions above.
}
}}
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
\item{init_model}{path of model file of \code{lgb.Booster} object, will continue training from this model}
\item{nrounds}{number of training rounds}
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{params}{List of parameters}
\item{verbose}{verbosity for output, if <= 0, also will disable the print of evaluation during training}
@@ -29,4 +67,21 @@ If early stopping occurs, the model will have 'best_iter' field.}
\description{
Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
}
\section{Early Stopping}{
"early stopping" refers to stopping the training process if the model's performance on a given
validation set does not improve for several consecutive iterations.
If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
metrics will be considered for early stopping.
If you want to only consider the first metric for early stopping, pass
\code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
or \code{objective} (passed into \code{params}).
}
\keyword{internal}

View File

@@ -74,3 +74,20 @@ List of callback functions that are applied at each iteration.}
\description{
Simple interface for training a LightGBM model.
}
\section{Early Stopping}{
"early stopping" refers to stopping the training process if the model's performance on a given
validation set does not improve for several consecutive iterations.
If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
metrics will be considered for early stopping.
If you want to only consider the first metric for early stopping, pass
\code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
or \code{objective} (passed into \code{params}).
}

View File

@@ -6,6 +6,59 @@ train <- agaricus.train
test <- agaricus.test
TOLERANCE <- 1e-6
set.seed(708L)
# [description] Every time this function is called, it adds 0.1
# to an accumulator then returns the current value.
# This is used to mock the situation where an evaluation
# metric increases every iteration
ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR"
assign(x = "INCREASING_METRIC_ACUMULATOR", value = 0.0, envir = .GlobalEnv)
.increasing_metric <- function(preds, dtrain) {
if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) {
assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv)
}
assign(
x = ACCUMULATOR_NAME
, value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1
, envir = .GlobalEnv
)
return(list(
name = "increasing_metric"
, value = get(ACCUMULATOR_NAME, envir = .GlobalEnv)
, higher_better = TRUE
))
}
# [description] Evaluation function that always returns the
# same value
CONSTANT_METRIC_VALUE <- 0.2
.constant_metric <- function(preds, dtrain) {
return(list(
name = "constant_metric"
, value = CONSTANT_METRIC_VALUE
, higher_better = FALSE
))
}
# sample datasets to test early stopping
DTRAIN_RANDOM_REGRESSION <- lgb.Dataset(
data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE)
, label = rnorm(100L)
)
DVALID_RANDOM_REGRESSION <- lgb.Dataset(
data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE)
, label = rnorm(50L)
)
DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset(
data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE)
, label = sample(c(0L, 1L), size = 120L, replace = TRUE)
)
DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset(
data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE)
, label = sample(c(0L, 1L), size = 37L, replace = TRUE)
)
test_that("train and predict binary classification", {
nrounds <- 10L
@@ -707,7 +760,6 @@ test_that("lgb.train() works with early stopping for regression", {
params = list(
objective = "regression"
, metric = "rmse"
, min_data_in_bin = 5L
)
, data = dtrain
, nrounds = nrounds
@@ -730,7 +782,6 @@ test_that("lgb.train() works with early stopping for regression", {
params = list(
objective = "regression"
, metric = "rmse"
, min_data_in_bin = 5L
, early_stopping_rounds = early_stopping_rounds
)
, data = dtrain
@@ -750,6 +801,314 @@ test_that("lgb.train() works with early stopping for regression", {
)
})
test_that("lgb.train() does not stop early if early_stopping_rounds is not given", {
set.seed(708L)
increasing_metric_starting_value <- get(
ACCUMULATOR_NAME
, envir = .GlobalEnv
)
nrounds <- 10L
metrics <- list(
.constant_metric
, .increasing_metric
)
bst <- lgb.train(
params = list(
objective = "regression"
, metric = "None"
)
, data = DTRAIN_RANDOM_REGRESSION
, nrounds = nrounds
, valids = list("valid1" = DVALID_RANDOM_REGRESSION)
, eval = metrics
)
# Only the two functions provided to "eval" should have been evaluated
expect_equal(length(bst$record_evals[["valid1"]]), 2L)
# all 10 iterations should have happened, and the best_iter should be
# the first one (based on constant_metric)
best_iter <- 1L
expect_equal(bst$best_iter, best_iter)
# best_score should be taken from the first metric
expect_equal(
bst$best_score
, bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]]
)
# early stopping should not have happened, because early_stopping_rounds
# was not given. Even though constant_metric had 9 consecutive iterations
# with no improvement, all nrounds iterations should have run
expect_equal(
length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]])
, nrounds
)
expect_equal(
length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]])
, nrounds
)
})
test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", {
set.seed(708L)
early_stopping_rounds <- 3L
param_variations <- list(
list(
objective = "regression"
, metric = "None"
, early_stopping_rounds = early_stopping_rounds
)
, list(
objective = "regression"
, metric = "None"
, early_stopping_rounds = early_stopping_rounds
, first_metric_only = FALSE
)
)
for (params in param_variations) {
nrounds <- 10L
bst <- lgb.train(
params = params
, data = DTRAIN_RANDOM_REGRESSION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_REGRESSION
)
, eval = list(
.increasing_metric
, .constant_metric
)
)
# Only the two functions provided to "eval" should have been evaluated
expect_equal(length(bst$record_evals[["valid1"]]), 2L)
# early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in
# because constant_metric never improves
#
# the best iteration should be the last one, because increasing_metric was first
# and gets better every iteration
best_iter <- early_stopping_rounds + 1L
expect_equal(bst$best_iter, best_iter)
# best_score should be taken from "increasing_metric" because it was first
expect_equal(
bst$best_score
, bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]]
)
# early stopping should have happened: even though increasing_metric kept
# getting better, "constant_metric" did not improve, and by default any
# metric can trigger early stopping
expect_equal(
length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]])
, early_stopping_rounds + 1L
)
expect_equal(
length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]])
, early_stopping_rounds + 1L
)
}
})
test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", {
set.seed(708L)
nrounds <- 10L
early_stopping_rounds <- 3L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.train(
params = list(
objective = "regression"
, metric = "None"
, early_stopping_rounds = early_stopping_rounds
, first_metric_only = TRUE
)
, data = DTRAIN_RANDOM_REGRESSION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_REGRESSION
)
, eval = list(
.increasing_metric
, .constant_metric
)
)
# Only the two functions provided to "eval" should have been evaluated
expect_equal(length(bst$record_evals[["valid1"]]), 2L)
# all 10 iterations should happen, and the best_iter should be the final one
expect_equal(bst$best_iter, nrounds)
# best_score should be taken from "increasing_metric"
expect_equal(
bst$best_score
, increasing_metric_starting_value + 0.1 * nrounds
)
# early stopping should not have happened. Even though constant_metric
# had 9 consecutive iterations with no improvement, it is ignored because of
# first_metric_only = TRUE
expect_equal(
length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]])
, nrounds
)
expect_equal(
length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]])
, nrounds
)
})
test_that("lgb.train() works when a mixture of functions and strings are passed to eval", {
set.seed(708L)
nrounds <- 10L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.train(
params = list(
objective = "regression"
, metric = "None"
)
, data = DTRAIN_RANDOM_REGRESSION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_REGRESSION
)
, eval = list(
.increasing_metric
, "rmse"
, .constant_metric
, "l2"
)
)
# all 4 metrics should have been used
expect_named(
bst$record_evals[["valid1"]]
, expected = c("rmse", "l2", "increasing_metric", "constant_metric")
, ignore.order = TRUE
, ignore.case = FALSE
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid1"]]
expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < TOLERANCE)
expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < TOLERANCE)
expected_increasing_metric <- increasing_metric_starting_value + 0.1
expect_true(
abs(
results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric
) < TOLERANCE
)
expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE)
})
test_that("lgb.train() works when a list of strings or a character vector is passed to eval", {
# testing list and character vector, as well as length-1 and length-2
eval_variations <- list(
c("binary_error", "binary_logloss")
, "binary_logloss"
, list("binary_error", "binary_logloss")
, list("binary_logloss")
)
for (eval_variation in eval_variations) {
set.seed(708L)
nrounds <- 10L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.train(
params = list(
objective = "binary"
, metric = "None"
)
, data = DTRAIN_RANDOM_CLASSIFICATION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_CLASSIFICATION
)
, eval = eval_variation
)
# both metrics should have been used
expect_named(
bst$record_evals[["valid1"]]
, expected = unlist(eval_variation)
, ignore.order = TRUE
, ignore.case = FALSE
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid1"]]
if ("binary_error" %in% unlist(eval_variation)) {
expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE)
}
if ("binary_logloss" %in% unlist(eval_variation)) {
expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE)
}
}
})
test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", {
set.seed(708L)
nrounds <- 10L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.train(
params = list(
objective = "binary"
, metric = "binary_error"
)
, data = DTRAIN_RANDOM_CLASSIFICATION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_CLASSIFICATION
)
, eval = "binary_logloss"
)
# both metrics should have been used
expect_named(
bst$record_evals[["valid1"]]
, expected = c("binary_error", "binary_logloss")
, ignore.order = TRUE
, ignore.case = FALSE
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid1"]]
expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < TOLERANCE)
expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < TOLERANCE)
})
test_that("lgb.train() works when you give a function for eval", {
set.seed(708L)
nrounds <- 10L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.train(
params = list(
objective = "binary"
, metric = "None"
)
, data = DTRAIN_RANDOM_CLASSIFICATION
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_CLASSIFICATION
)
, eval = .constant_metric
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid1"]]
expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE)
})
test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", {
set.seed(708L)
trainDF <- data.frame(
@@ -1099,6 +1458,172 @@ test_that("using lightgbm() without early stopping, best_iter and best_score com
expect_identical(bst$best_score, auc_scores[which.max(auc_scores)])
})
test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", {
set.seed(708L)
nrounds <- 10L
nfolds <- 4L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.cv(
params = list(
objective = "binary"
, metric = "binary_error"
)
, data = DTRAIN_RANDOM_CLASSIFICATION
, nrounds = nrounds
, nfold = nfolds
, eval = "binary_logloss"
)
# both metrics should have been used
expect_named(
bst$record_evals[["valid"]]
, expected = c("binary_error", "binary_logloss")
, ignore.order = TRUE
, ignore.case = FALSE
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid"]]
expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < TOLERANCE)
expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < TOLERANCE)
# all boosters should have been created
expect_length(bst$boosters, nfolds)
})
test_that("lgb.cv() works when you give a function for eval", {
set.seed(708L)
nrounds <- 10L
nfolds <- 3L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.cv(
params = list(
objective = "binary"
, metric = "None"
)
, data = DTRAIN_RANDOM_CLASSIFICATION
, nfold = nfolds
, nrounds = nrounds
, eval = .constant_metric
)
# the different metrics shouldn't have been mixed up with each other
results <- bst$record_evals[["valid"]]
expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < TOLERANCE)
expect_named(results, "constant_metric")
})
test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", {
set.seed(708L)
nrounds <- 10L
nfolds <- 5L
early_stopping_rounds <- 3L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.cv(
params = list(
objective = "regression"
, metric = "None"
, early_stopping_rounds = early_stopping_rounds
, first_metric_only = TRUE
)
, data = DTRAIN_RANDOM_REGRESSION
, nfold = nfolds
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_REGRESSION
)
, eval = list(
.increasing_metric
, .constant_metric
)
)
# Only the two functions provided to "eval" should have been evaluated
expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric"))
# all 10 iterations should happen, and the best_iter should be the final one
expect_equal(bst$best_iter, nrounds)
# best_score should be taken from "increasing_metric"
#
# this expected value looks magical and confusing, but it's because
# evaluation metrics are averaged over all folds.
#
# consider 5-fold CV with a metric that adds 0.1 to a global accumulator
# each time it's called
#
# * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3)
# * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 0.8)
# * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.3)
#
cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds
expect_equal(bst$best_score, cv_value)
# early stopping should not have happened. Even though constant_metric
# had 9 consecutive iterations with no improvement, it is ignored because of
# first_metric_only = TRUE
expect_equal(
length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
, nrounds
)
expect_equal(
length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
, nrounds
)
})
test_that("early stopping works with lgb.cv()", {
set.seed(708L)
nrounds <- 10L
nfolds <- 5L
early_stopping_rounds <- 3L
increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv)
bst <- lgb.cv(
params = list(
objective = "regression"
, metric = "None"
, early_stopping_rounds = early_stopping_rounds
, first_metric_only = TRUE
)
, data = DTRAIN_RANDOM_REGRESSION
, nfold = nfolds
, nrounds = nrounds
, valids = list(
"valid1" = DVALID_RANDOM_REGRESSION
)
, eval = list(
.constant_metric
, .increasing_metric
)
)
# only the two functions provided to "eval" should have been evaluated
expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric"))
# best_iter should be based on the first metric. Since constant_metric
# never changes, its first iteration was the best one
expect_equal(bst$best_iter, 1L)
# best_score should be taken from the first metric
expect_equal(bst$best_score, 0.2)
# early stopping should have happened, since constant_metric was the first
# one passed to eval and it will not improve over consecutive iterations
#
# note that this test is identical to the previous one, but with the
# order of the eval metrics switched
expect_equal(
length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]])
, early_stopping_rounds + 1L
)
expect_equal(
length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]])
, early_stopping_rounds + 1L
)
})
context("interaction constraints")
test_that("lgb.train() throws an informative error if interaction_constraints is not a list", {
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", interaction_constraints = "[1,2],[3]")

View File

@@ -115,3 +115,12 @@ test_that("lgb.check.eval adds eval to metric in params if a list is provided",
expect_named(params, "metric")
expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss"))
})
test_that("lgb.check.eval drops duplicate metrics and preserves order", {
params <- lgb.check.eval(
params = list(metric = "l1")
, eval = list("l2", "rmse", "l1", "rmse")
)
expect_named(params, "metric")
expect_identical(params[["metric"]], list("l1", "l2", "rmse"))
})