Mirror of https://github.com/microsoft/LightGBM.git
[python-package][R-package] allow using feature names when retrieving number of bins (#5116)
* allow using feature names when retrieving number of bins
* unname vector
* use default feature names when not defined
* lint
* apply suggestions
* remove extra comma
* add test with categorical feature
* make feature names sync more transparent
This commit is contained in:
Parent
53218c11a0
Commit
5b664b67c4
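The diff below lets a feature's bin count be looked up by name as well as by index, in both the Python and R packages. As a quick illustration (not part of the commit), a minimal Python sketch of the new call pattern; the data and the feature names "f0"/"f1" are hypothetical:

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 2)
    ds = lgb.Dataset(X, feature_name=["f0", "f1"]).construct()

    # After this change, index and name lookups are interchangeable:
    assert ds.feature_num_bin(0) == ds.feature_num_bin("f0")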
R-package/R/lgb.Dataset.R
@@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
         self$set_colnames(colnames = private$colnames)
       }

+      # Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
+      # in cases like constructing from a file or from a matrix with no column names.
+      private$colnames <- .Call(
+        LGBM_DatasetGetFeatureNames_R
+        , private$handle
+      )
+
       # Load init score if requested
       if (!is.null(private$predictor) && is.null(private$used_indices)) {
@@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
       if (lgb.is.null.handle(x = private$handle)) {
         stop("Cannot get number of bins in feature before constructing Dataset.")
       }
+      if (is.character(feature)) {
+        feature_name <- feature
+        feature <- which(private$colnames == feature_name)
+        if (length(feature) == 0L) {
+          stop(sprintf("feature '%s' not found", feature_name))
+        }
+      }
       num_bin <- integer(1L)
       .Call(
         LGBM_DatasetGetFeatureNumBin_R
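The R lookup above resolves a name to an index with which() and errors on an unknown name; the Python hunk further down does the same via list.index(). A standalone sketch of that resolution logic (hypothetical helper, not part of the commit):

    from typing import List, Union

    def resolve_feature_index(feature: Union[int, str], colnames: List[str]) -> int:
        # Map a feature name to its zero-based index; fail loudly on an
        # unknown name, mirroring the "feature '%s' not found" error above.
        if isinstance(feature, str):
            if feature not in colnames:
                raise ValueError(f"feature '{feature}' not found")
            return colnames.index(feature)
        return feature

    # resolve_feature_index("x1", ["x0", "x1"]) -> 1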
R-package/tests/testthat/test_dataset.R
@@ -533,10 +533,16 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
     , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
     , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
     , all_zero = rep(0.0, 100L)
+    , categorical = sample.int(2L, 100L, replace = TRUE)
   )
+  n_features <- ncol(raw_df)
   raw_mat <- data.matrix(raw_df)
   min_data_in_bin <- 2L
-  ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin))
+  ds <- lgb.Dataset(
+    raw_mat
+    , params = list(min_data_in_bin = min_data_in_bin)
+    , categorical_feature = n_features
+  )
   ds$construct()
   expected_num_bins <- c(
     100L %/% min_data_in_bin + 1L  # extra bin for zero
@@ -544,9 +550,30 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
     , 3L  # 0, 1, 2
     , 4L  # 0, 1, 2 + NA
     , 0L  # unused
+    , 3L  # 1, 2 + NA
   )
-  actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
+  actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
   expect_identical(actual_num_bins, expected_num_bins)
+  # test using defined feature names
+  bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
+  expect_identical(unname(bins_by_name), expected_num_bins)
+  # test using default feature names
+  no_names_mat <- raw_mat
+  colnames(no_names_mat) <- NULL
+  ds_no_names <- lgb.Dataset(
+    no_names_mat
+    , params = list(min_data_in_bin = min_data_in_bin)
+    , categorical_feature = n_features
+  )
+  ds_no_names$construct()
+  default_names <- lapply(
+    X = seq(1L, ncol(raw_mat))
+    , FUN = function(i) {
+      sprintf("Column_%d", i - 1L)
+    }
+  )
+  bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
+  expect_identical(bins_by_default_name, expected_num_bins)
 })

 test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
   ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
   sparse_mat <- as(raw_mat, "dgCMatrix")
   ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
-  # check that the column names are NULL
-  expect_null(ds$.__enclos_env__$private$colnames)
-  expect_null(ds2$.__enclos_env__$private$colnames)
+  # check that the column names are the default ones
+  expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
+  expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
   # check for error when index is greater than the number of columns
   expect_error({
     lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
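Both test files lean on LightGBM's default naming scheme for unnamed columns: zero-based "Column_<i>". The R test builds these with sprintf("Column_%d", i - 1L); a Python equivalent (n_features = 6 matches the R test's raw_df after the categorical column is added):

    n_features = 6
    default_names = [f"Column_{i}" for i in range(n_features)]
    # -> ['Column_0', 'Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5']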
python-package/lightgbm/basic.py
@@ -1817,6 +1817,7 @@ class Dataset:
                                 feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
             if self.free_raw_data:
                 self.data = None
+        self.feature_name = self.get_feature_name()
         return self

     def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
@@ -2382,13 +2383,13 @@ class Dataset:
         else:
             raise LightGBMError("Cannot get num_feature before construct dataset")

-    def feature_num_bin(self, feature: int) -> int:
+    def feature_num_bin(self, feature: Union[int, str]) -> int:
         """Get the number of bins for a feature.

         Parameters
         ----------
-        feature : int
-            Index of the feature.
+        feature : int or str
+            Index or name of the feature.

         Returns
         -------
@@ -2396,6 +2397,8 @@ class Dataset:
             The number of constructed bins for the feature in the Dataset.
         """
         if self.handle is not None:
+            if isinstance(feature, str):
+                feature = self.feature_name.index(feature)
             ret = ctypes.c_int(0)
             _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
                                                          ctypes.c_int(feature),
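Taken together, the basic.py hunks mean that after construct() the wrapper's feature_name list always matches the C++ side, so even auto-assigned default names can be passed to feature_num_bin(). A minimal sketch with hypothetical data:

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 2)
    ds = lgb.Dataset(X).construct()  # no names given; defaults assigned on construction

    # construct() now refreshes feature_name from the C++ side,
    # so the default names are immediately usable:
    assert ds.feature_num_bin("Column_0") == ds.feature_num_bin(0)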
tests/python_package_test/test_basic.py
@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
         np.array([0, 1, 2] * 33 + [0]),
         np.array([1, 2] * 49 + 2 * [np.nan]),
         np.zeros(100),
+        np.random.choice([0, 1], 100),
     ]).T
-    ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
+    n_continuous = X.shape[1] - 1
+    feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
+    ds_kwargs = dict(
+        params={'min_data_in_bin': min_data_in_bin},
+        categorical_feature=[n_continuous],  # last feature
+    )
+    ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
         3,  # 0, 1, 2
         3,  # 0, 1, 2
         4,  # 0, 1, 2 + nan
         0,  # unused
+        3,  # 0, 1 + nan
     ]
     actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
     assert actual_num_bins == expected_num_bins
+    # test using defined feature names
+    bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
+    assert bins_by_name == expected_num_bins
+    # test using default feature names
+    ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
+    default_names = [f'Column_{i}' for i in range(X.shape[1])]
+    bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
+    assert bins_by_default_name == expected_num_bins
+    # check for feature indices outside of range
+    num_features = X.shape[1]
     with pytest.raises(