Mirror of https://github.com/microsoft/LightGBM.git
[python-package][R-package] allow using feature names when retrieving number of bins (#5116)
* allow using feature names when retrieving number of bins
* unname vector
* use default feature names when not defined
* lint
* apply suggestions
* remove extra comma
* add test with categorical feature
* make feature names sync more transparent
This commit is contained in:
Parent
53218c11a0
Commit
5b664b67c4
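The diff below lets a feature's bin count be looked up by name as well as by index, in both the Python and R packages. As a quick illustration (not part of the commit), a minimal Python sketch of the new call pattern; the data and the feature names "f0"/"f1" are hypothetical:

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 2)
    ds = lgb.Dataset(X, feature_name=["f0", "f1"]).construct()

    # After this change, index and name lookups are interchangeable:
    assert ds.feature_num_bin(0) == ds.feature_num_bin("f0")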
R-package/R/lgb.Dataset.R
@@ -289,6 +289,13 @@ Dataset <- R6::R6Class(
         self$set_colnames(colnames = private$colnames)
       }

+      # Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
+      # in cases like constructing from a file or from a matrix with no column names.
+      private$colnames <- .Call(
+        LGBM_DatasetGetFeatureNames_R
+        , private$handle
+      )
+
       # Load init score if requested
       if (!is.null(private$predictor) && is.null(private$used_indices)) {
@@ -381,6 +388,13 @@ Dataset <- R6::R6Class(
       if (lgb.is.null.handle(x = private$handle)) {
         stop("Cannot get number of bins in feature before constructing Dataset.")
       }
+      if (is.character(feature)) {
+        feature_name <- feature
+        feature <- which(private$colnames == feature_name)
+        if (length(feature) == 0L) {
+          stop(sprintf("feature '%s' not found", feature_name))
+        }
+      }
       num_bin <- integer(1L)
       .Call(
         LGBM_DatasetGetFeatureNumBin_R
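The R lookup above resolves a name to an index with which() and errors on an unknown name; the Python hunk further down does the same via list.index(). A standalone sketch of that resolution logic (hypothetical helper, not part of the commit):

    from typing import List, Union

    def resolve_feature_index(feature: Union[int, str], colnames: List[str]) -> int:
        # Map a feature name to its zero-based index; fail loudly on an
        # unknown name, mirroring the "feature '%s' not found" error above.
        if isinstance(feature, str):
            if feature not in colnames:
                raise ValueError(f"feature '{feature}' not found")
            return colnames.index(feature)
        return feature

    # resolve_feature_index("x1", ["x0", "x1"]) -> 1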
R-package/tests/testthat/test_dataset.R
@@ -533,10 +533,16 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
     , three_vals = c(rep(c(0.0, 1.0, 2.0), 33L), 0.0)
     , two_vals_plus_missing = c(rep(c(1.0, 2.0), 49L), NA_real_, NA_real_)
     , all_zero = rep(0.0, 100L)
+    , categorical = sample.int(2L, 100L, replace = TRUE)
   )
+  n_features <- ncol(raw_df)
   raw_mat <- data.matrix(raw_df)
   min_data_in_bin <- 2L
-  ds <- lgb.Dataset(raw_mat, params = list(min_data_in_bin = min_data_in_bin))
+  ds <- lgb.Dataset(
+    raw_mat
+    , params = list(min_data_in_bin = min_data_in_bin)
+    , categorical_feature = n_features
+  )
   ds$construct()
   expected_num_bins <- c(
     100L %/% min_data_in_bin + 1L  # extra bin for zero
@@ -544,9 +550,30 @@ test_that("lgb.Dataset$get_feature_num_bin() works", {
     , 3L  # 0, 1, 2
     , 4L  # 0, 1, 2 + NA
     , 0L  # unused
+    , 3L  # 1, 2 + NA
   )
-  actual_num_bins <- sapply(1L:5L, ds$get_feature_num_bin)
+  actual_num_bins <- sapply(1L:n_features, ds$get_feature_num_bin)
   expect_identical(actual_num_bins, expected_num_bins)
+  # test using defined feature names
+  bins_by_name <- sapply(colnames(raw_mat), ds$get_feature_num_bin)
+  expect_identical(unname(bins_by_name), expected_num_bins)
+  # test using default feature names
+  no_names_mat <- raw_mat
+  colnames(no_names_mat) <- NULL
+  ds_no_names <- lgb.Dataset(
+    no_names_mat
+    , params = list(min_data_in_bin = min_data_in_bin)
+    , categorical_feature = n_features
+  )
+  ds_no_names$construct()
+  default_names <- lapply(
+    X = seq(1L, ncol(raw_mat))
+    , FUN = function(i) {
+      sprintf("Column_%d", i - 1L)
+    }
+  )
+  bins_by_default_name <- sapply(default_names, ds_no_names$get_feature_num_bin)
+  expect_identical(bins_by_default_name, expected_num_bins)
 })

 test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
@@ -555,9 +582,9 @@ test_that("lgb.Dataset can be constructed with categorical features and without colnames", {
   ds <- lgb.Dataset(raw_mat, categorical_feature = 1L)$construct()
   sparse_mat <- as(raw_mat, "dgCMatrix")
   ds2 <- lgb.Dataset(sparse_mat, categorical_feature = 1L)$construct()
-  # check that the column names are NULL
-  expect_null(ds$.__enclos_env__$private$colnames)
-  expect_null(ds2$.__enclos_env__$private$colnames)
+  # check that the column names are the default ones
+  expect_equal(ds$.__enclos_env__$private$colnames, "Column_0")
+  expect_equal(ds2$.__enclos_env__$private$colnames, "Column_0")
   # check for error when index is greater than the number of columns
   expect_error({
     lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
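Both test files lean on LightGBM's default naming scheme for unnamed columns: zero-based "Column_<i>". The R test builds these with sprintf("Column_%d", i - 1L); a Python equivalent (n_features = 6 matches the R test's raw_df after the categorical column is added):

    n_features = 6
    default_names = [f"Column_{i}" for i in range(n_features)]
    # -> ['Column_0', 'Column_1', 'Column_2', 'Column_3', 'Column_4', 'Column_5']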
python-package/lightgbm/basic.py
@@ -1817,6 +1817,7 @@ class Dataset:
                                 feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
             if self.free_raw_data:
                 self.data = None
+        self.feature_name = self.get_feature_name()
         return self

     def create_valid(self, data, label=None, weight=None, group=None, init_score=None, params=None):
@@ -2382,13 +2383,13 @@ class Dataset:
         else:
             raise LightGBMError("Cannot get num_feature before construct dataset")

-    def feature_num_bin(self, feature: int) -> int:
+    def feature_num_bin(self, feature: Union[int, str]) -> int:
         """Get the number of bins for a feature.

         Parameters
         ----------
-        feature : int
-            Index of the feature.
+        feature : int or str
+            Index or name of the feature.

         Returns
         -------
@@ -2396,6 +2397,8 @@ class Dataset:
             The number of constructed bins for the feature in the Dataset.
         """
         if self.handle is not None:
+            if isinstance(feature, str):
+                feature = self.feature_name.index(feature)
             ret = ctypes.c_int(0)
             _safe_call(_LIB.LGBM_DatasetGetFeatureNumBin(self.handle,
                                                          ctypes.c_int(feature),
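Taken together, the basic.py hunks mean that after construct() the wrapper's feature_name list always matches the C++ side, so even auto-assigned default names can be passed to feature_num_bin(). A minimal sketch with hypothetical data:

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 2)
    ds = lgb.Dataset(X).construct()  # no names given; defaults assigned on construction

    # construct() now refreshes feature_name from the C++ side,
    # so the default names are immediately usable:
    assert ds.feature_num_bin("Column_0") == ds.feature_num_bin(0)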
tests/python_package_test/test_basic.py
@@ -663,17 +663,33 @@ def test_feature_num_bin(min_data_in_bin):
         np.array([0, 1, 2] * 33 + [0]),
         np.array([1, 2] * 49 + 2 * [np.nan]),
         np.zeros(100),
+        np.random.choice([0, 1], 100),
     ]).T
-    ds = lgb.Dataset(X, params={'min_data_in_bin': min_data_in_bin}).construct()
+    n_continuous = X.shape[1] - 1
+    feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
+    ds_kwargs = dict(
+        params={'min_data_in_bin': min_data_in_bin},
+        categorical_feature=[n_continuous],  # last feature
+    )
+    ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
         3,  # 0, 1, 2
         3,  # 0, 1, 2
         4,  # 0, 1, 2 + nan
         0,  # unused
+        3,  # 0, 1 + nan
     ]
     actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
     assert actual_num_bins == expected_num_bins
+    # test using defined feature names
+    bins_by_name = [ds.feature_num_bin(name) for name in feature_name]
+    assert bins_by_name == expected_num_bins
+    # test using default feature names
+    ds_no_names = lgb.Dataset(X, **ds_kwargs).construct()
+    default_names = [f'Column_{i}' for i in range(X.shape[1])]
+    bins_by_default_name = [ds_no_names.feature_num_bin(name) for name in default_names]
+    assert bins_by_default_name == expected_num_bins
+    # check for feature indices outside of range
+    num_features = X.shape[1]
     with pytest.raises(