зеркало из https://github.com/microsoft/LightGBM.git
[R-package] skip integer categorical feature check when building dataset subset (fixes #6412) (#6442)
This commit is contained in:
Родитель
4401401553
Коммит
63926827d2
|
@ -106,10 +106,10 @@ if [[ $OS_NAME == "macos" ]]; then
|
||||||
-target / || exit 1
|
-target / || exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# fix for issue where CRAN was not returning {lattice} when using R 3.6
|
# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
|
||||||
# "Warning: dependency ‘lattice’ is not available"
|
# "Warning: dependency ‘lattice’ is not available"
|
||||||
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
|
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
|
||||||
Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')"
|
Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
|
||||||
else
|
else
|
||||||
# {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
|
# {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
|
||||||
# This should be unnecessary on R >=4.4.0
|
# This should be unnecessary on R >=4.4.0
|
||||||
|
|
|
@ -170,7 +170,12 @@ Dataset <- R6::R6Class(
|
||||||
|
|
||||||
# Check that no categorical feature index exceeds the number of columns in the raw data
|
# Check that no categorical feature index exceeds the number of columns in the raw data
|
||||||
data_is_not_filename <- !is.character(private$raw_data)
|
data_is_not_filename <- !is.character(private$raw_data)
|
||||||
if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
|
if (
|
||||||
|
data_is_not_filename
|
||||||
|
&& !is.null(private$raw_data)
|
||||||
|
&& is.null(private$used_indices)
|
||||||
|
&& max(private$categorical_feature) > ncol(private$raw_data)
|
||||||
|
) {
|
||||||
stop(
|
stop(
|
||||||
"lgb.Dataset.construct: supplied a too large value in categorical_feature: "
|
"lgb.Dataset.construct: supplied a too large value in categorical_feature: "
|
||||||
, max(private$categorical_feature)
|
, max(private$categorical_feature)
|
||||||
|
|
|
@ -440,6 +440,35 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l
|
||||||
expect_true(methods::is(bst, "lgb.CVBooster"))
|
expect_true(methods::is(bst, "lgb.CVBooster"))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("lgb.Dataset: should be able to be used in lgb.cv() when constructed with categorical feature indices", {
|
||||||
|
data("mtcars")
|
||||||
|
y <- mtcars$mpg
|
||||||
|
x <- as.matrix(mtcars[, -1L])
|
||||||
|
categorical_feature <- which(names(mtcars) %in% c("cyl", "vs", "am", "gear", "carb")) - 1L
|
||||||
|
dtrain <- lgb.Dataset(
|
||||||
|
data = x
|
||||||
|
, label = y
|
||||||
|
, categorical_feature = categorical_feature
|
||||||
|
, free_raw_data = TRUE
|
||||||
|
, params = list(num_threads = .LGB_MAX_THREADS)
|
||||||
|
)
|
||||||
|
# constructing the Dataset frees the raw data
|
||||||
|
dtrain$construct()
|
||||||
|
params <- list(
|
||||||
|
objective = "regression"
|
||||||
|
, num_leaves = 2L
|
||||||
|
, verbose = .LGB_VERBOSITY
|
||||||
|
, num_threads = .LGB_MAX_THREADS
|
||||||
|
)
|
||||||
|
# cv should reuse the same categorical features without checking the indices
|
||||||
|
bst <- lgb.cv(params = params, data = dtrain, stratified = FALSE, nrounds = 1L)
|
||||||
|
expect_equal(
|
||||||
|
unlist(bst$boosters[[1L]]$booster$params$categorical_feature)
|
||||||
|
, categorical_feature - 1L # 0-based
|
||||||
|
)
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
|
test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
|
||||||
# set one feature to a value longer than the default buffer size used
|
# set one feature to a value longer than the default buffer size used
|
||||||
# in LGBM_DatasetGetFeatureNames_R
|
# in LGBM_DatasetGetFeatureNames_R
|
||||||
|
@ -621,3 +650,12 @@ test_that("lgb.Dataset can be constructed with categorical features and without
|
||||||
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
|
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
|
||||||
}, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
|
}, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test_that("lgb.Dataset.slice fails with a categorical feature index greater than the number of features", {
|
||||||
|
data <- matrix(runif(100L), nrow = 50L, ncol = 2L)
|
||||||
|
ds <- lgb.Dataset(data = data, categorical_feature = 3L)
|
||||||
|
subset <- ds$slice(1L:20L)
|
||||||
|
expect_error({
|
||||||
|
subset$construct()
|
||||||
|
}, regexp = "supplied a too large value in categorical_feature: 3 but only 2 features")
|
||||||
|
})
|
||||||
|
|
Загрузка…
Ссылка в новой задаче