From e0cda880fc74ca6d1b7d6cb425a24e3a69764bb1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 3 Jun 2024 20:17:40 -0500 Subject: [PATCH] [python-package] remove uses of deprecated NumPy random number generation APIs, require 'numpy>=1.17.0' (#6468) --- .gitignore | 2 +- docs/Python-Intro.rst | 14 +- examples/python-guide/logistic_regression.py | 9 +- python-package/lightgbm/compat.py | 12 -- python-package/lightgbm/sklearn.py | 5 +- python-package/pyproject.toml | 4 +- tests/python_package_test/conftest.py | 12 ++ tests/python_package_test/test_basic.py | 107 +++++----- tests/python_package_test/test_engine.py | 208 +++++++++---------- tests/python_package_test/test_sklearn.py | 64 +++--- 10 files changed, 221 insertions(+), 216 deletions(-) create mode 100644 tests/python_package_test/conftest.py diff --git a/.gitignore b/.gitignore index 9403475cc..efa59fdfc 100644 --- a/.gitignore +++ b/.gitignore @@ -405,7 +405,7 @@ python-package/lightgbm/VERSION.txt # R build artefacts **/autom4te.cache/ -conftest* +R-package/conftest* R-package/config.status !R-package/data/agaricus.test.rda !R-package/data/agaricus.train.rda diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst index 3c1cb1557..a1c62c585 100644 --- a/docs/Python-Intro.rst +++ b/docs/Python-Intro.rst @@ -59,8 +59,9 @@ Many of the examples in this page use functionality from ``numpy``. To run the e .. code:: python - data = np.random.rand(500, 10) # 500 entities, each contains 10 features - label = np.random.randint(2, size=500) # binary target + rng = np.random.default_rng() + data = rng.uniform(size=(500, 10)) # 500 entities, each contains 10 features + label = rng.integers(low=0, high=2, size=(500, )) # binary target train_data = lgb.Dataset(data, label=label) **To load a scipy.sparse.csr\_matrix array into Dataset:** @@ -139,7 +140,8 @@ It doesn't need to convert to one-hot encoding, and is much faster than one-hot .. code:: python - w = np.random.rand(500, ) + rng = np.random.default_rng() + w = rng.uniform(size=(500, )) train_data = lgb.Dataset(data, label=label, weight=w) or @@ -147,7 +149,8 @@ or .. code:: python train_data = lgb.Dataset(data, label=label) - w = np.random.rand(500, ) + rng = np.random.default_rng() + w = rng.uniform(size=(500, )) train_data.set_weight(w) And you can use ``Dataset.set_init_score()`` to set initial score, and ``Dataset.set_group()`` to set group/query data for ranking tasks. @@ -249,7 +252,8 @@ A model that has been trained or loaded can perform predictions on datasets: .. 
code:: python # 7 entities, each contains 10 features - data = np.random.rand(7, 10) + rng = np.random.default_rng() + data = rng.uniform(size=(7, 10)) ypred = bst.predict(data) If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_iteration``: diff --git a/examples/python-guide/logistic_regression.py b/examples/python-guide/logistic_regression.py index ea02382eb..c73155da5 100644 --- a/examples/python-guide/logistic_regression.py +++ b/examples/python-guide/logistic_regression.py @@ -22,15 +22,15 @@ import lightgbm as lgb ################# # Simulate some binary data with a single categorical and # single continuous predictor -np.random.seed(0) +rng = np.random.default_rng(seed=0) N = 1000 X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)}) CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2] LINEAR_TERM = np.array( [-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])] -) + np.random.normal(0, 1, X.shape[0]) +) + rng.normal(loc=0, scale=1, size=X.shape[0]) TRUE_PROB = expit(LINEAR_TERM) -Y = np.random.binomial(1, TRUE_PROB, size=N) +Y = rng.binomial(n=1, p=TRUE_PROB, size=N) DATA = { "X": X, "probability_labels": TRUE_PROB, @@ -65,10 +65,9 @@ def experiment(objective, label_type, data): result : dict Experiment summary stats. """ - np.random.seed(0) nrounds = 5 lgb_data = data[f"lgb_with_{label_type}_labels"] - params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1} + params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1, "seed": 123} time_zero = time.time() gbm = lgb.train(params, lgb_data, num_boost_round=nrounds) y_fitted = gbm.predict(data["X"]) diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 9eed61a66..113960609 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -37,18 +37,6 @@ except ImportError: concat = None -"""numpy""" -try: - from numpy.random import Generator as np_random_Generator -except ImportError: - - class np_random_Generator: # type: ignore - """Dummy class for np.random.Generator.""" - - def __init__(self, *args: Any, **kwargs: Any): - pass - - """matplotlib""" try: import matplotlib # noqa: F401 diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 9f1a62f54..46f41a428 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -41,7 +41,6 @@ from .compat import ( _LGBMModelBase, _LGBMRegressorBase, dt_DataTable, - np_random_Generator, pd_DataFrame, ) from .engine import train @@ -476,7 +475,7 @@ class LGBMModel(_LGBMModelBase): colsample_bytree: float = 1.0, reg_alpha: float = 0.0, reg_lambda: float = 0.0, - random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None, + random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None, n_jobs: Optional[int] = None, importance_type: str = "split", **kwargs: Any, @@ -739,7 +738,7 @@ class LGBMModel(_LGBMModelBase): if isinstance(params["random_state"], np.random.RandomState): params["random_state"] = params["random_state"].randint(np.iinfo(np.int32).max) - elif isinstance(params["random_state"], np_random_Generator): + elif isinstance(params["random_state"], np.random.Generator): params["random_state"] = int(params["random_state"].integers(np.iinfo(np.int32).max)) if self._n_classes > 2: for alias in 
_ConfigAliases.get("num_class"): diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index b7bff79ed..d671b5456 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence" ] dependencies = [ - "numpy", + "numpy>=1.17.0", "scipy" ] description = "LightGBM Python Package" @@ -156,6 +156,8 @@ select = [ "E", # pyflakes "F", + # NumPy-specific rules + "NPY", # pylint "PL", # flake8-return: unnecessary assignment before return diff --git a/tests/python_package_test/conftest.py b/tests/python_package_test/conftest.py new file mode 100644 index 000000000..7d9c5b270 --- /dev/null +++ b/tests/python_package_test/conftest.py @@ -0,0 +1,12 @@ +import numpy as np +import pytest + + +@pytest.fixture(scope="function") +def rng(): + return np.random.default_rng() + + +@pytest.fixture(scope="function") +def rng_fixed_seed(): + return np.random.default_rng(seed=42) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 92f5593ef..e2f379dad 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -136,7 +136,7 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size): @pytest.mark.parametrize("batch_size", [3, None]) @pytest.mark.parametrize("include_0_and_nan", [False, True]) @pytest.mark.parametrize("num_seq", [1, 3]) -def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): +def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq, rng): params = {"bin_construct_sample_cnt": sample_count} nrow = 50 @@ -175,7 +175,6 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): # Test for validation set. # Select some random rows as valid data. - rng = np.random.default_rng() # Pass integer to set seed when needed. 
valid_idx = (rng.random(10) * nrow).astype(np.int32) valid_data = data[valid_idx, :] valid_X = valid_data[:, :-1] @@ -201,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq): @pytest.mark.parametrize("num_seq", [1, 2]) -def test_sequence_get_data(num_seq): +def test_sequence_get_data(num_seq, rng): nrow = 20 ncol = 11 data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol)) @@ -212,7 +211,7 @@ def test_sequence_get_data(num_seq): seq_ds = lgb.Dataset(seqs, label=Y, params=None, free_raw_data=False).construct() assert seq_ds.get_data() == seqs - used_indices = np.random.choice(np.arange(nrow), nrow // 3, replace=False) + used_indices = rng.choice(a=np.arange(nrow), size=nrow // 3, replace=False) subset_data = seq_ds.subset(used_indices).construct() np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)]) @@ -246,8 +245,8 @@ def test_chunked_dataset_linear(): valid_data.construct() -def test_save_dataset_subset_and_load_from_file(tmp_path): - data = np.random.rand(100, 2) +def test_save_dataset_subset_and_load_from_file(tmp_path, rng): + data = rng.standard_normal(size=(100, 2)) params = {"max_bin": 50, "min_data_in_bin": 10} ds = lgb.Dataset(data, params=params) ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin") @@ -267,18 +266,18 @@ def test_subset_group(): assert subset_group[1] == 9 -def test_add_features_throws_if_num_data_unequal(): - X1 = np.random.random((100, 1)) - X2 = np.random.random((10, 1)) +def test_add_features_throws_if_num_data_unequal(rng): + X1 = rng.uniform(size=(100, 1)) + X2 = rng.uniform(size=(10, 1)) d1 = lgb.Dataset(X1).construct() d2 = lgb.Dataset(X2).construct() with pytest.raises(lgb.basic.LightGBMError): d1.add_features_from(d2) -def test_add_features_throws_if_datasets_unconstructed(): - X1 = np.random.random((100, 1)) - X2 = np.random.random((100, 1)) +def test_add_features_throws_if_datasets_unconstructed(rng): + X1 = rng.uniform(size=(100, 1)) + X2 = rng.uniform(size=(100, 1)) with pytest.raises(ValueError): d1 = lgb.Dataset(X1) d2 = lgb.Dataset(X2) @@ -293,8 +292,8 @@ def test_add_features_throws_if_datasets_unconstructed(): d1.add_features_from(d2) -def test_add_features_equal_data_on_alternating_used_unused(tmp_path): - X = np.random.random((100, 5)) +def test_add_features_equal_data_on_alternating_used_unused(tmp_path, rng): + X = rng.uniform(size=(100, 5)) X[:, [1, 3]] = 0 names = [f"col_{i}" for i in range(5)] for j in range(1, 5): @@ -313,8 +312,8 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path): assert dtxt == d1txt -def test_add_features_same_booster_behaviour(tmp_path): - X = np.random.random((100, 5)) +def test_add_features_same_booster_behaviour(tmp_path, rng): + X = rng.uniform(size=(100, 5)) X[:, [1, 3]] = 0 names = [f"col_{i}" for i in range(5)] for j in range(1, 5): @@ -322,7 +321,7 @@ def test_add_features_same_booster_behaviour(tmp_path): d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct() d1.add_features_from(d2) d = lgb.Dataset(X, feature_name=names).construct() - y = np.random.random(100) + y = rng.uniform(size=(100,)) d1.set_label(y) d.set_label(y) b1 = lgb.Booster(train_set=d1) @@ -341,11 +340,11 @@ def test_add_features_same_booster_behaviour(tmp_path): assert dtxt == d1txt -def test_add_features_from_different_sources(): +def test_add_features_from_different_sources(rng): pd = pytest.importorskip("pandas") n_row = 100 n_col = 5 - X = np.random.random((n_row, n_col)) + X = rng.uniform(size=(n_row, n_col)) xxs = 
[X, sparse.csr_matrix(X), pd.DataFrame(X)] names = [f"col_{i}" for i in range(n_col)] seq = _create_sequence_from_ndarray(X, 1, 30) @@ -380,9 +379,9 @@ def test_add_features_from_different_sources(): assert d1.feature_name == res_feature_names -def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys): +def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys, rng): arr_a = np.zeros((100, 1), dtype=np.float32) - arr_b = np.random.normal(size=(100, 5)) + arr_b = rng.uniform(size=(100, 5)) dataset_a = lgb.Dataset(arr_a).construct() expected_msg = ( @@ -402,10 +401,10 @@ def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_feat assert dataset_a._handle.value == original_handle -def test_cegb_affects_behavior(tmp_path): - X = np.random.random((100, 5)) +def test_cegb_affects_behavior(tmp_path, rng): + X = rng.uniform(size=(100, 5)) X[:, [1, 3]] = 0 - y = np.random.random(100) + y = rng.uniform(size=(100,)) names = [f"col_{i}" for i in range(5)] ds = lgb.Dataset(X, feature_name=names).construct() ds.set_label(y) @@ -433,10 +432,10 @@ def test_cegb_affects_behavior(tmp_path): assert basetxt != casetxt -def test_cegb_scaling_equalities(tmp_path): - X = np.random.random((100, 5)) +def test_cegb_scaling_equalities(tmp_path, rng): + X = rng.uniform(size=(100, 5)) X[:, [1, 3]] = 0 - y = np.random.random(100) + y = rng.uniform(size=(100,)) names = [f"col_{i}" for i in range(5)] ds = lgb.Dataset(X, feature_name=names).construct() ds.set_label(y) @@ -573,10 +572,10 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields(): np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True) -def test_dataset_construction_with_high_cardinality_categorical_succeeds(): +def test_dataset_construction_with_high_cardinality_categorical_succeeds(rng): pd = pytest.importorskip("pandas") - X = pd.DataFrame({"x1": np.random.randint(0, 5_000, 10_000)}) - y = np.random.rand(10_000) + X = pd.DataFrame({"x1": rng.integers(low=0, high=5_000, size=(10_000,))}) + y = rng.uniform(size=(10_000,)) ds = lgb.Dataset(X, y, categorical_feature=["x1"]) ds.construct() assert ds.num_data() == 10_000 @@ -663,11 +662,11 @@ def test_choose_param_value_objective(objective_alias): @pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_list_to_1d_numpy(collection, dtype): +def test_list_to_1d_numpy(collection, dtype, rng): collection2y = { - "1d_np": np.random.rand(10), - "2d_np": np.random.rand(10, 1), - "pd_float": np.random.rand(10), + "1d_np": rng.uniform(size=(10,)), + "2d_np": rng.uniform(size=(10, 1)), + "pd_float": rng.uniform(size=(10,)), "pd_str": ["a", "b"], "1d_list": [1] * 10, "2d_list": [[1], [2]], @@ -696,7 +695,7 @@ def test_list_to_1d_numpy(collection, dtype): @pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"]) -def test_init_score_for_multiclass_classification(init_score_type): +def test_init_score_for_multiclass_classification(init_score_type, rng): init_score = [[i * 10 + j for j in range(3)] for i in range(10)] if init_score_type == "array": init_score = np.array(init_score) @@ -704,7 +703,7 @@ def test_init_score_for_multiclass_classification(init_score_type): if not PANDAS_INSTALLED: pytest.skip("Pandas is not installed.") init_score = pd_DataFrame(init_score) - data = np.random.rand(10, 2) + data = rng.uniform(size=(10, 2)) ds = lgb.Dataset(data, 
init_score=init_score).construct() np.testing.assert_equal(ds.get_field("init_score"), init_score) np.testing.assert_equal(ds.init_score, init_score) @@ -741,16 +740,20 @@ def test_param_aliases(): def _bad_gradients(preds, _): - return np.random.randn(len(preds) + 1), np.random.rand(len(preds) + 1) + rng = np.random.default_rng() + # "bad" = 1 element too many + size = (len(preds) + 1,) + return rng.standard_normal(size=size), rng.uniform(size=size) def _good_gradients(preds, _): - return np.random.randn(*preds.shape), np.random.rand(*preds.shape) + rng = np.random.default_rng() + return rng.standard_normal(size=preds.shape), rng.uniform(size=preds.shape) -def test_custom_objective_safety(): +def test_custom_objective_safety(rng): nrows = 100 - X = np.random.randn(nrows, 5) + X = rng.standard_normal(size=(nrows, 5)) y_binary = np.arange(nrows) % 2 classes = [0, 1, 2] nclass = len(classes) @@ -771,9 +774,9 @@ def test_custom_objective_safety(): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"]) -def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): +def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng): pd = pytest.importorskip("pandas") - X = np.random.rand(10, 2).astype(dtype) + X = rng.uniform(size=(10, 2)).astype(dtype) df = pd.DataFrame(X) built_data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None @@ -784,9 +787,9 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"]) @pytest.mark.parametrize("categories", ["seen", "unseen"]) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories, rng): pd = pytest.importorskip("pandas") - X = np.random.choice(["a", "b"], 100).reshape(-1, 1) + X = rng.choice(a=["a", "b"], size=(100, 1)) column_name = "a" if feature_name == "auto" else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category") if categories == "seen": @@ -814,15 +817,15 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c @pytest.mark.parametrize("min_data_in_bin", [2, 10]) -def test_feature_num_bin(min_data_in_bin): +def test_feature_num_bin(min_data_in_bin, rng): X = np.vstack( [ - np.random.rand(100), + rng.uniform(size=(100,)), np.array([1, 2] * 50), np.array([0, 1, 2] * 33 + [0]), np.array([1, 2] * 49 + 2 * [np.nan]), np.zeros(100), - np.random.choice([0, 1], 100), + rng.choice(a=[0, 1], size=(100,)), ] ).T n_continuous = X.shape[1] - 1 @@ -862,9 +865,9 @@ def test_feature_num_bin(min_data_in_bin): ds.feature_num_bin(num_features) -def test_feature_num_bin_with_max_bin_by_feature(): - X = np.random.rand(100, 3) - max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1]) +def test_feature_num_bin_with_max_bin_by_feature(rng): + X = rng.uniform(size=(100, 3)) + max_bin_by_feature = rng.integers(low=3, high=30, size=X.shape[1]) ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct() actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])] np.testing.assert_equal(actual_num_bins, max_bin_by_feature) @@ -882,8 +885,8 @@ def test_set_leaf_output(): np.testing.assert_allclose(bst.predict(X), y_pred + 1) -def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(): +def 
test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(rng): ds = lgb.Dataset( - data=np.random.randn(100, 3), + data=rng.standard_normal(size=(100, 3)), ) assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"] diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 7b1009632..a0706e401 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -550,7 +550,7 @@ def test_multi_class_error(): @pytest.mark.skipif( getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version" ) -def test_auc_mu(): +def test_auc_mu(rng): # should give same result as binary auc for 2 classes X, y = load_digits(n_class=10, return_X_y=True) y_new = np.zeros((len(y))) @@ -578,7 +578,7 @@ def test_auc_mu(): assert results_auc_mu["training"]["auc_mu"][-1] == pytest.approx(0.5) # test that weighted data gives different auc_mu lgb_X = lgb.Dataset(X, label=y) - lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.abs(np.random.normal(size=y.shape))) + lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.abs(rng.standard_normal(size=y.shape))) results_unweighted = {} results_weighted = {} params = dict(params, num_classes=10, num_leaves=5) @@ -1432,9 +1432,9 @@ def test_feature_name(): assert feature_names == gbm.feature_name() -def test_feature_name_with_non_ascii(): - X_train = np.random.normal(size=(100, 4)) - y_train = np.random.random(100) +def test_feature_name_with_non_ascii(rng): + X_train = rng.normal(size=(100, 4)) + y_train = rng.normal(size=(100,)) # This has non-ascii strings. feature_names = ["F_零", "F_一", "F_二", "F_三"] params = {"verbose": -1} @@ -1448,9 +1448,14 @@ def test_feature_name_with_non_ascii(): assert feature_names == gbm2.feature_name() -def test_parameters_are_loaded_from_model_file(tmp_path, capsys): - X = np.hstack([np.random.rand(100, 1), np.random.randint(0, 5, (100, 2))]) - y = np.random.rand(100) +def test_parameters_are_loaded_from_model_file(tmp_path, capsys, rng): + X = np.hstack( + [ + rng.uniform(size=(100, 1)), + rng.integers(low=0, high=5, size=(100, 2)), + ] + ) + y = rng.uniform(size=(100,)) ds = lgb.Dataset(X, y) params = { "bagging_fraction": 0.8, @@ -1702,29 +1707,29 @@ def test_all_expected_params_are_written_out_to_model_text(tmp_path): assert param_str in model_txt_from_memory -def test_pandas_categorical(): +# why fixed seed? 
+# sometimes there is no difference how cols are treated (cat or not cat) +def test_pandas_categorical(rng_fixed_seed): pd = pytest.importorskip("pandas") - np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) X = pd.DataFrame( { - "A": np.random.permutation(["a", "b", "c", "d"] * 75), # str - "B": np.random.permutation([1, 2, 3] * 100), # int - "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float - "D": np.random.permutation([True, False] * 150), # bool - "E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True), + "A": rng_fixed_seed.permutation(["a", "b", "c", "d"] * 75), # str + "B": rng_fixed_seed.permutation([1, 2, 3] * 100), # int + "C": rng_fixed_seed.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float + "D": rng_fixed_seed.permutation([True, False] * 150), # bool + "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True), } ) # str and ordered categorical - y = np.random.permutation([0, 1] * 150) + y = rng_fixed_seed.permutation([0, 1] * 150) X_test = pd.DataFrame( { - "A": np.random.permutation(["a", "b", "e"] * 20), # unseen category - "B": np.random.permutation([1, 3] * 30), - "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), - "D": np.random.permutation([True, False] * 30), - "E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True), + "A": rng_fixed_seed.permutation(["a", "b", "e"] * 20), # unseen category + "B": rng_fixed_seed.permutation([1, 3] * 30), + "C": rng_fixed_seed.permutation([0.1, -0.1, 0.2, 0.2] * 15), + "D": rng_fixed_seed.permutation([True, False] * 30), + "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y"] * 30), ordered=True), } ) - np.random.seed() # reset seed cat_cols_actual = ["A", "B", "C", "D"] cat_cols_to_store = cat_cols_actual + ["E"] X[cat_cols_actual] = X[cat_cols_actual].astype("category") @@ -1786,21 +1791,21 @@ def test_pandas_categorical(): assert gbm7.pandas_categorical == cat_values -def test_pandas_sparse(): +def test_pandas_sparse(rng): pd = pytest.importorskip("pandas") X = pd.DataFrame( { - "A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)), - "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), - "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)), + "A": pd.arrays.SparseArray(rng.permutation([0, 1, 2] * 100)), + "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), + "C": pd.arrays.SparseArray(rng.permutation([True, False] * 150)), } ) - y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150))) + y = pd.Series(pd.arrays.SparseArray(rng.permutation([0, 1] * 150))) X_test = pd.DataFrame( { - "A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)), - "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), - "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)), + "A": pd.arrays.SparseArray(rng.permutation([0, 2] * 30)), + "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1] * 15)), + "C": pd.arrays.SparseArray(rng.permutation([True, False] * 30)), } ) for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): @@ -1816,9 +1821,9 @@ def test_pandas_sparse(): np.testing.assert_allclose(pred_sparse, pred_dense) -def test_reference_chain(): - X = np.random.normal(size=(100, 2)) - y = np.random.normal(size=100) +def test_reference_chain(rng): + X = rng.normal(size=(100, 2)) + y = rng.normal(size=(100,)) tmp_dat = 
lgb.Dataset(X, y)
     # take subsets and train
     tmp_dat_train = tmp_dat.subset(np.arange(80))
@@ -1940,28 +1945,28 @@ def test_contribs_sparse_multiclass():
     np.testing.assert_allclose(contribs_csc_array, contribs_dense)
 
 
-@pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason="not enough RAM")
-def test_int32_max_sparse_contribs():
-    params = {"objective": "binary"}
-    train_features = np.random.rand(100, 1000)
-    train_targets = [0] * 50 + [1] * 50
-    lgb_train = lgb.Dataset(train_features, train_targets)
-    gbm = lgb.train(params, lgb_train, num_boost_round=2)
-    csr_input_shape = (3000000, 1000)
-    test_features = csr_matrix(csr_input_shape)
-    for i in range(0, csr_input_shape[0], csr_input_shape[0] // 6):
-        for j in range(0, 1000, 100):
-            test_features[i, j] = random.random()
-    y_pred_csr = gbm.predict(test_features, pred_contrib=True)
-    # Note there is an extra column added to the output for the expected value
-    csr_output_shape = (csr_input_shape[0], csr_input_shape[1] + 1)
-    assert y_pred_csr.shape == csr_output_shape
-    y_pred_csc = gbm.predict(test_features.tocsc(), pred_contrib=True)
-    # Note output CSC shape should be same as CSR output shape
-    assert y_pred_csc.shape == csr_output_shape
+@pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason="not enough RAM")
+def test_int32_max_sparse_contribs(rng):
+    params = {"objective": "binary"}
+    train_features = rng.uniform(size=(100, 1000))
+    train_targets = [0] * 50 + [1] * 50
+    lgb_train = lgb.Dataset(train_features, train_targets)
+    gbm = lgb.train(params, lgb_train, num_boost_round=2)
+    csr_input_shape = (3000000, 1000)
+    test_features = csr_matrix(csr_input_shape)
+    for i in range(0, csr_input_shape[0], csr_input_shape[0] // 6):
+        for j in range(0, 1000, 100):
+            test_features[i, j] = random.random()
+    y_pred_csr = gbm.predict(test_features, pred_contrib=True)
+    # Note there is an extra column added to the output for the expected value
+    csr_output_shape = (csr_input_shape[0], csr_input_shape[1] + 1)
+    assert y_pred_csr.shape == csr_output_shape
+    y_pred_csc = gbm.predict(test_features.tocsc(), pred_contrib=True)
+    # Note output CSC shape should be same as CSR output shape
+    assert y_pred_csc.shape == csr_output_shape
 
 
-def test_sliced_data():
+def test_sliced_data(rng):
     def train_and_get_predictions(features, labels):
         dataset = lgb.Dataset(features, label=labels)
         lgb_params = {
@@ -1977,7 +1982,7 @@
         return gbm.predict(features)
 
     num_samples = 100
-    features = np.random.rand(num_samples, 5)
+    features = rng.uniform(size=(num_samples, 5))
     positive_samples = int(num_samples * 0.25)
     labels = np.append(
         np.ones(positive_samples, dtype=np.float32), np.zeros(num_samples - positive_samples, dtype=np.float32)
@@ -2011,13 +2016,13 @@
     np.testing.assert_allclose(origin_pred, sliced_pred)
 
 
-def test_init_with_subset():
-    data = np.random.random((50, 2))
+def test_init_with_subset(rng):
+    data = rng.uniform(size=(50, 2))
     y = [1] * 25 + [0] * 25
     lgb_train = lgb.Dataset(data, y, free_raw_data=False)
-    subset_index_1 = np.random.choice(np.arange(50), 30, replace=False)
+    subset_index_1 = rng.choice(a=np.arange(50), size=30, replace=False)
     subset_data_1 = lgb_train.subset(subset_index_1)
-    subset_index_2 = np.random.choice(np.arange(50), 20, replace=False)
+    subset_index_2 = rng.choice(a=np.arange(50), size=20, replace=False)
     subset_data_2 = lgb_train.subset(subset_index_2)
     params = {"objective": "binary", "verbose": -1}
     init_gbm = 
lgb.train(params=params, train_set=subset_data_1, num_boost_round=10, keep_training_booster=True) @@ -2037,9 +2042,9 @@ def test_init_with_subset(): assert subset_data_4.get_data() == "lgb_train_data.bin" -def test_training_on_constructed_subset_without_params(): - X = np.random.random((100, 10)) - y = np.random.random(100) +def test_training_on_constructed_subset_without_params(rng): + X = rng.uniform(size=(100, 10)) + y = rng.uniform(size=(100,)) lgb_data = lgb.Dataset(X, y) subset_indices = [1, 2, 3, 4] subset = lgb_data.subset(subset_indices).construct() @@ -2051,9 +2056,10 @@ def test_training_on_constructed_subset_without_params(): def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): number_of_dpoints = 3000 - x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) - x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) - x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + rng = np.random.default_rng() + x1_positively_correlated_with_y = rng.uniform(size=number_of_dpoints) + x2_negatively_correlated_with_y = rng.uniform(size=number_of_dpoints) + x3_negatively_correlated_with_y = rng.uniform(size=number_of_dpoints) x = np.column_stack( ( x1_positively_correlated_with_y, @@ -2062,8 +2068,8 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): ) ) - zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) - scales = 10.0 * (np.random.random(6) + 0.5) + zs = rng.normal(loc=0.0, scale=0.01, size=number_of_dpoints) + scales = 10.0 * (rng.uniform(size=6) + 0.5) y = ( scales[0] * x1_positively_correlated_with_y + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y) @@ -2265,9 +2271,8 @@ def test_max_bin_by_feature(): assert len(np.unique(est.predict(X))) == 3 -def test_small_max_bin(): - np.random.seed(0) - y = np.random.choice([0, 1], 100) +def test_small_max_bin(rng_fixed_seed): + y = rng_fixed_seed.choice([0, 1], 100) x = np.ones((100, 1)) x[:30, 0] = -1 x[60:, 0] = 2 @@ -2278,7 +2283,6 @@ def test_small_max_bin(): params["max_bin"] = 3 lgb_x = lgb.Dataset(x, label=y) lgb.train(params, lgb_x, num_boost_round=5) - np.random.seed() # reset seed def test_refit(): @@ -2293,14 +2297,14 @@ def test_refit(): assert err_pred > new_err_pred -def test_refit_dataset_params(): +def test_refit_dataset_params(rng): # check refit accepts dataset_params X, y = load_breast_cancer(return_X_y=True) lgb_train = lgb.Dataset(X, y, init_score=np.zeros(y.size)) train_params = {"objective": "binary", "verbose": -1, "seed": 123} gbm = lgb.train(train_params, lgb_train, num_boost_round=10) non_weight_err_pred = log_loss(y, gbm.predict(X)) - refit_weight = np.random.rand(y.shape[0]) + refit_weight = rng.uniform(size=(y.shape[0],)) dataset_params = { "max_bin": 260, "min_data_in_bin": 5, @@ -3011,7 +3015,7 @@ def test_model_size(): @pytest.mark.skipif( getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version" ) -def test_get_split_value_histogram(): +def test_get_split_value_histogram(rng_fixed_seed): X, y = make_synthetic_regression() X = np.repeat(X, 3, axis=0) y = np.repeat(y, 3, axis=0) @@ -3351,7 +3355,7 @@ def test_binning_same_sign(): assert predicted[1] == pytest.approx(predicted[2]) -def test_dataset_update_params(): +def test_dataset_update_params(rng): default_params = { "max_bin": 100, "max_bin_by_feature": [20, 10], @@ -3400,8 +3404,8 @@ def test_dataset_update_params(): "linear_tree": True, "precise_float_parser": False, } - X = 
np.random.random((100, 2)) - y = np.random.random(100) + X = rng.uniform(size=(100, 2)) + y = rng.uniform(size=(100,)) # decreasing without freeing raw data is allowed lgb_data = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct() @@ -3443,12 +3447,12 @@ def test_dataset_update_params(): lgb.train(new_params, lgb_data, num_boost_round=3) -def test_dataset_params_with_reference(): +def test_dataset_params_with_reference(rng): default_params = {"max_bin": 100} - X = np.random.random((100, 2)) - y = np.random.random(100) - X_val = np.random.random((100, 2)) - y_val = np.random.random(100) + X = rng.uniform(size=(100, 2)) + y = rng.uniform(size=(100,)) + X_val = rng.uniform(size=(100, 2)) + y_val = rng.uniform(size=(100,)) lgb_train = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct() lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False).construct() assert lgb_train.get_params() == default_params @@ -3486,7 +3490,7 @@ def test_path_smoothing(): assert err < err_new -def test_trees_to_dataframe(): +def test_trees_to_dataframe(rng): pytest.importorskip("pandas") def _imptcs_to_numpy(X, impcts_dict): @@ -3516,7 +3520,7 @@ def test_trees_to_dataframe(): # test edge case with one leaf X = np.ones((10, 2)) - y = np.random.rand(10) + y = rng.uniform(size=(10,)) data = lgb.Dataset(X, label=y) bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees) tree_df = bst.trees_to_dataframe() @@ -3574,11 +3578,10 @@ def test_interaction_constraints(): ) -def test_linear_trees_num_threads(): +def test_linear_trees_num_threads(rng_fixed_seed): # check that number of threads does not affect result - np.random.seed(0) x = np.arange(0, 1000, 0.1) - y = 2 * x + np.random.normal(0, 0.1, len(x)) + y = 2 * x + rng_fixed_seed.normal(loc=0, scale=0.1, size=(len(x),)) x = x[:, np.newaxis] lgb_train = lgb.Dataset(x, label=y) params = {"verbose": -1, "objective": "regression", "seed": 0, "linear_tree": True, "num_threads": 2} @@ -3590,11 +3593,10 @@ def test_linear_trees_num_threads(): np.testing.assert_allclose(pred1, pred2) -def test_linear_trees(tmp_path): +def test_linear_trees(tmp_path, rng_fixed_seed): # check that setting linear_tree=True fits better than ordinary trees when data has linear relationship - np.random.seed(0) x = np.arange(0, 100, 0.1) - y = 2 * x + np.random.normal(0, 0.1, len(x)) + y = 2 * x + rng_fixed_seed.normal(0, 0.1, len(x)) x = x[:, np.newaxis] lgb_train = lgb.Dataset(x, label=y) params = {"verbose": -1, "metric": "mse", "seed": 0, "num_leaves": 2} @@ -4099,21 +4101,20 @@ def test_record_evaluation_with_cv(train_metric): np.testing.assert_allclose(cv_hist[key], eval_result[dataset][f"{metric}-{agg}"]) -def test_pandas_with_numpy_regular_dtypes(): +def test_pandas_with_numpy_regular_dtypes(rng_fixed_seed): pd = pytest.importorskip("pandas") uints = ["uint8", "uint16", "uint32", "uint64"] ints = ["int8", "int16", "int32", "int64"] bool_and_floats = ["bool", "float16", "float32", "float64"] - rng = np.random.RandomState(42) n_samples = 100 # data as float64 df = pd.DataFrame( { - "x1": rng.randint(0, 2, n_samples), - "x2": rng.randint(1, 3, n_samples), - "x3": 10 * rng.randint(1, 3, n_samples), - "x4": 100 * rng.randint(1, 3, n_samples), + "x1": rng_fixed_seed.integers(low=0, high=2, size=n_samples), + "x2": rng_fixed_seed.integers(low=1, high=3, size=n_samples), + "x3": 10 * rng_fixed_seed.integers(low=1, high=3, size=n_samples), + "x4": 100 * rng_fixed_seed.integers(low=1, high=3, size=n_samples), } ) df = 
df.astype(np.float64) @@ -4139,15 +4140,14 @@ def test_pandas_with_numpy_regular_dtypes(): np.testing.assert_allclose(preds, preds2) -def test_pandas_nullable_dtypes(): +def test_pandas_nullable_dtypes(rng_fixed_seed): pd = pytest.importorskip("pandas") - rng = np.random.RandomState(0) df = pd.DataFrame( { - "x1": rng.randint(1, 3, size=100), + "x1": rng_fixed_seed.integers(low=1, high=3, size=100), "x2": np.linspace(-1, 1, 100), - "x3": pd.arrays.SparseArray(rng.randint(0, 11, size=100)), - "x4": rng.rand(100) < 0.5, + "x3": pd.arrays.SparseArray(rng_fixed_seed.integers(low=0, high=11, size=100)), + "x4": rng_fixed_seed.uniform(size=(100,)) < 0.5, } ) # introduce some missing values @@ -4219,7 +4219,7 @@ def test_boost_from_average_with_single_leaf_trees(): assert y.min() <= mean_preds <= y.max() -def test_cegb_split_buffer_clean(): +def test_cegb_split_buffer_clean(rng_fixed_seed): # modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811 # and https://github.com/microsoft/LightGBM/pull/5087 # test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree @@ -4228,11 +4228,9 @@ def test_cegb_split_buffer_clean(): # Check failed: (best_split_info.left_count) > (0) R, C = 1000, 100 - seed = 29 - np.random.seed(seed) - data = np.random.randn(R, C) + data = rng_fixed_seed.standard_normal(size=(R, C)) for i in range(1, C): - data[i] += data[0] * np.random.randn() + data[i] += data[0] * rng_fixed_seed.standard_normal() N = int(0.8 * len(data)) train_data = data[:N] diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 2fc127b52..a995bfcae 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -340,7 +340,7 @@ def test_grid_search(): assert evals_result == grid.best_estimator_.evals_result_ -def test_random_search(): +def test_random_search(rng): X, y = load_iris(return_X_y=True) y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -349,8 +349,8 @@ def test_random_search(): params = {"subsample": 0.8, "subsample_freq": 1} param_dist = { "boosting_type": ["rf", "gbdt"], - "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)], - "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)], + "n_estimators": rng.integers(low=3, high=10, size=(n_iter,)).tolist(), + "reg_alpha": rng.uniform(low=0.01, high=0.06, size=(n_iter,)).tolist(), } fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]} rand = RandomizedSearchCV( @@ -556,29 +556,29 @@ def test_feature_importances_type(): assert importance_split_top1 != importance_gain_top1 -def test_pandas_categorical(): +# why fixed seed? 
+# sometimes there is no difference how cols are treated (cat or not cat) +def test_pandas_categorical(rng_fixed_seed): pd = pytest.importorskip("pandas") - np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat) X = pd.DataFrame( { - "A": np.random.permutation(["a", "b", "c", "d"] * 75), # str - "B": np.random.permutation([1, 2, 3] * 100), # int - "C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float - "D": np.random.permutation([True, False] * 150), # bool - "E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True), + "A": rng_fixed_seed.permutation(["a", "b", "c", "d"] * 75), # str + "B": rng_fixed_seed.permutation([1, 2, 3] * 100), # int + "C": rng_fixed_seed.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float + "D": rng_fixed_seed.permutation([True, False] * 150), # bool + "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True), } ) # str and ordered categorical - y = np.random.permutation([0, 1] * 150) + y = rng_fixed_seed.permutation([0, 1] * 150) X_test = pd.DataFrame( { - "A": np.random.permutation(["a", "b", "e"] * 20), # unseen category - "B": np.random.permutation([1, 3] * 30), - "C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15), - "D": np.random.permutation([True, False] * 30), - "E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True), + "A": rng_fixed_seed.permutation(["a", "b", "e"] * 20), # unseen category + "B": rng_fixed_seed.permutation([1, 3] * 30), + "C": rng_fixed_seed.permutation([0.1, -0.1, 0.2, 0.2] * 15), + "D": rng_fixed_seed.permutation([True, False] * 30), + "E": pd.Categorical(rng_fixed_seed.permutation(["z", "y"] * 30), ordered=True), } ) - np.random.seed() # reset seed cat_cols_actual = ["A", "B", "C", "D"] cat_cols_to_store = cat_cols_actual + ["E"] X[cat_cols_actual] = X[cat_cols_actual].astype("category") @@ -620,21 +620,21 @@ def test_pandas_categorical(): assert gbm6.booster_.pandas_categorical == cat_values -def test_pandas_sparse(): +def test_pandas_sparse(rng): pd = pytest.importorskip("pandas") X = pd.DataFrame( { - "A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)), - "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), - "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)), + "A": pd.arrays.SparseArray(rng.permutation([0, 1, 2] * 100)), + "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)), + "C": pd.arrays.SparseArray(rng.permutation([True, False] * 150)), } ) - y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150))) + y = pd.Series(pd.arrays.SparseArray(rng.permutation([0, 1] * 150))) X_test = pd.DataFrame( { - "A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)), - "B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)), - "C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)), + "A": pd.arrays.SparseArray(rng.permutation([0, 2] * 30)), + "B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1] * 15)), + "C": pd.arrays.SparseArray(rng.permutation([True, False] * 30)), } ) for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]): @@ -1073,11 +1073,11 @@ def test_multiple_eval_metrics(): assert "binary_logloss" in gbm.evals_result_["training"] -def test_nan_handle(): +def test_nan_handle(rng): nrows = 100 ncols = 10 - X = np.random.randn(nrows, ncols) - y = np.random.randn(nrows) + np.full(nrows, 1e30) + X = 
rng.standard_normal(size=(nrows, ncols)) + y = rng.standard_normal(size=(nrows,)) + np.full(nrows, 1e30) weight = np.zeros(nrows) params = {"n_estimators": 20, "verbose": -1} params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]} @@ -1410,13 +1410,13 @@ def test_validate_features(task): @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"]) @pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"]) -def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task): +def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng): if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED: pytest.skip("pandas is not installed") if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED: pytest.skip("datatable is not installed") X, y, g = _create_data(task, n_samples=2_000) - weights = np.abs(np.random.randn(y.shape[0])) + weights = np.abs(rng.standard_normal(size=(y.shape[0],))) if task == "binary-classification" or task == "regression": init_score = np.full_like(y, np.mean(y)) @@ -1487,13 +1487,13 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data @pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"]) @pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"]) @pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"]) -def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type): +def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng): if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED: pytest.skip("pandas is not installed") if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED: pytest.skip("datatable is not installed") X, y, g = _create_data(task="ranking", n_samples=1_000) - weights = np.abs(np.random.randn(y.shape[0])) + weights = np.abs(rng.standard_normal(size=(y.shape[0],))) init_score = np.full_like(y, np.mean(y)) X_valid = X * 2
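
Reviewer note (not part of the patch): the sketch below summarizes the legacy-to-Generator conversions this change applies throughout, assuming numpy>=1.17.0. The `clf` construction at the end is illustrative only; it reflects the updated `random_state` handling in sklearn.py above, which reduces a Generator to an int seed via `rng.integers(np.iinfo(np.int32).max)`.

    import numpy as np
    import lightgbm as lgb

    # One Generator per scope replaces module-level seeding (np.random.seed).
    rng = np.random.default_rng(seed=42)

    u = rng.uniform(size=(100, 5))             # was: np.random.rand(100, 5) / np.random.random((100, 5))
    z = rng.standard_normal(size=(100,))       # was: np.random.randn(100)
    k = rng.integers(low=0, high=5, size=100)  # was: np.random.randint(0, 5, 100); high is exclusive in both
    c = rng.choice(a=[0, 1], size=100)         # was: np.random.choice([0, 1], 100)
    p = rng.permutation([1, 2, 3] * 10)        # was: np.random.permutation([1, 2, 3] * 10)

    # With the sklearn.py change, a Generator is accepted directly:
    clf = lgb.LGBMClassifier(random_state=np.random.default_rng(7))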