diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index a62a29af8..a62f552b6 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -372,7 +372,7 @@ def _predict_part(
 
     # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series
     if isinstance(part, pd_DataFrame):
-        if pred_proba or pred_contrib:
+        if pred_proba or pred_contrib or pred_leaf:
             result = pd_DataFrame(result, index=part.index)
         else:
             result = pd_Series(result, index=part.index, name='predictions')
diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index da91c4ef2..7c3815b4f 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -223,6 +223,7 @@ def test_classifier(output, centers, client, listen_port):
     dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
     p1 = dask_classifier.predict(dX)
     p1_proba = dask_classifier.predict_proba(dX).compute()
+    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
     p1_local = dask_classifier.to_local().predict(X)
     s1 = _accuracy_score(dy, p1)
     p1 = p1.compute()
@@ -241,6 +242,17 @@ def test_classifier(output, centers, client, listen_port):
     assert_eq(p1_local, p2)
     assert_eq(y, p1_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_classifier.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
     if output == 'dataframe-with-categorical':
@@ -380,6 +392,8 @@ def test_regressor(output, client, listen_port):
     )
     dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
     p1 = dask_regressor.predict(dX)
+    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
+
     if not output.startswith('dataframe'):
         s1 = _r2_score(dy, p1)
         p1 = p1.compute()
@@ -399,6 +413,17 @@ def test_regressor(output, client, listen_port):
     # Predictions should be roughly the same.
     assert_eq(p1, p1_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_regressor.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # The checks below are skipped
     # for the categorical data case because it's difficult to get
     # a good fit from just categoricals for a regression problem
@@ -582,6 +607,7 @@ def test_ranker(output, client, listen_port, group):
     dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
     rnkvec_dask = dask_ranker.predict(dX)
     rnkvec_dask = rnkvec_dask.compute()
+    p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
     rnkvec_dask_local = dask_ranker.to_local().predict(X)
 
     local_ranker = lgb.LGBMRanker(**params)
@@ -595,6 +621,17 @@ def test_ranker(output, client, listen_port, group):
     assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
     assert_eq(rnkvec_dask, rnkvec_dask_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_ranker.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
     if output == 'dataframe-with-categorical':
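
For context, a rough sketch of why the `_predict_part()` change wraps `pred_leaf` output in a DataFrame: with `pred_leaf=True`, LightGBM returns one leaf index per tree for each row, i.e. a 2-D array of shape (n_samples, num_trees), which cannot be represented as a 1-D pandas Series. The sketch below uses the non-Dask scikit-learn interface; `make_blobs` and the parameter values are illustrative assumptions, not taken from the patch.

    import lightgbm as lgb
    from sklearn.datasets import make_blobs

    # toy binary-classification data (illustrative sizes only)
    X, y = make_blobs(n_samples=500, centers=2, random_state=42)

    clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=31).fit(X, y)

    # pred_leaf=True yields the index of the leaf each sample falls into,
    # one column per tree, so the result is 2-D rather than a prediction vector
    leaf_idx = clf.predict(X, pred_leaf=True)
    print(leaf_idx.shape)  # (500, 10) for this 10-tree binary model
    print(leaf_idx.min(), leaf_idx.max())  # leaf indices lie in [0, num_leaves - 1]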