[dask] Add support for 'pred_leaf' in Dask estimators (fixes #3792) (#3919)

* fix tests

* fix tests

* fix test comments

* simplify tests

* Apply suggestions from code review
James Lamb 2021-02-07 13:17:28 -06:00, committed by GitHub
Parent 84b519b77c
Commit 37485fff5d
No known key found for this signature
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 1 deletion
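For context, here is a rough sketch of how the new option can be exercised from the Dask estimators. It assumes a lightgbm build that ships the Dask estimators in the main package; the LocalCluster setup and the synthetic dX/dy data are illustrative and not part of this commit.

    import dask.array as da
    import numpy as np
    from distributed import Client, LocalCluster

    import lightgbm as lgb

    # illustrative two-worker cluster and synthetic data, not from this commit
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)

    X = np.random.random((1_000, 10))
    y = (X[:, 0] > 0.5).astype(int)
    dX = da.from_array(X, chunks=(250, 10))
    dy = da.from_array(y, chunks=250)

    clf = lgb.DaskLGBMClassifier(n_estimators=10, num_leaves=15)
    clf = clf.fit(dX, dy)

    # with pred_leaf=True, predict() returns the index of the leaf each sample
    # falls into in every tree, as a Dask Array of shape
    # (n_samples, booster_.num_trees())
    leaf_indices = clf.predict(dX, pred_leaf=True).compute()
    print(leaf_indices.shape)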

View File

@@ -372,7 +372,7 @@ def _predict_part(
     # dask.DataFrame.map_partitions() expects each call to return a pandas DataFrame or Series
     if isinstance(part, pd_DataFrame):
-        if pred_proba or pred_contrib:
+        if pred_proba or pred_contrib or pred_leaf:
             result = pd_DataFrame(result, index=part.index)
         else:
             result = pd_Series(result, index=part.index, name='predictions')
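The one-line change above is needed because pred_leaf, like pred_proba and pred_contrib, produces one column per tree rather than a single prediction column, so each partition's result has to be wrapped in a pandas DataFrame rather than a Series. A minimal illustration with the non-distributed sklearn API (dataset and parameters are made up for the example):

    import numpy as np
    import pandas as pd

    import lightgbm as lgb

    X = pd.DataFrame(np.random.random((100, 5)))
    y = np.random.random(100)

    reg = lgb.LGBMRegressor(n_estimators=5, num_leaves=7).fit(X, y)

    # pred_leaf output is 2D: one leaf index per tree for every row
    leaf_idx = reg.predict(X, pred_leaf=True)
    print(leaf_idx.shape)  # (100, 5)

    # a Series can only hold the 1D case, so the per-partition result
    # is wrapped in a DataFrame instead
    wrapped = pd.DataFrame(leaf_idx, index=X.index)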

View File

@@ -223,6 +223,7 @@ def test_classifier(output, centers, client, listen_port):
     dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
     p1 = dask_classifier.predict(dX)
     p1_proba = dask_classifier.predict_proba(dX).compute()
+    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
     p1_local = dask_classifier.to_local().predict(X)
     s1 = _accuracy_score(dy, p1)
     p1 = p1.compute()
@@ -241,6 +242,17 @@ def test_classifier(output, centers, client, listen_port):
     assert_eq(p1_local, p2)
     assert_eq(y, p1_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_classifier.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
     if output == 'dataframe-with-categorical':
@@ -380,6 +392,8 @@ def test_regressor(output, client, listen_port):
     )
     dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
     p1 = dask_regressor.predict(dX)
+    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
     if not output.startswith('dataframe'):
         s1 = _r2_score(dy, p1)
         p1 = p1.compute()
@@ -399,6 +413,17 @@ def test_regressor(output, client, listen_port):
     # Predictions should be roughly the same.
     assert_eq(p1, p1_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_regressor.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # The checks below are skipped
     # for the categorical data case because it's difficult to get
     # a good fit from just categoricals for a regression problem
@@ -582,6 +607,7 @@ def test_ranker(output, client, listen_port, group):
     dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
     rnkvec_dask = dask_ranker.predict(dX)
     rnkvec_dask = rnkvec_dask.compute()
+    p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
     rnkvec_dask_local = dask_ranker.to_local().predict(X)
 
     local_ranker = lgb.LGBMRanker(**params)
@@ -595,6 +621,17 @@ def test_ranker(output, client, listen_port, group):
     assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
     assert_eq(rnkvec_dask, rnkvec_dask_local)
 
+    # pred_leaf values should have the right shape
+    # and values that look like valid tree nodes
+    pred_leaf_vals = p1_pred_leaf.compute()
+    assert pred_leaf_vals.shape == (
+        X.shape[0],
+        dask_ranker.booster_.num_trees()
+    )
+    assert np.max(pred_leaf_vals) <= params['num_leaves']
+    assert np.min(pred_leaf_vals) >= 0
+    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']
+
     # be sure LightGBM actually used at least one categorical column,
     # and that it was correctly treated as a categorical feature
     if output == 'dataframe-with-categorical':