зеркало из https://github.com/py-why/EconML.git
Add model selection tests
Signed-off-by: Keith Battocchi <kebatt@microsoft.com>
This commit is contained in:
@@ -0,0 +1,135 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.
import unittest
import numpy as np
from scipy.special import expit
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (ElasticNetCV, Lasso, LassoCV, LinearRegression, LogisticRegression,
LogisticRegressionCV, MultiTaskElasticNetCV, MultiTaskLassoCV,
RidgeCV, RidgeClassifierCV)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from econml.dml import LinearDML
from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper
class TestModelSelection(unittest.TestCase):
def _simple_dgp(self, n, d_x, d_w, discrete_treatment):
n = 500 # keep the data set small since we're testing a lot of models and don't care about the results
X = np.random.normal(size=(n, d_x))
W = np.random.normal(size=(n, d_w))
alpha = np.random.normal(size=(X.shape[1]))
n_f = d_w + d_x
beta = np.random.normal(size=(n_f,))
gamma = np.random.normal(size=(n_f,))
XW = np.hstack([X, W])
if discrete_treatment:
T = np.random.binomial(1, expit(XW @ beta))
T = XW @ beta + np.random.normal(size=(n,))
Y = (X @ alpha) * T + XW @ gamma + np.random.normal(size=(n,))
return Y, T, X, W
def test_poly(self):
# tests that we can recover the right degree of polynomial features
# implicitly also tests ability to handle pipelines
# since 'poly' uses pipelines containing PolynomialFeatures
n = 5000
X = np.random.normal(size=(n, 2))
W = np.random.normal(size=(n, 3))
for true_d in range(1, 4):
with self.subTest(true_d=true_d):
pf = PolynomialFeatures(degree=true_d)
fts = pf.fit_transform(np.hstack([X, W]))
k = fts.shape[1]
m = X.shape[1] + W.shape[1]
alpha_x = np.random.normal(size=(X.shape[1],))
alpha_1 = np.random.normal(size=())
beta = np.random.normal(size=(k,))
gamma = np.random.normal(size=(k,))
# generate larger coefficients in a set of high degree features,
# weighted towards higher degree features
ft_inds_beta = np.random.choice(k, size=m, replace=False, p=np.arange(k) / np.sum(np.arange(k)))
ft_inds_gamma = np.random.choice(k, size=m, replace=False, p=np.arange(k) / np.sum(np.arange(k)))
beta[ft_inds_beta] = 10 * np.random.normal(1, size=(m,))
gamma[ft_inds_gamma] = 10 * np.random.normal(1, size=(m,))
t = np.random.normal(size=(n,)) + fts @ beta + np.random.normal(scale=0.5, size=(n,))
y = np.random.normal(size=(n,)) + t * (alpha_1 + X @ alpha_x) + fts @ gamma
# just test a polynomial T model, since for Y the correct degree also depends on
# the interation of T and X
mdl = LinearDML(model_t='poly',
model_y=LinearRegression()).fit(y, t, X=X, W=W)
for t in mdl.models_t[0]:
self.assertEqual(t[0].degree, true_d)
def test_all_strings(self):
for discrete_treatment in [True, False]:
Y, T, X, W = self._simple_dgp(500, 2, 3, discrete_treatment)
for model_t in ['auto', 'linear', 'poly', 'forest', 'gbf', 'nnet', 'automl']:
with self.subTest(model_t=model_t, discrete_treatment=discrete_treatment):
mdl = LinearDML(model_t=model_t,
mdl.fit(Y, T, X=X, W=W)
model_t = 'some_random_string'
with self.subTest(model_t=model_t, discrete_treatment=True):
mdl = LinearDML(model_t=model_t,
with self.assertRaises(ValueError):
mdl.fit(Y, T, X=X, W=W)
def test_list_selection(self):
Y, T, X, W = self._simple_dgp(500, 2, 3, False)
# test corner case with just one model in a list
mdl = LinearDML(model_t=[LinearRegression()],
mdl.fit(Y, T, X=X, W=W)
# test corner case with empty list
with self.assertRaises(Exception):
mdl = LinearDML(model_t=[],
mdl.fit(Y, T, X=X, W=W)
# test selecting between two fixed models
mdl = LinearDML(model_t=[LinearRegression(), RandomForestRegressor()],
mdl.fit(Y, T, X=X, W=W)
# DGP is a linear model, so linear regression should fit better
assert isinstance(mdl.models_t[0][0], LinearRegression)
T2 = T + 10 * (X[:, 1] > 0) # add a non-linear effect
mdl.fit(Y, T2, X=X, W=W)
# DGP is now non-linear, so random forest should fit better
assert isinstance(mdl.models_t[0][0], RandomForestRegressor)
def test_sklearn_model_selection(self):
for is_discrete, mdls in [(True, [LogisticRegressionCV(), RidgeClassifierCV(),
GridSearchCV(LogisticRegression(), {'C': [1, 10]}),
RandomizedSearchCV(LogisticRegression(), {'C': [1, 10]})]),
(False, [ElasticNetCV(), LassoCV(), RidgeCV(),
MultiTaskElasticNetCV(), MultiTaskLassoCV(), WeightedLassoCVWrapper(),
GridSearchCV(Lasso(), {'alpha': [0.1, 1]}),
RandomizedSearchCV(Lasso(), {'alpha': [0.1, 1]})])]:
Y, T, X, W = self._simple_dgp(500, 2, 3, is_discrete)
T2 = np.tile(T.reshape(-1, 1), (1, 2)) # multi-column T
for mdl in mdls:
# these models only work on multi-output data
use_array = isinstance(mdl, (MultiTaskElasticNetCV, MultiTaskLassoCV))
with self.subTest(model=mdl):
est = LinearDML(model_t=mdl,
est.fit(Y, T2 if use_array else T, X=X, W=W)
Ссылка в новой задаче