Revise gcm auto assignment logic
Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
This commit is contained in:
Родитель
2ed7cf4e93
Коммит
c9d9c3a6f0
|
@ -12,6 +12,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
|
|||
from sklearn.model_selection import KFold, train_test_split
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
|
||||
from dowhy.gcm import config
|
||||
from dowhy.gcm.cms import ProbabilisticCausalModel
|
||||
from dowhy.gcm.fcms import AdditiveNoiseModel, ClassificationModel, ClassifierFCM, PredictionModel
|
||||
from dowhy.gcm.graph import CAUSAL_MECHANISM, get_ordered_predecessors, is_root_node, validate_causal_model_assignment
|
||||
|
@ -36,7 +37,6 @@ from dowhy.gcm.ml.classification import (
|
|||
)
|
||||
from dowhy.gcm.ml.regression import (
|
||||
create_ada_boost_regressor,
|
||||
create_elastic_net_regressor,
|
||||
create_extra_trees_regressor,
|
||||
create_knn_regressor,
|
||||
create_polynom_regressor,
|
||||
|
@ -50,25 +50,28 @@ from dowhy.gcm.util.general import (
|
|||
shape_into_2d,
|
||||
)
|
||||
|
||||
_LIST_OF_POTENTIAL_CLASSIFIERS = [
|
||||
_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [
|
||||
partial(create_logistic_regression_classifier, max_iter=1000),
|
||||
partial(create_polynom_logistic_regression_classifier, max_iter=1000),
|
||||
create_random_forest_classifier,
|
||||
create_hist_gradient_boost_classifier,
|
||||
]
|
||||
_LIST_OF_POTENTIAL_REGRESSORS_GOOD = [
|
||||
create_linear_regressor,
|
||||
create_hist_gradient_boost_regressor,
|
||||
]
|
||||
|
||||
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER = _LIST_OF_POTENTIAL_CLASSIFIERS_GOOD + [
|
||||
create_random_forest_classifier,
|
||||
create_extra_trees_classifier,
|
||||
create_support_vector_classifier,
|
||||
create_knn_classifier,
|
||||
create_gaussian_nb_classifier,
|
||||
create_ada_boost_classifier,
|
||||
]
|
||||
_LIST_OF_POTENTIAL_REGRESSORS = [
|
||||
create_linear_regressor,
|
||||
_LIST_OF_POTENTIAL_REGRESSORS_BETTER = _LIST_OF_POTENTIAL_REGRESSORS_GOOD + [
|
||||
create_ridge_regressor,
|
||||
create_polynom_regressor,
|
||||
partial(create_lasso_regressor, max_iter=5000),
|
||||
partial(create_elastic_net_regressor, max_iter=5000),
|
||||
create_random_forest_regressor,
|
||||
create_hist_gradient_boost_regressor,
|
||||
create_support_vector_regressor,
|
||||
create_extra_trees_regressor,
|
||||
create_knn_regressor,
|
||||
|
@ -96,8 +99,8 @@ def assign_causal_mechanisms(
|
|||
:param based_on: Jointly sampled data corresponding to the nodes of the given graph.
|
||||
:param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of
|
||||
prediction model and time spent on the selection. Options are:
|
||||
- AssignmentQuality.GOOD: Checks whether the data is linear. If the data is linear, an OLS model is
|
||||
used, otherwise a gradient boost model.
|
||||
- AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on a small test-training split
|
||||
of the data. The best performing model is then selected.
|
||||
Model selection speed: Fast
|
||||
Model training speed: Fast
|
||||
Model inference speed: Fast
|
||||
|
@ -105,8 +108,8 @@ def assign_causal_mechanisms(
|
|||
- AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance
|
||||
averaged over multiple splits of the training data. By default, the model with the smallest root mean
|
||||
squared error is selected for regression problems and the model with the highest F1 score is selected for
|
||||
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS and
|
||||
_LIST_OF_POTENTIAL_CLASSIFIERS, respectively.
|
||||
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and
|
||||
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively.
|
||||
Model selection speed: Medium
|
||||
Model training speed: Fast
|
||||
Model inference speed: Fast
|
||||
|
@ -139,29 +142,26 @@ def assign_causal_mechanisms(
|
|||
def select_model(
    X: np.ndarray, Y: np.ndarray, model_selection_quality: AssignmentQuality
) -> Union[PredictionModel, ClassificationModel]:
    """Pick a prediction model for the target Y given the inputs X.

    The candidate factories and the number of evaluation splits depend on the requested quality. The best
    candidate is determined by find_best_model and an instance of it is returned.

    :param X: Input samples.
    :param Y: Target samples. If they are categorical, a classifier is selected, otherwise a regressor.
    :param model_selection_quality: AssignmentQuality.GOOD uses the *_GOOD candidate lists with 2 splits,
                                    AssignmentQuality.BETTER uses the *_BETTER candidate lists with 5 splits.
    :return: A new model instance created by the winning factory.
    :raises ValueError: If model_selection_quality is neither GOOD nor BETTER.
    """
    # Copy the module-level lists, since they may be extended in place below.
    if model_selection_quality == AssignmentQuality.GOOD:
        list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_GOOD)
        list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD)
        model_selection_splits = 2
    elif model_selection_quality == AssignmentQuality.BETTER:
        list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_BETTER)
        list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER)
        model_selection_splits = 5
    else:
        raise ValueError("Invalid model selection quality.")

    if apply_one_hot_encoding(X, fit_one_hot_encoders(X)).shape[1] <= 5:
        # Avoid too many features: polynomial candidates are only considered when the one-hot encoded
        # input has at most 5 columns.
        list_of_regressor += [create_polynom_regressor]
        list_of_classifier += [partial(create_polynom_logistic_regression_classifier, max_iter=1000)]

    if is_categorical(Y):
        return find_best_model(list_of_classifier, X, Y, model_selection_splits=model_selection_splits)()
    else:
        return find_best_model(list_of_regressor, X, Y, model_selection_splits=model_selection_splits)()
|
||||
|
||||
|
||||
def has_linear_relationship(X: np.ndarray, Y: np.ndarray, max_num_samples: int = 3000) -> bool:
|
||||
|
@ -221,8 +221,10 @@ def find_best_model(
|
|||
metric: Optional[Callable[[np.ndarray, np.ndarray], float]] = None,
|
||||
max_samples_per_split: int = 10000,
|
||||
model_selection_splits: int = 5,
|
||||
n_jobs: int = -1,
|
||||
n_jobs: Optional[int] = None,
|
||||
) -> Callable[[], PredictionModel]:
|
||||
n_jobs = config.default_n_jobs if n_jobs is None else n_jobs
|
||||
|
||||
X, Y = shape_into_2d(X, Y)
|
||||
|
||||
is_classification_problem = isinstance(prediction_model_factories[0](), ClassificationModel)
|
||||
|
|
|
@ -5,6 +5,7 @@ from flaky import flaky
|
|||
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
|
||||
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, LogisticRegression, RidgeCV
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from dowhy.gcm import ProbabilisticCausalModel
|
||||
from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms
|
||||
|
@ -77,7 +78,9 @@ def test_given_non_linear_regression_problem_when_auto_assign_causal_models_with
|
|||
data.update({"Y": Y})
|
||||
|
||||
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
|
||||
assert isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor)
|
||||
assert isinstance(
|
||||
causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor
|
||||
) or isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, Pipeline)
|
||||
|
||||
|
||||
@flaky(max_runs=3)
|
||||
|
@ -136,7 +139,9 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
|
|||
data.update({"Y": Y})
|
||||
|
||||
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
|
||||
assert isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier)
|
||||
assert isinstance(
|
||||
causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier
|
||||
) or isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, Pipeline)
|
||||
|
||||
|
||||
@flaky(max_runs=3)
|
||||
|
@ -154,6 +159,46 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
|
|||
assert not isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, GaussianNB)
|
||||
|
||||
|
||||
@flaky(max_runs=3)
def test_given_polynomial_regression_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto assignment runs for GOOD and BETTER quality on a numeric target with one string-valued
    and two numeric parents."""
    categories = np.random.choice(2, 100, replace=True).astype(str)
    numeric_inputs = np.random.normal(0, 1, (100, 2)).astype(object)
    X = np.column_stack([categories, numeric_inputs]).astype(object)

    # Product of the two numeric inputs for category "0", their sum otherwise.
    Y = np.array([row[1] * row[2] if row[0] == "0" else row[1] + row[2] for row in X])

    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y")]))
    data = {f"X{index}": X[:, index] for index in range(X.shape[1])}
    data["Y"] = Y

    # Only checks that neither call raises; the second call replaces the previously assigned models.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER, override_models=True)
|
||||
|
||||
|
||||
@flaky(max_runs=3)
def test_given_polynomial_classification_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto assignment runs for BETTER and GOOD quality on a two-class target whose label depends
    on the sign of the product of two numeric parents."""
    X = np.random.normal(0, 1, (100, 2))

    # "Class 0" when the product of both features is positive, "Class 1" otherwise.
    Y = np.array(["Class 0" if sample[0] * sample[1] > 0 else "Class 1" for sample in X])

    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y")]))
    data = {f"X{index}": X[:, index] for index in range(X.shape[1])}
    data["Y"] = Y

    # Only checks that neither call raises; the second call replaces the previously assigned models.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD, override_models=True)
|
||||
|
||||
|
||||
def test_when_auto_called_from_main_namespace_returns_no_attribute_error():
|
||||
from dowhy import gcm
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче