Revise gcm auto assignment logic

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
This commit is contained in:
Patrick Bloebaum 2022-11-03 08:51:34 -07:00 committed by Patrick Blöbaum
Parent 2ed7cf4e93
Commit c9d9c3a6f0
2 changed files: 83 additions and 36 deletions

View file

@@ -12,6 +12,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from dowhy.gcm import config
from dowhy.gcm.cms import ProbabilisticCausalModel
from dowhy.gcm.fcms import AdditiveNoiseModel, ClassificationModel, ClassifierFCM, PredictionModel
from dowhy.gcm.graph import CAUSAL_MECHANISM, get_ordered_predecessors, is_root_node, validate_causal_model_assignment
@@ -36,7 +37,6 @@ from dowhy.gcm.ml.classification import (
)
from dowhy.gcm.ml.regression import (
create_ada_boost_regressor,
create_elastic_net_regressor,
create_extra_trees_regressor,
create_knn_regressor,
create_polynom_regressor,
@@ -50,25 +50,28 @@ from dowhy.gcm.util.general import (
shape_into_2d,
)
_LIST_OF_POTENTIAL_CLASSIFIERS = [
_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [
partial(create_logistic_regression_classifier, max_iter=1000),
partial(create_polynom_logistic_regression_classifier, max_iter=1000),
create_random_forest_classifier,
create_hist_gradient_boost_classifier,
]
_LIST_OF_POTENTIAL_REGRESSORS_GOOD = [
create_linear_regressor,
create_hist_gradient_boost_regressor,
]
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER = _LIST_OF_POTENTIAL_CLASSIFIERS_GOOD + [
create_random_forest_classifier,
create_extra_trees_classifier,
create_support_vector_classifier,
create_knn_classifier,
create_gaussian_nb_classifier,
create_ada_boost_classifier,
]
_LIST_OF_POTENTIAL_REGRESSORS = [
create_linear_regressor,
_LIST_OF_POTENTIAL_REGRESSORS_BETTER = _LIST_OF_POTENTIAL_REGRESSORS_GOOD + [
create_ridge_regressor,
create_polynom_regressor,
partial(create_lasso_regressor, max_iter=5000),
partial(create_elastic_net_regressor, max_iter=5000),
create_random_forest_regressor,
create_hist_gradient_boost_regressor,
create_support_vector_regressor,
create_extra_trees_regressor,
create_knn_regressor,
@@ -96,8 +99,8 @@ def assign_causal_mechanisms(
:param based_on: Jointly sampled data corresponding to the nodes of the given graph.
:param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of
prediction model and time spent on the selection. Options are:
- AssignmentQuality.GOOD: Checks whether the data is linear. If the data is linear, an OLS model is
used, otherwise a gradient boost model.
- AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on small test-training split
of the data. The best performing model is then selected.
Model selection speed: Fast
Model training speed: Fast
Model inference speed: Fast
@@ -105,8 +108,8 @@ def assign_causal_mechanisms(
- AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance
averaged over multiple splits of the training data. By default, the model with the smallest root mean
squared error is selected for regression problems and the model with the highest F1 score is selected for
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS and
_LIST_OF_POTENTIAL_CLASSIFIERS, respectively.
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively.
Model selection speed: Medium
Model training speed: Fast
Model inference speed: Fast
@@ -139,29 +142,26 @@ def assign_causal_mechanisms(
def select_model(
X: np.ndarray, Y: np.ndarray, model_selection_quality: AssignmentQuality
) -> Union[PredictionModel, ClassificationModel]:
target_is_categorical = is_categorical(Y)
if model_selection_quality == AssignmentQuality.GOOD:
use_linear_prediction_models = has_linear_relationship(X, Y)
if target_is_categorical:
if use_linear_prediction_models:
return create_logistic_regression_classifier(max_iter=1000)
else:
return create_hist_gradient_boost_classifier()
else:
if use_linear_prediction_models:
return find_best_model(
[create_linear_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
)()
else:
return find_best_model(
[create_hist_gradient_boost_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
)()
list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_GOOD)
list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD)
model_selection_splits = 2
elif model_selection_quality == AssignmentQuality.BETTER:
if target_is_categorical:
return find_best_model(_LIST_OF_POTENTIAL_CLASSIFIERS, X, Y)()
else:
return find_best_model(_LIST_OF_POTENTIAL_REGRESSORS, X, Y)()
list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_BETTER)
list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER)
model_selection_splits = 5
else:
raise ValueError("Invalid model selection quality.")
if apply_one_hot_encoding(X, fit_one_hot_encoders(X)).shape[1] <= 5:
# Avoid too many features
list_of_regressor += [create_polynom_regressor]
list_of_classifier += [partial(create_polynom_logistic_regression_classifier, max_iter=1000)]
if is_categorical(Y):
return find_best_model(list_of_classifier, X, Y, model_selection_splits=model_selection_splits)()
else:
return find_best_model(list_of_regressor, X, Y, model_selection_splits=model_selection_splits)()
def has_linear_relationship(X: np.ndarray, Y: np.ndarray, max_num_samples: int = 3000) -> bool:
@@ -221,8 +221,10 @@ def find_best_model(
metric: Optional[Callable[[np.ndarray, np.ndarray], float]] = None,
max_samples_per_split: int = 10000,
model_selection_splits: int = 5,
n_jobs: int = -1,
n_jobs: Optional[int] = None,
) -> Callable[[], PredictionModel]:
n_jobs = config.default_n_jobs if n_jobs is None else n_jobs
X, Y = shape_into_2d(X, Y)
is_classification_problem = isinstance(prediction_model_factories[0](), ClassificationModel)

View file

@@ -5,6 +5,7 @@ from flaky import flaky
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, LogisticRegression, RidgeCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from dowhy.gcm import ProbabilisticCausalModel
from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms
@@ -77,7 +78,9 @@ def test_given_non_linear_regression_problem_when_auto_assign_causal_models_with
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor)
assert isinstance(
causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor
) or isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, Pipeline)
@flaky(max_runs=3)
@@ -136,7 +139,9 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier)
assert isinstance(
causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier
) or isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, Pipeline)
@flaky(max_runs=3)
@@ -154,6 +159,46 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
assert not isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, GaussianNB)
@flaky(max_runs=3)
def test_given_polynomial_regression_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto model assignment must not raise on mixed categorical/numerical regression data.

    Builds 100 samples with one categorical string column ("0"/"1") and two
    numerical columns, where the target is a product or a sum of the numerical
    columns depending on the category — i.e. an interaction (polynomial-like)
    relationship. Then runs assignment at both GOOD and BETTER quality.
    """
    # Column 0: random "0"/"1" strings; columns 1-2: standard-normal floats.
    # The whole matrix is cast to object dtype because it mixes str and float.
    X = np.column_stack(
        [np.random.choice(2, 100, replace=True).astype(str), np.random.normal(0, 1, (100, 2)).astype(object)]
    ).astype(object)
    Y = []
    for i in range(X.shape[0]):
        # Interaction effect: multiplicative for category "0", additive otherwise.
        Y.append(X[i, 1] * X[i, 2] if X[i, 0] == "0" else X[i, 1] + X[i, 2])
    Y = np.array(Y)
    # Three-parent collider graph: X0, X1, X2 -> Y.
    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y")]))
    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
    data.update({"Y": Y})
    # The test passes as long as neither quality level raises an exception;
    # override_models=True forces re-assignment on the second call.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER, override_models=True)
@flaky(max_runs=3)
def test_given_polynomial_classification_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto model assignment must not raise on an XOR-like classification target.

    Builds 100 two-dimensional normal samples and labels them by the sign of
    the product x0 * x1 — a class boundary that is not linearly separable —
    then runs assignment at both BETTER and GOOD quality.
    """
    X = np.random.normal(0, 1, (100, 2))
    Y = []
    for x in X:
        # Label by quadrant parity: same-sign features -> "Class 0", else "Class 1".
        if x[0] * x[1] > 0:
            Y.append("Class 0")
        else:
            Y.append("Class 1")
    Y = np.array(Y)
    # Two-parent collider graph: X0, X1 -> Y.
    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y")]))
    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
    data.update({"Y": Y})
    # The test passes as long as neither quality level raises an exception;
    # override_models=True forces re-assignment on the second call.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD, override_models=True)
def test_when_auto_called_from_main_namespace_returns_no_attribute_error():
from dowhy import gcm