Revise gcm auto assignment logic

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
This commit is contained in:
Patrick Bloebaum 2022-11-03 08:51:34 -07:00 committed by Patrick Blöbaum
Parent 2ed7cf4e93
Commit c9d9c3a6f0
2 changed files: 83 additions and 36 deletions

View file

@@ -12,6 +12,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from dowhy.gcm import config
from dowhy.gcm.cms import ProbabilisticCausalModel
from dowhy.gcm.fcms import AdditiveNoiseModel, ClassificationModel, ClassifierFCM, PredictionModel
from dowhy.gcm.graph import CAUSAL_MECHANISM, get_ordered_predecessors, is_root_node, validate_causal_model_assignment
@@ -36,7 +37,6 @@ from dowhy.gcm.ml.classification import (
)
from dowhy.gcm.ml.regression import (
create_ada_boost_regressor,
create_elastic_net_regressor,
create_extra_trees_regressor,
create_knn_regressor,
create_polynom_regressor,
@@ -50,25 +50,28 @@ from dowhy.gcm.util.general import (
shape_into_2d,
)
_LIST_OF_POTENTIAL_CLASSIFIERS = [
_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD = [
partial(create_logistic_regression_classifier, max_iter=1000),
partial(create_polynom_logistic_regression_classifier, max_iter=1000),
create_random_forest_classifier,
create_hist_gradient_boost_classifier,
]
_LIST_OF_POTENTIAL_REGRESSORS_GOOD = [
create_linear_regressor,
create_hist_gradient_boost_regressor,
]
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER = _LIST_OF_POTENTIAL_CLASSIFIERS_GOOD + [
create_random_forest_classifier,
create_extra_trees_classifier,
create_support_vector_classifier,
create_knn_classifier,
create_gaussian_nb_classifier,
create_ada_boost_classifier,
]
_LIST_OF_POTENTIAL_REGRESSORS = [
create_linear_regressor,
_LIST_OF_POTENTIAL_REGRESSORS_BETTER = _LIST_OF_POTENTIAL_REGRESSORS_GOOD + [
create_ridge_regressor,
create_polynom_regressor,
partial(create_lasso_regressor, max_iter=5000),
partial(create_elastic_net_regressor, max_iter=5000),
create_random_forest_regressor,
create_hist_gradient_boost_regressor,
create_support_vector_regressor,
create_extra_trees_regressor,
create_knn_regressor,
@@ -96,8 +99,8 @@ def assign_causal_mechanisms(
:param based_on: Jointly sampled data corresponding to the nodes of the given graph.
:param quality: AssignmentQuality for the automatic model selection and model accuracy. This changes the type of
prediction model and time spent on the selection. Options are:
- AssignmentQuality.GOOD: Checks whether the data is linear. If the data is linear, an OLS model is
used, otherwise a gradient boost model.
- AssignmentQuality.GOOD: Compares a linear, polynomial and gradient boost model on small test-training split
of the data. The best performing model is then selected.
Model selection speed: Fast
Model training speed: Fast
Model inference speed: Fast
@@ -105,8 +108,8 @@ def assign_causal_mechanisms(
- AssignmentQuality.BETTER: Compares multiple model types and uses the one with the best performance
averaged over multiple splits of the training data. By default, the model with the smallest root mean
squared error is selected for regression problems and the model with the highest F1 score is selected for
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS and
_LIST_OF_POTENTIAL_CLASSIFIERS, respectively.
classification problems. For a list of possible models, see _LIST_OF_POTENTIAL_REGRESSORS_BETTER and
_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER, respectively.
Model selection speed: Medium
Model training speed: Fast
Model inference speed: Fast
@@ -139,29 +142,26 @@ def assign_causal_mechanisms(
def select_model(
X: np.ndarray, Y: np.ndarray, model_selection_quality: AssignmentQuality
) -> Union[PredictionModel, ClassificationModel]:
target_is_categorical = is_categorical(Y)
if model_selection_quality == AssignmentQuality.GOOD:
use_linear_prediction_models = has_linear_relationship(X, Y)
if target_is_categorical:
if use_linear_prediction_models:
return create_logistic_regression_classifier(max_iter=1000)
else:
return create_hist_gradient_boost_classifier()
else:
if use_linear_prediction_models:
return find_best_model(
[create_linear_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
)()
else:
return find_best_model(
[create_hist_gradient_boost_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
)()
list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_GOOD)
list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_GOOD)
model_selection_splits = 2
elif model_selection_quality == AssignmentQuality.BETTER:
if target_is_categorical:
return find_best_model(_LIST_OF_POTENTIAL_CLASSIFIERS, X, Y)()
else:
return find_best_model(_LIST_OF_POTENTIAL_REGRESSORS, X, Y)()
list_of_regressor = list(_LIST_OF_POTENTIAL_REGRESSORS_BETTER)
list_of_classifier = list(_LIST_OF_POTENTIAL_CLASSIFIERS_BETTER)
model_selection_splits = 5
else:
raise ValueError("Invalid model selection quality.")
if apply_one_hot_encoding(X, fit_one_hot_encoders(X)).shape[1] <= 5:
# Avoid too many features
list_of_regressor += [create_polynom_regressor]
list_of_classifier += [partial(create_polynom_logistic_regression_classifier, max_iter=1000)]
if is_categorical(Y):
return find_best_model(list_of_classifier, X, Y, model_selection_splits=model_selection_splits)()
else:
return find_best_model(list_of_regressor, X, Y, model_selection_splits=model_selection_splits)()
def has_linear_relationship(X: np.ndarray, Y: np.ndarray, max_num_samples: int = 3000) -> bool:
@@ -221,8 +221,10 @@ def find_best_model(
metric: Optional[Callable[[np.ndarray, np.ndarray], float]] = None,
max_samples_per_split: int = 10000,
model_selection_splits: int = 5,
n_jobs: int = -1,
n_jobs: Optional[int] = None,
) -> Callable[[], PredictionModel]:
n_jobs = config.default_n_jobs if n_jobs is None else n_jobs
X, Y = shape_into_2d(X, Y)
is_classification_problem = isinstance(prediction_model_factories[0](), ClassificationModel)

View file

@@ -5,6 +5,7 @@ from flaky import flaky
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, LogisticRegression, RidgeCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from dowhy.gcm import ProbabilisticCausalModel
from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms
@@ -77,7 +78,9 @@ def test_given_non_linear_regression_problem_when_auto_assign_causal_models_with
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor)
assert isinstance(
causal_model.causal_mechanism("Y").prediction_model.sklearn_model, HistGradientBoostingRegressor
) or isinstance(causal_model.causal_mechanism("Y").prediction_model.sklearn_model, Pipeline)
@flaky(max_runs=3)
@@ -136,7 +139,9 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
data.update({"Y": Y})
assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
assert isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier)
assert isinstance(
causal_model.causal_mechanism("Y").classifier_model.sklearn_model, HistGradientBoostingClassifier
) or isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, Pipeline)
@flaky(max_runs=3)
@@ -154,6 +159,46 @@ def test_given_non_linear_classification_problem_when_auto_assign_causal_models_
assert not isinstance(causal_model.causal_mechanism("Y").classifier_model.sklearn_model, GaussianNB)
@flaky(max_runs=3)
def test_given_polynomial_regression_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto model assignment must not raise on mixed categorical/numerical regression data.

    Builds 100 samples with one categorical string column ("0"/"1") and two
    numerical columns, where the target is a product or a sum of the numerical
    columns depending on the category — i.e. an interaction (polynomial-like)
    relationship. Then runs assignment at both GOOD and BETTER quality.
    """
    # Column 0: random "0"/"1" strings; columns 1-2: standard-normal floats.
    # The whole matrix is cast to object dtype because it mixes str and float.
    X = np.column_stack(
        [np.random.choice(2, 100, replace=True).astype(str), np.random.normal(0, 1, (100, 2)).astype(object)]
    ).astype(object)
    Y = []
    for i in range(X.shape[0]):
        # Interaction effect: multiplicative for category "0", additive otherwise.
        Y.append(X[i, 1] * X[i, 2] if X[i, 0] == "0" else X[i, 1] + X[i, 2])
    Y = np.array(Y)
    # Three-parent collider graph: X0, X1, X2 -> Y.
    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y"), ("X2", "Y")]))
    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
    data.update({"Y": Y})
    # The test passes as long as neither quality level raises an exception;
    # override_models=True forces re-assignment on the second call.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER, override_models=True)
@flaky(max_runs=3)
def test_given_polynomial_classification_data_with_categorical_input_when_auto_assign_causal_models_then_does_not_raise_error():
    """Smoke test: auto model assignment must not raise on an XOR-like classification target.

    Builds 100 two-dimensional normal samples and labels them by the sign of
    the product x0 * x1 — a class boundary that is not linearly separable —
    then runs assignment at both BETTER and GOOD quality.
    """
    X = np.random.normal(0, 1, (100, 2))
    Y = []
    for x in X:
        # Label by quadrant parity: same-sign features -> "Class 0", else "Class 1".
        if x[0] * x[1] > 0:
            Y.append("Class 0")
        else:
            Y.append("Class 1")
    Y = np.array(Y)
    # Two-parent collider graph: X0, X1 -> Y.
    causal_model = ProbabilisticCausalModel(nx.DiGraph([("X0", "Y"), ("X1", "Y")]))
    data = {"X" + str(i): X[:, i] for i in range(X.shape[1])}
    data.update({"Y": Y})
    # The test passes as long as neither quality level raises an exception;
    # override_models=True forces re-assignment on the second call.
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.BETTER)
    assign_causal_mechanisms(causal_model, pd.DataFrame(data), quality=AssignmentQuality.GOOD, override_models=True)
def test_when_auto_called_from_main_namespace_returns_no_attribute_error():
from dowhy import gcm