diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py
index 283d8ef83..6bafaf81a 100644
--- a/dowhy/gcm/auto.py
+++ b/dowhy/gcm/auto.py
@@ -16,7 +16,6 @@ from dowhy.gcm.cms import ProbabilisticCausalModel
 from dowhy.gcm.fcms import AdditiveNoiseModel, ClassificationModel, ClassifierFCM, PredictionModel
 from dowhy.gcm.graph import CAUSAL_MECHANISM, get_ordered_predecessors, is_root_node, validate_causal_model_assignment
 from dowhy.gcm.ml import (
-    create_elastic_net_regressor,
     create_hist_gradient_boost_classifier,
     create_hist_gradient_boost_regressor,
     create_lasso_regressor,
@@ -31,14 +30,16 @@ from dowhy.gcm.ml.classification import (
     create_extra_trees_classifier,
     create_gaussian_nb_classifier,
     create_knn_classifier,
+    create_polynom_logistic_regression_classifier,
     create_random_forest_classifier,
     create_support_vector_classifier,
 )
 from dowhy.gcm.ml.regression import (
     create_ada_boost_regressor,
+    create_elastic_net_regressor,
     create_extra_trees_regressor,
     create_knn_regressor,
-    create_product_regressor,
+    create_polynom_regressor,
 )
 from dowhy.gcm.stochastic_models import EmpiricalDistribution
 from dowhy.gcm.util.general import (
@@ -51,6 +52,7 @@ from dowhy.gcm.util.general import (
 
 _LIST_OF_POTENTIAL_CLASSIFIERS = [
     partial(create_logistic_regression_classifier, max_iter=1000),
+    partial(create_polynom_logistic_regression_classifier, max_iter=1000),
     create_random_forest_classifier,
     create_hist_gradient_boost_classifier,
     create_extra_trees_classifier,
@@ -62,6 +64,7 @@ _LIST_OF_POTENTIAL_CLASSIFIERS = [
 _LIST_OF_POTENTIAL_REGRESSORS = [
     create_linear_regressor,
     create_ridge_regressor,
+    create_polynom_regressor,
     partial(create_lasso_regressor, max_iter=5000),
     partial(create_elastic_net_regressor, max_iter=5000),
     create_random_forest_regressor,
@@ -70,7 +73,6 @@ _LIST_OF_POTENTIAL_REGRESSORS = [
     create_extra_trees_regressor,
     create_knn_regressor,
     create_ada_boost_regressor,
-    create_product_regressor,
 ]
 
 
@@ -149,11 +151,11 @@ def select_model(
     else:
         if use_linear_prediction_models:
             return find_best_model(
-                [create_linear_regressor, create_product_regressor], X, Y, model_selection_splits=2
+                [create_linear_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
             )()
         else:
             return find_best_model(
-                [create_hist_gradient_boost_regressor, create_product_regressor], X, Y, model_selection_splits=2
+                [create_hist_gradient_boost_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
             )()
     elif model_selection_quality == AssignmentQuality.BETTER:
         if target_is_categorical:
diff --git a/dowhy/gcm/ml/__init__.py b/dowhy/gcm/ml/__init__.py
index 763dd927c..8717e9141 100644
--- a/dowhy/gcm/ml/__init__.py
+++ b/dowhy/gcm/ml/__init__.py
@@ -8,6 +8,7 @@ from .classification import (
     create_gaussian_process_classifier,
     create_hist_gradient_boost_classifier,
     create_logistic_regression_classifier,
+    create_polynom_logistic_regression_classifier,
     create_random_forest_classifier,
 )
 from .regression import (
@@ -19,6 +20,7 @@ from .regression import (
     create_lasso_regressor,
     create_linear_regressor,
     create_linear_regressor_with_given_parameters,
+    create_polynom_regressor,
     create_random_forest_regressor,
     create_ridge_regressor,
     create_support_vector_regressor,
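Why the swap from the product regressor to a polynomial regressor works: a degree-3 `PolynomialFeatures` expansion already contains every pairwise and three-way product of the inputs, so an ordinary linear regression on the expanded features can represent a pure feature product exactly, plus the squares, cubes, and mixed terms the old `ProductRegressor` could not. A minimal standalone sketch of the idea (plain scikit-learn, mirroring the pipeline that `create_polynom_regressor` builds in the hunks below; the data is made up for illustration):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Illustrative target: the pure product x0 * x1, which the removed
# ProductRegressor computed directly.
rng = np.random.default_rng(0)
X = rng.normal(0, 1, (200, 2))
Y = X[:, 0] * X[:, 1]

# The degree-3 expansion contains the interaction column x0 * x1, so
# ordinary least squares recovers Y exactly (up to float precision).
model = make_pipeline(PolynomialFeatures(degree=3, include_bias=False), LinearRegression())
model.fit(X, Y)

print(np.max(np.abs(model.predict(X) - Y)))  # essentially zero
```

`include_bias=False` drops the constant column from the expansion; `LinearRegression` fits its own intercept, so keeping both would be redundant.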
diff --git a/dowhy/gcm/ml/classification.py b/dowhy/gcm/ml/classification.py
index 205245cb9..4bf0c798e 100644
--- a/dowhy/gcm/ml/classification.py
+++ b/dowhy/gcm/ml/classification.py
@@ -7,6 +7,8 @@ from typing import List
 import numpy as np
 import sklearn
 from packaging import version
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures
 
 if version.parse(sklearn.__version__) < version.parse("1.0"):
     from sklearn.experimental import enable_hist_gradient_boosting  # noqa
@@ -74,3 +76,13 @@ def create_knn_classifier(**kwargs) -> SklearnClassificationModel:
 
 def create_gaussian_nb_classifier(**kwargs) -> SklearnClassificationModel:
     return SklearnClassificationModel(GaussianNB(**kwargs))
+
+
+def create_polynom_logistic_regression_classifier(
+    degree: int = 3, **kwargs_logistic_regression
+) -> SklearnClassificationModel:
+    return SklearnClassificationModel(
+        make_pipeline(
+            PolynomialFeatures(degree=degree, include_bias=False), LogisticRegression(**kwargs_logistic_regression)
+        )
+    )
diff --git a/dowhy/gcm/ml/regression.py b/dowhy/gcm/ml/regression.py
index 71e1774f0..aea2b7497 100644
--- a/dowhy/gcm/ml/regression.py
+++ b/dowhy/gcm/ml/regression.py
@@ -7,6 +7,8 @@ from typing import Any
 import numpy as np
 import sklearn
 from packaging import version
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures
 
 if version.parse(sklearn.__version__) < version.parse("1.0"):
     from sklearn.experimental import enable_hist_gradient_boosting  # noqa
@@ -115,8 +117,10 @@ def create_ada_boost_regressor(**kwargs) -> SklearnRegressionModel:
     return SklearnRegressionModel(AdaBoostRegressor(**kwargs))
 
 
-def create_product_regressor() -> PredictionModel:
-    return ProductRegressor()
+def create_polynom_regressor(degree: int = 3, **kwargs_linear_model) -> SklearnRegressionModel:
+    return SklearnRegressionModel(
+        make_pipeline(PolynomialFeatures(degree=degree, include_bias=False), LinearRegression(**kwargs_linear_model))
+    )
 
 
 class InvertibleIdentityFunction(InvertibleFunction):
@@ -141,18 +145,3 @@ class InvertibleLogarithmicFunction(InvertibleFunction):
 
     def evaluate_inverse(self, X: np.ndarray) -> np.ndarray:
         return np.exp(X)
-
-
-class ProductRegressor(PredictionModel):
-    def __init__(self):
-        self._one_hot_encoders = {}
-
-    def fit(self, X, Y):
-        self._one_hot_encoders = fit_one_hot_encoders(X)
-
-    def predict(self, X):
-        X = apply_one_hot_encoding(X, self._one_hot_encoders)
-        return np.prod(X, axis=1).reshape(-1, 1)
-
-    def clone(self):
-        return ProductRegressor()
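The classifier counterpart targets decision boundaries such as x0 * x1 > 0 (exactly the labeling used by the new test further down), which no plain logistic regression can separate: the classes occupy diagonally opposite quadrants, XOR-style. An illustrative comparison, again in plain scikit-learn rather than the dowhy wrappers:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
X = rng.normal(0, 1, (1000, 2))
Y = np.where(X[:, 0] * X[:, 1] > 0, "Class 0", "Class 1")  # XOR-like labels

# No linear boundary separates the quadrants...
linear = LogisticRegression(max_iter=1000).fit(X, Y)

# ...but with the interaction term x0 * x1 in the feature set, the two
# classes become linearly separable in the expanded space.
poly = make_pipeline(
    PolynomialFeatures(degree=3, include_bias=False),
    LogisticRegression(max_iter=1000),
).fit(X, Y)

print(linear.score(X, Y))  # around chance level, ~0.5
print(poly.score(X, Y))    # close to 1.0
```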
diff --git a/tests/gcm/ml/test_classification.py b/tests/gcm/ml/test_classification.py
new file mode 100644
index 000000000..bd754eb1d
--- /dev/null
+++ b/tests/gcm/ml/test_classification.py
@@ -0,0 +1,26 @@
+import numpy as np
+from flaky import flaky
+
+from dowhy.gcm.ml import create_polynom_logistic_regression_classifier
+
+
+@flaky(max_runs=3)
+def test_when_fit_and_predict_polynom_classifier_then_returns_accurate_results():
+    def _generate_data():
+        X = np.random.normal(0, 1, (1000, 2))
+        Y = []
+
+        for x in X:
+            if x[0] * x[1] > 0:
+                Y.append("Class 0")
+            else:
+                Y.append("Class 1")
+
+        return X, np.array(Y)
+
+    X_training, Y_training = _generate_data()
+    X_test, Y_test = _generate_data()
+    mdl = create_polynom_logistic_regression_classifier()
+    mdl.fit(X_training, Y_training)
+
+    assert np.sum(mdl.predict(X_test).reshape(-1) == Y_test) > 950
diff --git a/tests/gcm/ml/test_regression.py b/tests/gcm/ml/test_regression.py
index 37430cf19..49428456d 100644
--- a/tests/gcm/ml/test_regression.py
+++ b/tests/gcm/ml/test_regression.py
@@ -1,22 +1,39 @@
 import numpy as np
 from _pytest.python_api import approx
+from flaky import flaky
 
-from dowhy.gcm.ml.regression import create_product_regressor
+from dowhy.gcm.ml.regression import create_polynom_regressor
 
 
-def test_when_use_product_regressor_then_computes_correct_values():
-    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+@flaky(max_runs=3)
+def test_when_fit_and_predict_polynom_regressor_then_returns_accurate_results():
+    X = np.random.normal(0, 1, (100, 2))
+    Y = X[:, 0] * X[:, 1]
 
-    mdl = create_product_regressor()
-    # No fit needed
+    mdl = create_polynom_regressor()
+    mdl.fit(X, Y)
 
-    assert mdl.predict(X).reshape(-1) == approx(np.array([6, 120, 504]))
+    X_test = np.random.normal(0, 1, (100, 2))
+    Y_test = X_test[:, 0] * X_test[:, 1]
+
+    assert mdl.predict(X_test).reshape(-1) == approx(Y_test, abs=1e-10)
 
 
-def test_when_input_is_categorical_when_use_product_regressor_then_computes_correct_values():
-    X = np.column_stack([np.array(["Class 1", "Class 2"]).astype(object), np.array([1, 2])]).astype(object)
+@flaky(max_runs=3)
+def test_when_given_categorical_training_data_when_fit_and_predict_polynom_regressor_then_returns_accurate_results():
+    def _generate_data():
+        X = np.column_stack(
+            [np.random.choice(2, 100, replace=True).astype(str), np.random.normal(0, 1, (100, 2)).astype(object)]
+        ).astype(object)
+        Y = []
+        for i in range(X.shape[0]):
+            Y.append(X[i, 1] * X[i, 2] if X[i, 0] == "0" else X[i, 1] + X[i, 2])
 
-    mdl = create_product_regressor()
-    mdl.fit(X, np.zeros(2))  # Need to fit one-hot-encoder
+        return X, np.array(Y)
 
-    assert mdl.predict(X).reshape(-1) == approx(np.array([0, 2]))
+    X_training, Y_training = _generate_data()
+    X_test, Y_test = _generate_data()
+    mdl = create_polynom_regressor()
+    mdl.fit(X_training, Y_training)
+
+    assert mdl.predict(X_test).reshape(-1) == approx(Y_test, abs=1e-10)
diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py
index 5fefedd49..b3e4efbba 100644
--- a/tests/gcm/test_auto.py
+++ b/tests/gcm/test_auto.py
@@ -19,7 +19,7 @@ def _generate_linear_regression_data():
 
 def _generate_non_linear_regression_data():
     X = np.random.normal(0, 1, (1000, 5))
-    Y = np.sum(X**2, axis=1)
+    Y = np.sum(np.log(abs(X)), axis=1)
     return X, Y
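Two test choices above are worth spelling out, as far as one can infer from the diff. The tight `abs=1e-10` tolerance is justified because the degree-3 expansion contains the exact interaction term, so least squares recovers `Y = x0 * x1` up to floating-point error. And `_generate_non_linear_regression_data` presumably switches from `sum(x_i**2)` to `sum(log|x_i|)` because a degree-3 polynomial regressor would fit the squared target exactly, leaving the fixture unable to separate the new polynomial candidate from genuinely non-linear models; `log|x|` is not a polynomial. A quick check of the expanded feature names (scikit-learn >= 1.0):

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# For two inputs, the degree-3 expansion explicitly lists the
# interaction term "x0 x1" among its nine columns.
poly = PolynomialFeatures(degree=3, include_bias=False).fit(np.zeros((1, 2)))
print(poly.get_feature_names_out())
# ['x0' 'x1' 'x0^2' 'x0 x1' 'x1^2' 'x0^3' 'x0^2 x1' 'x0 x1^2' 'x1^3']
```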