Add polynomial regressor and classifier to gcm

This replaces the ProductRegressor.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
Patrick Bloebaum 2022-11-01 08:19:24 -07:00 committed by Patrick Blöbaum
Parent fb5b4d5260
Commit 2ed7cf4e93
7 changed files with 82 additions and 34 deletions
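The polynomial models subsume the removed ProductRegressor: PolynomialFeatures with degree >= 2 generates the pairwise interaction term x1 * x2, so an ordinary linear regression on the expanded features recovers a pure product exactly. A minimal sketch in plain scikit-learn, mirroring the pipeline this commit adds (variable names are illustrative only):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

X = np.random.normal(0, 1, (100, 2))
Y = X[:, 0] * X[:, 1]  # the pure product the old ProductRegressor hard-coded

# degree >= 2 expands [x1, x2] to [x1, x2, x1^2, x1*x2, x2^2], so the
# interaction term is available as an ordinary linear coefficient
model = make_pipeline(PolynomialFeatures(degree=2, include_bias=False), LinearRegression())
model.fit(X, Y)
assert np.allclose(model.predict(X), Y)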

View file

@@ -16,7 +16,6 @@ from dowhy.gcm.cms import ProbabilisticCausalModel
 from dowhy.gcm.fcms import AdditiveNoiseModel, ClassificationModel, ClassifierFCM, PredictionModel
 from dowhy.gcm.graph import CAUSAL_MECHANISM, get_ordered_predecessors, is_root_node, validate_causal_model_assignment
 from dowhy.gcm.ml import (
-    create_elastic_net_regressor,
     create_hist_gradient_boost_classifier,
     create_hist_gradient_boost_regressor,
     create_lasso_regressor,
@@ -31,14 +30,16 @@ from dowhy.gcm.ml.classification import (
     create_extra_trees_classifier,
     create_gaussian_nb_classifier,
     create_knn_classifier,
+    create_polynom_logistic_regression_classifier,
     create_random_forest_classifier,
     create_support_vector_classifier,
 )
 from dowhy.gcm.ml.regression import (
     create_ada_boost_regressor,
+    create_elastic_net_regressor,
     create_extra_trees_regressor,
     create_knn_regressor,
-    create_product_regressor,
+    create_polynom_regressor,
 )
 from dowhy.gcm.stochastic_models import EmpiricalDistribution
 from dowhy.gcm.util.general import (
@@ -51,6 +52,7 @@ from dowhy.gcm.util.general import (

 _LIST_OF_POTENTIAL_CLASSIFIERS = [
     partial(create_logistic_regression_classifier, max_iter=1000),
+    partial(create_polynom_logistic_regression_classifier, max_iter=1000),
     create_random_forest_classifier,
     create_hist_gradient_boost_classifier,
     create_extra_trees_classifier,
@@ -62,6 +64,7 @@ _LIST_OF_POTENTIAL_CLASSIFIERS = [
 _LIST_OF_POTENTIAL_REGRESSORS = [
     create_linear_regressor,
     create_ridge_regressor,
+    create_polynom_regressor,
     partial(create_lasso_regressor, max_iter=5000),
     partial(create_elastic_net_regressor, max_iter=5000),
     create_random_forest_regressor,
@@ -70,7 +73,6 @@ _LIST_OF_POTENTIAL_REGRESSORS = [
     create_extra_trees_regressor,
     create_knn_regressor,
     create_ada_boost_regressor,
-    create_product_regressor,
 ]
@@ -149,11 +151,11 @@ def select_model(
         else:
             if use_linear_prediction_models:
                 return find_best_model(
-                    [create_linear_regressor, create_product_regressor], X, Y, model_selection_splits=2
+                    [create_linear_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
                 )()
             else:
                 return find_best_model(
-                    [create_hist_gradient_boost_regressor, create_product_regressor], X, Y, model_selection_splits=2
+                    [create_hist_gradient_boost_regressor, create_polynom_regressor], X, Y, model_selection_splits=2
                 )()
     elif model_selection_quality == AssignmentQuality.BETTER:
         if target_is_categorical:
elif model_selection_quality == AssignmentQuality.BETTER:
if target_is_categorical:

View file

@@ -8,6 +8,7 @@ from .classification import (
     create_gaussian_process_classifier,
     create_hist_gradient_boost_classifier,
     create_logistic_regression_classifier,
+    create_polynom_logistic_regression_classifier,
     create_random_forest_classifier,
 )
 from .regression import (
@@ -19,6 +20,7 @@ from .regression import (
     create_lasso_regressor,
     create_linear_regressor,
     create_linear_regressor_with_given_parameters,
+    create_polynom_regressor,
     create_random_forest_regressor,
     create_ridge_regressor,
     create_support_vector_regressor,

View file

@@ -7,6 +7,8 @@ from typing import List
 import numpy as np
 import sklearn
 from packaging import version
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures

 if version.parse(sklearn.__version__) < version.parse("1.0"):
     from sklearn.experimental import enable_hist_gradient_boosting  # noqa
@@ -74,3 +76,13 @@ def create_knn_classifier(**kwargs) -> SklearnClassificationModel:

 def create_gaussian_nb_classifier(**kwargs) -> SklearnClassificationModel:
     return SklearnClassificationModel(GaussianNB(**kwargs))
+
+
+def create_polynom_logistic_regression_classifier(
+    degree: int = 3, **kwargs_logistic_regression
+) -> SklearnClassificationModel:
+    return SklearnClassificationModel(
+        make_pipeline(
+            PolynomialFeatures(degree=degree, include_bias=False), LogisticRegression(**kwargs_logistic_regression)
+        )
+    )
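A usage sketch for the new factory, mirroring the classifier test added below; it assumes only the fit/predict interface the test itself exercises, with max_iter forwarded to LogisticRegression as in the model list above:

import numpy as np
from dowhy.gcm.ml import create_polynom_logistic_regression_classifier

# XOR-like labels: not linearly separable, but separable after the
# polynomial feature expansion applied inside the pipeline
X = np.random.normal(0, 1, (1000, 2))
Y = np.where(X[:, 0] * X[:, 1] > 0, "Class 0", "Class 1")

mdl = create_polynom_logistic_regression_classifier(max_iter=1000)
mdl.fit(X, Y)
accuracy = np.mean(mdl.predict(X).reshape(-1) == Y)  # close to 1.0 on this data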

View file

@@ -7,6 +7,8 @@ from typing import Any
 import numpy as np
 import sklearn
 from packaging import version
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures

 if version.parse(sklearn.__version__) < version.parse("1.0"):
     from sklearn.experimental import enable_hist_gradient_boosting  # noqa
@@ -115,8 +117,10 @@ def create_ada_boost_regressor(**kwargs) -> SklearnRegressionModel:
     return SklearnRegressionModel(AdaBoostRegressor(**kwargs))


-def create_product_regressor() -> PredictionModel:
-    return ProductRegressor()
+def create_polynom_regressor(degree: int = 3, **kwargs_linear_model) -> SklearnRegressionModel:
+    return SklearnRegressionModel(
+        make_pipeline(PolynomialFeatures(degree=degree, include_bias=False), LinearRegression(**kwargs_linear_model))
+    )


 class InvertibleIdentityFunction(InvertibleFunction):
@@ -141,18 +145,3 @@ class InvertibleLogarithmicFunction(InvertibleFunction):
     def evaluate_inverse(self, X: np.ndarray) -> np.ndarray:
         return np.exp(X)
-
-
-class ProductRegressor(PredictionModel):
-    def __init__(self):
-        self._one_hot_encoders = {}
-
-    def fit(self, X, Y):
-        self._one_hot_encoders = fit_one_hot_encoders(X)
-
-    def predict(self, X):
-        X = apply_one_hot_encoding(X, self._one_hot_encoders)
-        return np.prod(X, axis=1).reshape(-1, 1)
-
-    def clone(self):
-        return ProductRegressor()
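Likewise, a usage sketch for create_polynom_regressor, mirroring the updated regression test below; with the default degree of 3, the expanded features include x1 * x2, so a pure product is fitted essentially exactly:

import numpy as np
from dowhy.gcm.ml.regression import create_polynom_regressor

X = np.random.normal(0, 1, (100, 2))
Y = X[:, 0] * X[:, 1]

mdl = create_polynom_regressor()  # degree=3 by default
mdl.fit(X, Y)
max_abs_error = np.max(np.abs(mdl.predict(X).reshape(-1) - Y))  # essentially zero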

View file

@@ -0,0 +1,26 @@
+import numpy as np
+from flaky import flaky
+
+from dowhy.gcm.ml import create_polynom_logistic_regression_classifier
+
+
+@flaky(max_runs=3)
+def test_when_fit_and_predict_polynom_classifier_then_returns_accurate_results():
+    def _generate_data():
+        X = np.random.normal(0, 1, (1000, 2))
+        Y = []
+
+        for x in X:
+            if x[0] * x[1] > 0:
+                Y.append("Class 0")
+            else:
+                Y.append("Class 1")
+        return X, np.array(Y)
+
+    X_training, Y_training = _generate_data()
+    X_test, Y_test = _generate_data()
+
+    mdl = create_polynom_logistic_regression_classifier()
+    mdl.fit(X_training, Y_training)
+
+    assert np.sum(mdl.predict(X_test).reshape(-1) == Y_test) > 950

View file

@@ -1,22 +1,39 @@
 import numpy as np
 from _pytest.python_api import approx
+from flaky import flaky

-from dowhy.gcm.ml.regression import create_product_regressor
+from dowhy.gcm.ml.regression import create_polynom_regressor


-def test_when_use_product_regressor_then_computes_correct_values():
-    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
-
-    mdl = create_product_regressor()
-    # No fit needed
-
-    assert mdl.predict(X).reshape(-1) == approx(np.array([6, 120, 504]))
+@flaky(max_runs=3)
+def test_when_fit_and_predict_polynom_regressor_then_returns_accurate_results():
+    X = np.random.normal(0, 1, (100, 2))
+    Y = X[:, 0] * X[:, 1]
+
+    mdl = create_polynom_regressor()
+    mdl.fit(X, Y)
+
+    X_test = np.random.normal(0, 1, (100, 2))
+    Y_test = X_test[:, 0] * X_test[:, 1]
+
+    assert mdl.predict(X_test).reshape(-1) == approx(Y_test, abs=1e-10)


-def test_when_input_is_categorical_when_use_product_regressor_then_computes_correct_values():
-    X = np.column_stack([np.array(["Class 1", "Class 2"]).astype(object), np.array([1, 2])]).astype(object)
-
-    mdl = create_product_regressor()
-    mdl.fit(X, np.zeros(2))  # Need to fit one-hot-encoder
-
-    assert mdl.predict(X).reshape(-1) == approx(np.array([0, 2]))
+@flaky(max_runs=3)
+def test_when_given_categorical_training_data_when_fit_and_predict_polynom_regressor_then_returns_accurate_results():
+    def _generate_data():
+        X = np.column_stack(
+            [np.random.choice(2, 100, replace=True).astype(str), np.random.normal(0, 1, (100, 2)).astype(object)]
+        ).astype(object)
+        Y = []
+        for i in range(X.shape[0]):
+            Y.append(X[i, 1] * X[i, 2] if X[i, 0] == "0" else X[i, 1] + X[i, 2])
+        return X, np.array(Y)
+
+    X_training, Y_training = _generate_data()
+    X_test, Y_test = _generate_data()
+
+    mdl = create_polynom_regressor()
+    mdl.fit(X_training, Y_training)
+
+    assert mdl.predict(X_test).reshape(-1) == approx(Y_test, abs=1e-10)

View file

@@ -19,7 +19,7 @@ def _generate_linear_regression_data():


 def _generate_non_linear_regression_data():
     X = np.random.normal(0, 1, (1000, 5))
-    Y = np.sum(X**2, axis=1)
+    Y = np.sum(np.log(abs(X)), axis=1)
     return X, Y