Add `create_complex_regression_pipeline` to `sklearn_model_utils.py` (#2318)

* Add `create_complex_regression_pipeline` to `sklearn_model_utils.py` Signed-off-by: Gaurav Gupta <gaugup@microsoft.com> * Add version update Signed-off-by: Gaurav Gupta <gaugup@microsoft.com> * add pandas timestamps and test * Update test_data_processing_utils.py * Update test_data_processing_utils.py --------- Signed-off-by: Gaurav Gupta <gaugup@microsoft.com> Co-authored-by: Roman Lutz <romanlutz13@gmail.com>
2023-09-07 15:48:19 -07:00 · 2023-09-07 15:48:19 -07:00 · 6eea36c28b
--- a/rai_test_utils/rai_test_utils/models/sklearn/init.py
+++ b/rai_test_utils/rai_test_utils/models/sklearn/init.py
@ -4,6 +4,7 @@
 """Namespace for sklearn models."""

 from .sklearn_model_utils import (create_complex_classification_pipeline,
+                                  create_complex_regression_pipeline,
                                  create_kneighbors_classifier,
                                  create_sklearn_logistic_regressor,
                                  create_sklearn_random_forest_classifier,
@ -18,5 +19,6 @@ __all__ = [
    "create_sklearn_random_forest_regressor",
    "create_sklearn_svm_classifier",
    "create_titanic_pipeline",
-    "create_complex_classification_pipeline"
+    "create_complex_classification_pipeline",
+    "create_complex_regression_pipeline"
 ]
--- a/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
+++ b/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
@ -181,3 +181,39 @@ def create_complex_classification_pipeline(
    pipeline = Pipeline(steps=[('preprocessor', transformations),
                               ('classifier', RandomForestClassifier())])
    return pipeline.fit(X_train, y_train)
+
+
+def create_complex_regression_pipeline(
+        X_train, y_train, continuous_features, categorical_features):
+    """Create and fit a complex sklearn pipeline for regression.
+
+    :param X_train: The training data.
+    :type X_train: numpy.ndarray or pandas.DataFrame
+    :param y_train: The training target values.
+    :type y_train: numpy.ndarray or pandas.DataFrame
+    :param continuous_features: The continuous features.
+    :type continuous_features: list
+    :param categorical_features: The categorical features.
+    :type categorical_features: list
+    :return: The regression pipeline, fitted on X_train and y_train.
+    :rtype: sklearn.pipeline.Pipeline
+    """
+    # Preprocessing: median-impute and standardize the continuous
+    # columns; one-hot encode the categorical columns.
+    numeric_transformer = Pipeline(steps=[
+        ("imputer", SimpleImputer(strategy='median')),
+        ('scaler', StandardScaler())])
+
+    categorical_transformer = Pipeline(steps=[
+        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+
+    transformations = ColumnTransformer(
+        transformers=[
+            ('num', numeric_transformer, continuous_features),
+            ('cat', categorical_transformer, categorical_features)])
+
+    # Append the regressor to the preprocessing pipeline. Note that the
+    # step is named 'classifier' even though it is a RandomForestRegressor.
+    pipeline = Pipeline(steps=[('preprocessor', transformations),
+                               ('classifier', RandomForestRegressor())])
+    return pipeline.fit(X_train, y_train)
--- a/rai_test_utils/rai_test_utils/version.py
+++ b/rai_test_utils/rai_test_utils/version.py
@ -4,5 +4,5 @@
 name = 'rai_test_utils'
 _major = '0'
 _minor = '4'
-_patch = '0'
+_patch = '1'
 version = '{}.{}.{}'.format(_major, _minor, _patch)
--- a/rai_test_utils/tests/test_model_utils.py
+++ b/rai_test_utils/tests/test_model_utils.py
@ -2,6 +2,7 @@
 # Licensed under the MIT License.

 import numpy as np
+import pandas as pd
 import pytest
 from ml_wrappers import wrap_model

@ -13,8 +14,8 @@ from rai_test_utils.datasets.vision import (
 from rai_test_utils.models import (create_models_classification,
                                   create_models_object_detection,
                                   create_models_regression)
-from rai_test_utils.models.sklearn import \
-    create_complex_classification_pipeline
+from rai_test_utils.models.sklearn import (
+    create_complex_classification_pipeline, create_complex_regression_pipeline)

 try:
    import torch  # noqa: F401
@ -47,6 +48,15 @@ class TestModelUtils:
            X_train, y_train, num_feature_names, cat_feature_names)
        assert pipeline.predict(X_test) is not None

+    def test_create_complex_regression_pipeline(self):
+        X_train, X_test, y_train, y_test, num_feature_names, \
+            = create_housing_data()
+        X_train = pd.DataFrame(X_train, columns=num_feature_names)
+        X_test = pd.DataFrame(X_test, columns=num_feature_names)
+        pipeline = create_complex_regression_pipeline(
+            X_train, y_train, num_feature_names, [])  # no categorical features
+        assert pipeline.predict(X_test) is not None
+
    @pytest.mark.skipif(not pytorch_installed,
                        reason="requires torch/torchvision")
    def test_object_detection_models(self):