diff --git a/erroranalysis/requirements-dev.txt b/erroranalysis/requirements-dev.txt index 625c453b7..89a4e5c74 100644 --- a/erroranalysis/requirements-dev.txt +++ b/erroranalysis/requirements-dev.txt @@ -4,4 +4,4 @@ pytest-mock==3.6.1 requests==2.25.1 requirements-parser==0.2.0 -rai_test_utils==0.0.0 +rai_test_utils==0.1.0 diff --git a/erroranalysis/tests/test_matrix_filter.py b/erroranalysis/tests/test_matrix_filter.py index 180f1aaa9..d2f82ef2c 100644 --- a/erroranalysis/tests/test_matrix_filter.py +++ b/erroranalysis/tests/test_matrix_filter.py @@ -309,7 +309,7 @@ class TestMatrixFilter(object): # Test quantile binning on CRIM feature in california housing dataset, # which errored out due to first category not fitting into bins (X_train, X_test, y_train, y_test, - feature_names) = create_housing_data(test_size=0.5) + feature_names) = create_housing_data() model_task = ModelTask.REGRESSION matrix_features = ['Population'] diff --git a/responsibleai/requirements-dev.txt b/responsibleai/requirements-dev.txt index caccf78ca..46ea22c67 100644 --- a/responsibleai/requirements-dev.txt +++ b/responsibleai/requirements-dev.txt @@ -7,4 +7,4 @@ pytest-mock==3.6.1 # Required for responsibleai package tests deptree~=0.0.10 xgboost<=1.0.0 -rai_test_utils==0.0.0 +rai_test_utils==0.1.0 diff --git a/responsibleai/tests/causal/conftest.py b/responsibleai/tests/causal/conftest.py index d39a98006..23cf56143 100644 --- a/responsibleai/tests/causal/conftest.py +++ b/responsibleai/tests/causal/conftest.py @@ -8,7 +8,9 @@ import pandas as pd import pytest import shap from sklearn.model_selection import train_test_split -from tests.common_utils import create_adult_income_dataset, create_housing_data +from tests.common_utils import create_adult_income_dataset + +from rai_test_utils.datasets.tabular import create_housing_data @pytest.fixture(scope='session') diff --git a/responsibleai/tests/common_utils.py b/responsibleai/tests/common_utils.py index 9bc0963b3..496cf8021 100644 --- a/responsibleai/tests/common_utils.py +++ b/responsibleai/tests/common_utils.py @@ -8,12 +8,7 @@ import pandas as pd import pytest # Defines common utilities for responsibleai tests from dice_ml.utils import helpers -from sklearn.compose import ColumnTransformer -from sklearn.datasets import fetch_california_housing, load_breast_cancer -from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder, StandardScaler from rai_test_utils.datasets.tabular import \ create_iris_data as _create_iris_data @@ -35,39 +30,6 @@ def create_iris_data(): return X_train, X_test, y_train, y_test, feature_names, classes -def create_housing_data(create_small_dataset=True): - # Import California housing dataset - housing = fetch_california_housing() - # Split data into train and test - if create_small_dataset: - x_train, x_test, y_train, y_test = train_test_split(housing.data, - housing.target, - train_size=500, - test_size=50, - random_state=7) - else: - x_train, x_test, y_train, y_test = train_test_split(housing.data, - housing.target, - test_size=0.2, - random_state=7) - return x_train, x_test, y_train, y_test, housing.feature_names - - -def create_cancer_data(): - breast_cancer_data = load_breast_cancer() - classes = breast_cancer_data.target_names.tolist() - - # Split data into train and test - X_train, X_test, y_train, y_test = train_test_split( - breast_cancer_data.data, breast_cancer_data.target, - test_size=0.2, random_state=0) - feature_names = breast_cancer_data.feature_names - classes = breast_cancer_data.target_names.tolist() - X_train = pd.DataFrame(X_train, columns=feature_names) - X_test = pd.DataFrame(X_test, columns=feature_names) - return X_train, X_test, y_train, y_test, feature_names, classes - - class FetchDiceAdultCensusIncomeDataset(object): def __init__(self): pass @@ -108,28 +70,6 @@ def create_adult_income_dataset(create_small_dataset=True): feature_columns, feature_range_keys -def create_complex_classification_pipeline( - X_train, y_train, continuous_features, categorical_features): - # We create the preprocessing pipelines for both - # numeric and categorical data. - numeric_transformer = Pipeline(steps=[ - ('scaler', StandardScaler())]) - - categorical_transformer = Pipeline(steps=[ - ('onehot', OneHotEncoder(handle_unknown='ignore'))]) - - transformations = ColumnTransformer( - transformers=[ - ('num', numeric_transformer, continuous_features), - ('cat', categorical_transformer, categorical_features)]) - - # Append classifier to preprocessing pipeline. - # Now we have a full prediction pipeline. - pipeline = Pipeline(steps=[('preprocessor', transformations), - ('classifier', RandomForestClassifier())]) - return pipeline.fit(X_train, y_train) - - def assert_series_and_dict_equal(left: pd.Series, right: dict): left_json = left.to_json(orient="index") left_dict = json.loads(left_json) diff --git a/responsibleai/tests/model_analysis/test_model_analysis.py b/responsibleai/tests/model_analysis/test_model_analysis.py index bd3069d74..d05ad8443 100644 --- a/responsibleai/tests/model_analysis/test_model_analysis.py +++ b/responsibleai/tests/model_analysis/test_model_analysis.py @@ -12,20 +12,20 @@ import numpy as np import pandas as pd import pytest from tests.causal_manager_validator import validate_causal -from tests.common_utils import (create_adult_income_dataset, - create_cancer_data, - create_complex_classification_pipeline, - create_housing_data, create_iris_data) +from tests.common_utils import create_adult_income_dataset, create_iris_data from tests.counterfactual_manager_validator import validate_counterfactual from tests.error_analysis_validator import (setup_error_analysis, validate_error_analysis) from tests.explainer_manager_validator import (setup_explainer, validate_explainer) -from rai_test_utils.datasets.tabular import \ - create_binary_classification_dataset +from rai_test_utils.datasets.tabular import ( + create_binary_classification_dataset, create_cancer_data, + create_housing_data) from rai_test_utils.models.model_utils import (create_models_classification, create_models_regression) +from rai_test_utils.models.sklearn import \ + create_complex_classification_pipeline from responsibleai import ModelAnalysis, ModelTask from responsibleai._internal.constants import ManagerNames from responsibleai._tools.shared.state_directory_management import \ @@ -72,7 +72,7 @@ class TestModelAnalysis(object): ManagerNames.EXPLAINER]) def test_model_analysis_cancer(self, manager_type): X_train, X_test, y_train, y_test, _, classes = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) models = create_models_classification(X_train, y_train) X_train[LABELS] = y_train X_test[LABELS] = y_test diff --git a/responsibleai/tests/model_analysis/test_model_analysis_validations.py b/responsibleai/tests/model_analysis/test_model_analysis_validations.py index ea9d2e58c..fe23e98af 100644 --- a/responsibleai/tests/model_analysis/test_model_analysis_validations.py +++ b/responsibleai/tests/model_analysis/test_model_analysis_validations.py @@ -8,11 +8,11 @@ from unittest.mock import MagicMock import pandas as pd import pytest -from tests.common_utils import (create_cancer_data, create_housing_data, - create_iris_data) +from tests.common_utils import create_iris_data -from rai_test_utils.datasets.tabular import \ - create_binary_classification_dataset +from rai_test_utils.datasets.tabular import ( + create_binary_classification_dataset, create_cancer_data, + create_housing_data) from rai_test_utils.models.lightgbm import create_lightgbm_classifier from rai_test_utils.models.sklearn import \ create_sklearn_random_forest_regressor @@ -119,7 +119,7 @@ class TestModelAnalysisValidations: def test_validate_serializer(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train @@ -189,7 +189,7 @@ class TestModelAnalysisValidations: def test_model_predictions_predict(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) X_train['target'] = y_train X_test['target'] = y_test @@ -209,7 +209,7 @@ class TestModelAnalysisValidations: def test_model_predictions_predict_proba(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) X_train['target'] = y_train X_test['target'] = y_test @@ -231,7 +231,7 @@ class TestModelAnalysisValidations: def test_model_analysis_incorrect_task_type(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train @@ -250,7 +250,7 @@ class TestModelAnalysisValidations: def test_mismatch_train_test_features(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train @@ -268,7 +268,7 @@ class TestModelAnalysisValidations: def test_unsupported_train_test_types(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train @@ -286,7 +286,7 @@ class TestModelAnalysisValidations: def test_train_labels(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train diff --git a/responsibleai/tests/rai_insights/test_rai_insights.py b/responsibleai/tests/rai_insights/test_rai_insights.py index 538cfd96d..df5dc223d 100644 --- a/responsibleai/tests/rai_insights/test_rai_insights.py +++ b/responsibleai/tests/rai_insights/test_rai_insights.py @@ -11,20 +11,20 @@ import numpy as np import pandas as pd import pytest from tests.causal_manager_validator import validate_causal -from tests.common_utils import (create_adult_income_dataset, - create_cancer_data, - create_complex_classification_pipeline, - create_housing_data, create_iris_data) +from tests.common_utils import create_adult_income_dataset, create_iris_data from tests.counterfactual_manager_validator import validate_counterfactual from tests.error_analysis_validator import (setup_error_analysis, validate_error_analysis) from tests.explainer_manager_validator import (setup_explainer, validate_explainer) -from rai_test_utils.datasets.tabular import \ - create_binary_classification_dataset +from rai_test_utils.datasets.tabular import ( + create_binary_classification_dataset, create_cancer_data, + create_housing_data) from rai_test_utils.models.model_utils import (create_models_classification, create_models_regression) +from rai_test_utils.models.sklearn import \ + create_complex_classification_pipeline from responsibleai import ModelTask, RAIInsights from responsibleai._internal.constants import (ManagerNames, SerializationAttributes) @@ -78,7 +78,7 @@ class TestRAIInsights(object): ManagerNames.EXPLAINER]) def test_rai_insights_cancer(self, manager_type): X_train, X_test, y_train, y_test, _, classes = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) models = create_models_classification(X_train, y_train) X_train[LABELS] = y_train X_test[LABELS] = y_test diff --git a/responsibleai/tests/rai_insights/test_rai_insights_cohort_filter.py b/responsibleai/tests/rai_insights/test_rai_insights_cohort_filter.py index 47b354683..08a45347e 100644 --- a/responsibleai/tests/rai_insights/test_rai_insights_cohort_filter.py +++ b/responsibleai/tests/rai_insights/test_rai_insights_cohort_filter.py @@ -4,11 +4,12 @@ import numpy as np import pandas as pd import pytest -from tests.common_utils import create_housing_data, create_iris_data +from tests.common_utils import create_iris_data from erroranalysis._internal.constants import (PRED_Y, ROW_INDEX, TRUE_Y, ModelTask) -from rai_test_utils.datasets.tabular import create_simple_titanic_data +from rai_test_utils.datasets.tabular import (create_housing_data, + create_simple_titanic_data) from rai_test_utils.models.sklearn import ( create_sklearn_random_forest_regressor, create_sklearn_svm_classifier, create_titanic_pipeline) diff --git a/responsibleai/tests/rai_insights/test_rai_insights_data_size_scenarios.py b/responsibleai/tests/rai_insights/test_rai_insights_data_size_scenarios.py index c64a00a86..2e94109a0 100644 --- a/responsibleai/tests/rai_insights/test_rai_insights_data_size_scenarios.py +++ b/responsibleai/tests/rai_insights/test_rai_insights_data_size_scenarios.py @@ -6,8 +6,9 @@ from tempfile import TemporaryDirectory import pandas as pd import pytest -from tests.common_utils import create_housing_data, create_iris_data +from tests.common_utils import create_iris_data +from rai_test_utils.datasets.tabular import create_housing_data from rai_test_utils.models.sklearn import ( create_sklearn_random_forest_classifier, create_sklearn_random_forest_regressor) diff --git a/responsibleai/tests/rai_insights/test_rai_insights_save_and_load_scenarios.py b/responsibleai/tests/rai_insights/test_rai_insights_save_and_load_scenarios.py index ee538eb24..72d05ca13 100644 --- a/responsibleai/tests/rai_insights/test_rai_insights_save_and_load_scenarios.py +++ b/responsibleai/tests/rai_insights/test_rai_insights_save_and_load_scenarios.py @@ -8,13 +8,13 @@ from tempfile import TemporaryDirectory import numpy as np import pandas as pd import pytest -from tests.common_utils import (create_adult_income_dataset, - create_complex_classification_pipeline, - create_iris_data) +from tests.common_utils import create_adult_income_dataset, create_iris_data from rai_test_utils.datasets.tabular import \ create_binary_classification_dataset from rai_test_utils.models.lightgbm import create_lightgbm_classifier +from rai_test_utils.models.sklearn import \ + create_complex_classification_pipeline from responsibleai import ModelTask, RAIInsights from responsibleai._internal.constants import (ManagerNames, SerializationAttributes) diff --git a/responsibleai/tests/rai_insights/test_rai_insights_validations.py b/responsibleai/tests/rai_insights/test_rai_insights_validations.py index 1ece42d9b..15100bde6 100644 --- a/responsibleai/tests/rai_insights/test_rai_insights_validations.py +++ b/responsibleai/tests/rai_insights/test_rai_insights_validations.py @@ -8,11 +8,11 @@ import numpy as np import pandas as pd import pytest from lightgbm import LGBMClassifier -from tests.common_utils import (create_cancer_data, create_housing_data, - create_iris_data) +from tests.common_utils import create_iris_data -from rai_test_utils.datasets.tabular import \ - create_binary_classification_dataset +from rai_test_utils.datasets.tabular import ( + create_binary_classification_dataset, create_cancer_data, + create_housing_data) from rai_test_utils.models.lightgbm import create_lightgbm_classifier from rai_test_utils.models.sklearn import \ create_sklearn_random_forest_regressor @@ -157,7 +157,7 @@ class TestRAIInsightsValidations: def test_validate_serializer(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -227,7 +227,7 @@ class TestRAIInsightsValidations: def test_model_predictions_predict(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) X_train[TARGET] = y_train X_test[TARGET] = y_test @@ -247,7 +247,7 @@ class TestRAIInsightsValidations: def test_model_predictions_predict_proba(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) X_train[TARGET] = y_train X_test[TARGET] = y_test @@ -269,7 +269,7 @@ class TestRAIInsightsValidations: def test_incorrect_task_type(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -287,7 +287,7 @@ class TestRAIInsightsValidations: def test_mismatch_train_test_features(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -344,7 +344,7 @@ class TestRAIInsightsValidations: def test_unsupported_train_test_types(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -363,7 +363,7 @@ class TestRAIInsightsValidations: def test_classes_exceptions(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -414,7 +414,7 @@ class TestRAIInsightsValidations: def test_dataset_exception(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train_feature_names = X_train.columns.tolist() @@ -453,7 +453,7 @@ class TestRAIInsightsValidations: def test_classes_passes(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train @@ -471,7 +471,7 @@ class TestRAIInsightsValidations: def test_no_model_but_serializer_provided(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) X_train[TARGET] = y_train X_test[TARGET] = y_test @@ -489,7 +489,7 @@ class TestRAIInsightsValidations: def test_feature_metadata(self): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train[TARGET] = y_train diff --git a/responsibleai/tests/test_model_serializer.py b/responsibleai/tests/test_model_serializer.py index 2b5f5aa2a..afcb2fbb9 100644 --- a/responsibleai/tests/test_model_serializer.py +++ b/responsibleai/tests/test_model_serializer.py @@ -3,12 +3,11 @@ import pickle from pathlib import Path +from rai_test_utils.datasets.tabular import create_cancer_data from rai_test_utils.models.lightgbm import create_lightgbm_classifier from responsibleai import RAIInsights from responsibleai._internal.constants import SerializationAttributes -from .common_utils import create_cancer_data - class PickleSerializer: def save(self, model, model_dir): @@ -35,7 +34,7 @@ class TestModelSerializer: def test_init_with_pickle_serializer(self, tmpdir): X_train, X_test, y_train, y_test, _, _ = \ - create_cancer_data() + create_cancer_data(return_dataframe=True) model = create_lightgbm_classifier(X_train, y_train) X_train['target'] = y_train