Add cohort filtering capability in RAIInsights class (#1618)
Signed-off-by: Gaurav Gupta <gaugup@microsoft.com>
This commit is contained in:
Parent
b49b08983b
Commit
46e04a056f
|
@ -12,6 +12,7 @@ from typing import Any, List, Optional
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from erroranalysis._internal.cohort_filter import FilterDataWithCohortFilters
|
||||
from erroranalysis._internal.process_categoricals import process_categoricals
|
||||
from raiutils.data_processing import convert_to_list
|
||||
from raiutils.models import SKLearn, is_classifier
|
||||
|
@ -430,6 +431,37 @@ class RAIInsights(RAIBaseInsights):
|
|||
"""
|
||||
return self._explainer_manager
|
||||
|
||||
def get_filtered_test_data(self, filters, composite_filters,
                           include_original_columns_only=False):
    """Get the filtered test data based on cohort filters.

    :param filters: The filters to apply.
    :type filters: list[Filter]
    :param composite_filters: The composite filters to apply.
    :type composite_filters: list[CompositeFilter]
    :param include_original_columns_only: Whether to return the original
        data columns.
    :type include_original_columns_only: bool
    :return: The filtered test data.
    :rtype: pandas.DataFrame
    """
    # Drop the target column once instead of re-deriving the feature
    # frame three times (predict input, dataset and feature names below).
    test_features = self.test.drop(columns=[self.target_column])
    pred_y = self.model.predict(test_features)
    filter_data_with_cohort = FilterDataWithCohortFilters(
        model=self.model,
        dataset=test_features,
        features=test_features.columns,
        categorical_features=self.categorical_features,
        categories=self._categories,
        true_y=self.test[self.target_column],
        pred_y=pred_y,
        model_task=self.task_type)

    return filter_data_with_cohort.filter_data_from_cohort(
        filters=filters,
        composite_filters=composite_filters,
        include_original_columns_only=include_original_columns_only)
|
||||
|
||||
def get_data(self):
|
||||
"""Get all data as RAIInsightsData object
|
||||
|
||||
|
|
|
@ -14,10 +14,12 @@ from sklearn.compose import ColumnTransformer
|
|||
from sklearn.datasets import (fetch_california_housing, load_breast_cancer,
|
||||
load_iris, make_classification)
|
||||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
||||
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
|
||||
StandardScaler)
|
||||
from xgboost import XGBClassifier
|
||||
|
||||
|
||||
|
@ -87,6 +89,58 @@ def create_iris_data():
|
|||
return X_train, X_test, y_train, y_test, feature_names, classes
|
||||
|
||||
|
||||
def create_simple_titanic_data():
    """Download the titanic dataset and return an 80/20 train/test split.

    :return: X_train, X_test, y_train, y_test, numeric feature names,
        categorical feature names.
    :rtype: tuple
    """
    titanic_url = ('https://raw.githubusercontent.com/amueller/'
                   'scipy-2017-sklearn/091d371/notebooks/'
                   'datasets/titanic3.csv')
    data = pd.read_csv(titanic_url)
    # Fill missing values. DataFrame.fillna(method=...) is deprecated in
    # pandas 2.x; the dedicated ffill/bfill methods are the supported,
    # behavior-identical equivalents.
    data = data.ffill()
    data = data.bfill()
    num_features = ['age', 'fare']
    cat_features = ['embarked', 'sex', 'pclass']

    y = data['survived'].values
    X = data[cat_features + num_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test, num_features, cat_features
|
||||
|
||||
|
||||
def create_titanic_pipeline(X_train, y_train):
    """Build and fit a LogisticRegression pipeline on the titanic data.

    The preprocessor combines imputation/scaling of the numeric columns,
    two engineered transforms over ["age", "fare"] (their sum, and their
    product plus its square), and one-hot encoding of the categoricals.

    :param X_train: Training features.
    :param y_train: Training labels.
    :return: The fitted sklearn Pipeline.
    """
    def to_array(values):
        # FunctionTransformer may hand us a pandas Series; normalize it
        # to a plain ndarray before reshaping.
        if isinstance(values, pd.Series):
            return values.values
        return values

    sum_transformer = FunctionTransformer(
        lambda x: to_array(x.sum(axis=1)).reshape(-1, 1))
    product_transformer = FunctionTransformer(
        lambda x: np.hstack(
            (to_array(np.prod(x, axis=1)).reshape(-1, 1),
             to_array(np.prod(x, axis=1)**2).reshape(-1, 1))
        ))

    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    embarked_pipeline = Pipeline(steps=[
        ("imputer",
         SimpleImputer(strategy='constant', fill_value='missing')),
        ("encoder", OneHotEncoder(sparse=False))])

    transformations = ColumnTransformer([
        ("age_fare_1", numeric_pipeline, ["age", "fare"]),
        ("age_fare_2", sum_transformer, ["age", "fare"]),
        ("age_fare_3", product_transformer, ["age", "fare"]),
        ("embarked", embarked_pipeline, ["embarked"]),
        ("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"])
    ])
    clf = Pipeline(steps=[('preprocessor', transformations),
                          ('classifier',
                           LogisticRegression(solver='lbfgs'))])
    clf.fit(X_train, y_train)
    return clf
|
||||
|
||||
|
||||
def create_cancer_data():
|
||||
breast_cancer_data = load_breast_cancer()
|
||||
classes = breast_cancer_data.target_names.tolist()
|
||||
|
|
|
@ -0,0 +1,301 @@
|
|||
# Copyright (c) Microsoft Corporation
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from tests.common_utils import (create_iris_data, create_simple_titanic_data,
|
||||
create_sklearn_svm_classifier,
|
||||
create_titanic_pipeline)
|
||||
|
||||
from erroranalysis._internal.constants import (PRED_Y, ROW_INDEX, TRUE_Y,
|
||||
ModelTask)
|
||||
from responsibleai.rai_insights import RAIInsights
|
||||
|
||||
TOL = 1e-10
|
||||
SEPAL_WIDTH = 'sepal width'
|
||||
EMBARKED = 'embarked'
|
||||
CLASSIFICATION_OUTCOME = 'Classification outcome'
|
||||
|
||||
|
||||
class TestCohortFilterRAIInsights(object):
    """Tests for cohort filtering via RAIInsights.get_filtered_test_data.

    The repeated load-data/filter/validate sequence is factored into the
    two private helpers below; every public test method name is unchanged.
    (The original test_cohort_filter_index also assigned model_task twice;
    the duplicate assignment is removed here.)
    """

    def _run_iris_filter_test(self, filters, select_rows):
        """Run one iris cohort filter scenario.

        :param filters: The cohort filters to apply.
        :type filters: list[dict]
        :param select_rows: Maps the test features to a boolean mask of
            the rows the filters are expected to keep.
        :type select_rows: callable
        """
        X_train, X_test, y_train, y_test, feature_names = \
            create_iris_pandas()
        validation_data = create_validation_data(X_test, y_test)
        validation_data = validation_data.loc[select_rows(X_test)]
        model = create_sklearn_svm_classifier(X_train, y_train)
        run_rai_insights(validation_data,
                         model,
                         X_train,
                         y_train,
                         X_test,
                         y_test,
                         feature_names,
                         [],
                         ModelTask.CLASSIFICATION,
                         filters=filters)

    def _run_titanic_filter_test(self, filters, select_rows):
        """Run one titanic cohort filter scenario.

        :param filters: The cohort filters to apply.
        :type filters: list[dict]
        :param select_rows: Maps the test features to a boolean mask of
            the rows the filters are expected to keep.
        :type select_rows: callable
        """
        X_train, X_test, y_train, y_test, numeric, categorical = \
            create_simple_titanic_data()
        feature_names = categorical + numeric
        clf = create_titanic_pipeline(X_train, y_train)
        validation_data = create_validation_data(X_test, y_test)
        validation_data = validation_data.loc[select_rows(X_test)]
        run_rai_insights(validation_data,
                         clf,
                         X_train,
                         y_train,
                         X_test,
                         y_test,
                         feature_names,
                         categorical,
                         ModelTask.CLASSIFICATION,
                         filters=filters)

    def test_cohort_filter_equal(self):
        filters = [{'arg': [2.8],
                    'column': SEPAL_WIDTH,
                    'method': 'equal'}]
        self._run_iris_filter_test(
            filters, lambda X_test: X_test[SEPAL_WIDTH] == 2.8)

    def test_cohort_filter_less(self):
        filters = [{'arg': [2.8],
                    'column': SEPAL_WIDTH,
                    'method': 'less'}]
        self._run_iris_filter_test(
            filters, lambda X_test: X_test[SEPAL_WIDTH] < 2.8)

    def test_cohort_filter_less_and_equal(self):
        filters = [{'arg': [2.8],
                    'column': SEPAL_WIDTH,
                    'method': 'less and equal'}]
        self._run_iris_filter_test(
            filters, lambda X_test: X_test[SEPAL_WIDTH] <= 2.8)

    def test_cohort_filter_greater(self):
        filters = [{'arg': [2.8],
                    'column': SEPAL_WIDTH,
                    'method': 'greater'}]
        self._run_iris_filter_test(
            filters, lambda X_test: X_test[SEPAL_WIDTH] > 2.8)

    def test_cohort_filter_greater_and_equal(self):
        filters = [{'arg': [2.8],
                    'column': SEPAL_WIDTH,
                    'method': 'greater and equal'}]
        self._run_iris_filter_test(
            filters, lambda X_test: X_test[SEPAL_WIDTH] >= 2.8)

    def test_cohort_filter_in_the_range_of(self):
        filters = [{'arg': [2.8, 3.4],
                    'column': SEPAL_WIDTH,
                    'method': 'in the range of'}]
        self._run_iris_filter_test(
            filters,
            lambda X_test: (X_test[SEPAL_WIDTH] <= 3.4) &
                           (X_test[SEPAL_WIDTH] >= 2.8))

    def test_cohort_filter_includes(self):
        # the indexes 0, 2 correspond to S, C
        filters = [{'arg': [0, 2],
                    'column': EMBARKED,
                    'method': 'includes'}]
        self._run_titanic_filter_test(
            filters, lambda X_test: X_test[EMBARKED].isin(['S', 'C']))

    def test_cohort_filter_excludes(self):
        # the indexes other than 0, 2 correspond to Q
        filters = [{'arg': [0, 2],
                    'column': EMBARKED,
                    'method': 'excludes'}]
        self._run_titanic_filter_test(
            filters, lambda X_test: X_test[EMBARKED].isin(['Q']))

    @pytest.mark.parametrize('arg, outcome', [([1, 2], False),
                                              ([0, 3], True)])
    def test_cohort_filter_classification_outcome(self, arg, outcome):
        X_train, X_test, y_train, y_test, numeric, categorical = \
            create_simple_titanic_data()
        feature_names = categorical + numeric
        clf = create_titanic_pipeline(X_train, y_train)
        # the indexes 1, 2 correspond to false positives and false negatives
        # the indexes 0, 3 correspond to true positives and true negatives
        filters = [{'arg': arg,
                    'column': CLASSIFICATION_OUTCOME,
                    'method': 'includes'}]
        pred_y = clf.predict(X_test)
        # This scenario compares predictions to labels, so it builds its
        # validation mask from PRED_Y/TRUE_Y rather than from a feature.
        validation_data = create_validation_data(X_test, y_test, pred_y)
        if not outcome:
            validation_filter = validation_data[PRED_Y] != validation_data[
                TRUE_Y]
        else:
            validation_filter = validation_data[PRED_Y] == validation_data[
                TRUE_Y]
        validation_data = validation_data.loc[validation_filter]
        validation_data = validation_data.drop(columns=PRED_Y)
        run_rai_insights(validation_data,
                         clf,
                         X_train,
                         y_train,
                         X_test,
                         y_test,
                         feature_names,
                         categorical,
                         ModelTask.CLASSIFICATION,
                         filters=filters)

    def test_cohort_filter_index(self):
        X_train, X_test, y_train, y_test, feature_names = \
            create_iris_pandas()
        # filter on index, which can be done from the RAI dashboard
        filters = [{'arg': [40],
                    'column': ROW_INDEX,
                    'method': 'less and equal'}]
        # The mask is on the synthetic ROW_INDEX column, not on a feature,
        # so this test does not go through _run_iris_filter_test.
        validation_data = create_validation_data(X_test, y_test)
        validation_data = validation_data.loc[
            validation_data[ROW_INDEX] <= 40]
        model = create_sklearn_svm_classifier(X_train, y_train)
        run_rai_insights(validation_data,
                         model,
                         X_train,
                         y_train,
                         X_test,
                         y_test,
                         feature_names,
                         [],
                         ModelTask.CLASSIFICATION,
                         filters=filters)
|
||||
|
||||
|
||||
def create_iris_pandas():
    """Create the iris train/test split with pandas feature frames.

    :return: X_train, X_test, y_train, y_test, feature_names.
    :rtype: tuple
    """
    X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()

    # Wrap both raw feature arrays as DataFrames with named columns.
    X_train, X_test = (pd.DataFrame(split, columns=feature_names)
                       for split in (X_train, X_test))

    return X_train, X_test, y_train, y_test, feature_names
|
||||
|
||||
|
||||
def create_validation_data(X_test, y_test, pred_y=None):
    """Build the expected validation frame for a cohort filter test.

    Copies the test features and appends the true labels, a 0-based row
    index column and, when supplied, the model predictions.

    :param X_test: Test features.
    :param y_test: True labels.
    :param pred_y: Optional predicted labels.
    :return: The assembled validation frame.
    :rtype: pandas.DataFrame
    """
    frame = X_test.copy()
    frame[TRUE_Y] = y_test
    frame[ROW_INDEX] = np.arange(len(y_test))
    if pred_y is not None:
        frame[PRED_Y] = pred_y
    return frame
|
||||
|
||||
|
||||
def run_rai_insights(validation_data,
                     model,
                     X_train,
                     y_train,
                     X_test,
                     y_test,
                     feature_names,
                     categorical_features,
                     model_task,
                     filters=None,
                     composite_filters=None):
    """Filter the test data through RAIInsights and check the result.

    Builds train/test frames with a "target" label column, constructs a
    RAIInsights instance, applies the given cohort filters to the test
    data and asserts the outcome equals the expected validation frame.
    """
    target_name = "target"
    train = X_train.copy()
    test = X_test.copy()
    train[target_name] = y_train
    test[target_name] = y_test
    rai_insights = RAIInsights(
        model, train, test, target_name, model_task,
        categorical_features=categorical_features)

    filtered_data = rai_insights.get_filtered_test_data(
        filters,
        composite_filters)

    # validate there is some data selected for each of the filters
    assert validation_data.shape[0] > 0
    assert validation_data.equals(filtered_data)
|
Loading…
Link in new issue