Add cohort filtering capability to RAIInsights class (#1618)

Signed-off-by: Gaurav Gupta <gaugup@microsoft.com>
Gaurav Gupta 2022-08-08 12:32:26 -07:00 committed by GitHub
Parent b49b08983b
Commit 46e04a056f
No key found matching this signature
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 388 additions and 1 deletion
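For context, a minimal usage sketch of the new API (assuming a trained `model` and train/test pandas DataFrames containing a `target` column already exist; the filter schema mirrors the tests added in this commit):

    from responsibleai.rai_insights import RAIInsights

    # Build the insights object over the train/test data.
    rai_insights = RAIInsights(
        model, train, test, target_column='target',
        task_type='classification',
        categorical_features=[])

    # Keep only the test rows where 'sepal width' is at most 2.8.
    filters = [{'arg': [2.8],
                'column': 'sepal width',
                'method': 'less and equal'}]
    filtered_test = rai_insights.get_filtered_test_data(
        filters=filters,
        composite_filters=None)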

View File

@@ -12,6 +12,7 @@ from typing import Any, List, Optional
import numpy as np
import pandas as pd
from erroranalysis._internal.cohort_filter import FilterDataWithCohortFilters
from erroranalysis._internal.process_categoricals import process_categoricals
from raiutils.data_processing import convert_to_list
from raiutils.models import SKLearn, is_classifier
@@ -430,6 +431,37 @@ class RAIInsights(RAIBaseInsights):
"""
return self._explainer_manager
def get_filtered_test_data(self, filters, composite_filters,
include_original_columns_only=False):
"""Get the filtered test data based on cohort filters.
:param filters: The filters to apply.
:type filters: list[Filter]
:param composite_filters: The composite filters to apply.
:type composite_filters: list[CompositeFilter]
:param include_original_columns_only: Whether to return the original
data columns.
:type include_original_columns_only: bool
:return: The filtered test data.
:rtype: pandas.DataFrame
"""
pred_y = self.model.predict(
self.test.drop(columns=[self.target_column]))
filter_data_with_cohort = FilterDataWithCohortFilters(
model=self.model,
dataset=self.test.drop(columns=[self.target_column]),
features=self.test.drop(columns=[self.target_column]).columns,
categorical_features=self.categorical_features,
categories=self._categories,
true_y=self.test[self.target_column],
pred_y=pred_y,
model_task=self.task_type)
return filter_data_with_cohort.filter_data_from_cohort(
filters=filters,
composite_filters=composite_filters,
include_original_columns_only=include_original_columns_only)
def get_data(self):
"""Get all data as RAIInsightsData object

View File

@@ -14,10 +14,12 @@ from sklearn.compose import ColumnTransformer
from sklearn.datasets import (fetch_california_housing, load_breast_cancer,
load_iris, make_classification)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import (FunctionTransformer, OneHotEncoder,
StandardScaler)
from xgboost import XGBClassifier
@@ -87,6 +89,58 @@ def create_iris_data():
return X_train, X_test, y_train, y_test, feature_names, classes
def create_simple_titanic_data():
titanic_url = ('https://raw.githubusercontent.com/amueller/'
'scipy-2017-sklearn/091d371/notebooks/'
'datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
# fill missing values
data = data.fillna(method="ffill")
data = data.fillna(method="bfill")
num_features = ['age', 'fare']
cat_features = ['embarked', 'sex', 'pclass']
y = data['survived'].values
X = data[cat_features + num_features]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
return X_train, X_test, y_train, y_test, num_features, cat_features
def create_titanic_pipeline(X_train, y_train):
def conv(X):
if isinstance(X, pd.Series):
return X.values
return X
many_to_one_transformer = \
FunctionTransformer(lambda x: conv(x.sum(axis=1)).reshape(-1, 1))
many_to_many_transformer = \
FunctionTransformer(lambda x: np.hstack(
(conv(np.prod(x, axis=1)).reshape(-1, 1),
conv(np.prod(x, axis=1)**2).reshape(-1, 1))
))
transformations = ColumnTransformer([
("age_fare_1", Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
]), ["age", "fare"]),
("age_fare_2", many_to_one_transformer, ["age", "fare"]),
("age_fare_3", many_to_many_transformer, ["age", "fare"]),
("embarked", Pipeline(steps=[
("imputer",
SimpleImputer(strategy='constant', fill_value='missing')),
("encoder", OneHotEncoder(sparse=False))]), ["embarked"]),
("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"])
])
clf = Pipeline(steps=[('preprocessor', transformations),
('classifier',
LogisticRegression(solver='lbfgs'))])
clf.fit(X_train, y_train)
return clf
def create_cancer_data():
breast_cancer_data = load_breast_cancer()
classes = breast_cancer_data.target_names.tolist()

View File

@@ -0,0 +1,301 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.
import numpy as np
import pandas as pd
import pytest
from tests.common_utils import (create_iris_data, create_simple_titanic_data,
create_sklearn_svm_classifier,
create_titanic_pipeline)
from erroranalysis._internal.constants import (PRED_Y, ROW_INDEX, TRUE_Y,
ModelTask)
from responsibleai.rai_insights import RAIInsights
TOL = 1e-10
SEPAL_WIDTH = 'sepal width'
EMBARKED = 'embarked'
CLASSIFICATION_OUTCOME = 'Classification outcome'
class TestCohortFilterRAIInsights(object):
def test_cohort_filter_equal(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8],
'column': SEPAL_WIDTH,
'method': 'equal'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[X_test[SEPAL_WIDTH] == 2.8]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_less(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8],
'column': SEPAL_WIDTH,
'method': 'less'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[X_test[SEPAL_WIDTH] < 2.8]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_less_and_equal(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8],
'column': SEPAL_WIDTH,
'method': 'less and equal'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[X_test[SEPAL_WIDTH] <= 2.8]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_greater(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8],
'column': SEPAL_WIDTH,
'method': 'greater'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[X_test[SEPAL_WIDTH] > 2.8]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_greater_and_equal(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8],
'column': SEPAL_WIDTH,
'method': 'greater and equal'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[X_test[SEPAL_WIDTH] >= 2.8]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_in_the_range_of(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
filters = [{'arg': [2.8, 3.4],
'column': SEPAL_WIDTH,
'method': 'in the range of'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[
(X_test[SEPAL_WIDTH] <= 3.4) & (X_test[SEPAL_WIDTH] >= 2.8)]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_includes(self):
X_train, X_test, y_train, y_test, numeric, categorical = \
create_simple_titanic_data()
feature_names = categorical + numeric
clf = create_titanic_pipeline(X_train, y_train)
categorical_features = categorical
# the indexes 0, 2 correspond to S, C
filters = [{'arg': [0, 2],
'column': EMBARKED,
'method': 'includes'}]
validation_data = create_validation_data(X_test, y_test)
filter_embarked = X_test[EMBARKED].isin(['S', 'C'])
validation_data = validation_data.loc[filter_embarked]
model_task = ModelTask.CLASSIFICATION
run_rai_insights(validation_data,
clf,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_excludes(self):
X_train, X_test, y_train, y_test, numeric, categorical = \
create_simple_titanic_data()
feature_names = categorical + numeric
clf = create_titanic_pipeline(X_train, y_train)
categorical_features = categorical
# the indexes other than 0, 2 correspond to Q
filters = [{'arg': [0, 2],
'column': EMBARKED,
'method': 'excludes'}]
validation_data = create_validation_data(X_test, y_test)
filter_embarked = X_test[EMBARKED].isin(['Q'])
validation_data = validation_data.loc[filter_embarked]
model_task = ModelTask.CLASSIFICATION
run_rai_insights(validation_data,
clf,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
@pytest.mark.parametrize('arg, outcome', [([1, 2], False), ([0, 3], True)])
def test_cohort_filter_classification_outcome(self, arg, outcome):
X_train, X_test, y_train, y_test, numeric, categorical = \
create_simple_titanic_data()
feature_names = categorical + numeric
clf = create_titanic_pipeline(X_train, y_train)
categorical_features = categorical
# the indexes 1, 2 correspond to false positives and false negatives
# the indexes 0, 3 correspond to true positives and true negatives
filters = [{'arg': arg,
'column': CLASSIFICATION_OUTCOME,
'method': 'includes'}]
pred_y = clf.predict(X_test)
validation_data = create_validation_data(X_test, y_test, pred_y)
if not outcome:
validation_filter = validation_data[PRED_Y] != validation_data[
TRUE_Y]
else:
validation_filter = validation_data[PRED_Y] == validation_data[
TRUE_Y]
validation_data = validation_data.loc[validation_filter]
validation_data = validation_data.drop(columns=PRED_Y)
model_task = ModelTask.CLASSIFICATION
run_rai_insights(validation_data,
clf,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def test_cohort_filter_index(self):
X_train, X_test, y_train, y_test, feature_names = create_iris_pandas()
# filter on index, which can be done from the RAI dashboard
filters = [{'arg': [40],
'column': ROW_INDEX,
'method': 'less and equal'}]
validation_data = create_validation_data(X_test, y_test)
validation_data = validation_data.loc[validation_data[ROW_INDEX] <= 40]
model_task = ModelTask.CLASSIFICATION
model = create_sklearn_svm_classifier(X_train, y_train)
categorical_features = []
run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=filters)
def create_iris_pandas():
X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)
return X_train, X_test, y_train, y_test, feature_names
def create_validation_data(X_test, y_test, pred_y=None):
validation_data = X_test.copy()
validation_data[TRUE_Y] = y_test
validation_data[ROW_INDEX] = np.arange(0, len(y_test))
if pred_y is not None:
validation_data[PRED_Y] = pred_y
return validation_data
def run_rai_insights(validation_data,
model,
X_train,
y_train,
X_test,
y_test,
feature_names,
categorical_features,
model_task,
filters=None,
composite_filters=None):
train = X_train.copy()
train["target"] = y_train
test = X_test.copy()
test["target"] = y_test
rai_insights = RAIInsights(
model, train, test, "target", model_task,
categorical_features=categorical_features)
filtered_data = rai_insights.get_filtered_test_data(
filters,
composite_filters)
# validate there is some data selected for each of the filters
assert validation_data.shape[0] > 0
assert validation_data.equals(filtered_data)
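As the Titanic tests above illustrate, filters on categorical columns pass category indexes rather than raw values (here the indexes 0 and 2 correspond to the 'S' and 'C' categories of 'embarked'). A hedged continuation of the earlier sketch, assuming a RAIInsights instance built over the Titanic data with categorical_features including 'embarked':

    # Keep only the test rows whose 'embarked' value is 'S' or 'C',
    # addressed by category index as the dashboard does.
    filters = [{'arg': [0, 2],
                'column': 'embarked',
                'method': 'includes'}]
    filtered_test = rai_insights.get_filtered_test_data(
        filters=filters,
        composite_filters=None,
        include_original_columns_only=True)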