initial commit
This commit is contained in:
Parent
d416224d8f
Commit
0f189e729b
@@ -0,0 +1,194 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging
import random

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import mct.Constants as Constants
import mct.Utilities as Utils
from mct.HypothesisTester import chi_square_bias_test


class BiasTester(object):
    """
    Perform a bias check between the control and treatment dataframes.
    """

    __group_control = 'group_control'
    __group_treatment = 'group_treatment'
    __index_group = 'index_group'
    __rf_propensity_scores = 'rf_propensity_scores'

    def __init__(self, config: json):
        self.config = config
        self.__logger = logging.getLogger("mct")
        return

    def check_bias(self, control_df: pd.DataFrame, treatment_df: pd.DataFrame) -> \
            (pd.DataFrame, pd.DataFrame, pd.DataFrame, bool):
        """
        Compares the distribution of each invariant feature separately and flags any statistically
        significant difference that exceeds the given minimum percentage deviation threshold.

        :param control_df: control dataframe
        :param treatment_df: treatment dataframe
        :return: bias results, per-bin deviations, and a flag indicating whether the data is biased
        """

        self.__logger.debug('Checking for Population Bias')

        invariant_features = self.config[Constants.invariant_columns]
        p_value_threshold = self.config[Constants.p_value_threshold]
        percentage_deviation_threshold = self.config[Constants.resample_threshold]
        small_bin_percent_threshold = self.config[Constants.small_bin_percent_threshold]

        bias_results, deviation, is_biased = chi_square_bias_test(control_df[invariant_features],
                                                                  treatment_df[invariant_features],
                                                                  groups=[Constants.control_group,
                                                                          Constants.treatment_group],
                                                                  group_column_name=Constants.group_column_name,
                                                                  other_threshold=small_bin_percent_threshold,
                                                                  p_value=0.01)

        bias_results[Constants.num_of_bins] = bias_results[Constants.degree_of_freedom] + 1

        bias_results[Constants.resample] = 'no'
        bias_results.loc[(bias_results[Constants.percentage_deviation] > percentage_deviation_threshold)
                         & (bias_results[Constants.p_value_threshold] < p_value_threshold),
                         Constants.resample] = 'yes'

        # Sort the bias results.
        bias_results.sort_values(by=[Constants.percentage_deviation, Constants.feature], ascending=False, inplace=True)

        is_biased = is_biased and (bias_results[Constants.resample] == 'yes').any()
        self.__logger.info("Is Data biased: {0}".format(is_biased))

        # Sort the deviations.
        deviation.sort_values(
            by=[Constants.feature, Constants.bin_column],
            ascending=False,
            inplace=True)

        return bias_results, deviation, is_biased

    def normalize_bias(self, control: pd.DataFrame, treatment: pd.DataFrame, bias_results: pd.DataFrame,
                       random_state=None) -> (pd.DataFrame, pd.DataFrame):
        """
        Normalize and correct for the major biases.

        bias_results - needs to include the columns to normalize and their degrees of freedom.
        """
        self.__logger.debug("Bias Normalization: started")

        Utils.add_group_columns(control, treatment)

        if self.config[Constants.normalization_type] != 'rf':
            message = 'Currently the only supported normalization type is random forest'
            self.__logger.error(message)
            raise Exception(message)

        if not bias_results.empty:
            resample_columns = bias_results[Constants.feature]
            max_categories = bias_results[Constants.num_of_bins]

            data_splits = [(self.__group_control, control), (self.__group_treatment, treatment)]

            feature_transforms = [('categorical', x, y) for x, y in zip(resample_columns, max_categories)]

            self.__logger.info('Using RF propensity scores with caliper based matching.')

            # Get data after sampling.
            df_metric = self.__sample_propensity(data_splits, feature_transforms, random_state=random_state)
            df_control = df_metric[df_metric[Constants.group_column_name] == Constants.control_group]
            df_treatment = df_metric[df_metric[Constants.group_column_name] == Constants.treatment_group]

            return df_control, df_treatment
        else:
            self.__logger.info("Bias Normalization skipped.")
            self.__logger.debug("Bias Normalization finished. ")
            # Nothing to normalize: return the input frames unchanged.
            return control, treatment

    # Transform the input data
    def __transform(self, input_frame, features):
        train = pd.DataFrame(index=input_frame.index)
        for func, feat, max_categories in features:
            # Reduce cardinality of input_frame
            dt = input_frame[feat].astype(str)
            feat_counts = dt.value_counts()
            if len(feat_counts) > max_categories:
                dt[~dt.isin(feat_counts[:max_categories].index)] = Constants.other_feature_cluster_name
            # One-hot encode the features
            train = train.join(pd.get_dummies(dt, prefix=feat))

        return train

    def __rf_propensity(self, data, target, random_state=None):

        scalar = StandardScaler()
        data_transformed = scalar.fit_transform(data)

        clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=random_state, n_estimators=10)
        clf.fit(data_transformed, target)
        scores = clf.predict_proba(data_transformed)

        return scores[:, 1]

    def ___matching_1_1(self, df, random_state=None):
        df_c = df[df[self.__index_group] == self.__group_control]
        df_t = df[df[self.__index_group] == self.__group_treatment]
        df_ps = pd.DataFrame(df_c[self.__rf_propensity_scores].value_counts()).join(
            pd.DataFrame(df_t[self.__rf_propensity_scores].value_counts()),
            on=None,
            how='inner',
            lsuffix='l',
            rsuffix='r',
            sort=False)
        df_ps['num2use'] = df_ps[['rf_propensity_scoresl', 'rf_propensity_scoresr']].min(axis=1)
        index_c = []
        index_t = []

        random.seed(a=random_state)
        for i in df_ps.index:
            kk = df_ps.loc[i]['num2use']
            index_c += random.sample([ind for ind in df_c[df_c[self.__rf_propensity_scores] == i].index_original], k=kk)
            index_t += random.sample([ind for ind in df_t[df_t[self.__rf_propensity_scores] == i].index_original], k=kk)

        return index_c, index_t

    def __matching_caliper(self, df, caliper_coeff=0.1, random_state=None):
        caliper_width = caliper_coeff * df[self.__rf_propensity_scores].std()
        df[self.__rf_propensity_scores] = (df[self.__rf_propensity_scores] / caliper_width).astype(int)
        return self.___matching_1_1(df, random_state=random_state)

    def __sample_propensity(self, splits, feats, caliper_coeff=0.1, match_type='caliper', random_state=None):
        # Concatenate the split dataframes, keeping the labels.

        df = pd.concat([i for _, i in splits], keys=[splits[0][0], splits[1][0]],
                       names=[self.__index_group, 'index_original'])

        # Note: resetting index, to prevent potential problems with having the same index values after the concat.
        df.reset_index(inplace=True)

        # Set up data frame for classification algorithm.
        pred_frame = self.__transform(df, feats)

        # Get propensity scores using the RF algorithm.
        df[self.__rf_propensity_scores] = self.__rf_propensity(pred_frame, df[self.__index_group],
                                                               random_state=random_state)

        # Perform 1-1 matching based on the propensity scores.
        if match_type == 'caliper':
            ind_c, ind_t = self.__matching_caliper(df, caliper_coeff=caliper_coeff, random_state=random_state)
        else:
            ind_c, ind_t = self.___matching_1_1(df, random_state=random_state)

        self.__logger.info("Resampled data size: {}, Percent of retained data: {}:"
                           .format(len(ind_c) * 2, int(len(ind_c) * 2 / len(df) * 100)))
        self.__logger.info("Percent retained in Control {}, Percent retained in Treatment {}:"
                           .format(int(len(ind_c) / len(df[df[self.__index_group] == self.__group_control]) * 100),
                                   int(len(ind_c) / len(df[df[self.__index_group] == self.__group_treatment]) * 100)))

        return pd.concat([splits[0][1].filter(ind_c, axis=0), splits[1][1].filter(ind_t, axis=0)])
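
A rough sketch of driving BiasTester directly, outside the full pipeline. Everything here is illustrative and not part of the library: the 'country' column, the threshold values, and the toy group sizes are made up, and the run is a sketch under those assumptions rather than a tested recipe.

# Illustrative only: column name, thresholds, and data are hypothetical.
import pandas as pd
import mct.Constants as Constants
from mct.BiasTester import BiasTester

config = {
    Constants.invariant_columns: ['country'],
    Constants.p_value_threshold: 0.05,
    Constants.resample_threshold: 5.0,
    Constants.small_bin_percent_threshold: 1.0,
    Constants.normalization_type: 'rf',
}
control = pd.DataFrame({'country': ['US'] * 80 + ['CA'] * 20})
treatment = pd.DataFrame({'country': ['US'] * 60 + ['CA'] * 40})

tester = BiasTester(config)
bias_results, deviation, is_biased = tester.check_bias(control, treatment)
if is_biased:
    # Re-balance the two populations via RF propensity scores and caliper matching.
    control, treatment = tester.normalize_bias(control, treatment, bias_results)
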
@@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Code Constants
skip = 'skip'
sorting_type_delta_count = 'delta_count'
sorting_type_delta_percent = 'delta_percent'
# Reserved column names
group_column_name = "group"
other_feature_cluster_name = 'other'
control_group = 'control'
treatment_group = 'treatment'
# Computed columns
feature = 'feature'
resample = 'resample'
expected_failures = '# of Expected Failures in Treatment'
actual_failures = '# of Actual Failures in Treatment'
num_of_bins = 'num bins'
degree_of_freedom = 'dof'
percentage_deviation = 'Percentage Deviation'
mean_difference = 'Percent Difference'
mean_control = 'Percent Control'
control_percent = 'control_percent'
treatment_percent = 'treatment_percent'
mean_treatment = 'Percent Treatment'
percent_delta = '% Delta'
count_delta = 'Delta (Count)'
hazard_score = 'Hazard Score'
bin_column = 'bin'
p_value = 'P-Value'
is_stat_sig = 'Is Stat-Sig'
# Config parameters
small_bin_percent_threshold = 'small_bin_percent_threshold'
p_value_threshold = 'p_value'
sort_type = 'sort_type'
normalization_type = 'normalization_type'
metric_column = "metric_col"
invariant_columns = 'invariant_columns'
feature_columns = 'feature_columns'
results_dir = "results_dir"
add_is_null_column = 'add_null'
resample_threshold = 'resample_threshold'
decomposition_type = 'decomposition_type'
required_config_keys = [metric_column, invariant_columns, feature_columns, resample_threshold, results_dir,
                        p_value_threshold, decomposition_type, normalization_type, sort_type, add_is_null_column,
                        'apply_feature_target_metric_dependence_test']

log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
log_file_name = 'mct.log'
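
For reference, a config dict covering required_config_keys (plus the small-bin threshold read by BiasTester) might look like the sketch below. The key names come from the constants above; every value is an example chosen for illustration, not something the module prescribes.

# Hypothetical config; key strings match Constants, values are examples only.
example_config = {
    'metric_col': 'is_failure',
    'invariant_columns': ['country', 'platform'],
    'feature_columns': ['cpu_model', 'network_type'],
    'resample_threshold': 5.0,
    'results_dir': './results',
    'p_value': 0.05,
    'decomposition_type': 'univariate',
    'normalization_type': 'rf',
    'sort_type': 'hazard_score',
    'add_null': True,
    'small_bin_percent_threshold': 1.0,
    'apply_feature_target_metric_dependence_test': True,
}
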
@@ -0,0 +1,177 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import logging

import numpy as np
import pandas as pd
import scipy.stats as sp

import mct.Constants as Constants
from mct.Utilities import get_is_null_column_name


class FeatureBinGenerator(object):
    """
    Class to bin numeric features.
    """

    def __init__(self):
        self.__logger = logging.getLogger("mct")
        return

    @staticmethod
    def create_top_bins(data: pd.DataFrame, column_name: str, number_of_bins: int,
                        minimum_size: int, other_bin_name: str, add_is_null_column: bool) -> pd.Series:
        """
        Create at most number_of_bins bins by selecting the number_of_bins most frequent values
        and combining the rest into an 'other' bin.

        :param data: dataframe
        :param column_name: column name to bin
        :param number_of_bins: maximum number of bins to create
        :param minimum_size: minimum count required to keep a value as its own bin
        :param other_bin_name: name of the bin that collects the remaining values
        :param add_is_null_column: whether null values are counted as their own bin
        :return: binned column
        """

        column = data[column_name].astype(str)
        feats = column.value_counts(dropna=(not add_is_null_column)).to_frame('count').reset_index().sort_values(
            ['count', 'index'], ascending=[False, True])
        feats = feats[:number_of_bins]
        feats = feats.loc[feats['count'] >= minimum_size, 'index']
        column[~column.isin(feats)] = other_bin_name

        return column

    def create_percentile_bins(self, df: pd.DataFrame, num_cols: list, add_null: bool = False,
                               num_bins: int = 4) -> pd.DataFrame:
        """
        Method to bin numerical features by their percentile.

        Numerical Variables
            * Bins data by percentile.
            * Encodes the new variables with the variable name and GTE (greater than or equal) / LTE syntax.
            * Adds a dummy *_nan* variable recording the fact that the feature was null.

        Categorical Variables
            * Returns a warning. Doesn't bin the feature.

        :param df: input pandas dataframe
        :param num_cols: a list of the names of the numerical columns in df to bin.
        :param add_null: whether to add the *_nan* features to the data (default False)
        :param num_bins: the number of bins to break the data into; the percent width of each bin is 100/num_bins.
                         (default 4 is a quartile break down; keep this low to avoid blow up.)

        :returns: Transformed pandas dataframe
        """
        dummy_frames = []

        num_cols = [col for col in num_cols if col in df.columns]

        for col in num_cols:
            # Make sure the column is numerical before binning.
            if df[col].dtype == np.object:
                self.__logger.warning("Warning: Feature {0} is not numerical and wasn't binned.".format(col))
                continue

            # Get percentiles.
            dt = df[col]
            dt = ((dt.rank() - 1) * num_bins / len(dt.dropna())).apply(np.floor)

            dt_agg = df.groupby(dt)[col].agg([np.min, np.max]).rename(columns={'amin': 'min', 'amax': 'max'})

            for bin_num in dt.unique():
                if np.isnan(bin_num):
                    continue
                if dt_agg.loc[bin_num]['min'] == dt_agg.loc[bin_num]['max']:
                    dt.replace(bin_num, 'is_{}'.format(dt_agg.loc[bin_num]['min']), inplace=True)
                else:
                    dt.replace(bin_num, 'GTE_{}_LTE_{}'.format(dt_agg.loc[bin_num]['min'], dt_agg.loc[bin_num]['max']),
                               inplace=True)

            add_is_null = add_null and df[col].isnull().any()
            dummy_frames.append(pd.get_dummies(dt, prefix=col, dummy_na=add_is_null))

            df.drop(col, axis=1, inplace=True)

        df = df.join(dummy_frames, sort=True)
        return df

    @staticmethod
    def get_feature_dummies(df: pd.DataFrame, feature_cols, target_col, add_null=True, p_thresh=.01,
                            min_data_points=500, max_categories=5, apply_feature_target_metric_dependence_test=True):
        """
        Method to transform a dataframe, with respect to a target variable, for use with classification models.

        Numerical Variables
            * Null values are left in the dataframe.
            * Adds a dummy *is_null* variable recording the fact that the feature was null.

        Categorical Variables
            * One-hot encodes categorical variables.
            * To avoid blow up:
                max_categories: Maximum number of categories a feature can have.
                                The rest are collected into a category 'other'.
            * To maintain informativeness:
                min_data_points: The minimum number of points required to create a bin.
                p_thresh: A chi-squared test is run against target_col.
                          The variable is kept if the resulting p_value < p_thresh; otherwise it is dropped.

        :param apply_feature_target_metric_dependence_test: whether to run the chi-squared dependence test.
        :param add_null: whether to add *is_null* columns for numerical features containing nulls.
        :param target_col: name of the target metric column.
        :param df: input pandas dataframe
        :param feature_cols: A list of the feature names in the df to be transformed.
        :param max_categories: Maximum number of categories.
        :param min_data_points: Minimum number of points in a categorical bin.
        :param p_thresh: Critical value for the chi-squared test.

        :returns: Transformed pandas dataframe, and a list of columns that were treated as numerical.

        Future Work: An algorithm that selects the candidate categorical binning based on information gain as
                     opposed to data size.
        """

        is_null_frame = {}
        numerical_columns = []
        dummy_frames = []

        # Drop columns with only a single value (including null).
        for col in feature_cols:
            if df[col].nunique(dropna=False) == 1:
                df.drop(col, axis=1, inplace=True)
                continue

            # For numerical columns {int, float} create is_null.
            if not (df[col].dtype == np.object or df[col].dtype == np.bool):
                if df[col].isnull().any() and add_null:
                    is_null_frame[get_is_null_column_name(col)] = np.isnan(df[col])

                if df[col].nunique(dropna=True) == 1:
                    df.drop(col, axis=1, inplace=True)
                else:
                    numerical_columns.append(col)
            else:
                # For categorical columns create feature dummies.
                dt_col = FeatureBinGenerator.create_top_bins(df, col, max_categories, min_data_points,
                                                             Constants.other_feature_cluster_name,
                                                             add_null)

                if apply_feature_target_metric_dependence_test:
                    chi2, p_val, dof, expected = sp.chi2_contingency(pd.crosstab(dt_col, df[target_col]))
                    if p_val > p_thresh:
                        df.drop(col, axis=1, inplace=True)
                        continue
                # TODO: Refactor the code to create dummies only when we need them, to optimize performance.
                dummy_frames.append(pd.get_dummies(dt_col, prefix=col))
                df.drop(col, axis=1, inplace=True)

        if add_null:
            null_frame = pd.DataFrame(is_null_frame)
            df = df.join(null_frame, sort=True)

        df = df.join(dummy_frames, sort=True)

        return df, numerical_columns
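
A small, self-contained sketch of create_percentile_bins on a toy frame. The column name and values are made up for illustration; only the calling convention and the shape of the output are the point.

# Toy data; 'latency_ms' is a hypothetical column name.
import numpy as np
import pandas as pd
from mct.FeatureBinGenerator import FeatureBinGenerator

df = pd.DataFrame({'latency_ms': [10, 12, 15, 20, 22, 30, 45, np.nan]})
binned = FeatureBinGenerator().create_percentile_bins(df.copy(), ['latency_ms'], add_null=True, num_bins=2)
print(binned.columns.tolist())
# Expect one-hot columns of the form 'latency_ms_GTE_..._LTE_...' plus 'latency_ms_nan' for the null row.
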
@@ -0,0 +1,258 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import logging

import numpy as np
import pandas as pd

import mct.Constants as Constants
from mct.FeatureBinGenerator import FeatureBinGenerator
from mct.HypothesisTester import ci_proportion_bounds


class FeatureRanker(object):
    """
    Feature ranking for metric delta.
    """

    def __init__(self, config):
        self.__config = config
        self.__bin_generator = FeatureBinGenerator()
        self.__logger = logging.getLogger("mct")
        return

    def compute_ranks(self, df_metric: pd.DataFrame, df_metric_not_normalized: pd.DataFrame,
                      numerical_cols: list) -> pd.DataFrame:
        """
        Feature ranking for metric delta.
        """
        config = self.__config
        target_col = config[Constants.metric_column]
        add_null = config[Constants.add_is_null_column]
        sorting_type = config[Constants.sort_type]
        # Compute the feature ranking.
        df_feature_ranking = self.__decompose_metric_univar(df_metric, df_metric_not_normalized, target_col,
                                                            numerical_cols, add_null)

        if sorting_type == Constants.sorting_type_delta_count:
            sorted_feature = df_feature_ranking.sort_values(Constants.count_delta, ascending=False, inplace=False)
        elif sorting_type == Constants.sorting_type_delta_percent:
            sorted_feature = df_feature_ranking.sort_values(Constants.percent_delta, ascending=False, inplace=False)
        else:
            sorted_feature = df_feature_ranking.sort_values(Constants.hazard_score, ascending=False, inplace=False)

        sorted_feature.reset_index(inplace=True, drop=True)

        return sorted_feature

    def __decompose_metric_univar(self, df_metric, df_metric_not_normalized, target_col, numerical_cols, add_null):
        """
        Computes the univariate feature ranking.
        """
        df_uni_var_un_norm, df_uni_var_norm, categorical_cols = self.__set_univar_frames(df_metric,
                                                                                         df_metric_not_normalized,
                                                                                         numerical_cols, target_col,
                                                                                         add_null)

        expected = []
        actual = []
        contribution = []
        is_sig = []
        feature_results = pd.DataFrame(categorical_cols, columns=[Constants.feature])

        # Categorical feature ranking.
        for col in categorical_cols:
            sig, exp, act, con = self.__feat_cat_significant(col,
                                                             df_uni_var_norm,
                                                             df_uni_var_un_norm,
                                                             target_col,
                                                             Constants.group_column_name)
            is_sig.append(sig)
            contribution.append(con)
            expected.append(exp)
            actual.append(act)

        feature_results[Constants.is_stat_sig] = is_sig
        feature_results[Constants.count_delta] = contribution
        feature_results[Constants.expected_failures] = expected
        feature_results[Constants.actual_failures] = actual

        # Numerical feature ranking.
        for col in numerical_cols:
            sig, binned_feats, is_bin_sig, exp, act, con = self.__feat_num_significant(col, df_uni_var_norm,
                                                                                       df_uni_var_un_norm,
                                                                                       target_col,
                                                                                       Constants.group_column_name,
                                                                                       add_null)
            if sig:
                feature_results = feature_results.append(
                    pd.DataFrame(list(zip(*[binned_feats, is_bin_sig, exp, act, con])),
                                 columns=[Constants.feature,
                                          Constants.is_stat_sig,
                                          Constants.expected_failures,
                                          Constants.actual_failures,
                                          Constants.count_delta]),
                    sort=True)

        # Set up columns for the output feature ranking.
        feature_results[Constants.percent_delta] = np.abs(
            feature_results[Constants.count_delta] / feature_results[Constants.expected_failures] * 100)

        # Expected failures in treatment: the control failure rate scaled to the treatment population size.
        df_control_not_norm = df_metric_not_normalized.loc[
            df_metric_not_normalized[Constants.group_column_name] == Constants.control_group]
        df_treatment_not_norm = df_metric_not_normalized.loc[
            df_metric_not_normalized[Constants.group_column_name] == Constants.treatment_group]
        n_fail_exp = df_control_not_norm[target_col].sum() / len(df_control_not_norm) * len(df_treatment_not_norm)
        n_fail_act = df_treatment_not_norm[target_col].sum()
        feature_results[Constants.hazard_score] = (feature_results[Constants.actual_failures] / n_fail_act -
                                                   feature_results[Constants.expected_failures] / n_fail_exp) * 100

        feature_results.reset_index(inplace=True, drop=True)

        output_columns = [Constants.feature, Constants.hazard_score, Constants.expected_failures,
                          Constants.actual_failures, Constants.count_delta, Constants.percent_delta]

        stat_sig_features = feature_results[feature_results[Constants.is_stat_sig] == True][output_columns]
        return stat_sig_features

    def __set_univar_frames(self, df_metric, df_metric_not_normalized, numerical_cols, target_col, add_null):
        """
        Prepares data_frames for univariate feature ranking: one before and one after bias normalization.
        Sets up categorical and numerical features.
        """

        config = self.__config
        invar_target_cols = config[Constants.invariant_columns] + [target_col]

        df_invar_target = df_metric_not_normalized[invar_target_cols]
        frame_invar_target, num_cols = self.__bin_generator.get_feature_dummies(
            df_invar_target.copy(),
            config[Constants.invariant_columns],
            target_col,
            min_data_points=1,
            max_categories=10,
            p_thresh=0.25,
            add_null=add_null,
            apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])

        # Normalized and non-normalized data_frames for feature ranking.
        df_uni_var_un_norm = df_metric_not_normalized[
            config[Constants.feature_columns] + [Constants.group_column_name]].merge(frame_invar_target,
                                                                                     left_index=True,
                                                                                     right_index=True)
        df_uni_var_norm = df_metric[config[Constants.feature_columns] + [Constants.group_column_name]].merge(
            frame_invar_target, left_index=True, right_index=True)

        feature_cols = list(df_uni_var_un_norm.columns)
        feature_cols.remove(Constants.group_column_name)
        feature_cols.remove(target_col)
        categorical_cols = [col for col in feature_cols if col not in numerical_cols]

        return df_uni_var_un_norm, df_uni_var_norm, categorical_cols

    def __feat_cat_significant(self, col, df_uni_var_norm, df_uni_var_un_norm, target_col, group_col):
        """
        Determines if a categorical col is significant on the normalized dataset. If it is, it computes the impact
        on the non-normalized data set and returns the contribution.
        """
        num_c, len_c, num_t, len_t = self.__feat_info_cat(col, df_uni_var_norm, target_col, group_col)

        sig = self.__sig_check(num_c, len_c, num_t, len_t)
        if not sig:
            return False, 0, 0, 0

        # If the number of drops is significant, return the number of impacted calls on the original data set.
        num_c, len_c, num_t, len_t = self.__feat_info_cat(col, df_uni_var_un_norm, target_col, group_col)

        return True, num_c * len_t / len_c, num_t, num_t - num_c * len_t / len_c

    def __feat_info_cat(self, col, df_uni_var, target_col, group_col):
        """
        Sets up the appropriate dataframe and returns the number of failures associated with the given feature
        on the treatment and control datasets.
        """

        return self.__feat_info(df_uni_var[[target_col, group_col, col]], col, target_col, group_col)

    @staticmethod
    def __feat_info(df_col, col, target_col, group_col):
        """
        Computes the number of failures associated with the given feature on the treatment and control datasets.
        """
        df = df_col.copy()

        df['targ_and_col'] = df[target_col] & df[col]
        col_info = df.groupby(group_col)['targ_and_col'].agg({'size', 'sum'})

        num_c = col_info.loc[Constants.control_group, 'sum']
        len_c = col_info.loc[Constants.control_group, 'size']
        num_t = col_info.loc[Constants.treatment_group, 'sum']
        len_t = col_info.loc[Constants.treatment_group, 'size']

        return num_c, len_c, num_t, len_t

    @staticmethod
    def __sig_check(num_c, len_c, num_t, len_t):
        """
        Checks if the change in failures is significant between the treatment and control datasets.
        """
        low_c, up_c = ci_proportion_bounds(num_c / len_c, len_c)
        low_t, up_t = ci_proportion_bounds(num_t / len_t, len_t)

        if (low_c <= low_t) and (up_c >= low_t):
            return False
        if (low_t <= low_c) and (up_t >= low_c):
            return False

        return True

    def __feat_num_significant(self, col, df_uni_var_norm, df_uni_var_un_norm, target_col, group_col, add_null):
        """
        Determines if the binning for a numerical col is significant on the normalized dataset.
        If it is, it computes the impact on the non-normalized data set by creating a new binning,
        and returns significance and contributions.
        """
        df_target_col = df_uni_var_norm[[target_col, group_col, col]]
        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
                                                                 add_null=add_null)
        binned_feats = [feat for feat in df_bin_col.columns if col in feat]

        sig = False

        for feat in binned_feats:
            num_c, len_c, num_t, len_t = self.__feat_info(df_bin_col[[target_col, group_col, feat]],
                                                          feat,
                                                          target_col,
                                                          group_col)
            sig = self.__sig_check(num_c, len_c, num_t, len_t)
            if sig:
                break

        # If none of the binned features are significant, return False and zero impact.
        if not sig:
            return False, 0, 0, 0, 0, 0

        # Contribution on the non-normalized data set.
        df_target_col = df_uni_var_un_norm[[target_col, group_col, col]]
        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
                                                                 add_null=add_null)
        binned_feats = [feat for feat in df_bin_col.columns if col in feat]

        expected = []
        actual = []
        contribution = []
        is_sig = []

        for feat in binned_feats:
            num_c, len_c, num_t, len_t = self.__feat_info(df_bin_col[[target_col, group_col, feat]], feat, target_col,
                                                          group_col)
            contribution.append(num_t - num_c * len_t / len_c)
            actual.append(num_t)
            expected.append(num_c * len_t / len_c)
            is_sig.append(self.__sig_check(num_c, len_c, num_t, len_t))

        return True, binned_feats, is_sig, expected, actual, contribution
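
The hazard score computed above can be read as the percentage-point difference between a feature's share of the actual treatment failures and its share of the failures expected from control rates. A short worked example with made-up counts:

# Illustrative numbers only: a bin holding 30 of the treatment's 100 actual failures,
# which would have contributed 20 of the 95 failures expected from control rates.
expected, actual = 20.0, 30.0
n_fail_exp, n_fail_act = 95.0, 100.0
hazard_score = (actual / n_fail_act - expected / n_fail_exp) * 100
print(round(hazard_score, 1))  # 8.9 -> this bin is over-represented among treatment failures
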
@@ -0,0 +1,230 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
import scipy.stats as sp
import statsmodels.stats.api as sms

import mct.Constants as Constants


# t-test
def get_t_test_result(a, b):
    # Run a t-test on control and treatment.
    (mean_diff, control_mean, stat, p_value) = __two_sample_t_test(a, b)
    (lower, upper) = __t_test_conf_interval(a, b)
    mean_rel = mean_diff / control_mean
    return mean_diff, mean_rel, lower, upper, stat, p_value


def __two_sample_t_test(a, b):
    # Run a t-test on control and treatment.
    (stat, p_value) = sp.ttest_ind(a, b, equal_var=False)
    control_mean = np.mean(a)
    mean_diff = control_mean - np.mean(b)
    return mean_diff, control_mean, stat, p_value


def __t_test_conf_interval(a, b):
    cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
    return cm.tconfint_diff(usevar='pooled')


# Bernoulli test - Test of Proportions
def chi_squared_results(a, b):
    # Run a Chi-Squared test on control and treatment.
    mean_control = np.mean(a)
    mean_treatment = np.mean(b)
    mean_diff = mean_treatment - mean_control

    df = pd.concat([
        pd.DataFrame(data={'metric': a, 'label': Constants.control_group}),
        pd.DataFrame(data={'metric': b, 'label': Constants.treatment_group})
    ])

    cont_table = pd.crosstab(df['label'], df['metric'])
    chi2, p_val, dof, expected = sp.chi2_contingency(cont_table)

    return mean_diff, mean_control, mean_treatment, chi2, p_val


def ci_proportion_bounds(p, n):
    err = __ci_proportion(p, n)
    return (p - err), (p + err)


def __se_proportion(p, n):
    return np.sqrt((p * (1 - p)) / n)


def __ci_proportion(p, n):
    return 1.96 * __se_proportion(p, n)


"""
Created on Fri Apr 8 09:48:53 2016

Python utility to check whether data is biased between two groups.

NOTE: This only performs a bias check for categorical values.
It does not perform a bias check for numeric variables.

See examples in https://onlinecourses.science.psu.edu/stat414/node/311

@author: jayagup
"""


def chi_square_bias_test(control, treatment, groups, group_column_name, other_threshold, p_value):
    """
    Compute the chi square test of homogeneity between two groups.

    See details in https://onlinecourses.science.psu.edu/stat414/node/311

    :input control: The control data frame.
    :input treatment: The treatment data frame.
    :input groups: The names of the groups.
    :input group_column_name: The name of the group column.
    :input other_threshold: The small-bin percentage threshold below which bins are combined into 'other'.
    :input p_value: The p-value with which to evaluate the univariate table.

    :returns is_biased: True if biased, else False.
    :returns table_biased: A table of results with differences.
    """

    contingency_table_c = __get_contingency_table(control)
    contingency_table_t = __get_contingency_table(treatment)

    # Add group information to the dataframes.
    contingency_table_c[group_column_name] = groups[0]
    contingency_table_t[group_column_name] = groups[1]

    # Create a single contingency table including both groups.
    contingency_table = contingency_table_c.append(contingency_table_t)

    # Run the bias check.
    return chi_square_bias_test_contingency(contingency_table, other_threshold, p_value)


def chi_square_bias_test_contingency(df_cont_table, other_threshold, p_value):
    """
    Chi square test of homogeneity over all features.
    """
    # Perform the bias check for all features, one feature at a time.
    bias_results = []
    deviation = pd.DataFrame()
    for feature in df_cont_table[Constants.feature].unique():
        chi2, p_val, dof, perc_dev, feature_deviation = bias_check_covariate(
            df_cont_table, feature, other_threshold)
        feature_deviation[Constants.feature] = feature
        deviation = deviation.append(feature_deviation)

        bias_results.append({
            Constants.feature: feature,
            "chi_square": chi2,
            "p_value": p_val,
            Constants.degree_of_freedom: dof,
            Constants.percentage_deviation: perc_dev})

    df_bias_results = pd.DataFrame(bias_results)

    # Test whether any feature fails the p-value criterion.
    p_value_check = (np.sum(df_bias_results.p_value < p_value) > 0)

    return df_bias_results, deviation, p_value_check


def bias_check_covariate(df_cont_table, feature, other_threshold=1.0):
    """
    Chi square test of homogeneity for a single feature.

    :input df_cont_table: Counts for the feature.
    :input feature: The name of the feature.
    :return: outcome of the chi square bias check.
    """
    # Filter the feature.
    df_cont_feature = df_cont_table[df_cont_table.feature == feature]

    # Pivot the counts to create R x C format.
    df_cont_pivot = pd.pivot_table(
        df_cont_feature,
        values='count',
        columns=Constants.group_column_name,
        index=Constants.bin_column)

    df_cont_pivot, grps = __combine_small_bins(df_cont_pivot, other_threshold)

    # Feed the contingency table to the chi square test.
    chi2, p_val, dof, expected = sp.chi2_contingency(df_cont_pivot)

    # Compute the probability deviation from expected.
    diff_percent = np.abs(expected - df_cont_pivot) / sum(expected) * 100.0

    # Compute the percentage for each bin.
    grp_percent = list(map(lambda x: x + "_percent", grps))  # control_percent, treatment_percent
    diff_percent[grp_percent] = df_cont_pivot[grps] / df_cont_pivot[grps].sum() * 100.0
    diff_percent.reset_index(inplace=True)

    # Sum of differences of expected probability and observed probability.
    # Note that the sum should be the same for the two columns.
    perc_dev = np.max(diff_percent[grps.tolist()].max())

    return chi2, p_val, dof, perc_dev, diff_percent


def __combine_small_bins(df_cont_pivot, other_threshold):
    """
    Combine bins that are too small in both the control and treatment group into
    Constants.other_feature_cluster_name.

    :input df_cont_pivot: contingency pivot table.
    """
    # If there are bins which are too small in both groups,
    # then set them to the 'other' group.
    grps = df_cont_pivot.columns
    df_cont_pivot.reset_index(inplace=True)
    df_cont_pivot.fillna(0, inplace=True)
    other_grp_1 = df_cont_pivot[grps[0]] / df_cont_pivot[grps[0]].sum() * 100.0 < other_threshold
    other_grp_2 = df_cont_pivot[grps[1]] / df_cont_pivot[grps[1]].sum() * 100.0 < other_threshold
    other_grp = other_grp_1 & other_grp_2
    df_cont_pivot.loc[other_grp, Constants.bin_column] = Constants.other_feature_cluster_name

    # Combine all the others by grouping by bin again.
    df_cont_pivot = df_cont_pivot.groupby(Constants.bin_column).sum()
    df_cont_pivot.fillna(0, inplace=True)

    return df_cont_pivot, grps


def __get_contingency_table(data: pd.DataFrame) -> pd.DataFrame:
    """
    Create a contingency table for the dataframe.

    :input data: The dataframe.
    :returns: The contingency table data frame with [Constants.bin_column, 'count', Constants.feature] columns.
    """
    contingency_data_frame = pd.DataFrame()

    for c in data.columns:
        contingency_data_frame = contingency_data_frame.append(_get_feature_values_distribution(data[c]))

    return contingency_data_frame


def _get_feature_values_distribution(feature):
    """
    Get the count for each feature value.
    """

    # TODO: this would perform poorly when we have a numerical feature.
    distribution = pd.DataFrame(feature
                                .groupby(feature.values)
                                .agg('count')
                                .reset_index()
                                .rename(columns={"index": Constants.bin_column, feature.name: 'count'})
                                )

    distribution[Constants.feature] = feature.name

    return distribution
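
The proportion confidence interval above is the normal approximation with z = 1.96, i.e. p +/- 1.96 * sqrt(p * (1 - p) / n), so the bounds correspond to a 95% interval. A quick sketch of calling ci_proportion_bounds and chi_squared_results on toy binary outcomes; the sizes and rates are made up for illustration.

# Toy binary outcome vectors; failure rates and sample sizes are illustrative.
import numpy as np
from mct.HypothesisTester import chi_squared_results, ci_proportion_bounds

rng = np.random.RandomState(0)
control = rng.binomial(1, 0.10, size=1000)    # ~10% failure rate
treatment = rng.binomial(1, 0.13, size=1000)  # ~13% failure rate

low, high = ci_proportion_bounds(control.mean(), len(control))
mean_diff, mean_c, mean_t, chi2, p_val = chi_squared_results(control, treatment)
print((low, high), mean_diff, p_val)
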
@@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging
import os

from pandas import DataFrame

import mct.Constants as Constants
from mct.BiasTester import BiasTester
from mct.FeatureRanker import FeatureRanker
from mct.MetricComparer import MetricComparer
from mct.PreProcessor import PreProcessor
from mct.Visualizer import Visualizer


class MCT(object):
    def __init__(self, config: json):
        self.__config = config
        log_file = os.path.join(config[Constants.results_dir], Constants.log_file_name)
        logger = logging.getLogger("mct")
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(logging.Formatter(Constants.log_format))
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

    def process(self, control: DataFrame, treatment: DataFrame, random_state=None):
        preprocessor = PreProcessor(self.__config)
        visualizer = Visualizer(self.__config)

        df_metric, numerical_cols = preprocessor.pre_process_data(control, treatment)
        df_metric_not_norm = df_metric.copy()

        # Compare Control vs Treatment.
        delta_comparer = MetricComparer(self.__config)
        control = df_metric[df_metric[Constants.group_column_name] == Constants.control_group]
        treatment = df_metric[df_metric[Constants.group_column_name] == Constants.treatment_group]
        metric_delta = delta_comparer.compare(control, treatment)

        # Bias checker.
        bias_tester = BiasTester(self.__config)
        visualizer.create_metric_delta_report(metric_delta, "initial_metric_comparison.html")

        bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
        visualizer.create_bias_result_report(bias_results, deviation)

        if is_biased and (self.__config[Constants.normalization_type] != Constants.skip):
            n_control, n_treatment = bias_tester.normalize_bias(control, treatment, bias_results, random_state)

            df_metric = n_control.append(n_treatment)

            b_metric_delta = delta_comparer.compare(n_control, n_treatment)
            visualizer.create_metric_delta_report(b_metric_delta, "normalized_metric_comparison.html")
            n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(n_control, n_treatment)

        feature_ranker = FeatureRanker(self.__config)
        feature_rank = feature_ranker.compute_ranks(df_metric, df_metric_not_norm, numerical_cols)
        visualizer.create_feature_rank_report(feature_rank)
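
An end-to-end sketch of driving the MCT class. This is a sketch under assumptions, not a tested recipe: the import path mct.MCT follows the module naming used by the imports above, the dataframes, column names, and config values are hypothetical, and results_dir must exist and be writable before MCT is constructed.

# Hypothetical end-to-end run; all column names, rates, and config values are illustrative.
import os
import numpy as np
import pandas as pd
import mct.Constants as Constants
from mct.MCT import MCT  # assumed module path, mirroring the other mct.* imports

rng = np.random.RandomState(42)
n = 2000

def make_frame(fail_rate):
    return pd.DataFrame({
        'is_failure': rng.binomial(1, fail_rate, n),
        'country': rng.choice(['US', 'CA', 'UK'], n),
        'network_type': rng.choice(['wifi', 'wired'], n),
    })

config = {
    Constants.metric_column: 'is_failure',
    Constants.invariant_columns: ['country'],
    Constants.feature_columns: ['network_type'],
    Constants.resample_threshold: 5.0,
    Constants.results_dir: './results',          # must exist and be writable
    Constants.p_value_threshold: 0.05,
    Constants.decomposition_type: 'univariate',
    Constants.sort_type: 'hazard_score',
    Constants.normalization_type: 'rf',
    Constants.add_is_null_column: True,
    Constants.small_bin_percent_threshold: 1.0,
    # Disabled here so that the random toy features are not dropped by the dependence test.
    'apply_feature_target_metric_dependence_test': False,
}

os.makedirs(config[Constants.results_dir], exist_ok=True)
MCT(config).process(make_frame(0.10), make_frame(0.13), random_state=1)
# Reports land in ./results: initial_metric_comparison.html, bias_results.html, feature_ranking.csv, ...
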
@@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import logging

import pandas as pd

import mct.Constants as Constants
from mct.HypothesisTester import chi_squared_results


class MetricComparer(object):
    """
    Class to compare a metric on two datasets.
    """

    def __init__(self, config):
        self.config = config
        self.__logger = logging.getLogger("mct")
        return

    def compare(self, control: pd.DataFrame, treatment: pd.DataFrame) -> pd.DataFrame:
        """
        :param control: control dataframe
        :param treatment: treatment dataframe
        :return: dataframe [Constants.mean_difference,
                            Constants.mean_control,
                            Constants.mean_treatment,
                            Constants.p_value,
                            Constants.is_stat_sig]
        """
        control_metric = control[self.config[Constants.metric_column]]
        treatment_metric = treatment[self.config[Constants.metric_column]]

        mean_diff, mean_control, mean_treatment, chi2, p_val = chi_squared_results(control_metric, treatment_metric)

        metric_delta = pd.DataFrame(
            [{
                Constants.mean_difference: mean_diff,
                Constants.mean_control: mean_control,
                Constants.mean_treatment: mean_treatment,
                Constants.p_value: p_val,
                Constants.is_stat_sig: (p_val < self.config[Constants.p_value_threshold])
            }])

        return metric_delta
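
In isolation, MetricComparer.compare only needs the metric column name and the p-value threshold from the config. A minimal call with made-up data:

# Minimal illustration; 'is_failure', the counts, and the 0.05 threshold are made up.
import pandas as pd
import mct.Constants as Constants
from mct.MetricComparer import MetricComparer

config = {Constants.metric_column: 'is_failure', Constants.p_value_threshold: 0.05}
control = pd.DataFrame({'is_failure': [0] * 90 + [1] * 10})
treatment = pd.DataFrame({'is_failure': [0] * 80 + [1] * 20})
print(MetricComparer(config).compare(control, treatment))
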
@@ -0,0 +1,216 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging
import os

import numpy as np
import pandas as pd

import mct.Constants as Constants
import mct.Utilities as Utils
from mct.FeatureBinGenerator import FeatureBinGenerator


class PreProcessor(object):

    def __init__(self, config: json):
        self.__config = config
        self.__logger = logging.getLogger("mct")
        return

    def pre_process_data(self, control_df: pd.DataFrame, treatment_df: pd.DataFrame) -> \
            (pd.DataFrame, list):
        """
        # Validates the data against the config input.
        # Adds is_null columns based on the config.
        # One-hot encodes the categorical features.
        # Drops non-informative features:
        #   - Categorical: based on a Chi^2 test
        #   - Numerical: when the feature has only a single non-null value
        """

        self.__validate_config()
        self.__validate_column_types(control_df, treatment_df)

        # Partition columns into target, invariant and variant features.
        df = Utils.merge_control_treatment(control_df, treatment_df)
        df_feature_target, df_invariant_columns, df_metric_columns, feature_columns = self.__partition_columns(df)
        self.__validate_data_set(df)

        # Encode categorical features - remove ones with zero variation, or with no impact on the metric.
        # Keep track of numerical columns (possibly containing NULL values).
        df_feature_target_binned, num_cols = FeatureBinGenerator.get_feature_dummies(
            df_feature_target,
            feature_columns,
            self.__config[Constants.metric_column],
            add_null=self.__config[Constants.add_is_null_column],
            p_thresh=0.25,
            min_data_points=1,
            max_categories=10,
            apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])

        # Drop the target metric column and merge the partitions back together.
        df_metric = self.__merge_columns(df_feature_target_binned, df_invariant_columns, df_metric_columns)

        return df_metric, num_cols

    def __merge_columns(self, df_feature_target_binned, df_invariant_columns, df_metric_columns):
        metric_column = self.__config[Constants.metric_column]
        df_feature_target_binned.drop(metric_column, axis=1, inplace=True)
        feature_columns = list(df_feature_target_binned.columns)
        if not feature_columns:
            raise Exception("There is no feature left that meets the threshold criteria")
        self.__config[Constants.feature_columns] = feature_columns
        # Join the feature, invariant and target data_frames.
        df_metric = df_feature_target_binned.merge(df_invariant_columns, copy=False, left_index=True, right_index=True)
        df_metric = df_metric.merge(df_metric_columns, left_index=True, right_index=True)
        df_metric.reset_index(drop=True, inplace=True)
        return df_metric

    def __partition_columns(self, df):
        # Set the metric columns: contains the metric column and the Constants.group_column_name column.
        metric_column = self.__config[Constants.metric_column]
        df_metric_columns = df[[Constants.group_column_name, metric_column]]
        # Set invariant columns.
        invariant_columns = self.__get_available_features(df, self.__config[Constants.invariant_columns])
        df[invariant_columns] = df[invariant_columns].astype('object')
        df[invariant_columns] = df[invariant_columns].fillna('NULL')
        df_invariant = df[invariant_columns]
        # Set feature columns.
        feature_columns = self.__get_available_features(df, self.__config[Constants.feature_columns])
        df_feature_columns = df[feature_columns]
        # Merge features and the metric column.
        df_feature_target = df_feature_columns.merge(
            pd.DataFrame(df_metric_columns[metric_column]),
            left_index=True,
            right_index=True)
        return df_feature_target, df_invariant, df_metric_columns, feature_columns

    def __validate_config(self):
        config = self.__config
        successful = True

        # Check access to the output folder.
        output_folder = config[Constants.results_dir]
        if not os.access(output_folder, os.W_OK):
            successful = False
            self.__logger.error('There is no write access to the output folder: {0}'.format(output_folder))

        # Make sure all config parameters exist.
        missing_keys = [key for key in Constants.required_config_keys if key not in config.keys()]
        if missing_keys:
            missing = ','.join(str(x) for x in missing_keys)
            successful = False
            self.__logger.error('The following config parameters are missing: {0}'.format(missing))

        # Make sure there is no intersection of metric_col, invariant_columns and feature_columns,
        # and deduplicate if they have common features. Priority is given in the following order:
        # 1) metric_col
        # 2) invariant_columns
        # 3) feature_columns
        config[Constants.feature_columns] = list(set(config[Constants.feature_columns]))
        config[Constants.invariant_columns] = list(set(config[Constants.invariant_columns]))

        if config[Constants.metric_column] in config[Constants.invariant_columns]:
            config[Constants.invariant_columns].remove(config[Constants.metric_column])
            self.__logger.warning(
                'Metric column {0} cannot be part of invariant columns.'.format(config[Constants.metric_column]))

        if config[Constants.metric_column] in config[Constants.feature_columns]:
            config[Constants.feature_columns].remove(config[Constants.metric_column])
            self.__logger.warning(
                'Metric column {0} cannot be part of feature columns.'.format(config[Constants.metric_column]))

        intersection = set(config[Constants.feature_columns]).intersection(config[Constants.invariant_columns])
        if len(intersection) > 0:
            config[Constants.feature_columns] = [feat for feat in config[Constants.feature_columns] if
                                                 feat not in intersection]
            common = ','.join(str(x) for x in intersection)
            self.__logger.warning(
                'Features {0} are set as invariant and cannot be part of the feature columns.'.format(common))

        if not successful:
            raise Exception('The config-file validation has failed!')
        return

    def __validate_data_set(self, data: pd.DataFrame):
        # Check:
        # 1) There are no duplicate columns.
        # 2) No reserved prefix/suffix is used in a column name.
        # 3) No reserved value is used; e.g. 'other'.

        successful = True

        feature_columns_set = set(data.columns)

        if len(feature_columns_set) != len(data.columns):
            successful = False
            self.__logger.error('Dataset has duplicate features.')

        if self.__config[Constants.add_is_null_column]:
            for column in feature_columns_set:
                is_null_name = Utils.get_is_null_column_name(column)
                if is_null_name in feature_columns_set:
                    successful = False
                    self.__logger.error('{0} suffix is reserved for a computed is_null column for feature {1} '.format(
                        Utils.get_is_null_column_name(''), column))

        for feature in self.__config[Constants.feature_columns]:
            # Check the column values (not the index) for the reserved 'other' label.
            if Constants.other_feature_cluster_name in data[feature].values:
                successful = False
                self.__logger.error('Value {0} is a reserved name and it appears as a value in feature {1}'.format(
                    Constants.other_feature_cluster_name, feature))

        if not successful:
            raise Exception('The data-set validation has failed!')
        return

    def __validate_column_types(self, control: pd.DataFrame, treatment: pd.DataFrame):
        # 1) Validate that reserved column names are not used in the control/treatment data.
        # 2) Validate that the target metric is either 0/1 or True/False.
        # 3) There is no column of type DateTime or TimeDelta.

        successful = True
        reserved_column_names: set = {Constants.group_column_name}
        feature_columns_list = set(control.columns).union(set(treatment.columns))
        reserved_in_use: set = reserved_column_names.intersection(set(feature_columns_list))

        if len(reserved_in_use) > 0:
            successful = False
            self.__logger.error('Dataset has features called {0}. These are reserved keywords.'.format(
                ','.join(list(reserved_in_use))))

        metric_values_c = control[self.__config[Constants.metric_column]].unique().astype(int)
        metric_values_t = treatment[self.__config[Constants.metric_column]].unique().astype(int)
        if (len([value for value in metric_values_c if value not in [1, 0]]) > 0) or (
                len([value for value in metric_values_t if value not in [1, 0]]) > 0):
            successful = False
            self.__logger.error('We currently only support a binary target metric.')

        features = set(self.__config[Constants.feature_columns]).union(set(self.__config[Constants.invariant_columns]))
        for feature in features:
            if (control[feature].dtype == np.datetime64) or (treatment[feature].dtype == np.datetime64) or \
                    (control[feature].dtype == np.timedelta64) or (treatment[feature].dtype == np.timedelta64):
                if feature in self.__config[Constants.feature_columns]:
                    self.__config[Constants.feature_columns].remove(feature)
                if feature in self.__config[Constants.invariant_columns]:
                    self.__config[Constants.invariant_columns].remove(feature)
                message = 'Date/Time features are not supported. We have removed column {0} from the analysis'.format(
                    feature)
                self.__logger.warning(message)

        if not successful:
            raise Exception('The column-type validation has failed!')
        return

    def __get_available_features(self, df: pd.DataFrame, feature_set: list) -> list:
        df_cols = set(df.columns)
        feature_cols = set()
        for feature in feature_set:
            if feature in df_cols:
                feature_cols.add(feature)
            else:
                self.__logger.warning('Feature {0} is missing in the data-set.'.format(feature))

        return list(feature_cols)
@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd

import mct.Constants as Constants


def get_is_null_column_name(col):
    return col + "_is_null"


def add_group_columns(control: pd.DataFrame, treatment: pd.DataFrame,
                      group_column_name: str = Constants.group_column_name, force: bool = False):
    if force or (group_column_name not in control.columns):
        control[group_column_name] = Constants.control_group
    if force or (group_column_name not in treatment.columns):
        treatment[group_column_name] = Constants.treatment_group
    return


def merge_control_treatment(control: pd.DataFrame,
                            treatment: pd.DataFrame,
                            group_column_name: str = Constants.group_column_name) -> pd.DataFrame:
    add_group_columns(control, treatment, group_column_name=group_column_name, force=True)
    df = control.append(treatment)
    df.reset_index(drop=True, inplace=True)
    return df
@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import logging
import os

import numpy as np
import pandas as pd

import mct.Constants as Constants


class Visualizer(object):
    """
    Class to create a visualization/report of the result of the comparison.
    """

    def __init__(self, config: json):
        self.config = config
        self.__logger = logging.getLogger("mct")
        return

    def create_metric_delta_report(self, metric_delta: pd.DataFrame, result_file: str):
        # Sort and round metric_delta.
        metric_delta.sort_values(
            by=[Constants.mean_difference, Constants.mean_control, Constants.mean_treatment],
            inplace=True)
        metric_delta[Constants.mean_difference] = np.round(metric_delta[Constants.mean_difference] * 100, 2)
        metric_delta[Constants.mean_control] = np.round(metric_delta[Constants.mean_control] * 100, 2)
        metric_delta[Constants.mean_treatment] = np.round(metric_delta[Constants.mean_treatment] * 100, 2)
        metric_delta[Constants.p_value] = np.round(metric_delta[Constants.p_value], 4)
        # Output metric_delta as HTML.
        result_file = os.path.join(self.config[Constants.results_dir], result_file)
        metric_delta[
            [Constants.mean_control, Constants.mean_treatment, Constants.mean_difference, Constants.p_value,
             Constants.is_stat_sig]].to_html(
            result_file, index=False, justify='center', index_names=False)

    def create_bias_result_report(self, bias_results: pd.DataFrame, deviation: pd.DataFrame):
        results_dir = self.config[Constants.results_dir]

        # Sort and round bias results.
        bias_results[Constants.p_value_threshold] = np.round(bias_results[Constants.p_value_threshold], 4)
        bias_results[Constants.percentage_deviation] = np.round(bias_results[Constants.percentage_deviation], 2)
        bias_results.sort_values(by=[Constants.percentage_deviation, Constants.feature], ascending=False, inplace=True)

        bias_file = os.path.join(results_dir, "bias_results.html")
        bias_result_columns = [Constants.feature, Constants.num_of_bins, Constants.p_value_threshold,
                               Constants.percentage_deviation,
                               Constants.resample]
        bias_results[bias_result_columns].to_html(bias_file, index=False, justify='center', index_names=False)

        # Sort and round deviations.
        deviation.sort_values(by=[Constants.control_group, Constants.feature, Constants.bin_column], ascending=False,
                              inplace=True)
        deviation_file = os.path.join(results_dir, "bias_deviations.html")
        deviation_result_columns = [Constants.feature, Constants.bin_column, Constants.control_percent,
                                    Constants.treatment_percent]
        deviation[Constants.control_percent] = np.round(deviation[Constants.control_percent], 2)
        deviation[Constants.treatment_percent] = np.round(deviation[Constants.treatment_percent], 2)
        deviation[deviation_result_columns].to_html(deviation_file, index=False, justify='center', index_names=False)

    def create_feature_rank_report(self, ranked_feature: pd.DataFrame):
        feature_ranking_file_csv = os.path.join(self.config[Constants.results_dir], "feature_ranking.csv")
        sorted_feature = ranked_feature.sort_values(
            by=[Constants.hazard_score, Constants.percent_delta, Constants.count_delta, Constants.feature,
                Constants.expected_failures],
            ascending=False, inplace=False)

        sorted_feature.reset_index(inplace=True, drop=True)
        sorted_feature.to_csv(feature_ranking_file_csv)
@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
@@ -0,0 +1,9 @@
numpy==1.16.3
scipy==1.2.1
scikit-learn==0.21.1
pandas==0.25.3
IPython==7.5.0
matplotlib==3.0.3
statsmodels==0.9.0
jinja2==2.10.1
patsy==0.5.1
@@ -0,0 +1,2 @@
[wheel]
universal = 1
@@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from setuptools import find_packages
from setuptools import setup

with open('README.rst') as readme_file:
    readme = readme_file.read()

with open('HISTORY.rst') as history_file:
    history = history_file.read().replace('.. :changelog:', '')

requirements = [
    # TODO: put package requirements here
]

test_requirements = [
    # TODO: put package test requirements here
]

setup(
    name='mct',
    version='1.0.0',
    description="Tools to compare metrics between datasets, accounting for population differences "
                "and invariant features.",
    long_description=readme + '\n\n' + history,
    author="Jamie Pool, Ashkan Aazami, Ebrahim Beyrami, Jay Gupchup, Martin Ellis",
    author_email='',
    url='https://github.com/microsoft/MS-MCT',
    packages=find_packages(),
    package_dir={'mct': 'mct'},
    include_package_data=True,
    install_requires=requirements,
    zip_safe=False,
    keywords=['mct'],
    classifiers=[
        'Natural Language :: English',
        'Programming Language :: Python :: 3.7',
    ],
    test_suite='tests',
    tests_require=test_requirements
)