Ashkan Aazami 2020-01-29 15:05:56 -08:00
Parent d416224d8f
Commit 0f189e729b
14 changed files with 1394 additions and 0 deletions

194
mct/BiasTester.py Normal file

@@ -0,0 +1,194 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import json
import logging
import random
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import mct.Constants as Constants
import mct.Utilities as Utils
from mct.HypothesisTester import chi_square_bias_test
class BiasTester(object):
"""
Perform a bias check between the control and treatment dataframes.
"""
__group_control = 'group_control'
__group_treatment = 'group_treatment'
__index_group = 'index_group'
__rf_propensity_scores = 'rf_propensity_scores'
def __init__(self, config: json):
self.config = config
self.__logger = logging.getLogger("mct")
return
def check_bias(self, control_df: pd.DataFrame, treatment_df: pd.DataFrame) -> \
            (pd.DataFrame, pd.DataFrame, bool):
"""
        Compares the distribution of each invariant feature separately and flags any statistically
        significant difference that exceeds the given minimum percentage deviation threshold.
:param control_df: control dataframe
:param treatment_df: treatment dataframe
        :return: bias results dataframe, per-bin deviation dataframe, and a flag indicating whether the data is biased
"""
self.__logger.debug('Checking for Population Bias')
invariant_features = self.config[Constants.invariant_columns]
p_value_threshold = self.config[Constants.p_value_threshold]
percentage_deviation_threshold = self.config[Constants.resample_threshold]
small_bin_percent_threshold = self.config[Constants.small_bin_percent_threshold]
bias_results, deviation, is_biased = chi_square_bias_test(control_df[invariant_features],
treatment_df[invariant_features],
groups=[Constants.control_group,
Constants.treatment_group],
group_column_name=Constants.group_column_name,
other_threshold=small_bin_percent_threshold,
p_value=0.01)
bias_results[Constants.num_of_bins] = bias_results[Constants.degree_of_freedom] + 1
bias_results[Constants.resample] = 'no'
bias_results.loc[(bias_results[Constants.percentage_deviation] > percentage_deviation_threshold)
& (bias_results[Constants.p_value_threshold] < p_value_threshold),
Constants.resample] = 'yes'
        # Sort bias results by deviation and feature name.
        bias_results.sort_values(by=[Constants.percentage_deviation, Constants.feature], ascending=False, inplace=True)
is_biased = is_biased and (bias_results[Constants.resample] == 'yes').any()
self.__logger.info("Is Data biased: {0}".format(is_biased))
# Sort and round deviations.
deviation.sort_values(
by=[Constants.feature, Constants.bin_column],
ascending=False,
inplace=True)
return bias_results, deviation, is_biased
def normalize_bias(self, control: pd.DataFrame, treatment: pd.DataFrame, bias_results: pd.DataFrame,
random_state=None) -> (pd.DataFrame, pd.DataFrame):
"""
Normalize and correct for the major biases.
bias_results - needs to include columns to normalize, and dof
"""
self.__logger.debug("Bias Normalization: started")
Utils.add_group_columns(control, treatment)
if self.config[Constants.normalization_type] != 'rf':
message = 'Currently only supported normalization type is random forest'
self.__logger.error(message)
raise Exception(message)
if not bias_results.empty:
resample_columns = bias_results[Constants.feature]
max_categories = bias_results[Constants.num_of_bins]
data_splits = [(self.__group_control, control), (self.__group_treatment, treatment)]
feature_transforms = [('categorical', x, y) for x, y in zip(resample_columns, max_categories)]
self.__logger.info('Using RF propensity scores with caliper based matching.')
# Get data after sampling.
df_metric = self.__sample_propensity(data_splits, feature_transforms, random_state=random_state)
df_control = df_metric[df_metric[Constants.group_column_name] == Constants.control_group]
df_treatment = df_metric[df_metric[Constants.group_column_name] == Constants.treatment_group]
return df_control, df_treatment
        else:
            self.__logger.info("Bias Normalization skipped.")
            self.__logger.debug("Bias Normalization finished.")
            return control, treatment
# Transform the input data
def __transform(self, input_frame, features):
train = pd.DataFrame(index=input_frame.index)
for func, feat, max_categories in features:
# Reduce cardinality of input_frame
dt = input_frame[feat].astype(str)
feat_counts = dt.value_counts()
if len(feat_counts) > max_categories:
dt[~dt.isin(feat_counts[:max_categories].index)] = Constants.other_feature_cluster_name
# OneHot encode the features
train = train.join(pd.get_dummies(dt, prefix=feat))
return train
def __rf_propensity(self, data, target, random_state=None):
scalar = StandardScaler()
data_transformed = scalar.fit_transform(data)
clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=random_state, n_estimators=10)
clf.fit(data_transformed, target)
scores = clf.predict_proba(data_transformed)
return scores[:, 1]
def ___matching_1_1(self, df, random_state=None):
df_c = df[df[self.__index_group] == self.__group_control]
df_t = df[df[self.__index_group] == self.__group_treatment]
df_ps = pd.DataFrame(df_c[self.__rf_propensity_scores].value_counts()).join(
pd.DataFrame(df_t[self.__rf_propensity_scores].value_counts()),
on=None,
how='inner',
lsuffix='l',
rsuffix='r',
sort=False)
df_ps['num2use'] = df_ps[['rf_propensity_scoresl', 'rf_propensity_scoresr']].min(axis=1)
index_c = []
index_t = []
random.seed(a=random_state)
for i in df_ps.index:
kk = df_ps.loc[i]['num2use']
index_c += random.sample([ind for ind in df_c[df_c[self.__rf_propensity_scores] == i].index_original], k=kk)
index_t += random.sample([ind for ind in df_t[df_t[self.__rf_propensity_scores] == i].index_original], k=kk)
return index_c, index_t
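    # Caliper matching (descriptive note, matching what the method below does): propensity scores are
    # discretized into buckets whose width is caliper_coeff times the score standard deviation, and
    # control/treatment rows are then 1-1 matched at random within each bucket by ___matching_1_1.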
def __matching_caliper(self, df, caliper_coeff=0.1, random_state=None):
caliper_width = caliper_coeff * df[self.__rf_propensity_scores].std()
df[self.__rf_propensity_scores] = (df[self.__rf_propensity_scores] / caliper_width).astype(int)
return self.___matching_1_1(df, random_state=random_state)
def __sample_propensity(self, splits, feats, caliper_coeff=0.1, match_type='caliper', random_state=None):
# concatenates the split dataframes, keeping the labels
df = pd.concat([i for _, i in splits], keys=[splits[0][0], splits[1][0]],
names=[self.__index_group, 'index_original'])
# Note: resetting index, to prevent potential problems with having the same index values after the concat.
df.reset_index(inplace=True)
# Set up data frame for classification algorithm.
pred_frame = self.__transform(df, feats)
        # Get propensity scores using the RF algorithm.
df[self.__rf_propensity_scores] = self.__rf_propensity(pred_frame, df[self.__index_group],
random_state=random_state)
# Perform 1-1 matching based on the propensity scores.
if match_type == 'caliper':
ind_c, ind_t = self.__matching_caliper(df, caliper_coeff=caliper_coeff, random_state=random_state)
else:
ind_c, ind_t = self.___matching_1_1(df, random_state=random_state)
self.__logger.info("Resampled data size: {}, Percent of retained data: {}:"
.format(len(ind_c) * 2, int(len(ind_c) * 2 / len(df) * 100)))
        self.__logger.info("Percent retained in Control: {}, Percent retained in Treatment: {}"
.format(int(len(ind_c) / len(df[df[self.__index_group] == self.__group_control]) * 100),
int(len(ind_c) / len(df[df[self.__index_group] == self.__group_treatment]) * 100)))
return pd.concat([splits[0][1].filter(ind_c, axis=0), splits[1][1].filter(ind_t, axis=0)])
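
Usage sketch (illustrative, not part of the committed file): assuming a config dict that provides
the keys read above (invariant_columns, p_value, resample_threshold, small_bin_percent_threshold,
normalization_type) and two pandas dataframes containing those invariant columns, the bias check
and normalization could be driven roughly like this:

    import json
    import pandas as pd
    from mct.BiasTester import BiasTester

    with open('config.json') as f:                   # hypothetical config file
        config = json.load(f)
    control_df = pd.read_csv('control.csv')          # hypothetical input data
    treatment_df = pd.read_csv('treatment.csv')

    tester = BiasTester(config)
    bias_results, deviation, is_biased = tester.check_bias(control_df, treatment_df)
    if is_biased:
        normalized_control, normalized_treatment = tester.normalize_bias(
            control_df, treatment_df, bias_results, random_state=42)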

49
mct/Constants.py Normal file

@@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Code Constants
skip = 'skip'
sorting_type_delta_count = 'delta_count'
sorting_type_delta_percent = 'delta_percent'
# Reserved column names
group_column_name = "group"
other_feature_cluster_name = 'other'
control_group = 'control'
treatment_group = 'treatment'
# Computed columns
feature = 'feature'
resample = 'resample'
expected_failures = '# of Expected Failures in Treatment'
actual_failures = '# of Actual Failures in Treatment'
num_of_bins = 'num bins'
degree_of_freedom = 'dof'
percentage_deviation = 'Percentage Deviation'
mean_difference = 'Percent Difference'
mean_control = 'Percent Control'
control_percent = 'control_percent'
treatment_percent = 'treatment_percent'
mean_treatment = 'Percent Treatment'
percent_delta = '% Delta'
count_delta = 'Delta (Count)'
hazard_score = 'Hazard Score'
bin_column = 'bin'
p_value = 'P-Value'
is_stat_sig = 'Is Stat-Sig'
# Config parameters
small_bin_percent_threshold = 'small_bin_percent_threshold'
p_value_threshold = 'p_value'
sort_type = 'sort_type'
normalization_type = 'normalization_type'
metric_column = "metric_col"
invariant_columns = 'invariant_columns'
feature_columns = 'feature_columns'
results_dir = "results_dir"
add_is_null_column = 'add_null'
resample_threshold = 'resample_threshold'
decomposition_type = 'decomposition_type'
required_config_keys = [metric_column, invariant_columns, feature_columns, resample_threshold, results_dir,
p_value_threshold, decomposition_type, normalization_type, sort_type, add_is_null_column,
'apply_feature_target_metric_dependence_test']
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
log_file_name = 'mct.log'
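
For reference, a minimal config covering Constants.required_config_keys might look like the sketch
below; all values are illustrative assumptions, not defaults shipped with the package:

    config = {
        'metric_col': 'failed',                           # hypothetical binary target metric
        'invariant_columns': ['country', 'platform'],     # hypothetical invariant features
        'feature_columns': ['network_type', 'cpu_load'],  # hypothetical features to rank
        'resample_threshold': 5.0,
        'results_dir': './results',
        'p_value': 0.05,
        'decomposition_type': 'univariate',
        'normalization_type': 'rf',
        'sort_type': 'hazard_score',
        'add_null': True,
        'apply_feature_target_metric_dependence_test': True,
    }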

177
mct/FeatureBinGenerator.py Normal file

@@ -0,0 +1,177 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import logging
import numpy as np
import pandas as pd
import scipy.stats as sp
import mct.Constants as Constants
from mct.Utilities import get_is_null_column_name
class FeatureBinGenerator(object):
"""
Class to bin numeric features
"""
def __init__(self):
self.__logger = logging.getLogger("mct")
return
@staticmethod
def create_top_bins(data: pd.DataFrame, column_name: str, number_of_bins: int,
                        minimum_size: int, other_bin_name: str, add_is_null_column: bool) -> pd.Series:
"""
        Create at most number_of_bins bins by selecting the number_of_bins most frequent values
        and combining the rest into the other bin.
        :param data: dataframe
        :param column_name: column name to bin
        :param number_of_bins: maximum number of bins to keep
        :param minimum_size: minimum number of rows required to keep a bin
        :param other_bin_name: name of the catch-all bin for the remaining values
        :param add_is_null_column: whether to count null values as their own bin
        :return: binned column
column = data[column_name].astype(str)
feats = column.value_counts(dropna=(not add_is_null_column)).to_frame('count').reset_index().sort_values(
['count', 'index'], ascending=[False, True])
feats = feats[:number_of_bins]
feats = feats.loc[feats['count'] >= minimum_size, 'index']
column[~column.isin(feats)] = other_bin_name
return column
def create_percentile_bins(self, df: pd.DataFrame, num_cols: list, add_null: bool = False,
num_bins: int = 4) -> pd.DataFrame:
"""
Method to bin numerical features by their percentile.
Numerical Variables
* Bins data by percentile.
        * Encodes the new variables with the variable name and GTE (greater than or equal) to LTE syntax
        * Adds a dummy *_nan* variable recording that the feature was null
        Categorical Variables
        * Logs a warning and does not bin the feature.
:param df: input pandas dataframe
:param num_cols: a list of the names of the numerical columns in df to bin.
:param add_null: whether to add the *_nan* features to data (default False)
        :param num_bins: the number of bins to break the data into; the percentile width of each bin is 100/num_bins
            (default 4 gives a quartile breakdown; keep this low to avoid blow-up).
:returns: Transformed pandas dataframe
"""
dummy_frames = []
num_cols = [col for col in num_cols if col in df.columns]
for col in num_cols:
# make sure numerical column for binning.
if df[col].dtype == np.object:
self.__logger.warning("Warning: Feature {0} is not numerical and wasn't binned.".format(col))
continue
# get percentiles
dt = df[col]
dt = ((dt.rank() - 1) * num_bins / len(dt.dropna())).apply(np.floor)
dt_agg = df.groupby(dt)[col].agg([np.min, np.max]).rename(columns={'amin': 'min', 'amax': 'max'})
for bin_num in dt.unique():
if np.isnan(bin_num):
continue
if dt_agg.loc[bin_num]['min'] == dt_agg.loc[bin_num]['max']:
dt.replace(bin_num, 'is_{}'.format(dt_agg.loc[bin_num]['min']), inplace=True)
else:
dt.replace(bin_num, 'GTE_{}_LTE_{}'.format(dt_agg.loc[bin_num]['min'], dt_agg.loc[bin_num]['max']),
inplace=True)
add_is_null = add_null and df[col].isnull().any()
dummy_frames.append(pd.get_dummies(dt, prefix=col, dummy_na=add_is_null))
df.drop(col, axis=1, inplace=True)
df = df.join(dummy_frames, sort=True)
return df
@staticmethod
def get_feature_dummies(df: pd.DataFrame, feature_cols, target_col, add_null=True, p_thresh=.01,
min_data_points=500, max_categories=5, apply_feature_target_metric_dependence_test=True):
"""
        Method to transform a dataframe with respect to a target variable, to be used with classification models.
Numerical Variables
* Null values are left in dataframe.
        * Adds a dummy *is_null* variable recording that the feature was null
Categorical Variables
* One hot encode categorical variables
* To avoid blow up:
max_categories: Maximum number of categories a feature can have.
The rest are collected into a category 'other'
* To maintain informativeness:
min_data_points: The minimum number of points required to create a bin.
p_thresh: A chi-squared test is run against target_col.
Variable is kept if resulting p_value < p_thresh. Otherwise dropped.
        :param apply_feature_target_metric_dependence_test: whether to run the chi-squared dependence test of each
            categorical feature against target_col
        :param add_null: whether to add *_is_null* indicator columns for numerical features containing nulls
        :param target_col: name of the binary target metric column
:param df: input pandas dataframe
:param feature_cols: A list of the feature names in the df to be transformed.
:param max_categories: Maximum number of categories.
:param min_data_points: Minimum number of point in categorical bin.
:param p_thresh: Critical value for chi-squared test.
:returns: Transformed pandas dataframe, a list of columns that were treated as numerical.
Future Work: Algorithm that selects the candidate categorical binning based on information gain as
opposed to data size.
"""
is_null_frame = {}
numerical_columns = []
dummy_frames = []
# Drop columns with only single value including null
for col in feature_cols:
if df[col].nunique(dropna=False) == 1:
df.drop(col, axis=1, inplace=True)
continue
# For numerical columns {int, float} create is_null.
if not (df[col].dtype == np.object or df[col].dtype == np.bool):
if df[col].isnull().any() and add_null:
is_null_frame[get_is_null_column_name(col)] = np.isnan(df[col])
if df[col].nunique(dropna=True) == 1:
df.drop(col, axis=1, inplace=True)
else:
numerical_columns.append(col)
else:
# For categorical columns create feature dummies.
dt_col = FeatureBinGenerator.create_top_bins(df, col, max_categories, min_data_points,
Constants.other_feature_cluster_name,
add_null)
if apply_feature_target_metric_dependence_test:
chi2, p_val, dof, expected = sp.chi2_contingency(pd.crosstab(dt_col, df[target_col]))
if p_val > p_thresh:
df.drop(col, axis=1, inplace=True)
continue
# TODO: Refactor the code to create dummies when we need them to optimize the performance
dummy_frames.append(pd.get_dummies(dt_col, prefix=col))
df.drop(col, axis=1, inplace=True)
if add_null:
null_frame = pd.DataFrame(is_null_frame)
df = df.join(null_frame, sort=True)
df = df.join(dummy_frames, sort=True)
return df, numerical_columns
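
A small sketch of exercising the binning helpers above on their own (toy data, illustrative only):

    import pandas as pd
    from mct.FeatureBinGenerator import FeatureBinGenerator

    df = pd.DataFrame({
        'latency': [10, 20, 30, 40, None, 60, 70, 80],                           # numeric, has a null
        'network': ['wifi', 'wifi', 'lte', 'lte', 'wifi', 'lte', 'wifi', 'lte'],
        'failed': [0, 0, 0, 1, 0, 1, 1, 1],                                      # binary target metric
    })

    # Percentile-bin the numeric column into labelled dummy columns.
    binner = FeatureBinGenerator()
    binned = binner.create_percentile_bins(df[['latency', 'failed']].copy(), ['latency'],
                                           add_null=True, num_bins=4)

    # One-hot encode features with respect to the target column 'failed'.
    dummies, numeric_cols = FeatureBinGenerator.get_feature_dummies(
        df.copy(), ['latency', 'network'], 'failed',
        add_null=True, min_data_points=1, p_thresh=1.0)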

258
mct/FeatureRanker.py Normal file

@@ -0,0 +1,258 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import logging
import numpy as np
import pandas as pd
import mct.Constants as Constants
from mct.FeatureBinGenerator import FeatureBinGenerator
from mct.HypothesisTester import ci_proportion_bounds
class FeatureRanker(object):
"""
Feature ranking for metric delta.
"""
def __init__(self, config):
self.__config = config
self.__bin_generator = FeatureBinGenerator()
self.__logger = logging.getLogger("mct")
return
def compute_ranks(self, df_metric: pd.DataFrame, df_metric_not_normalized: pd.DataFrame,
numerical_cols: list) -> pd.DataFrame:
"""
Feature ranking for metric delta.
"""
config = self.__config
target_col = config[Constants.metric_column]
add_null = config[Constants.add_is_null_column]
sorting_type = config[Constants.sort_type]
        # Compute feature ranking
df_feature_ranking = self.__decompose_metric_univar(df_metric, df_metric_not_normalized, target_col,
numerical_cols, add_null)
if sorting_type == Constants.sorting_type_delta_count:
sorted_feature = df_feature_ranking.sort_values(Constants.count_delta, ascending=False, inplace=False)
elif sorting_type == Constants.sorting_type_delta_percent:
sorted_feature = df_feature_ranking.sort_values(Constants.percent_delta, ascending=False, inplace=False)
else:
sorted_feature = df_feature_ranking.sort_values(Constants.hazard_score, ascending=False, inplace=False)
sorted_feature.reset_index(inplace=True, drop=True)
return sorted_feature
def __decompose_metric_univar(self, df_metric, df_metric_not_normalized, target_col, numerical_cols, add_null):
"""
Computes the univariate feature ranking.
"""
df_uni_var_un_norm, df_uni_var_norm, categorical_cols = self.__set_univar_frames(df_metric,
df_metric_not_normalized,
numerical_cols, target_col,
add_null)
expected = []
actual = []
contribution = []
is_sig = []
feature_results = pd.DataFrame(categorical_cols, columns=[Constants.feature])
# categorical feature ranking
for col in categorical_cols:
sig, exp, act, con = self.__feat_cat_significant(col,
df_uni_var_norm,
df_uni_var_un_norm,
target_col,
Constants.group_column_name)
is_sig.append(sig)
contribution.append(con)
expected.append(exp)
actual.append(act)
feature_results[Constants.is_stat_sig] = is_sig
feature_results[Constants.count_delta] = contribution
feature_results[Constants.expected_failures] = expected
feature_results[Constants.actual_failures] = actual
# numerical feature ranking
for col in numerical_cols:
sig, binned_feats, is_bin_sig, exp, act, con = self.__feat_num_significant(col, df_uni_var_norm,
df_uni_var_un_norm,
target_col,
Constants.group_column_name,
add_null)
if sig:
feature_results = feature_results.append(
pd.DataFrame(list(zip(*[binned_feats, is_bin_sig, exp, act, con])),
columns=[Constants.feature,
Constants.is_stat_sig,
Constants.expected_failures,
Constants.actual_failures,
Constants.count_delta]),
sort=True)
# Set up columns for output feature ranking.
feature_results[Constants.percent_delta] = np.abs(
feature_results[Constants.count_delta] / feature_results[Constants.expected_failures] * 100)
n_fail_exp = \
df_metric_not_normalized.loc[
df_metric_not_normalized[Constants.group_column_name] == Constants.control_group][
target_col].sum() / len(
df_metric_not_normalized.loc[
df_metric_not_normalized[Constants.group_column_name] == Constants.control_group]) * len(
df_metric_not_normalized.loc[
df_metric_not_normalized[Constants.group_column_name] == Constants.treatment_group])
n_fail_act = df_metric_not_normalized.loc[
df_metric_not_normalized[Constants.group_column_name] == Constants.treatment_group][target_col].sum()
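        # Hazard score (per feature): the feature's share of actual failures in treatment minus its share
        # of expected failures (control failure rate scaled to treatment size), in percentage points.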
feature_results[Constants.hazard_score] = (feature_results[Constants.actual_failures] / n_fail_act -
feature_results[Constants.expected_failures] / n_fail_exp) * 100
feature_results.reset_index(inplace=True, drop=True)
output_columns = [Constants.feature, Constants.hazard_score, Constants.expected_failures,
Constants.actual_failures, Constants.count_delta, Constants.percent_delta]
stat_sig_features = feature_results[feature_results[Constants.is_stat_sig] == True][output_columns]
return stat_sig_features
def __set_univar_frames(self, df_metric, df_metric_not_normalized, numerical_cols, target_col, add_null):
"""
Prepares data_frames for univariate feature ranking. One before and one after bias normalization.
Sets up categorical and numerical features.
"""
config = self.__config
invar_target_cols = config[Constants.invariant_columns] + [target_col]
df_invar_target = df_metric_not_normalized[invar_target_cols]
frame_invar_target, num_cols = self.__bin_generator.get_feature_dummies(
df_invar_target.copy(),
config[Constants.invariant_columns],
target_col,
min_data_points=1,
max_categories=10,
p_thresh=0.25,
add_null=add_null,
apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])
# normalized and non-normalized data_frames for feature ranking
df_uni_var_un_norm = df_metric_not_normalized[
config[Constants.feature_columns] + [Constants.group_column_name]].merge(frame_invar_target,
left_index=True,
right_index=True)
df_uni_var_norm = df_metric[config[Constants.feature_columns] + [Constants.group_column_name]].merge(
frame_invar_target, left_index=True, right_index=True)
feature_cols = list(df_uni_var_un_norm.columns)
feature_cols.remove(Constants.group_column_name)
feature_cols.remove(target_col)
categorical_cols = [col for col in feature_cols if col not in numerical_cols]
return df_uni_var_un_norm, df_uni_var_norm, categorical_cols
def __feat_cat_significant(self, col, df_uni_var_norm, df_uni_var_un_norm, target_col, group_col):
"""
        Determines if the categorical col is significant on the normalized dataset. If it is, it computes the impact
        on the non-normalized data set and returns the contribution.
"""
num_c, len_c, num_t, len_t = self.__feat_info_cat(col, df_uni_var_norm, target_col, group_col)
sig = self.__sig_check(num_c, len_c, num_t, len_t)
if not sig:
return False, 0, 0, 0
        # If the number of failures is significant, return the number of impacted rows on the original data set.
num_c, len_c, num_t, len_t = self.__feat_info_cat(col, df_uni_var_un_norm, target_col, group_col)
return True, num_c * len_t / len_c, num_t, num_t - num_c * len_t / len_c
def __feat_info_cat(self, col, df_uni_var, target_col, group_col):
"""
        Sets up the appropriate dataframe and returns the number of failures associated with the given feature
        on the treatment and control datasets.
"""
return self.__feat_info(df_uni_var[[target_col, group_col, col]], col, target_col, group_col)
@staticmethod
def __feat_info(df_col, col, target_col, group_col):
"""
        Computes the number of failures associated with the given feature on the treatment and control datasets.
"""
df = df_col.copy()
df['targ_and_col'] = df[target_col] & df[col]
col_info = df.groupby(group_col)['targ_and_col'].agg({'size', 'sum'})
num_c = col_info.loc[Constants.control_group, 'sum']
len_c = col_info.loc[Constants.control_group, 'size']
num_t = col_info.loc[Constants.treatment_group, 'sum']
len_t = col_info.loc[Constants.treatment_group, 'size']
return num_c, len_c, num_t, len_t
@staticmethod
def __sig_check(num_c, len_c, num_t, len_t):
"""
checks if the change in failures is significant between treatment and control datasets
"""
low_c, up_c = ci_proportion_bounds(num_c / len_c, len_c)
low_t, up_t = ci_proportion_bounds(num_t / len_t, len_t)
if (low_c <= low_t) and (up_c >= low_t):
return False
if (low_t <= low_c) and (up_t >= low_c):
return False
return True
def __feat_num_significant(self, col, df_uni_var_norm, df_uni_var_un_norm, target_col, group_col, add_null):
"""
Determines if the binning for a numerical col is significant on the normalized dataset.
        If it is, it computes the impact on the non-normalized data set by creating a new binning,
        and returns the significance and contributions.
"""
df_target_col = df_uni_var_norm[[target_col, group_col, col]]
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
add_null=add_null)
binned_feats = [feat for feat in df_bin_col.columns if col in feat]
sig = False
for feat in binned_feats:
num_c, len_c, num_t, len_t = self.__feat_info(df_bin_col[[target_col, group_col, feat]],
feat,
target_col,
group_col)
sig = self.__sig_check(num_c, len_c, num_t, len_t)
if sig:
break
# if none of the binned features are significant return False, 0 impact
if not sig:
return False, 0, 0, 0, 0, 0
# contribution on the non-normalized data set
df_target_col = df_uni_var_un_norm[[target_col, group_col, col]]
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
add_null=add_null)
binned_feats = [feat for feat in df_bin_col.columns if col in feat]
expected = []
actual = []
contribution = []
is_sig = []
for feat in binned_feats:
num_c, len_c, num_t, len_t = self.__feat_info(df_bin_col[[target_col, group_col, feat]], feat, target_col,
group_col)
contribution.append(num_t - num_c * len_t / len_c)
actual.append(num_t)
expected.append(num_c * len_t / len_c)
is_sig.append(self.__sig_check(num_c, len_c, num_t, len_t))
return True, binned_feats, is_sig, expected, actual, contribution

230
mct/HypothesisTester.py Normal file

@@ -0,0 +1,230 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import pandas as pd
import scipy.stats as sp
import statsmodels.stats.api as sms
import mct.Constants as Constants
# t-test
def get_t_test_result(a, b):
# Run t-test on control and treatment.
    (mean_diff, control_mean, stat, p_value) = __two_sample_t_test(a, b)
    (lower, upper) = __t_test_conf_interval(a, b)
    mean_rel = mean_diff / control_mean
return mean_diff, mean_rel, lower, upper, stat, p_value
def __two_sample_t_test(a, b):
# Run t-test on control and treatment.
(stat, p_value) = sp.ttest_ind(a, b, equal_var=False)
control_mean = np.mean(a)
mean_diff = control_mean - np.mean(b)
return mean_diff, control_mean, stat, p_value
def __t_test_conf_interval(a, b):
cm = sms.CompareMeans(sms.DescrStatsW(a), sms.DescrStatsW(b))
return cm.tconfint_diff(usevar='pooled')
# bernoulli test - Test of Proportions
def chi_squared_results(a, b):
# Run Chi-Squared Test on control and treatment.
mean_control = np.mean(a)
mean_treatment = np.mean(b)
mean_diff = mean_treatment - mean_control
df = pd.concat([
pd.DataFrame(data={'metric': a, 'label': Constants.control_group}),
pd.DataFrame(data={'metric': b, 'label': Constants.treatment_group})
])
cont_table = pd.crosstab(df['label'], df['metric'])
chi2, p_val, dof, expected = sp.chi2_contingency(cont_table)
return mean_diff, mean_control, mean_treatment, chi2, p_val
def ci_proportion_bounds(p, n):
err = __ci_proportion(p, n)
return (p - err), (p + err)
def __se_proportion(p, n):
return np.sqrt((p * (1 - p)) / n)
def __ci_proportion(p, n):
return 1.96 * __se_proportion(p, n)
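# The two helpers above use the normal approximation to a binomial proportion:
# standard error = sqrt(p * (1 - p) / n), and the 95% confidence interval is
# p +/- 1.96 * standard error.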
"""
Created on Fri Apr 8 09:48:53 2016
Python utility to check whether data is biased between two groups.
NOTE: This only performs a bias check for categorical values.
It does not perform a bias check for numeric variables.
See examples in https://onlinecourses.science.psu.edu/stat414/node/311
@author: jayagup
"""
def chi_square_bias_test(control, treatment, groups, group_column_name, other_threshold, p_value):
"""
Compute the chi square test of homogeneity between two
groups.
See details in https://onlinecourses.science.psu.edu/stat414/node/311
    :input control: The control data frame.
    :input treatment: The treatment data frame.
    :input groups: The names of the groups.
    :input group_column_name: The name of the column holding the group label.
    :input other_threshold: Bins smaller than this percentage in both groups are merged into the 'other' bin.
    :input p_value: The p-value with which to evaluate the univariate table.
    :returns table_biased: A table of per-feature results with differences.
    :returns deviation: Per-bin deviations between the two groups.
    :returns is_biased: True if biased else False.
"""
contingency_table_c = __get_contingency_table(control)
contingency_table_t = __get_contingency_table(treatment)
# Add group information to the dataframes.
contingency_table_c[group_column_name] = groups[0]
contingency_table_t[group_column_name] = groups[1]
# Create a single contingency table including both groups.
contingency_table = contingency_table_c.append(contingency_table_t)
# Run the bias check.
return chi_square_bias_test_contingency(contingency_table, other_threshold, p_value)
def chi_square_bias_test_contingency(df_cont_table, other_threshold, p_value):
"""
Chi square test of homogeneity over all features.
"""
# Perform the bias check for all features, one feature at a time.
bias_results = []
deviation = pd.DataFrame()
for feature in df_cont_table[Constants.feature].unique():
chi2, p_val, dof, perc_dev, feature_deviation = bias_check_covariate(
df_cont_table, feature, other_threshold)
feature_deviation[Constants.feature] = feature
deviation = deviation.append(feature_deviation)
bias_results.append({
Constants.feature: feature,
"chi_square": chi2,
"p_value": p_val,
Constants.degree_of_freedom: dof,
Constants.percentage_deviation: perc_dev})
df_bias_results = pd.DataFrame(bias_results)
# Test whether each feature meets the p-value criterion.
p_value_check = (np.sum(df_bias_results.p_value < p_value) > 0)
return df_bias_results, deviation, p_value_check
def bias_check_covariate(df_cont_table, feature, other_threshold=1.0):
"""
Chi square test of homogeneity for single feature.
:input df_cont_table: Counts for the feature.
:input feature: The name of the feature.
    :returns: The chi square statistic, p-value, degrees of freedom, percentage deviation, and per-bin deviations.
"""
# Filter the feature.
df_cont_feature = df_cont_table[df_cont_table.feature == feature]
# Pivot the counts to create R X C format.
df_cont_pivot = pd.pivot_table(
df_cont_feature,
values='count',
columns=Constants.group_column_name,
index=Constants.bin_column)
df_cont_pivot, grps = __combine_small_bins(df_cont_pivot, other_threshold)
# Feed the contingency table to chi square test.
chi2, p_val, dof, expected = sp.chi2_contingency(df_cont_pivot)
# Compute the probability deviation from expected.
diff_percent = np.abs(expected - df_cont_pivot) / sum(expected) * 100.0
# Compute percentage for each bin.
grp_percent = list(map(lambda x: x + "_percent", grps)) # control_percent, treatment_percent
diff_percent[grp_percent] = df_cont_pivot[grps] / df_cont_pivot[grps].sum() * 100.0
diff_percent.reset_index(inplace=True)
# Sum of differences of expected probability and observed probability
# Note that the sum should be the same for the two columns.
perc_dev = np.max(diff_percent[grps.tolist()].max())
return chi2, p_val, dof, perc_dev, diff_percent
def __combine_small_bins(df_cont_pivot, other_threshold):
"""
Combine bins that are too small in both control and treatment group into Constants.other_feature_cluster_name
:input df_cont_pivot: contingency pivot table.
"""
# if there are bins which are too small in both groups
# then set them to the other group.
grps = df_cont_pivot.columns
df_cont_pivot.reset_index(inplace=True)
df_cont_pivot.fillna(0, inplace=True)
other_grp_1 = df_cont_pivot[grps[0]] / df_cont_pivot[grps[0]].sum() * 100.0 < other_threshold
other_grp_2 = df_cont_pivot[grps[1]] / df_cont_pivot[grps[1]].sum() * 100.0 < other_threshold
other_grp = other_grp_1 & other_grp_2
df_cont_pivot.loc[other_grp, Constants.bin_column] = Constants.other_feature_cluster_name
# Combine all the others by grouping by bin again
df_cont_pivot = df_cont_pivot.groupby(Constants.bin_column).sum()
df_cont_pivot.fillna(0, inplace=True)
return df_cont_pivot, grps
def __get_contingency_table(data: pd.DataFrame) -> pd.DataFrame:
"""
Create a contingency table for the dataframe.
    :input data: The dataframe.
    :returns: The contingency table data frame with [Constants.bin_column, 'count', Constants.feature] columns.
"""
contingency_data_frame = pd.DataFrame()
for c in data.columns:
contingency_data_frame = contingency_data_frame.append(_get_feature_values_distribution(data[c]))
return contingency_data_frame
def _get_feature_values_distribution(feature):
"""
Get the count for each feature value.
"""
# TODO: this would perform poorly when we have a numerical feature.
distribution = pd.DataFrame(feature
.groupby(feature.values)
.agg('count')
.reset_index()
.rename(columns={"index": Constants.bin_column, feature.name: 'count'})
)
distribution[Constants.feature] = feature.name
return distribution
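
A self-contained sketch of calling the bias test above directly (toy data, illustrative only; in the
package it is normally invoked through BiasTester.check_bias):

    import pandas as pd
    from mct.HypothesisTester import chi_square_bias_test

    control = pd.DataFrame({'country': ['us'] * 60 + ['uk'] * 40})
    treatment = pd.DataFrame({'country': ['us'] * 80 + ['uk'] * 20})
    results, deviation, is_biased = chi_square_bias_test(
        control, treatment,
        groups=['control', 'treatment'],
        group_column_name='group',
        other_threshold=1.0,
        p_value=0.05)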

59
mct/MCT.py Normal file

@@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import json
import logging
import os
from pandas import DataFrame
import mct.Constants as Constants
from mct.BiasTester import BiasTester
from mct.FeatureRanker import FeatureRanker
from mct.MetricComparer import MetricComparer
from mct.PreProcessor import PreProcessor
from mct.Visualizer import Visualizer
class MCT(object):
def __init__(self, config: json):
self.__config = config
log_file = os.path.join(config[Constants.results_dir], Constants.log_file_name)
logger = logging.getLogger("mct")
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(logging.Formatter(Constants.log_format))
file_handler.setLevel(logging.INFO)
logger.addHandler(file_handler)
def process(self, control: DataFrame, treatment: DataFrame, random_state=None):
preprocessor = PreProcessor(self.__config)
visualizer = Visualizer(self.__config)
df_metric, numerical_cols = preprocessor.pre_process_data(control, treatment)
df_metric_not_norm = df_metric.copy()
# Compare Control vs Treatment
delta_comparer = MetricComparer(self.__config)
control = df_metric[df_metric[Constants.group_column_name] == Constants.control_group]
treatment = df_metric[df_metric[Constants.group_column_name] == Constants.treatment_group]
metric_delta = delta_comparer.compare(control, treatment)
# Bias checker
bias_tester = BiasTester(self.__config)
visualizer.create_metric_delta_report(metric_delta, "initial_metric_comparison.html")
bias_results, deviation, is_biased = bias_tester.check_bias(control, treatment)
visualizer.create_bias_result_report(bias_results, deviation)
if is_biased and (self.__config[Constants.normalization_type] != Constants.skip):
n_control, n_treatment = bias_tester.normalize_bias(control, treatment, bias_results, random_state)
df_metric = n_control.append(n_treatment)
b_metric_delta = delta_comparer.compare(n_control, n_treatment)
visualizer.create_metric_delta_report(b_metric_delta, "normalized_metric_comparison.html")
n_bias_results, n_deviation, n_is_biased = bias_tester.check_bias(n_control, n_treatment)
feature_ranker = FeatureRanker(self.__config)
feature_rank = feature_ranker.compute_ranks(df_metric, df_metric_not_norm, numerical_cols)
visualizer.create_feature_rank_report(feature_rank)
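
End-to-end usage sketch (illustrative; the file names and config values are assumptions):

    import json
    import pandas as pd
    from mct.MCT import MCT

    with open('config.json') as f:              # must contain Constants.required_config_keys
        config = json.load(f)
    control = pd.read_csv('control.csv')        # hypothetical control dataset
    treatment = pd.read_csv('treatment.csv')    # hypothetical treatment dataset

    MCT(config).process(control, treatment, random_state=42)
    # Reports such as initial_metric_comparison.html, bias_results.html, bias_deviations.html and
    # feature_ranking.csv are written to config['results_dir'].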

46
mct/MetricComparer.py Normal file

@@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import logging
import pandas as pd
import mct.Constants as Constants
from mct.HypothesisTester import chi_squared_results
class MetricComparer(object):
"""
Class to compare a metric on two datasets
"""
def __init__(self, config):
self.config = config
self.__logger = logging.getLogger("mct")
return
def compare(self, control: pd.DataFrame, treatment: pd.DataFrame) -> pd.DataFrame:
"""
:param control: control dataframe
:param treatment: treatment dataframe
:return: dataframe [Constants.mean_difference,
Constants.mean_control,
Constants.mean_treatment,
Constants.p_value,
Constants.is_stat_sig]
"""
control_metric = control[self.config[Constants.metric_column]]
treatment_metric = treatment[self.config[Constants.metric_column]]
mean_diff, mean_control, mean_treatment, chi2, p_val = chi_squared_results(control_metric, treatment_metric)
metric_delta = pd.DataFrame(
[{
Constants.mean_difference: mean_diff,
Constants.mean_control: mean_control,
Constants.mean_treatment: mean_treatment,
Constants.p_value: p_val,
Constants.is_stat_sig: (p_val < self.config[Constants.p_value_threshold])
}])
return metric_delta
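
Sketch of calling the comparer directly (toy data, illustrative only; in practice MCT.process drives it):

    import pandas as pd
    from mct.MetricComparer import MetricComparer

    config = {'metric_col': 'failed', 'p_value': 0.05}
    control = pd.DataFrame({'failed': [0] * 90 + [1] * 10})
    treatment = pd.DataFrame({'failed': [0] * 80 + [1] * 20})
    metric_delta = MetricComparer(config).compare(control, treatment)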

216
mct/PreProcessor.py Normal file

@@ -0,0 +1,216 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import json
import logging
import os
import numpy as np
import pandas as pd
import mct.Constants as Constants
import mct.Utilities as Utils
from mct.FeatureBinGenerator import FeatureBinGenerator
class PreProcessor(object):
def __init__(self, config: json):
self.__config = config
self.__logger = logging.getLogger("mct")
return
def pre_process_data(self, control_df: pd.DataFrame, treatment_df: pd.DataFrame) -> \
(pd.DataFrame, list):
"""
# Validating data against the config input.
# Adding is_null column based on config
# OneHot-encoding the Categorical features
# Dropping non-informative features:
# - Categorical: based on Chi^2 test
# - Numerical: when the feature has only single non-null value
"""
self.__validate_config()
self.__validate_column_types(control_df, treatment_df)
# Partition columns into target, invariant and variant features
df = Utils.merge_control_treatment(control_df, treatment_df)
df_feature_target, df_invariant_columns, df_metric_columns, feature_columns = self.__partition_columns(df)
self.__validate_data_set(df)
# Encode Categorical features - remove ones with 0 variation, or with no impact to the metric.
# Keep track of numerical columns (possibly containing NULL values)
df_feature_target_binned, num_cols = FeatureBinGenerator.get_feature_dummies(
df_feature_target,
feature_columns,
self.__config[Constants.metric_column],
add_null=self.__config[Constants.add_is_null_column],
p_thresh=0.25,
min_data_points=1,
max_categories=10,
apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])
# Drop target metric column
df_metric = self.__merge_columns(df_feature_target_binned, df_invariant_columns, df_metric_columns)
return df_metric, num_cols
def __merge_columns(self, df_feature_target_binned, df_invariant_columns, df_metric_columns):
metric_column = self.__config[Constants.metric_column]
df_feature_target_binned.drop(metric_column, axis=1, inplace=True)
feature_columns = list(df_feature_target_binned.columns)
if not feature_columns:
            raise Exception("There is no feature left that meets the threshold criteria.")
self.__config[Constants.feature_columns] = feature_columns
# Join the feature, invariant and target data_frames.
df_metric = df_feature_target_binned.merge(df_invariant_columns, copy=False, left_index=True, right_index=True)
df_metric = df_metric.merge(df_metric_columns, left_index=True, right_index=True)
df_metric.reset_index(drop=True, inplace=True)
return df_metric
def __partition_columns(self, df):
# Set the metric columns: contains the metric column and Constants.group_column_name column
metric_column = self.__config[Constants.metric_column]
df_metric_columns = df[[Constants.group_column_name, metric_column]]
# Set invariant columns.
invariant_columns = self.__get_available_features(df, self.__config[Constants.invariant_columns])
df[invariant_columns] = df[invariant_columns].astype('object')
df[invariant_columns] = df[invariant_columns].fillna('NULL')
df_invariant = df[invariant_columns]
# Set feature columns.
feature_columns = self.__get_available_features(df, self.__config[Constants.feature_columns])
df_feature_columns = df[feature_columns]
# Merge features and metric column.
df_feature_target = df_feature_columns.merge(
pd.DataFrame(df_metric_columns[metric_column]),
left_index=True,
right_index=True)
return df_feature_target, df_invariant, df_metric_columns, feature_columns
def __validate_config(self):
config = self.__config
successful = True
# Check access to the output folder
output_folder = config[Constants.results_dir]
if not os.access(output_folder, os.W_OK):
successful = False
self.__logger.error('There is no write access to the output folder: {0}'.format(output_folder))
# Make sure all config parameters exist
missing_keys = [key for key in Constants.required_config_keys if key not in config.keys()]
if missing_keys:
missing = ','.join(str(x) for x in missing_keys)
successful = False
self.__logger.error('Following config parameters are missing: {0}'.format(missing))
        # Make sure there is no intersection of metric_col, invariant_columns and feature_columns,
        # and deduplicate if they have common features, giving priority in the following order:
# 1) metric_col
# 2) invariant_columns
# 3) feature_columns
config[Constants.feature_columns] = list(set(config[Constants.feature_columns]))
config[Constants.invariant_columns] = list(set(config[Constants.invariant_columns]))
if config[Constants.metric_column] in config[Constants.invariant_columns]:
config[Constants.invariant_columns].remove(config[Constants.metric_column])
self.__logger.warning(
'Metric column {0} cannot be part of invariant columns.'.format(config[Constants.metric_column]))
if config[Constants.metric_column] in config[Constants.feature_columns]:
config[Constants.feature_columns].remove(config[Constants.metric_column])
self.__logger.warning(
'Metric column {0} cannot be part of feature columns.'.format(config[Constants.metric_column]))
intersection = set(config[Constants.feature_columns]).intersection(config[Constants.invariant_columns])
if len(intersection) > 0:
config[Constants.feature_columns] = [feat for feat in config[Constants.feature_columns] if
feat not in intersection]
common = ','.join(str(x) for x in intersection)
            self.__logger.warning('Features {0} are set as invariant and cannot be part of the feature columns.'.format(common))
if not successful:
raise Exception('The config-file validation has failed!')
return
def __validate_data_set(self, data: pd.DataFrame):
# Check:
# 1) There are no duplicate columns
        # 2) No reserved prefix/suffix is used in a column name
        # 3) No reserved value is used, e.g. 'other'
successful = True
feature_columns_set = set(data.columns)
if len(feature_columns_set) != len(data.columns):
successful = False
self.__logger.error('Dataset has duplicate features.')
if self.__config[Constants.add_is_null_column]:
for column in feature_columns_set:
is_null_name = Utils.get_is_null_column_name(column)
if is_null_name in feature_columns_set:
successful = False
self.__logger.error('{0} suffix is reserved for a computed is_null column for feature {1} '.format(
Utils.get_is_null_column_name(''), column))
for feature in self.__config[Constants.feature_columns]:
            if (data[feature] == Constants.other_feature_cluster_name).any():
successful = False
self.__logger.error('Value {0} is a reserved name and it appears as a value in feature {1}'.format(
Constants.other_feature_cluster_name, feature))
if not successful:
raise Exception('The data-set validation has failed!')
return
def __validate_column_types(self, control: pd.DataFrame, treatment: pd.DataFrame):
# 1) Validate reserved column names are not used in the control/treatment data
# 2) Validate that the target metric is either 0/1 or True/False.
# 3) There is no column of type DateTime or TimeDelta
successful = True
reserved_column_names: set = {Constants.group_column_name}
feature_columns_list = set(control.columns).union(set(treatment.columns))
reserved_in_use: set = reserved_column_names.intersection(set(feature_columns_list))
if len(reserved_in_use) > 0:
successful = False
self.__logger.error('Dataset has features called {0}. These are reserved keywords.'.format(
','.join(list(reserved_in_use))))
metric_values_c = control[self.__config[Constants.metric_column]].unique().astype(int)
metric_values_t = treatment[self.__config[Constants.metric_column]].unique().astype(int)
if (len([value for value in metric_values_c if value not in [1, 0]]) > 0) or (
len([value for value in metric_values_t if value not in [1, 0]]) > 0):
successful = False
self.__logger.error('We currently only support binary target metric.')
features = set(self.__config[Constants.feature_columns]).union(set(self.__config[Constants.invariant_columns]))
for feature in features:
if (control[feature].dtype == np.datetime64) or (treatment[feature].dtype == np.datetime64) or \
(control[feature].dtype == np.timedelta64) or (treatment[feature].dtype == np.timedelta64):
if feature in self.__config[Constants.feature_columns]:
self.__config[Constants.feature_columns].remove(feature)
if feature in self.__config[Constants.invariant_columns]:
self.__config[Constants.invariant_columns].remove(feature)
                message = 'Date/Time features are not supported. We have removed column {0} from the analysis.'.format(
feature)
self.__logger.warning(message)
if not successful:
raise Exception('The column-type validation has failed!')
return
def __get_available_features(self, df: pd.DataFrame, feature_set: list) -> list:
df_cols = set(df.columns)
feature_cols = set()
for feature in feature_set:
if feature in df_cols:
feature_cols.add(feature)
else:
self.__logger.warning('Feature {0} is missing in the data-set.'.format(feature))
return list(feature_cols)

28
mct/Utilities.py Normal file

@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import pandas as pd
import mct.Constants as Constants
def get_is_null_column_name(col):
return col + "_is_null"
def add_group_columns(control: pd.DataFrame, treatment: pd.DataFrame,
group_column_name: str = Constants.group_column_name, force: bool = False):
if force or (group_column_name not in control.columns):
control[group_column_name] = Constants.control_group
if force or (group_column_name not in treatment.columns):
treatment[group_column_name] = Constants.treatment_group
return
def merge_control_treatment(control: pd.DataFrame,
treatment: pd.DataFrame,
group_column_name: str = Constants.group_column_name) -> pd.DataFrame:
add_group_columns(control, treatment, group_column_name=group_column_name, force=True)
df = control.append(treatment)
df.reset_index(drop=True, inplace=True)
return df

78
mct/Visualizer.py Normal file

@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import json
import logging
import os
import numpy as np
import pandas as pd
import mct.Constants as Constants
# Class to create a visualization of the result of the comparison
class Visualizer(object):
"""
Class to create a visualization/report of the result of the comparison
"""
def __init__(self, config: json):
self.config = config
self.__logger = logging.getLogger("mct")
return
def create_metric_delta_report(self, metric_delta: pd.DataFrame, result_file: str):
        # Sort and round metric_delta for display.
metric_delta.sort_values(
by=[Constants.mean_difference, Constants.mean_control, Constants.mean_treatment],
inplace=True)
metric_delta[Constants.mean_difference] = np.round(metric_delta[Constants.mean_difference] * 100, 2)
metric_delta[Constants.mean_control] = np.round(metric_delta[Constants.mean_control] * 100, 2)
metric_delta[Constants.mean_treatment] = np.round(metric_delta[Constants.mean_treatment] * 100, 2)
metric_delta[Constants.p_value] = np.round(metric_delta[Constants.p_value], 4)
# Output metric_delta as HTML.
result_file = os.path.join(self.config[Constants.results_dir], result_file)
metric_delta[
[Constants.mean_control, Constants.mean_treatment, Constants.mean_difference, Constants.p_value,
Constants.is_stat_sig]].to_html(
result_file, index=False, justify='center', index_names=False)
def create_bias_result_report(self, bias_results: pd.DataFrame, deviation: pd.DataFrame):
results_dir = self.config[Constants.results_dir]
bias_results[Constants.p_value_threshold] = np.round(bias_results[Constants.p_value_threshold], 4)
bias_results[Constants.percentage_deviation] = np.round(bias_results[Constants.percentage_deviation], 2)
        # Sort bias results by deviation and feature name.
        bias_results.sort_values(by=[Constants.percentage_deviation, Constants.feature], ascending=False, inplace=True)
bias_file = os.path.join(results_dir, "bias_results.html")
bias_result_columns = [Constants.feature, Constants.num_of_bins, Constants.p_value_threshold,
Constants.percentage_deviation,
Constants.resample]
bias_results[bias_result_columns].to_html(bias_file, index=False, justify='center', index_names=False)
# Sort and round deviations
deviation.sort_values(by=[Constants.control_group, Constants.feature, Constants.bin_column], ascending=False,
inplace=True)
deviation_file = os.path.join(results_dir, "bias_deviations.html")
deviation_result_columns = [Constants.feature, Constants.bin_column, Constants.control_percent,
Constants.treatment_percent]
deviation[Constants.control_percent] = np.round(deviation[Constants.control_percent], 2)
deviation[Constants.treatment_percent] = np.round(deviation[Constants.treatment_percent], 2)
deviation[deviation_result_columns].to_html(deviation_file, index=False, justify='center', index_names=False)
def create_feature_rank_report(self, ranked_feature: pd.DataFrame):
feature_ranking_file_csv = os.path.join(self.config[Constants.results_dir], "feature_ranking.csv")
sorted_feature = ranked_feature.sort_values(
by=[Constants.hazard_score, Constants.percent_delta, Constants.count_delta, Constants.feature,
Constants.expected_failures],
ascending=False, inplace=False)
sorted_feature.reset_index(inplace=True, drop=True)
sorted_feature.to_csv(feature_ranking_file_csv)

2
mct/__init__.py Normal file

@@ -0,0 +1,2 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

9
requirements.txt Normal file

@@ -0,0 +1,9 @@
numpy==1.16.3
scipy==1.2.1
scikit-learn==0.21.1
pandas==0.25.3
IPython==7.5.0
matplotlib==3.0.3
statsmodels==0.9.0
jinja2==2.10.1
patsy==0.5.1

2
setup.cfg Normal file

@@ -0,0 +1,2 @@
[wheel]
universal = 1

46
setup.py Normal file

@@ -0,0 +1,46 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import find_packages
from setuptools import setup
with open('README.rst') as readme_file:
readme = readme_file.read()
with open('HISTORY.rst') as history_file:
history = history_file.read().replace('.. :changelog:', '')
requirements = [
# TODO: put package requirements here
]
test_requirements = [
# TODO: put package test requirements here
]
setup(
name='mct',
version='1.0.0',
description="Tools to compare metrics between datasets, accounting for population differences "
"and invariant features.",
long_description=readme + '\n\n' + history,
author="Jamie Pool, Ashkan Aazami, Ebrahim Beyrami, Jay Gupchup, Martin Ellis",
author_email='',
url='https://github.com/microsoft/MS-MCT',
packages=find_packages(),
package_dir={'mct': 'mct'},
include_package_data=True,
install_requires=requirements,
zip_safe=False,
keywords=['mct'],
classifiers=[
'Natural Language :: English',
'Programming Language :: Python :: 3.7',
],
test_suite='tests',
tests_require=test_requirements
)