Merge pull request #2 from microsoft/ashkana/update

moving caliper_coefficient to config
Ashkan Aazami 2020-02-13 13:14:43 -08:00 committed by GitHub
Parents 8e3efd5b6f 7f6aed098d
Commit 5771e6bd63
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 61 additions and 15 deletions

View File

@@ -158,12 +158,13 @@ class BiasTester(object):
        return index_c, index_t

-    def __matching_caliper(self, df, caliper_coeff=0.1, random_state=None):
+    def __matching_caliper(self, df, random_state=None):
+        caliper_coeff=self.config[Constants.caliper_coefficient]
        caliper_width = caliper_coeff * df[self.__rf_propensity_scores].std()
        df[self.__rf_propensity_scores] = (df[self.__rf_propensity_scores] / caliper_width).astype(int)
        return self.___matching_1_1(df, random_state=random_state)

-    def __sample_propensity(self, splits, feats, caliper_coeff=0.1, match_type='caliper', random_state=None):
+    def __sample_propensity(self, splits, feats, match_type='caliper', random_state=None):
        # concatenates the split dataframes, keeping the labels
        df = pd.concat([i for _, i in splits], keys=[splits[0][0], splits[1][0]],
@@ -181,7 +182,7 @@ class BiasTester(object):
        # Perform 1-1 matching based on the propensity scores.
        if match_type == 'caliper':
-            ind_c, ind_t = self.__matching_caliper(df, caliper_coeff=caliper_coeff, random_state=random_state)
+            ind_c, ind_t = self.__matching_caliper(df, random_state=random_state)
        else:
            ind_c, ind_t = self.___matching_1_1(df, random_state=random_state)

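For readers skimming the diff: the caliper step scales the spread of the propensity scores by the (now config-driven) coefficient to get a caliper width, then buckets the scores by that width before 1-1 matching. A minimal standalone sketch of that idea, assuming plain pandas and a hypothetical bucket_by_caliper helper (not the repository's BiasTester):

import pandas as pd

def bucket_by_caliper(scores: pd.Series, caliper_coeff: float = 0.1) -> pd.Series:
    # Caliper width is a fraction of the score spread; scores landing in the
    # same integer bucket become candidates for 1-1 matching.
    caliper_width = caliper_coeff * scores.std()
    return (scores / caliper_width).astype(int)

propensity = pd.Series([0.12, 0.14, 0.35, 0.36, 0.80])
print(bucket_by_caliper(propensity, caliper_coeff=0.1))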
View File

@@ -41,8 +41,12 @@ results_dir = "results_dir"
add_is_null_column = 'add_null'
resample_threshold = 'resample_threshold'
decomposition_type = 'decomposition_type'
+num_bins_numerical = 'num_bins_numerical'
+num_bins_categorical = 'num_bins_categorical'
+caliper_coefficient = 'caliper_coefficient'
required_config_keys = [metric_column, invariant_columns, feature_columns, resample_threshold, results_dir,
                        p_value_threshold, decomposition_type, normalization_type, sort_type, add_is_null_column,
+                        num_bins_categorical, num_bins_numerical, caliper_coefficient,
                        'apply_feature_target_metric_dependence_test']
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

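Since caliper_coefficient (together with the two bin counts) is now part of required_config_keys, a config that omits them should fail validation. A sketch of the relevant slice of such a config dict, with key names taken from this file and values mirroring the defaults the diff removes elsewhere (0.1 and 10); the remaining required keys are omitted here:

partial_config = {
    'caliper_coefficient': 0.1,    # was the default argument caliper_coeff=0.1
    'num_bins_numerical': 10,      # was num_bins=10 at the binning call sites
    'num_bins_categorical': 10,    # was max_categories=10 at the encoding call sites
    # ... plus the other entries listed in required_config_keys above
}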
View File

@@ -66,6 +66,7 @@ class FeatureBinGenerator(object):
        :returns: Transformed pandas dataframe
        """
        dummy_frames = []
+        num_cols = [col for col in num_cols if col in df.columns]

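The single added line filters the requested numeric columns down to the ones actually present in the dataframe, so a stale column name no longer breaks binning. A tiny illustration with made-up column names:

import pandas as pd

df = pd.DataFrame({'latency_ms': [1.0, 2.0], 'country': ['US', 'CA']})
num_cols = ['latency_ms', 'cpu_pct']                        # 'cpu_pct' is not in df
num_cols = [col for col in num_cols if col in df.columns]   # -> ['latency_ms']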
View File

@@ -134,7 +134,7 @@ class FeatureRanker(object):
            config[Constants.invariant_columns],
            target_col,
            min_data_points=1,
-            max_categories=10,
+            max_categories=config[Constants.num_bins_categorical],
            p_thresh=0.25,
            add_null=add_null,
            apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])
@@ -217,7 +217,8 @@ class FeatureRanker(object):
        and returning significance and contributions.
        """
        df_target_col = df_uni_var_norm[[target_col, group_col, col]]
-        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
+        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col],
+                                                                  num_bins=self.__config[Constants.num_bins_numerical],
                                                                  add_null=add_null)
        binned_feats = [feat for feat in df_bin_col.columns if col in feat]
@@ -238,7 +239,8 @@ class FeatureRanker(object):
        # contribution on the non-normalized data set
        df_target_col = df_uni_var_un_norm[[target_col, group_col, col]]
-        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
+        df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col],
+                                                                  num_bins=self.__config[Constants.num_bins_numerical],
                                                                  add_null=add_null)
        binned_feats = [feat for feat in df_bin_col.columns if col in feat]

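Both call sites now take the number of percentile bins from config instead of a hard-coded 10. create_percentile_bins itself belongs to this repository; as a rough, assumption-laden approximation of what percentile binning does, pandas.qcut over ranks gives a similar result:

import pandas as pd

def approx_percentile_bins(series: pd.Series, num_bins: int) -> pd.Series:
    # Rank first so ties do not collapse quantile edges; labels=False returns bin codes.
    return pd.qcut(series.rank(method='first'), q=num_bins, labels=False, duplicates='drop')

config = {'num_bins_numerical': 4}
values = pd.Series([3, 1, 4, 1, 5, 9, 2, 6, 5, 3])
print(approx_percentile_bins(values, num_bins=config['num_bins_numerical']))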
View File

@@ -47,7 +47,6 @@ class MCT(object):
        if is_biased and (self.__config[Constants.normalization_type] != Constants.skip):
            n_control, n_treatment = bias_tester.normalize_bias(control, treatment, bias_results, random_state)
-            df_metric = n_control.append(n_treatment)
            b_metric_delta = delta_comparer.compare(n_control, n_treatment)

View File

@@ -36,7 +36,7 @@ class PreProcessor(object):
        # Partition columns into target, invariant and variant features
        df = Utils.merge_control_treatment(control_df, treatment_df)
-        df_feature_target, df_invariant_columns, df_metric_columns, feature_columns = self.__partition_columns(df)
+        df_feature_target, df_invariant, df_metric_group, feature_columns = self.__partition_columns(df)
        self.__validate_data_set(df)

        # Encode Categorical features - remove ones with 0 variation, or with no impact to the metric.
@@ -48,11 +48,11 @@ class PreProcessor(object):
            add_null=self.__config[Constants.add_is_null_column],
            p_thresh=0.25,
            min_data_points=1,
-            max_categories=10,
+            max_categories=self.__config[Constants.num_bins_categorical],
            apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])

        # Drop target metric column
-        df_metric = self.__merge_columns(df_feature_target_binned, df_invariant_columns, df_metric_columns)
+        df_metric = self.__merge_columns(df_feature_target_binned, df_invariant, df_metric_group)

        return df_metric, num_cols
@@ -72,7 +72,7 @@ class PreProcessor(object):
    def __partition_columns(self, df):
        # Set the metric columns: contains the metric column and Constants.group_column_name column
        metric_column = self.__config[Constants.metric_column]
-        df_metric_columns = df[[Constants.group_column_name, metric_column]]
+        df_metric_group = df[[Constants.group_column_name, metric_column]]

        # Set invariant columns.
        invariant_columns = self.__get_available_features(df, self.__config[Constants.invariant_columns])
        df[invariant_columns] = df[invariant_columns].astype('object')
@@ -83,10 +83,10 @@ class PreProcessor(object):
        df_feature_columns = df[feature_columns]

        # Merge features and metric column.
        df_feature_target = df_feature_columns.merge(
-            pd.DataFrame(df_metric_columns[metric_column]),
+            pd.DataFrame(df_metric_group[metric_column]),
            left_index=True,
            right_index=True)

-        return df_feature_target, df_invariant, df_metric_columns, feature_columns
+        return df_feature_target, df_invariant, df_metric_group, feature_columns

    def __validate_config(self):
        config = self.__config

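The renames make the intent of __partition_columns easier to follow: df_metric_group holds only the group label plus the metric, and the remaining columns are treated as features. A standalone sketch of that partition with hypothetical column names (the real code reads the names from Constants and the config):

import pandas as pd

group_column_name = 'group'     # stand-in for Constants.group_column_name
metric_column = 'metric'        # stand-in for config[Constants.metric_column]

df = pd.DataFrame({'group': ['control', 'treatment'],
                   'metric': [0.42, 0.47],
                   'region': ['US', 'US']})

df_metric_group = df[[group_column_name, metric_column]]
feature_columns = [c for c in df.columns if c not in (group_column_name, metric_column)]
df_feature_columns = df[feature_columns]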
View File

@@ -10,9 +10,38 @@ import pandas as pd
import mct.Constants as Constants

# Class to create a visualization of the result of the comparison
+_index_ = """<!DOCTYPE html>
+<html>
+<body>
+<section>
+<h2>Initial Metric Comparison:</h2>
+<iframe src=".\\initial_metric_comparison.html" frameBorder="0" height="100%" style="width:100%;height:100px"></iframe>
+</section>
+<section>
+<h2>Top Level Bias Check:</h2>
+<iframe src=".\\bias_results.html" frameBorder="0" height="100%" style="width:100%;height:200px"></iframe>
+</section>
+<section>
+<h2>Detailed Bias Check:</h2>
+<iframe src=".\\bias_deviations.html" frameBorder="0" height="100%" style="width:100%;height:400px"></iframe>
+</section>
+<section>
+<h2>Normalized Metric Comparison (adjusting for biases):</h2>
+<iframe src=".\\normalized_metric_comparison.html" frameBorder="0" height="100%" style="width:100%;height:100px"></iframe>
+</section>
+<section>
+<h2>Features Explaining Metric Difference:</h2>
+<iframe src=".\\feature_ranking.html" frameBorder="0" height="100%" style="width:100%;height:300px"></iframe>
+</section>
+<section>
+<h2>Debug:</h2>
+<iframe src="" frameBorder="0" height="100%" style="width:100%;height:20px"></iframe>
+</section>
+</body>
+</html> """

class Visualizer(object):
    """
@@ -22,8 +51,15 @@ class Visualizer(object):
    def __init__(self, config: json):
        self.config = config
        self.__logger = logging.getLogger("mct")
+        self.__create_index_file()
        return

+    def __create_index_file(self):
+        results_dir = self.config[Constants.results_dir]
+        index_html = os.path.join(results_dir, "index.html")
+        with open(index_html, mode='w') as index:
+            index.write(_index_)

    def create_metric_delta_report(self, metric_delta: pd.DataFrame, result_file: str):
        # Output metric_delta as HTML.
        metric_delta.sort_values(
@@ -68,7 +104,9 @@ class Visualizer(object):
        deviation[deviation_result_columns].to_html(deviation_file, index=False, justify='center', index_names=False)

    def create_feature_rank_report(self, ranked_feature: pd.DataFrame):
-        feature_ranking_file_csv = os.path.join(self.config[Constants.results_dir], "feature_ranking.csv")
+        results_dir = self.config[Constants.results_dir]
+        feature_ranking_file_csv = os.path.join(results_dir, "feature_ranking.csv")
+        feature_ranking_file_html = os.path.join(results_dir, "feature_ranking.html")
        sorted_feature = ranked_feature.sort_values(
            by=[Constants.hazard_score, Constants.percent_delta, Constants.count_delta, Constants.feature,
                Constants.expected_failures],
@@ -76,3 +114,4 @@ class Visualizer(object):
        sorted_feature.reset_index(inplace=True, drop=True)
        sorted_feature.to_csv(feature_ranking_file_csv)
+        sorted_feature.to_html(feature_ranking_file_html)
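With this change the Visualizer writes results_dir/index.html at construction time, and the template above stitches the individual report pages together through iframes. A hedged usage sketch; the one-key config and the import path are assumptions for illustration, since a real MCT config must contain every key in required_config_keys:

import os
import mct.Constants as Constants
from mct.Visualizer import Visualizer   # import path assumed from the class name

config = {Constants.results_dir: 'results'}            # illustrative, not a full config
os.makedirs(config[Constants.results_dir], exist_ok=True)

viz = Visualizer(config)   # __init__ now calls __create_index_file()
# results/index.html embeds bias_results.html, feature_ranking.html, etc.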