Merge pull request #2 from microsoft/ashkana/update
moving caliper_coefficient to config
This commit is contained in:
Коммит
5771e6bd63
|
@ -158,12 +158,13 @@ class BiasTester(object):
|
|||
|
||||
return index_c, index_t
|
||||
|
||||
def __matching_caliper(self, df, caliper_coeff=0.1, random_state=None):
|
||||
def __matching_caliper(self, df, random_state=None):
|
||||
caliper_coeff=self.config[Constants.caliper_coefficient]
|
||||
caliper_width = caliper_coeff * df[self.__rf_propensity_scores].std()
|
||||
df[self.__rf_propensity_scores] = (df[self.__rf_propensity_scores] / caliper_width).astype(int)
|
||||
return self.___matching_1_1(df, random_state=random_state)
|
||||
|
||||
def __sample_propensity(self, splits, feats, caliper_coeff=0.1, match_type='caliper', random_state=None):
|
||||
def __sample_propensity(self, splits, feats, match_type='caliper', random_state=None):
|
||||
# concatenates the split dataframes, keeping the labels
|
||||
|
||||
df = pd.concat([i for _, i in splits], keys=[splits[0][0], splits[1][0]],
|
||||
|
@ -181,7 +182,7 @@ class BiasTester(object):
|
|||
|
||||
# Perform 1-1 matching based on the propensity scores.
|
||||
if match_type == 'caliper':
|
||||
ind_c, ind_t = self.__matching_caliper(df, caliper_coeff=caliper_coeff, random_state=random_state)
|
||||
ind_c, ind_t = self.__matching_caliper(df, random_state=random_state)
|
||||
else:
|
||||
ind_c, ind_t = self.___matching_1_1(df, random_state=random_state)
|
||||
|
||||
|
|
|
@ -41,8 +41,12 @@ results_dir = "results_dir"
|
|||
add_is_null_column = 'add_null'
|
||||
resample_threshold = 'resample_threshold'
|
||||
decomposition_type = 'decomposition_type'
|
||||
num_bins_numerical = 'num_bins_numerical'
|
||||
num_bins_categorical = 'num_bins_categorical'
|
||||
caliper_coefficient = 'caliper_coefficient'
|
||||
required_config_keys = [metric_column, invariant_columns, feature_columns, resample_threshold, results_dir,
|
||||
p_value_threshold, decomposition_type, normalization_type, sort_type, add_is_null_column,
|
||||
num_bins_categorical, num_bins_numerical, caliper_coefficient,
|
||||
'apply_feature_target_metric_dependence_test']
|
||||
|
||||
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
|
|
|
@ -66,6 +66,7 @@ class FeatureBinGenerator(object):
|
|||
|
||||
:returns: Transformed pandas dataframe
|
||||
"""
|
||||
|
||||
dummy_frames = []
|
||||
|
||||
num_cols = [col for col in num_cols if col in df.columns]
|
||||
|
|
|
@ -134,7 +134,7 @@ class FeatureRanker(object):
|
|||
config[Constants.invariant_columns],
|
||||
target_col,
|
||||
min_data_points=1,
|
||||
max_categories=10,
|
||||
max_categories=config[Constants.num_bins_categorical],
|
||||
p_thresh=0.25,
|
||||
add_null=add_null,
|
||||
apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])
|
||||
|
@ -217,7 +217,8 @@ class FeatureRanker(object):
|
|||
and returning significance and contributions.
|
||||
"""
|
||||
df_target_col = df_uni_var_norm[[target_col, group_col, col]]
|
||||
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
|
||||
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col],
|
||||
num_bins=self.__config[Constants.num_bins_numerical],
|
||||
add_null=add_null)
|
||||
binned_feats = [feat for feat in df_bin_col.columns if col in feat]
|
||||
|
||||
|
@ -238,7 +239,8 @@ class FeatureRanker(object):
|
|||
|
||||
# contribution on the non-normalized data set
|
||||
df_target_col = df_uni_var_un_norm[[target_col, group_col, col]]
|
||||
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col], num_bins=10,
|
||||
df_bin_col = self.__bin_generator.create_percentile_bins(df_target_col.copy(), [col],
|
||||
num_bins=self.__config[Constants.num_bins_numerical],
|
||||
add_null=add_null)
|
||||
binned_feats = [feat for feat in df_bin_col.columns if col in feat]
|
||||
|
||||
|
|
|
@ -47,7 +47,6 @@ class MCT(object):
|
|||
|
||||
if is_biased and (self.__config[Constants.normalization_type] != Constants.skip):
|
||||
n_control, n_treatment = bias_tester.normalize_bias(control, treatment, bias_results, random_state)
|
||||
|
||||
df_metric = n_control.append(n_treatment)
|
||||
|
||||
b_metric_delta = delta_comparer.compare(n_control, n_treatment)
|
||||
|
|
|
@ -36,7 +36,7 @@ class PreProcessor(object):
|
|||
|
||||
# Partition columns into target, invariant and variant features
|
||||
df = Utils.merge_control_treatment(control_df, treatment_df)
|
||||
df_feature_target, df_invariant_columns, df_metric_columns, feature_columns = self.__partition_columns(df)
|
||||
df_feature_target, df_invariant, df_metric_group, feature_columns = self.__partition_columns(df)
|
||||
self.__validate_data_set(df)
|
||||
|
||||
# Encode Categorical features - remove ones with 0 variation, or with no impact to the metric.
|
||||
|
@ -48,11 +48,11 @@ class PreProcessor(object):
|
|||
add_null=self.__config[Constants.add_is_null_column],
|
||||
p_thresh=0.25,
|
||||
min_data_points=1,
|
||||
max_categories=10,
|
||||
max_categories=self.__config[Constants.num_bins_categorical],
|
||||
apply_feature_target_metric_dependence_test=self.__config['apply_feature_target_metric_dependence_test'])
|
||||
|
||||
# Drop target metric column
|
||||
df_metric = self.__merge_columns(df_feature_target_binned, df_invariant_columns, df_metric_columns)
|
||||
df_metric = self.__merge_columns(df_feature_target_binned, df_invariant, df_metric_group)
|
||||
|
||||
return df_metric, num_cols
|
||||
|
||||
|
@ -72,7 +72,7 @@ class PreProcessor(object):
|
|||
def __partition_columns(self, df):
|
||||
# Set the metric columns: contains the metric column and Constants.group_column_name column
|
||||
metric_column = self.__config[Constants.metric_column]
|
||||
df_metric_columns = df[[Constants.group_column_name, metric_column]]
|
||||
df_metric_group = df[[Constants.group_column_name, metric_column]]
|
||||
# Set invariant columns.
|
||||
invariant_columns = self.__get_available_features(df, self.__config[Constants.invariant_columns])
|
||||
df[invariant_columns] = df[invariant_columns].astype('object')
|
||||
|
@ -83,10 +83,10 @@ class PreProcessor(object):
|
|||
df_feature_columns = df[feature_columns]
|
||||
# Merge features and metric column.
|
||||
df_feature_target = df_feature_columns.merge(
|
||||
pd.DataFrame(df_metric_columns[metric_column]),
|
||||
pd.DataFrame(df_metric_group[metric_column]),
|
||||
left_index=True,
|
||||
right_index=True)
|
||||
return df_feature_target, df_invariant, df_metric_columns, feature_columns
|
||||
return df_feature_target, df_invariant, df_metric_group, feature_columns
|
||||
|
||||
def __validate_config(self):
|
||||
config = self.__config
|
||||
|
|
|
@ -10,9 +10,38 @@ import pandas as pd
|
|||
|
||||
import mct.Constants as Constants
|
||||
|
||||
|
||||
# Class to create a visualization of the result of the comparison
|
||||
|
||||
_index_ = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<section>
|
||||
<h2>Initial Metric Comparison:</h2>
|
||||
<iframe src=".\\initial_metric_comparison.html" frameBorder="0" height="100%" style="width:100%;height:100px"></iframe>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Top Level Bias Check:</h2>
|
||||
<iframe src=".\\bias_results.html" frameBorder="0" height="100%" style="width:100%;height:200px"></iframe>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Detailed Bias Check:</h2>
|
||||
<iframe src=".\\bias_deviations.html" frameBorder="0" height="100%" style="width:100%;height:400px"></iframe>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Normalized Metric Comparison (adjusting for biases):</h2>
|
||||
<iframe src=".\\normalized_metric_comparison.html" frameBorder="0" height="100%" style="width:100%;height:100px"></iframe>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Features Explaining Metric Difference:</h2>
|
||||
<iframe src=".\\feature_ranking.html" frameBorder="0" height="100%" style="width:100%;height:300px"></iframe>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Debug:</h2>
|
||||
<iframe src="" frameBorder="0" height="100%" style="width:100%;height:20px"></iframe>
|
||||
</section>
|
||||
</body>
|
||||
</html> """
|
||||
|
||||
|
||||
class Visualizer(object):
|
||||
"""
|
||||
|
@ -22,8 +51,15 @@ class Visualizer(object):
|
|||
def __init__(self, config: json):
|
||||
self.config = config
|
||||
self.__logger = logging.getLogger("mct")
|
||||
self.__create_index_file()
|
||||
return
|
||||
|
||||
def __create_index_file(self):
|
||||
results_dir = self.config[Constants.results_dir]
|
||||
index_html = os.path.join(results_dir, "index.html")
|
||||
with open(index_html,mode='w') as index:
|
||||
index.write(_index_)
|
||||
|
||||
def create_metric_delta_report(self, metric_delta: pd.DataFrame, result_file: str):
|
||||
# Output metric_delta as HTML.
|
||||
metric_delta.sort_values(
|
||||
|
@ -68,7 +104,9 @@ class Visualizer(object):
|
|||
deviation[deviation_result_columns].to_html(deviation_file, index=False, justify='center', index_names=False)
|
||||
|
||||
def create_feature_rank_report(self, ranked_feature: pd.DataFrame):
|
||||
feature_ranking_file_csv = os.path.join(self.config[Constants.results_dir], "feature_ranking.csv")
|
||||
results_dir = self.config[Constants.results_dir]
|
||||
feature_ranking_file_csv = os.path.join(results_dir, "feature_ranking.csv")
|
||||
feature_ranking_file_html = os.path.join(results_dir, "feature_ranking.html")
|
||||
sorted_feature = ranked_feature.sort_values(
|
||||
by=[Constants.hazard_score, Constants.percent_delta, Constants.count_delta, Constants.feature,
|
||||
Constants.expected_failures],
|
||||
|
@ -76,3 +114,4 @@ class Visualizer(object):
|
|||
|
||||
sorted_feature.reset_index(inplace=True, drop=True)
|
||||
sorted_feature.to_csv(feature_ranking_file_csv)
|
||||
sorted_feature.to_html(feature_ranking_file_html)
|
||||
|
|
Загрузка…
Ссылка в новой задаче