Fixed data cleaning/checker tools to work with the recent overhaul

2020-10-27 23:54:08 -07:00 · 2020-10-27 23:54:08 -07:00 · 64bbd5bfad
--- a/checkDataQuality.py
+++ b/checkDataQuality.py
@@ -9,12 +9,10 @@ import matplotlib as mpl

 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME, OUTPUT_NAME
-
+import yaml

 parser = argparse.ArgumentParser()
 parser.add_argument("--thrhld", type=float, default=3,help="choose the threshold to declare outlier (thrhld*sigma)")
-parser.add_argument("--OutputNameProvided", type=bool, default=False,help="write output name in conf_params_var.py")

 def read_env_data():
    try:
@@ -28,48 +26,46 @@ def read_env_data():


 ######################## Functions for Outlier Detection and Ploting  ###################
-def plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = False):
+def plotOutliers(y_set, y_predict_all, outlier_data):
    fig = plt.figure()
    numSubPlots = y_set.shape[1]

    outlierData = outlier_data['y' + str(0)]
-    if OutputNameProvided:
-        dataLabel = OUTPUT_NAME[0]
-    else:
-        dataLabel =  'y'+ str(0)   
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            dataLabel.append(key)
+
    ax1 = plt.subplot(numSubPlots, 1, 0+1)
-    plt.plot(y_set[:,0], label=dataLabel, linewidth=1, color = 'blue' )
-    plt.plot(y_predict_all[0], label=dataLabel, linewidth=1, color = 'black' )
+    plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )
+    plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )
    plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
    plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
    
    for i in range(1,numSubPlots):        
        outlierData = outlier_data['y' + str(i)]
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel =  'y'+ str(i)   
+
        ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(y_set[:,i], label=dataLabel, linewidth=1, color = 'blue' )
-        plt.plot(y_predict_all[i], label=dataLabel, linewidth=1, color = 'black' )
+        plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )
+        plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )
        plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
        plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')


    # plt.show()
                                         
-def findOutliersAll(x_set,y_set, thrhld = 2):
+def findOutliersAll(x_set,y_set, thrhld=2):
    ## Computing Feature importance using gradient boosting
    print('computing Outliers ....')
    outlier_data = {}
    y_predict_all = []
    for i in range (0, y_set.shape[1]):
-        gb_estimator=GradientBoostingRegressor(n_iter_no_change  = 50, validation_fraction = .2)
+        gb_estimator=GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=.2)
        gb_model= gb_estimator.fit(x_set,y_set[:,i])
        y_predict = gb_estimator.predict(x_set)
-        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld = 2)
+        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld=thrhld)
        y_predict_all.append(y_predict)
        print('y', str(i), ': ', outlier_data['y' + str(i)])
    return outlier_data, y_predict_all
@@ -80,29 +76,24 @@ def findOutlier(y, y_predict, thrhld = 2):
    return outL[0]

 ###################### Lot Inputs #########################
-def plotInputs(x_set,y_set, OutputNameProvided = False):
+def plotInputs(x_set,y_set):
    fig = plt.figure()
    numSubPlots = x_set.shape[1] - y_set.shape[1]  ## Num of inputs
-
-    if OutputNameProvided:
-        dataLabel = INPUT_NAME[0]
-    else:
-        dataLabel =  'x'+ str(0)   
+    
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'action':
+            dataLabel.append(key)
    ax1 = plt.subplot(numSubPlots, 1, 1)
-    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel, linewidth=1, color = 'blue' )
+    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel[0], linewidth=1, color = 'blue' )
    plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
    
    for i in range(1,numSubPlots):
-
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel =  'x'+ str(i)   
        ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel, linewidth=1, color = 'blue' )
+        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel[i], linewidth=1, color = 'blue' )
        plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')

    # plt.show()

@@ -119,9 +110,11 @@ def maxMinMeanStd(x, varName = 'x'):
 if __name__=="__main__":
    args=parser.parse_args()

-    x_set, y_set=read_env_data()
-    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)

+    x_set, y_set = read_env_data()
+    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)

    ##################### Outlier Code Usage ###############################################

@@ -135,14 +128,12 @@ if __name__=="__main__":
        x_set[i+1,1] = 1.5

    ## Find Outlier and Plot them
-    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld = args.thrhld)
+    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld=args.thrhld)
    modelname='./models/OutlierData_Y.sav'
    joblib.dump(outlier_data, modelname)
-    plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = args.OutputNameProvided)
-    plotInputs(x_set,y_set, OutputNameProvided = False)
+    plotOutliers(y_set, y_predict_all, outlier_data)
+    plotInputs(x_set,y_set)
    plt.show()
+
    ############################# Detecting NaN ######################
-    hasNaN(x_set)
-
-
-
+    hasNaN(x_set)
--- a/env_data/x_set.pickle
+++ b/env_data/x_set.pickle
--- a/env_data/y_set.pickle
+++ b/env_data/y_set.pickle
--- a/featureImportance.py
+++ b/featureImportance.py
@@ -7,12 +7,10 @@ from env_data_modeler import env_gb_modeler

 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME
-
+import yaml

 parser = argparse.ArgumentParser()
 parser.add_argument("--widthbar", type=float, default=.5,help="choose width of the bars around")
-parser.add_argument("--featureNameProvided", type=bool, default=False,help="write xlabel name in conf_params_var.py")

 def read_env_data():
    try:
@@ -25,7 +23,7 @@ def read_env_data():
    return x_set, y_set


-def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
+def feature_plots(feature_data, total_width=0.5):
    fig, ax = plt.subplots()

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']  
@@ -43,8 +41,7 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
    plt.xlabel('Feature Number', fontsize=18)
    plt.ylabel('Feature Importance', fontsize=18)

-    if featureNameProvided == True:
-        plt.xticks(ticks=range(len(FEATURE_NAME)), labels=FEATURE_NAME)
+    plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())
    plt.show()


@@ -52,9 +49,19 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
 if __name__=="__main__":
    args=parser.parse_args()

-    state_space_dim=int(STATE_SPACE_DIM)
-    action_space_dim=int(ACTION_SPACE_DIM)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)

+    state_space_dim = 0
+    action_space_dim = 0
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            state_space_dim += 1
+        elif value == 'action':
+            action_space_dim += 1
+        else:
+            print('Please fix config_model.yml to specify either state or action')
+            exit()

    x_set, y_set=read_env_data()
    x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.33, random_state=42)
@@ -63,7 +70,7 @@ if __name__=="__main__":
    print('computing Feature Importance ....')
    feature_importance_data = {}
    for i in range (0, y_set.shape[1]):
-        gb_estimator=env_gb_modeler()
+        gb_estimator=env_gb_modeler(state_space_dim, action_space_dim)
        gb_estimator.create_gb_model()
        gb_model= gb_estimator.train_gb_model(x_train,y_train[:,i])
        feature_importance_data['y' + str(i)] = gb_model.feature_importances_
@@ -72,7 +79,7 @@ if __name__=="__main__":
    modelname='./models/feature_importance.sav'
    joblib.dump(feature_importance_data, modelname)
        
-    feature_plots(feature_importance_data, total_width= args.widthbar, featureNameProvided = args.featureNameProvided)
+    feature_plots(feature_importance_data, total_width=args.widthbar)



--- a/models/nnmodel.h5
+++ b/models/nnmodel.h5
--- a/models/nnmodel98.h5
+++ b/models/nnmodel98.h5