Merge pull request #6 from microsoft/journeyman/17878

Journeyman/17878
This commit is contained in:
journeyman-msft 2020-10-27 23:56:27 -07:00 committed by GitHub
Parent e3d334c514 c1107ad8b2
Commit e06fd73262
No key found matching this signature
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 56 additions and 58 deletions

View file

@@ -230,8 +230,8 @@ This helps in removing the Outliers and NaNs in the data. Outlier detection algo
 Once the data is generated (in the same way as the data generation process for data-driven modeling), we can detect outliers in the following way:
 ```bash
->>> python checkDataQuality.py
->>> python checkDataQuality.py --thrhld 4
+python checkDataQuality.py
+python checkDataQuality.py --thrhld 4
 ```
 By default, the outlier threshold is 3; it can be adjusted based on the data.
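For reference, `--thrhld` scales a sigma-based cutoff: a point counts as an outlier when its deviation from a fitted gradient-boosting prediction exceeds `thrhld` standard deviations of the residuals. A minimal sketch of that logic, with illustrative names rather than the script's exact code:

```python
# Illustrative sketch of residual-based outlier flagging: fit a gradient
# boosting model, then flag points whose residual exceeds thrhld * sigma.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

def flag_outliers(x_set, y, thrhld=3):
    """Return indices where |y - y_hat| exceeds thrhld * std of the residuals."""
    model = GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=0.2)
    model.fit(x_set, y)
    residuals = y - model.predict(x_set)
    return np.where(np.abs(residuals) > thrhld * residuals.std())[0]
```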
@@ -245,7 +245,7 @@ This code helps in computing feature importance using gradient boosting trees.
 Once the data is generated (in the same way as the data generation process for data-driven modeling), we can compute feature importance in the following way:
 ```bash
->>> python featureImportance.py
+python featureImportance.py
 ```
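Under the hood, as the diff below shows, the script fits one gradient boosting model per output column and collects its `feature_importances_`. A rough sketch of that idea (the helper name here is hypothetical; the script itself wraps the model in `env_gb_modeler`):

```python
# Rough sketch: fit one gradient boosting regressor per output column and
# collect its feature_importances_ (helper name is hypothetical).
from sklearn.ensemble import GradientBoostingRegressor

def compute_feature_importance(x_set, y_set):
    importance = {}
    for i in range(y_set.shape[1]):
        model = GradientBoostingRegressor().fit(x_set, y_set[:, i])
        importance['y' + str(i)] = model.feature_importances_
    return importance
```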
## Contribute Code

View file

@@ -9,12 +9,10 @@ import matplotlib as mpl
 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME, OUTPUT_NAME
+import yaml
 parser = argparse.ArgumentParser()
 parser.add_argument("--thrhld", type=float, default=3,help="choose the threshold to declare outlier (thrhld*sigma)")
-parser.add_argument("--OutputNameProvided", type=bool, default=False,help="write output name in conf_params_var.py")
 def read_env_data():
     try:
@@ -28,48 +26,46 @@ def read_env_data():
 ######################## Functions for Outlier Detection and Ploting ###################
-def plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = False):
+def plotOutliers(y_set, y_predict_all, outlier_data):
     fig = plt.figure()
     numSubPlots = y_set.shape[1]
     outlierData = outlier_data['y' + str(0)]
-    if OutputNameProvided:
-        dataLabel = OUTPUT_NAME[0]
-    else:
-        dataLabel = 'y'+ str(0)
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            dataLabel.append(key)
     ax1 = plt.subplot(numSubPlots, 1, 0+1)
-    plt.plot(y_set[:,0], label=dataLabel, linewidth=1, color = 'blue' )
-    plt.plot(y_predict_all[0], label=dataLabel, linewidth=1, color = 'black' )
+    plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )
+    plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )
     plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
     plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
     for i in range(1,numSubPlots):
         outlierData = outlier_data['y' + str(i)]
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel = 'y'+ str(i)
         ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(y_set[:,i], label=dataLabel, linewidth=1, color = 'blue' )
-        plt.plot(y_predict_all[i], label=dataLabel, linewidth=1, color = 'black' )
+        plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )
+        plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )
         plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
         plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')
     # plt.show()
-def findOutliersAll(x_set,y_set, thrhld = 2):
+def findOutliersAll(x_set,y_set, thrhld=2):
     ## Computing Feature importance using gradient boosting
     print('computing Outliers ....')
     outlier_data = {}
     y_predict_all = []
     for i in range (0, y_set.shape[1]):
-        gb_estimator=GradientBoostingRegressor(n_iter_no_change = 50, validation_fraction = .2)
+        gb_estimator=GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=.2)
         gb_model= gb_estimator.fit(x_set,y_set[:,i])
         y_predict = gb_estimator.predict(x_set)
-        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld = 2)
+        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld=thrhld)
         y_predict_all.append(y_predict)
         print('y', str(i), ': ', outlier_data['y' + str(i)])
     return outlier_data, y_predict_all
@@ -80,29 +76,24 @@ def findOutlier(y, y_predict, thrhld = 2):
     return outL[0]
 ###################### Lot Inputs #########################
-def plotInputs(x_set,y_set, OutputNameProvided = False):
+def plotInputs(x_set,y_set):
     fig = plt.figure()
     numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs
-    if OutputNameProvided:
-        dataLabel = INPUT_NAME[0]
-    else:
-        dataLabel = 'x'+ str(0)
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'action':
+            dataLabel.append(key)
     ax1 = plt.subplot(numSubPlots, 1, 1)
-    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel, linewidth=1, color = 'blue' )
+    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel[0], linewidth=1, color = 'blue' )
     plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
     for i in range(1,numSubPlots):
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel = 'x'+ str(i)
         ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel, linewidth=1, color = 'blue' )
+        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel[i], linewidth=1, color = 'blue' )
         plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')
     # plt.show()
@@ -119,9 +110,11 @@ def maxMinMeanStd(x, varName = 'x'):
 if __name__=="__main__":
     args=parser.parse_args()
-    x_set, y_set=read_env_data()
-    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)
+    x_set, y_set = read_env_data()
+    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
     ##################### Outlier Code Usage ###############################################
@@ -135,14 +128,12 @@ if __name__=="__main__":
         x_set[i+1,1] = 1.5
     ## Find Outlier and Plot them
-    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld = args.thrhld)
+    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld=args.thrhld)
     modelname='./models/OutlierData_Y.sav'
     joblib.dump(outlier_data, modelname)
-    plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = args.OutputNameProvided)
-    plotInputs(x_set,y_set, OutputNameProvided = False)
+    plotOutliers(y_set, y_predict_all, outlier_data)
+    plotInputs(x_set,y_set)
     plt.show()
     ############################# Detecting NaN ######################
-    hasNaN(x_set)
+    hasNaN(x_set)

Binary data
env_data/x_set.pickle

Binary file not shown.

Binary data
env_data/y_set.pickle

Binary file not shown.

View file

@@ -7,12 +7,10 @@ from env_data_modeler import env_gb_modeler
 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME
+import yaml
 parser = argparse.ArgumentParser()
 parser.add_argument("--widthbar", type=float, default=.5,help="choose width of the bars around")
-parser.add_argument("--featureNameProvided", type=bool, default=False,help="write xlabel name in conf_params_var.py")
 def read_env_data():
     try:
@@ -25,7 +23,7 @@ def read_env_data():
     return x_set, y_set
-def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
+def feature_plots(feature_data, total_width=0.5):
     fig, ax = plt.subplots()
     colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
@@ -43,8 +41,7 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
     plt.xlabel('Feature Number', fontsize=18)
     plt.ylabel('Feature Importance', fontsize=18)
-    if featureNameProvided == True:
-        plt.xticks(ticks=range(len(FEATURE_NAME)), labels=FEATURE_NAME)
+    plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())
     plt.show()
@@ -52,9 +49,19 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
 if __name__=="__main__":
     args=parser.parse_args()
-    state_space_dim=int(STATE_SPACE_DIM)
-    action_space_dim=int(ACTION_SPACE_DIM)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)
+    state_space_dim = 0
+    action_space_dim = 0
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            state_space_dim += 1
+        elif value == 'action':
+            action_space_dim += 1
+        else:
+            print('Please fix config_model.yml to specify either state or action')
+            exit()
     x_set, y_set=read_env_data()
     x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.33, random_state=42)
@@ -63,7 +70,7 @@ if __name__=="__main__":
     print('computing Feature Importance ....')
     feature_importance_data = {}
     for i in range (0, y_set.shape[1]):
-        gb_estimator=env_gb_modeler()
+        gb_estimator=env_gb_modeler(state_space_dim, action_space_dim)
         gb_estimator.create_gb_model()
         gb_model= gb_estimator.train_gb_model(x_train,y_train[:,i])
         feature_importance_data['y' + str(i)] = gb_model.feature_importances_
@@ -72,7 +79,7 @@ if __name__=="__main__":
     modelname='./models/feature_importance.sav'
     joblib.dump(feature_importance_data, modelname)
-    feature_plots(feature_importance_data, total_width= args.widthbar, featureNameProvided = args.featureNameProvided)
+    feature_plots(feature_importance_data, total_width=args.widthbar)
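Both scripts now read feature labels and dimensions from `config/config_model.yml` instead of `conf_params_var.py`. A minimal sketch of the assumed shape of the `IO.feature_name` mapping and of how the state/action dimensions are counted from it (the feature names below are invented for illustration; the real config lists the project's own features):

```python
# Sketch only: the feature names are made up; the actual config_model.yml
# lists the project's own state and action features.
import yaml

example_config = yaml.safe_load("""
IO:
  feature_name:
    theta: state       # labelled as an output (y) when plotting
    theta_dot: state
    torque: action     # labelled as an input (x) when plotting
""")

feature_name = example_config['IO']['feature_name']
state_space_dim = sum(v == 'state' for v in feature_name.values())
action_space_dim = sum(v == 'action' for v in feature_name.values())
print(state_space_dim, action_space_dim)  # -> 2 1
```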

Binary data
models/nnmodel.h5

Binary file not shown.

Binary data
models/nnmodel98.h5

Binary file not shown.