Fixed data cleaning/checker tools with recent overhaul

This commit is contained in:
Journey McDowell 2020-10-27 23:54:08 -07:00
Родитель e3d334c514
Коммит 64bbd5bfad
6 изменённых файлов: 53 добавлений и 55 удалений

Просмотреть файл

@ -9,12 +9,10 @@ import matplotlib as mpl
import argparse
import pickle
from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME, OUTPUT_NAME
import yaml
parser = argparse.ArgumentParser()
parser.add_argument("--thrhld", type=float, default=3,help="choose the threshold to declare outlier (thrhld*sigma)")
parser.add_argument("--OutputNameProvided", type=bool, default=False,help="write output name in conf_params_var.py")
def read_env_data():
try:
@ -28,48 +26,46 @@ def read_env_data():
######################## Functions for Outlier Detection and Plotting ###################
def plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = False):
def plotOutliers(y_set, y_predict_all, outlier_data):
fig = plt.figure()
numSubPlots = y_set.shape[1]
outlierData = outlier_data['y' + str(0)]
if OutputNameProvided:
dataLabel = OUTPUT_NAME[0]
else:
dataLabel = 'y'+ str(0)
dataLabel = []
for key, value in config['IO']['feature_name'].items():
if value == 'state':
dataLabel.append(key)
ax1 = plt.subplot(numSubPlots, 1, 0+1)
plt.plot(y_set[:,0], label=dataLabel, linewidth=1, color = 'blue' )
plt.plot(y_predict_all[0], label=dataLabel, linewidth=1, color = 'black' )
plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )
plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )
plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
plt.xticks(rotation='horizontal')
plt.legend(loc='upper right');
plt.legend(loc='upper right')
for i in range(1,numSubPlots):
outlierData = outlier_data['y' + str(i)]
if OutputNameProvided:
dataLabel = OUTPUT_NAME[i]
else:
dataLabel = 'y'+ str(i)
ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
plt.plot(y_set[:,i], label=dataLabel, linewidth=1, color = 'blue' )
plt.plot(y_predict_all[i], label=dataLabel, linewidth=1, color = 'black' )
plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )
plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )
plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
plt.xticks(rotation='horizontal')
plt.legend(loc='upper right');
plt.legend(loc='upper right')
# plt.show()
def findOutliersAll(x_set,y_set, thrhld = 2):
def findOutliersAll(x_set,y_set, thrhld=2):
## Detecting outliers by comparing each output against its gradient boosting prediction
print('computing Outliers ....')
outlier_data = {}
y_predict_all = []
for i in range (0, y_set.shape[1]):
gb_estimator=GradientBoostingRegressor(n_iter_no_change = 50, validation_fraction = .2)
gb_estimator=GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=.2)
gb_model= gb_estimator.fit(x_set,y_set[:,i])
y_predict = gb_estimator.predict(x_set)
outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld = 2)
outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld=thrhld)
y_predict_all.append(y_predict)
print('y', str(i), ': ', outlier_data['y' + str(i)])
return outlier_data, y_predict_all
@ -80,29 +76,24 @@ def findOutlier(y, y_predict, thrhld = 2):
return outL[0]
###################### Plot Inputs #########################
def plotInputs(x_set,y_set, OutputNameProvided = False):
def plotInputs(x_set,y_set):
fig = plt.figure()
numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs
if OutputNameProvided:
dataLabel = INPUT_NAME[0]
else:
dataLabel = 'x'+ str(0)
dataLabel = []
for key, value in config['IO']['feature_name'].items():
if value == 'action':
dataLabel.append(key)
ax1 = plt.subplot(numSubPlots, 1, 1)
plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel, linewidth=1, color = 'blue' )
plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel[0], linewidth=1, color = 'blue' )
plt.xticks(rotation='horizontal')
plt.legend(loc='upper right');
plt.legend(loc='upper right')
for i in range(1,numSubPlots):
if OutputNameProvided:
dataLabel = OUTPUT_NAME[i]
else:
dataLabel = 'x'+ str(i)
ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel, linewidth=1, color = 'blue' )
plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel[i], linewidth=1, color = 'blue' )
plt.xticks(rotation='horizontal')
plt.legend(loc='upper right');
plt.legend(loc='upper right')
# plt.show()
@ -119,9 +110,11 @@ def maxMinMeanStd(x, varName = 'x'):
if __name__=="__main__":
args=parser.parse_args()
x_set, y_set=read_env_data()
mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
with open('config/config_model.yml') as cmfile:
config = yaml.full_load(cmfile)
x_set, y_set = read_env_data()
mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
##################### Outlier Code Usage ###############################################
@ -135,14 +128,12 @@ if __name__=="__main__":
x_set[i+1,1] = 1.5
## Find Outlier and Plot them
outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld = args.thrhld)
outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld=args.thrhld)
modelname='./models/OutlierData_Y.sav'
joblib.dump(outlier_data, modelname)
plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = args.OutputNameProvided)
plotInputs(x_set,y_set, OutputNameProvided = False)
plotOutliers(y_set, y_predict_all, outlier_data)
plotInputs(x_set,y_set)
plt.show()
############################# Detecting NaN ######################
hasNaN(x_set)
hasNaN(x_set)

Двоичные данные
env_data/x_set.pickle

Двоичный файл не отображается.

Двоичные данные
env_data/y_set.pickle

Двоичный файл не отображается.

Просмотреть файл

@ -7,12 +7,10 @@ from env_data_modeler import env_gb_modeler
import argparse
import pickle
from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME
import yaml
parser = argparse.ArgumentParser()
parser.add_argument("--widthbar", type=float, default=.5,help="choose width of the bars around")
parser.add_argument("--featureNameProvided", type=bool, default=False,help="write xlabel name in conf_params_var.py")
def read_env_data():
try:
@ -25,7 +23,7 @@ def read_env_data():
return x_set, y_set
def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
def feature_plots(feature_data, total_width=0.5):
fig, ax = plt.subplots()
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
@ -43,8 +41,7 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
plt.xlabel('Feature Number', fontsize=18)
plt.ylabel('Feature Importance', fontsize=18)
if featureNameProvided == True:
plt.xticks(ticks=range(len(FEATURE_NAME)), labels=FEATURE_NAME)
plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())
plt.show()
@ -52,9 +49,19 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
if __name__=="__main__":
args=parser.parse_args()
state_space_dim=int(STATE_SPACE_DIM)
action_space_dim=int(ACTION_SPACE_DIM)
with open('config/config_model.yml') as cmfile:
config = yaml.full_load(cmfile)
state_space_dim = 0
action_space_dim = 0
for key, value in config['IO']['feature_name'].items():
if value == 'state':
state_space_dim += 1
elif value == 'action':
action_space_dim += 1
else:
print('Please fix config_model.yml to specify either state or action')
exit()
x_set, y_set=read_env_data()
x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.33, random_state=42)
@ -63,7 +70,7 @@ if __name__=="__main__":
print('computing Feature Importance ....')
feature_importance_data = {}
for i in range (0, y_set.shape[1]):
gb_estimator=env_gb_modeler()
gb_estimator=env_gb_modeler(state_space_dim, action_space_dim)
gb_estimator.create_gb_model()
gb_model= gb_estimator.train_gb_model(x_train,y_train[:,i])
feature_importance_data['y' + str(i)] = gb_model.feature_importances_
@ -72,7 +79,7 @@ if __name__=="__main__":
modelname='./models/feature_importance.sav'
joblib.dump(feature_importance_data, modelname)
feature_plots(feature_importance_data, total_width= args.widthbar, featureNameProvided = args.featureNameProvided)
feature_plots(feature_importance_data, total_width=args.widthbar)

Двоичные данные
models/nnmodel.h5

Двоичный файл не отображается.

Двоичные данные
models/nnmodel98.h5

Двоичный файл не отображается.