Fixed data cleaning/checker tools with recent overhaul
This commit is contained in:
Родитель
e3d334c514
Коммит
64bbd5bfad
|
@ -9,12 +9,10 @@ import matplotlib as mpl
|
|||
|
||||
import argparse
|
||||
import pickle
|
||||
from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME, OUTPUT_NAME
|
||||
|
||||
import yaml
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--thrhld", type=float, default=3,help="choose the threshold to declare outlier (thrhld*sigma)")
|
||||
parser.add_argument("--OutputNameProvided", type=bool, default=False,help="write output name in conf_params_var.py")
|
||||
|
||||
def read_env_data():
|
||||
try:
|
||||
|
@ -28,48 +26,46 @@ def read_env_data():
|
|||
|
||||
|
||||
######################## Functions for Outlier Detection and Plotting ###################
|
||||
def plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = False):
|
||||
def plotOutliers(y_set, y_predict_all, outlier_data):
|
||||
fig = plt.figure()
|
||||
numSubPlots = y_set.shape[1]
|
||||
|
||||
outlierData = outlier_data['y' + str(0)]
|
||||
if OutputNameProvided:
|
||||
dataLabel = OUTPUT_NAME[0]
|
||||
else:
|
||||
dataLabel = 'y'+ str(0)
|
||||
dataLabel = []
|
||||
for key, value in config['IO']['feature_name'].items():
|
||||
if value == 'state':
|
||||
dataLabel.append(key)
|
||||
|
||||
ax1 = plt.subplot(numSubPlots, 1, 0+1)
|
||||
plt.plot(y_set[:,0], label=dataLabel, linewidth=1, color = 'blue' )
|
||||
plt.plot(y_predict_all[0], label=dataLabel, linewidth=1, color = 'black' )
|
||||
plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )
|
||||
plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )
|
||||
plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
|
||||
plt.xticks(rotation='horizontal')
|
||||
plt.legend(loc='upper right');
|
||||
plt.legend(loc='upper right')
|
||||
|
||||
for i in range(1,numSubPlots):
|
||||
outlierData = outlier_data['y' + str(i)]
|
||||
if OutputNameProvided:
|
||||
dataLabel = OUTPUT_NAME[i]
|
||||
else:
|
||||
dataLabel = 'y'+ str(i)
|
||||
|
||||
ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
|
||||
plt.plot(y_set[:,i], label=dataLabel, linewidth=1, color = 'blue' )
|
||||
plt.plot(y_predict_all[i], label=dataLabel, linewidth=1, color = 'black' )
|
||||
plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )
|
||||
plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )
|
||||
plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
|
||||
plt.xticks(rotation='horizontal')
|
||||
plt.legend(loc='upper right');
|
||||
plt.legend(loc='upper right')
|
||||
|
||||
|
||||
# plt.show()
|
||||
|
||||
def findOutliersAll(x_set,y_set, thrhld = 2):
|
||||
def findOutliersAll(x_set,y_set, thrhld=2):
|
||||
## Computing outliers using gradient boosting
|
||||
print('computing Outliers ....')
|
||||
outlier_data = {}
|
||||
y_predict_all = []
|
||||
for i in range (0, y_set.shape[1]):
|
||||
gb_estimator=GradientBoostingRegressor(n_iter_no_change = 50, validation_fraction = .2)
|
||||
gb_estimator=GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=.2)
|
||||
gb_model= gb_estimator.fit(x_set,y_set[:,i])
|
||||
y_predict = gb_estimator.predict(x_set)
|
||||
outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld = 2)
|
||||
outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld=thrhld)
|
||||
y_predict_all.append(y_predict)
|
||||
print('y', str(i), ': ', outlier_data['y' + str(i)])
|
||||
return outlier_data, y_predict_all
|
||||
|
@ -80,29 +76,24 @@ def findOutlier(y, y_predict, thrhld = 2):
|
|||
return outL[0]
|
||||
|
||||
###################### Plot Inputs #########################
|
||||
def plotInputs(x_set,y_set, OutputNameProvided = False):
|
||||
def plotInputs(x_set,y_set):
|
||||
fig = plt.figure()
|
||||
numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs
|
||||
|
||||
if OutputNameProvided:
|
||||
dataLabel = INPUT_NAME[0]
|
||||
else:
|
||||
dataLabel = 'x'+ str(0)
|
||||
|
||||
dataLabel = []
|
||||
for key, value in config['IO']['feature_name'].items():
|
||||
if value == 'action':
|
||||
dataLabel.append(key)
|
||||
ax1 = plt.subplot(numSubPlots, 1, 1)
|
||||
plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel, linewidth=1, color = 'blue' )
|
||||
plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel[0], linewidth=1, color = 'blue' )
|
||||
plt.xticks(rotation='horizontal')
|
||||
plt.legend(loc='upper right');
|
||||
plt.legend(loc='upper right')
|
||||
|
||||
for i in range(1,numSubPlots):
|
||||
|
||||
if OutputNameProvided:
|
||||
dataLabel = OUTPUT_NAME[i]
|
||||
else:
|
||||
dataLabel = 'x'+ str(i)
|
||||
ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
|
||||
plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel, linewidth=1, color = 'blue' )
|
||||
plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel[i], linewidth=1, color = 'blue' )
|
||||
plt.xticks(rotation='horizontal')
|
||||
plt.legend(loc='upper right');
|
||||
plt.legend(loc='upper right')
|
||||
|
||||
# plt.show()
|
||||
|
||||
|
@ -119,9 +110,11 @@ def maxMinMeanStd(x, varName = 'x'):
|
|||
if __name__=="__main__":
|
||||
args=parser.parse_args()
|
||||
|
||||
x_set, y_set=read_env_data()
|
||||
mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
|
||||
with open('config/config_model.yml') as cmfile:
|
||||
config = yaml.full_load(cmfile)
|
||||
|
||||
x_set, y_set = read_env_data()
|
||||
mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
|
||||
|
||||
##################### Outlier Code Usage ###############################################
|
||||
|
||||
|
@ -135,14 +128,12 @@ if __name__=="__main__":
|
|||
x_set[i+1,1] = 1.5
|
||||
|
||||
## Find Outlier and Plot them
|
||||
outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld = args.thrhld)
|
||||
outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld=args.thrhld)
|
||||
modelname='./models/OutlierData_Y.sav'
|
||||
joblib.dump(outlier_data, modelname)
|
||||
plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = args.OutputNameProvided)
|
||||
plotInputs(x_set,y_set, OutputNameProvided = False)
|
||||
plotOutliers(y_set, y_predict_all, outlier_data)
|
||||
plotInputs(x_set,y_set)
|
||||
plt.show()
|
||||
|
||||
############################# Detecting NaN ######################
|
||||
hasNaN(x_set)
|
||||
|
||||
|
||||
|
||||
hasNaN(x_set)
|
Двоичные данные
env_data/x_set.pickle
Двоичные данные
env_data/x_set.pickle
Двоичный файл не отображается.
Двоичные данные
env_data/y_set.pickle
Двоичные данные
env_data/y_set.pickle
Двоичный файл не отображается.
|
@ -7,12 +7,10 @@ from env_data_modeler import env_gb_modeler
|
|||
|
||||
import argparse
|
||||
import pickle
|
||||
from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME
|
||||
|
||||
import yaml
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--widthbar", type=float, default=.5,help="choose width of the bars around")
|
||||
parser.add_argument("--featureNameProvided", type=bool, default=False,help="write xlabel name in conf_params_var.py")
|
||||
|
||||
def read_env_data():
|
||||
try:
|
||||
|
@ -25,7 +23,7 @@ def read_env_data():
|
|||
return x_set, y_set
|
||||
|
||||
|
||||
def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
|
||||
def feature_plots(feature_data, total_width=0.5):
|
||||
fig, ax = plt.subplots()
|
||||
|
||||
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
|
||||
|
@ -43,8 +41,7 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
|
|||
plt.xlabel('Feature Number', fontsize=18)
|
||||
plt.ylabel('Feature Importance', fontsize=18)
|
||||
|
||||
if featureNameProvided == True:
|
||||
plt.xticks(ticks=range(len(FEATURE_NAME)), labels=FEATURE_NAME)
|
||||
plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())
|
||||
plt.show()
|
||||
|
||||
|
||||
|
@ -52,9 +49,19 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
|
|||
if __name__=="__main__":
|
||||
args=parser.parse_args()
|
||||
|
||||
state_space_dim=int(STATE_SPACE_DIM)
|
||||
action_space_dim=int(ACTION_SPACE_DIM)
|
||||
with open('config/config_model.yml') as cmfile:
|
||||
config = yaml.full_load(cmfile)
|
||||
|
||||
state_space_dim = 0
|
||||
action_space_dim = 0
|
||||
for key, value in config['IO']['feature_name'].items():
|
||||
if value == 'state':
|
||||
state_space_dim += 1
|
||||
elif value == 'action':
|
||||
action_space_dim += 1
|
||||
else:
|
||||
print('Please fix config_model.yml to specify either state or action')
|
||||
exit()
|
||||
|
||||
x_set, y_set=read_env_data()
|
||||
x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.33, random_state=42)
|
||||
|
@ -63,7 +70,7 @@ if __name__=="__main__":
|
|||
print('computing Feature Importance ....')
|
||||
feature_importance_data = {}
|
||||
for i in range (0, y_set.shape[1]):
|
||||
gb_estimator=env_gb_modeler()
|
||||
gb_estimator=env_gb_modeler(state_space_dim, action_space_dim)
|
||||
gb_estimator.create_gb_model()
|
||||
gb_model= gb_estimator.train_gb_model(x_train,y_train[:,i])
|
||||
feature_importance_data['y' + str(i)] = gb_model.feature_importances_
|
||||
|
@ -72,7 +79,7 @@ if __name__=="__main__":
|
|||
modelname='./models/feature_importance.sav'
|
||||
joblib.dump(feature_importance_data, modelname)
|
||||
|
||||
feature_plots(feature_importance_data, total_width= args.widthbar, featureNameProvided = args.featureNameProvided)
|
||||
feature_plots(feature_importance_data, total_width=args.widthbar)
|
||||
|
||||
|
||||
|
||||
|
|
Двоичные данные
models/nnmodel.h5
Двоичные данные
models/nnmodel.h5
Двоичный файл не отображается.
Двоичные данные
models/nnmodel98.h5
Двоичные данные
models/nnmodel98.h5
Двоичный файл не отображается.
Загрузка…
Ссылка в новой задаче