Merge pull request #6 from microsoft/journeyman/17878

Journeyman/17878
This commit is contained in:
journeyman-msft 2020-10-27 23:56:27 -07:00 committed by GitHub
Parent e3d334c514 c1107ad8b2
Commit e06fd73262
No key found matching this signature
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 56 additions and 58 deletions

View file

@@ -230,8 +230,8 @@ This helps in removing the Outliers and NaNs in the data. Outlier detection algo
 Once the data is generated (in the same way as the data generation process for data-driven modeling), we can detect outliers in the following way:
 ```bash
->>> python checkDataQuality.py
->>> python checkDataQuality.py --thrhld 4
+python checkDataQuality.py
+python checkDataQuality.py --thrhld 4
 ```
 By default, the outlier threshold is 3; it can be adjusted based on the data.
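For reference, `--thrhld` scales a sigma-based cutoff: a point counts as an outlier when its deviation from a fitted gradient-boosting prediction exceeds `thrhld` standard deviations of the residuals. A minimal sketch of that logic, with illustrative names rather than the script's exact code:

```python
# Illustrative sketch of residual-based outlier flagging: fit a gradient
# boosting model, then flag points whose residual exceeds thrhld * sigma.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

def flag_outliers(x_set, y, thrhld=3):
    """Return indices where |y - y_hat| exceeds thrhld * std of the residuals."""
    model = GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=0.2)
    model.fit(x_set, y)
    residuals = y - model.predict(x_set)
    return np.where(np.abs(residuals) > thrhld * residuals.std())[0]
```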
@@ -245,7 +245,7 @@ This code helps in computing feature importance using gradient boosting trees.
 Once the data is generated (in the same way as the data generation process for data-driven modeling), we can compute feature importance in the following way:
 ```bash
->>> python featureImportance.py
+python featureImportance.py
 ```
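Under the hood, as the diff below shows, the script fits one gradient boosting model per output column and collects its `feature_importances_`. A rough sketch of that idea (the helper name here is hypothetical; the script itself wraps the model in `env_gb_modeler`):

```python
# Rough sketch: fit one gradient boosting regressor per output column and
# collect its feature_importances_ (helper name is hypothetical).
from sklearn.ensemble import GradientBoostingRegressor

def compute_feature_importance(x_set, y_set):
    importance = {}
    for i in range(y_set.shape[1]):
        model = GradientBoostingRegressor().fit(x_set, y_set[:, i])
        importance['y' + str(i)] = model.feature_importances_
    return importance
```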
## Contribute Code

View file

@@ -9,12 +9,10 @@ import matplotlib as mpl
 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME, OUTPUT_NAME
+import yaml
 parser = argparse.ArgumentParser()
 parser.add_argument("--thrhld", type=float, default=3,help="choose the threshold to declare outlier (thrhld*sigma)")
-parser.add_argument("--OutputNameProvided", type=bool, default=False,help="write output name in conf_params_var.py")
 def read_env_data():
     try:
@@ -28,48 +26,46 @@ def read_env_data():
 ######################## Functions for Outlier Detection and Ploting ###################
-def plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = False):
+def plotOutliers(y_set, y_predict_all, outlier_data):
     fig = plt.figure()
     numSubPlots = y_set.shape[1]
     outlierData = outlier_data['y' + str(0)]
-    if OutputNameProvided:
-        dataLabel = OUTPUT_NAME[0]
-    else:
-        dataLabel = 'y'+ str(0)
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            dataLabel.append(key)
     ax1 = plt.subplot(numSubPlots, 1, 0+1)
-    plt.plot(y_set[:,0], label=dataLabel, linewidth=1, color = 'blue' )
-    plt.plot(y_predict_all[0], label=dataLabel, linewidth=1, color = 'black' )
+    plt.plot(y_set[:,0], label=dataLabel[0], linewidth=1, color = 'blue' )
+    plt.plot(y_predict_all[0], label=dataLabel[0], linewidth=1, color = 'black' )
     plt.scatter(outlierData,y_set[outlierData,0], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
     plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
     for i in range(1,numSubPlots):
         outlierData = outlier_data['y' + str(i)]
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel = 'y'+ str(i)
         ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(y_set[:,i], label=dataLabel, linewidth=1, color = 'blue' )
-        plt.plot(y_predict_all[i], label=dataLabel, linewidth=1, color = 'black' )
+        plt.plot(y_set[:,i], label=dataLabel[i], linewidth=1, color = 'blue' )
+        plt.plot(y_predict_all[i], label=dataLabel[i], linewidth=1, color = 'black' )
         plt.scatter(outlierData,y_set[outlierData,i], label='outlier', linewidth=1, marker = '*', color = 'red', s = 50)
         plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')
     # plt.show()
-def findOutliersAll(x_set,y_set, thrhld = 2):
+def findOutliersAll(x_set,y_set, thrhld=2):
     ## Computing Feature importance using gradient boosting
     print('computing Outliers ....')
     outlier_data = {}
     y_predict_all = []
     for i in range (0, y_set.shape[1]):
-        gb_estimator=GradientBoostingRegressor(n_iter_no_change = 50, validation_fraction = .2)
+        gb_estimator=GradientBoostingRegressor(n_iter_no_change=50, validation_fraction=.2)
         gb_model= gb_estimator.fit(x_set,y_set[:,i])
         y_predict = gb_estimator.predict(x_set)
-        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld = 2)
+        outlier_data['y' + str(i)] = findOutlier(y_set[:,i], y_predict, thrhld=thrhld)
         y_predict_all.append(y_predict)
         print('y', str(i), ': ', outlier_data['y' + str(i)])
     return outlier_data, y_predict_all
@@ -80,29 +76,24 @@ def findOutlier(y, y_predict, thrhld = 2):
     return outL[0]
 ###################### Lot Inputs #########################
-def plotInputs(x_set,y_set, OutputNameProvided = False):
+def plotInputs(x_set,y_set):
     fig = plt.figure()
     numSubPlots = x_set.shape[1] - y_set.shape[1] ## Num of inputs
-    if OutputNameProvided:
-        dataLabel = INPUT_NAME[0]
-    else:
-        dataLabel = 'x'+ str(0)
+    dataLabel = []
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'action':
+            dataLabel.append(key)
     ax1 = plt.subplot(numSubPlots, 1, 1)
-    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel, linewidth=1, color = 'blue' )
+    plt.plot(x_set[:,y_set.shape[1]+0], label=dataLabel[0], linewidth=1, color = 'blue' )
     plt.xticks(rotation='horizontal')
-    plt.legend(loc='upper right');
+    plt.legend(loc='upper right')
     for i in range(1,numSubPlots):
-        if OutputNameProvided:
-            dataLabel = OUTPUT_NAME[i]
-        else:
-            dataLabel = 'x'+ str(i)
         ax2 = plt.subplot(numSubPlots, 1, i+1, sharex=ax1)
-        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel, linewidth=1, color = 'blue' )
+        plt.plot(x_set[:,y_set.shape[1]+i], label=dataLabel[i], linewidth=1, color = 'blue' )
         plt.xticks(rotation='horizontal')
-        plt.legend(loc='upper right');
+        plt.legend(loc='upper right')
     # plt.show()
@@ -119,9 +110,11 @@ def maxMinMeanStd(x, varName = 'x'):
 if __name__=="__main__":
     args=parser.parse_args()
-    x_set, y_set=read_env_data()
-    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)
+    x_set, y_set = read_env_data()
+    mpl.rcParams['agg.path.chunksize'] = max(10000, x_set.shape[1]+100)
     ##################### Outlier Code Usage ###############################################
@@ -135,14 +128,12 @@ if __name__=="__main__":
         x_set[i+1,1] = 1.5
     ## Find Outlier and Plot them
-    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld = args.thrhld)
+    outlier_data, y_predict_all = findOutliersAll(x_set,y_set, thrhld=args.thrhld)
     modelname='./models/OutlierData_Y.sav'
     joblib.dump(outlier_data, modelname)
-    plotOutliers(y_set, y_predict_all, outlier_data, OutputNameProvided = args.OutputNameProvided)
-    plotInputs(x_set,y_set, OutputNameProvided = False)
+    plotOutliers(y_set, y_predict_all, outlier_data)
+    plotInputs(x_set,y_set)
     plt.show()
     ############################# Detecting NaN ######################
-    hasNaN(x_set)
+    hasNaN(x_set)

Binary data
env_data/x_set.pickle

Binary file not shown.

Binary data
env_data/y_set.pickle

Binary file not shown.

View file

@@ -7,12 +7,10 @@ from env_data_modeler import env_gb_modeler
 import argparse
 import pickle
-from conf_params_var import STATE_SPACE_DIM, ACTION_SPACE_DIM, FEATURE_NAME
+import yaml
 parser = argparse.ArgumentParser()
 parser.add_argument("--widthbar", type=float, default=.5,help="choose width of the bars around")
-parser.add_argument("--featureNameProvided", type=bool, default=False,help="write xlabel name in conf_params_var.py")
 def read_env_data():
     try:
@@ -25,7 +23,7 @@ def read_env_data():
     return x_set, y_set
-def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
+def feature_plots(feature_data, total_width=0.5):
     fig, ax = plt.subplots()
     colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
@@ -43,8 +41,7 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
     plt.xlabel('Feature Number', fontsize=18)
     plt.ylabel('Feature Importance', fontsize=18)
-    if featureNameProvided == True:
-        plt.xticks(ticks=range(len(FEATURE_NAME)), labels=FEATURE_NAME)
+    plt.xticks(ticks=range(state_space_dim+action_space_dim), labels=config['IO']['feature_name'].keys())
     plt.show()
@@ -52,9 +49,19 @@ def feature_plots(feature_data, total_width=0.5, featureNameProvided = False):
 if __name__=="__main__":
     args=parser.parse_args()
-    state_space_dim=int(STATE_SPACE_DIM)
-    action_space_dim=int(ACTION_SPACE_DIM)
+    with open('config/config_model.yml') as cmfile:
+        config = yaml.full_load(cmfile)
+    state_space_dim = 0
+    action_space_dim = 0
+    for key, value in config['IO']['feature_name'].items():
+        if value == 'state':
+            state_space_dim += 1
+        elif value == 'action':
+            action_space_dim += 1
+        else:
+            print('Please fix config_model.yml to specify either state or action')
+            exit()
     x_set, y_set=read_env_data()
     x_train, x_test, y_train, y_test = train_test_split(x_set, y_set, test_size=0.33, random_state=42)
@@ -63,7 +70,7 @@ if __name__=="__main__":
     print('computing Feature Importance ....')
     feature_importance_data = {}
     for i in range (0, y_set.shape[1]):
-        gb_estimator=env_gb_modeler()
+        gb_estimator=env_gb_modeler(state_space_dim, action_space_dim)
         gb_estimator.create_gb_model()
         gb_model= gb_estimator.train_gb_model(x_train,y_train[:,i])
         feature_importance_data['y' + str(i)] = gb_model.feature_importances_
@@ -72,7 +79,7 @@ if __name__=="__main__":
     modelname='./models/feature_importance.sav'
     joblib.dump(feature_importance_data, modelname)
-    feature_plots(feature_importance_data, total_width= args.widthbar, featureNameProvided = args.featureNameProvided)
+    feature_plots(feature_importance_data, total_width=args.widthbar)
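Both scripts now read feature labels and dimensions from `config/config_model.yml` instead of `conf_params_var.py`. A minimal sketch of the assumed shape of the `IO.feature_name` mapping and of how the state/action dimensions are counted from it (the feature names below are invented for illustration; the real config lists the project's own features):

```python
# Sketch only: the feature names are made up; the actual config_model.yml
# lists the project's own state and action features.
import yaml

example_config = yaml.safe_load("""
IO:
  feature_name:
    theta: state       # labelled as an output (y) when plotting
    theta_dot: state
    torque: action     # labelled as an input (x) when plotting
""")

feature_name = example_config['IO']['feature_name']
state_space_dim = sum(v == 'state' for v in feature_name.values())
action_space_dim = sum(v == 'action' for v in feature_name.values())
print(state_space_dim, action_space_dim)  # -> 2 1
```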

Binary data
models/nnmodel.h5

Binary file not shown.

Binary data
models/nnmodel98.h5

Binary file not shown.