This commit is contained in:
Gustavo Pabon 2022-05-23 22:15:52 +00:00
Родитель b9c17cbb1f
Коммит 74946cffa2
6 изменённых файлов: 57 добавлений и 1313 удалений

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -2,6 +2,13 @@ import os
import numpy as np import numpy as np
from azureml.core import Model from azureml.core import Model
import joblib import joblib
#import argparse
#parser = argparse.ArgumentParser()
#parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
#args = parser.parse_args()
#id_feat = str(args.id_feature)
#print('id feature', id_feat)
def init(): def init():
@ -13,13 +20,8 @@ def init():
model = joblib.load(model_path) model = joblib.load(model_path)
def run(mini_batch):
    """Score one mini-batch with the module-level ``model``.

    The ``Van_Stock_Proposal_Detail_Id`` column is moved into the index so
    that it identifies each row in the output instead of being passed to the
    model as a feature.

    Args:
        mini_batch: DataFrame-like object that has a
            ``Van_Stock_Proposal_Detail_Id`` column; presumably a pandas
            DataFrame handed over by the batch-scoring runtime (confirm
            against the pipeline configuration).

    Returns:
        A list of ``(row_id, prediction, score)`` tuples, one per input row,
        where ``prediction`` comes from ``model.predict`` and ``score`` from
        ``model.score_samples``.
    """
    # Index a copy rather than using inplace=True: mutating the caller's frame
    # makes a second call on the same object fail with KeyError, because the
    # id column would already have been moved into the index.
    features = mini_batch.set_index('Van_Stock_Proposal_Detail_Id')

    index_list = list(features.index)
    # `model` is the global assigned in init() via joblib.load.
    y_pred = model.predict(features).tolist()
    score = model.score_samples(features).tolist()
    return list(zip(index_list, y_pred, score))

Просмотреть файл

@ -3,7 +3,7 @@ from azureml.core import Model, Run
import argparse import argparse
import numpy as np import numpy as np
import iJungle import iJungle
import pickle import joblib
run = Run.get_context() run = Run.get_context()
@ -14,6 +14,7 @@ parser = argparse.ArgumentParser()
# Input Data # Input Data
parser.add_argument("--input-data", type=str, dest='input_data', help='Overhead dataset') parser.add_argument("--input-data", type=str, dest='input_data', help='Overhead dataset')
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
# Hyper parameters # Hyper parameters
parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees') parser.add_argument('--trees', type=int, dest='trees', default=100, help='Number of trees')
@ -21,6 +22,8 @@ parser.add_argument('--subsample-size', type=int, dest='subsample_size', default
# Add arguments to args collection # Add arguments to args collection
args = parser.parse_args() args = parser.parse_args()
id_feat = str(args.id_feature)
print('id feature', id_feat)
# Log Hyperparameter values # Log Hyperparameter values
trees = np.int(args.trees) trees = np.int(args.trees)
@ -33,6 +36,7 @@ run.log('subsample_size', subsample_size)
# Load training data # Load training data
print("Loading Data...") print("Loading Data...")
W = run.input_datasets['overhead_data'].to_pandas_dataframe() # Get the training data from the estimator input W = run.input_datasets['overhead_data'].to_pandas_dataframe() # Get the training data from the estimator input
W.set_index(id_feat, inplace=True)
# Load iFor_list pickle # Load iFor_list pickle
print("Loading pickle...") print("Loading pickle...")
@ -40,9 +44,7 @@ model_name = 'iJungle_light_' + str(trees) + '_' + str(subsample_size)
print(model_name) print(model_name)
model_path = Model.get_model_path(model_name) model_path = Model.get_model_path(model_name)
print(model_path) print(model_path)
with open(model_path, 'rb') as infile: iFor_list = joblib.load(model_path)
iFor_list = pickle.load(infile)
# Evaluation # Evaluation
print("Starting evaluation ...") print("Starting evaluation ...")
@ -50,8 +52,7 @@ os.makedirs(iJungle._MODEL_DIR, exist_ok=True)
results = iJungle.model_eval_fun(W, iFor_list) results = iJungle.model_eval_fun(W, iFor_list)
results_filename = os.path.join(iJungle._MODEL_DIR, model_name + '_results.pkl') results_filename = os.path.join(iJungle._MODEL_DIR, model_name + '_results.pkl')
print("Writing results:", results_filename) print("Writing results:", results_filename)
with open(results_filename, 'wb') as outfile: joblib.dump(value=results, filename=results_filename)
pickle.dump(results, outfile)
# Log dummy metric # Log dummy metric
run.log('Dummy', np.float(0)) run.log('Dummy', np.float(0))

Просмотреть файл

@ -12,6 +12,7 @@ parser = argparse.ArgumentParser()
# Input Data # Input Data
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset') parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
parser.add_argument("--id-feature", type=str, dest='id_feature', help='ID Freature')
parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size') parser.add_argument("--max-subsample-size", type=int, dest='max_sss', help='Max subsample size')
parser.add_argument("--train-size", type=float, dest='train_size', help='Train size') parser.add_argument("--train-size", type=float, dest='train_size', help='Train size')
@ -21,6 +22,8 @@ parser.add_argument('--subsample-size', type=int, dest='subsample_size', default
# Add arguments to args collection # Add arguments to args collection
args = parser.parse_args() args = parser.parse_args()
id_feat = str(args.id_feature)
print('id feature', id_feat)
# Log Hyperparameter values # Log Hyperparameter values
trees = np.int(args.trees) trees = np.int(args.trees)
@ -41,7 +44,7 @@ run.log('train_size', train_size)
# Load training data # Load training data
print("Loading Data...") print("Loading Data...")
df = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input df = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input
df.set_index(id_feat, inplace=True)
print("Starting training ...") print("Starting training ...")
model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss) model_filename = iJungle.model_train_fun(df, trees, subsample_size, train_size, max_sss)

Просмотреть файл

@ -1,2 +1,2 @@
__version__ = '0.1.66' __version__ = '0.1.73'
_MODEL_DIR = 'outputs' _MODEL_DIR = 'outputs'

Просмотреть файл

@ -3,7 +3,7 @@ from iJungle.config import _MODEL_DIR
import random import random
from sklearn.ensemble import IsolationForest from sklearn.ensemble import IsolationForest
import pickle import joblib
import os import os
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -67,9 +67,8 @@ def model_train_fun(df, trees=100, subsample_size=8192, train_size = 0.2, max_ss
print("{}/{}".format(counter, int(df_len/max_sss+1))) print("{}/{}".format(counter, int(df_len/max_sss+1)))
filename = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '.pkl' filename = 'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '.pkl'
with open(os.path.join(_MODEL_DIR, filename), 'wb') as outfile: joblib.dump(value=iFor_list, filename=os.path.join(_MODEL_DIR, filename))
pickle.dump(iFor_list, outfile)
return(filename) return(filename)
except Exception as err: except Exception as err:
# TODO: Implement logger # TODO: Implement logger
@ -135,7 +134,7 @@ def grid_eval(df, subsample_list = [4096, 2048, 1024, 512],
W = df.iloc[my_indexes[:df_len]] W = df.iloc[my_indexes[:df_len]]
results_dic = {} results_dic = {}
## Evaluation with stored models as external files(pickle format) ## Evaluation with stored models as external files(joblib format)
for i, subsample_size in enumerate(subsample_list): for i, subsample_size in enumerate(subsample_list):
results_dic_t = {} results_dic_t = {}
for j, trees in enumerate(trees_list): for j, trees in enumerate(trees_list):
@ -143,16 +142,14 @@ def grid_eval(df, subsample_list = [4096, 2048, 1024, 512],
# TODO: Implement logger # TODO: Implement logger
if verbose: if verbose:
print('Reading ' + filename) print('Reading ' + filename)
with open(os.path.join(_MODEL_DIR, filename), 'rb') as infile: iFor_list = joblib.load(os.path.join(_MODEL_DIR, filename))
iFor_list = pickle.load(infile)
results_dic_t[str(trees)] = model_eval_fun(W, iFor_list, verbose) results_dic_t[str(trees)] = model_eval_fun(W, iFor_list, verbose)
results_dic[str(subsample_size)] = results_dic_t results_dic[str(subsample_size)] = results_dic_t
filename_results = 'iJungle_light_results_overhead.pkl' filename_results = 'iJungle_light_results_overhead.pkl'
results = pd.DataFrame(results_dic) results = pd.DataFrame(results_dic)
with open(os.path.join(_MODEL_DIR, filename_results), 'wb') as outfile: joblib.dump(value=results, filename=os.path.join(_MODEL_DIR, filename_results))
pickle.dump(results, outfile)
return(results) return(results)
except Exception as err: except Exception as err:
# TODO: Implement logger # TODO: Implement logger
@ -165,8 +162,7 @@ def get_grid_eval_results(verbose = True):
if os.path.exists(picklename): if os.path.exists(picklename):
if verbose: if verbose:
print("Reading ", picklename) print("Reading ", picklename)
with open(picklename, 'rb') as pickle_in: results = joblib.load(picklename)
results = pickle.load(pickle_in)
return(results) return(results)
else: else:
raise Exception("grid_eval has not have been executed") raise Exception("grid_eval has not have been executed")
@ -211,9 +207,7 @@ def best_iforest(results, verbose=True):
picklename = os.path.join(_MODEL_DIR,'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '.pkl') picklename = os.path.join(_MODEL_DIR,'iJungle_light_' + str(trees) + '_' + str(subsample_size) + '.pkl')
if verbose: if verbose:
print('Reading ' + picklename) print('Reading ' + picklename)
iFor_list = joblib.load(picklename)
with open(picklename,"rb") as pickle_in:
iFor_list = pickle.load(pickle_in)
model = iFor_list[best_iF_k] model = iFor_list[best_iF_k]
if verbose: if verbose: