From 5495f66ca9f53ea9a52c4c904f02ed7c08268efa Mon Sep 17 00:00:00 2001
From: cindyweng
Date: Mon, 11 Apr 2022 16:02:00 +0100
Subject: [PATCH] temp revert to old train.py

---
 data-science/src/train.py     |  58 ++++++++-----------
 data-science/src/train_new.py | 105 ++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 35 deletions(-)
 create mode 100644 data-science/src/train_new.py

diff --git a/data-science/src/train.py b/data-science/src/train.py
index 131d2d2..ae10635 100644
--- a/data-science/src/train.py
+++ b/data-science/src/train.py
@@ -16,56 +16,44 @@ from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 
-from azureml.core import Run
-run = Run.get_context()
-ws = run.experiment.workspace
-
 def parse_args():
     parser = argparse.ArgumentParser(description="UCI Credit example")
-    parser.add_argument("--transformed_data_path", type=str, default='transformed_data/', help="Directory path to training data")
-    parser.add_argument("--model_path", type=str, default='trained_model/', help="Model output directory")
+    parser.add_argument("--data_path", type=str, default='data/', help="Directory path to training data")
+    parser.add_argument("--model_path", type=str, default='outputs/', help="Model output directory")
     return parser.parse_args()
 
 def main():
     # Parse command-line arguments
     args = parse_args()
 
-    transformed_data_path = os.path.join(args.transformed_data_path, run.parent.id)
-    model_path = os.path.join(args.model_path, run.parent.id)
-
     # Make sure model output path exists
-    if not os.path.exists(model_path):
-        os.makedirs(model_path)
-
+    if not os.path.exists(args.model_path):
+        os.makedirs(args.model_path)
+
+    print(args.data_path)
+    print(os.listdir(os.getcwd()))
+    print(os.getcwd())
+
     # Enable auto logging
     mlflow.sklearn.autolog()
 
     # Read training data
-    train = pd.read_csv(os.path.join(transformed_data_path, 'train.csv'))
-    val = pd.read_csv(os.path.join(transformed_data_path, 'val.csv'))
-
-    run.log('TRAIN SIZE', train.shape[0])
-    run.log('VAL SIZE', val.shape[0])
+    df = pd.read_csv(os.path.join(args.data_path, 'credit.csv'))
 
     # Train model
-    model = model_train(train, val)
+    model = model_train(df)
 
     #copying model to "outputs" directory, this will automatically upload it to Azure ML
-    joblib.dump(value=model, filename=os.path.join(model_path, 'model.pkl'))
+    joblib.dump(value=model, filename=os.path.join(args.model_path, 'model.pkl'))
 
-def model_train(train, val):
-
-    train.drop("Sno", axis=1, inplace=True)
-    val.drop("Sno", axis=1, inplace=True)
+def model_train(df):
+    df.drop("Sno", axis=1, inplace=True)
 
-    y_train = train['Risk']
-    X_train = train.drop('Risk', axis=1)
+    y_raw = df['Risk']
+    X_raw = df.drop('Risk', axis=1)
 
-    y_val = val['Risk']
-    X_val = val.drop('Risk', axis=1)
-
-    categorical_features = X_train.select_dtypes(include=['object']).columns
-    numeric_features = X_train.select_dtypes(include=['int64', 'float']).columns
+    categorical_features = X_raw.select_dtypes(include=['object']).columns
+    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns
 
     categorical_transformer = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
@@ -82,7 +70,10 @@ def model_train(train, val):
     # Encode Labels
     le = LabelEncoder()
-    encoded_y = le.fit_transform(y_train)
+    encoded_y = le.fit_transform(y_raw)
+
+    # Train test split
+    X_train, X_test, y_train, y_test = train_test_split(X_raw, encoded_y, test_size=0.20, stratify=encoded_y, random_state=42)
 
     # Create sklearn pipeline
     lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                              ('classifier', LogisticRegression(solver="lbfgs"))])
@@ -92,12 +83,9 @@ def model_train(train, val):
 
     # Capture metrics
     train_acc = lr_clf.score(X_train, y_train)
-    val_acc = lr_clf.score(X_val, y_val)
+    test_acc = lr_clf.score(X_test, y_test)
     print("Training accuracy: %.3f" % train_acc)
-    print("Validation accuracy: %.3f" % val_acc)
-
-    run.log('Training accuracy', train_acc)
-    run.log('Validation accuracy', val_acc)
+    print("Testing accuracy: %.3f" % test_acc)
 
     return lr_clf
 
diff --git a/data-science/src/train_new.py b/data-science/src/train_new.py
new file mode 100644
index 0000000..131d2d2
--- /dev/null
+++ b/data-science/src/train_new.py
@@ -0,0 +1,105 @@
+import os
+import sys
+import argparse
+import joblib
+import pandas as pd
+
+import mlflow
+import mlflow.sklearn
+
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler
+
+from azureml.core import Run
+run = Run.get_context()
+ws = run.experiment.workspace
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="UCI Credit example")
+    parser.add_argument("--transformed_data_path", type=str, default='transformed_data/', help="Directory path to training data")
+    parser.add_argument("--model_path", type=str, default='trained_model/', help="Model output directory")
+    return parser.parse_args()
+
+def main():
+    # Parse command-line arguments
+    args = parse_args()
+
+    transformed_data_path = os.path.join(args.transformed_data_path, run.parent.id)
+    model_path = os.path.join(args.model_path, run.parent.id)
+
+    # Make sure model output path exists
+    if not os.path.exists(model_path):
+        os.makedirs(model_path)
+
+    # Enable auto logging
+    mlflow.sklearn.autolog()
+
+    # Read training data
+    train = pd.read_csv(os.path.join(transformed_data_path, 'train.csv'))
+    val = pd.read_csv(os.path.join(transformed_data_path, 'val.csv'))
+
+    run.log('TRAIN SIZE', train.shape[0])
+    run.log('VAL SIZE', val.shape[0])
+
+    # Train model
+    model = model_train(train, val)
+
+    #copying model to "outputs" directory, this will automatically upload it to Azure ML
+    joblib.dump(value=model, filename=os.path.join(model_path, 'model.pkl'))
+
+def model_train(train, val):
+
+    train.drop("Sno", axis=1, inplace=True)
+    val.drop("Sno", axis=1, inplace=True)
+
+    y_train = train['Risk']
+    X_train = train.drop('Risk', axis=1)
+
+    y_val = val['Risk']
+    X_val = val.drop('Risk', axis=1)
+
+    categorical_features = X_train.select_dtypes(include=['object']).columns
+    numeric_features = X_train.select_dtypes(include=['int64', 'float']).columns
+
+    categorical_transformer = Pipeline(steps=[
+        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
+        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False))])
+
+    numeric_transformer = Pipeline(steps=[
+        ('scaler', StandardScaler())])
+
+    feature_engineering_pipeline = ColumnTransformer(
+        transformers=[
+            ('numeric', numeric_transformer, numeric_features),
+            ('categorical', categorical_transformer, categorical_features)
+        ], remainder="drop")
+
+    # Encode Labels
+    le = LabelEncoder()
+    encoded_y = le.fit_transform(y_train)
+
+    # Create sklearn pipeline
+    lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
+                             ('classifier', LogisticRegression(solver="lbfgs"))])
+    # Train the model
+    lr_clf.fit(X_train, y_train)
+
+    # Capture metrics
+    train_acc = lr_clf.score(X_train, y_train)
+    val_acc = lr_clf.score(X_val, y_val)
+    print("Training accuracy: %.3f" % train_acc)
+    print("Validation accuracy: %.3f" % val_acc)
+
+    run.log('Training accuracy', train_acc)
+    run.log('Validation accuracy', val_acc)
+
+    return lr_clf
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
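
Note (not part of the patch): a minimal sketch of consuming the model artifact that the reverted train.py writes. It assumes the script has already been run with its defaults (--data_path data/, --model_path outputs/) and that data/credit.csv is present; the file and column names mirror the code above, everything else is illustrative.

    import joblib
    import pandas as pd

    # model.pkl holds the full sklearn Pipeline (imputation, one-hot encoding,
    # scaling, LogisticRegression), so raw feature columns can be passed straight in.
    model = joblib.load("outputs/model.pkl")

    df = pd.read_csv("data/credit.csv")
    X = df.drop(["Sno", "Risk"], axis=1)  # drop the same columns train.py drops before fitting

    # Predictions come back label-encoded (integers): train.py fits a LabelEncoder
    # on the Risk column but never persists it, so map predictions back to the
    # original string labels manually if needed.
    print(model.predict(X)[:10])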