This commit is contained in:
cindyweng 2022-04-11 16:02:00 +01:00
Родитель a836266332
Коммит 5495f66ca9
2 изменённых файлов: 128 добавлений и 35 удалений

Просмотреть файл

@@ -16,56 +16,44 @@ from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from azureml.core import Run
run = Run.get_context()
ws = run.experiment.workspace
def parse_args():
parser = argparse.ArgumentParser(description="UCI Credit example")
parser.add_argument("--transformed_data_path", type=str, default='transformed_data/', help="Directory path to training data")
parser.add_argument("--model_path", type=str, default='trained_model/', help="Model output directory")
parser.add_argument("--data_path", type=str, default='data/', help="Directory path to training data")
parser.add_argument("--model_path", type=str, default='outputs/', help="Model output directory")
return parser.parse_args()
def main():
# Parse command-line arguments
args = parse_args()
transformed_data_path = os.path.join(args.transformed_data_path, run.parent.id)
model_path = os.path.join(args.model_path, run.parent.id)
# Make sure model output path exists
if not os.path.exists(model_path):
os.makedirs(model_path)
if not os.path.exists(args.model_path):
os.makedirs(args.model_path)
print(args.data_path)
os.listdir(os.getcwd())
print(os.getcwd())
# Enable auto logging
mlflow.sklearn.autolog()
# Read training data
train = pd.read_csv(os.path.join(transformed_data_path, 'train.csv'))
val = pd.read_csv(os.path.join(transformed_data_path, 'val.csv'))
run.log('TRAIN SIZE', train.shape[0])
run.log('VAL SIZE', val.shape[0])
df = pd.read_csv(os.path.join(args.data_path, 'credit.csv'))
# Train model
model = model_train(train, val)
model = model_train(df)
#copying model to "outputs" directory, this will automatically upload it to Azure ML
joblib.dump(value=model, filename=os.path.join(model_path, 'model.pkl'))
joblib.dump(value=model, filename=os.path.join(args.model_path, 'model.pkl'))
def model_train(train, val):
train.drop("Sno", axis=1, inplace=True)
val.drop("Sno", axis=1, inplace=True)
def model_train(df):
df.drop("Sno", axis=1, inplace=True)
y_train = train['Risk']
X_train = train.drop('Risk', axis=1)
y_raw = df['Risk']
X_raw = df.drop('Risk', axis=1)
y_val = val['Risk']
X_val = val.drop('Risk', axis=1)
categorical_features = X_train.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float']).columns
categorical_features = X_raw.select_dtypes(include=['object']).columns
numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
@@ -82,7 +70,10 @@ def model_train(train, val):
# Encode Labels
le = LabelEncoder()
encoded_y = le.fit_transform(y_train)
encoded_y = le.fit_transform(y_raw)
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_raw, encoded_y, test_size=0.20, stratify=encoded_y, random_state=42)
# Create sklearn pipeline
lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
@@ -92,12 +83,9 @@ def model_train(train, val):
# Capture metrics
train_acc = lr_clf.score(X_train, y_train)
val_acc = lr_clf.score(X_val, y_val)
test_acc = lr_clf.score(X_test, y_test)
print("Training accuracy: %.3f" % train_acc)
print("Validation accuracy: %.3f" % val_acc)
run.log('Training accuracy', train_acc)
run.log('Validation accuracy', val_acc)
print("Testing accuracy: %.3f" % test_acc)
return lr_clf

Просмотреть файл

@@ -0,0 +1,105 @@
import os
import sys
import argparse
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from azureml.core import Run
# Run context obtained once at import time. main() reads the parent run id
# from it and both main() and model_train() log metrics through it.
run = Run.get_context()
# Workspace of the run's experiment. Not referenced again in the visible
# part of this file.
ws = run.experiment.workspace
def parse_args():
    """Parse the command-line options of the training script.

    Returns:
        argparse.Namespace with string attributes ``transformed_data_path``
        and ``model_path``; each falls back to its default when the flag is
        not given on the command line.
    """
    # One (flag, default, help text) row per option; every option is a string.
    option_specs = (
        ("--transformed_data_path", 'transformed_data/', "Directory path to training data"),
        ("--model_path", 'trained_model/', "Model output directory"),
    )

    cli = argparse.ArgumentParser(description="UCI Credit example")
    for flag, fallback, help_text in option_specs:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)
    return cli.parse_args()
def main():
    """Train the model for this run and write it to disk.

    Input and output directories are namespaced by the parent run id:
    ``train.csv`` and ``val.csv`` are read from
    ``<transformed_data_path>/<parent run id>/`` and the fitted pipeline is
    written to ``<model_path>/<parent run id>/model.pkl``. The row counts of
    both splits are logged to the run before training.
    """
    # Parse command-line arguments
    args = parse_args()

    # NOTE(review): assumes the script runs as a child run (e.g. a pipeline
    # step) so that run.parent is set - confirm for local/offline execution.
    parent_run_id = run.parent.id
    transformed_data_path = os.path.join(args.transformed_data_path, parent_run_id)
    model_path = os.path.join(args.model_path, parent_run_id)

    # Make sure model output path exists. exist_ok=True replaces the previous
    # exists()-then-makedirs() pair, which could fail if another process
    # created the directory between the check and the call.
    os.makedirs(model_path, exist_ok=True)

    # Enable auto logging
    mlflow.sklearn.autolog()

    # Read training data
    train = pd.read_csv(os.path.join(transformed_data_path, 'train.csv'))
    val = pd.read_csv(os.path.join(transformed_data_path, 'val.csv'))

    run.log('TRAIN SIZE', train.shape[0])
    run.log('VAL SIZE', val.shape[0])

    # Train model
    model = model_train(train, val)

    # Persist the fitted pipeline in the run-specific model directory.
    joblib.dump(value=model, filename=os.path.join(model_path, 'model.pkl'))
def model_train(train, val):
    """Fit a logistic-regression pipeline on ``train`` and score it on ``val``.

    Args:
        train: DataFrame holding an ``Sno`` column (discarded), the ``Risk``
            target column and the feature columns. Used to fit the pipeline.
        val: DataFrame with the same columns, used only for scoring.

    Returns:
        The fitted sklearn ``Pipeline`` (preprocessing + LogisticRegression).
        It is fitted on the raw ``Risk`` values, so its predictions are
        expressed in those same labels.

    Side effects:
        Prints training and validation accuracy and logs both to the run.
        The input DataFrames are left unmodified.
    """
    # Drop on a copy rather than in place so the caller's frames are not
    # altered behind its back.
    train = train.drop("Sno", axis=1)
    val = val.drop("Sno", axis=1)

    y_train = train['Risk']
    X_train = train.drop('Risk', axis=1)

    y_val = val['Risk']
    X_val = val.drop('Risk', axis=1)

    # Column groups are derived from the training split only.
    categorical_features = X_train.select_dtypes(include=['object']).columns
    numeric_features = X_train.select_dtypes(include=['int64', 'float']).columns

    # handle_unknown='ignore': a category that appears only in the validation
    # split (including the imputer's "missing" fill value when the training
    # split has no gaps) is encoded as all zeros instead of raising.
    # NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2
    # and removed in 1.4 - switch once the pinned version allows.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False,
                                        handle_unknown='ignore'))])

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())])

    # Columns that are neither numeric nor categorical are dropped.
    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ], remainder="drop")

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                             ('classifier', LogisticRegression(solver="lbfgs"))])

    # Train the model. The previous LabelEncoder step was removed: its output
    # was never passed to fit() or score(), so it had no effect.
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    val_acc = lr_clf.score(X_val, y_val)
    print("Training accuracy: %.3f" % train_acc)
    print("Validation accuracy: %.3f" % val_acc)

    run.log('Training accuracy', train_acc)
    run.log('Validation accuracy', val_acc)

    return lr_clf
# Run the training entry point only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()