Added comments and refactored step outputs to use hashmaps

John Wu 2019-12-09 13:54:20 -08:00
Parent c82362f1d8
Commit d6e18ed9ea
27 changed files with 93 additions and 350 deletions

Binary data
modules/__pycache__/__init__.cpython-36.pyc

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -4,6 +4,22 @@ from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
'''
This step registers and deploys a new model on its first run. On subsequent runs, it registers
and deploys a new model only if the training dataset has changed, or if the dataset is unchanged but the accuracy has improved.
:param model_dir: The reference to the directory containing the trained model
:type model_dir: DataReference
:param accuracy_file: The reference to the file containing the evaluation accuracy
:type accuracy_file: DataReference
:param test_dir: The reference to the directory containing the testing data
:type test_dir: DataReference
:param compute_target: The compute target to run the step on
:type compute_target: ComputeTarget
:return: The deploy step, step outputs dictionary (keys: scoring_url)
:rtype: PythonScriptStep, dict
'''
scoring_url = PipelineData(
name='scoring_url',
@ -13,6 +29,7 @@ def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
is_directory=False)
outputs = [scoring_url]
outputs_map = { 'scoring_url': scoring_url }
step = PythonScriptStep(
script_name='deploy.py',
@ -29,5 +46,5 @@ def deploy_step(model_dir, accuracy_file, test_dir, compute_target):
allow_reuse=False
)
return step, outputs
return step, outputs_map
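
The deploy.py script this step runs is not part of this diff. Below is a minimal sketch of the conditional register-and-deploy logic the docstring describes, assuming the azureml-core Model API; the model name 'image-classifier', the 'accuracy' tag, and the model.pt filename are illustrative assumptions, not taken from the repository.

import os
import argparse
from azureml.core import Run
from azureml.core.model import Model

parser = argparse.ArgumentParser(description='Deploy arg parser')
parser.add_argument('--model_dir', type=str, help='Directory containing the trained model')
parser.add_argument('--accuracy_file', type=str, help='File containing the evaluation accuracy')
args = parser.parse_args()

# Read the accuracy produced by the evaluate step
with open(args.accuracy_file, 'r') as f:
    new_accuracy = float(f.read())

run = Run.get_context()
workspace = run.experiment.workspace

# Compare against the best accuracy recorded on previously registered models (tag name assumed)
previous = Model.list(workspace, name='image-classifier')
best_previous = max((float(m.tags.get('accuracy', 0)) for m in previous), default=0.0)

if new_accuracy > best_previous:
    model = Model.register(workspace=workspace,
                           model_path=os.path.join(args.model_dir, 'model.pt'),
                           model_name='image-classifier',
                           tags={'accuracy': str(new_accuracy)})
    # A Model.deploy(...) call with an inference and ACI/AKS deployment configuration would
    # follow here, and the resulting scoring URI would be written to the scoring_url output.
else:
    print('Accuracy did not improve; skipping registration and deployment.')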

View file

@ -1,76 +0,0 @@
from __future__ import print_function, division
import argparse
import time
import torch
import torch.nn as nn
from torchvision import datasets, models, transforms
def load_data(test_dir):
test_transform = transforms.Compose([
transforms.Resize(200),
transforms.CenterCrop(200),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.405],
std=[0.229, 0.224, 0.225])
])
test_dataset = datasets.ImageFolder(root=test_dir, transform=test_transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=4)
dataset_size = len(test_loader.dataset)
class_names = test_dataset.classes
return test_loader, dataset_size, class_names
def evaluate_model(model, criterion, dataloader, dataset_size, class_names, device):
model.eval()
running_loss = 0.0
running_corrects = 0
for batch_idx, (inputs, labels) in enumerate(dataloader):
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
running_loss += loss.item() * inputs.size(0)
corrects = torch.sum(preds == labels.data)
running_corrects += corrects
print('{}/{} predictions correct.'.format(running_corrects, dataset_size))
loss = running_loss / dataset_size
acc = running_corrects.double() / dataset_size
print('Loss: {:.4f} Acc: {:.4f}'.format(loss, acc))
return acc
# Define arguments
parser = argparse.ArgumentParser(description='Evaluate arg parser')
parser.add_argument('--test_dir', type=str, help='Directory where testing data is stored')
parser.add_argument('--model_dir', type=str, help='Directory where model is stored')
parser.add_argument('--accuracy_file', type=str, help='File to output the accuracy to')
args = parser.parse_args()
# Get arguments from parser
test_dir = args.test_dir
model_dir = args.model_dir
accuracy_file = args.accuracy_file
# Load testing data, model, and device
test_loader, dataset_size, class_names = load_data(test_dir)
model = torch.load(os.path.join(model_dir,'model.pt'))
device = torch.device('cuda:0')
# Define criterion
criterion = nn.CrossEntropyLoss()
# Evaluate model
acc = evaluate_model(model, criterion, test_loader, dataset_size, class_names, device)
# Output accuracy to file
with open(accuracy_file, 'w+') as f:
f.write(str(acc.item()))

View file

@ -1,40 +0,0 @@
import os
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import EstimatorStep
from azureml.train.dnn import PyTorch
def evaluate_step(model_dir, test_dir, compute_target):
accuracy_file = PipelineData(
name='accuracy_file',
pipeline_output_name='accuracy_file',
datastore=test_dir.datastore,
output_mode='mount',
is_directory=False)
outputs = [accuracy_file]
estimator = PyTorch(
source_directory=os.path.dirname(os.path.abspath(__file__)),
entry_script='evaluate.py',
framework_version='1.3',
compute_target=compute_target,
use_gpu=True)
step = EstimatorStep(
estimator=estimator,
estimator_entry_script_arguments=[
'--test_dir', test_dir,
'--model_dir', model_dir,
'--accuracy_file', accuracy_file
],
inputs=[model_dir, test_dir],
outputs=outputs,
compute_target=compute_target,
allow_reuse=False)
return step, outputs

Binary file not shown.

Binary file not shown.

View file

@ -6,7 +6,9 @@ import torch.nn as nn
from torchvision import datasets, models, transforms
def load_data(test_dir):
'''
Loads the testing data
'''
test_transform = transforms.Compose([
transforms.Resize(200),
transforms.CenterCrop(200),
@ -24,7 +26,9 @@ def load_data(test_dir):
return test_loader, dataset_size, class_names
def evaluate_model(model, criterion, dataloader, dataset_size, class_names, device):
'''
Evaluates the model
'''
model.eval()
running_loss = 0.0
running_corrects = 0

View file

@ -8,6 +8,19 @@ from azureml.pipeline.steps import EstimatorStep
from azureml.train.dnn import PyTorch
def evaluate_step(model_dir, test_dir, compute_target):
'''
This step evaluates the trained model on the testing data and outputs the accuracy.
:param model_dir: The reference to the directory containing the trained model
:type model_dir: DataReference
:param test_dir: The reference to the directory containing the testing data
:type test_dir: DataReference
:param compute_target: The compute target to run the step on
:type compute_target: ComputeTarget
:return: The evaluate step, step outputs dictionary (keys: accuracy_file)
:rtype: EstimatorStep, dict
'''
accuracy_file = PipelineData(
name='accuracy_file',
@ -17,6 +30,7 @@ def evaluate_step(model_dir, test_dir, compute_target):
is_directory=False)
outputs = [accuracy_file]
outputs_map = { 'accuracy_file': accuracy_file }
estimator = PyTorch(
source_directory=os.path.dirname(os.path.abspath(__file__)),
@ -37,4 +51,4 @@ def evaluate_step(model_dir, test_dir, compute_target):
compute_target=compute_target,
allow_reuse=False)
return step, outputs
return step, outputs_map

View file

@ -1,35 +0,0 @@
import os
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
def data_ingestion_step(datastore_reference, compute_target):
run_config = RunConfiguration()
run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
run_config.environment.docker.enabled = True
num_images = PipelineParameter(name='num_images', default_value=25)
raw_data_dir = PipelineData(
name='raw_data_dir',
pipeline_output_name='raw_data_dir',
datastore=datastore_reference.datastore,
output_mode='mount',
is_directory=True)
outputs = [raw_data_dir]
step = PythonScriptStep(
script_name='data_ingestion.py',
arguments=['--output_dir', raw_data_dir, '--num_images', num_images],
inputs=[datastore_reference],
outputs=outputs,
compute_target=compute_target,
source_directory=os.path.dirname(os.path.abspath(__file__)),
runconfig=run_config,
allow_reuse=False
)
return step, outputs

Binary file not shown.

Binary file not shown.

View file

@ -5,6 +5,20 @@ from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
def data_ingestion_step(datastore_reference, compute_target):
'''
This step will leverage Azure Cognitive Services to search the web for images
to create a dataset. This replicates the real-world scenario of data being
ingested from a constantly changing source. The same 10 classes as in the CIFAR-10 dataset
will be used (airplane, automobile, bird, cat, deer, dog, frog, horse, ship, truck).
:param datastore_reference: The reference to the datastore that will be used
:type datastore_reference: DataReference
:param compute_target: The compute target to run the step on
:type compute_target: ComputeTarget
:return: The ingestion step, step outputs dictionary (keys: raw_data_dir)
:rtype: PythonScriptStep, dict
'''
run_config = RunConfiguration()
run_config.environment.environment_variables = {'COGNITIVE_SERVICES_API_KEY': os.environ['COGNITIVE_SERVICES_API_KEY']}
@ -20,6 +34,7 @@ def data_ingestion_step(datastore_reference, compute_target):
is_directory=True)
outputs = [raw_data_dir]
outputs_map = { 'raw_data_dir': raw_data_dir }
step = PythonScriptStep(
script_name='data_ingestion.py',
@ -32,4 +47,4 @@ def data_ingestion_step(datastore_reference, compute_target):
allow_reuse=False
)
return step, outputs
return step, outputs_map
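
data_ingestion.py itself is not shown in this diff. The sketch below illustrates one way the Cognitive Services image search described in the docstring could work, using the Bing Image Search v7 REST endpoint and the COGNITIVE_SERVICES_API_KEY environment variable that the run configuration passes through; the endpoint, query parameters, and file-naming scheme are assumptions.

import os
import argparse
import requests

BING_ENDPOINT = 'https://api.cognitive.microsoft.com/bing/v7.0/images/search'  # assumed endpoint
CLASSES = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

def search_image_urls(query, count):
    # Query Bing Image Search using the key injected via the run configuration above
    headers = {'Ocp-Apim-Subscription-Key': os.environ['COGNITIVE_SERVICES_API_KEY']}
    params = {'q': query, 'count': count, 'imageType': 'photo'}
    response = requests.get(BING_ENDPOINT, headers=headers, params=params)
    response.raise_for_status()
    return [result['contentUrl'] for result in response.json()['value']]

def download_class(label, num_images, output_dir):
    # One sub-directory per class, which is the layout data_preprocess.py expects
    class_dir = os.path.join(output_dir, label)
    os.makedirs(class_dir, exist_ok=True)
    for i, url in enumerate(search_image_urls(label, num_images)):
        try:
            image_bytes = requests.get(url, timeout=10).content
            with open(os.path.join(class_dir, '{}_{}.jpg'.format(label, i)), 'wb') as f:
                f.write(image_bytes)
        except requests.RequestException:
            print('Skipping unreachable image: {}'.format(url))

parser = argparse.ArgumentParser(description='Data ingestion arg parser')
parser.add_argument('--output_dir', type=str, help='Directory to output the raw data to')
parser.add_argument('--num_images', type=int, help='Number of images to download per class')
args = parser.parse_args()

for label in CLASSES:
    download_class(label, args.num_images, args.output_dir)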

View file

@ -1,72 +0,0 @@
import os
import argparse
import random
import cv2
from imutils import paths
def preprocess_images(files, image_dim, output_dir, label):
'''
Load files, crop to consistent size, and save to respective folder
'''
# Make class directory
class_directory = '{}/{}'.format(output_dir, label)
if not os.path.exists(class_directory):
os.makedirs(class_directory)
# Iterate through files
for f in files:
temp = f.split('/')
output_file = '{}/{}/{}'.format(output_dir, label, temp[-1])
try:
image = cv2.imread(f)
image = cv2.resize(image, (image_dim, image_dim))
cv2.imwrite(output_file, image)
print('Cropping image: {}'.format(output_file))
except:
print('Removing corrupted file: {}'.format(output_file))
# Define arguments
parser = argparse.ArgumentParser(description='Web scraping arg parser')
parser.add_argument('--raw_data_dir', type=str, help='Directory where raw data is stored')
parser.add_argument('--image_dim', type=int, help='Image dimension to be cropped to')
parser.add_argument('--train_dir', type=str, help='Directory to output the processed training data')
parser.add_argument('--valid_dir', type=str, help='Directory to output the processed valid data')
parser.add_argument('--test_dir', type=str, help='Directory to output the processed test data')
args = parser.parse_args()
# Get arguments from parser
raw_data_dir = args.raw_data_dir
image_dim = args.image_dim
train_dir = args.train_dir
valid_dir = args.valid_dir
test_dir = args.test_dir
# Make train, valid, test directories
if not os.path.exists(train_dir):
os.makedirs(train_dir)
if not os.path.exists(valid_dir):
os.makedirs(valid_dir)
if not os.path.exists(test_dir):
os.makedirs(test_dir)
# Get all the classes that have been sorted into directories from previous step
classes = os.listdir(raw_data_dir)
for label in classes:
# Get and shuffle files
image_files = list(paths.list_images('{}/{}'.format(raw_data_dir, label)))
random.shuffle(image_files)
# Split into train, valid, test sets
num_images = len(image_files)
train_files = image_files[0:int(num_images*0.7)]
valid_files = image_files[int(num_images*0.7):int(num_images*0.9)]
test_files = image_files[int(num_images*0.9):num_images]
# Load files, crop to consistent size, and save to respective folder
preprocess_images(train_files, image_dim, train_dir, label)
preprocess_images(valid_files, image_dim, valid_dir, label)
preprocess_images(test_files, image_dim, test_dir, label)

View file

@ -1,56 +0,0 @@
import os
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
def data_preprocess_step(raw_data_dir, compute_target):
run_config = RunConfiguration()
run_config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=['opencv-python==4.1.1.26', 'imutils==0.5.3'])
run_config.environment.docker.enabled = True
image_dim = PipelineParameter(name='image_dim', default_value=200)
train_dir = PipelineData(
name='train_dir',
pipeline_output_name='train_dir',
datastore=raw_data_dir.datastore,
output_mode='mount',
is_directory=True)
valid_dir = PipelineData(
name='valid_dir',
pipeline_output_name='valid_dir',
datastore=raw_data_dir.datastore,
output_mode='mount',
is_directory=True)
test_dir = PipelineData(
name='test_dir',
pipeline_output_name='test_dir',
datastore=raw_data_dir.datastore,
output_mode='mount',
is_directory=True)
outputs = [train_dir, valid_dir, test_dir]
step = PythonScriptStep(
script_name='data_preprocess.py',
arguments=[
'--raw_data_dir', raw_data_dir,
'--train_dir', train_dir,
'--valid_dir', valid_dir,
'--test_dir', test_dir,
'--image_dim', image_dim
],
inputs=[raw_data_dir],
outputs=outputs,
compute_target=compute_target,
runconfig=run_config,
source_directory=os.path.dirname(os.path.abspath(__file__)),
allow_reuse=False
)
return step, outputs

Binary file not shown.

Binary file not shown.

View file

@ -6,6 +6,18 @@ from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
def data_preprocess_step(raw_data_dir, compute_target):
'''
This step takes the raw data downloaded in the previous step and preprocesses it by cropping
the images to a consistent size, shuffling them, and splitting them into train, valid, and test directories.
:param raw_data_dir: The reference to the directory containing the raw data
:type raw_data_dir: DataReference
:param compute_target: The compute target to run the step on
:type compute_target: ComputeTarget
:return: The preprocess step, step outputs dictionary (keys: train_dir, valid_dir, test_dir)
:rtype: PythonScriptStep, dict
'''
run_config = RunConfiguration()
run_config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=['opencv-python==4.1.1.26', 'imutils==0.5.3'])
@ -35,6 +47,11 @@ def data_preprocess_step(raw_data_dir, compute_target):
is_directory=True)
outputs = [train_dir, valid_dir, test_dir]
outputs_map = {
'train_dir': train_dir,
'valid_dir': valid_dir,
'test_dir': test_dir,
}
step = PythonScriptStep(
script_name='data_preprocess.py',
@ -53,4 +70,4 @@ def data_preprocess_step(raw_data_dir, compute_target):
allow_reuse=False
)
return step, outputs
return step, outputs_map

View file

@ -1,11 +0,0 @@
# Define arguments
parser = argparse.ArgumentParser(description='Training arg parser')
parser.add_argument('--train_dir', type=str, help='Directory where training data is stored')
parser.add_argument('--valid_dir', type=str, help='Directory where validation data is stored')
parser.add_argument('--output_dir', type=str, help='Directory to output the model to')
args = parser.parse_args()
# Get arguments from parser
train_dir = args.train_dir
valid_dir = args.valid_dir
output_dir = args.output_dir

View file

@ -1,49 +0,0 @@
import os
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import EstimatorStep
from azureml.train.dnn import PyTorch
def train_step(train_dir, valid_dir, compute_target):
num_epochs = PipelineParameter(name='num_epochs', default_value=25)
batch_size = PipelineParameter(name='batch_size', default_value=16)
learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
momentum = PipelineParameter(name='momentum', default_value=0.9)
model_dir = PipelineData(
name='model_dir',
pipeline_output_name='model_dir',
datastore=train_dir.datastore,
output_mode='mount',
is_directory=True)
outputs = [model_dir]
estimator = PyTorch(
source_directory=os.path.dirname(os.path.abspath(__file__)),
entry_script='train.py',
framework_version='1.3',
compute_target=compute_target,
use_gpu=True)
step = EstimatorStep(
estimator=estimator,
estimator_entry_script_arguments=[
'--train_dir', train_dir,
'--valid_dir', valid_dir,
'--output_dir', model_dir,
'--num_epochs', num_epochs,
'--batch_size', batch_size,
'--learning_rate', learning_rate,
'--momentum', momentum
],
inputs=[train_dir, valid_dir],
compute_target=compute_target,
outputs=outputs,
allow_reuse=False)
return step, outputs

Binary file not shown.

Binary file not shown.

View file

@ -8,6 +8,20 @@ from azureml.pipeline.steps import EstimatorStep
from azureml.train.dnn import PyTorch
def train_step(train_dir, valid_dir, compute_target):
'''
This step will fine-tune a ResNet-18 model on our dataset using PyTorch.
It will use the corresponding input image directories as training and validation data.
:param train_dir: The reference to the directory containing the training data
:type train_dir: DataReference
:param valid_dir: The reference to the directory containing the validation data
:type valid_dir: DataReference
:param compute_target: The compute target to run the step on
:type compute_target: ComputeTarget
:return: The train step, step outputs dictionary (keys: model_dir)
:rtype: EstimatorStep, dict
'''
num_epochs = PipelineParameter(name='num_epochs', default_value=25)
batch_size = PipelineParameter(name='batch_size', default_value=16)
@ -22,6 +36,7 @@ def train_step(train_dir, valid_dir, compute_target):
is_directory=True)
outputs = [model_dir]
outputs_map = { 'model_dir': model_dir }
estimator = PyTorch(
source_directory=os.path.dirname(os.path.abspath(__file__)),
@ -46,4 +61,4 @@ def train_step(train_dir, valid_dir, compute_target):
outputs=outputs,
allow_reuse=False)
return step, outputs
return step, outputs_map
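
Only the argument-parsing fragment of train.py appears in this diff. The sketch below shows how the ResNet-18 fine-tuning the docstring describes could look, reusing the transform and file conventions visible in evaluate.py; the exact transforms, loop structure, and hyperparameter handling are assumptions.

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms

def make_loader(data_dir, batch_size):
    # Same preprocessing conventions as load_data() in evaluate.py
    transform = transforms.Compose([
        transforms.Resize(200),
        transforms.CenterCrop(200),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    dataset = datasets.ImageFolder(root=data_dir, transform=transform)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

def fine_tune(train_dir, valid_dir, output_dir, num_epochs, batch_size, learning_rate, momentum):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    loaders = {'train': make_loader(train_dir, batch_size), 'valid': make_loader(valid_dir, batch_size)}

    # Start from an ImageNet-pretrained ResNet-18 and replace the final classification layer
    model = models.resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, len(loaders['train'].dataset.classes))
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    for epoch in range(num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            for inputs, labels in loaders[phase]:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    loss = criterion(model(inputs), labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item() * inputs.size(0)
            print('Epoch {}/{} {} loss: {:.4f}'.format(
                epoch + 1, num_epochs, phase, running_loss / len(loaders[phase].dataset)))

    # Save in the location and format that evaluate.py loads from
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model, os.path.join(output_dir, 'model.pt'))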

View file

@ -41,16 +41,16 @@ datastore = DataReference(datastore, mode='mount')
data_ingestion_step, data_ingestion_outputs = data_ingestion_step(datastore, cpu_compute_target)
# Step 2: Data preprocessing
data_preprocess_step, data_preprocess_outputs = data_preprocess_step(data_ingestion_outputs[0], cpu_compute_target)
data_preprocess_step, data_preprocess_outputs = data_preprocess_step(data_ingestion_outputs['raw_data_dir'], cpu_compute_target)
# Step 3: Train Model
train_step, train_outputs = train_step(data_preprocess_outputs[0], data_preprocess_outputs[1], gpu_compute_target)
train_step, train_outputs = train_step(data_preprocess_outputs['train_dir'], data_preprocess_outputs['valid_dir'], gpu_compute_target)
# Step 4: Evaluate Model
evaluate_step, evaluate_outputs = evaluate_step(train_outputs[0], data_preprocess_outputs[2], gpu_compute_target)
evaluate_step, evaluate_outputs = evaluate_step(train_outputs['model_dir'], data_preprocess_outputs['test_dir'], gpu_compute_target)
# Step 5: Deploy Model
deploy_step, deploy_outputs = deploy_step(train_outputs[0], evaluate_outputs[0], data_preprocess_outputs[2], cpu_compute_target)
deploy_step, deploy_outputs = deploy_step(train_outputs['model_dir'], evaluate_outputs['accuracy_file'], data_preprocess_outputs['test_dir'], cpu_compute_target)
# Submit pipeline
print('Submitting pipeline ...')
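
With dictionary outputs, each step's results are now wired to the next step by name rather than by list index, which is what the pipeline.py changes above rely on. A minimal sketch of how the assembled steps would then be combined and submitted, assuming the workspace object and an experiment name defined elsewhere in pipeline.py that are not visible in this diff:

from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# 'workspace' is assumed to be the Workspace object created earlier in pipeline.py
pipeline = Pipeline(workspace=workspace, steps=[
    data_ingestion_step,
    data_preprocess_step,
    train_step,
    evaluate_step,
    deploy_step
])
pipeline_run = Experiment(workspace, 'image-classification-pipeline').submit(pipeline)  # experiment name assumed
pipeline_run.wait_for_completion(show_output=True)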