Added the Labels artifacts and cleaned up unused imports
* The mapping from the API output to labels is stored in label-mapping.txt. This file can be used together with the output from the API to map the list of floats to the dogs' classes. Formatting fixes Adding the DevOps Pipeline Update deploy-model-training-pipeline.yml Updating the template path (Temporary - Fix) Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml fixes to the path Hardcoding the variables Temporary fixing the resource names to test the e2e pipeline Update the online deployment pipeline [skip ci] Update deploy-online-endpoint-pipeline.yml for Azure Pipelines Create a sample request for the online api Update deploy-online-endpoint-pipeline.yml Update deploy-online-endpoint-pipeline.yml Update deploy-online-endpoint-pipeline.yml for Azure Pipelines Updated the VM with large memory Standard_F2s_v2 ran out of memory when setting up the CV container. Update online-deployment.yml Update train model to use float instead of double Fixing the double/float issue Update train.py Update train.py
This commit is contained in:
Родитель
42429666e6
Коммит
39930f3ba5
|
@ -12,9 +12,3 @@ repos:
|
|||
hooks:
|
||||
- id: black
|
||||
additional_dependencies: ['click==8.0.4']
|
||||
|
||||
# Sort imports deterministically
|
||||
- repo: https://github.com/pycqa/isort
|
||||
rev: 5.10.1
|
||||
hooks:
|
||||
- id: isort
|
||||
|
|
|
@ -7,11 +7,11 @@ variables:
|
|||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- template: ../../../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-fare-online
|
||||
value: dogs-classifier-online
|
||||
- name: endpoint_type
|
||||
value: online
|
||||
|
||||
|
@ -45,16 +45,16 @@ stages:
|
|||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
|
||||
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-online-dp
|
||||
deployment_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
deployment_name: dogs-online-dp
|
||||
deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
|
||||
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
|
||||
parameters:
|
||||
traffic_allocation: taxi-online-dp=100
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-online-dp
|
||||
sample_request: data/taxi-request.json
|
||||
deployment_name: dogs-online-dp
|
||||
sample_request: cv/aml-cli-v2/data/sample-request.json
|
||||
request_type: json
|
|
@ -17,12 +17,12 @@ variables:
|
|||
ado_service_connection_aml_ws: Azure-ARM-Dev
|
||||
|
||||
# IaC
|
||||
resource_group: rg-$(namespace)-$(postfix)$(environment)
|
||||
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
|
||||
application_insights: mlw-$(namespace)-$(postfix)$(environment)
|
||||
key_vault: kv-$(namespace)-$(postfix)$(environment)
|
||||
container_registry: cr$(namespace)$(postfix)$(environment)
|
||||
storage_account: st$(namespace)$(postfix)$(environment)
|
||||
resource_group: azureml-examples-rg
|
||||
aml_workspace: main
|
||||
application_insights: maininsights3940d807586b
|
||||
key_vault: mainkeyvaultaa801245b95c
|
||||
container_registry: e7fd97c2665e408bab3c1c126576f89e
|
||||
storage_account: mainstorage5338948febc34
|
||||
|
||||
# Terraform
|
||||
terraform_version: 0.14.7
|
||||
|
|
|
@ -43,4 +43,4 @@ ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
|
|||
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
|
||||
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
|
||||
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
ENV NCCL_SOCKET_IFNAME="eth0"
|
||||
ENV NCCL_SOCKET_IFNAME="eth0"
|
||||
|
|
|
@ -32,4 +32,4 @@
|
|||
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
</system>
|
||||
|
|
|
@ -11,4 +11,4 @@ tqdm==4.64.0
|
|||
psutil==5.9.0
|
||||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
||||
pytest==7.1.2
|
||||
|
|
|
@ -13,6 +13,7 @@ import tempfile
|
|||
from torch.profiler import profile, record_function, ProfilerActivity
|
||||
from typing import Any
|
||||
|
||||
|
||||
def markdown_trace_handler(dir_name: str, rank: int = 0):
|
||||
"""This handler can be used inside torch.profiler call to output
|
||||
tables in markdown format"""
|
||||
|
@ -51,6 +52,7 @@ def markdown_trace_handler(dir_name: str, rank: int = 0):
|
|||
|
||||
def composite_trace_handler(handler_list):
|
||||
"""This can call multiple trace handlers inside one"""
|
||||
|
||||
def _handler_fn(prof) -> None:
|
||||
for handler in handler_list:
|
||||
handler(prof)
|
||||
|
@ -58,7 +60,9 @@ def composite_trace_handler(handler_list):
|
|||
return _handler_fn
|
||||
|
||||
|
||||
def export_stack_trace_handler(dir_name: str, rank: int=0, metrics=["self_cuda_time_total"]):
|
||||
def export_stack_trace_handler(
|
||||
dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
|
||||
):
|
||||
"""This handler can be used inside torch.profiler call to output
|
||||
tables in markdown format"""
|
||||
|
||||
|
@ -128,24 +132,23 @@ class PyTorchProfilerHandler:
|
|||
markdown_logs_export = os.path.join(
|
||||
self.profiler_output_tmp_dir.name, "markdown"
|
||||
)
|
||||
trace_handlers.append(markdown_trace_handler(
|
||||
markdown_logs_export, rank=self.rank
|
||||
))
|
||||
trace_handlers.append(
|
||||
markdown_trace_handler(markdown_logs_export, rank=self.rank)
|
||||
)
|
||||
|
||||
# export stacks in txt
|
||||
stacks_logs_export = os.path.join(
|
||||
self.profiler_output_tmp_dir.name, "stacks"
|
||||
)
|
||||
stack_metrics = [
|
||||
"self_cpu_time_total"
|
||||
]
|
||||
stack_metrics = ["self_cpu_time_total"]
|
||||
if torch.cuda.is_available():
|
||||
stack_metrics.append("self_cuda_time_total")
|
||||
|
||||
trace_handlers.append(export_stack_trace_handler(
|
||||
stacks_logs_export, rank=self.rank,
|
||||
metrics=stack_metrics
|
||||
))
|
||||
trace_handlers.append(
|
||||
export_stack_trace_handler(
|
||||
stacks_logs_export, rank=self.rank, metrics=stack_metrics
|
||||
)
|
||||
)
|
||||
|
||||
# export tensorboard
|
||||
# NOTE: removed due to segfault in pytorch 1.11.0
|
||||
|
@ -170,7 +173,7 @@ class PyTorchProfilerHandler:
|
|||
with_flops=True,
|
||||
profile_memory=True,
|
||||
activities=activities,
|
||||
with_stack=True, # needed to export stacks
|
||||
with_stack=True, # needed to export stacks
|
||||
on_trace_ready=trace_handler,
|
||||
)
|
||||
self.profiler.start()
|
||||
|
@ -204,8 +207,9 @@ class PyTorchProfilerHandler:
|
|||
"Not stopping profiler as it was not started in the first place."
|
||||
)
|
||||
|
||||
|
||||
class LogTimeBlock(object):
|
||||
""" This class should be used to time a code block.
|
||||
"""This class should be used to time a code block.
|
||||
The time diff is computed from __enter__ to __exit__.
|
||||
Example
|
||||
-------
|
||||
|
@ -224,8 +228,8 @@ class LogTimeBlock(object):
|
|||
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
|
||||
"""
|
||||
# kwargs
|
||||
self.step = kwargs.get('step', None)
|
||||
self.enabled = kwargs.get('enabled', True)
|
||||
self.step = kwargs.get("step", None)
|
||||
self.enabled = kwargs.get("enabled", True)
|
||||
|
||||
# internal variables
|
||||
self.name = name
|
||||
|
@ -233,23 +237,25 @@ class LogTimeBlock(object):
|
|||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
""" Starts the timer, gets triggered at beginning of code block """
|
||||
"""Starts the timer, gets triggered at beginning of code block"""
|
||||
if not self.enabled:
|
||||
return
|
||||
self.start_time = time.time() # starts "timer"
|
||||
self.start_time = time.time() # starts "timer"
|
||||
|
||||
def __exit__(self, exc_type, value, traceback):
|
||||
""" Stops the timer and stores accordingly
|
||||
"""Stops the timer and stores accordingly
|
||||
gets triggered at end of code block.
|
||||
|
||||
|
||||
Note:
|
||||
arguments are by design for with statements.
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
run_time = time.time() - self.start_time # stops "timer"
|
||||
run_time = time.time() - self.start_time # stops "timer"
|
||||
|
||||
self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
|
||||
self._logger.info(
|
||||
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
|
||||
)
|
||||
mlflow.log_metric(self.name + ".time", run_time)
|
||||
|
||||
|
||||
|
@ -262,18 +268,18 @@ class LogDiskIOBlock(object):
|
|||
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
|
||||
"""
|
||||
# kwargs
|
||||
self.step = kwargs.get('step', None)
|
||||
self.enabled = kwargs.get('enabled', True)
|
||||
self.step = kwargs.get("step", None)
|
||||
self.enabled = kwargs.get("enabled", True)
|
||||
|
||||
# internal variables
|
||||
self.name = name
|
||||
self.process_id = os.getpid() # focus on current process
|
||||
self.process_id = os.getpid() # focus on current process
|
||||
self.start_time = None
|
||||
self.start_disk_counters = None
|
||||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
""" Get initial values, gets triggered at beginning of code block """
|
||||
"""Get initial values, gets triggered at beginning of code block"""
|
||||
if not self.enabled:
|
||||
return
|
||||
try:
|
||||
|
@ -286,9 +292,9 @@ class LogDiskIOBlock(object):
|
|||
self.logger.critical("import psutil failed, cannot display disk stats.")
|
||||
|
||||
def __exit__(self, exc_type, value, traceback):
|
||||
""" Stops the timer and stores accordingly
|
||||
"""Stops the timer and stores accordingly
|
||||
gets triggered at end of code block.
|
||||
|
||||
|
||||
Note:
|
||||
arguments are by design for with statements.
|
||||
"""
|
||||
|
@ -304,19 +310,32 @@ class LogDiskIOBlock(object):
|
|||
|
||||
disk_io_metrics = {}
|
||||
end_disk_counters = psutil.Process(self.process_id).io_counters()
|
||||
disk_io_metrics[f"{self.name}.disk.read"] = (end_disk_counters.read_bytes - self.start_disk_counters.read_bytes) / (1024 * 1024)
|
||||
disk_io_metrics[f"{self.name}.disk.write"] = (end_disk_counters.write_bytes - self.start_disk_counters.write_bytes) / (1024 * 1024)
|
||||
disk_io_metrics[f"{self.name}.disk.read"] = (
|
||||
end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
|
||||
) / (1024 * 1024)
|
||||
disk_io_metrics[f"{self.name}.disk.write"] = (
|
||||
end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
|
||||
) / (1024 * 1024)
|
||||
|
||||
self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
|
||||
self._logger.info(
|
||||
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
|
||||
)
|
||||
self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")
|
||||
|
||||
mlflow.log_metrics(disk_io_metrics)
|
||||
|
||||
|
||||
class LogTimeOfIterator():
|
||||
class LogTimeOfIterator:
|
||||
"""This class is intended to "wrap" an existing Iterator
|
||||
and log metrics for each next() call"""
|
||||
def __init__(self, wrapped_sequence:Any, name:str, enabled:bool=True, async_collector:dict=None):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
wrapped_sequence: Any,
|
||||
name: str,
|
||||
enabled: bool = True,
|
||||
async_collector: dict = None,
|
||||
):
|
||||
self.wrapped_sequence = wrapped_sequence
|
||||
self.wrapped_iterator = None
|
||||
|
||||
|
@ -328,7 +347,7 @@ class LogTimeOfIterator():
|
|||
self.async_collector = async_collector
|
||||
|
||||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
"""Creates the iterator"""
|
||||
if self.enabled:
|
||||
|
|
|
@ -23,15 +23,11 @@ import traceback
|
|||
from distutils.util import strtobool
|
||||
|
||||
import mlflow
|
||||
from mlflow.models.signature import infer_signature
|
||||
|
||||
|
||||
# the long list of torch imports
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.autograd import Variable
|
||||
import torchvision
|
||||
from torch.optim import lr_scheduler
|
||||
from torch.profiler import record_function
|
||||
from torch.utils.data import DataLoader
|
||||
|
@ -39,9 +35,6 @@ from torch.utils.data.distributed import DistributedSampler
|
|||
from tqdm import tqdm
|
||||
from transformers.utils import ModelOutput
|
||||
|
||||
from azureml.core import Run
|
||||
|
||||
|
||||
# add path to here, if necessary
|
||||
COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
|
||||
if COMPONENT_ROOT not in sys.path:
|
||||
|
@ -59,6 +52,7 @@ from profiling import (
|
|||
PyTorchProfilerHandler,
|
||||
)
|
||||
|
||||
torch.set_default_dtype(torch.float64)
|
||||
|
||||
class PyTorchDistributedModelTrainingSequence:
|
||||
"""Generic class to run the sequence for training a PyTorch model
|
||||
|
@ -78,9 +72,6 @@ class PyTorchDistributedModelTrainingSequence:
|
|||
self.labels = []
|
||||
self.model_signature = None
|
||||
|
||||
# Run
|
||||
self.runid = None
|
||||
|
||||
# DISTRIBUTED CONFIG
|
||||
self.world_size = 1
|
||||
self.world_rank = 0
|
||||
|
@ -289,7 +280,7 @@ class PyTorchDistributedModelTrainingSequence:
|
|||
def setup_model(self, model):
|
||||
"""Configures a model for training."""
|
||||
self.logger.info(f"Setting up model to use device {self.device}")
|
||||
self.model = model.float().to(self.device)
|
||||
self.model = model.to(self.device)
|
||||
|
||||
# DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
|
||||
if self.multinode_available:
|
||||
|
@ -661,6 +652,13 @@ class PyTorchDistributedModelTrainingSequence:
|
|||
else:
|
||||
model_to_save = self.model.to("cpu")
|
||||
|
||||
# Save the labels to a text file, one label per line.
|
||||
# This file will be required to map the output array
|
||||
# from the API to the labels.
|
||||
with open("label-mapping.txt", "w") as f:
|
||||
f.write("\n".join(self.labels))
|
||||
mlflow.log_artifact("label-mapping.txt")
|
||||
|
||||
# MLFLOW: mlflow has a nice method to export the model automatically
|
||||
# add tags and environment for it. You can then use it in Azure ML
|
||||
# to register your model to an endpoint.
|
||||
|
@ -863,12 +861,6 @@ def run(args):
|
|||
# sets cuda and distributed config
|
||||
training_handler.setup_config(args)
|
||||
|
||||
# Get Run Id
|
||||
run = Run.get_context()
|
||||
run_id = run.get_details()["runId"]
|
||||
training_handler.runid = run_id
|
||||
logger.info(f"Run Id: {run_id}")
|
||||
|
||||
# PROFILER: here we use a helper class to enable profiling
|
||||
# see profiling.py for the implementation details
|
||||
training_profiler = PyTorchProfilerHandler(
|
||||
|
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,6 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
|
||||
name: blue
|
||||
endpoint_name: dogs-classifier-online
|
||||
model: azureml:resnet-dogs-classifier@latest
|
||||
instance_type: Standard_DS2_v2
|
||||
instance_count: 1
|
|
@ -0,0 +1,4 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
|
||||
name: dogs-classifier-online
|
||||
description: Stanford Dogs Classifier
|
||||
auth_mode: key
|
|
@ -0,0 +1,83 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
training_images:
|
||||
type: uri_folder
|
||||
mode: download # pick ro_mount, rw_mount or download
|
||||
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
|
||||
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train//**
|
||||
validation_images:
|
||||
type: uri_folder
|
||||
mode: download # pick ro_mount, rw_mount or download
|
||||
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
|
||||
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
|
||||
# </inputs_and_outputs>
|
||||
|
||||
# <jobs>
|
||||
settings:
|
||||
default_datastore: azureml:workspaceblobstore
|
||||
continue_on_step_failure: true
|
||||
|
||||
jobs:
|
||||
train:
|
||||
type: command
|
||||
component: file:train.yaml
|
||||
compute: azureml:gpu-cluster
|
||||
resources:
|
||||
instance_count: 1 # number of nodes
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 1 # number of gpus
|
||||
|
||||
# NOTE: set env var if needed
|
||||
environment_variables:
|
||||
NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests
|
||||
|
||||
# NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100
|
||||
|
||||
# NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
|
||||
# NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
|
||||
# NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
|
||||
# NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
|
||||
# NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
|
||||
|
||||
# UCX_IB_PCI_RELAXED_ORDERING: "on"
|
||||
# UCX_TLS: "tcp"
|
||||
# UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...
|
||||
|
||||
# CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus
|
||||
|
||||
# TORCH_DISTRIBUTED_DEBUG: "DETAIL"
|
||||
|
||||
inputs:
|
||||
# data inputs
|
||||
train_images: ${{parent.inputs.training_images}}
|
||||
valid_images: ${{parent.inputs.validation_images}}
|
||||
|
||||
# data loading
|
||||
batch_size: 64
|
||||
num_workers: 5
|
||||
prefetch_factor: 4
|
||||
persistent_workers: true
|
||||
pin_memory: true
|
||||
non_blocking: false
|
||||
|
||||
# model
|
||||
model_arch: "resnet18"
|
||||
model_arch_pretrained: true
|
||||
|
||||
# training
|
||||
num_epochs: 1
|
||||
learning_rate: 0.001
|
||||
momentum: 0.9
|
||||
|
||||
# profiling
|
||||
enable_profiling: false
|
||||
# multiprocessing_sharing_strategy: "file_system" # WARNING: this can cause hang at job completion
|
||||
|
||||
# Model Registration
|
||||
register_model_as: "resnet-dogs-classifier"
|
||||
|
||||
# </jobs>
|
|
@ -2,7 +2,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
|
|||
name: nvidia_pytorch
|
||||
version: 22.04-py3
|
||||
build:
|
||||
path: .
|
||||
path: ../../../data-science/src/environment/
|
||||
tags:
|
||||
os: ubuntu
|
||||
os_version: 20.04
|
||||
|
@ -19,5 +19,4 @@ tags:
|
|||
nccl_test: 2.11.0
|
||||
azureml-defaults: 1.41.0
|
||||
mlflow: 1.25.1
|
||||
transformers: 4.18.0
|
||||
onnxconverter-common: 1.9.0
|
||||
transformers: 4.18.0
|
|
@ -0,0 +1,123 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
|
||||
type: command
|
||||
|
||||
description: >-
|
||||
Fine-tunes a pre-trained pytorch model for image classification.
|
||||
Inputs should be provided as distinct directories containing distinct images
|
||||
as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.
|
||||
name: pytorch_image_classifier
|
||||
display_name: Image Classification Model (PyTorch)
|
||||
version: 1.0.4
|
||||
|
||||
inputs:
|
||||
# data loading
|
||||
train_images:
|
||||
type: path
|
||||
description: "Path to folder containing training images, stored in subdirectories according to their class."
|
||||
valid_images:
|
||||
type: path
|
||||
description: "Path to folder containing validation images, stored in subdirectories according to their class."
|
||||
|
||||
# data loading
|
||||
batch_size:
|
||||
type: integer
|
||||
min: 1
|
||||
optional: true
|
||||
description: "Train/valid data loading batch size (default: 64)"
|
||||
num_workers:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Num workers for data loader (default: -1 => all cpus available)"
|
||||
prefetch_factor:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Data loader prefetch factor (default: 2)"
|
||||
persistent_workers:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use persistent prefetching workers (default: True)"
|
||||
pin_memory:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Pin Data loader prefetch factor (default: True)"
|
||||
non_blocking:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use non-blocking transfer to device (default: False)"
|
||||
|
||||
# model
|
||||
model_arch:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Which model architecture to use (default: resnet18)"
|
||||
model_arch_pretrained:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use pretrained model (default: true)"
|
||||
|
||||
# training
|
||||
num_epochs:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Number of epochs to train for (default: 1)"
|
||||
learning_rate:
|
||||
type: number
|
||||
optional: true
|
||||
description: "Learning rate of optimizer (default: 0.001)"
|
||||
momentum:
|
||||
type: number
|
||||
optional: true
|
||||
description: "Momentum of optimizer (default: 0.9)"
|
||||
|
||||
# model registration
|
||||
register_model_as:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Name to register final model in MLFlow"
|
||||
|
||||
# system parameters
|
||||
enable_profiling:
|
||||
type: boolean
|
||||
default: false
|
||||
description: "Enables profiler"
|
||||
multiprocessing_sharing_strategy:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Check https://pytorch.org/docs/stable/multiprocessing.html"
|
||||
|
||||
outputs:
|
||||
checkpoints:
|
||||
type: path
|
||||
description: "Path to export checkpoints"
|
||||
trained_model:
|
||||
type: path
|
||||
description: "Path to the final model"
|
||||
|
||||
code: ../../../data-science/src
|
||||
|
||||
environment: azureml:azureml:nvidia_pytorch:22.04-py3
|
||||
|
||||
command: >-
|
||||
python train.py
|
||||
--train_images ${{inputs.train_images}}
|
||||
--valid_images ${{inputs.valid_images}}
|
||||
[--batch_size ${{inputs.batch_size}}]
|
||||
[--num_workers ${{inputs.num_workers}}]
|
||||
[--prefetch_factor ${{inputs.prefetch_factor}}]
|
||||
[--persistent_workers ${{inputs.persistent_workers}}]
|
||||
[--pin_memory ${{inputs.pin_memory}}]
|
||||
[--non_blocking ${{inputs.non_blocking}}]
|
||||
[--model_arch ${{inputs.model_arch}}]
|
||||
[--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
|
||||
[--num_epochs ${{inputs.num_epochs}}]
|
||||
[--learning_rate ${{inputs.learning_rate}}]
|
||||
[--momentum ${{inputs.momentum}}]
|
||||
--model_output ${{outputs.trained_model}}
|
||||
--checkpoints ${{outputs.checkpoints}}
|
||||
[--register_model_as ${{inputs.register_model_as}}]
|
||||
--enable_profiling ${{inputs.enable_profiling}}
|
||||
[--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]
|
||||
distribution:
|
||||
# NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
|
||||
type: pytorch
|
||||
process_count_per_instance: 1
|
|
@ -0,0 +1,62 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-fare-batch
|
||||
- name: endpoint_type
|
||||
value: batch
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
|
||||
stages:
|
||||
- stage: CreateBatchEndpoint
|
||||
displayName: Create/Update Batch Endpoint
|
||||
jobs:
|
||||
- job: DeployBatchEndpoint
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: batch-cluster # name must match cluster name in deployment file below
|
||||
min_instances: 0
|
||||
max_instances: 5
|
||||
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
|
||||
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-batch-dp
|
||||
deployment_file: mlops/azureml/deploy/batch/batch-deployment.yml
|
||||
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-batch-dp
|
||||
sample_request: data/taxi-batch.csv
|
||||
request_type: uri_file #either uri_folder or uri_file
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- task: CmdLine@2
|
||||
inputs:
|
||||
script: |
|
||||
echo '$(System.DefaultWorkingDirectory)'
|
||||
tree $(System.DefaultWorkingDirectory) /f
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: cv/aml-cli-v2/mlops/azureml/train/pipeline.yaml
|
|
@ -0,0 +1,60 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: dogs-classifier-online
|
||||
- name: endpoint_type
|
||||
value: online
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
|
||||
stages:
|
||||
- stage: CreateOnlineEndpoint
|
||||
displayName: Create/Update Online Endpoint
|
||||
jobs:
|
||||
- job: DeployOnlineEndpoint
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
|
||||
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: dogs-online-dp
|
||||
deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
|
||||
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
|
||||
parameters:
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: dogs-online-dp
|
||||
sample_request: cv/aml-cli-v2/data/sample-request.json
|
||||
request_type: json
|
Загрузка…
Ссылка в новой задаче