Added the Labels artifacts and cleaned up unused imports

* The mapping between the API output and the labels is stored in label-mapping.txt. This file can be used together with the output from the API to map the list of floats to the corresponding dog classes.

Formatting fixes

Adding the DevOps Pipeline

Update deploy-model-training-pipeline.yml

Updating the template path (Temporary - Fix)

Update deploy-model-training-pipeline.yml for Azure Pipelines

Update deploy-model-training-pipeline.yml for Azure Pipelines

Update deploy-model-training-pipeline.yml for Azure Pipelines

Update deploy-model-training-pipeline.yml for Azure Pipelines

Update deploy-model-training-pipeline.yml

fixes to the path

Hardcoding the variables

Temporarily fixing the resource names to test the e2e pipeline

Update the online deployment pipeline

[skip ci]

Update deploy-online-endpoint-pipeline.yml for Azure Pipelines

Create a sample request for the online api

Update deploy-online-endpoint-pipeline.yml

Update deploy-online-endpoint-pipeline.yml

Update deploy-online-endpoint-pipeline.yml for Azure Pipelines

Updated the VM to a size with more memory

Standard_F2s_v2 ran out of memory when setting up the CV container.

Update online-deployment.yml

Update train model to use float instead of double

Fixing the double/float issue

Update train.py

Update train.py
This commit is contained in:
Setu Chokshi 2022-06-02 18:33:40 +08:00
Родитель 42429666e6
Коммит 39930f3ba5
17 изменённых файлов: 468 добавлений и 76 удалений

Просмотреть файл

@ -12,9 +12,3 @@ repos:
hooks:
- id: black
additional_dependencies: ['click==8.0.4']
# Sort imports deterministically
- repo: https://github.com/pycqa/isort
rev: 5.10.1
hooks:
- id: isort

Просмотреть файл

@ -7,11 +7,11 @@ variables:
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- template: ../../../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-online
value: dogs-classifier-online
- name: endpoint_type
value: online
@ -45,16 +45,16 @@ stages:
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
parameters:
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-online-dp
deployment_file: mlops/azureml/deploy/online/online-deployment.yml
deployment_name: dogs-online-dp
deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
parameters:
traffic_allocation: taxi-online-dp=100
traffic_allocation: dogs-online-dp=100
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-online-dp
sample_request: data/taxi-request.json
deployment_name: dogs-online-dp
sample_request: cv/aml-cli-v2/data/sample-request.json
request_type: json

Просмотреть файл

@ -17,12 +17,12 @@ variables:
ado_service_connection_aml_ws: Azure-ARM-Dev
# IaC
resource_group: rg-$(namespace)-$(postfix)$(environment)
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
application_insights: mlw-$(namespace)-$(postfix)$(environment)
key_vault: kv-$(namespace)-$(postfix)$(environment)
container_registry: cr$(namespace)$(postfix)$(environment)
storage_account: st$(namespace)$(postfix)$(environment)
resource_group: azureml-examples-rg
aml_workspace: main
application_insights: maininsights3940d807586b
key_vault: mainkeyvaultaa801245b95c
container_registry: e7fd97c2665e408bab3c1c126576f89e
storage_account: mainstorage5338948febc34
# Terraform
terraform_version: 0.14.7

Просмотреть файл

@ -43,4 +43,4 @@ ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
ENV NCCL_SOCKET_IFNAME="eth0"
ENV NCCL_SOCKET_IFNAME="eth0"

Просмотреть файл

@ -32,4 +32,4 @@
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
</system>
</system>

Просмотреть файл

@ -11,4 +11,4 @@ tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2
pytest==7.1.2

Просмотреть файл

@ -13,6 +13,7 @@ import tempfile
from torch.profiler import profile, record_function, ProfilerActivity
from typing import Any
def markdown_trace_handler(dir_name: str, rank: int = 0):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
@ -51,6 +52,7 @@ def markdown_trace_handler(dir_name: str, rank: int = 0):
def composite_trace_handler(handler_list):
"""This can call multiple trace handlers inside one"""
def _handler_fn(prof) -> None:
for handler in handler_list:
handler(prof)
@ -58,7 +60,9 @@ def composite_trace_handler(handler_list):
return _handler_fn
def export_stack_trace_handler(dir_name: str, rank: int=0, metrics=["self_cuda_time_total"]):
def export_stack_trace_handler(
dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
@ -128,24 +132,23 @@ class PyTorchProfilerHandler:
markdown_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "markdown"
)
trace_handlers.append(markdown_trace_handler(
markdown_logs_export, rank=self.rank
))
trace_handlers.append(
markdown_trace_handler(markdown_logs_export, rank=self.rank)
)
# export stacks in txt
stacks_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "stacks"
)
stack_metrics = [
"self_cpu_time_total"
]
stack_metrics = ["self_cpu_time_total"]
if torch.cuda.is_available():
stack_metrics.append("self_cuda_time_total")
trace_handlers.append(export_stack_trace_handler(
stacks_logs_export, rank=self.rank,
metrics=stack_metrics
))
trace_handlers.append(
export_stack_trace_handler(
stacks_logs_export, rank=self.rank, metrics=stack_metrics
)
)
# export tensorboard
# NOTE: removed due to segfault in pytorch 1.11.0
@ -170,7 +173,7 @@ class PyTorchProfilerHandler:
with_flops=True,
profile_memory=True,
activities=activities,
with_stack=True, # needed to export stacks
with_stack=True, # needed to export stacks
on_trace_ready=trace_handler,
)
self.profiler.start()
@ -204,8 +207,9 @@ class PyTorchProfilerHandler:
"Not stopping profiler as it was not started in the first place."
)
class LogTimeBlock(object):
""" This class should be used to time a code block.
"""This class should be used to time a code block.
The time diff is computed from __enter__ to __exit__.
Example
-------
@ -224,8 +228,8 @@ class LogTimeBlock(object):
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get('step', None)
self.enabled = kwargs.get('enabled', True)
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
@ -233,23 +237,25 @@ class LogTimeBlock(object):
self._logger = logging.getLogger(__name__)
def __enter__(self):
""" Starts the timer, gets triggered at beginning of code block """
"""Starts the timer, gets triggered at beginning of code block"""
if not self.enabled:
return
self.start_time = time.time() # starts "timer"
self.start_time = time.time() # starts "timer"
def __exit__(self, exc_type, value, traceback):
""" Stops the timer and stores accordingly
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
if not self.enabled:
return
run_time = time.time() - self.start_time # stops "timer"
run_time = time.time() - self.start_time # stops "timer"
self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
mlflow.log_metric(self.name + ".time", run_time)
@ -262,18 +268,18 @@ class LogDiskIOBlock(object):
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get('step', None)
self.enabled = kwargs.get('enabled', True)
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
self.process_id = os.getpid() # focus on current process
self.process_id = os.getpid() # focus on current process
self.start_time = None
self.start_disk_counters = None
self._logger = logging.getLogger(__name__)
def __enter__(self):
""" Get initial values, gets triggered at beginning of code block """
"""Get initial values, gets triggered at beginning of code block"""
if not self.enabled:
return
try:
@ -286,9 +292,9 @@ class LogDiskIOBlock(object):
self.logger.critical("import psutil failed, cannot display disk stats.")
def __exit__(self, exc_type, value, traceback):
""" Stops the timer and stores accordingly
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
@ -304,19 +310,32 @@ class LogDiskIOBlock(object):
disk_io_metrics = {}
end_disk_counters = psutil.Process(self.process_id).io_counters()
disk_io_metrics[f"{self.name}.disk.read"] = (end_disk_counters.read_bytes - self.start_disk_counters.read_bytes) / (1024 * 1024)
disk_io_metrics[f"{self.name}.disk.write"] = (end_disk_counters.write_bytes - self.start_disk_counters.write_bytes) / (1024 * 1024)
disk_io_metrics[f"{self.name}.disk.read"] = (
end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
) / (1024 * 1024)
disk_io_metrics[f"{self.name}.disk.write"] = (
end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
) / (1024 * 1024)
self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")
mlflow.log_metrics(disk_io_metrics)
class LogTimeOfIterator():
class LogTimeOfIterator:
"""This class is intended to "wrap" an existing Iterator
and log metrics for each next() call"""
def __init__(self, wrapped_sequence:Any, name:str, enabled:bool=True, async_collector:dict=None):
def __init__(
self,
wrapped_sequence: Any,
name: str,
enabled: bool = True,
async_collector: dict = None,
):
self.wrapped_sequence = wrapped_sequence
self.wrapped_iterator = None
@ -328,7 +347,7 @@ class LogTimeOfIterator():
self.async_collector = async_collector
self._logger = logging.getLogger(__name__)
def __iter__(self):
"""Creates the iterator"""
if self.enabled:

Просмотреть файл

@ -23,15 +23,11 @@ import traceback
from distutils.util import strtobool
import mlflow
from mlflow.models.signature import infer_signature
# the long list of torch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torchvision
from torch.optim import lr_scheduler
from torch.profiler import record_function
from torch.utils.data import DataLoader
@ -39,9 +35,6 @@ from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers.utils import ModelOutput
from azureml.core import Run
# add path to here, if necessary
COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
if COMPONENT_ROOT not in sys.path:
@ -59,6 +52,7 @@ from profiling import (
PyTorchProfilerHandler,
)
torch.set_default_dtype(torch.float64)
class PyTorchDistributedModelTrainingSequence:
"""Generic class to run the sequence for training a PyTorch model
@ -78,9 +72,6 @@ class PyTorchDistributedModelTrainingSequence:
self.labels = []
self.model_signature = None
# Run
self.runid = None
# DISTRIBUTED CONFIG
self.world_size = 1
self.world_rank = 0
@ -289,7 +280,7 @@ class PyTorchDistributedModelTrainingSequence:
def setup_model(self, model):
"""Configures a model for training."""
self.logger.info(f"Setting up model to use device {self.device}")
self.model = model.float().to(self.device)
self.model = model.to(self.device)
# DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
if self.multinode_available:
@ -661,6 +652,13 @@ class PyTorchDistributedModelTrainingSequence:
else:
model_to_save = self.model.to("cpu")
# Save the labels to a text file, one label per line.
# This file will be required to map the output array
# from the API to the labels.
with open("label-mapping.txt", "w") as f:
f.write("\n".join(self.labels))
mlflow.log_artifact("label-mapping.txt")
# MLFLOW: mlflow has a nice method to export the model automatically
# add tags and environment for it. You can then use it in Azure ML
# to register your model to an endpoint.
@ -863,12 +861,6 @@ def run(args):
# sets cuda and distributed config
training_handler.setup_config(args)
# Get Run Id
run = Run.get_context()
run_id = run.get_details()["runId"]
training_handler.runid = run_id
logger.info(f"Run Id: {run_id}")
# PROFILER: here we use a helper class to enable profiling
# see profiling.py for the implementation details
training_profiler = PyTorchProfilerHandler(

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,6 @@
# Managed online deployment for the Stanford Dogs classifier.
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
# Deployment name as declared in this file.
# NOTE(review): the DevOps pipeline passes `dogs-online-dp` as deployment_name — confirm which name takes effect.
name: blue
# Endpoint this deployment belongs to (same value as `name` in the online endpoint definition).
endpoint_name: dogs-classifier-online
# Latest registered version of the model that the training pipeline registers as `resnet-dogs-classifier`.
model: azureml:resnet-dogs-classifier@latest
# Per the commit notes, Standard_F2s_v2 ran out of memory while setting up the CV container.
instance_type: Standard_DS2_v2
instance_count: 1

Просмотреть файл

@ -0,0 +1,4 @@
# Managed online endpoint definition for the Stanford Dogs classifier.
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
# Endpoint name; the online deployment definition references it through `endpoint_name`.
name: dogs-classifier-online
description: Stanford Dogs Classifier
# Authentication mode is set to `key`.
auth_mode: key

Просмотреть файл

@ -0,0 +1,83 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

# <inputs_and_outputs>
# Pipeline-level data inputs; each one is forwarded to the `train` job below.
# NOTE(review): both inputs point at the same `dogs/**` folder, so validation
# runs on the training images — confirm this is intended.
inputs:
  training_images:
    type: uri_folder
    # access mode: ro_mount, rw_mount or download
    mode: download
    path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
    # alternative dataset:
    # path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train/**

  validation_images:
    type: uri_folder
    # access mode: ro_mount, rw_mount or download
    mode: download
    path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
    # alternative dataset:
    # path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
# </inputs_and_outputs>

# <jobs>
settings:
  default_datastore: azureml:workspaceblobstore
  continue_on_step_failure: true

jobs:
  # Single training step, defined by the command component in train.yaml.
  train:
    type: command
    component: file:train.yaml
    compute: azureml:gpu-cluster

    resources:
      # number of nodes
      instance_count: 1

    distribution:
      type: pytorch
      # number of gpus
      process_count_per_instance: 1

    # Environment variables for the job; uncomment the ones you need.
    environment_variables:
      # adjusts the level of info from NCCL tests
      NCCL_DEBUG: "INFO"

      # Use specific topology file for A100:
      # NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml"

      # Relaxed Ordering can greatly help the performance of Infiniband
      # networks in virtualized environments:
      # NCCL_IB_PCI_RELAXED_ORDERING: "1"

      # force disable infiniband (if set to "1"):
      # NCCL_IB_DISABLE: "1"

      # force NET/Plugin off (no rdma/sharp plugin at all):
      # NCCL_NET_PLUGIN: "none"

      # force node-to-node comm to use Socket (slow):
      # NCCL_NET: "Socket"

      # force Socket comm to use eth0 (use NCCL_NET=Socket):
      # NCCL_SOCKET_IFNAME: "eth0"

      # UCX_IB_PCI_RELAXED_ORDERING: "on"
      # UCX_TLS: "tcp"

      # if you have Error: Failed to resolve UCX endpoint...
      # UCX_NET_DEVICES: "eth0"

      # ordering of gpus:
      # CUDA_DEVICE_ORDER: "PCI_BUS_ID"

      # TORCH_DISTRIBUTED_DEBUG: "DETAIL"

    inputs:
      # data inputs
      train_images: ${{parent.inputs.training_images}}
      valid_images: ${{parent.inputs.validation_images}}

      # data loading
      batch_size: 64
      num_workers: 5
      prefetch_factor: 4
      persistent_workers: true
      pin_memory: true
      non_blocking: false

      # model
      model_arch: "resnet18"
      model_arch_pretrained: true

      # training
      num_epochs: 1
      learning_rate: 0.001
      momentum: 0.9

      # profiling
      enable_profiling: false
      # WARNING: this can cause hang at job completion
      # multiprocessing_sharing_strategy: "file_system"

      # Model Registration
      register_model_as: "resnet-dogs-classifier"
# </jobs>

Просмотреть файл

@ -2,7 +2,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nvidia_pytorch
version: 22.04-py3
build:
path: .
path: ../../../data-science/src/environment/
tags:
os: ubuntu
os_version: 20.04
@ -19,5 +19,4 @@ tags:
nccl_test: 2.11.0
azureml-defaults: 1.41.0
mlflow: 1.25.1
transformers: 4.18.0
onnxconverter-common: 1.9.0
transformers: 4.18.0

Просмотреть файл

@ -0,0 +1,123 @@
# Command component that runs train.py to fine-tune an image classifier.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command

description: >-
  Fine-tunes a pre-trained pytorch model for image classification.
  Inputs should be provided as distinct directories containing distinct images
  as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.

name: pytorch_image_classifier
display_name: Image Classification Model (PyTorch)
version: 1.0.4

inputs:
  # data inputs
  train_images:
    type: path
    description: "Path to folder containing training images, stored in subdirectories according to their class."
  valid_images:
    type: path
    description: "Path to folder containing validation images, stored in subdirectories according to their class."

  # data loading
  batch_size:
    type: integer
    min: 1
    optional: true
    description: "Train/valid data loading batch size (default: 64)"
  num_workers:
    type: integer
    optional: true
    description: "Num workers for data loader (default: -1 => all cpus available)"
  prefetch_factor:
    type: integer
    optional: true
    description: "Data loader prefetch factor (default: 2)"
  persistent_workers:
    type: boolean
    optional: true
    description: "Use persistent prefetching workers (default: True)"
  pin_memory:
    type: boolean
    optional: true
    # Previously a copy of the prefetch_factor description.
    description: "Pin memory in data loader (default: True)"
  non_blocking:
    type: boolean
    optional: true
    description: "Use non-blocking transfer to device (default: False)"

  # model
  model_arch:
    type: string
    optional: true
    description: "Which model architecture to use (default: resnet18)"
  model_arch_pretrained:
    type: boolean
    optional: true
    description: "Use pretrained model (default: true)"

  # training
  num_epochs:
    type: integer
    optional: true
    description: "Number of epochs to train for (default: 1)"
  learning_rate:
    type: number
    optional: true
    description: "Learning rate of optimizer (default: 0.001)"
  momentum:
    type: number
    optional: true
    description: "Momentum of optimizer (default: 0.9)"

  # model registration
  register_model_as:
    type: string
    optional: true
    description: "Name to register final model in MLFlow"

  # system parameters
  enable_profiling:
    type: boolean
    default: false
    description: "Enables profiler"
  multiprocessing_sharing_strategy:
    type: string
    optional: true
    description: "Check https://pytorch.org/docs/stable/multiprocessing.html"

outputs:
  checkpoints:
    type: path
    description: "Path to export checkpoints"
  trained_model:
    type: path
    description: "Path to the final model"

# Folder uploaded with the job; train.py is invoked from it by the command below.
code: ../../../data-science/src

# Reference format is azureml:<name>:<version>; the environment definition in
# this change declares name `nvidia_pytorch` and version `22.04-py3`.
# The previous value repeated the `azureml:` prefix.
environment: azureml:nvidia_pytorch:22.04-py3

# Arguments wrapped in [...] are bound to optional inputs.
# enable_profiling has a default, so it is always passed.
command: >-
  python train.py
  --train_images ${{inputs.train_images}}
  --valid_images ${{inputs.valid_images}}
  [--batch_size ${{inputs.batch_size}}]
  [--num_workers ${{inputs.num_workers}}]
  [--prefetch_factor ${{inputs.prefetch_factor}}]
  [--persistent_workers ${{inputs.persistent_workers}}]
  [--pin_memory ${{inputs.pin_memory}}]
  [--non_blocking ${{inputs.non_blocking}}]
  [--model_arch ${{inputs.model_arch}}]
  [--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
  [--num_epochs ${{inputs.num_epochs}}]
  [--learning_rate ${{inputs.learning_rate}}]
  [--momentum ${{inputs.momentum}}]
  --model_output ${{outputs.trained_model}}
  --checkpoints ${{outputs.checkpoints}}
  [--register_model_as ${{inputs.register_model_as}}]
  --enable_profiling ${{inputs.enable_profiling}}
  [--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]

distribution:
  # NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
  type: pytorch
  process_count_per_instance: 1

Просмотреть файл

@ -0,0 +1,62 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-batch
- name: endpoint_type
value: batch
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: CreateBatchEndpoint
displayName: Create/Update Batch Endpoint
jobs:
- job: DeployBatchEndpoint
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: batch-cluster # name must match cluster name in deployment file below
min_instances: 0
max_instances: 5
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
parameters:
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-batch-dp
deployment_file: mlops/azureml/deploy/batch/batch-deployment.yml
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-batch-dp
sample_request: data/taxi-batch.csv
request_type: uri_file #either uri_folder or uri_file

Просмотреть файл

@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Azure DevOps pipeline that submits the Azure ML training pipeline job.
variables:
  # Infrastructure config is selected by source branch.
  # The DEV path was already moved to `../../../../`; PRD now uses the same
  # depth so builds from `main` resolve their template too.
  - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
    # 'main' branch: PRD environment
    - template: ../../../../config-infra-prod.yml
  - ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
    # 'develop' or feature branches: DEV environment
    - template: ../../../../config-infra-dev.yml
  - name: version
    value: aml-cli-v2

trigger:
  - none

pool:
  vmImage: ubuntu-20.04

resources:
  repositories:
    - repository: mlops-templates # Template Repo
      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
      endpoint: github-connection # need to set up and hardcode
      type: github

stages:
  - stage: DeployTrainingPipeline
    displayName: Deploy Training Pipeline
    jobs:
      - job: DeployTrainingPipeline
        steps:
          - checkout: self
            path: s/
          - checkout: mlops-templates
            path: s/templates/
          - template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
          - template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
          - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
          # Debug aid: print the checked-out directory layout.
          # The agent is Ubuntu, so the Windows-only `/f` switch is dropped;
          # `tree` there would treat it as a second directory to list.
          - task: CmdLine@2
            inputs:
              script: |
                echo '$(System.DefaultWorkingDirectory)'
                tree $(System.DefaultWorkingDirectory)
          - template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
            parameters:
              pipeline_file: cv/aml-cli-v2/mlops/azureml/train/pipeline.yaml

Просмотреть файл

@ -0,0 +1,60 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Azure DevOps pipeline that creates/updates the online endpoint, deploys the
# model to it, routes traffic and sends a sample request.
variables:
  # Infrastructure config is selected by source branch.
  # The DEV path was already moved to `../../../../`; PRD now uses the same
  # depth so builds from `main` resolve their template too.
  - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
    # 'main' branch: PRD environment
    - template: ../../../../config-infra-prod.yml
  - ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
    # 'develop' or feature branches: DEV environment
    - template: ../../../../config-infra-dev.yml
  - name: version
    value: aml-cli-v2
  - name: endpoint_name
    value: dogs-classifier-online
  - name: endpoint_type
    value: online

trigger:
  - none

pool:
  vmImage: ubuntu-20.04

resources:
  repositories:
    - repository: mlops-templates # Template Repo
      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
      endpoint: github-connection # need to set up and hardcode
      type: github

stages:
  - stage: CreateOnlineEndpoint
    displayName: Create/Update Online Endpoint
    jobs:
      - job: DeployOnlineEndpoint
        steps:
          - checkout: self
            path: s/
          - checkout: mlops-templates
            path: s/templates/
          - template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
          - template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
          - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
          - template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
            parameters:
              endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
          - template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
            parameters:
              deployment_name: dogs-online-dp
              deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
          # The same deployment name is used for traffic allocation and for the test request.
          - template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
            parameters:
              traffic_allocation: dogs-online-dp=100
          - template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
            parameters:
              deployment_name: dogs-online-dp
              sample_request: cv/aml-cli-v2/data/sample-request.json
              request_type: json