Added the Labels artifacts and cleaned up unused imports

* label mapping to output from the API is stored in the label-mapping.txt. This file can be used together with the output from the API to map the list of floats to the dogs' class. Formatting fixes Adding the DevOps Pipeline Update deploy-model-training-pipeline.yml Updating the template path (Temporary - Fix) Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml for Azure Pipelines Update deploy-model-training-pipeline.yml fixes to the path Hardcoding the variables Temporary fixing the resource names to test the e2e pipeline Update the online deployment pipeline [skip ci] Update deploy-online-endpoint-pipeline.yml for Azure Pipelines Create a sample request for the online api Update deploy-online-endpoint-pipeline.yml Update deploy-online-endpoint-pipeline.yml Update deploy-online-endpoint-pipeline.yml for Azure Pipelines Updated the VM with large memory Standard_F2s_v2 run out of memory when setting up the CV container. Update online-deployment.yml Update train model to use float instead of double Fixing the double/float issue Update train.py Update train.py
2022-06-02 18:33:40 +08:00 · 2022-06-02 18:33:40 +08:00 · 39930f3ba5
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -12,9 +12,3 @@ repos:
    hooks:
    -   id: black
        additional_dependencies: ['click==8.0.4']
-
-  # Sort imports deterministically
-   repo: https://github.com/pycqa/isort
-    rev: 5.10.1
-    hooks:
-      - id: isort
--- a/classical/aml-cli-v2/mlops/devops-pipelines/deploy-online-endpoint-pipeline.yml
+++ b/classical/aml-cli-v2/mlops/devops-pipelines/deploy-online-endpoint-pipeline.yml
@ -7,11 +7,11 @@ variables:
    - template: ../../config-infra-prod.yml
 - ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:  
    # 'develop' or feature branches: DEV environment
-    - template: ../../config-infra-dev.yml
+    - template: ../../../../config-infra-dev.yml
 - name: version
  value: aml-cli-v2 
 - name: endpoint_name
-  value: taxi-fare-online
+  value: dogs-classifier-online
 - name: endpoint_type
  value: online

@ -45,16 +45,16 @@ stages:
      - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
      - template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
        parameters:
-          endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
+          endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
      - template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
        parameters:
-          deployment_name: taxi-online-dp
-          deployment_file: mlops/azureml/deploy/online/online-deployment.yml 
+          deployment_name: dogs-online-dp
+          deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml 
      - template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
        parameters:
-          traffic_allocation: taxi-online-dp=100
+          traffic_allocation: dogs-online-dp=100
      - template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
        parameters:
-          deployment_name: taxi-online-dp
-          sample_request: data/taxi-request.json
+          deployment_name: dogs-online-dp
+          sample_request: cv/aml-cli-v2/data/sample-request.json
          request_type: json
--- a/config-infra-dev.yml
+++ b/config-infra-dev.yml
@ -17,12 +17,12 @@ variables:
  ado_service_connection_aml_ws: Azure-ARM-Dev

  # IaC
-  resource_group: rg-$(namespace)-$(postfix)$(environment)
-  aml_workspace: mlw-$(namespace)-$(postfix)$(environment) 
-  application_insights: mlw-$(namespace)-$(postfix)$(environment)
-  key_vault: kv-$(namespace)-$(postfix)$(environment)
-  container_registry: cr$(namespace)$(postfix)$(environment)
-  storage_account: st$(namespace)$(postfix)$(environment)
+  resource_group: azureml-examples-rg
+  aml_workspace: main 
+  application_insights: maininsights3940d807586b
+  key_vault: mainkeyvaultaa801245b95c
+  container_registry: e7fd97c2665e408bab3c1c126576f89e
+  storage_account: mainstorage5338948febc34

  # Terraform
  terraform_version: 0.14.7
--- a/cv/aml-cli-v2/data-science/src/environment/Dockerfile
+++ b/cv/aml-cli-v2/data-science/src/environment/Dockerfile
@ -43,4 +43,4 @@ ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
 # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
 ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
 ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
-ENV NCCL_SOCKET_IFNAME="eth0"
+ENV NCCL_SOCKET_IFNAME="eth0"
--- a/cv/aml-cli-v2/data-science/src/environment/ndv4-topo.xml
+++ b/cv/aml-cli-v2/data-science/src/environment/ndv4-topo.xml
@ -32,4 +32,4 @@
      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
-</system>
+</system>
--- a/cv/aml-cli-v2/data-science/src/environment/requirements.txt
+++ b/cv/aml-cli-v2/data-science/src/environment/requirements.txt
@ -11,4 +11,4 @@ tqdm==4.64.0
 psutil==5.9.0

 # for unit testing
-pytest==7.1.2
+pytest==7.1.2
--- a/cv/aml-cli-v2/data-science/src/profiling.py
+++ b/cv/aml-cli-v2/data-science/src/profiling.py
@ -13,6 +13,7 @@ import tempfile
 from torch.profiler import profile, record_function, ProfilerActivity
 from typing import Any

+
 def markdown_trace_handler(dir_name: str, rank: int = 0):
    """This handler can be used inside torch.profiler call to output
    tables in markdown format"""
@ -51,6 +52,7 @@ def markdown_trace_handler(dir_name: str, rank: int = 0):

 def composite_trace_handler(handler_list):
    """This can call multiple trace handlers inside one"""
+
    def _handler_fn(prof) -> None:
        for handler in handler_list:
            handler(prof)
@ -58,7 +60,9 @@ def composite_trace_handler(handler_list):
    return _handler_fn


-def export_stack_trace_handler(dir_name: str, rank: int=0, metrics=["self_cuda_time_total"]):
+def export_stack_trace_handler(
+    dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
+):
    """This handler can be used inside torch.profiler call to output
    tables in markdown format"""

@ -128,24 +132,23 @@ class PyTorchProfilerHandler:
            markdown_logs_export = os.path.join(
                self.profiler_output_tmp_dir.name, "markdown"
            )
-            trace_handlers.append(markdown_trace_handler(
-                markdown_logs_export, rank=self.rank
-            ))
+            trace_handlers.append(
+                markdown_trace_handler(markdown_logs_export, rank=self.rank)
+            )

            # export stacks in txt
            stacks_logs_export = os.path.join(
                self.profiler_output_tmp_dir.name, "stacks"
            )
-            stack_metrics = [
-                "self_cpu_time_total"
-            ]
+            stack_metrics = ["self_cpu_time_total"]
            if torch.cuda.is_available():
                stack_metrics.append("self_cuda_time_total")

-            trace_handlers.append(export_stack_trace_handler(
-                stacks_logs_export, rank=self.rank,
-                metrics=stack_metrics
-            ))
+            trace_handlers.append(
+                export_stack_trace_handler(
+                    stacks_logs_export, rank=self.rank, metrics=stack_metrics
+                )
+            )

            # export tensorboard
            # NOTE: removed due to segfault in pytorch 1.11.0
@ -170,7 +173,7 @@ class PyTorchProfilerHandler:
                with_flops=True,
                profile_memory=True,
                activities=activities,
-                with_stack=True, # needed to export stacks
+                with_stack=True,  # needed to export stacks
                on_trace_ready=trace_handler,
            )
            self.profiler.start()
@ -204,8 +207,9 @@ class PyTorchProfilerHandler:
                "Not stopping profiler as it was not started in the first place."
            )

+
 class LogTimeBlock(object):
-    """ This class should be used to time a code block.
+    """This class should be used to time a code block.
    The time diff is computed from __enter__ to __exit__.
    Example
    -------
@ -224,8 +228,8 @@ class LogTimeBlock(object):
        kwargs (dict): any keyword will be added  as properties to metrics for logging (work in progress)
        """
        # kwargs
-        self.step = kwargs.get('step', None)
-        self.enabled = kwargs.get('enabled', True)
+        self.step = kwargs.get("step", None)
+        self.enabled = kwargs.get("enabled", True)

        # internal variables
        self.name = name
@ -233,23 +237,25 @@ class LogTimeBlock(object):
        self._logger = logging.getLogger(__name__)

    def __enter__(self):
-        """ Starts the timer, gets triggered at beginning of code block """
+        """Starts the timer, gets triggered at beginning of code block"""
        if not self.enabled:
            return
-        self.start_time = time.time() # starts "timer"
+        self.start_time = time.time()  # starts "timer"

    def __exit__(self, exc_type, value, traceback):
-        """ Stops the timer and stores accordingly
+        """Stops the timer and stores accordingly
        gets triggered at beginning of code block.
-        
+
        Note:
            arguments are by design for with statements.
        """
        if not self.enabled:
            return
-        run_time = time.time() - self.start_time # stops "timer"
+        run_time = time.time() - self.start_time  # stops "timer"

-        self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
+        self._logger.info(
+            f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
+        )
        mlflow.log_metric(self.name + ".time", run_time)


@ -262,18 +268,18 @@ class LogDiskIOBlock(object):
        kwargs (dict): any keyword will be added  as properties to metrics for logging (work in progress)
        """
        # kwargs
-        self.step = kwargs.get('step', None)
-        self.enabled = kwargs.get('enabled', True)
+        self.step = kwargs.get("step", None)
+        self.enabled = kwargs.get("enabled", True)

        # internal variables
        self.name = name
-        self.process_id = os.getpid() # focus on current process
+        self.process_id = os.getpid()  # focus on current process
        self.start_time = None
        self.start_disk_counters = None
        self._logger = logging.getLogger(__name__)

    def __enter__(self):
-        """ Get initial values, gets triggered at beginning of code block """
+        """Get initial values, gets triggered at beginning of code block"""
        if not self.enabled:
            return
        try:
@ -286,9 +292,9 @@ class LogDiskIOBlock(object):
            self.logger.critical("import psutil failed, cannot display disk stats.")

    def __exit__(self, exc_type, value, traceback):
-        """ Stops the timer and stores accordingly
+        """Stops the timer and stores accordingly
        gets triggered at beginning of code block.
-        
+
        Note:
            arguments are by design for with statements.
        """
@ -304,19 +310,32 @@ class LogDiskIOBlock(object):

        disk_io_metrics = {}
        end_disk_counters = psutil.Process(self.process_id).io_counters()
-        disk_io_metrics[f"{self.name}.disk.read"] = (end_disk_counters.read_bytes - self.start_disk_counters.read_bytes) / (1024 * 1024)
-        disk_io_metrics[f"{self.name}.disk.write"] = (end_disk_counters.write_bytes - self.start_disk_counters.write_bytes) / (1024 * 1024)
+        disk_io_metrics[f"{self.name}.disk.read"] = (
+            end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
+        ) / (1024 * 1024)
+        disk_io_metrics[f"{self.name}.disk.write"] = (
+            end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
+        ) / (1024 * 1024)

-        self._logger.info(f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]")
+        self._logger.info(
+            f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
+        )
        self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")

        mlflow.log_metrics(disk_io_metrics)


-class LogTimeOfIterator():
+class LogTimeOfIterator:
    """This class is intended to "wrap" an existing Iterator
    and log metrics for each next() call"""
-    def __init__(self, wrapped_sequence:Any, name:str, enabled:bool=True, async_collector:dict=None):
+
+    def __init__(
+        self,
+        wrapped_sequence: Any,
+        name: str,
+        enabled: bool = True,
+        async_collector: dict = None,
+    ):
        self.wrapped_sequence = wrapped_sequence
        self.wrapped_iterator = None

@ -328,7 +347,7 @@ class LogTimeOfIterator():
        self.async_collector = async_collector

        self._logger = logging.getLogger(__name__)
-    
+
    def __iter__(self):
        """Creates the iterator"""
        if self.enabled:
--- a/cv/aml-cli-v2/data-science/src/train.py
+++ b/cv/aml-cli-v2/data-science/src/train.py
@ -23,15 +23,11 @@ import traceback
 from distutils.util import strtobool

 import mlflow
-from mlflow.models.signature import infer_signature
-

 # the long list of torch imports
 import torch
 import torch.nn as nn
 import torch.optim as optim
-from torch.autograd import Variable
-import torchvision
 from torch.optim import lr_scheduler
 from torch.profiler import record_function
 from torch.utils.data import DataLoader
@ -39,9 +35,6 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
 from transformers.utils import ModelOutput

-from azureml.core import Run
-
-
 # add path to here, if necessary
 COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
 if COMPONENT_ROOT not in sys.path:
@ -59,6 +52,7 @@ from profiling import (
    PyTorchProfilerHandler,
 )

+torch.set_default_dtype(torch.float64)

 class PyTorchDistributedModelTrainingSequence:
    """Generic class to run the sequence for training a PyTorch model
@ -78,9 +72,6 @@ class PyTorchDistributedModelTrainingSequence:
        self.labels = []
        self.model_signature = None

-        # Run
-        self.runid = None
-
        # DISTRIBUTED CONFIG
        self.world_size = 1
        self.world_rank = 0
@ -289,7 +280,7 @@ class PyTorchDistributedModelTrainingSequence:
    def setup_model(self, model):
        """Configures a model for training."""
        self.logger.info(f"Setting up model to use device {self.device}")
-        self.model = model.float().to(self.device)
+        self.model = model.to(self.device)

        # DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
        if self.multinode_available:
@ -661,6 +652,13 @@ class PyTorchDistributedModelTrainingSequence:
            else:
                model_to_save = self.model.to("cpu")

+            # Save the labels to a plain-text file, one label per line.
+            # This file will be required to map the output array
+            # from the API to the labels.
+            with open("label-mapping.txt", "w") as f:
+                f.write("\n".join(self.labels))
+            mlflow.log_artifact("label-mapping.txt")
+
            # MLFLOW: mlflow has a nice method to export the model automatically
            # add tags and environment for it. You can then use it in Azure ML
            # to register your model to an endpoint.
@ -863,12 +861,6 @@ def run(args):
    # sets cuda and distributed config
    training_handler.setup_config(args)

-    # Get Run Id
-    run = Run.get_context()
-    run_id = run.get_details()["runId"]
-    training_handler.runid = run_id
-    logger.info(f"Run Id: {run_id}")
-
    # PROFILER: here we use a helper class to enable profiling
    # see profiling.py for the implementation details
    training_profiler = PyTorchProfilerHandler(
--- a/cv/aml-cli-v2/data/sample-request.json
+++ b/cv/aml-cli-v2/data/sample-request.json
--- a/cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
+++ b/cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml
@ -0,0 +1,6 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: blue
+endpoint_name: dogs-classifier-online
+model: azureml:resnet-dogs-classifier@latest
+instance_type: Standard_DS2_v2
+instance_count: 1
--- a/cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
+++ b/cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
@ -0,0 +1,4 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
+name: dogs-classifier-online
+description: Stanford Dogs Classifier
+auth_mode: key
--- a/cv/aml-cli-v2/mlops/azureml/train/pipeline.yaml
+++ b/cv/aml-cli-v2/mlops/azureml/train/pipeline.yaml
@ -0,0 +1,83 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+# <inputs_and_outputs>
+inputs:
+  training_images:
+    type: uri_folder
+    mode: download # pick ro_mount, rw_mount or download
+    path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
+    # path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train//**
+  validation_images:
+    type: uri_folder
+    mode: download # pick ro_mount, rw_mount or download
+    path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
+    # path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
+# </inputs_and_outputs>
+
+# <jobs>
+settings:
+  default_datastore: azureml:workspaceblobstore
+  continue_on_step_failure: true
+
+jobs:
+  train:
+    type: command
+    component: file:train.yaml
+    compute: azureml:gpu-cluster
+    resources:
+      instance_count: 1 # number of nodes
+    distribution:
+      type: pytorch
+      process_count_per_instance: 1 # number of gpus
+
+    # NOTE: set env var if needed
+    environment_variables:
+      NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests
+
+      # NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100
+
+      # NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
+      # NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
+      # NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
+      # NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
+      # NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
+
+      # UCX_IB_PCI_RELAXED_ORDERING: "on"
+      # UCX_TLS: "tcp"
+      # UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...
+
+      # CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus
+
+      # TORCH_DISTRIBUTED_DEBUG: "DETAIL"
+
+    inputs:
+      # data inputs
+      train_images: ${{parent.inputs.training_images}}
+      valid_images: ${{parent.inputs.validation_images}}
+
+      # data loading
+      batch_size: 64
+      num_workers: 5
+      prefetch_factor: 4
+      persistent_workers: true
+      pin_memory: true
+      non_blocking: false
+
+      # model
+      model_arch: "resnet18"
+      model_arch_pretrained: true
+
+      # training
+      num_epochs: 1
+      learning_rate: 0.001
+      momentum: 0.9
+
+      # profiling
+      enable_profiling: false
+      # multiprocessing_sharing_strategy: "file_system" # WARNING: this can cause hang at job completion
+
+      # Model Registration
+      register_model_as: "resnet-dogs-classifier"
+
+# </jobs>
--- a/cv/aml-cli-v2/data-science/src/environment/env.yaml
+++ b/cv/aml-cli-v2/data-science/src/environment/env.yaml
@ -2,7 +2,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
 name: nvidia_pytorch
 version: 22.04-py3
 build:
-  path: .
+  path: ../../../data-science/src/environment/
 tags:
  os: ubuntu
  os_version: 20.04
@ -19,5 +19,4 @@ tags:
  nccl_test: 2.11.0
  azureml-defaults: 1.41.0
  mlflow: 1.25.1
-  transformers: 4.18.0
-  onnxconverter-common: 1.9.0
+  transformers: 4.18.0
--- a/cv/aml-cli-v2/mlops/azureml/train/train.yaml
+++ b/cv/aml-cli-v2/mlops/azureml/train/train.yaml
@ -0,0 +1,123 @@
+$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
+type: command
+
+description: >-
+  Fine-tunes a pre-trained pytorch model for image classification.
+  Inputs should be provided as distinct directories containing distinct images
+  as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.
+name: pytorch_image_classifier
+display_name: Image Classification Model (PyTorch)
+version: 1.0.4
+
+inputs:
+  # data loading
+  train_images:
+    type: path
+    description: "Path to folder containing training images, stored in subdirectories according to their class."
+  valid_images:
+    type: path
+    description: "Path to folder containing validation images, stored in subdirectories according to their class."
+
+  # data loading
+  batch_size:
+    type: integer
+    min: 1
+    optional: true
+    description: "Train/valid data loading batch size (default: 64)"
+  num_workers:
+    type: integer
+    optional: true
+    description: "Num workers for data loader (default: -1 => all cpus available)"
+  prefetch_factor:
+    type: integer
+    optional: true
+    description: "Data loader prefetch factor (default: 2)"
+  persistent_workers:
+    type: boolean
+    optional: true
+    description: "Use persistent prefetching workers (default: True)"
+  pin_memory:
+    type: boolean
+    optional: true
+    description: "Use pinned memory in the data loader (default: True)"
+  non_blocking:
+    type: boolean
+    optional: true
+    description: "Use non-blocking transfer to device (default: False)"
+
+  # model
+  model_arch:
+    type: string
+    optional: true
+    description: "Which model architecture to use (default: resnet18)"
+  model_arch_pretrained:
+    type: boolean
+    optional: true
+    description: "Use pretrained model (default: true)"
+
+  # training
+  num_epochs:
+    type: integer
+    optional: true
+    description: "Number of epochs to train for (default: 1)"
+  learning_rate:
+    type: number
+    optional: true
+    description: "Learning rate of optimizer (default: 0.001)"
+  momentum:
+    type: number
+    optional: true
+    description: "Momentum of optimizer (default: 0.9)"
+
+  # model registration
+  register_model_as:
+    type: string
+    optional: true
+    description: "Name to register final model in MLFlow"
+
+  # system parameters
+  enable_profiling:
+    type: boolean
+    default: false
+    description: "Enables profiler"
+  multiprocessing_sharing_strategy:
+    type: string
+    optional: true
+    description: "Check https://pytorch.org/docs/stable/multiprocessing.html"
+
+outputs:
+  checkpoints:
+    type: path
+    description: "Path to export checkpoints"
+  trained_model:
+    type: path
+    description: "Path to the final model"
+
+code: ../../../data-science/src
+
+environment: azureml:nvidia_pytorch:22.04-py3  # azureml:<name>:<version>, matching name/version in env.yaml
+
+command: >-
+  python train.py
+  --train_images ${{inputs.train_images}}
+  --valid_images ${{inputs.valid_images}}
+  [--batch_size ${{inputs.batch_size}}]
+  [--num_workers ${{inputs.num_workers}}]
+  [--prefetch_factor ${{inputs.prefetch_factor}}]
+  [--persistent_workers ${{inputs.persistent_workers}}]
+  [--pin_memory ${{inputs.pin_memory}}]
+  [--non_blocking ${{inputs.non_blocking}}]
+  [--model_arch ${{inputs.model_arch}}]
+  [--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
+  [--num_epochs ${{inputs.num_epochs}}]
+  [--learning_rate ${{inputs.learning_rate}}]
+  [--momentum ${{inputs.momentum}}]
+  --model_output ${{outputs.trained_model}}
+  --checkpoints ${{outputs.checkpoints}}
+  [--register_model_as ${{inputs.register_model_as}}]
+  --enable_profiling ${{inputs.enable_profiling}}
+  [--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]
+distribution:
+  # NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
+  type: pytorch
+  process_count_per_instance: 1
--- a/cv/aml-cli-v2/mlops/devops-pipelines/deploy-batch-endpoint-pipeline.yml
+++ b/cv/aml-cli-v2/mlops/devops-pipelines/deploy-batch-endpoint-pipeline.yml
@ -0,0 +1,62 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+variables:
+- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
+    # 'main' branch: PRD environment
+    - template: ../../config-infra-prod.yml
+- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:  
+    # 'develop' or feature branches: DEV environment
+    - template: ../../config-infra-dev.yml
+- name: version
+  value: aml-cli-v2 
+- name: endpoint_name
+  value: taxi-fare-batch
+- name: endpoint_type
+  value: batch
+
+trigger:
+- none
+
+pool:
+  vmImage: ubuntu-20.04
+
+
+resources:
+  repositories:
+    - repository: mlops-templates  # Template Repo
+      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
+      endpoint: github-connection # need to set up and hardcode
+      type: github
+
+stages:
+- stage: CreateBatchEndpoint
+  displayName: Create/Update Batch Endpoint 
+  jobs:
+    - job: DeployBatchEndpoint
+      steps:
+      - checkout: self
+        path: s/
+      - checkout: mlops-templates
+        path: s/templates/
+      - template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
+      - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
+        parameters:
+          cluster_name: batch-cluster # name must match cluster name in deployment file below
+          min_instances: 0
+          max_instances: 5
+      - template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
+        parameters: 
+          endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
+      - template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
+        parameters:
+          deployment_name: taxi-batch-dp
+          deployment_file: mlops/azureml/deploy/batch/batch-deployment.yml      
+      - template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
+        parameters:
+          deployment_name: taxi-batch-dp
+          sample_request: data/taxi-batch.csv
+          request_type: uri_file #either uri_folder or uri_file 
+
--- a/cv/aml-cli-v2/mlops/devops-pipelines/deploy-model-training-pipeline.yml
+++ b/cv/aml-cli-v2/mlops/devops-pipelines/deploy-model-training-pipeline.yml
@ -0,0 +1,49 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+variables:
+- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
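+    # 'main' branch: PRD environment. NOTE(review): this path goes up two levels while the DEV template below goes up four - confirm which depth is correct.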
+    - template: ../../config-infra-prod.yml
+- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:  
+    # 'develop' or feature branches: DEV environment
+    - template: ../../../../config-infra-dev.yml
+- name: version
+  value: aml-cli-v2 
+
+
+trigger:
+- none
+
+pool:
+  vmImage: ubuntu-20.04
+
+
+resources:
+  repositories:
+    - repository: mlops-templates  # Template Repo
+      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
+      endpoint: github-connection # need to set up and hardcode
+      type: github
+
+stages:
+- stage: DeployTrainingPipeline
+  displayName: Deploy Training Pipeline
+  jobs:
+    - job: DeployTrainingPipeline
+      steps:
+      - checkout: self
+        path: s/
+      - checkout: mlops-templates
+        path: s/templates/
+      - template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
+      - task: CmdLine@2
+        inputs:
+          script: |
+            echo  '$(System.DefaultWorkingDirectory)'
+            tree $(System.DefaultWorkingDirectory) /f
+      - template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
+        parameters: 
+          pipeline_file: cv/aml-cli-v2/mlops/azureml/train/pipeline.yaml
--- a/cv/aml-cli-v2/mlops/devops-pipelines/deploy-online-endpoint-pipeline.yml
+++ b/cv/aml-cli-v2/mlops/devops-pipelines/deploy-online-endpoint-pipeline.yml
@ -0,0 +1,60 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+variables:
+- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
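+    # 'main' branch: PRD environment. NOTE(review): this path goes up two levels while the DEV template below goes up four - confirm which depth is correct.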
+    - template: ../../config-infra-prod.yml
+- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:  
+    # 'develop' or feature branches: DEV environment
+    - template: ../../../../config-infra-dev.yml
+- name: version
+  value: aml-cli-v2 
+- name: endpoint_name
+  value: dogs-classifier-online
+- name: endpoint_type
+  value: online
+
+
+trigger:
+- none
+
+pool:
+  vmImage: ubuntu-20.04
+
+
+resources:
+  repositories:
+    - repository: mlops-templates  # Template Repo
+      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
+      endpoint: github-connection # need to set up and hardcode
+      type: github
+
+stages:
+- stage: CreateOnlineEndpoint
+  displayName: Create/Update Online Endpoint 
+  jobs:
+    - job: DeployOnlineEndpoint
+      steps:
+      - checkout: self
+        path: s/
+      - checkout: mlops-templates
+        path: s/templates/
+      - template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
+      - template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
+      - template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
+        parameters:
+          endpoint_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-endpoint.yml
+      - template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
+        parameters:
+          deployment_name: dogs-online-dp
+          deployment_file: cv/aml-cli-v2/mlops/azureml/deploy/online/online-deployment.yml 
+      - template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
+        parameters:
+          traffic_allocation: dogs-online-dp=100
+      - template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
+        parameters:
+          deployment_name: dogs-online-dp
+          sample_request: cv/aml-cli-v2/data/sample-request.json
+          request_type: json