This commit is contained in:
Cindy Weng 2022-09-09 10:51:46 +01:00
Parents 352d9ebbf7 b5ed0dee2c
Commit b4ef102154
121 changed files with 26020 additions and 166 deletions

13
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@ -0,0 +1,13 @@
blank_issues_enabled: false
contact_links:
- name: MLOps v2 solution accelerators discussions.
url: https://github.com/azure/mlops-v2/discussions
about: >-
Please ask questions and start open-ended discussions here.
Use issues for well-defined work in the solution accelerator repositories.
- name: Azure ML CLI issues.
url: https://github.com/azure/azure-cli-extensions/issues/new/choose
about: Please open issues with the Azure ML CLI extension here.
- name: Azure ML Python SDK issues.
url: https://github.com/azure/azure-sdk-for-python/issues/new/choose
about: Please open issues with the Azure ML Python SDK here.

21
.github/ISSUE_TEMPLATE/repository-issue.md vendored Normal file
View File

@ -0,0 +1,21 @@
---
name: Suggest an enhancement for this repository.
about: Have an idea for improvements to this repository?
title: '[repo] <title>'
labels: ''
assignees: ''
---
## Why?
<!-- What problem is this solving? -->
## How?
<!-- How are you suggesting it gets solved? -->
## Anything else?
<!--
Links? References? Anything that will give us more context about the issue that you are encountering!
-->

25
.github/ISSUE_TEMPLATE/solution-accelerator-request.md vendored Normal file
View File

@ -0,0 +1,25 @@
---
name: Request or suggest a new solution accelerator.
about: Have an idea for a new solution accelerator?
title: '[new accelerator] <title>'
labels: ''
assignees: ''
---
## Why doesn't an existing solution accelerator work?
<!-- Concisely explain why a new solution accelerator is needed. -->
## What work is needed?
<!--
Concisely explain the infrastructure and MLOps work needed.
Include as much detail as possible in how this would fit into the
overall solution accelerator.
-->
## Anything else?
<!--
Links? References? Anything that will give us more context about the issue that you are encountering!
-->

13
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,13 @@
# PR into Azure/mlops-v2
## Checklist
I have:
- [ ] read and followed the contributing guidelines
## Changes
-
fixes #

14
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,14 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
# Opinionated code formatter so you don't have to think about formatting
- repo: https://github.com/psf/black
rev: 21.12b0
hooks:
- id: black
additional_dependencies: ['click==8.0.4']

View File

@ -12,3 +12,5 @@ dependencies:
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -0,0 +1,16 @@
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -53,10 +53,18 @@ def parse_args():
parser.add_argument("--val_data", type=str, help="Path to test dataset")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
args = parser.parse_args()
return args
def log_training_data(df, table_name):
from obs.collector import Online_Collector
collector = Online_Collector(table_name)
collector.batch_collect(df)
def main(args):
'''Read, split, and save datasets'''
@ -93,6 +101,9 @@ def main(args):
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(data, args.table_name)
if __name__ == "__main__":

View File

@ -2,5 +2,5 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.sch
name: blue
endpoint_name: taxi-fare-online
model: azureml:taxi-model@latest
instance_type: Standard_F2s_v2
instance_type: Standard_DS2_v2
instance_count: 1

View File

View File

@ -8,6 +8,8 @@ inputs:
input: #using local data, will create an anonymous data asset
type: uri_folder
path: ../../../data/
enable_monitoring:
table_name: 'taximonitoring'
outputs:
train_data:
@ -35,9 +37,12 @@ jobs:
--train_data ${{outputs.train_data}}
--val_data ${{outputs.val_data}}
--test_data ${{outputs.test_data}}
--enable_monitoring ${{inputs.enable_monitoring}}
environment: azureml:taxi-train-env@latest
inputs:
raw_data: ${{parent.inputs.input}}
enable_monitoring: ${{parent.inputs.enable_monitoring}}
table_name: ${{parent.inputs.table_name}}
outputs:
train_data: ${{parent.outputs.train_data}}
val_data: ${{parent.outputs.val_data}}

View File

@ -11,7 +11,7 @@ variables:
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-batch
value: taxi-batch-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: batch
@ -28,6 +28,8 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateBatchEndpoint

View File

@ -25,10 +25,14 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
- repository: rai-vnext-preview # Template Repo
name: Azure/rai-vnext-preview # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: DeployTrainingPipeline
@ -50,6 +54,7 @@ stages:
build_type: conda
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
enable_monitoring: $(enable_monitoring)
- checkout: rai-vnext-preview
path: s/
- template: register-rai-components.yml
@ -60,3 +65,6 @@ stages:
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)

View File

@ -7,11 +7,11 @@ variables:
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- template: ../../../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-online
value: taxi-online-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: online
@ -29,6 +29,7 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateOnlineEndpoint

View File

@ -0,0 +1,44 @@
name: deploy-batch-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: batch-cluster
size: STANDARD_DS3_V2
min_instances: 0
max_instances: 5
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-endpoint:
needs: [get-config,create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -0,0 +1,29 @@
name: deploy-model-training-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
register-environment:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yml
conda_file: data-science/environment/train-conda.yml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yml
job-name: test
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -0,0 +1,42 @@
name: deploy-online-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
deployment_name: taxi-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: taxi-online-dp=100
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
ap_vm_image: ubuntu-20.04
@ -16,7 +19,7 @@ variables:
training_env_name: credit-training
# Training AzureML Environment conda yaml
training_env_conda_yaml: data-science/environments/train.yml
training_env_conda_yaml: data-science/environment/train.yml
# Name for the training pipeline
training_pipeline_name: credit-training
@ -27,8 +30,12 @@ variables:
training_target_min_nodes: 0
training_target_max_nodes: 4
# Training arguments specification; use azureml:dataset_name:version to reference an AML Dataset for --data_path
training_arguments: --data_path azureml:uci-credit:1
# Training arguments specification
training_arguments: ''
# Training datasets specification
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
training_datasets: uci-credit:1:download:prep
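For illustration only, a minimal Python sketch (not part of the templates; the real parsing lives in the mlops-templates repo) of how a value following the syntax above could be split into its parts:

```python
# Hypothetical helper showing the <name>:<version>:<mode>:<steps> syntax above.
def parse_dataset_spec(spec: str) -> dict:
    name, version, mode, steps = spec.split(":")
    return {
        "name": name,               # AML dataset name
        "version": version,         # dataset version
        "mode": mode,               # e.g. 'download' or 'mount'
        "steps": steps.split("+"),  # pipeline steps that consume the dataset
    }

print(parse_dataset_spec("uci-credit:1:download:prep"))
# {'name': 'uci-credit', 'version': '1', 'mode': 'download', 'steps': ['prep']}
```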
# Name under which the model will be registered
model_name: credit-ci
@ -47,7 +54,7 @@ variables:
batch_env_name: credit-batch
# Batch AzureML Environment conda yaml
batch_env_conda_yaml: data-science/environments/batch.yml
batch_env_conda_yaml: data-science/environment/batch.yml
# Name for the batch scoring pipeline
batch_pipeline_name: credit-batch-scoring
@ -73,3 +80,7 @@ variables:
batch_process_count_per_node: 1
batch_node_count: 1
# Monitoring settings
scoring_table_name: scoringdata
training_table_name: mlmonitoring

View File

@ -0,0 +1,18 @@
name: batch-monitoring
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-defaults==1.38.0
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- azureml-interpret==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
name: mnist-train
channels:
- defaults

View File

@ -0,0 +1,22 @@
name: train
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- fairlearn==0.7.0
- azureml-contrib-fairness==1.38.0
- interpret-community==0.24.1
- interpret-core==0.2.7
- azureml-interpret==1.38.0
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -40,10 +43,10 @@ def parse_args():
def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
model_path = os.path.join(args.model_path, run.parent.id)
explainer_path = os.path.join(args.explainer_path, run.parent.id)
evaluation_path = os.path.join(args.evaluation_path, run.parent.id)
prepared_data_path = args.prepared_data_path
model_path = args.model_path
explainer_path = args.explainer_path
evaluation_path = args.evaluation_path
# Make sure evaluation output path exists
if not os.path.exists(evaluation_path):
@ -111,6 +114,10 @@ def main():
for model_run in Model.list(ws):
if model_run.name == args.model_name:
mdl_path = Model.download(model_run, exist_ok=True)
if 'model.pkl' in mdl_path:
mdl = joblib.load(mdl_path)
else:
mdl = joblib.load(os.path.join(mdl_path, 'model.pkl'))
test_accuracies[model_run.id] = mdl.score(X_test, y_test)

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -17,14 +20,25 @@ ws = run.experiment.workspace
def parse_args():
parser = argparse.ArgumentParser(description="UCI Credit example")
parser.add_argument("--data_path", type=str, default='data/', help="Directory path to training data")
parser.add_argument("--uci-credit", type=str, default='data/', help="Directory path to training data")
parser.add_argument("--prepared_data_path", type=str, default='prepared_data/', help="prepared data directory")
return parser.parse_args()
parser.add_argument("--enable_monitoring", type=str, default="false", help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
return parser.parse_known_args()
def log_training_data(df, table_name):
from obs.collector import Online_Collector
from datetime import timedelta
print("If there is an Authorization error, check your Azure KeyVault secret named kvmonitoringspkey. Terraform might put single quotation marks around the secret. Remove the single quotes and the secret should work.")
collector = Online_Collector(table_name)
df["timestamp"] = [pd.to_datetime('now') - timedelta(days=x) for x in range(len(df))]
collector.batch_collect(df)
def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
args, unknown = parse_args()
prepared_data_path = args.prepared_data_path
# Make sure data output path exists
if not os.path.exists(prepared_data_path):
@ -34,7 +48,7 @@ def main():
mlflow.sklearn.autolog()
# Read training data
df = pd.read_csv(os.path.join(args.data_path, 'credit.csv'))
df = pd.read_csv(os.path.join(args.uci_credit, 'credit.csv'))
random_data = np.random.rand(len(df))
@ -62,5 +76,8 @@ def main():
val.to_csv(VAL_PATH, index=False)
test.to_csv(TEST_PATH, index=False)
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(df, args.table_name)
if __name__ == '__main__':
main()

View File

@ -1,3 +1,5 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import glob
@ -6,33 +8,55 @@ import argparse
import numpy as np
import pandas as pd
import joblib
from datetime import timedelta
from azureml.core.model import Model
model = None
explainer = None
collector = None
def init():
global model, explainer
global model, explainer, collector
print("Started batch scoring by running init()")
parser = argparse.ArgumentParser('batch_scoring')
parser.add_argument('--model_name', type=str, help='Model to use for batch scoring')
parser = argparse.ArgumentParser("batch_scoring")
parser.add_argument("--model_name", type=str, help="Model to use for batch scoring")
parser.add_argument(
"--enable_monitoring", type=str, help="Enable Monitoring", default="false"
)
parser.add_argument("--table_name", type=str, help="Table Name for logging data")
args, _ = parser.parse_known_args()
model_path = Model.get_model_path(args.model_name)
print(f"Model path: {model_path}")
model = joblib.load(os.path.join(model_path, 'model.pkl'))
if "model.pkl" in model_path:
model = joblib.load(model_path)
else:
model = joblib.load(os.path.join(model_path, "model.pkl"))
# load the explainer
explainer_path = os.path.join(Model.get_model_path(args.model_name), "explainer")
# explainer = joblib.load(explainer_path)
if (
args.enable_monitoring.lower() == "true"
or args.enable_monitoring == "1"
or args.enable_monitoring.lower() == "yes"
):
from obs.collector import Online_Collector
collector = Online_Collector(args.table_name)
def run(file_list):
print(f"Files to process: {file_list}")
results = pd.DataFrame(columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"])
results = pd.DataFrame(
columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"]
)
all_results = []
for filename in file_list:
df = pd.read_csv(filename)
@ -55,5 +79,14 @@ def run(file_list):
# result = pd.concat([sno, proba, explanation], axis=1)
result = pd.concat([sno, proba], axis=1)
results = results.append(result)
all_results.append(pd.concat([df, proba], axis=1))
print(f"Batch scored: {filename}")
if collector:
full_results = pd.concat(all_results)
full_results["timestamp"] = [
pd.to_datetime("now") - timedelta(days=x) for x in range(len(full_results))
]
collector.batch_collect(full_results)
return results

View File

@ -1,3 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -30,8 +34,8 @@ def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
model_path = os.path.join(args.model_path, run.parent.id)
prepared_data_path = args.prepared_data_path
model_path = args.model_path
# Make sure model output path exists
if not os.path.exists(model_path):

View File

@ -0,0 +1,31 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
trigger:
- none
pool:
vmImage: $(ap_vm_image)
stages:
- stage: DeployDriftJob
displayName: Deploy Drift Job
jobs:
- job: DeployDriftJob
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/deploy-drift-detection.yml@mlops-templates

View File

@ -24,6 +24,7 @@ resources:
name: Azure/mlops-templates
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployBatchScoringPipeline
@ -41,10 +42,14 @@ stages:
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(batch_env_name)
environment_conda_yaml: $(batch_env_conda_yaml)
build_type: 'conda'
environment_file: $(batch_env_conda_yaml)
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: scoring
- template: templates/${{ variables.version }}/deploy-batch-scoring-pipeline.yml@mlops-templates
parameters:
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates

View File

@ -24,6 +24,7 @@ resources:
name: Azure/mlops-templates # need to change org name from Azure when pulling the template
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
@ -41,7 +42,9 @@ stages:
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
environment_conda_yaml: $(training_env_conda_yaml)
build_type: 'conda'
environment_file: $(training_env_conda_yaml)
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
@ -49,5 +52,7 @@ stages:
parameters:
compute_type: training
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
parameters:
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates

View File

@ -6,17 +6,22 @@ variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2
postfix: 0621
namespace: mlopsv2 # Note: a long namespace can cause storage account creation to fail because storage account names are limited to 24 characters.
postfix: 0659
location: westus
environment: dev
enable_aml_computecluster: true
enable_aml_secure_workspace: true
enable_monitoring: false
# Azure DevOps
ado_service_connection_rg: Azure-ARM-Dev
ado_service_connection_aml_ws: Azure-ARM-Dev
# IaC
# DO NOT TOUCH
# For pipeline reference
resource_group: rg-$(namespace)-$(postfix)$(environment)
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
application_insights: mlw-$(namespace)-$(postfix)$(environment)
@ -24,10 +29,10 @@ variables:
container_registry: cr$(namespace)$(postfix)$(environment)
storage_account: st$(namespace)$(postfix)$(environment)
# Terraform
# For terraform reference
terraform_version: 0.14.7
terraform_workingdir: infrastructure/terraform
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default
terraform_st_key: mlops-tab

View File

@ -7,17 +7,22 @@ variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2
namespace: mlopsv2 # Note: a long namespace can cause storage account creation to fail because storage account names are limited to 24 characters.
postfix: 0518
location: westus
location: westeurope
environment: prod
enable_aml_computecluster: true
enable_aml_secure_workspace: false
enable_monitoring: true
# Azure DevOps
ado_service_connection_rg: Azure-ARM-Dev
ado_service_connection_aml_ws: Azure-ARM-Dev
ado_service_connection_rg: Azure-ARM-Prod
ado_service_connection_aml_ws: Azure-ARM-Prod
# IaC
# DO NOT TOUCH
# For pipeline reference
resource_group: rg-$(namespace)-$(postfix)$(environment)
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
application_insights: mlw-$(namespace)-$(postfix)$(environment)
@ -25,10 +30,10 @@ variables:
container_registry: cr$(namespace)$(postfix)$(environment)
storage_account: st$(namespace)$(postfix)$(environment)
# Terraform
# For terraform reference
terraform_version: 0.14.7
terraform_workingdir: infrastructure
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default
terraform_st_key: mlops-tab

View File

@ -0,0 +1,46 @@
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3
##############################################################################
# NCCL TESTS
##############################################################################
ENV NCCL_TESTS_TAG=v2.11.0
# NOTE: adding gencodes to support K80, M60, V100, A100
RUN mkdir /tmp/nccltests && \
cd /tmp/nccltests && \
git clone -b ${NCCL_TESTS_TAG} https://github.com/NVIDIA/nccl-tests.git && \
cd nccl-tests && \
make \
MPI=1 MPI_HOME=/opt/hpcx/ompi \
NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
CUDA_HOME=/usr/local/cuda && \
cp ./build/* /usr/local/bin && \
rm -rf /tmp/nccltests
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
COPY requirements.txt ./
RUN pip install -r requirements.txt
# RUN python -m pip install azureml-defaults==1.41.0 \
# mlflow==1.25.1 \
# azureml-mlflow==1.41.0 \
# transformers==4.18.0 \
# psutil==5.9.0
# add ndv4-topo.xml
RUN mkdir /opt/microsoft/
ADD ./ndv4-topo.xml /opt/microsoft
# to use on A100, enable env var below in your job
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
# adjusts the level of info from NCCL tests
ENV NCCL_DEBUG="INFO"
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
ENV NCCL_SOCKET_IFNAME="eth0"

View File

@ -0,0 +1,35 @@
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
<system version="1">
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
</system>

View File

@ -0,0 +1,14 @@
# for local testing (cpu)
torchvision==0.12.0
torch==1.11.0
transformers==4.18.0
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
matplotlib==3.5.2
tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2

View File

@ -0,0 +1,20 @@
# NOTE: install these requirements to run the unit tests
# CV packages
torchvision==0.12.0
torch==1.11.0
transformers==4.18.0
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
matplotlib==3.5.2
tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2
pytest-cov==2.12.1
# Fix: force protobuf downgrade to avoid exception
protobuf==3.20.1

View File

@ -0,0 +1,108 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script contains methods to handle inputs for PyTorch model training
using the COCO dataset https://cocodataset.org/.
"""
import glob
import logging
import os
import torchvision
def find_image_subfolder(current_root):
"""Identifies the right level of a directory
that matches with torchvision.datasets.ImageFolder requirements.
In particular, if images are in current_root/foo/bar/category_X/*.jpg
we will want to feed current_root/foo/bar/ to ImageFolder.
Args:
current_root (str): a given directory
Returns:
image_folder (str): the subfolder containing multiple subdirs
"""
if not os.path.isdir(current_root):
raise FileNotFoundError(
f"While identifying the image folder, provided current_root={current_root} is not a directory."
)
sub_directories = glob.glob(os.path.join(current_root, "*"))
if len(sub_directories) == 1:
# let's do it recursively
return find_image_subfolder(sub_directories[0])
if len(sub_directories) == 0:
raise FileNotFoundError(
f"While identifying image folder under {current_root}, we found no content at all. The image folder is empty."
)
else:
return current_root
def build_image_datasets(
train_images_dir: str,
valid_images_dir: str,
input_size: int = 224,
):
"""
Args:
train_images_dir (str): path to the directory containing training images
valid_images_dir (str): path to the directory containing validation images
input_size (int): input size expected by the model
Returns:
train_dataset (torchvision.datasets.VisionDataset): training dataset
valid_dataset (torchvision.datasets.VisionDataset): validation dataset
labels (Dict[str, int]): labels
"""
logger = logging.getLogger(__name__)
# identify the right level of sub directory
train_images_dir = find_image_subfolder(train_images_dir)
logger.info(f"Creating training dataset from {train_images_dir}")
train_transform = torchvision.transforms.Compose(
[
torchvision.transforms.RandomResizedCrop(input_size),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
train_dataset = torchvision.datasets.ImageFolder(
root=train_images_dir, transform=train_transform
)
logger.info(
f"ImageFolder loaded training image from {train_images_dir}: samples={len(train_dataset)}, #classes={len(train_dataset.classes)} classes={train_dataset.classes}"
)
# identify the right level of sub directory
valid_images_dir = find_image_subfolder(valid_images_dir)
logger.info(f"Creating validation dataset from {valid_images_dir}")
valid_transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize(input_size),
torchvision.transforms.CenterCrop(input_size),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
valid_dataset = torchvision.datasets.ImageFolder(
root=valid_images_dir, transform=valid_transform
)
logger.info(
f"ImageFolder loaded validation image from {valid_images_dir}: samples={len(valid_dataset)}, #classes={len(valid_dataset.classes)} classes={valid_dataset.classes}"
)
return train_dataset, valid_dataset, train_dataset.classes
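A minimal usage sketch for the helper above (the directory paths are placeholders, not actual pipeline inputs):

```python
# Hypothetical local folders laid out as <root>/<category>/<image>.jpg
train_ds, valid_ds, labels = build_image_datasets(
    train_images_dir="/tmp/images/train",
    valid_images_dir="/tmp/images/valid",
    input_size=224,
)
print(f"{len(labels)} classes, {len(train_ds)} training samples, {len(valid_ds)} validation samples")
```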

View File

@ -0,0 +1 @@
from .model_loader import MODEL_ARCH_LIST, get_model_metadata, load_model

View File

@ -0,0 +1,88 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up a variety of models from multiple libraries.
"""
MODEL_ARCH_MAP = {
# TorchVision models
"resnet18": {"input_size": 224, "library": "torchvision"},
"resnet34": {"input_size": 224, "library": "torchvision"},
"resnet50": {"input_size": 224, "library": "torchvision"},
"resnet101": {"input_size": 224, "library": "torchvision"},
"resnet152": {"input_size": 224, "library": "torchvision"},
"alexnet": {"input_size": 224, "library": "torchvision"},
"vgg11": {"input_size": 224, "library": "torchvision"},
"vgg11_bn": {"input_size": 224, "library": "torchvision"},
"vgg13": {"input_size": 224, "library": "torchvision"},
"vgg13_bn": {"input_size": 224, "library": "torchvision"},
"vgg16": {"input_size": 224, "library": "torchvision"},
"vgg16_bn": {"input_size": 224, "library": "torchvision"},
"vgg19": {"input_size": 224, "library": "torchvision"},
"vgg19_bn": {"input_size": 224, "library": "torchvision"},
"densenet121": {"input_size": 224, "library": "torchvision"},
"densenet169": {"input_size": 224, "library": "torchvision"},
"densenet201": {"input_size": 224, "library": "torchvision"},
"densenet161": {"input_size": 224, "library": "torchvision"},
# Swin HuggingFace models
"microsoft/swin-tiny-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-small-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-base-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-base-patch4-window7-224-in22k": {
"input_size": 224,
"library": "swin",
},
"microsoft/swin-large-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-large-patch4-window7-224-in22k": {
"input_size": 224,
"library": "swin",
},
"microsoft/swin-base-patch4-window12-384": {"input_size": 384, "library": "swin"},
"microsoft/swin-base-patch4-window12-384-in22k": {
"input_size": 384,
"library": "swin",
},
"microsoft/swin-large-patch4-window12-384": {"input_size": 384, "library": "swin"},
"microsoft/swin-large-patch4-window12-384-in22k": {
"input_size": 384,
"library": "swin",
},
# test model (super small)
"test": {"input_size": 32, "library": "test"},
}
MODEL_ARCH_LIST = list(MODEL_ARCH_MAP.keys())
def get_model_metadata(model_arch: str):
"""Returns the model metadata"""
if model_arch in MODEL_ARCH_MAP:
return MODEL_ARCH_MAP[model_arch]
else:
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
def load_model(model_arch: str, output_dimension: int = 1, pretrained: bool = True):
"""Loads a model from a given arch and sets it up for training"""
if model_arch not in MODEL_ARCH_MAP:
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
if MODEL_ARCH_MAP[model_arch]["library"] == "torchvision":
from .torchvision_models import load_torchvision_model
return load_torchvision_model(model_arch, output_dimension, pretrained)
if MODEL_ARCH_MAP[model_arch]["library"] == "swin":
from .swin_models import load_swin_model
return load_swin_model(model_arch, output_dimension, pretrained)
if MODEL_ARCH_MAP[model_arch]["library"] == "test":
from .test_model import load_test_model
return load_test_model(model_arch, output_dimension, pretrained)
raise NotImplementedError(
f"library {MODEL_ARCH_MAP[model_arch]['library']} is not implemented yet."
)
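A quick illustration of how these helpers fit together (the architecture name and output dimension below are arbitrary examples):

```python
from model import MODEL_ARCH_LIST, get_model_metadata, load_model

arch = "resnet18"
assert arch in MODEL_ARCH_LIST
metadata = get_model_metadata(arch)   # {'input_size': 224, 'library': 'torchvision'}
model = load_model(arch, output_dimension=10, pretrained=True)
print(metadata["input_size"], type(model).__name__)
```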

View File

@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up Swin image classification models from HuggingFace transformers.
"""
import logging
import torch
from transformers import SwinConfig, SwinForImageClassification
def load_swin_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
logger = logging.getLogger(__name__)
logger.info(
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
)
if pretrained:
model = SwinForImageClassification.from_pretrained(model_arch)
else:
model = SwinForImageClassification(config=SwinConfig())
model.classifier = torch.nn.Linear(model.swin.num_features, output_dimension)
return model

View File

@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
Creates a super simple 32x32 CNN model for testing.
From the CIFAR10 tutorial https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
"""
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self, output_dimension):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, output_dimension)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def load_test_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
return Net(output_dimension)

View File

@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up a variety of models from torchvision.models.
"""
import logging
import torch
import torchvision.models as models
def load_torchvision_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
logger = logging.getLogger(__name__)
logger.info(
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
)
if hasattr(models, model_arch):
model = getattr(models, model_arch)(pretrained=pretrained)
else:
raise NotImplementedError(
f"model_arch={model_arch} is not implemented in torchvision model zoo."
)
# see https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
if model_arch.startswith("resnet"):
model.fc = torch.nn.Linear(model.fc.in_features, output_dimension)
elif model_arch == "alexnet":
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
elif model_arch.startswith("vgg"):
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
elif model_arch.startswith("densenet"):
model.classifier = torch.nn.Linear(1024, output_dimension)
else:
raise NotImplementedError(
f"loading model_arch={model_arch} is not implemented yet in our custom code."
)
return model

View File

@ -0,0 +1,390 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides helper code for PyTorch profiling.
"""
import os
import time
import logging
import torch
import mlflow
import tempfile
from torch.profiler import ProfilerActivity
from typing import Any
def markdown_trace_handler(dir_name: str, rank: int = 0):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
def _handler_fn(prof) -> None:
if not os.path.isdir(dir_name):
try:
os.makedirs(dir_name, exist_ok=True)
except Exception:
raise RuntimeError("Can't create directory: " + dir_name)
# Note: trying to identify a unique name for the file
file_name = os.path.join(
dir_name,
f"stacks_rank{rank}_step{prof.step_num}_t{int(time.time() * 1000)}.ms",
)
logging.getLogger(__name__).info(
f"Exporting profiler trace as markdown at {file_name}"
)
# generate report in markdown format
markdown = ["# Pytorch Profiler report"]
markdown.append("## Average by cuda time")
markdown.append("```")
markdown.append(
prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
)
markdown.append("```")
with open(file_name, "w") as out_file:
out_file.write("\n".join(markdown))
return _handler_fn
def composite_trace_handler(handler_list):
"""This can call multiple trace handlers inside one"""
def _handler_fn(prof) -> None:
for handler in handler_list:
handler(prof)
return _handler_fn
def export_stack_trace_handler(
dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
def _handler_fn(prof) -> None:
if not os.path.isdir(dir_name):
try:
os.makedirs(dir_name, exist_ok=True)
except Exception:
raise RuntimeError("Can't create directory: " + dir_name)
# Note: trying to identify a unique name for the file
for metric in metrics:
file_name = os.path.join(
dir_name,
f"stacks_{metric}_rank{rank}_step{prof.step_num}_t{ int(time.time() * 1000)}.txt",
)
logging.getLogger(__name__).info(
f"Exporting {metric} stacks as text at {file_name}"
)
prof.export_stacks(file_name, metric)
return _handler_fn
class PyTorchProfilerHandler:
"""This class handles the initialization and setup of PyTorch profiler"""
def __init__(self, enabled=False, rank=None):
"""Constructor.
Args:
enabled (bool): is profiling enabled?
rank (int): rank of the current process/node
"""
self.logger = logging.getLogger(__name__)
self.enabled = enabled
self.rank = rank
self.profiler_output_tmp_dir = None
self.profiler = None
def start_profiler(self):
"""Setup and start the pytorch profiler.
Returns:
profiler (torch.profiler): the profiler
"""
if self.enabled:
self.profiler_output_tmp_dir = tempfile.TemporaryDirectory()
self.logger.info(
f"Starting profiler (enabled=True) with tmp dir {self.profiler_output_tmp_dir.name}."
)
## profiler activities CPU/GPU
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
self.logger.info(f"Enabling CUDA in profiler.")
activities.append(ProfilerActivity.CUDA)
## handlers for exporting profile at each step
# we're creating a list to export in multiple formats
trace_handlers = []
# export in markdown
markdown_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "markdown"
)
trace_handlers.append(
markdown_trace_handler(markdown_logs_export, rank=self.rank)
)
# export stacks in txt
stacks_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "stacks"
)
stack_metrics = ["self_cpu_time_total"]
if torch.cuda.is_available():
stack_metrics.append("self_cuda_time_total")
trace_handlers.append(
export_stack_trace_handler(
stacks_logs_export, rank=self.rank, metrics=stack_metrics
)
)
# export tensorboard
# NOTE: removed due to segfault in pytorch 1.11.0
# will need to be uncommented for pytorch 1.11.1 which has a fix
# tensorboard_logs_export = os.path.join(
# self.profiler_output_tmp_dir.name, "tensorboard_logs"
# )
# trace_handlers.append(torch.profiler.tensorboard_trace_handler(
# tensorboard_logs_export
# ))
# profiler takes 1 handler, we're composing all above in a single handler
trace_handler = composite_trace_handler(trace_handlers)
# process every single step
profiler_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1)
# initialize profiler
self.profiler = torch.profiler.profile(
schedule=profiler_schedule,
record_shapes=True,
with_flops=True,
profile_memory=True,
activities=activities,
with_stack=True, # needed to export stacks
on_trace_ready=trace_handler,
)
self.profiler.start()
else:
self.logger.info(f"Profiler not started (enabled=False).")
self.profiler = None
return self.profiler
def stop_profiler(self) -> None:
"""Stops the pytorch profiler and logs the outputs using mlflow"""
if self.profiler:
self.logger.info(f"Stopping profiler.")
self.profiler.stop()
# log via mlflow
self.logger.info(
f"MLFLOW log {self.profiler_output_tmp_dir.name} as an artifact."
)
mlflow.log_artifacts(
self.profiler_output_tmp_dir.name, artifact_path="profiler"
)
self.logger.info(
f"Clean up profiler temp dir {self.profiler_output_tmp_dir.name}"
)
self.profiler_output_tmp_dir.cleanup()
else:
self.logger.info(
"Not stopping profiler as it was not started in the first place."
)
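A sketch of how this handler is typically wired into a training loop (the loop body here is a placeholder; the actual call sites are in the training script below):

```python
# Hypothetical wiring of PyTorchProfilerHandler around a training loop.
handler = PyTorchProfilerHandler(enabled=True, rank=0)
profiler = handler.start_profiler()

for step in range(3):          # stand-in for iterating over a DataLoader
    pass                       # forward/backward pass would go here
    if profiler:
        profiler.step()        # advance the profiler schedule

handler.stop_profiler()        # exports traces and logs them as mlflow artifacts
```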
class LogTimeBlock(object):
"""This class should be used to time a code block.
The time diff is computed from __enter__ to __exit__.
Example
-------
```python
with LogTimeBlock("my_perf_metric_name"):
print("(((sleeping for 1 second)))")
time.sleep(1)
```
"""
def __init__(self, name, **kwargs):
"""
Constructs the LogTimeBlock.
Args:
name (str): key for the time difference (for storing as metric)
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
self.start_time = None
self._logger = logging.getLogger(__name__)
def __enter__(self):
"""Starts the timer, gets triggered at beginning of code block"""
if not self.enabled:
return
self.start_time = time.time() # starts "timer"
def __exit__(self, exc_type, value, traceback):
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
if not self.enabled:
return
run_time = time.time() - self.start_time # stops "timer"
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
mlflow.log_metric(self.name + ".time", run_time)
class LogDiskIOBlock(object):
def __init__(self, name, **kwargs):
"""
Constructs the LogDiskUsageBlock.
Args:
name (str): key for the time difference (for storing as metric)
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
self.process_id = os.getpid() # focus on current process
self.start_time = None
self.start_disk_counters = None
self._logger = logging.getLogger(__name__)
def __enter__(self):
"""Get initial values, gets triggered at beginning of code block"""
if not self.enabled:
return
try:
import psutil
self.start_time = time.time()
self.start_disk_counters = psutil.Process(self.process_id).io_counters()
except ModuleNotFoundError:
self._logger.critical("import psutil failed, cannot display disk stats.")
def __exit__(self, exc_type, value, traceback):
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
if not self.enabled:
return
try:
import psutil
except ModuleNotFoundError:
self._logger.critical("import psutil failed, cannot display disk stats.")
return
run_time = time.time() - self.start_time
disk_io_metrics = {}
end_disk_counters = psutil.Process(self.process_id).io_counters()
disk_io_metrics[f"{self.name}.disk.read"] = (
end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
) / (1024 * 1024)
disk_io_metrics[f"{self.name}.disk.write"] = (
end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
) / (1024 * 1024)
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")
mlflow.log_metrics(disk_io_metrics)
class LogTimeOfIterator: # lgtm [py/iter-returns-non-self]
"""This class is intended to "wrap" an existing Iterator
and log metrics for each next() call"""
def __init__(
self,
wrapped_sequence: Any,
name: str,
enabled: bool = True,
async_collector: dict = None,
):
self.wrapped_sequence = wrapped_sequence
self.wrapped_iterator = None
# for metrics
self.enabled = enabled
self.name = name
self.iterator_times = []
self.metrics = {}
self.async_collector = async_collector
self._logger = logging.getLogger(__name__)
def __iter__(self):
"""Creates the iterator"""
if self.enabled:
start_time = time.time()
# if enabled, creates iterator from wrapped_sequence
self.wrapped_iterator = self.wrapped_sequence.__iter__()
self.metrics[f"{self.name}.init"] = time.time() - start_time
# return self
return self
else:
# if disabled, return the iterator from wrapped_sequence
# so that LogTimeOfIterator.__next__() will never get called
return self.wrapped_sequence.__iter__()
def __next__(self):
"""Iterates"""
try:
start_time = time.time()
next_val = self.wrapped_iterator.__next__()
self.iterator_times.append(time.time() - start_time)
return next_val
except StopIteration as e:
self.log_metrics()
raise e
def log_metrics(self):
"""Logs metrics once iterator is finished"""
self.metrics[f"{self.name}.count"] = len(self.iterator_times)
self.metrics[f"{self.name}.time.sum"] = sum(self.iterator_times)
self.metrics[f"{self.name}.time.first"] = self.iterator_times[0]
if self.async_collector is not None:
self._logger.info(f"Async MLFLOW: {self.metrics}")
for k in self.metrics:
self.async_collector[k] = self.metrics[k]
else:
self._logger.info(f"MLFLOW: {self.metrics}")
mlflow.log_metrics(self.metrics)
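A combined usage sketch for the three logging helpers above (the loop and iterable are stand-ins; mlflow picks up or starts a run when the metrics are logged):

```python
# Hypothetical usage of LogTimeBlock, LogDiskIOBlock and LogTimeOfIterator.
with LogTimeBlock("epoch.train", step=1), LogDiskIOBlock("epoch.train", step=1):
    for batch in LogTimeOfIterator(range(100), name="train_loader"):
        pass  # the training step would go here
```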

View File

@ -0,0 +1,960 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script implements a Distributed PyTorch training sequence.
IMPORTANT: We have tagged the code with the following expressions to walk you through
the key implementation details.
Using your editor, search for those strings to get an idea of how to implement:
- DISTRIBUTED : how to implement distributed pytorch
- MLFLOW : how to implement mlflow reporting of metrics and artifacts
- PROFILER : how to implement pytorch profiler
"""
import argparse
import json
import logging
import os
import sys
import time
import traceback
from distutils.util import strtobool
import mlflow
# the long list of torch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.profiler import record_function
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers.utils import ModelOutput
# add path to here, if necessary
COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
if COMPONENT_ROOT not in sys.path:
logging.info(f"Adding {COMPONENT_ROOT} to path")
sys.path.append(str(COMPONENT_ROOT))
from image_io import build_image_datasets
# internal imports
from model import get_model_metadata, load_model
from profiling import (
LogDiskIOBlock,
LogTimeBlock,
LogTimeOfIterator,
PyTorchProfilerHandler,
)
torch.set_default_dtype(torch.float64)
class PyTorchDistributedModelTrainingSequence:
"""Generic class to run the sequence for training a PyTorch model
using distributed training."""
def __init__(self):
"""Constructor"""
self.logger = logging.getLogger(__name__)
# DATA
self.training_data_sampler = None
self.training_data_loader = None
self.validation_data_loader = None
# MODEL
self.model = None
self.labels = []
self.model_signature = None
# DISTRIBUTED CONFIG
self.world_size = 1
self.world_rank = 0
self.local_world_size = 1
self.local_rank = 0
self.multinode_available = False
self.cpu_count = os.cpu_count()
self.device = None
# NOTE: if we're running multiple nodes, this indicates if we're on first node
self.self_is_main_node = True
# TRAINING CONFIGS
self.dataloading_config = None
self.training_config = None
# PROFILER
self.profiler = None
self.profiler_output_tmp_dir = None
#####################
### SETUP METHODS ###
#####################
def setup_config(self, args):
"""Sets internal variables using provided CLI arguments (see build_arguments_parser()).
In particular, sets device(cuda) and multinode parameters."""
self.dataloading_config = args
self.training_config = args
# verify parameter default values
if self.dataloading_config.num_workers is None:
self.dataloading_config.num_workers = 0
if self.dataloading_config.num_workers < 0:
self.dataloading_config.num_workers = self.cpu_count
if self.dataloading_config.num_workers == 0:
self.logger.warning(
"You specified num_workers=0, forcing prefetch_factor to be discarded."
)
self.dataloading_config.prefetch_factor = None
# NOTE: strtobool returns an int, converting to bool explicitly
self.dataloading_config.pin_memory = bool(self.dataloading_config.pin_memory)
self.dataloading_config.non_blocking = bool(
self.dataloading_config.non_blocking
)
# add this switch to test for different strategies
if self.dataloading_config.multiprocessing_sharing_strategy:
torch.multiprocessing.set_sharing_strategy(
self.dataloading_config.multiprocessing_sharing_strategy
)
# DISTRIBUTED: detect multinode config
# depending on the Azure ML distribution.type, different environment variables will be provided
# to configure DistributedDataParallel
self.distributed_backend = args.distributed_backend
if self.distributed_backend == "nccl":
self.world_size = int(os.environ.get("WORLD_SIZE", "1"))
self.world_rank = int(os.environ.get("RANK", "0"))
self.local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", "1"))
self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
self.multinode_available = self.world_size > 1
self.self_is_main_node = self.world_rank == 0
elif self.distributed_backend == "mpi":
# Note: Distributed pytorch package doesn't have MPI built in.
# MPI is only included if you build PyTorch from source on a host that has MPI installed.
self.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))
self.world_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
self.local_world_size = int(
os.environ.get("OMPI_COMM_WORLD_LOCAL_SIZE", "1")
)
self.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", "0"))
self.multinode_available = self.world_size > 1
self.self_is_main_node = self.world_rank == 0
else:
raise NotImplementedError(
f"distributed_backend={self.distributed_backend} is not implemented yet."
)
# Use CUDA if it is available
if not self.training_config.disable_cuda and torch.cuda.is_available():
self.logger.info(
f"Setting up torch.device for CUDA for local gpu:{self.local_rank}"
)
self.device = torch.device(self.local_rank)
else:
self.logger.info(f"Setting up torch.device for cpu")
self.device = torch.device("cpu")
if self.multinode_available:
self.logger.info(
f"Running in multinode with backend={self.distributed_backend} local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
)
# DISTRIBUTED: this is required to initialize the pytorch backend
torch.distributed.init_process_group(
self.distributed_backend,
rank=self.world_rank,
world_size=self.world_size,
)
else:
self.logger.info(
f"Not running in multinode, so not initializing process group."
)
# DISTRIBUTED: in distributed mode, you want to report parameters
# only from main process (rank==0) to avoid conflict
if self.self_is_main_node:
# MLFLOW: report relevant parameters using mlflow
logged_params = {
# log some distribution params
"nodes": int(os.environ.get("AZUREML_NODE_COUNT", "1")),
"instance_per_node": self.world_size
// int(os.environ.get("AZUREML_NODE_COUNT", "1")),
"cuda_available": torch.cuda.is_available(),
"disable_cuda": self.training_config.disable_cuda,
"distributed": self.multinode_available,
"distributed_backend": self.distributed_backend,
# data loading params
"batch_size": self.dataloading_config.batch_size,
"num_workers": self.dataloading_config.num_workers,
"cpu_count": self.cpu_count,
"prefetch_factor": self.dataloading_config.prefetch_factor,
"persistent_workers": self.dataloading_config.persistent_workers,
"pin_memory": self.dataloading_config.pin_memory,
"non_blocking": self.dataloading_config.non_blocking,
"multiprocessing_sharing_strategy": self.dataloading_config.multiprocessing_sharing_strategy,
# training params
"model_arch": self.training_config.model_arch,
"model_arch_pretrained": self.training_config.model_arch_pretrained,
"optimizer.learning_rate": self.training_config.learning_rate,
"optimizer.momentum": self.training_config.momentum,
# profiling params
"enable_profiling": self.training_config.enable_profiling,
}
if not self.training_config.disable_cuda and torch.cuda.is_available():
# add some gpu properties
logged_params["cuda_device_count"] = torch.cuda.device_count()
cuda_device_properties = torch.cuda.get_device_properties(self.device)
logged_params["cuda_device_name"] = cuda_device_properties.name
logged_params["cuda_device_major"] = cuda_device_properties.major
logged_params["cuda_device_minor"] = cuda_device_properties.minor
logged_params[
"cuda_device_memory"
] = cuda_device_properties.total_memory
logged_params[
"cuda_device_processor_count"
] = cuda_device_properties.multi_processor_count
mlflow.log_params(logged_params)
def setup_datasets(
self,
training_dataset: torch.utils.data.Dataset,
validation_dataset: torch.utils.data.Dataset,
labels: list,
):
"""Creates and sets up dataloaders for training/validation datasets."""
self.labels = labels
# DISTRIBUTED: you need to use a DistributedSampler that wraps your dataset
# it will draw a different sample on each node/process to distribute data sampling
self.training_data_sampler = DistributedSampler(
training_dataset, num_replicas=self.world_size, rank=self.world_rank
)
# setting up DataLoader with the right arguments
optional_data_loading_kwargs = {}
if self.dataloading_config.num_workers > 0:
# NOTE: this option _ONLY_ applies if num_workers > 0
# otherwise DataLoader will raise an exception
optional_data_loading_kwargs[
"prefetch_factor"
] = self.dataloading_config.prefetch_factor
optional_data_loading_kwargs[
"persistent_workers"
] = self.dataloading_config.persistent_workers
self.training_data_loader = DataLoader(
training_dataset,
batch_size=self.dataloading_config.batch_size,
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
pin_memory=self.dataloading_config.pin_memory,
# DISTRIBUTED: the sampler needs to be provided to the DataLoader
sampler=self.training_data_sampler,
# all other args
**optional_data_loading_kwargs,
)
# DISTRIBUTED: we don't need a sampler for validation set
# it is used as-is in every node/process
self.validation_data_loader = DataLoader(
validation_dataset,
batch_size=self.dataloading_config.batch_size,
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
pin_memory=self.dataloading_config.pin_memory,
)
if self.self_is_main_node:
# MLFLOW: report relevant parameters using mlflow
mlflow.log_params({"num_classes": len(labels)})
def setup_model(self, model):
"""Configures a model for training."""
self.logger.info(f"Setting up model to use device {self.device}")
self.model = model.to(self.device)
# DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
if self.multinode_available:
self.logger.info(f"Setting up model to use DistributedDataParallel.")
self.model = torch.nn.parallel.DistributedDataParallel(self.model)
# fun: log the number of parameters
params_count = 0
for param in model.parameters():
if param.requires_grad:
params_count += param.numel()
self.logger.info(
"MLFLOW: model_param_count={:.2f} (millions)".format(
round(params_count / 1e6, 2)
)
)
if self.self_is_main_node:
mlflow.log_params({"model_param_count": round(params_count / 1e6, 2)})
return self.model
########################
### TRAINING METHODS ###
########################
def _epoch_eval(self, epoch, criterion):
"""Called during train() for running the eval phase of one epoch."""
with torch.no_grad():
num_correct = 0
num_total_images = 0
running_loss = 0.0
epoch_eval_metrics = {}
# PROFILER: here we're introducing a layer on top of the data loader to capture its performance
# in practice, we'd just use: for images, targets in tqdm(self.validation_data_loader)
for images, targets in LogTimeOfIterator(
tqdm(self.validation_data_loader),
"validation_data_loader",
async_collector=epoch_eval_metrics,
):
with record_function("eval.to_device"):
images = images.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
targets = targets.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
with record_function("eval.forward"):
outputs = self.model(images)
if isinstance(outputs, torch.Tensor):
# if we're training a regular pytorch model (ex: torchvision)
loss = criterion(outputs, targets)
_, predicted = torch.max(outputs.data, 1)
correct = predicted == targets
elif isinstance(outputs, ModelOutput):
# if we're training a HuggingFace model
loss = criterion(outputs.logits, targets)
_, predicted = torch.max(outputs.logits.data, 1)
correct = predicted == targets
else:
# anything else: raise an exception
raise ValueError(
f"outputs from model is type {type(outputs)} which is unknown."
)
running_loss += loss.item() * images.size(0)
num_correct += torch.sum(correct).item()
num_total_images += len(images)
epoch_eval_metrics["running_loss"] = running_loss
epoch_eval_metrics["num_correct"] = num_correct
epoch_eval_metrics["num_samples"] = num_total_images
return epoch_eval_metrics
def _epoch_train(self, epoch, optimizer, scheduler, criterion):
"""Called during train() for running the train phase of one epoch."""
self.model.train()
self.training_data_sampler.set_epoch(epoch)
num_correct = 0
num_total_images = 0
running_loss = 0.0
epoch_train_metrics = {}
# PROFILER: here we're introducing a layer on top of the data loader to capture its performance
# in practice, we'd just use: for images, targets in tqdm(self.training_data_loader)
for images, targets in LogTimeOfIterator(
tqdm(self.training_data_loader),
"training_data_loader",
async_collector=epoch_train_metrics,
):
# PROFILER: record_function will report to the profiler (if enabled)
# here a specific wall time for a given block of code
with record_function("train.to_device"):
images = images.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
targets = targets.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
with record_function("train.forward"):
# zero the parameter gradients
optimizer.zero_grad()
outputs = self.model(images)
# if self.model_signature is None:
# self.model_signature = infer_signature(images, outputs)
if isinstance(outputs, torch.Tensor):
# if we're training a regular pytorch model (ex: torchvision)
loss = criterion(outputs, targets)
_, predicted = torch.max(outputs.data, 1)
correct = predicted == targets
elif isinstance(outputs, ModelOutput):
# if we're training a HuggingFace model
loss = criterion(outputs.logits, targets)
_, predicted = torch.max(outputs.logits.data, 1)
correct = predicted == targets
else:
# anything else: raise an exception
raise ValueError(
f"outputs from model is type {type(outputs)} which is unknown."
)
running_loss += loss.item() * images.size(0)
num_correct += torch.sum(correct).item()
num_total_images += len(images)
# PROFILER: record_function will report to the profiler (if enabled)
# here a specific wall time for a given block of code
with record_function("train.backward"):
loss.backward()
optimizer.step()
scheduler.step()
epoch_train_metrics["running_loss"] = running_loss
epoch_train_metrics["num_correct"] = num_correct
epoch_train_metrics["num_samples"] = num_total_images
return epoch_train_metrics
def train(self, epochs: int = None, checkpoints_dir: str = None):
"""Trains the model.
Args:
epochs (int, optional): if not provided uses internal config
checkpoints_dir (str, optional): path to write checkpoints
"""
if epochs is None:
epochs = self.training_config.num_epochs
# Observe that all parameters are being optimized
optimizer = optim.SGD(
self.model.parameters(),
lr=self.training_config.learning_rate,
momentum=self.training_config.momentum,
nesterov=True,
# weight_decay=1e-4,
)
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
# Decay LR by a factor of 0.1 every 7 epochs
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
# DISTRIBUTED: export checkpoint only from main node
if self.self_is_main_node and checkpoints_dir is not None:
# saving checkpoint before training
self.checkpoint_save(
self.model, optimizer, checkpoints_dir, epoch=-1, loss=0.0
)
# DISTRIBUTED: you'll note that this loop has nothing specifically "distributed"
# that's because most of the changes are in the backend (DistributedDataParallel)
for epoch in range(epochs):
self.logger.info(f"Starting epoch={epoch}")
# we'll collect metrics we want to report for this epoch
epoch_metrics = {}
# start timer for epoch time metric
epoch_train_start = time.time()
# TRAIN: loop on training set and return metrics
epoch_train_metrics = self._epoch_train(
epoch, optimizer, scheduler, criterion
)
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
# stop timer
epoch_metrics["epoch_train_time"] = time.time() - epoch_train_start
# record metrics of interest
epoch_metrics["training_data_loader.count"] = epoch_train_metrics[
"training_data_loader.count"
]
epoch_metrics["training_data_loader.time.sum"] = epoch_train_metrics[
"training_data_loader.time.sum"
]
epoch_metrics["training_data_loader.time.first"] = epoch_train_metrics[
"training_data_loader.time.first"
]
epoch_metrics["epoch_train_loss"] = (
epoch_train_metrics["running_loss"] / epoch_train_metrics["num_samples"]
)
epoch_metrics["epoch_train_acc"] = (
epoch_train_metrics["num_correct"] / epoch_train_metrics["num_samples"]
)
# start timer for epoch time metric
epoch_eval_start = time.time()
# EVAL: run evaluation on validation set and return metrics
epoch_eval_metrics = self._epoch_eval(epoch, criterion)
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
# stop timer
epoch_metrics["epoch_eval_time"] = time.time() - epoch_eval_start
# record metrics of interest
epoch_metrics["validation_data_loader.count"] = epoch_eval_metrics[
"validation_data_loader.count"
]
epoch_metrics["validation_data_loader.time.sum"] = epoch_eval_metrics[
"validation_data_loader.time.sum"
]
epoch_metrics["validation_data_loader.time.first"] = epoch_eval_metrics[
"validation_data_loader.time.first"
]
epoch_metrics["epoch_valid_loss"] = (
epoch_eval_metrics["running_loss"] / epoch_eval_metrics["num_samples"]
)
epoch_metrics["epoch_valid_acc"] = (
epoch_eval_metrics["num_correct"] / epoch_eval_metrics["num_samples"]
)
# start timer for epoch time metric
epoch_utility_start = time.time()
# PROFILER: use profiler.step() to mark a step in training
# the pytorch profiler uses it internally to trigger
# saving the traces in different files
if self.profiler:
self.profiler.step()
# DISTRIBUTED: export checkpoint only from main node
if self.self_is_main_node and checkpoints_dir is not None:
self.checkpoint_save(
self.model,
optimizer,
checkpoints_dir,
epoch=epoch,
loss=epoch_metrics["epoch_valid_loss"],
)
# report metric values in stdout
self.logger.info(f"MLFLOW: metrics={epoch_metrics} epoch={epoch}")
# MLFLOW / DISTRIBUTED: report metrics only from main node
if self.self_is_main_node:
mlflow.log_metrics(epoch_metrics)
mlflow.log_metric(
"epoch_utility_time", time.time() - epoch_utility_start, step=epoch
)
def runtime_error_report(self, runtime_exception):
"""Call this when catching a critical exception.
Will print all sorts of relevant information to the log."""
self.logger.critical(traceback.format_exc())
try:
import psutil
self.logger.critical(f"Memory: {str(psutil.virtual_memory())}")
except ModuleNotFoundError:
self.logger.critical(
"import psutil failed, cannot display virtual memory stats."
)
if torch.cuda.is_available():
self.logger.critical(
"Cuda memory summary:\n"
+ str(torch.cuda.memory_summary(device=None, abbreviated=False))
)
self.logger.critical(
"Cuda memory snapshot:\n"
+ json.dumps(torch.cuda.memory_snapshot(), indent=" ")
)
else:
self.logger.critical(
"Cuda is not available, cannot report cuda memory allocation."
)
def close(self):
"""Tear down potential resources"""
if self.multinode_available:
self.logger.info(
f"Destroying process group on local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
)
# DISTRIBUTED: this will teardown the distributed process group
torch.distributed.destroy_process_group()
else:
self.logger.info(
f"Not running in multinode, so not destroying process group."
)
#################
### MODEL I/O ###
#################
def checkpoint_save(
self, model, optimizer, output_dir: str, epoch: int, loss: float
):
"""Saves model as checkpoint"""
# create output directory just in case
os.makedirs(output_dir, exist_ok=True)
model_output_path = os.path.join(
output_dir, f"model-checkpoint-epoch{epoch}-loss{loss}.pt"
)
self.logger.info(f"Exporting checkpoint to {model_output_path}")
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
self.logger.info(
"Model was distributed, we will checkpoint DistributedDataParallel.module"
)
model_to_save = model.module
else:
model_to_save = model
with record_function("checkpoint.save"):
torch.save(
{
"epoch": epoch,
"model_state_dict": model_to_save.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"loss": loss,
},
model_output_path,
)
def save(
self,
output_dir: str,
name: str = "dev",
register_as: str = None,
) -> None:
# DISTRIBUTED: you want to save the model only from the main node/process
# in data distributed mode, all models should theoretically be the same
if self.self_is_main_node:
self.logger.info(f"Saving model and classes in {output_dir}...")
# create output directory just in case
os.makedirs(output_dir, exist_ok=True)
if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
self.logger.info(
"Model was distributed, we will export DistributedDataParallel.module"
)
model_to_save = self.model.module.to("cpu")
else:
model_to_save = self.model.to("cpu")
# Save the labels to a plain text file.
# This file will be required to map the output array
# from the API to the labels.
with open("label-mapping.txt", "w") as f:
f.write("\n".join(self.labels))
mlflow.log_artifact("label-mapping.txt")
# MLFLOW: mlflow has a nice method to export the model automatically
# add tags and environment for it. You can then use it in Azure ML
# to register your model to an endpoint.
mlflow.pytorch.log_model(
model_to_save,
artifact_path="final_model",
registered_model_name=register_as, # also register it if name is provided
signature=self.model_signature,
)
# MLFLOW: Register the model with the model registry
# This is useful for Azure ML to register your model
# to an endpoint.
if register_as is not None:
mlflow.register_model(
model_uri=f"runs:/{mlflow.active_run().info.run_id}/final_model",
name=register_as,
)
def build_arguments_parser(parser: argparse.ArgumentParser = None):
"""Builds the argument parser for CLI settings"""
if parser is None:
parser = argparse.ArgumentParser()
group = parser.add_argument_group(f"Training Inputs")
group.add_argument(
"--train_images",
type=str,
required=True,
help="Path to folder containing training images",
)
group.add_argument(
"--valid_images",
type=str,
required=True,
help="path to folder containing validation images",
)
group = parser.add_argument_group(f"Training Outputs")
group.add_argument(
"--model_output",
type=str,
required=False,
default=None,
help="Path to write final model",
)
group.add_argument(
"--checkpoints",
type=str,
required=False,
default=None,
help="Path to read/write checkpoints",
)
group.add_argument(
"--register_model_as",
type=str,
required=False,
default=None,
help="Name to register final model in MLFlow",
)
group = parser.add_argument_group(f"Data Loading Parameters")
group.add_argument(
"--batch_size",
type=int,
required=False,
default=64,
help="Train/valid data loading batch size (default: 64)",
)
group.add_argument(
"--num_workers",
type=int,
required=False,
default=None,
help="Num workers for data loader (default: -1 => all cpus available)",
)
group.add_argument(
"--prefetch_factor",
type=int,
required=False,
default=2,
help="Data loader prefetch factor (default: 2)",
)
group.add_argument(
"--persistent_workers",
type=strtobool,
required=False,
default=True,
help="Use persistent prefetching workers (default: True)",
)
group.add_argument(
"--pin_memory",
type=strtobool,
required=False,
default=True,
help="Pin Data loader prefetch factor (default: True)",
)
group.add_argument(
"--non_blocking",
type=strtobool,
required=False,
default=False,
help="Use non-blocking transfer to device (default: False)",
)
group = parser.add_argument_group(f"Model/Training Parameters")
group.add_argument(
"--model_arch",
type=str,
required=False,
default="resnet18",
help="Which model architecture to use (default: resnet18)",
)
group.add_argument(
"--model_arch_pretrained",
type=strtobool,
required=False,
default=True,
help="Use pretrained model (default: true)",
)
group.add_argument(
"--distributed_backend",
type=str,
required=False,
choices=["nccl", "mpi"],
default="nccl",
help="Which distributed backend to use.",
)
group.add_argument(
"--disable_cuda",
type=strtobool,
required=False,
default=False,
help="set True to force use of cpu (local testing).",
)
# DISTRIBUTED: torch.distributed.launch is passing this argument to your script
# it is likely to be deprecated in favor of os.environ['LOCAL_RANK']
# see https://pytorch.org/docs/stable/distributed.html#launch-utility
group.add_argument(
"--local_rank",
type=int,
required=False,
default=None,
help="Passed by torch.distributed.launch utility when running from cli.",
)
group.add_argument(
"--num_epochs",
type=int,
required=False,
default=1,
help="Number of epochs to train for",
)
group.add_argument(
"--learning_rate",
type=float,
required=False,
default=0.001,
help="Learning rate of optimizer",
)
group.add_argument(
"--momentum",
type=float,
required=False,
default=0.9,
help="Momentum of optimizer",
)
group = parser.add_argument_group(f"System Parameters")
group.add_argument(
"--enable_profiling",
type=strtobool,
required=False,
default=False,
help="Enable pytorch profiler.",
)
group.add_argument(
"--multiprocessing_sharing_strategy",
type=str,
choices=torch.multiprocessing.get_all_sharing_strategies(),
required=False,
default=None,
help="Check https://pytorch.org/docs/stable/multiprocessing.html",
)
return parser
def run(args):
"""Run the script using CLI arguments"""
logger = logging.getLogger(__name__)
logger.info(f"Running with arguments: {args}")
# MLFLOW: initialize mlflow (once in entire script)
mlflow.start_run()
# use a handler for the training sequence
training_handler = PyTorchDistributedModelTrainingSequence()
# sets cuda and distributed config
training_handler.setup_config(args)
# PROFILER: here we use a helper class to enable profiling
# see profiling.py for the implementation details
training_profiler = PyTorchProfilerHandler(
enabled=bool(args.enable_profiling),
rank=training_handler.world_rank,
)
# PROFILER: set profiler in trainer to call profiler.step() during training
training_handler.profiler = training_profiler.start_profiler()
# report the time and disk usage during this code block
with LogTimeBlock(
"build_image_datasets", enabled=training_handler.self_is_main_node
), LogDiskIOBlock(
"build_image_datasets", enabled=training_handler.self_is_main_node
):
# build the image folder datasets
train_dataset, valid_dataset, labels = build_image_datasets(
train_images_dir=args.train_images,
valid_images_dir=args.valid_images,
input_size=get_model_metadata(args.model_arch)["input_size"],
)
# creates data loaders from datasets for distributed training
training_handler.setup_datasets(train_dataset, valid_dataset, labels)
with LogTimeBlock("load_model", enabled=training_handler.self_is_main_node):
# creates the model architecture
model = load_model(
args.model_arch,
output_dimension=len(labels),
pretrained=args.model_arch_pretrained,
)
# logging of labels
logger.info(labels)
# sets the model for distributed training
training_handler.setup_model(model)
# runs training sequence
# NOTE: num_epochs is provided in args
try:
training_handler.train(checkpoints_dir=args.checkpoints)
except RuntimeError as runtime_exception: # if runtime error occurs (ex: cuda out of memory)
# then print some runtime error report in the logs
training_handler.runtime_error_report(runtime_exception)
# re-raise
raise runtime_exception
# stops profiling (and save in mlflow)
training_profiler.stop_profiler()
# saves final model
if args.model_output:
training_handler.save(
args.model_output,
name=f"epoch-{args.num_epochs}",
register_as=args.register_model_as,
)
# properly teardown distributed resources
training_handler.close()
# MLFLOW: finalize mlflow (once in entire script)
mlflow.end_run()
logger.info("run() completed")
def main(cli_args=None):
"""Main function of the script."""
# initialize root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s : %(levelname)s : %(name)s : %(message)s"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# create argument parser
parser = build_arguments_parser()
# runs on cli arguments
args = parser.parse_args(cli_args) # if None, runs on sys.argv
# run the run function
run(args)
if __name__ == "__main__":
main()
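
For orientation, here is a minimal local smoke test of the script above. It is a hedged sketch, not part of the repository: it relies on the cli_args hook of main() shown above, assumes it is run from the component's src directory so that `import train` and its sibling modules resolve, and the two dataset folders are hypothetical ImageFolder-style directories.

# sketch: quick CPU-only run of train.py on a tiny local dataset (paths are hypothetical)
import train

train.main(cli_args=[
    "--train_images", "data/train",          # hypothetical folder of class subdirectories
    "--valid_images", "data/valid",          # hypothetical folder of class subdirectories
    "--batch_size", "8",
    "--num_workers", "0",                    # single-process data loading
    "--model_arch", "resnet18",
    "--model_arch_pretrained", "False",      # avoid downloading weights for a smoke test
    "--num_epochs", "1",
    "--disable_cuda", "True",                # force CPU
])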


@ -0,0 +1,20 @@
import os
import sys
import logging
import pytest
import tempfile
from unittest.mock import Mock
SRC_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))
if SRC_ROOT not in sys.path:
logging.info(f"Adding {SRC_ROOT} to path")
sys.path.append(str(SRC_ROOT))
@pytest.fixture()
def temporary_dir():
"""Creates a temporary directory for the tests"""
temp_directory = tempfile.TemporaryDirectory()
yield temp_directory.name
temp_directory.cleanup()


@ -0,0 +1,40 @@
"""
Tests running the model loader for every possible model in the list
"""
import pytest
import torch
# local imports
from model import (
MODEL_ARCH_LIST,
get_model_metadata,
load_model,
)
# IMPORTANT: see conftest.py for fixtures
@pytest.mark.parametrize("model_arch", MODEL_ARCH_LIST)
def test_model_loader(model_arch):
"""Tests src/components/pytorch_image_classifier/model/"""
model_metadata = get_model_metadata(model_arch)
assert model_metadata is not None
assert isinstance(model_metadata, dict)
assert "library" in model_metadata
assert "input_size" in model_metadata
# using pretrained=False to avoid downloading each time we unit test
model = load_model(model_arch, output_dimension=4, pretrained=False)
assert model is not None
assert isinstance(model, torch.nn.Module)
def test_model_loader_failure():
"""Test asking for a model that deosn't exist"""
with pytest.raises(NotImplementedError):
get_model_metadata("not_a_model")
with pytest.raises(NotImplementedError):
load_model("not_a_model", output_dimension=4, pretrained=False)


@ -0,0 +1,219 @@
"""
Tests running the train.py script end-to-end
on a randomly generated (small) dataset.
"""
import os
import sys
import tempfile
import pytest
from unittest.mock import patch
import numpy as np
from PIL import Image
# local imports
import train
from model import MODEL_ARCH_LIST
# IMPORTANT: see conftest.py for fixtures (ex: temporary_dir)
@pytest.fixture()
def random_image_in_folder_classes(temporary_dir):
image_dataset_path = os.path.join(temporary_dir, "image_in_folders")
os.makedirs(image_dataset_path, exist_ok=False)
n_samples = 100
n_classes = 4
for i in range(n_samples):
a = np.random.rand(300, 300, 3) * 255
im_out = Image.fromarray(a.astype("uint8")).convert("RGB")
class_dir = "class_{}".format(i % n_classes)
image_path = os.path.join(
image_dataset_path, class_dir, "random_image_{}.jpg".format(i)
)
os.makedirs(os.path.join(image_dataset_path, class_dir), exist_ok=True)
im_out.save(image_path)
return image_dataset_path
# IMPORTANT: we have to restrict the list of models for unit tests
# because GitHub Actions runners have only 7GB RAM and will OOM
TEST_MODEL_ARCH_LIST = [
"test",
"resnet18",
"resnet34",
]
# NOTE: we only care about patching those specific mlflow methods
# to avoid mlflow initialization conflicts between tests
@patch("mlflow.end_run") # we can have only 1 start/end per test session
@patch("mlflow.register_model") # patched to test model name registration
@patch("mlflow.pytorch.log_model") # patched to test model name registration
@patch("mlflow.log_params") # patched to avoid conflict in parameters
@patch("mlflow.start_run") # we can have only 1 start/end per test session
@pytest.mark.parametrize("model_arch", TEST_MODEL_ARCH_LIST)
def test_components_pytorch_image_classifier_single_node(
mlflow_start_run_mock,
mlflow_log_params_mock,
mlflow_pytorch_log_model_mock,
mlflow_register_model_mock,
mlflow_end_run_mock,
model_arch,
temporary_dir,
random_image_in_folder_classes,
):
"""Tests src/components/pytorch_image_classifier/train.py"""
model_dir = os.path.join(temporary_dir, "pytorch_image_classifier_model")
checkpoints_dir = os.path.join(
temporary_dir, "pytorch_image_classifier_checkpoints"
)
# create test arguments for the script
# fmt: off
script_args = [
"train.py",
"--train_images", random_image_in_folder_classes,
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
"--batch_size", "16",
"--num_workers", "0", # single thread pre-fetching
"--prefetch_factor", "2", # will be discarded if num_workers=0
"--pin_memory", "True",
"--non_blocking", "False",
"--model_arch", model_arch,
"--model_arch_pretrained", "True",
"--num_epochs", "2",
"--model_output", model_dir,
"--checkpoints", checkpoints_dir,
"--register_model_as", "foo",
"--enable_profiling", "True",
]
# fmt: on
# replaces sys.argv with test arguments and run main
with patch.object(sys, "argv", script_args):
train.main()
# those mlflow calls must be unique in the script
mlflow_start_run_mock.assert_called_once()
mlflow_end_run_mock.assert_called_once()
# test all log_params calls
for log_params_call in mlflow_log_params_mock.call_args_list:
args, kwargs = log_params_call
assert isinstance(args[0], dict) # call has only 1 argument, and it's a dict
# test model registration with mlflow.pytorch.log_model()
log_model_calls = mlflow_pytorch_log_model_mock.call_args_list
assert len(log_model_calls) == 1
args, kwargs = log_model_calls[0] # unpack arguments
assert "artifact_path" in kwargs
assert kwargs["artifact_path"] == "final_model"
assert "registered_model_name" in kwargs
assert kwargs["registered_model_name"] == "foo"
# test model registration with mlflow.register_model()
register_model_calls = mlflow_register_model_mock.call_args_list
assert len(register_model_calls) == 1 # call should happen only once
args, kwargs = register_model_calls[0] # unpack arguments
assert "model_uri" in kwargs
assert kwargs["model_uri"].endswith("final_model")
assert "name" in kwargs
assert kwargs["name"] == "foo"
# test checkpoints presence
assert len(os.listdir(checkpoints_dir)) == 3 # 1 before training loop, + 2 epochs
@patch("mlflow.end_run") # we can have only 1 start/end per test session
@patch("mlflow.register_model") # patched to test model name registration
@patch("mlflow.pytorch.log_model") # patched to test model name registration
@patch("mlflow.log_params") # patched to avoid conflict in parameters
@patch("mlflow.start_run") # we can have only 1 start/end per test session
@patch("torch.distributed.init_process_group") # to avoid calling for the actual thing
@patch(
"torch.distributed.destroy_process_group"
) # to avoid calling for the actual thing
@patch(
"torch.nn.parallel.DistributedDataParallel"
) # to avoid calling for the actual thing
@pytest.mark.parametrize("backend", ["nccl", "mpi"])
def test_components_pytorch_image_classifier_second_of_two_nodes(
torch_ddp_mock,
torch_dist_destroy_process_group_mock,
torch_dist_init_process_group_mock,
mlflow_start_run_mock,
mlflow_log_params_mock,
mlflow_pytorch_log_model_mock,
mlflow_register_model_mock,
mlflow_end_run_mock,
backend,
temporary_dir,
random_image_in_folder_classes,
):
"""Tests src/components/pytorch_image_classifier/train.py"""
model_dir = os.path.join(
temporary_dir, "pytorch_image_classifier_distributed_model"
)
torch_ddp_mock.side_effect = lambda model: model # ddp would return just the model
# create some environment variables for the backend
if backend == "nccl":
backend_expected_env = {
# setup as if there were 2 nodes with 1 gpu each
"WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_WORLD_SIZE": "1",
"LOCAL_RANK": "0",
}
elif backend == "mpi":
backend_expected_env = {
# setup as if there were 2 nodes with 1 gpu each
"OMPI_COMM_WORLD_SIZE": "2",
"OMPI_COMM_WORLD_RANK": "1",
"OMPI_COMM_WORLD_LOCAL_SIZE": "1",
"OMPI_COMM_WORLD_LOCAL_RANK": "0",
}
else:
raise Exception("backend {} used for testing is not implemented in script.")
with patch.dict(os.environ, backend_expected_env, clear=False):
# create test arguments for the script
# fmt: off
script_args = [
"train.py",
"--train_images", random_image_in_folder_classes,
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
"--distributed_backend", backend,
"--batch_size", "16",
"--num_workers", "0", # single thread pre-fetching
"--prefetch_factor", "2", # will be discarded if num_workers=0
"--pin_memory", "True",
"--non_blocking", "False",
"--model_arch", "resnet18",
"--model_arch_pretrained", "True",
"--num_epochs", "1",
"--register_model_as", "foo",
]
# fmt: on
# replaces sys.argv with test arguments and run main
with patch.object(sys, "argv", script_args):
train.main()
# those mlflow calls must be unique in the script
mlflow_start_run_mock.assert_called_once()
mlflow_end_run_mock.assert_called_once()
mlflow_pytorch_log_model_mock.assert_not_called() # not saving from non-head nodes
mlflow_register_model_mock.assert_not_called() # not registering from non-head nodes
torch_dist_init_process_group_mock.assert_called_once()
torch_dist_init_process_group_mock.assert_called_with(backend, rank=1, world_size=2)
torch_dist_destroy_process_group_mock.assert_called_once()

File diff hidden because one or more lines are too long


@ -0,0 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: blue
endpoint_name: dogs-classifier-online
model: azureml:resnet-dogs-classifier@latest
instance_type: Standard_DS2_v2
instance_count: 1


@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: dogs-classifier-online
description: Stanford Dogs Classifier
auth_mode: key


@ -0,0 +1,19 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: |
tar xvfm ${{inputs.archive}} --no-same-owner -C ${{outputs.images}} #TODO: Split data into Train-Validate-Test
inputs:
archive:
type: uri_file
path: http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
outputs:
images:
type: uri_folder
mode: upload
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest
compute: azureml:cpu-cluster


@ -0,0 +1,83 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
# <inputs_and_outputs>
inputs:
training_images:
type: uri_folder
mode: download # pick ro_mount, rw_mount or download
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train//**
validation_images: #TODO: Use different datasets for validation
type: uri_folder
mode: download # pick ro_mount, rw_mount or download
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
# </inputs_and_outputs>
# <jobs>
settings:
default_datastore: azureml:workspaceblobstore
continue_on_step_failure: true
jobs:
train:
type: command
component: file:train.yaml
compute: azureml:gpu-cluster
resources:
instance_count: 1 # number of nodes
distribution:
type: pytorch
process_count_per_instance: 1 # number of gpus
# NOTE: set env var if needed
environment_variables:
NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests
# NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100
# NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
# NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
# NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
# NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
# NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
# UCX_IB_PCI_RELAXED_ORDERING: "on"
# UCX_TLS: "tcp"
# UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...
# CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus
# TORCH_DISTRIBUTED_DEBUG: "DETAIL"
inputs:
# data inputs
train_images: ${{parent.inputs.training_images}}
valid_images: ${{parent.inputs.validation_images}}
# data loading
batch_size: 64
num_workers: 5
prefetch_factor: 4
persistent_workers: true
pin_memory: true
non_blocking: false
# model
model_arch: "resnet18"
model_arch_pretrained: true
# training
num_epochs: 1
learning_rate: 0.001
momentum: 0.9
# profiling
enable_profiling: false
# multiprocessing_sharing_strategy: "file_system" # WARNING: this can cause hang at job completion
# Model Registration
register_model_as: "resnet-dogs-classifier"
# </jobs>
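
As a side note, the same pipeline YAML could in principle be submitted from Python with the Azure ML SDK v2 (azure-ai-ml), assuming an installed SDK and a configured workspace; this is a hedged sketch and not part of the repository, which drives the YAML through the MLOps templates instead. The subscription, resource group, and workspace values are placeholders.

# sketch only: submit mlops/azureml/train/pipeline.yaml with the azure-ai-ml SDK (assumed installed)
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient, load_job

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",      # placeholder
    resource_group_name="<resource-group>",   # placeholder
    workspace_name="<aml-workspace>",         # placeholder
)
pipeline_job = load_job("mlops/azureml/train/pipeline.yaml")
returned_job = ml_client.jobs.create_or_update(pipeline_job)
print(returned_job.studio_url)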


@ -0,0 +1,21 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nvidia_pytorch
build:
path: ../../../data-science/environment/
tags:
os: ubuntu
os_version: 20.04
hpcx: 2.10
mpi: openmpi
mpi_version: 4.1.2rc4
ucx: 1.12.0
cuda: 11.6.2
cudnn: 8.4.0.27
nccl: 2.12.10
rdma_core: 36.0
nsight_compute: 2022.1.1.2
nsight_systems: "2022.2.1.31-5fe97ab"
nccl_test: 2.11.0
azureml-defaults: 1.41.0
mlflow: 1.25.1
transformers: 4.18.0


@ -0,0 +1,123 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command
description: >-
Fine-tunes a pre-trained pytorch model for image classification.
Inputs should be provided as distinct directories containing distinct images
as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.
name: pytorch_image_classifier
display_name: Image Classification Model (PyTorch)
version: 1.0.4
inputs:
# data loading
train_images:
type: path
description: "Path to folder containing training images, stored in subdirectories according to their class."
valid_images:
type: path
description: "Path to folder containing validation images, stored in subdirectories according to their class."
# data loading
batch_size:
type: integer
min: 1
optional: true
description: "Train/valid data loading batch size (default: 64)"
num_workers:
type: integer
optional: true
description: "Num workers for data loader (default: -1 => all cpus available)"
prefetch_factor:
type: integer
optional: true
description: "Data loader prefetch factor (default: 2)"
persistent_workers:
type: boolean
optional: true
description: "Use persistent prefetching workers (default: True)"
pin_memory:
type: boolean
optional: true
description: "Pin Data loader prefetch factor (default: True)"
non_blocking:
type: boolean
optional: true
description: "Use non-blocking transfer to device (default: False)"
# model
model_arch:
type: string
optional: true
description: "Which model architecture to use (default: resnet18)"
model_arch_pretrained:
type: boolean
optional: true
description: "Use pretrained model (default: true)"
# training
num_epochs:
type: integer
optional: true
description: "Number of epochs to train for (default: 1)"
learning_rate:
type: number
optional: true
description: "Learning rate of optimizer (default: 0.001)"
momentum:
type: number
optional: true
description: "Momentum of optimizer (default: 0.9)"
# model registration
register_model_as:
type: string
optional: true
description: "Name to register final model in MLFlow"
# system parameters
enable_profiling:
type: boolean
default: false
description: "Enables profiler"
multiprocessing_sharing_strategy:
type: string
optional: true
description: "Check https://pytorch.org/docs/stable/multiprocessing.html"
outputs:
checkpoints:
type: path
description: "Path to export checkpoints"
trained_model:
type: path
description: "Path to the final model"
code: ../../../data-science/src
environment: azureml:nvidia_pytorch@latest
command: >-
python train.py
--train_images ${{inputs.train_images}}
--valid_images ${{inputs.valid_images}}
[--batch_size ${{inputs.batch_size}}]
[--num_workers ${{inputs.num_workers}}]
[--prefetch_factor ${{inputs.prefetch_factor}}]
[--persistent_workers ${{inputs.persistent_workers}}]
[--pin_memory ${{inputs.pin_memory}}]
[--non_blocking ${{inputs.non_blocking}}]
[--model_arch ${{inputs.model_arch}}]
[--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
[--num_epochs ${{inputs.num_epochs}}]
[--learning_rate ${{inputs.learning_rate}}]
[--momentum ${{inputs.momentum}}]
--model_output ${{outputs.trained_model}}
--checkpoints ${{outputs.checkpoints}}
[--register_model_as ${{inputs.register_model_as}}]
--enable_profiling ${{inputs.enable_profiling}}
[--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]
distribution:
# NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
type: pytorch
process_count_per_instance: 1


@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
cluster_tier: dedicated
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
build_type: docker
environment_name: nvidia_pytorch # Not used for docker builds
environment_file: mlops/azureml/train/train-env.yaml
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
environment_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yaml
experiment_name: $(environment)_cv_train_$(Build.SourceBranchName)
display_name: $(environment)_cv_run_$(Build.BuildID)


@ -0,0 +1,61 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: dogs-online-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: online
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateOnlineEndpoint
displayName: Create/Update Online Endpoint
jobs:
- job: DeployOnlineEndpoint
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
parameters:
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
parameters:
deployment_name: dogs-online-dp
deployment_file: mlops/azureml/deploy/online/online-deployment.yml
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
parameters:
traffic_allocation: dogs-online-dp=100
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
parameters:
deployment_name: dogs-online-dp
sample_request: data/sample-request.json
request_type: json


@ -0,0 +1,51 @@
name: deploy-cv-model-training-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-environment:
needs: [get-config,create-compute]
uses: ./.github/workflows/register-environment.yml
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yaml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-dataset:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
file_type: Training
name: stanford_dogs
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-dataset]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yaml
job-name: cv-train
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}


@ -0,0 +1,42 @@
name: deploy-online-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
deployment_name: dogs-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: dogs-online-dp=100
endpoint_name: dogs-classifier-online2
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}


@ -0,0 +1,43 @@
variables:
ap_vm_image: ubuntu-20.04
## Training pipeline settings
# Training dataset settings
training_dataset_name: dogs-imgs
training_dataset_description: 'Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
training_dataset_local_path: data/training-imgs/
training_dataset_path_on_datastore: dogs-imgs/
training_dataset_type: local
training_dataset_storage_url: 'http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar'
labels_dataset_name: dogs-labels
labels_dataset_description: 'Labels for Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
labels_dataset_local_path: data/training/
labels_dataset_path_on_datastore: dogs-labels/
labels_dataset_type: local
# Training AzureML Environment settings
training_env_name: nvidia_pytorch
training_env_path: data-science/environment/training/
# Compute target for pipeline
training_target: gpu-cluster
training_target_sku: Standard_NC6
training_target_min_nodes: 0
training_target_max_nodes: 1
# Name for the training pipeline
training_pipeline_name: resnet-dogs-training-pipeline
training_experiment_name: resnet-dogs-training
# Training arguments specification
training_arguments: --epochs 2 --batch-size 64 --training-mode feature-extraction
# Training datasets specification
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
training_datasets: dogs-labels:1:download:prep dogs-imgs:latest:mount:train+eval
# Name under which the model will be registered
model_name: resnet-dogs-classifier
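
As an aside, a small hedged sketch of how the training_datasets specification above could be parsed, assuming the <name>:<version>:<mode>:<steps> syntax documented in the comment; the helper name is illustrative only and does not exist in the repository.

def parse_dataset_specs(spec: str):
    """Parse 'name:version:mode:step1+step2 ...' entries into dicts (illustrative only)."""
    entries = []
    for token in spec.split():
        name, version, mode, steps = token.split(":")
        entries.append({
            "name": name,
            "version": version,
            "mode": mode,                 # e.g. download or mount
            "steps": steps.split("+"),    # pipeline steps that consume the dataset
        })
    return entries

# parse_dataset_specs("dogs-labels:1:download:prep dogs-imgs:latest:mount:train+eval")
# -> [{'name': 'dogs-labels', 'steps': ['prep'], ...}, {'name': 'dogs-imgs', 'steps': ['train', 'eval'], ...}]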


@ -0,0 +1,44 @@
{
"name": "pytorch_manual",
"environmentVariables": {
"EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
},
"python": {
"userManagedDependencies": false,
"interpreterPath": "python",
"condaDependenciesFile": null,
"baseCondaEnvironment": null
},
"docker": {
"enabled": true,
"baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04",
"baseDockerfile": null,
"sharedVolumes": true,
"shmSize": "2g",
"arguments": [],
"baseImageRegistry": {
"address": null,
"username": null,
"password": null,
"registryIdentity": null
},
"platform": {
"os": "Linux",
"architecture": "amd64"
}
},
"spark": {
"repositories": [],
"packages": [],
"precachePackages": true
},
"databricks": {
"mavenLibraries": [],
"pypiLibraries": [],
"rcranLibraries": [],
"jarLibraries": [],
"eggLibraries": []
},
"r": null,
"inferencingStackVersion": null
}


@ -0,0 +1,17 @@
name: pytorch_manual
channels:
- conda-forge
dependencies:
- python=3.7
- pip=20.2.4
- pip:
- pandas==1.3.5
- scikit-learn==1.0.2
- matplotlib==3.5.2
- msrest==0.6.21
- mlflow==1.27.0
- azureml-core==1.43.0
- azureml-defaults==1.43.0
- azureml-mlflow==1.43.0
- torch==1.11.0
- torchvision==0.12.0



@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import argparse
import pandas as pd
import mlflow
import torch
from sklearn import metrics as sklmetrics
import matplotlib.pyplot as plt
from model import CustomImageDataset, load_model
def main(labels_path, images_path, model_path, model_name, output_dir, deploy_flag_output):
# Load model
model_file = os.path.join(model_path, f'{model_name}.pth')
net = load_model(path=model_file)
# Load test data
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_test.csv'))
labels_data = labels_data.set_index('path').squeeze() # Convert to appropriate format for CustomImageDataset
testset = CustomImageDataset(images_path, labels_data, mode='test')
print(f'Test size: {len(testset)}')
# Generate predictions
predictions = get_predictions(testset, net)
predictions.to_csv(os.path.join(output_dir, 'predictions.csv'), index=False)
# Evaluation metrics
metrics = evaluate(predictions.label_real, predictions.label_predicted)
for k, v in metrics.items():
mlflow.log_metric(k, v)
# Plot confusion matrix
plt.rcParams["figure.figsize"] = [40, 40]
sklmetrics.ConfusionMatrixDisplay.from_predictions(predictions.label_real, predictions.label_predicted, cmap='YlGnBu')
plt.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
# Promote model
deploy_flag = is_new_model_better()
mlflow.log_metric("deploy flag", deploy_flag)
with open(deploy_flag_output, 'w') as f:
f.write('%d' % int(deploy_flag))
deploy_flag_str = 'not ' if not deploy_flag else ''
print(f'Finished. Model will {deploy_flag_str}be registered.')
def get_predictions(dataset, net, batch_size=256):
testloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
predictions = pd.DataFrame()
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
for data in testloader:
images, classes, paths = data
# calculate outputs by running images through the network
outputs = net(images)
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(outputs.data, 1)
predictions_batch = pd.DataFrame({
'path': paths,
'label_real': dataset.get_labels(classes),
'label_predicted': dataset.get_labels(predicted)
})
predictions = pd.concat([predictions, predictions_batch], ignore_index=True)
return predictions
def evaluate(labels_real, labels_pred):
metrics = {
'accuracy': sklmetrics.accuracy_score(labels_real, labels_pred),
'mcc': sklmetrics.matthews_corrcoef(labels_real, labels_pred),
'recall_micro': sklmetrics.recall_score(labels_real, labels_pred, average='micro'),
'recall_macro': sklmetrics.recall_score(labels_real, labels_pred, average='macro'),
'recall_weighted': sklmetrics.recall_score(labels_real, labels_pred, average='weighted'),
'precision_micro': sklmetrics.precision_score(labels_real, labels_pred, average='micro'),
'precision_macro': sklmetrics.precision_score(labels_real, labels_pred, average='macro'),
'precision_weighted': sklmetrics.precision_score(labels_real, labels_pred, average='weighted'),
'f1_micro': sklmetrics.f1_score(labels_real, labels_pred, average='micro'),
'f1_macro': sklmetrics.f1_score(labels_real, labels_pred, average='macro'),
'f1_weighted': sklmetrics.f1_score(labels_real, labels_pred, average='weighted'),
}
return metrics
def is_new_model_better():
return True # For simplicity
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to test data (output from prep step)')
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
parser.add_argument('--model_path', type=str, help='Model output directory')
parser.add_argument('--evaluation_path', type=str, default='evaluation_results/', help="Evaluation results output directory")
parser.add_argument('--deploy_flag', type=str, help='Output path for the deploy flag indicating whether to deploy the model')
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == '__main__':
args = parse_args()
main(
labels_path=args.prepared_data_path,
images_path=args.dogs_imgs,
model_path=args.model_path,
model_name='model',
output_dir=args.evaluation_path,
deploy_flag_output=args.deploy_flag
)


@ -0,0 +1,5 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from .dataset import CustomImageDataset
from .net import load_model


@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import PIL
from torch.utils.data import Dataset
import torchvision.transforms as transforms
class CustomImageDataset(Dataset):
def __init__(self, img_dir, img_labels, mode='test'):
self.img_dir = img_dir
self.img_labels = img_labels
self.classes = img_labels.unique().tolist()
self.mode = mode
if self.mode == 'train':
self.transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor()
])
else:
self.transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
])
def __len__(self):
return len(self.img_labels)
def __getitem__(self, idx):
img_path = self.img_labels.index[idx]
image = PIL.Image.open(os.path.join(self.img_dir, img_path)).convert('RGB')
image = self.transform(image)
img_label = self.img_labels[idx]
img_class = self.classes.index(img_label)
return image, img_class, img_path
def nclasses(self):
return len(self.classes)
def get_labels(self, indexes):
return [self.classes[i] for i in indexes]
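
For reference, a short hedged sketch of how CustomImageDataset is meant to be fed, mirroring the set_index('path').squeeze() pattern used by the train and evaluate scripts; the file and folder paths here are hypothetical.

# sketch: img_labels is a pandas Series indexed by relative image path
import pandas as pd

labels = pd.read_csv("labels_train.csv").set_index("path").squeeze()  # path -> label
dataset = CustomImageDataset("path/to/images", labels, mode="train")
image, class_idx, img_path = dataset[0]
print(dataset.nclasses(), dataset.get_labels([class_idx]))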


@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
def load_model(path=None, num_classes=2, mode='finetuning', learning_rate=0.001, momentum=0.9):
# Load existing model
if path:
print('Loading existing model from path...')
model_data = torch.load(path)
model = models.resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, model_data['fc.weight'].shape[0])
model.load_state_dict(model_data)
return model
# Initialize new model
assert mode in ['finetuning', 'feature-extraction']
model = models.resnet18(pretrained=True)
if mode == 'feature-extraction': # Freeze layers
for param in model.parameters():
param.requires_grad = False
model.fc = nn.Linear(model.fc.in_features, num_classes)
criterion = nn.CrossEntropyLoss()
params_optim = model.parameters() if mode == 'finetuning' else model.fc.parameters() if mode == 'feature-extraction' else None
optimizer = optim.SGD(params_optim, lr=learning_rate, momentum=momentum)
return model, criterion, optimizer
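
Worth noting: load_model has two return shapes, a bare model when reloading from a checkpoint path and a (model, criterion, optimizer) tuple when initializing a new one. A hedged usage sketch under that assumption (the file name is illustrative):

# new model: returns (model, criterion, optimizer)
net, criterion, optimizer = load_model(num_classes=120, mode="feature-extraction")

# reload from a saved state_dict: returns only the model
# torch.save(net.state_dict(), "model.pth")
restored_net = load_model(path="model.pth")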


@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import argparse
import numpy as np
import pandas as pd
import mlflow
def main(raw_data_path, prepared_data_path):
print(f'Raw data path: {raw_data_path}')
print(f'Output data path: {prepared_data_path}')
# Read data
labels_data = pd.read_csv(os.path.join(raw_data_path, 'image_labels.csv'))
mlflow.log_metric('total_labels', len(labels_data))
# Split data into train and test datasets
random_data = np.random.rand(len(labels_data))
labels_train = labels_data[random_data < 0.7]
labels_test = labels_data[random_data >= 0.7]
print(labels_train)
mlflow.log_metric('train_size', labels_train.shape[0])
mlflow.log_metric('test_size', labels_test.shape[0])
labels_train.to_csv(os.path.join(prepared_data_path, 'labels_train.csv'), index=False)
labels_test.to_csv(os.path.join(prepared_data_path, 'labels_test.csv'), index=False)
print('Finished.')
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument("--dogs-labels", type=str, required=True, help="Path to labels")
parser.add_argument("--prepared_data_path", type=str, required=True, help="Path for prepared data")
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == "__main__":
args = parse_args()
main(
raw_data_path=args.dogs_labels,
prepared_data_path=args.prepared_data_path
)
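
A local-run sketch of the prep script above, with hypothetical folder names; parse_known_args lets the pipeline pass extra arguments without breaking the step:

args = parse_args([
    '--dogs-labels', 'data/raw',              # hypothetical folder containing image_labels.csv
    '--prepared_data_path', 'data/prepared',  # hypothetical output folder
    '--some-extra-flag', '1',                 # reported as unrecognized and ignored
])
main(raw_data_path=args.dogs_labels, prepared_data_path=args.prepared_data_path)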


@ -0,0 +1,118 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Example adapted from:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
import os
import argparse
import pandas as pd
import mlflow
import torch
from model import CustomImageDataset, load_model
def main(labels_path, images_path,
model_name, output_dir,
mode, epochs, batch_size, learning_rate, momentum):
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_train.csv'))
labels_data = labels_data.set_index('path').squeeze()  # Convert to a Series indexed by image path, the format CustomImageDataset expects
trainset = CustomImageDataset(images_path, labels_data, mode='train')
print(f'Train size: {len(trainset)}')
print("Training...")
net = train(
trainset,
mode=mode,
epochs=epochs,
batch_size=batch_size,
learning_rate=learning_rate,
momentum=momentum
)
print('Finished training')
print(f"Saving model in folder {output_dir}...")
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, f'{model_name}.pth')
torch.save(net.state_dict(), model_path)
print('Finished.')
def train(dataset, mode='finetuning', epochs=2, batch_size=64, learning_rate=0.001, momentum=0.9, stats_freq=25):
trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)
net, criterion, optimizer = load_model(
num_classes=dataset.nclasses(),
mode=mode,
learning_rate=learning_rate,
momentum=momentum
)
global_iter = 0
for epoch in range(epochs): # loop over the dataset multiple times
print(f'----\nEpoch {epoch}\n----\n')
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, classes, paths]
inputs, classes, paths = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, classes)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if (i + 1) % stats_freq == 0:
mlflow.log_metric('loss', running_loss / stats_freq, step=global_iter)
mlflow.log_metric('epoch', epoch, step=global_iter)
running_loss = 0.0
global_iter += 1
return net
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to training data (output from prep step)')
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
parser.add_argument('--model_path', type=str, help='Model output directory')
parser.add_argument('--training-mode', type=str, default='feature-extraction', choices=['finetuning', 'feature-extraction'])
parser.add_argument('--epochs', type=int, default=2)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--learning-rate', type=float, default=0.001)
parser.add_argument('--momentum', type=float, default=0.9)
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == '__main__':
args = parse_args()
main(
labels_path=args.prepared_data_path,
images_path=args.dogs_imgs,
model_name='model',
output_dir=args.model_path,
mode=args.training_mode,
epochs=args.epochs,
batch_size=args.batch_size,
learning_rate=args.learning_rate,
momentum=args.momentum
)
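
A local smoke-test sketch mirroring the __main__ wiring above, with hypothetical paths and a single epoch:

args = parse_args([
    '--prepared_data_path', 'data/prepared',  # must contain labels_train.csv from the prep step
    '--dogs-imgs', 'data/images',
    '--model_path', 'outputs',
    '--epochs', '1',
])
main(
    labels_path=args.prepared_data_path,
    images_path=args.dogs_imgs,
    model_name='model',
    output_dir=args.model_path,
    mode=args.training_mode,
    epochs=args.epochs,
    batch_size=args.batch_size,
    learning_rate=args.learning_rate,
    momentum=args.momentum
)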


One file's diff is not shown because of its large size.


@ -0,0 +1,105 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
trigger:
- none
pool:
vmImage: $(ap_vm_image)
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
# Setup
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
# Environment
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
build_type: folder
environment_file: $(training_env_path)
# Compute
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
parameters:
compute_type: training
# Datasets (images + labels)
# Images dataset
- task: Bash@3
displayName: 'Download data'
inputs:
targetType: inline
script: |
mkdir -p $(training_dataset_local_path)
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
# Labels dataset
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
datasetName: $(labels_dataset_name)
datasetDescription: $(labels_dataset_description)
datasetLocalPath: $(labels_dataset_local_path)
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
datasetType: $(labels_dataset_type)
# Deploy training pipeline
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- task: Bash@3
name: export_pipeline_id
displayName: "Export Pipeline ID"
inputs:
targetType: "inline"
script: |
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
# Run training
- job: invoke_pipeline
displayName: 'Invoke pipeline'
pool: server
timeoutInMinutes: 0
dependsOn: DeployTrainingPipeline
variables:
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: 'Invoke AML Pipeline'
inputs:
azureSubscription: '$(ado_service_connection_aml_ws)'
PipelineId: '$(PIPELINE_ID)'
ExperimentName: '$(training_experiment_name)'

Binary files changed (not rendered):
documentation/architecturepattern/AzureML_CML_Architecture.png (new file, 144 KiB)
documentation/architecturepattern/AzureML_CML_Architecture.vsdx (new file)
Other binary images changed (before/after sizes: 323 KiB to 147 KiB; none to 145 KiB; 339 KiB to none)


@ -8,7 +8,7 @@ param env string
param tags object = {
Owner: 'mlops-v2'
Project: 'mlops-v2'
Environment: 'dev'
Environment: env
Toolkit: 'bicep'
Name: prefix
}
@ -28,7 +28,7 @@ module st './modules/storage_account.bicep' = {
name: 'st'
scope: resourceGroup(rg.name)
params: {
baseName: '${prefix}${postfix}${env}'
baseName: '${uniqueString(rg.id)}${env}'
location: location
tags: tags
}
@ -61,7 +61,7 @@ module cr './modules/container_registry.bicep' = {
name: 'cr'
scope: resourceGroup(rg.name)
params: {
baseName: '${prefix}${postfix}${env}'
baseName: '${uniqueString(rg.id)}${env}'
location: location
tags: tags
}


@ -4,10 +4,10 @@
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../../config-infra-prod.yml
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../../config-infra-dev.yml
- template: ../../config-infra-dev.yml
trigger:
- none
@ -24,7 +24,7 @@ stages :
steps:
- checkout: self
- script: |
az bicep build --file ./infrastructure/bicep/main.bicep
az bicep build --file ./infrastructure/main.bicep
name: LintBicepCode
displayName: Run Bicep Linter
@ -43,7 +43,7 @@ stages :
inlineScript: |
az deployment sub validate \
--name $(Build.DefinitionName) \
--template-file ./infrastructure/bicep/main.bicep \
--template-file ./infrastructure/main.bicep \
--location $(location) \
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)
@ -54,7 +54,7 @@ stages :
displayName: Deploy Bicep
pool:
vmImage: $(ap_vm_image)
environment: dev
environment: $(environment)
strategy:
runOnce:
deploy:
@ -72,5 +72,5 @@ stages :
az deployment sub create \
--name $(Build.DefinitionName) \
--location $(location) \
--template-file ./infrastructure/bicep/main.bicep \
--template-file ./infrastructure/main.bicep \
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)


@ -32,6 +32,11 @@ module "aml_workspace" {
enable_aml_computecluster = var.enable_aml_computecluster
storage_account_name = module.storage_account_aml.name
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_default_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
subnet_training_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_training[0].id : ""
tags = local.tags
}
@ -51,6 +56,10 @@ module "storage_account_aml" {
firewall_bypass = ["AzureServices"]
firewall_virtual_network_subnet_ids = []
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
@ -66,6 +75,10 @@ module "key_vault" {
postfix = var.postfix
env = var.environment
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
@ -96,5 +109,26 @@ module "container_registry" {
postfix = var.postfix
env = var.environment
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
module "data_explorer" {
source = "./modules/data-explorer"
rg_name = module.resource_group.name
location = module.resource_group.location
prefix = var.prefix
postfix = var.postfix
env = var.environment
key_vault_id = module.key_vault.id
enable_monitoring = var.enable_monitoring
client_secret = var.client_secret
tags = local.tags
}


@ -0,0 +1,37 @@
# Bastion
module "bastion" {
source = "./modules/bastion-host"
prefix = var.prefix
postfix = var.postfix
env = var.environment
rg_name = module.resource_group.name
location = module.resource_group.location
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_bastion[0].id : ""
enable_aml_secure_workspace = var.enable_aml_secure_workspace
tags = local.tags
}
# Virtual machine
module "virtual_machine_jumphost" {
source = "./modules/virtual-machine"
prefix = var.prefix
postfix = var.postfix
env = var.environment
rg_name = module.resource_group.name
location = module.resource_group.location
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
jumphost_username = var.jumphost_username
jumphost_password = var.jumphost_password
enable_aml_secure_workspace = var.enable_aml_secure_workspace
tags = local.tags
}


@ -18,12 +18,14 @@ resource "azurerm_machine_learning_workspace" "mlw" {
# Compute cluster
resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster" {
resource "azurerm_machine_learning_compute_cluster" "mlw_compute_cluster" {
name = "cpu-cluster"
location = var.location
vm_priority = "LowPriority"
vm_size = "Standard_DS3_v2"
machine_learning_workspace_id = azurerm_machine_learning_workspace.mlw.id
subnet_resource_id = var.enable_aml_secure_workspace ? var.subnet_training_id : ""
count = var.enable_aml_computecluster ? 1 : 0
scale_settings {
@ -33,70 +35,63 @@ resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster"
}
}
# # Datastore
# resource "azurerm_resource_group_template_deployment" "arm_aml_create_datastore" {
# name = "arm_aml_create_datastore"
# resource_group_name = var.rg_name
# deployment_mode = "Incremental"
# parameters_content = jsonencode({
# "WorkspaceName" = {
# value = azurerm_machine_learning_workspace.mlw.name
# },
# "StorageAccountName" = {
# value = var.storage_account_name
# }
# })
# depends_on = [time_sleep.wait_30_seconds]
# template_content = <<TEMPLATE
# {
# "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
# "contentVersion": "1.0.0.0",
# "parameters": {
# "WorkspaceName": {
# "type": "String"
# },
# "StorageAccountName": {
# "type": "String"
# }
# },
# "resources": [
# {
# "type": "Microsoft.MachineLearningServices/workspaces/datastores",
# "apiVersion": "2021-03-01-preview",
# "name": "[concat(parameters('WorkspaceName'), '/default')]",
# "dependsOn": [],
# "properties": {
# "contents": {
# "accountName": "[parameters('StorageAccountName')]",
# "containerName": "default",
# "contentsType": "AzureBlob",
# "credentials": {
# "credentialsType": "None"
# },
# "endpoint": "core.windows.net",
# "protocol": "https"
# },
# "description": "Default datastore for mlops-tabular",
# "isDefault": false,
# "properties": {
# "ServiceDataAccessAuthIdentity": "None"
# },
# "tags": {}
# }
# }
# ]
# }
# TEMPLATE
# }
# resource "time_sleep" "wait_30_seconds" {
# depends_on = [
# azurerm_machine_learning_workspace.mlw
# ]
# create_duration = "30s"
# }
# DNS Zones
resource "azurerm_private_dns_zone" "mlw_zone_api" {
name = "privatelink.api.azureml.ms"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone" "mlw_zone_notebooks" {
name = "privatelink.notebooks.azure.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_api_link" {
name = "${var.prefix}${var.postfix}_link_api"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_api[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_notebooks_link" {
name = "${var.prefix}${var.postfix}_link_notebooks"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_notebooks[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "mlw_pe" {
name = "pe-${azurerm_machine_learning_workspace.mlw.name}-amlw"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_default_id
private_service_connection {
name = "psc-aml-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_machine_learning_workspace.mlw.id
subresource_names = ["amlworkspace"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-ws"
private_dns_zone_ids = [azurerm_private_dns_zone.mlw_zone_api[0].id, azurerm_private_dns_zone.mlw_zone_notebooks[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -58,3 +58,22 @@ variable "storage_account_name" {
type = string
description = "The Name of the Storage Account linked to AML workspace"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_default_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}
variable "subnet_training_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -0,0 +1,31 @@
resource "azurerm_bastion_host" "bas" {
name = "bas-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
sku = "Standard"
copy_paste_enabled = false
file_copy_enabled = false
ip_configuration {
name = "configuration"
subnet_id = var.subnet_id
public_ip_address_id = azurerm_public_ip.pip[0].id
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_public_ip" "pip" {
name = "pip-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
allocation_method = "Static"
sku = "Standard"
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -0,0 +1,39 @@
variable "rg_name" {
type = string
description = "Resource group name"
}
variable "location" {
type = string
description = "Location of the resource group"
}
variable "tags" {
type = map(string)
default = {}
description = "A mapping of tags which should be assigned to the deployed resource"
}
variable "prefix" {
type = string
description = "Prefix for the module name"
}
variable "postfix" {
type = string
description = "Postfix for the module name"
}
variable "env" {
type = string
description = "Environment prefix"
}
variable "subnet_id" {
type = string
description = "Subnet ID for the bastion"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}


@ -7,8 +7,53 @@ resource "azurerm_container_registry" "cr" {
name = "cr${local.safe_prefix}${local.safe_postfix}${var.env}"
resource_group_name = var.rg_name
location = var.location
sku = "Standard"
sku = var.enable_aml_secure_workspace ? "Premium" : "Standard"
admin_enabled = true
tags = var.tags
}
# DNS Zones
resource "azurerm_private_dns_zone" "cr_zone" {
name = "privatelink.azurecr.io"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "cr_zone_link" {
name = "${var.prefix}${var.postfix}_link_acr"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.cr_zone[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "cr_pe" {
name = "pe-${azurerm_container_registry.cr.name}-acr"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-acr-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_container_registry.cr.id
subresource_names = ["registry"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-acr"
private_dns_zone_ids = [azurerm_private_dns_zone.cr_zone[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -28,3 +28,17 @@ variable "env" {
type = string
description = "Environment prefix"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -8,4 +8,67 @@ resource "azurerm_key_vault" "kv" {
sku_name = "standard"
tags = var.tags
access_policy {
tenant_id = data.azurerm_client_config.current.tenant_id
object_id = data.azurerm_client_config.current.object_id
key_permissions = [
"Create",
"Get",
]
secret_permissions = [
"Set",
"Get",
"Delete",
"Purge",
"Recover"
]
}
}
# DNS Zones
resource "azurerm_private_dns_zone" "kv_zone" {
name = "privatelink.vaultcore.azure.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "kv_zone_link" {
name = "${var.prefix}${var.postfix}_link_kv"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.kv_zone[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "kv_pe" {
name = "pe-${azurerm_key_vault.kv.name}-vault"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-kv-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_key_vault.kv.id
subresource_names = ["vault"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-kv"
private_dns_zone_ids = [azurerm_private_dns_zone.kv_zone[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -28,3 +28,17 @@ variable "env" {
type = string
description = "Environment prefix"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -32,3 +32,87 @@ resource "azurerm_storage_account_network_rules" "firewall_rules" {
virtual_network_subnet_ids = var.firewall_virtual_network_subnet_ids
bypass = var.firewall_bypass
}
# DNS Zones
resource "azurerm_private_dns_zone" "st_zone_blob" {
name = "privatelink.blob.core.windows.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone" "st_zone_file" {
name = "privatelink.file.core.windows.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_blob" {
name = "${var.prefix}${var.postfix}_link_st_blob"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.st_zone_blob[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_file" {
name = "${var.prefix}${var.postfix}_link_st_file"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.st_zone_file[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "st_pe_blob" {
name = "pe-${azurerm_storage_account.st.name}-blob"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-blob-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_storage_account.st.id
subresource_names = ["blob"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-blob"
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_blob[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_private_endpoint" "st_pe_file" {
name = "pe-${azurerm_storage_account.st.name}-file"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-file-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_storage_account.st.id
subresource_names = ["file"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-file"
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_file[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -42,3 +42,17 @@ variable "firewall_virtual_network_subnet_ids" {
variable "firewall_bypass" {
default = ["None"]
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -0,0 +1,104 @@
resource "azurerm_virtual_machine" "vm" {
name = "wvm-jumphost"
location = var.location
resource_group_name = var.rg_name
network_interface_ids = [azurerm_network_interface.vm_nic[0].id]
vm_size = "Standard_DS3_v2"
delete_os_disk_on_termination = true
delete_data_disks_on_termination = true
storage_image_reference {
publisher = "microsoft-dsvm"
offer = "dsvm-win-2019"
sku = "server-2019"
version = "latest"
}
os_profile {
computer_name = var.jumphost_username
admin_username = var.jumphost_username
admin_password = var.jumphost_password
}
os_profile_windows_config {
provision_vm_agent = true
enable_automatic_upgrades = true
}
identity {
type = "SystemAssigned"
}
storage_os_disk {
name = "disk-${var.prefix}-${var.postfix}${var.env}"
caching = "ReadWrite"
create_option = "FromImage"
managed_disk_type = "StandardSSD_LRS"
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_interface" "vm_nic" {
name = "nic-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
ip_configuration {
name = "configuration"
private_ip_address_allocation = "Dynamic"
subnet_id = var.subnet_id
# public_ip_address_id = azurerm_public_ip.vm_public_ip.id
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_security_group" "vm_nsg" {
name = "nsg-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
security_rule {
name = "RDP"
priority = 1010
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = 3389
source_address_prefix = "*"
destination_address_prefix = "*"
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_interface_security_group_association" "vm_nsg_association" {
network_interface_id = azurerm_network_interface.vm_nic[0].id
network_security_group_id = azurerm_network_security_group.vm_nsg[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_dev_test_global_vm_shutdown_schedule" "vm_schedule" {
virtual_machine_id = azurerm_virtual_machine.vm[0].id
location = var.location
enabled = true
daily_recurrence_time = "2000"
timezone = "W. Europe Standard Time"
notification_settings {
enabled = false
}
count = var.enable_aml_secure_workspace ? 1 : 0
}


@ -0,0 +1,49 @@
variable "rg_name" {
type = string
description = "Resource group name"
}
variable "location" {
type = string
description = "Location of the resource group"
}
variable "tags" {
type = map(string)
default = {}
description = "A mapping of tags which should be assigned to the deployed resource"
}
variable "prefix" {
type = string
description = "Prefix for the module name"
}
variable "postfix" {
type = string
description = "Postfix for the module name"
}
variable "env" {
type = string
description = "Environment prefix"
}
variable "jumphost_username" {
type = string
description = "VM username"
}
variable "jumphost_password" {
type = string
description = "VM password"
}
variable "subnet_id" {
type = string
description = "Subnet ID for the virtual machine"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}


@ -0,0 +1,131 @@
# Virtual network
resource "azurerm_virtual_network" "vnet_default" {
name = "vnet-${var.prefix}-${var.postfix}${var.environment}"
resource_group_name = module.resource_group.name
location = module.resource_group.location
address_space = ["10.0.0.0/16"]
count = var.enable_aml_secure_workspace ? 1 : 0
tags = local.tags
}
# Subnets
resource "azurerm_subnet" "snet_default" {
name = "snet-${var.prefix}-${var.postfix}${var.environment}-default"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.1.0/24"]
enforce_private_link_endpoint_network_policies = true
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet" "snet_bastion" {
name = "AzureBastionSubnet"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.10.0/27"]
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet" "snet_training" {
name = "snet-${var.prefix}-${var.postfix}${var.environment}-training"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.2.0/24"]
enforce_private_link_endpoint_network_policies = true
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Network security groups
resource "azurerm_network_security_group" "nsg_training" {
name = "nsg-${var.prefix}-${var.postfix}${var.environment}-training"
location = module.resource_group.location
resource_group_name = module.resource_group.name
security_rule {
name = "BatchNodeManagement"
priority = 100
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = "29876-29877"
source_address_prefix = "BatchNodeManagement"
destination_address_prefix = "*"
}
security_rule {
name = "AzureMachineLearning"
priority = 110
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = "44224"
source_address_prefix = "AzureMachineLearning"
destination_address_prefix = "*"
}
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet_network_security_group_association" "nsg-training-link" {
subnet_id = azurerm_subnet.snet_training[0].id
network_security_group_id = azurerm_network_security_group.nsg_training[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# User Defined Routes
resource "azurerm_route_table" "rt_training" {
name = "rt-${var.prefix}-${var.postfix}${var.environment}-training"
location = module.resource_group.location
resource_group_name = module.resource_group.name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_internet" {
name = "Internet"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "0.0.0.0/0"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_aml" {
name = "AzureMLRoute"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "AzureMachineLearning"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_batch" {
name = "BatchRoute"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "BatchNodeManagement"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet_route_table_association" "rt_training_link" {
subnet_id = azurerm_subnet.snet_training[0].id
route_table_id = azurerm_route_table.rt_training[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}


@ -9,6 +9,14 @@ variables:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
parameters:
- name: jumphost_username
type: string
default: "azureuser"
- name: jumphost_password
type: string
default: "ThisIsNotVerySecure!"
trigger:
- none
@ -51,4 +59,10 @@ stages :
- template: templates/infra/run-terraform-init.yml@mlops-templates
- template: templates/infra/run-terraform-validate.yml@mlops-templates
- template: templates/infra/run-terraform-plan.yml@mlops-templates
parameters:
jumphost_username: ${{parameters.jumphost_username}}
jumphost_password: ${{parameters.jumphost_password}}
- template: templates/infra/run-terraform-apply.yml@mlops-templates
parameters:
jumphost_username: ${{parameters.jumphost_username}}
jumphost_password: ${{parameters.jumphost_password}}


@ -21,3 +21,27 @@ variable "postfix" {
variable "enable_aml_computecluster" {
description = "Variable to enable or disable AML compute cluster"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "jumphost_username" {
type = string
description = "VM username"
default = "azureuser"
}
variable "jumphost_password" {
type = string
description = "VM password"
default = "ThisIsNotVerySecure!"
}
variable "enable_monitoring" {
description = "Variable to enable or disable Monitoring"
}
variable "client_secret" {
description = "Service Principal Secret"
}


@ -0,0 +1,24 @@
name: nlp_inference_conda_env
channels:
- pytorch
- anaconda
- defaults
- conda-forge
dependencies:
- python=3.8
- pip=21.2.4
- pytorch=1.10.0
- torchvision=0.11.1
- torchaudio=0.10.0
- cudatoolkit=11.1.1
- nvidia-apex=0.1.0
- gxx_linux-64=8.5.0
- pip:
- azureml-defaults==1.39.0
- azureml-mlflow==1.39.0
- azureml-telemetry==1.39.0
- azureml-train-core==1.39.0
- mlflow==1.24.0
- transformers==4.17.0
- 'inference-schema[numpy-support]==1.3.0'
- applicationinsights==0.11.10

Some files were not shown because too many files changed in this commit.