feat: re-enable image/cifar-10 pipeline tests & update readme.py (#1112)

* feat: test ci

* feat: enable cifar-10 & image classification

* feat: add sample for hyperparameter sweep

* fix: fix ci

* fix: fix ci

* fix ci: update readme.py

* fix: black reformat

* Revert "fix: black reformat"

This reverts commit d47755571f.

* fix: image pipeline yml has been renamed

* fix: update output mode

* fix: update output mode

* change setting to use artifact store and fix typo

Co-authored-by: lochen <cloga0216@gmail.com>
elliotzh 2022-04-08 09:47:34 +08:00 committed by GitHub
Parent 55eb858324
Commit c46a0de2b2
72 changed files with 5102 additions and 40 deletions

View file

@ -1,4 +1,4 @@
name: cli-scripts-run-pipeline-jobs
name: cli-jobs-pipelines-cifar-10-pipeline
on:
workflow_dispatch:
schedule:
@ -7,8 +7,9 @@ on:
branches:
- main
paths:
- cli/jobs/pipelines/cifar-10/**
- .github/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml
- cli/run-pipeline-jobs.sh
- .github/workflows/cli-scripts-run-pipeline-jobs.yml
- cli/setup.sh
jobs:
build:
@ -24,8 +25,6 @@ jobs:
run: bash setup.sh
working-directory: cli
continue-on-error: true
- name: scripts installs
run: sudo apt-get upgrade -y && sudo apt-get install uuid-runtime jq -y
- name: test script script
run: set -e; bash -x run-pipeline-jobs.sh
- name: run job
run: bash -x run-job.sh jobs/pipelines/cifar-10/pipeline.yml
working-directory: cli

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines/nyc-taxi/**
- .github/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/1a_e2e_local_components/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/1b_e2e_registered_components/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/2a_basic_component/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/2b_component_with_input_output/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/3a_basic_pipeline/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/3b_pipeline_with_data/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4a_local_data_input/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4c_web_url_input/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5a_env_public_docker_image/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5b_env_registered/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5c_env_conda_file/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6a_tf_hello_world/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6c_r_iris/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -0,0 +1,30 @@
name: cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline
on:
workflow_dispatch:
schedule:
- cron: "0 0/4 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/pipelines-with-components/image_classification_with_densenet/**
- .github/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZ_CREDS}}
- name: setup
run: bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
run: bash -x run-job.sh jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml
working-directory: cli

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/nyc_taxi_data_regression/**
- .github/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -47,7 +47,6 @@ path|status|
[deploy-triton-managed-online-endpoint.sh](deploy-triton-managed-online-endpoint.sh)|[![deploy-triton-managed-online-endpoint](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-triton-managed-online-endpoint/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-triton-managed-online-endpoint.yml)
[misc.sh](misc.sh)|[![misc](https://github.com/Azure/azureml-examples/workflows/cli-scripts-misc/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-misc.yml)
[mlflow-uri.sh](mlflow-uri.sh)|[![mlflow-uri](https://github.com/Azure/azureml-examples/workflows/cli-scripts-mlflow-uri/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-mlflow-uri.yml)
[run-pipeline-jobs.sh](run-pipeline-jobs.sh)|[![run-pipeline-jobs](https://github.com/Azure/azureml-examples/workflows/cli-scripts-run-pipeline-jobs/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-run-pipeline-jobs.yml)
[train-rest.sh](train-rest.sh)|[![train-rest](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train-rest/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train-rest.yml)
[train.sh](train.sh)|[![train](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train.yml)
@ -94,6 +93,7 @@ path|status|description
[jobs/basics/hello-world-output-data.yml](jobs/basics/hello-world-output-data.yml)|[![jobs/basics/hello-world-output-data](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output-data/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output-data.yml)|*no description*
[jobs/basics/hello-world-output.yml](jobs/basics/hello-world-output.yml)|[![jobs/basics/hello-world-output](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output.yml)|*no description*
[jobs/basics/hello-world.yml](jobs/basics/hello-world.yml)|[![jobs/basics/hello-world](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world.yml)|*no description*
[jobs/pipelines/cifar-10/pipeline.yml](jobs/pipelines/cifar-10/pipeline.yml)|[![jobs/pipelines/cifar-10/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-cifar-10-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml)|*no description*
[jobs/pipelines/nyc-taxi/pipeline.yml](jobs/pipelines/nyc-taxi/pipeline.yml)|[![jobs/pipelines/nyc-taxi/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-nyc-taxi-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
@ -110,22 +110,8 @@ path|status|description
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
[jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml](jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml)|[![jobs/pipelines-with-components/image_classification_with_densenet/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml)|*no description*
[jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml](jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml)|[![jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
[jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml](jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml)|[![jobs/pipelines-with-components/basics/2a_basic_component/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml)|"Hello World component example"
[jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml](jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml)|[![jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml)|"Component with inputs and outputs"
[jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml](jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml)|[![jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml)|"Basic Pipeline Job with 3 Hello World components"
[jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml](jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml)|[![jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml](jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4a_local_data_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml)|"Example of using data in a local folder as pipeline input"
[jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml](jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml)|[![jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml)|"Example of using data folder from a Workspace Datastore as pipeline input"
[jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml](jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4c_web_url_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml)|"Example of using a file hosted at a web URL as pipeline input"
[jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml](jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml)|[![jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml](jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml)|[![jobs/pipelines-with-components/basics/5b_env_registered/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml](jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml)|[![jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
**Endpoints** ([endpoints](endpoints))
@ -184,4 +170,3 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
- [Documentation](https://docs.microsoft.com/azure/machine-learning)
- [Private previews](https://github.com/Azure/azureml-previews)

View file

@ -2,12 +2,19 @@ $schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
settings:
default_datestore: azureml:workspaceblobstore
default_datastore: azureml:workspaceartifactstore
default_compute: azureml:cpu-cluster
jobs:
hello_job:
command: echo "hello"
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
command: echo "hello-world" > ${{outputs.world_output}}/world.txt
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
compute: azureml:cpu-cluster
outputs:
world_output:
world_job:
command: echo "world"
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
command: cat ${{inputs.world_input}}/world.txt
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
compute: azureml:cpu-cluster
inputs:
world_input: ${{parent.jobs.hello_job.outputs.world_output}}

View file

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules[pytorch]==0.0.41

View file

@ -0,0 +1,35 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_apply_image_transformation
display_name: Apply Image Transformation
description: Applies an image transformation to an image directory.
version: 0.0.1
inputs:
input_image_transform_path:
description: Input image transformation
type: uri_folder
input_image_dir_path:
description: Input image directory
type: uri_folder
mode:
description: Should exclude 'Random' transform operations in inference but keep them in training
type: string
default: For training
enum: ['For training', 'For inference']
outputs:
output_path:
type: uri_folder
description: Output image directory
command: >-
python -m azureml.designer.modules.computer_vision.transform.apply_image_transformation.apply_image_transformation
--input-image-transform-path ${{inputs.input_image_transform_path}}
--input-image-dir-path ${{inputs.input_image_dir_path}}
--mode "For training"
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

View file

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules==0.0.41

View file

@ -0,0 +1,25 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_convert_to_image_directory
display_name: Convert to Image Directory
description: Convert dataset to image directory format.
version: 1
inputs:
input_path:
type: uri_folder
description: Input dataset
outputs:
output_path:
type: uri_folder
description: Output image directory
command: >-
python -m azureml.designer.modules.computer_vision.preprocess.convert_to_image_directory.convert_to_image_directory
--input-path ${{inputs.input_path}}
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,89 @@
# Convolutional Networks for Image Classification in PyTorch
In this repository you will find implementations of various image classification models.
## Table Of Contents
* [Models](#models)
* [Validation accuracy results](#validation-accuracy-results)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Model comparison](#model-comparison)
* [Accuracy vs FLOPS](#accuracy-vs-flops)
* [Latency vs Throughput on different batch sizes](#latency-vs-throughput-on-different-batch-sizes)
## Models
The following table provides links to where you can find additional information on each model:
| **Model** | **Link**|
|:-:|:-:|
| resnet50 | [README](./resnet50v1.5/README.md) |
| resnext101-32x4d | [README](./resnext101-32x4d/README.md) |
| se-resnext101-32x4d | [README](./se-resnext101-32x4d/README.md) |
## Validation accuracy results
Our results were obtained by running the applicable
training scripts in the [framework-container-name] NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the validation accuracy results of the
three classification models side-by-side.
| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 78.46 | 94.15 | 78.50 | 94.11 |
| resnext101-32x4d | 80.08 | 94.89 | 80.14 | 95.02 |
| se-resnext101-32x4d | 81.01 | 95.52 | 81.12 | 95.54 |
## Training performance results
### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 6888.75 img/s | 2945.37 img/s | 2.34x |
| resnext101-32x4d | 2384.85 img/s | 1116.58 img/s | 2.14x |
| se-resnext101-32x4d | 2031.17 img/s | 977.45 img/s | 2.08x |
### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 13443.82 img/s | 6263.41 img/s | 2.15x |
| resnext101-32x4d | 4473.37 img/s | 2261.97 img/s | 1.98x |
| se-resnext101-32x4d | 3776.03 img/s | 1953.13 img/s | 1.93x |
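The "Mixed Precision speedup" column in the two tables above is simply the ratio of mixed-precision to FP32 throughput. A quick sketch of the arithmetic for the DGX-1 numbers:

# Speedup = mixed-precision img/s divided by FP32 img/s (DGX-1 rows above)
for arch, amp_ips, fp32_ips in [
    ("resnet50", 6888.75, 2945.37),
    ("resnext101-32x4d", 2384.85, 1116.58),
    ("se-resnext101-32x4d", 2031.17, 977.45),
]:
    print(f"{arch}: {amp_ips / fp32_ips:.2f}x")  # 2.34x, 2.14x, 2.08x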

View file

@ -0,0 +1,42 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def add_parser_arguments(parser):
parser.add_argument(
"--checkpoint-path", metavar="<path>", help="checkpoint filename"
)
parser.add_argument(
"--weight-path", metavar="<path>", help="name of file in which to store weights"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
checkpoint = torch.load(args.checkpoint_path)
model_state_dict = {
k[len("module.1.") :] if "module.1." in k else k: v
for k, v in checkpoint["state_dict"].items()
}
print(f"Loaded {checkpoint['arch']} : {checkpoint['best_prec1']}")
torch.save(model_state_dict, args.weight_path)
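The key-renaming step above strips the "module.1." prefix (presumably left by the wrapper the training script saves checkpoints from) so the remaining weights load into a bare model. A small illustrative sketch with a hypothetical toy state dict:

# Hypothetical toy state dict; same dict comprehension as in the script above
toy_state_dict = {"module.1.conv1.weight": "w", "module.1.fc.bias": "b", "epoch": 90}
renamed = {
    k[len("module.1.") :] if "module.1." in k else k: v
    for k, v in toy_state_dict.items()
}
assert renamed == {"conv1.weight": "w", "fc.bias": "b", "epoch": 90}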

View file

@ -0,0 +1,96 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from PIL import Image
import argparse
import numpy as np
import json
import torch
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import image_classification.resnet as models
from image_classification.dataloaders import load_jpeg_from_file
try:
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument("--image-size", default="224", type=int)
parser.add_argument(
"--arch",
"-a",
metavar="ARCH",
default="resnet50",
choices=model_names,
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
)
parser.add_argument(
"--model-config",
"-c",
metavar="CONF",
default="classic",
choices=model_configs,
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
)
parser.add_argument("--weights", metavar="<path>", help="file with model weights")
parser.add_argument(
"--precision", metavar="PREC", default="FP16", choices=["AMP", "FP16", "FP32"]
)
parser.add_argument("--image", metavar="<path>", help="path to classified image")
def main(args):
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
model = models.build_resnet(args.arch, args.model_config, verbose=False)
if args.weights is not None:
weights = torch.load(args.weights)
model.load_state_dict(weights)
model = model.cuda()
if args.precision == "FP16":
model = network_to_half(model)
model.eval()
with torch.no_grad():
input = load_jpeg_from_file(
args.image, cuda=True, fp16=args.precision != "FP32"
)
output = torch.nn.functional.softmax(model(input), dim=1).cpu().view(-1).numpy()
top5 = np.argsort(output)[-5:][::-1]
print(args.image)
for c, v in zip(imgnet_classes[top5], output[top5]):
print(f"{c}: {100*v:.1f}%")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
main(args)

View file

@ -0,0 +1,21 @@
name: train_environment
channels:
- defaults
- conda-forge
dependencies:
- python=3.8.12
- pip=21.2.2
- pip:
- azure-ml==0.0.58938149
- --extra-index-url https://pypi.org/simple
- --extra-index-url=https://azuremlsdktestpypi.azureedge.net/test-sdk-cli-v2
- git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
- watchdog==0.10.3
- torch==1.8.1
- torchvision==0.9.1
- tensorboard==2.5.0
- pillow==8.2.0
- numpy==1.19.5
- --extra-index-url=https://developer.download.nvidia.com/compute/redist/
- nvidia-dali-cuda100
- azureml-mlflow

View file

@ -0,0 +1,175 @@
from pathlib import Path
import sys
import runpy
import json
import shutil
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import functools
from enum import Enum
from azure.ml import dsl
from azure.ml.dsl._component import ComponentExecutor
from azure.ml.dsl._types import DataInput, NumberInput
class Data_BackendEnum(Enum):
pytorch = "pytorch"
syntetic = "syntetic"
dali_gpu = "dali-gpu"
dali_cpu = "dali-cpu"
class ArchEnum(Enum):
resnet18 = "resnet18"
resnet34 = "resnet34"
resnet50 = "resnet50"
resnet101 = "resnet101"
resnet152 = "resnet152"
resnext101_32x4d = "resnext101-32x4d"
se_resnext101_32x4d = "se-resnext101-32x4d"
class Model_ConfigEnum(Enum):
classic = "classic"
fanin = "fanin"
grp_fanin = "grp-fanin"
grp_fanout = "grp-fanout"
class Lr_ScheduleEnum(Enum):
step = "step"
linear = "linear"
cosine = "cosine"
def convert_image_directory_to_specific_format(
image_dir_path, output_root, is_train=False
):
# convert image directory to train component input data format
image_dir_path = Path(image_dir_path)
image_list_path = image_dir_path / "images.lst"
output_data_path = output_root / ("train" if is_train else "val")
category_list = []
file_name_list = []
with open(image_list_path, "r") as fin:
for line in fin:
line = json.loads(line)
# print(line)
category_list.append(line["category"])
file_name_list.append(line["image_info"]["file_name"])
(output_data_path / line["category"]).mkdir(parents=True, exist_ok=True)
print(
f"file number {len(file_name_list)}, category number {len(set(category_list))}."
)
def copy_file(index):
target_dir = output_data_path / category_list[index]
shutil.copyfile(
str(image_dir_path / file_name_list[index]),
str(target_dir / Path(file_name_list[index]).name),
)
with ThreadPool(cpu_count()) as p:
p.map(functools.partial(copy_file), range(len(file_name_list)))
print(
f"output path {output_data_path} has {len(list(output_data_path.glob('**/*')))} files."
)
return output_root
@dsl.command_component(
name="imagecnn_train", description="imagecnn_train main function"
)
def main(
train_data: DataInput(description="path to train dataset") = None,
val_data: DataInput(description="path to valid dataset") = None,
data_backend="dali-cpu",
arch="resnet50",
model_config="classic",
workers: int = 5,
epochs: int = 90,
batch_size: int = 256,
optimizer_batch_size: int = -1,
lr: float = 0.1,
lr_schedule="step",
warmup: int = 0,
label_smoothing: float = 0.0,
mixup: float = 0.0,
momentum: float = 0.9,
weight_decay: float = 0.0001,
print_freq: int = 10,
resume="",
pretrained_weights="",
static_loss_scale: float = 1,
prof: int = -1,
seed: int = None,
raport_file="experiment_raport.json",
workspace="./",
save_checkpoint_epochs: int = 10,
):
new_data_path = Path(train_data).parent / "new_dataset"
convert_image_directory_to_specific_format(
image_dir_path=train_data, output_root=new_data_path, is_train=True
)
convert_image_directory_to_specific_format(
image_dir_path=val_data, output_root=new_data_path
)
print(f"new data path {new_data_path}")
sys.argv = [
"main",
"--data",
str(new_data_path),
"--data-backend",
data_backend,
"--arch",
arch,
"--model-config",
model_config,
"-j",
str(workers),
"--epochs",
str(epochs),
"-b",
str(batch_size),
"--optimizer-batch-size",
str(optimizer_batch_size),
"--lr",
str(lr),
"--lr-schedule",
lr_schedule,
"--warmup",
str(warmup),
"--label-smoothing",
str(label_smoothing),
"--mixup",
str(mixup),
"--momentum",
str(momentum),
"--weight-decay",
str(weight_decay),
"--print-freq",
str(print_freq),
"--resume",
str(resume),
"--pretrained-weights",
str(pretrained_weights),
"--static-loss-scale",
str(static_loss_scale),
"--prof",
str(prof),
"--seed",
str(seed),
"--raport-file",
str(raport_file),
"--workspace",
str(workspace),
"--save-checkpoint-epochs",
str(save_checkpoint_epochs),
]
print(" ".join(sys.argv))
runpy.run_path("main.py", run_name="__main__")
if __name__ == "__main__":
ComponentExecutor(main).execute(sys.argv)
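For reference, a minimal sketch (file names and categories are hypothetical) of the images.lst format that convert_image_directory_to_specific_format above expects: one JSON object per line carrying a category label and a relative file path, which the helper rearranges into the train/<category>/ and val/<category>/ layout consumed by the training entry point.

import json
from pathlib import Path

image_dir = Path("sample_image_dir")  # hypothetical input image directory
image_dir.mkdir(parents=True, exist_ok=True)
with open(image_dir / "images.lst", "w") as fout:
    for record in [
        {"category": "cat", "image_info": {"file_name": "images/cat_001.jpg"}},
        {"category": "dog", "image_info": {"file_name": "images/dog_001.jpg"}},
    ]:
        fout.write(json.dumps(record) + "\n")
# With matching files present under sample_image_dir/images/, the helper copies them to
# <output_root>/train/cat/cat_001.jpg and <output_root>/train/dog/dog_001.jpg when is_train=True.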

View file

@ -0,0 +1,147 @@
$schema: https://azuremlschemas.azureedge.net/development/commandComponent.schema.json
type: command
name: train_image_classification
version: 0.0.1
display_name: Train Image Classification
tags: {}
inputs:
train_data:
type: path
description: "path to train dataset"
optional: false
valid_data:
type: path
description: "path to valid dataset"
optional: false
data_backend:
type: string
description: "data backend: pytorch | syntetic | dali-gpu | dali-cpu (default: dali-cpu)"
default: "dali-cpu"
optional: true
arch:
type: string
description: "model architecture: resnet18 | resnet34 | resnet50 | resnet101 | resnet152 | resnext101_32x4d | se_resnext101_32x4d (default: resnet50)"
default: "resnet50"
optional: true
model_config:
type: string
description: "model configs: classic | fanin | grp_fanin | grp_fanout(default: classic)"
default: "classic"
optional: true
workers:
type: integer
description: "number of data loading workers (default: 5)"
default: 5
optional: true
epochs:
type: integer
description: number of total epochs to run
default: 90
optional: true
batch_size:
type: integer
description: "mini-batch size (default: 256) per gpu"
default: 256
optional: true
optimizer_batch_size:
type: integer
description: size of a total batch size, for simulating bigger batches using gradient accumulation
default: -1
optional: true
lr:
type: number
description: initial learning rate
default: 0.1
optional: true
lr_schedule:
type: string
description: "Type of LR schedule: step, linear, cosine"
default: "step"
optional: true
warmup:
type: integer
description: number of warmup epochs
default: 0
optional: true
label_smoothing:
type: number
description: label smoothing
default: 0.0
optional: true
mixup:
type: number
description: mixup alpha
default: 0.0
optional: true
momentum:
type: number
description: momentum
default: 0.9
optional: true
weight_decay:
type: number
description: "weight decay (default: 1e-4)"
default: 0.0001
optional: true
print_freq:
type: integer
description: "print frequency (default: 10)"
default: 10
optional: true
resume:
type: string
description: "path to latest checkpoint (default: none)"
default: ""
optional: true
pretrained_weights:
type: string
description: load weights from here
default: ""
optional: true
static_loss_scale:
type: number
description: Static loss scale, positive power of 2 values can improve fp16 convergence.
default: 1.0
optional: true
prof:
type: integer
description: Run only N iterations
default: -1
optional: true
seed:
type: integer
description: random seed used for numpy and pytorch
default: 123
optional: true
raport_file:
type: string
description: file in which to store JSON experiment raport
default: experiment_raport.json
optional: true
save_checkpoint_epochs:
type: integer
description: how many epochs run between saving checkpoints
default: 2
optional: true
outputs:
workspace:
type: uri_folder
description: path to directory where checkpoints will be stored
code: ./
environment:
image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn7-ubuntu18.04
conda_file: ./conda.yaml
resources:
instance_count: 2
distribution:
type: mpi
process_count_per_instance: 1
command: >-
git clone https://github.com/NVIDIA/apex && cd apex && git checkout 3303b3e7174383312a3468ef390060c26e640cb1 && python setup.py install && cd .. && python entry.py --train_data ${{inputs.train_data}} --val_data ${{inputs.valid_data}} [--data_backend ${{inputs.data_backend}}] [--arch ${{inputs.arch}}] [--model_config ${{inputs.model_config}}] [--workers ${{inputs.workers}}] [--epochs ${{inputs.epochs}}] [--batch_size ${{inputs.batch_size}}] [--optimizer_batch_size ${{inputs.optimizer_batch_size}}] [--lr ${{inputs.lr}}] [--lr_schedule ${{inputs.lr_schedule}}] [--warmup ${{inputs.warmup}}] [--label_smoothing ${{inputs.label_smoothing}}] [--mixup ${{inputs.mixup}}] [--momentum ${{inputs.momentum}}] [--weight_decay ${{inputs.weight_decay}}] [--print_freq ${{inputs.print_freq}}] [--resume ${{inputs.resume}}] [--pretrained_weights ${{inputs.pretrained_weights}}] [--static_loss_scale ${{inputs.static_loss_scale}}] [--prof ${{inputs.prof}}] [--seed ${{inputs.seed}}] [--raport_file ${{inputs.raport_file}}] [--save_checkpoint_epochs ${{inputs.save_checkpoint_epochs}}] --workspace ${{outputs.workspace}}

View file

@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import logger
from . import dataloaders
from . import training
from . import utils
from . import mixup
from . import resnet
from . import smoothing

View file

@ -0,0 +1,489 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
DATA_BACKEND_CHOICES = ["pytorch", "syntetic"]
try:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
DATA_BACKEND_CHOICES.append("dali-gpu")
DATA_BACKEND_CHOICES.append("dali-cpu")
except ImportError:
print(
"Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
)
def load_jpeg_from_file(path, cuda=True, fp16=False):
img_transforms = transforms.Compose(
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
)
img = img_transforms(Image.open(path))
with torch.no_grad():
# mean and std are not multiplied by 255 as they are in training script
# torch dataloader reads data into bytes whereas loading directly
# through PIL creates a tensor with floats in [0,1] range
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
if cuda:
mean = mean.cuda()
std = std.cuda()
img = img.cuda()
if fp16:
mean = mean.half()
std = std.half()
img = img.half()
else:
img = img.float()
input = img.unsqueeze(0).sub_(mean).div_(std)
return input
class HybridTrainPipe(Pipeline):
def __init__(
self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False
):
super(HybridTrainPipe, self).__init__(
batch_size, num_threads, device_id, seed=12 + device_id
)
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root=data_dir,
shard_id=rank,
num_shards=world_size,
random_shuffle=True,
)
if dali_cpu:
dali_device = "cpu"
self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
# This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# without additional reallocations
self.decode = ops.ImageDecoder(
device="mixed",
output_type=types.RGB,
device_memory_padding=211025920,
host_memory_padding=140544512,
)
self.res = ops.RandomResizedCrop(
device=dali_device,
size=[crop, crop],
interp_type=types.INTERP_LINEAR,
random_aspect_ratio=[0.75, 4.0 / 3.0],
random_area=[0.08, 1.0],
num_attempts=100,
)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=types.FLOAT,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
)
self.coin = ops.CoinFlip(probability=0.5)
def define_graph(self):
rng = self.coin()
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.res(images)
output = self.cmnp(images.gpu(), mirror=rng)
return [output, self.labels]
class HybridValPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
super(HybridValPipe, self).__init__(
batch_size, num_threads, device_id, seed=12 + device_id
)
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root=data_dir,
shard_id=rank,
num_shards=world_size,
random_shuffle=False,
)
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
self.res = ops.Resize(device="gpu", resize_shorter=size)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=types.FLOAT,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
)
def define_graph(self):
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.res(images)
output = self.cmnp(images)
return [output, self.labels]
class DALIWrapper(object):
def gen_wrapper(dalipipeline, num_classes, one_hot):
for data in dalipipeline:
input = data[0]["data"]
target = torch.reshape(data[0]["label"], [-1]).cuda().long()
if one_hot:
target = expand(num_classes, torch.float, target)
yield input, target
dalipipeline.reset()
def __init__(self, dalipipeline, num_classes, one_hot):
self.dalipipeline = dalipipeline
self.num_classes = num_classes
self.one_hot = one_hot
def __iter__(self):
return DALIWrapper.gen_wrapper(
self.dalipipeline, self.num_classes, self.one_hot
)
def get_dali_train_loader(dali_cpu=False):
def gdtl(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
traindir = os.path.join(data_path, "train")
pipe = HybridTrainPipe(
batch_size=batch_size,
num_threads=workers,
device_id=rank % torch.cuda.device_count(),
data_dir=traindir,
crop=224,
dali_cpu=dali_cpu,
)
pipe.build()
train_loader = DALIClassificationIterator(
pipe, size=int(pipe.epoch_size("Reader") / world_size)
)
return DALIWrapper(train_loader, num_classes, one_hot), int(
pipe.epoch_size("Reader") / (world_size * batch_size)
)
return gdtl
def get_dali_val_loader():
def gdvl(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
valdir = os.path.join(data_path, "val")
pipe = HybridValPipe(
batch_size=batch_size,
num_threads=workers,
device_id=rank % torch.cuda.device_count(),
data_dir=valdir,
crop=224,
size=256,
)
pipe.build()
val_loader = DALIClassificationIterator(
pipe, size=int(pipe.epoch_size("Reader") / world_size)
)
return DALIWrapper(val_loader, num_classes, one_hot), int(
pipe.epoch_size("Reader") / (world_size * batch_size)
)
return gdvl
def fast_collate(batch):
imgs = [img[0] for img in batch]
targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
w = imgs[0].size[0]
h = imgs[0].size[1]
tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
for i, img in enumerate(imgs):
nump_array = np.asarray(img, dtype=np.uint8)
tens = torch.from_numpy(nump_array)
if nump_array.ndim < 3:
nump_array = np.expand_dims(nump_array, axis=-1)
nump_array = np.rollaxis(nump_array, 2)
tensor[i] += torch.from_numpy(nump_array)
return tensor, targets
def expand(num_classes, dtype, tensor):
e = torch.zeros(
tensor.size(0), num_classes, dtype=dtype, device=torch.device("cuda")
)
e = e.scatter(1, tensor.unsqueeze(1), 1.0)
return e
class PrefetchedWrapper(object):
def prefetched_loader(loader, num_classes, fp16, one_hot):
mean = (
torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
.cuda()
.view(1, 3, 1, 1)
)
std = (
torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
.cuda()
.view(1, 3, 1, 1)
)
if fp16:
mean = mean.half()
std = std.half()
stream = torch.cuda.Stream()
first = True
for next_input, next_target in loader:
with torch.cuda.stream(stream):
next_input = next_input.cuda(non_blocking=True)
next_target = next_target.cuda(non_blocking=True)
if fp16:
next_input = next_input.half()
if one_hot:
next_target = expand(num_classes, torch.half, next_target)
else:
next_input = next_input.float()
if one_hot:
next_target = expand(num_classes, torch.float, next_target)
next_input = next_input.sub_(mean).div_(std)
if not first:
yield input, target
else:
first = False
torch.cuda.current_stream().wait_stream(stream)
input = next_input
target = next_target
yield input, target
def __init__(self, dataloader, num_classes, fp16, one_hot):
self.dataloader = dataloader
self.fp16 = fp16
self.epoch = 0
self.one_hot = one_hot
self.num_classes = num_classes
def __iter__(self):
if self.dataloader.sampler is not None and isinstance(
self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler
):
self.dataloader.sampler.set_epoch(self.epoch)
self.epoch += 1
return PrefetchedWrapper.prefetched_loader(
self.dataloader, self.num_classes, self.fp16, self.one_hot
)
def get_pytorch_train_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
traindir = os.path.join(data_path, "train")
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose(
[
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
]
),
)
if torch.distributed.is_initialized():
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=(train_sampler is None),
num_workers=workers,
worker_init_fn=_worker_init_fn,
pin_memory=True,
sampler=train_sampler,
collate_fn=fast_collate,
drop_last=True,
)
return PrefetchedWrapper(train_loader, num_classes, fp16, one_hot), len(
train_loader
)
def get_pytorch_val_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
valdir = os.path.join(data_path, "val")
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
]
),
)
if torch.distributed.is_initialized():
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
else:
val_sampler = None
val_loader = torch.utils.data.DataLoader(
val_dataset,
sampler=val_sampler,
batch_size=batch_size,
shuffle=False,
num_workers=workers,
worker_init_fn=_worker_init_fn,
pin_memory=True,
collate_fn=fast_collate,
)
return PrefetchedWrapper(val_loader, num_classes, fp16, one_hot), len(val_loader)
class SynteticDataLoader(object):
def __init__(
self, fp16, batch_size, num_classes, num_channels, height, width, one_hot
):
input_data = (
torch.empty(batch_size, num_channels, height, width).cuda().normal_(0, 1.0)
)
if one_hot:
input_target = torch.empty(batch_size, num_classes).cuda()
input_target[:, 0] = 1.0
else:
input_target = torch.randint(0, num_classes, (batch_size,))
input_target = input_target.cuda()
if fp16:
input_data = input_data.half()
self.input_data = input_data
self.input_target = input_target
def __iter__(self):
while True:
yield self.input_data, self.input_target
def get_syntetic_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=None,
_worker_init_fn=None,
fp16=False,
):
return SynteticDataLoader(fp16, batch_size, 1000, 3, 224, 224, one_hot), -1
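A minimal consumption sketch for the loader factories above, assuming an ImageNet-style folder with train/ and val/ subdirectories and an available GPU: each factory returns an iterable wrapper plus the number of batches per epoch, and the wrapper yields already-normalized CUDA tensors.

# Hypothetical data path; get_pytorch_train_loader is defined above
train_loader, steps_per_epoch = get_pytorch_train_loader(
    "./data", batch_size=32, num_classes=1000, one_hot=False
)
print(f"{steps_per_epoch} batches per epoch")
for input, target in train_loader:
    # input: (32, 3, 224, 224) float CUDA tensor normalized with the ImageNet mean/std
    # target: (32,) int64 CUDA tensor of class indices
    break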

View file

@ -0,0 +1,311 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
import dllogger
import numpy as np
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
if len(step) == 0:
s = "Summary:"
return s
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
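# Meter aggregates a metric at three levels: per iteration, per epoch, and over the
# whole run. reset_iteration() folds the iteration value into the epoch aggregator,
# and reset_epoch() folds the epoch value into the run aggregator.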
class Meter(object):
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
self.run_aggregator = run_aggregator
self.epoch_aggregator = epoch_aggregator
self.iteration_aggregator = iteration_aggregator
def record(self, val, n=1):
self.iteration_aggregator.record(val, n=n)
def get_iteration(self):
v, n = self.iteration_aggregator.get_val()
return v
def reset_iteration(self):
v, n = self.iteration_aggregator.get_data()
self.iteration_aggregator.reset()
if v is not None:
self.epoch_aggregator.record(v, n=n)
def get_epoch(self):
v, n = self.epoch_aggregator.get_val()
return v
def reset_epoch(self):
v, n = self.epoch_aggregator.get_data()
self.epoch_aggregator.reset()
if v is not None:
self.run_aggregator.record(v, n=n)
def get_run(self):
v, n = self.run_aggregator.get_val()
return v
def reset_run(self):
self.run_aggregator.reset()
class QuantileMeter(object):
def __init__(self, q):
self.q = q
self.reset()
def reset(self):
self.vals = []
self.n = 0
def record(self, val, n=1):
if isinstance(val, list):
self.vals += val
self.n += len(val)
else:
self.vals += [val] * n
self.n += n
def get_val(self):
if not self.vals:
return None, self.n
return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
def get_data(self):
return self.vals, self.n
class MaxMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.max = None
self.n = 0
def record(self, val, n=1):
if self.max is None:
self.max = val
else:
self.max = max(self.max, val)
self.n = n
def get_val(self):
return self.max, self.n
def get_data(self):
return self.max, self.n
class MinMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.min = None
self.n = 0
def record(self, val, n=1):
if self.min is None:
self.min = val
else:
self.min = min(self.min, val)
self.n = n
def get_val(self):
return self.min, self.n
def get_data(self):
return self.min, self.n
class LastMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.last = None
self.n = 0
def record(self, val, n=1):
self.last = val
self.n = n
def get_val(self):
return self.last, self.n
def get_data(self):
return self.last, self.n
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.n = 0
self.val = 0
def record(self, val, n=1):
self.n += n
self.val += val * n
def get_val(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
def get_data(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
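# Logger wraps dllogger: metrics are registered with a Meter and a verbosity level,
# values are recorded per iteration, and aggregated results are flushed every
# `print_interval` iterations as well as at epoch end and run end.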
class Logger(object):
def __init__(self, print_interval, backends, verbose=False, last_epoch=-1):
self.epoch = last_epoch
self.iteration = -1
self.val_iteration = -1
self.metrics = OrderedDict()
self.backends = backends
self.print_interval = print_interval
self.verbose = verbose
dllogger.init(backends)
def log_parameter(self, data, verbosity=0):
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
if self.verbose:
print("Registering metric: {}".format(metric_name))
self.metrics[metric_name] = {"meter": meter, "level": verbosity}
dllogger.metadata(metric_name, metadata)
def log_metric(self, metric_name, val, n=1):
self.metrics[metric_name]["meter"].record(val, n=n)
def start_iteration(self, val=False):
if val:
self.val_iteration += 1
else:
self.iteration += 1
def end_iteration(self, val=False):
it = self.val_iteration if val else self.iteration
if it % self.print_interval == 0:
metrics = {
n: m for n, m in self.metrics.items() if n.startswith("val") == val
}
step = (
(self.epoch, self.iteration)
if not val
else (self.epoch, self.iteration, self.val_iteration)
)
verbositys = {m["level"] for _, m in metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in metrics.items() if m["level"] == ll}
dllogger.log(
step=step,
data={n: m["meter"].get_iteration() for n, m in llm.items()},
verbosity=ll,
)
for n, m in metrics.items():
m["meter"].reset_iteration()
dllogger.flush()
def start_epoch(self):
self.epoch += 1
self.iteration = 0
self.val_iteration = 0
for n, m in self.metrics.items():
m["meter"].reset_epoch()
def end_epoch(self):
for n, m in self.metrics.items():
m["meter"].reset_iteration()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=(self.epoch,),
data={n: m["meter"].get_epoch() for n, m in llm.items()},
)
def end(self):
for n, m in self.metrics.items():
m["meter"].reset_epoch()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
)
for n, m in self.metrics.items():
m["meter"].reset_epoch()
dllogger.flush()
def iteration_generator_wrapper(self, gen, val=False):
for g in gen:
self.start_iteration(val=val)
yield g
self.end_iteration(val=val)
def epoch_generator_wrapper(self, gen):
for g in gen:
self.start_epoch()
yield g
self.end_epoch()


@ -0,0 +1,67 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np
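# mixup blends each sample (and its one-hot target) with a randomly permuted sample
# from the same batch, using a coefficient drawn from Beta(alpha, alpha).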
def mixup(alpha, num_classes, data, target):
with torch.no_grad():
bs = data.size(0)
c = np.random.beta(alpha, alpha)
perm = torch.randperm(bs).cuda()
md = c * data + (1 - c) * data[perm, :]
mt = c * target + (1 - c) * target[perm, :]
return md, mt
class MixUpWrapper(object):
def __init__(self, alpha, num_classes, dataloader):
self.alpha = alpha
self.dataloader = dataloader
self.num_classes = num_classes
def mixup_loader(self, loader):
for input, target in loader:
i, t = mixup(self.alpha, self.num_classes, input, target)
yield i, t
def __iter__(self):
return self.mixup_loader(self.dataloader)
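# NLLMultiLabelSmooth: label-smoothed negative log-likelihood against soft (e.g. mixup)
# targets while training; falls back to plain cross-entropy with hard labels in eval mode.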
class NLLMultiLabelSmooth(nn.Module):
def __init__(self, smoothing=0.0):
super(NLLMultiLabelSmooth, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
if self.training:
x = x.float()
target = target.float()
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs * target
nll_loss = nll_loss.sum(-1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
else:
return torch.nn.functional.cross_entropy(x, target)


@ -0,0 +1,411 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import torch
import torch.nn as nn
import numpy as np
__all__ = ["ResNet", "build_resnet", "resnet_versions", "resnet_configs"]
# ResNetBuilder {{{
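# ResNetBuilder constructs conv/batchnorm/activation layers according to the chosen
# version (e.g. cardinality for ResNeXt variants) and config (weight init mode,
# activation, whether the last BN gamma in each block is zero-initialized).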
class ResNetBuilder(object):
def __init__(self, version, config):
self.conv3x3_cardinality = (
1 if "cardinality" not in version.keys() else version["cardinality"]
)
self.config = config
def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
conv = nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
groups=groups,
stride=stride,
padding=int((kernel_size - 1) / 2),
bias=False,
)
if self.config["nonlinearity"] == "relu":
nn.init.kaiming_normal_(
conv.weight,
mode=self.config["conv_init"],
nonlinearity=self.config["nonlinearity"],
)
return conv
def conv3x3(self, in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
c = self.conv(
3, in_planes, out_planes, groups=self.conv3x3_cardinality, stride=stride
)
return c
def conv1x1(self, in_planes, out_planes, stride=1):
"""1x1 convolution with padding"""
c = self.conv(1, in_planes, out_planes, stride=stride)
return c
def conv7x7(self, in_planes, out_planes, stride=1):
"""7x7 convolution with padding"""
c = self.conv(7, in_planes, out_planes, stride=stride)
return c
def conv5x5(self, in_planes, out_planes, stride=1):
"""5x5 convolution with padding"""
c = self.conv(5, in_planes, out_planes, stride=stride)
return c
def batchnorm(self, planes, last_bn=False):
bn = nn.BatchNorm2d(planes)
gamma_init_val = 0 if last_bn and self.config["last_bn_0_init"] else 1
nn.init.constant_(bn.weight, gamma_init_val)
nn.init.constant_(bn.bias, 0)
return bn
def activation(self):
return self.config["activation"]()
# ResNetBuilder }}}
# BasicBlock {{{
class BasicBlock(nn.Module):
def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = builder.conv3x3(inplanes, planes, stride)
self.bn1 = builder.batchnorm(planes)
self.relu = builder.activation()
self.conv2 = builder.conv3x3(planes, planes * expansion)
self.bn2 = builder.batchnorm(planes * expansion, last_bn=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
if self.bn1 is not None:
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.bn2 is not None:
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
# BasicBlock }}}
# SqueezeAndExcitation {{{
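# Squeeze-and-Excitation: global average pooling followed by a squeeze FC, ReLU,
# expand FC and sigmoid, producing per-channel scaling factors for the block output.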
class SqueezeAndExcitation(nn.Module):
def __init__(self, planes, squeeze):
super(SqueezeAndExcitation, self).__init__()
self.squeeze = nn.Linear(planes, squeeze)
self.expand = nn.Linear(squeeze, planes)
self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = torch.mean(x.view(x.size(0), x.size(1), -1), 2)
out = self.squeeze(out)
out = self.relu(out)
out = self.expand(out)
out = self.sigmoid(out)
out = out.unsqueeze(2).unsqueeze(3)
return out
# }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
se=False,
se_squeeze=16,
downsample=None,
):
super(Bottleneck, self).__init__()
self.conv1 = builder.conv1x1(inplanes, planes)
self.bn1 = builder.batchnorm(planes)
self.conv2 = builder.conv3x3(planes, planes, stride=stride)
self.bn2 = builder.batchnorm(planes)
self.conv3 = builder.conv1x1(planes, planes * expansion)
self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
self.relu = builder.activation()
self.downsample = downsample
self.stride = stride
self.squeeze = (
SqueezeAndExcitation(planes * expansion, se_squeeze) if se else None
)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.squeeze is None:
out += residual
else:
out = torch.addcmul(residual, 1.0, out, self.squeeze(out))
out = self.relu(out)
return out
def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
return Bottleneck(
builder,
inplanes,
planes,
expansion,
stride=stride,
se=True,
se_squeeze=16,
downsample=downsample,
)
# Bottleneck }}}
# ResNet {{{
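# ResNet assembles the standard stem (7x7 conv, BN, ReLU, max pool), four stages of
# residual blocks with the given widths/depths, adaptive average pooling and a final
# fully connected classifier.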
class ResNet(nn.Module):
def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = builder.conv7x7(3, 64, stride=2)
self.bn1 = builder.batchnorm(64)
self.relu = builder.activation()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
self.layer2 = self._make_layer(
builder, block, expansion, widths[1], layers[1], stride=2
)
self.layer3 = self._make_layer(
builder, block, expansion, widths[2], layers[2], stride=2
)
self.layer4 = self._make_layer(
builder, block, expansion, widths[3], layers[3], stride=2
)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(widths[3] * expansion, num_classes)
def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * expansion:
dconv = builder.conv1x1(self.inplanes, planes * expansion, stride=stride)
dbn = builder.batchnorm(planes * expansion)
if dbn is not None:
downsample = nn.Sequential(dconv, dbn)
else:
downsample = dconv
layers = []
layers.append(
block(
builder,
self.inplanes,
planes,
expansion,
stride=stride,
downsample=downsample,
)
)
self.inplanes = planes * expansion
for i in range(1, blocks):
layers.append(block(builder, self.inplanes, planes, expansion))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
if self.bn1 is not None:
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
# ResNet }}}
resnet_configs = {
"classic": {
"conv": nn.Conv2d,
"conv_init": "fan_out",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"fanin": {
"conv": nn.Conv2d,
"conv_init": "fan_in",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"grp-fanin": {
"conv": nn.Conv2d,
"conv_init": "fan_in",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"grp-fanout": {
"conv": nn.Conv2d,
"conv_init": "fan_out",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
}
resnet_versions = {
"resnet18": {
"net": ResNet,
"block": BasicBlock,
"layers": [2, 2, 2, 2],
"widths": [64, 128, 256, 512],
"expansion": 1,
"num_classes": 1000,
},
"resnet34": {
"net": ResNet,
"block": BasicBlock,
"layers": [3, 4, 6, 3],
"widths": [64, 128, 256, 512],
"expansion": 1,
"num_classes": 1000,
},
"resnet50": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 4, 6, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnet101": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 4, 23, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnet152": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 8, 36, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnext101-32x4d": {
"net": ResNet,
"block": Bottleneck,
"cardinality": 32,
"layers": [3, 4, 23, 3],
"widths": [128, 256, 512, 1024],
"expansion": 2,
"num_classes": 1000,
},
"se-resnext101-32x4d": {
"net": ResNet,
"block": SEBottleneck,
"cardinality": 32,
"layers": [3, 4, 23, 3],
"widths": [128, 256, 512, 1024],
"expansion": 2,
"num_classes": 1000,
},
}
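# build_resnet looks up the architecture (layers, widths, block type) and the weight
# initialization config by name and instantiates the corresponding ResNet.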
def build_resnet(version, config, verbose=True):
version = resnet_versions[version]
config = resnet_configs[config]
builder = ResNetBuilder(version, config)
if verbose:
print("Version: {}".format(version))
print("Config: {}".format(config))
model = version["net"](
builder,
version["block"],
version["expansion"],
version["layers"],
version["widths"],
version["num_classes"],
)
return model


@ -0,0 +1,40 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class LabelSmoothing(nn.Module):
"""
NLL loss with label smoothing.
"""
def __init__(self, smoothing=0.0):
"""
Constructor for the LabelSmoothing module.
:param smoothing: label smoothing factor
"""
super(LabelSmoothing, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
nll_loss = nll_loss.squeeze(1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()


@ -0,0 +1,745 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import logger as log
from . import resnet as models
from . import utils
import dllogger
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
DIRECTORY_TO_WATCH = "/usr/share"
checkpoint_file_name = "checkpoint_backup.pth.tar"
from multiprocessing import Value
from ctypes import c_bool
from azureml.core.run import Run
run = Run.get_context()
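# Preemption handling: a watchdog observer on DIRECTORY_TO_WATCH flips a shared flag
# when a file named "to-be-preempted" appears, so the training and validation loops can
# save a checkpoint and exit cleanly.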
class PreemptHandler(FileSystemEventHandler):
def __init__(self):
super(PreemptHandler, self).__init__()
self.is_preempted = Value(c_bool, False)
def on_any_event(self, event):
if not event.is_directory and event.src_path.endswith("/to-be-preempted"):
print(datetime.utcnow(), "Detected Preempt Signal, should stop and return.")
self.is_preempted.value = True
class PreemptDetector:
def __init__(self):
self.observer = Observer()
self.event_handler = PreemptHandler()
def run(self):
self.observer.schedule(self.event_handler, DIRECTORY_TO_WATCH, recursive=False)
self.observer.start()
def is_preempted(self):
return self.event_handler.is_preempted.value == True
def stop(self):
self.observer.stop()
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"format": ":.5f"}
class ModelAndLoss(nn.Module):
def __init__(self, arch, loss, pretrained_weights=None, cuda=True, fp16=False):
super(ModelAndLoss, self).__init__()
self.arch = arch
print("=> creating model '{}'".format(arch))
model = models.build_resnet(arch[0], arch[1])
if pretrained_weights is not None:
print("=> using pre-trained model from a file '{}'".format(arch))
model.load_state_dict(pretrained_weights)
if cuda:
model = model.cuda()
if fp16:
model = network_to_half(model)
# define loss function (criterion) and optimizer
criterion = loss()
if cuda:
criterion = criterion.cuda()
self.model = model
self.loss = criterion
def forward(self, data, target):
output = self.model(data)
loss = self.loss(output, target)
return loss, output
def distributed(self):
self.model = DDP(self.model)
def load_model_state(self, state):
if state is not None:
self.model.load_state_dict(state)
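# get_optimizer builds SGD, optionally excluding batch-norm parameters from weight
# decay; with fp16 enabled the optimizer is wrapped in apex's FP16_Optimizer for loss
# scaling.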
def get_optimizer(
parameters,
fp16,
lr,
momentum,
weight_decay,
nesterov=False,
state=None,
static_loss_scale=1.0,
dynamic_loss_scale=False,
bn_weight_decay=False,
):
if bn_weight_decay:
print(" ! Weight decay applied to BN parameters ")
optimizer = torch.optim.SGD(
[v for n, v in parameters],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov,
)
else:
print(" ! Weight decay NOT applied to BN parameters ")
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
print(len(bn_params))
print(len(rest_params))
optimizer = torch.optim.SGD(
[
{"params": bn_params, "weight_decay": 0},
{"params": rest_params, "weight_decay": weight_decay},
],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov,
)
if fp16:
optimizer = FP16_Optimizer(
optimizer,
static_loss_scale=static_loss_scale,
dynamic_loss_scale=dynamic_loss_scale,
verbose=False,
)
if state is not None:
optimizer.load_state_dict(state)
return optimizer
def lr_policy(lr_fn, logger=None):
if logger is not None:
logger.register_metric(
"lr", log.LR_METER(), verbosity=dllogger.Verbosity.VERBOSE
)
def _alr(optimizer, iteration, epoch):
lr = lr_fn(iteration, epoch)
if logger is not None:
logger.log_metric("lr", lr)
for param_group in optimizer.param_groups:
param_group["lr"] = lr
return lr
return _alr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
lr = base_lr
for s in steps:
if epoch >= s:
lr *= decay_factor
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = base_lr * (1 - (e / es))
return lr
return lr_policy(_lr_fn, logger=logger)
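# Cosine schedule: linear warmup for `warmup_length` epochs, then
# lr = 0.5 * (1 + cos(pi * e / es)) * base_lr, where e counts epochs after warmup.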
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_exponential_policy(
base_lr, warmup_length, epochs, final_multiplier=0.001, logger=None
):
es = epochs - warmup_length
epoch_decay = np.power(2, np.log2(final_multiplier) / es)
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
lr = base_lr * (epoch_decay**e)
return lr
return lr_policy(_lr_fn, logger=logger)
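# get_train_step returns a closure performing one forward/backward pass. Gradients are
# accumulated across `batch_size_multiplier` iterations (scaled before the optimizer
# step), and the backward path dispatches to apex FP16, AMP loss scaling, or plain FP32.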
def get_train_step(
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
):
def _step(input, target, optimizer_step=True):
input_var = Variable(input)
target_var = Variable(target)
loss, output = model_and_loss(input_var, target_var)
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
else:
reduced_loss = loss.data
if fp16:
optimizer.backward(loss)
elif use_amp:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if optimizer_step:
opt = (
optimizer.optimizer
if isinstance(optimizer, FP16_Optimizer)
else optimizer
)
for param_group in opt.param_groups:
for param in param_group["params"]:
param.grad /= batch_size_multiplier
optimizer.step()
optimizer.zero_grad()
torch.cuda.synchronize()
return reduced_loss
return _step
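# train runs one epoch: registers metrics on the first epoch, logs loss/throughput to
# dllogger, TensorBoard and the AzureML run, and exits early if a preemption signal is
# detected.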
def train(
train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
detector,
use_amp=False,
prof=-1,
batch_size_multiplier=1,
register_metrics=True,
total_train_step=0,
writer=None,
):
print(f"training...")
print(f"register_metrics {register_metrics}, logger {logger}.")
if register_metrics and logger is not None:
logger.register_metric(
"train.loss",
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA,
)
logger.register_metric(
"train.compute_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA,
)
logger.register_metric(
"train.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
"train.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"train.compute_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
step = get_train_step(
model_and_loss,
optimizer,
fp16,
use_amp=use_amp,
batch_size_multiplier=batch_size_multiplier,
)
model_and_loss.train()
end = time.time()
optimizer.zero_grad()
last_train_step = total_train_step
data_iter = enumerate(train_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
for i, (input, target) in data_iter:
bs = input.size(0)
lr = lr_scheduler(optimizer, i, epoch)
data_time = time.time() - end
optimizer_step = ((i + 1) % batch_size_multiplier) == 0
loss = step(input, target, optimizer_step=optimizer_step)
it_time = time.time() - end
if optimizer_step:
if writer:
writer.add_scalar("train/summary/scalar/learning_rate", lr, epoch)
writer.add_scalar(
"train/summary/scalar/loss", to_python_float(loss), total_train_step
)
writer.add_scalar(
"perf/summary/scalar/compute_ips",
calc_ips(bs, it_time - data_time),
total_train_step,
)
writer.add_scalar(
"perf/summary/scalar/train_total_ips",
calc_ips(bs, it_time),
total_train_step,
)
run.log_row("train/learning_rate", x=epoch, y=lr)
run.log_row("train/loss", x=total_train_step, y=to_python_float(loss))
run.log_row(
"perf/compute_ips",
x=total_train_step,
y=calc_ips(bs, it_time - data_time),
)
run.log_row(
"perf/train_total_ips", x=total_train_step, y=calc_ips(bs, it_time)
)
total_train_step += 1
if logger is not None:
logger.log_metric("train.loss", to_python_float(loss), bs)
logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
logger.log_metric("train.total_ips", calc_ips(bs, it_time))
logger.log_metric("train.data_time", data_time)
logger.log_metric("train.compute_time", it_time - data_time)
end = time.time()
if writer:
writer.flush()
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit training loop detecting is_preempted changed to True",
)
return last_train_step
return total_train_step
def get_val_step(model_and_loss):
def _step(input, target):
input_var = Variable(input)
target_var = Variable(target)
with torch.no_grad():
loss, output = model_and_loss(input_var, target_var)
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
prec1 = utils.reduce_tensor(prec1)
prec5 = utils.reduce_tensor(prec5)
else:
reduced_loss = loss.data
torch.cuda.synchronize()
return reduced_loss, prec1, prec5
return _step
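# validate runs one pass over the validation set, recording top-1/top-5 accuracy, loss
# and latency metrics; it also breaks out early on preemption and returns the top-1
# meter together with the mean validation loss.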
def validate(
val_loader,
model_and_loss,
fp16,
logger,
epoch,
detector,
prof=-1,
register_metrics=True,
):
print(f"validating...")
print(f"register_metrics {register_metrics}, logger {logger}.")
if register_metrics and logger is not None:
logger.register_metric(
"val.top1",
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA,
)
logger.register_metric(
"val.top5",
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA,
)
logger.register_metric(
"val.loss",
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA,
)
logger.register_metric(
"val.compute_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA,
)
logger.register_metric(
"val.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
"val.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at100",
log.LAT_100(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at99",
log.LAT_99(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at95",
log.LAT_95(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
step = get_val_step(model_and_loss)
top1 = log.AverageMeter()
# switch to evaluate mode
model_and_loss.eval()
end = time.time()
data_iter = enumerate(val_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
loss_sum = 0.0
total_val_step = 0
for i, (input, target) in data_iter:
bs = input.size(0)
data_time = time.time() - end
loss, prec1, prec5 = step(input, target)
it_time = time.time() - end
top1.record(to_python_float(prec1), bs)
if logger is not None:
logger.log_metric("val.top1", to_python_float(prec1), bs)
logger.log_metric("val.top5", to_python_float(prec5), bs)
logger.log_metric("val.loss", to_python_float(loss), bs)
logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
logger.log_metric("val.total_ips", calc_ips(bs, it_time))
logger.log_metric("val.data_time", data_time)
logger.log_metric("val.compute_latency", it_time - data_time)
logger.log_metric("val.compute_latency_at95", it_time - data_time)
logger.log_metric("val.compute_latency_at99", it_time - data_time)
logger.log_metric("val.compute_latency_at100", it_time - data_time)
loss_sum += to_python_float(loss)
total_val_step += 1
end = time.time()
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit validation loop detecting is_preempted changed to True",
)
break
return [top1, loss_sum / total_val_step]
# Train loop {{{
def calc_ips(batch_size, time):
world_size = (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
tbs = world_size * batch_size
return tbs / time
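# train_loop drives the whole run: rank 0 creates a TensorBoard writer, each epoch is
# trained and validated, and a checkpoint is saved every `save_checkpoint_epochs`
# epochs or immediately when the preemption detector fires.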
def train_loop(
model_and_loss,
optimizer,
lr_scheduler,
train_loader,
val_loader,
epochs,
fp16,
logger,
should_backup_checkpoint,
save_checkpoint_epochs,
use_amp=False,
batch_size_multiplier=1,
best_prec1=0,
start_epoch=0,
prof=-1,
skip_training=False,
skip_validation=False,
save_checkpoints=True,
checkpoint_dir="./",
total_train_step=0,
):
is_first_rank = (
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
)
if is_first_rank:
ts = str(time.time())
# logdir = os.path.expanduser('~/tensorboard/{}/logs/'.format(os.environ['DLTS_JOB_ID']) + ts)
logdir = os.path.expanduser(
"~/tensorboard/{}/logs/".format(os.environ["AZ_BATCH_JOB_ID"]) + ts
)
print("tensorboard at ", logdir)
if not os.path.exists(logdir):
os.makedirs(logdir)
writer = SummaryWriter(log_dir=logdir)
else:
writer = None
prec1 = -1
detector = PreemptDetector()
detector.run()
epoch_iter = range(start_epoch, epochs)
for epoch in epoch_iter:
world_size = (
torch.distributed.get_world_size()
if torch.distributed.is_initialized()
else 1
)
if writer:
writer.add_scalar("train/summary/scalar/world_size", world_size, epoch)
run.log_row("train/world_size", x=epoch, y=world_size)
if logger is not None:
logger.start_epoch()
if not skip_training:
total_train_step = train(
train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
detector,
use_amp=use_amp,
prof=prof,
register_metrics=epoch == start_epoch,
batch_size_multiplier=batch_size_multiplier,
total_train_step=total_train_step,
writer=writer,
)
if not skip_validation and not detector.is_preempted():
top1, val_loss = validate(
val_loader,
model_and_loss,
fp16,
logger,
epoch,
detector,
prof=prof,
register_metrics=epoch == start_epoch,
)
if not detector.is_preempted():
prec1, nimg = top1.get_val()
if writer:
writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
writer.add_scalar("val/summary/scalar/prec1", prec1, epoch)
run.log_row("val/loss", x=epoch, y=val_loss)
run.log_row("val/prec1", x=epoch, y=prec1)
if logger is not None:
print(
"Epoch ", epoch, " complete with is_preempted ", detector.is_preempted()
)
logger.end_epoch()
save_ckpt = is_first_rank and (
detector.is_preempted() or (epoch + 1) % save_checkpoint_epochs == 0
)
if detector.is_preempted() and start_epoch == epoch:
print(
"Skipping save checkpoint since no complete epoch finishes till now. ",
start_epoch,
"-->",
epoch,
)
save_ckpt = False
print(f"save ckpt {save_ckpt}, ckpt dir {checkpoint_dir}.")
if save_ckpt:
if not skip_validation and not detector.is_preempted():
is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
best_prec1 = max(
logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
)
else:
is_best = False
best_prec1 = 0
ckpt_epoch_index = epoch + 1 if not detector.is_preempted() else epoch
utils.save_checkpoint(
{
"epoch": ckpt_epoch_index,
"arch": model_and_loss.arch,
"state_dict": model_and_loss.model.state_dict(),
"best_prec1": best_prec1,
"optimizer": optimizer.state_dict(),
"total_train_step": total_train_step,
},
is_best,
checkpoint_dir=checkpoint_dir,
backup_filename=checkpoint_file_name,
)
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit epoch loop detecting is_preempted changed to True, save_ckpt:",
save_ckpt,
)
break
if writer:
writer.close()
detector.stop()
print(
datetime.utcnow(), "Training exits with is_preempted: ", detector.is_preempted()
)
# }}}


@ -0,0 +1,121 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import numpy as np
import torch
import shutil
import torch.distributed as dist
def should_backup_checkpoint(args):
def _sbc(epoch):
return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
return _sbc
import time
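# save_checkpoint writes the state dict on rank 0, copies it to model_best.pth.tar when
# it is the best so far, and also copies it to a backup file used for resuming after
# preemption.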
def save_checkpoint(
state,
is_best,
filename="checkpoint.pth.tar",
checkpoint_dir="./",
backup_filename=None,
):
if (not torch.distributed.is_initialized()) or torch.distributed.get_rank() == 0:
start_time = time.time()
# filename = os.path.join('/tmp/', filename)
filename = os.path.join(checkpoint_dir, filename)
print(f"filename {filename}, ckpt dir {checkpoint_dir}")
torch.save(state, filename)
elapsed_time = time.time() - start_time
# print("save checkpoint time on local /tmp ", elapsed_time)
if is_best:
start_time = time.time()
shutil.copyfile(
filename, os.path.join(checkpoint_dir, "model_best.pth.tar")
)
elapsed_time = time.time() - start_time
print("save best checkpoint time (copy to blob) ", elapsed_time)
if backup_filename is not None:
start_time = time.time()
shutil.copyfile(filename, os.path.join(checkpoint_dir, backup_filename))
elapsed_time = time.time() - start_time
print("save checkpoint time (copy to blob) ", elapsed_time)
def timed_generator(gen):
start = time.time()
for g in gen:
end = time.time()
t = end - start
yield g, t
start = time.time()
def timed_function(f):
def _timed_function(*args, **kwargs):
start = time.time()
ret = f(*args, **kwargs)
return ret, time.time() - start
return _timed_function
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
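# reduce_tensor sums a tensor across all ranks and divides by the world size, giving
# the average value for distributed metric reporting.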
def reduce_tensor(tensor):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
return rt
def first_n(n, generator):
for i, d in zip(range(n), generator):
yield d


@ -0,0 +1,603 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import os
import shutil
import time
import random
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
import image_classification.resnet as models
import image_classification.logger as log
from image_classification.smoothing import LabelSmoothing
from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
from image_classification.dataloaders import *
from image_classification.training import *
from image_classification.utils import *
import dllogger
import torch.multiprocessing as mp
import os
import os.path as op
import re
from datetime import datetime
import sys
# cluster aware logic start
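# get_master_ip parses the worker-0 IP exported in the DLTS init.env file; the launcher
# code below actually derives the master address from AZ_BATCHAI_MPI_MASTER_NODE instead.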
def get_master_ip():
regexp = "[\s\S]*export[\s]*DLTS_SD_worker0_IP=([0-9.]+)[\s|s]*"
with open("/dlts-runtime/env/init.env", "r") as f:
line = f.read()
match = re.match(regexp, line)
if match:
ip = str(match.group(1))
print("master node ip is " + ip)
return ip
else:
raise ValueError("did not find master node ip")
# cluster aware logic end
checkpoint_file_name = "checkpoint_backup.pth.tar"
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument("--data", metavar="DIR", help="path to dataset")
parser.add_argument(
"--data-backend",
metavar="BACKEND",
default="dali-cpu",
choices=DATA_BACKEND_CHOICES,
help="data backend: "
+ " | ".join(DATA_BACKEND_CHOICES)
+ " (default: dali-cpu)",
)
parser.add_argument(
"--arch",
"-a",
metavar="ARCH",
default="resnet50",
choices=model_names,
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
)
parser.add_argument(
"--model-config",
"-c",
metavar="CONF",
default="classic",
choices=model_configs,
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
)
parser.add_argument(
"-j",
"--workers",
default=5,
type=int,
metavar="N",
help="number of data loading workers (default: 5)",
)
parser.add_argument(
"--epochs",
default=90,
type=int,
metavar="N",
help="number of total epochs to run",
)
parser.add_argument(
"-b",
"--batch-size",
default=256,
type=int,
metavar="N",
help="mini-batch size (default: 256) per gpu",
)
parser.add_argument(
"--optimizer-batch-size",
default=-1,
type=int,
metavar="N",
help="size of a total batch size, for simulating bigger batches using gradient accumulation",
)
parser.add_argument(
"--lr",
"--learning-rate",
default=0.1,
type=float,
metavar="LR",
help="initial learning rate",
)
parser.add_argument(
"--lr-schedule",
default="step",
type=str,
metavar="SCHEDULE",
choices=["step", "linear", "cosine"],
help="Type of LR schedule: {}, {}, {}".format("step", "linear", "cosine"),
)
parser.add_argument(
"--warmup", default=0, type=int, metavar="E", help="number of warmup epochs"
)
parser.add_argument(
"--label-smoothing",
default=0.0,
type=float,
metavar="S",
help="label smoothing",
)
parser.add_argument(
"--mixup", default=0.0, type=float, metavar="ALPHA", help="mixup alpha"
)
parser.add_argument(
"--momentum", default=0.9, type=float, metavar="M", help="momentum"
)
parser.add_argument(
"--weight-decay",
"--wd",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
)
parser.add_argument(
"--bn-weight-decay",
action="store_true",
help="use weight_decay on batch normalization learnable parameters, (default: false)",
)
parser.add_argument(
"--nesterov",
action="store_true",
help="use nesterov momentum, (default: false)",
)
parser.add_argument(
"--print-freq",
"-p",
default=10,
type=int,
metavar="N",
help="print frequency (default: 10)",
)
parser.add_argument(
"--resume",
default="",
type=str,
metavar="PATH",
help="path to latest checkpoint (default: none)",
)
parser.add_argument(
"--pretrained-weights",
default="",
type=str,
metavar="PATH",
help="load weights from here",
)
parser.add_argument("--fp16", action="store_true", help="Run model fp16 mode.")
parser.add_argument(
"--static-loss-scale",
type=float,
default=1,
help="Static loss scale, positive power of 2 values can improve fp16 convergence.",
)
parser.add_argument(
"--dynamic-loss-scale",
action="store_true",
help="Use dynamic loss scaling. If supplied, this argument supersedes "
+ "--static-loss-scale.",
)
parser.add_argument(
"--prof", type=int, default=-1, metavar="N", help="Run only N iterations"
)
parser.add_argument(
"--amp",
action="store_true",
help="Run model AMP (automatic mixed precision) mode.",
)
parser.add_argument(
"--seed", default=None, type=int, help="random seed used for numpy and pytorch"
)
parser.add_argument(
"--gather-checkpoints",
action="store_true",
help="Gather checkpoints throughout the training, without this flag only best and last checkpoints will be stored",
)
parser.add_argument(
"--raport-file",
default="experiment_raport.json",
type=str,
help="file in which to store JSON experiment raport",
)
parser.add_argument(
"--evaluate", action="store_true", help="evaluate checkpoint/model"
)
parser.add_argument("--training-only", action="store_true", help="do not evaluate")
parser.add_argument(
"--no-checkpoints",
action="store_false",
dest="save_checkpoints",
help="do not store any checkpoints, useful for benchmarking",
)
parser.add_argument(
"--workspace",
type=str,
default="./",
metavar="DIR",
help="path to directory where checkpoints will be stored",
)
parser.add_argument(
"--save-checkpoint-epochs",
default=10,
type=int,
metavar="N",
help="how many epochs run between saving checkpoints",
)
parser.add_argument(
"--log_redirect", action="store_true", help="Redirect log to files."
)
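# main is executed once per GPU via mp.spawn: it computes the process rank, initializes
# the NCCL process group, resumes from a backup checkpoint if present, builds the data
# loaders, model, optimizer and LR schedule, and then calls train_loop.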
def main(gpu_index, args):
if args.log_redirect:
sys.stdout = open(
"./outputs_"
+ str(args.rank * args.ngpus_per_node + gpu_index)
+ str(time.time()),
"w",
)
exp_start_time = time.time()
global best_prec1
best_prec1 = 0
args.distributed = False
args.gpu = 0
args.local_rank = gpu_index
args.distributed = args.world_size > 1
if args.distributed:
args.gpu = args.local_rank % torch.cuda.device_count()
print("using gpu ", args.gpu)
torch.cuda.set_device(args.gpu)
args.rank = args.rank * args.ngpus_per_node + gpu_index
dist.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank,
)
if args.amp and args.fp16:
print("Please use only one of the --fp16/--amp flags")
exit(1)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
np.random.seed(seed=args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
def _worker_init_fn(id):
np.random.seed(seed=args.seed + args.local_rank + id)
random.seed(args.seed + args.local_rank + id)
else:
def _worker_init_fn(id):
pass
if args.fp16:
assert (
torch.backends.cudnn.enabled
), "fp16 mode requires cudnn backend to be enabled."
if args.static_loss_scale != 1.0:
if not args.fp16:
print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")
if args.optimizer_batch_size < 0:
batch_size_multiplier = 1
else:
tbs = args.world_size * args.batch_size
if args.optimizer_batch_size % tbs != 0:
print(
"Warning: simulated batch size {} is not divisible by actual batch size {}".format(
args.optimizer_batch_size, tbs
)
)
batch_size_multiplier = int(round(args.optimizer_batch_size / tbs))
print("BSM: {}".format(batch_size_multiplier))
print("Real effective batch size is: ", batch_size_multiplier * tbs)
pretrained_weights = None
if args.pretrained_weights:
if os.path.isfile(args.pretrained_weights):
print(
"=> loading pretrained weights from '{}'".format(
args.pretrained_weights
)
)
pretrained_weights = torch.load(args.pretrained_weights)
else:
print("=> no pretrained weights found at '{}'".format(args.resume))
start_epoch = 0
args.total_train_step = 0
# check previous saved checkpoint first
# if there is none, then resume from user specified checkpoint if there is
target_ckpt_path = args.workspace + "/" + checkpoint_file_name
ckpt_path = target_ckpt_path
if not os.path.isfile(ckpt_path):
print("=> no checkpoint found at '{}'".format(ckpt_path))
ckpt_path = args.resume
# optionally resume from a checkpoint
if ckpt_path:
if os.path.isfile(ckpt_path):
print("=> loading checkpoint '{}'".format(ckpt_path))
checkpoint = torch.load(
ckpt_path, map_location=lambda storage, loc: storage.cuda(args.gpu)
)
start_epoch = checkpoint["epoch"]
best_prec1 = checkpoint["best_prec1"]
model_state = checkpoint["state_dict"]
optimizer_state = checkpoint["optimizer"]
args.total_train_step = checkpoint["total_train_step"]
print(
"=> loaded checkpoint '{}' (epoch {})".format(
ckpt_path, checkpoint["epoch"]
)
)
else:
print("=> no checkpoint found at '{}'".format(ckpt_path))
model_state = None
optimizer_state = None
else:
model_state = None
optimizer_state = None
loss = nn.CrossEntropyLoss
if args.mixup > 0.0:
loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
elif args.label_smoothing > 0.0:
loss = lambda: LabelSmoothing(args.label_smoothing)
model_and_loss = ModelAndLoss(
(args.arch, args.model_config),
loss,
pretrained_weights=pretrained_weights,
cuda=True,
fp16=args.fp16,
)
# Create data loaders and optimizers as needed
if args.data_backend == "pytorch":
get_train_loader = get_pytorch_train_loader
get_val_loader = get_pytorch_val_loader
elif args.data_backend == "dali-gpu":
get_train_loader = get_dali_train_loader(dali_cpu=False)
get_val_loader = get_dali_val_loader()
elif args.data_backend == "dali-cpu":
get_train_loader = get_dali_train_loader(dali_cpu=True)
get_val_loader = get_dali_val_loader()
elif args.data_backend == "syntetic":
get_val_loader = get_syntetic_loader
get_train_loader = get_syntetic_loader
train_loader, train_loader_len = get_train_loader(
args.data,
args.batch_size,
1000,
args.mixup > 0.0,
workers=args.workers,
fp16=args.fp16,
)
if args.mixup != 0.0:
train_loader = MixUpWrapper(args.mixup, 1000, train_loader)
val_loader, val_loader_len = get_val_loader(
args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16
)
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
logger = log.Logger(
args.print_freq,
[
dllogger.StdOutBackend(
dllogger.Verbosity.DEFAULT, step_format=log.format_step
),
dllogger.JSONStreamBackend(
dllogger.Verbosity.VERBOSE,
os.path.join(args.workspace, args.raport_file),
),
],
last_epoch=start_epoch - 1,
)
else:
logger = log.Logger(args.print_freq, [], last_epoch=start_epoch - 1)
logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
optimizer = get_optimizer(
list(model_and_loss.model.named_parameters()),
args.fp16,
args.lr,
args.momentum,
args.weight_decay,
nesterov=args.nesterov,
bn_weight_decay=args.bn_weight_decay,
state=optimizer_state,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale,
)
if args.lr_schedule == "step":
lr_policy = lr_step_policy(
args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger
)
elif args.lr_schedule == "cosine":
lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger)
elif args.lr_schedule == "linear":
lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
if args.amp:
model_and_loss, optimizer = amp.initialize(
model_and_loss,
optimizer,
opt_level="O2",
loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
)
if args.distributed:
model_and_loss.distributed()
model_and_loss.load_model_state(model_state)
train_loop(
model_and_loss,
optimizer,
lr_policy,
train_loader,
val_loader,
args.epochs,
args.fp16,
logger,
should_backup_checkpoint(args),
args.save_checkpoint_epochs,
use_amp=args.amp,
batch_size_multiplier=batch_size_multiplier,
start_epoch=start_epoch,
best_prec1=best_prec1,
prof=args.prof,
skip_training=args.evaluate,
skip_validation=args.training_only,
save_checkpoints=args.save_checkpoints and not args.evaluate,
checkpoint_dir=args.workspace,
total_train_step=args.total_train_step,
)
exp_duration = time.time() - exp_start_time
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
logger.end()
print("Experiment ended")
sys.stdout.flush()
if __name__ == "__main__":
# print(f'os env: {os.environ}')
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
import socket
print("started training scripts on ", socket.gethostname())
args = parser.parse_args()
args.world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) # node count
args.rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) # node world rank
print(f"world size {args.world_size}, rank {args.rank}")
import os
if not os.path.exists(args.workspace) and args.rank == 0:
print("workspace ", args.workspace, " does not exist, creating one.")
os.makedirs(args.workspace)
# override the master node ip by intention
# args.dist_url = 'tcp://' + get_master_ip() + ':23456'
# extract master ip from os env as a workaround
args.dist_url = "tcp://" + os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + ":23456"
ngpus_per_node = torch.cuda.device_count()
args.distributed = args.world_size > 1
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
args.ngpus_per_node = ngpus_per_node
print(f"world size {args.world_size}, ngpus per node {ngpus_per_node}.")
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(main, nprocs=ngpus_per_node, args=(args,))
# notify DLTS to collect the std output asap.
log_collect_hook = "/var/log/compute/00_stdout.txt.exit"
if os.path.isfile(log_collect_hook):
open(log_collect_hook, "w").close()


@ -0,0 +1,214 @@
# From PyTorch:
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import sys
import subprocess
import os
import socket
import time
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(
description="PyTorch distributed training launch "
"helper utilty that will spawn up "
"multiple distributed processes"
)
# Optional arguments for the launch helper
parser.add_argument(
"--nnodes",
type=int,
default=1,
help="The number of nodes to use for distributed " "training",
)
parser.add_argument(
"--node_rank",
type=int,
default=0,
help="The rank of the node for multi-node distributed " "training",
)
parser.add_argument(
"--nproc_per_node",
type=int,
default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.",
)
parser.add_argument(
"--master_addr",
default="127.0.0.1",
type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1",
)
parser.add_argument(
"--master_port",
default=29500,
type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training",
)
# positional
parser.add_argument(
"training_script",
type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script",
)
# rest from the training program
parser.add_argument("training_script_args", nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
current_env["LOCAL_RANK"] = str(local_rank)
# spawn the processes
cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
print(cmd)
stdout = (
None if local_rank == 0 else open("GPU_" + str(local_rank) + ".log", "w")
)
process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
processes.append(process)
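# monitor the spawned workers: keep polling until every process has exited,
# and terminate all remaining workers (then exit non-zero) as soon as any of them fails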
try:
up = True
error = False
while up and not error:
up = False
for p in processes:
ret = p.poll()
if ret is None:
up = True
elif ret != 0:
error = True
time.sleep(1)
if error:
for p in processes:
if p.poll() is None:
p.terminate()
exit(1)
except KeyboardInterrupt:
for p in processes:
p.terminate()
raise
except SystemExit:
for p in processes:
p.terminate()
raise
except:
for p in processes:
p.terminate()
raise
if __name__ == "__main__":
main()

Просмотреть файл

@ -0,0 +1 @@
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

Просмотреть файл

@ -0,0 +1,688 @@
# ResNet50 v1.5 For PyTorch
This repository provides a script and recipe to train the ResNet50 model to
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [DALI](#dali)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Example plots](#example-plots)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training time for 90 epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is that, in the bottleneck blocks which require
downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
The model is initialized as described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)
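Schematically, the stride placement difference described above can be sketched as follows (a simplified illustration, not the repository's actual implementation):

```python
import torch.nn as nn

def bottleneck_convs(in_ch, mid_ch, out_ch, stride, v1_5=True):
    # v1:   downsampling stride sits on the first 1x1 convolution
    # v1.5: downsampling stride moves to the 3x3 convolution
    s1, s2 = (1, stride) if v1_5 else (stride, 1)
    return nn.Sequential(
        nn.Conv2d(in_ch, mid_ch, kernel_size=1, stride=s1, bias=False),
        nn.BatchNorm2d(mid_ch),
        nn.ReLU(inplace=True),
        nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=s2, padding=1, bias=False),
        nn.BatchNorm2d(mid_ch),
        nn.ReLU(inplace=True),
        nn.Conv2d(mid_ch, out_ch, kernel_size=1, bias=False),
        nn.BatchNorm2d(out_ch),
    )
```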
### Default configuration
The following sections highlight the default configurations for the ResNet50 model.
#### Optimizer
This model uses SGD with momentum as the optimizer, with the following hyperparameters:
* Momentum (0.875)
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes we linearly scale the learning rate.
* Learning rate schedule - we use cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first couple of epochs
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
Warmup length depends on the total training length.
* Weight decay (WD) = 3.0517578125e-05 (1/32768).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label smoothing = 0.1
* We train for:
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
* 90 Epochs -> 90 epochs is a standard for ImageNet networks
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
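A minimal sketch of how this recipe maps onto plain PyTorch (not the repository's exact code; the module-type check used to exclude Batch Norm parameters from weight decay, and the 8-epoch warmup for a 90-epoch run, are illustrative assumptions):

```python
import math
import torch
import torch.nn as nn

def build_optimizer_and_schedule(model, base_lr=0.256, momentum=0.875,
                                 weight_decay=3.0517578125e-05,
                                 warmup_epochs=8, total_epochs=90):
    # split parameters so that weight decay is not applied to Batch Norm gamma/beta
    bn_params, other_params = [], []
    for module in model.modules():
        bucket = bn_params if isinstance(module, nn.modules.batchnorm._BatchNorm) else other_params
        bucket.extend(p for p in module.parameters(recurse=False) if p.requires_grad)

    optimizer = torch.optim.SGD(
        [{"params": bn_params, "weight_decay": 0.0},
         {"params": other_params, "weight_decay": weight_decay}],
        lr=base_lr, momentum=momentum)

    # linear warmup for the first epochs, then cosine decay towards zero
    def lr_factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs
        progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_factor)
    return optimizer, scheduler
```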
#### Data augmentation
This model uses the following data augmentation:
* For training:
* Normalization
* Random resized crop to 224x224
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
* For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
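For the PyTorch data backend, the above roughly corresponds to the following torchvision transforms (a sketch; the ImageNet mean/std values and the exact composition are assumptions, and the DALI backends implement the equivalent pipeline on their own):

```python
import torchvision.transforms as transforms

# commonly used ImageNet channel statistics (assumed here for illustration)
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# training: random resized crop (scale 8%-100%, aspect ratio 3/4-4/3) + horizontal flip
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# inference: resize, then center crop to 224x224
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
```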
#### Other training recipes
This script does not target any specific benchmark.
There are changes that others have made which can speed up convergence and/or increase accuracy.
One of the more popular training recipes is provided by [fast.ai](https://github.com/fastai/imagenet-fast).
The fast.ai recipe introduces many changes to the training procedure, one of which is progressive resizing of the training images.
The first part of training uses 128px images, the middle part uses 224px images, and the last part uses 288px images.
The final validation is performed on 288px images.
The training script in this repository performs validation on 224px images, just as the original paper describes.
These two approaches can't be directly compared, since the fast.ai recipe requires validation on 288px images,
and this recipe keeps the original assumption that validation is done on 224px images.
Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
### Feature support matrix
The following features are supported by this model:
| Feature | ResNet50 |
|-----------------------|--------------------------|
|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes |
|[APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
#### Features
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
### DALI
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when the CPU becomes a bottleneck.
DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
For DGX1 we recommend `--data-backend dali-cpu`, for DGX2 we recommend `--data-backend dali-gpu`.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using the scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walkthrough of AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
## Setup
The following section lists the requirements that you need to meet in order to start training the ResNet50 model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Classification/
```
### 2. Download and preprocess the dataset.
The ResNet50 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs, therefore, preprocessing/augmentation is not needed.
1. [Download the images](http://image-net.org/download-images).
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
### 3. Build the RN50v1.5 PyTorch NGC container.
```
docker build . -t nvidia_rn50
```
### 4. Start an interactive session in the NGC container to run training/inference.
```
nvidia-docker run --rm -it -v <path to imagenet>:/data/imagenet --ipc=host nvidia_rn50
```
### 5. Start training
To run training for a standard configuration (DGX1V/DGX2V, FP16/FP32, 50/90/250 Epochs),
run one of the scripts in the `./resnet50v1.5/training` directory
called `./resnet50v1.5/training/{DGX1, DGX2}_RN50_{AMP, FP16, FP32}_{50,90,250}E.sh`.
Ensure ImageNet is mounted in the `/data/imagenet` directory.
Example:
`bash ./resnet50v1.5/training/DGX1_RN50_FP16_250E.sh <path where to store checkpoints and logs>`
### 6. Start inference
To run inference on ImageNet on a checkpointed model, run:
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
To run a non-standard configuration use:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
### Command-line options:
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py -h`
```
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
[--model-config CONF] [-j N] [--epochs N] [-b N]
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
[--momentum M] [--weight-decay W] [--bn-weight-decay]
[--nesterov] [--print-freq N] [--resume PATH]
[--pretrained-weights PATH] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
[--training-only] [--no-checkpoints] [--workspace DIR]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--data-backend BACKEND
data backend: pytorch | syntetic | dali-gpu | dali-cpu
(default: dali-cpu)
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
resnet101 | resnet152 | resnet50 | se-
resnet50 (default: resnet50)
--model-config CONF, -c CONF
model configs: classic | fanin | grp-fanin | grp-
fanout(default: classic)
-j N, --workers N number of data loading workers (default: 5)
--epochs N number of total epochs to run
-b N, --batch-size N mini-batch size (default: 256) per gpu
--optimizer-batch-size N
size of a total batch size, for simulating bigger
batches using gradient accumulation
--lr LR, --learning-rate LR
initial learning rate
--lr-schedule SCHEDULE
Type of LR schedule: step, linear, cosine
--warmup E number of warmup epochs
--label-smoothing S label smoothing
--mixup ALPHA mixup alpha
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--bn-weight-decay use weight_decay on batch normalization learnable
parameters, (default: false)
--nesterov use nesterov momentum, (default: false)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
--pretrained-weights PATH
load weights from here
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can
improve fp16 convergence.
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
supersedes --static-loss-scale.
--prof N Run only N iterations
--amp Run model AMP (automatic mixed precision) mode.
--local_rank LOCAL_RANK
Local rank of python process. Set up by distributed
launcher
--seed SEED random seed used for numpy and pytorch
--gather-checkpoints Gather checkpoints throughout the training, without
this flag only best and last checkpoints will be
stored
--raport-file RAPORT_FILE
file in which to store JSON experiment raport
--evaluate evaluate checkpoint/model
--training-only do not evaluate
--no-checkpoints do not store any checkpoints, useful for benchmarking
--workspace DIR path to directory where checkpoints will be stored
```
### Dataset guidelines
To use your own dataset, divide it in directories as in the following scheme:
- Training images - `train/<class id>/<image>`
- Validation images - `val/<class id>/<image>`
If your dataset has a number of classes different from 1000, you need to add a custom config
in the `image_classification/resnet.py` file.
```python
resnet_versions = {
...
'resnet50-custom' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 4, 6, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : <custom number of classes>,
}
}
```
After adding the config, run the training script with the `--arch resnet50-custom` flag.
### Training process
All the results of the training will be stored in the directory specified with the `--workspace` argument.
The script will store:
- the most recent checkpoint - `checkpoint.pth.tar` (unless the `--no-checkpoints` flag is used).
- the checkpoint with the best validation accuracy - `model_best.pth.tar` (unless the `--no-checkpoints` flag is used).
- the JSON log - in the file specified with the `--raport-file` flag.
Metrics gathered through training:
- `train.loss` - training loss
- `train.total_ips` - training speed measured in images/second
- `train.compute_ips` - training speed measured in images/second, not counting data loading
- `train.data_time` - time spent on waiting on data
- `train.compute_time` - time spent in forward/backward pass
### Inference process
Validation is done every epoch, and can also be run separately on a checkpointed model.
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
Metrics gathered during evaluation:
- `val.loss` - validation loss
- `val.top1` - validation top1 accuracy
- `val.top5` - validation top5 accuracy
- `val.total_ips` - inference speed measured in images/second
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
- `val.data_time` - time spent on waiting on data
- `val.compute_time` - time spent on inference
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
Example output:
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
#### Inference performance benchmark
To benchmark inference, run:
* FP32
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Results
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
| 90 | 77.23 +/- 0.04 | 77.08 +/- 0.08 |
| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 75.81 +/- 0.08 | 76.04 +/- 0.05 |
| 90 | 77.10 +/- 0.06 | 77.23 +/- 0.04 |
| 250 | 78.59 +/- 0.13 | 78.46 +/- 0.03 |
##### Example plots
The following images show a 250 epochs configuration on a DGX-1V.
![ValidationLoss](./img/loss_plot.png)
![ValidationTop1](./img/top1_plot.png)
![ValidationTop5](./img/top5_plot.png)
#### Training performance results
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 893.09 img/s | 380.44 img/s | 2.35x | 1.00x | 1.00x |
| 8 | 6888.75 img/s | 2945.37 img/s | 2.34x | 7.71x | 7.74x |
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 849.63 img/s | 373.93 img/s | 2.27x | 1.00x | 1.00x |
| 8 | 6614.15 img/s | 2911.22 img/s | 2.27x | 7.78x | 7.79x |
##### Training performance: NVIDIA DGX2 (16x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 894.41 img/s | 402.23 img/s | 2.22x | 1.00x | 1.00x |
| 16 | 13443.82 img/s | 6263.41 img/s | 2.15x | 15.03x | 15.57x |
#### Training time for 90 epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 95 h |
| 8 | ~ 7 h | ~ 14 h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 90 h |
| 16 | ~ 5 h | ~ 8 h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 136.82 img/s | 7.12ms | 7.25ms | 8.36ms | 10.92ms |
| 2 | 266.86 img/s | 7.27ms | 7.41ms | 7.85ms | 9.11ms |
| 4 | 521.76 img/s | 7.44ms | 7.58ms | 8.14ms | 10.09ms |
| 8 | 766.22 img/s | 10.18ms | 10.46ms | 10.97ms | 12.75ms |
| 16 | 976.36 img/s | 15.79ms | 15.88ms | 15.95ms | 16.63ms |
| 32 | 1092.27 img/s | 28.63ms | 28.71ms | 28.76ms | 29.30ms |
| 64 | 1161.55 img/s | 53.69ms | 53.86ms | 53.90ms | 54.23ms |
| 128 | 1209.12 img/s | 104.24ms | 104.68ms | 104.80ms | 105.00ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 114.97 img/s | 8.56ms | 9.32ms | 11.43ms | 12.79ms |
| 2 | 238.70 img/s | 8.20ms | 8.75ms | 9.49ms | 12.31ms |
| 4 | 448.69 img/s | 8.67ms | 9.20ms | 9.97ms | 10.60ms |
| 8 | 875.00 img/s | 8.88ms | 9.31ms | 9.80ms | 10.82ms |
| 16 | 1746.07 img/s | 8.89ms | 9.05ms | 9.56ms | 12.81ms |
| 32 | 2004.28 img/s | 14.07ms | 14.14ms | 14.31ms | 14.92ms |
| 64 | 2254.60 img/s | 25.93ms | 26.05ms | 26.07ms | 26.17ms |
| 128 | 2360.14 img/s | 50.14ms | 50.28ms | 50.34ms | 50.68ms |
| 256 | 2342.13 img/s | 96.74ms | 96.91ms | 96.99ms | 97.14ms |
##### Inference performance: NVIDIA T4
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 179.85 img/s | 5.51ms | 5.65ms | 7.34ms | 10.97ms |
| 2 | 348.12 img/s | 5.67ms | 5.95ms | 6.33ms | 9.81ms |
| 4 | 556.27 img/s | 7.03ms | 7.34ms | 8.13ms | 9.65ms |
| 8 | 740.43 img/s | 10.32ms | 10.33ms | 10.60ms | 13.87ms |
| 16 | 909.17 img/s | 17.19ms | 17.15ms | 18.13ms | 21.06ms |
| 32 | 999.07 img/s | 31.07ms | 31.12ms | 31.17ms | 32.41ms |
| 64 | 1090.47 img/s | 57.62ms | 57.84ms | 57.91ms | 58.05ms |
| 128 | 1142.46 img/s | 110.94ms | 111.15ms | 111.23ms | 112.16ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 163.78 img/s | 6.05ms | 5.92ms | 7.98ms | 11.58ms |
| 2 | 333.43 img/s | 5.91ms | 6.05ms | 6.63ms | 11.52ms |
| 4 | 645.45 img/s | 6.04ms | 6.33ms | 7.01ms | 8.90ms |
| 8 | 1164.15 img/s | 6.73ms | 7.31ms | 8.04ms | 12.41ms |
| 16 | 1606.42 img/s | 9.53ms | 9.86ms | 10.52ms | 17.01ms |
| 32 | 1857.29 img/s | 15.67ms | 15.61ms | 16.14ms | 18.66ms |
| 64 | 2011.62 img/s | 28.64ms | 28.69ms | 28.82ms | 31.06ms |
| 128 | 2083.90 img/s | 54.87ms | 54.96ms | 54.99ms | 55.27ms |
| 256 | 2043.72 img/s | 106.51ms | 106.62ms | 106.68ms | 107.03ms |
## Release notes
### Changelog
1. September 2018
* Initial release
2. January 2019
* Added options Label Smoothing, fan-in initialization, skipping weight decay on batch norm gamma and bias.
3. May 2019
* Cosine LR schedule
* MixUp regularization
* DALI support
* DGX2 configurations
* gradients accumulation
4. July 2019
* DALI-CPU dataloader
* Updated README
### Known issues
There are no known issues with this model.

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

Просмотреть файл

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules==0.0.41

Просмотреть файл

@ -0,0 +1,127 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_init_image_transformation
display_name: Init Image Transformation
description: Initialize image transformation.
version: 1
inputs:
resize:
description: Resize the input PIL Image to the given size
type: string
default: True
enum: ['True', 'False']
size:
description: Desired output size
type: integer
default: 256
center_crop:
description: Crops the given PIL Image at the center
type: string
default: False
enum: ['True', 'False']
crop_size:
description: Desired output size of the crop
type: integer
default: 224
pad:
description: Pad the given PIL Image on all sides with the given "pad" value
type: string
default: False
enum: ['True', 'False']
padding:
description: Padding on each border
type: integer
default: 0
color_jitter:
description: Randomly change the brightness, contrast and saturation of an image
type: boolean
default: false
grayscale:
description: Convert image to grayscale
type: boolean
default: false
random_resized_crop:
description: Crop the given PIL Image to random size and aspect ratio
type: string
default: False
enum: ['True', 'False']
random_resized_crop_size:
description: Expected output size of each edge
type: integer
default: 256
random_crop:
description: Crop the given PIL Image at a random location
type: string
default: False
enum: ['True', 'False']
random_crop_size:
description: Desired output size of the crop
type: integer
default: 224
random_horizontal_flip:
description: Horizontally flip the given PIL Image randomly with a given probability
type: boolean
default: false
random_vertical_flip:
description: Vertically flip the given PIL Image randomly with a given probability
type: boolean
default: false
random_rotation:
description: Rotate the image by angle
type: boolean
default: false
random_rotation_degrees:
description: Range of degrees to select from
type: integer
default: 0
random_affine:
description: Random affine transformation of the image keeping center invariant
type: boolean
default: false
random_affine_degrees:
description: Range of degrees to select from
type: integer
default: 0
random_grayscale:
description: Randomly convert image to grayscale with a probability of p (default 0.1)
type: boolean
default: false
random_perspective:
description: Performs Perspective transformation of the given PIL Image randomly with a given probability
type: boolean
default: false
outputs:
output_path:
type: uri_folder
description: Output image transformation
command: >-
python -m azureml.designer.modules.computer_vision.transform.init_image_transformation.init_image_transformation
--resize ${{inputs.resize}}
--size ${{inputs.size}}
--center-crop ${{inputs.center_crop}}
--crop-size ${{inputs.crop_size}}
--pad ${{inputs.pad}}
--padding ${{inputs.padding}}
--color-jitter ${{inputs.color_jitter}}
--grayscale ${{inputs.grayscale}}
--random-resized-crop ${{inputs.random_resized_crop}}
--random-resized-crop-size ${{inputs.random_resized_crop_size}}
--random-crop ${{inputs.random_crop}}
--random-crop-size ${{inputs.random_crop_size}}
--random-horizontal-flip ${{inputs.random_horizontal_flip}}
--random-vertical-flip ${{inputs.random_vertical_flip}}
--random-rotation ${{inputs.random_rotation}}
--random-rotation-degrees ${{inputs.random_rotation_degrees}}
--random-affine ${{inputs.random_affine}}
--random-affine-degrees ${{inputs.random_affine_degrees}}
--random-grayscale ${{inputs.random_grayscale}}
--random-perspective ${{inputs.random_perspective}}
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

Просмотреть файл

@ -0,0 +1,95 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
# <inputs_and_outputs>
inputs:
training_image: # using local data, will create an anonymous data asset
type: uri_folder
path: ./data/train
validation_image:
type: uri_folder
path: ./data/val
# </inputs_and_outputs>
# <jobs>
settings:
default_datastore: azureml:workspaceblobstore
default_compute: azureml:cpu-cluster
continue_on_step_failure: false
jobs:
convert_training_image:
type: command
component: file:./convert_to_image_directory/entry.spec.yaml
inputs:
input_path: ${{parent.inputs.training_image}}
convert_evaluation_image:
type: command
component: file:./convert_to_image_directory/entry.spec.yaml
inputs:
input_path: ${{parent.inputs.validation_image}}
init_transformation:
type: command
component: file:./init_image_transformation/entry.spec.yaml
inputs:
resize: "False"
size: 256
center_crop: 224
pad: "False"
padding: 0
color_jitter: "False"
grayscale: "False"
random_resized_crop: "False"
random_resized_crop_size: 256
random_crop: "False"
random_crop_size: 224
random_horizontal_flip: "True"
random_vertical_flip: "True"
random_rotation: "False"
random_rotation_degrees: 0
random_affine: "False"
random_affine_degrees: 0
random_grayscale: "False"
random_perspective: "False"
transform_on_training_image:
type: command
component: file:./apply_image_transformation/entry.spec.yaml
inputs:
mode: "For training"
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
input_image_dir_path: ${{parent.jobs.convert_training_image.outputs.output_path}}
transform_on_evaluation_image:
type: command
component: file:./apply_image_transformation/entry.spec.yaml
inputs:
mode: "For inference"
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
input_image_dir_path: ${{parent.jobs.convert_evaluation_image.outputs.output_path}}
train:
type: command
component: file:./image_cnn_train/entry.spec.yaml
compute: azureml:gpu-cluster
inputs:
train_data: ${{parent.jobs.transform_on_training_image.outputs.output_path}}
valid_data: ${{parent.jobs.transform_on_evaluation_image.outputs.output_path}}
data_backend: "pytorch"
epochs: 4
seed: 123
batch_size: 16
save_checkpoint_epochs: 2
outputs:
workspace:
type: uri_folder
mode: upload
distribution:
type: mpi
process_count_per_instance: 1
resources:
instance_count: 2
# </jobs>

Просмотреть файл

@ -0,0 +1,15 @@
---
page_type: sample
languages:
- azurecli
- python
products:
- azure-machine-learning
description: This sample shows how to use a distributed job on an Azure ML compute cluster. It uses the cifar-10 dataset to process data, train a model and then evaluate the output model.
---
# Submit pipeline job
This example shows how to build a three-step pipeline. You need to use a GPU SKU, or a powerful CPU SKU like `STANDARD_D15_V2`, for the train and eval steps in this pipeline.
Please change `process_count_per_instance` to the number of GPU cards you have so that you fully utilize your compute resources.
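The pipeline job can be submitted from this directory with `az ml job create --file pipeline.yml`, which is the same command the `cli/run-pipeline-jobs.sh` test script runs in CI.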

Просмотреть файл

@ -0,0 +1,56 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: cifar-10-pipeline-example
experiment_name: cifar-10-pipeline-example
jobs:
get-data:
type: command
command: >-
wget https://azuremlexamples.blob.core.windows.net/datasets/cifar-10-python.tar.gz;
tar -xvzf cifar-10-python.tar.gz -C ${{outputs.cifar}};
rm cifar-10-python.tar.gz;
compute: azureml:gpu-cluster
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
outputs:
cifar:
type: uri_folder
mode: upload
train-model:
type: command
command: >-
python main.py
--data-dir ${{inputs.cifar}}
--epochs ${{inputs.epochs}}
--model-dir ${{outputs.model_dir}}
code: src/train-model
inputs:
epochs: 1
cifar: ${{parent.jobs.get-data.outputs.cifar}}
outputs:
model_dir:
type: uri_folder
mode: upload
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
compute: azureml:gpu-cluster
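# 2 instances x 1 process per instance -> 2 workers in total (the training script reads WORLD_SIZE/RANK/LOCAL_RANK set by this PyTorch distribution)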
distribution:
type: pytorch
process_count_per_instance: 1
resources:
instance_count: 2
eval-model:
type: command
command: >-
python main.py
--data-dir ${{inputs.cifar}}
--model-dir ${{inputs.model_dir}}/model
code: src/eval-model
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
compute: azureml:gpu-cluster
distribution:
type: pytorch
process_count_per_instance: 2
resources:
instance_count: 1
inputs:
cifar: ${{parent.jobs.get-data.outputs.cifar}}
model_dir: ${{parent.jobs.train-model.outputs.model_dir}}

Просмотреть файл

@ -0,0 +1,147 @@
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
# BSD 3-Clause License
#
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
# ==============================================================================
# imports
import os
import mlflow
import argparse
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# TODO - add mlflow logging
# define functions
def evaluate(test_loader, model, device):
classes = (
"plane",
"car",
"bird",
"cat",
"deer",
"dog",
"frog",
"horse",
"ship",
"truck",
)
model.eval()
correct = 0
total = 0
class_correct = list(0.0 for i in range(10))
class_total = list(0.0 for i in range(10))
with torch.no_grad():
for data in test_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
c = (predicted == labels).squeeze()
for i in range(labels.size(0)):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
# print total test set accuracy
print(
"Accuracy of the network on the 10000 test images: %d %%"
% (100 * correct / total)
)
# print test accuracy for each of the classes
for i in range(10):
print(
"Accuracy of %5s : %2d %%"
% (classes[i], 100 * class_correct[i] / class_total[i])
)
def main(args):
# get PyTorch environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
distributed = world_size > 1
# set device
if distributed and torch.cuda.is_available():
device = torch.device("cuda", local_rank)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize distributed process group using default env:// method
if distributed:
torch.distributed.init_process_group(
backend="nccl" if torch.cuda.is_available() else "gloo"
)
# define test dataset DataLoaders
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
test_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=False, download=False, transform=transform
)
test_loader = torch.utils.data.DataLoader(
test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers
)
# load model
model = mlflow.pytorch.load_model(args.model_dir)
model = model.to(device)
# evaluate on full test dataset
if not distributed or rank == 0:
evaluate(test_loader, model, device)
def parse_args():
# setup argparse
parser = argparse.ArgumentParser()
# add arguments
parser.add_argument(
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
)
parser.add_argument(
"--model-dir", type=str, default="./", help="input directory for model"
)
parser.add_argument(
"--batch-size",
default=16,
type=int,
help="mini batch size for each gpu/process",
)
parser.add_argument(
"--workers",
default=2,
type=int,
help="number of data loading workers for each gpu/process",
)
# parse args
args = parser.parse_args()
# return args
return args
# run script
if __name__ == "__main__":
# parse args
args = parse_args()
# call main function
main(args)

Просмотреть файл

@ -0,0 +1,199 @@
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
# BSD 3-Clause License
#
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
# ==============================================================================
# imports
import os
import mlflow
import argparse
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# TODO - add mlflow logging
# define network architecture
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 32, 3)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
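# with 32x32 CIFAR-10 inputs, three 3x3 convs (no padding) and two 2x2 max-pools
# leave a 6x6x128 feature map, hence the 128 * 6 * 6 input features of fc1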
self.fc1 = nn.Linear(128 * 6 * 6, 120)
self.dropout = nn.Dropout(p=0.2)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(F.relu(self.conv2(x)))
x = self.pool(F.relu(self.conv3(x)))
x = x.view(-1, 128 * 6 * 6)
x = self.dropout(F.relu(self.fc1(x)))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
# define functions
def train(train_loader, model, criterion, optimizer, epoch, device, print_freq, rank):
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % print_freq == 0: # print every print_freq mini-batches
print(
"Rank %d: [%d, %5d] loss: %.3f"
% (rank, epoch + 1, i + 1, running_loss / print_freq)
)
running_loss = 0.0
def main(args):
# get PyTorch environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
distributed = world_size > 1
# set device
if distributed and torch.cuda.is_available():
device = torch.device("cuda", local_rank)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize distributed process group using default env:// method
if distributed:
torch.distributed.init_process_group(
backend="nccl" if torch.cuda.is_available() else "gloo"
)
# define train and dataset DataLoaders
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
train_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=True, download=False, transform=transform
)
if distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=args.batch_size,
shuffle=(train_sampler is None),
num_workers=args.workers,
sampler=train_sampler,
)
model = Net().to(device)
# wrap model with DDP
if distributed and torch.cuda.is_available():
model = nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank
)
# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
model.parameters(), lr=args.learning_rate, momentum=args.momentum
)
# train the model
for epoch in range(args.epochs):
print("Rank %d: Starting epoch %d" % (rank, epoch))
if distributed:
train_sampler.set_epoch(epoch)
model.train()
train(
train_loader,
model,
criterion,
optimizer,
epoch,
device,
args.print_freq,
rank,
)
print("Rank %d: Finished Training" % (rank))
if not distributed or rank == 0:
# log model
mlflow.pytorch.save_model(model, f"{args.model_dir}/model")
def parse_args():
# setup argparse
parser = argparse.ArgumentParser()
# add arguments
parser.add_argument(
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
)
parser.add_argument(
"--model-dir", type=str, default="./", help="output directory for model"
)
parser.add_argument("--epochs", default=10, type=int, help="number of epochs")
parser.add_argument(
"--batch-size",
default=16,
type=int,
help="mini batch size for each gpu/process",
)
parser.add_argument(
"--workers",
default=2,
type=int,
help="number of data loading workers for each gpu/process",
)
parser.add_argument(
"--learning-rate", default=0.001, type=float, help="learning rate"
)
parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
parser.add_argument(
"--print-freq",
default=200,
type=int,
help="frequency of printing training statistics",
)
# parse args
args = parser.parse_args()
# return args
return args
# run script
if __name__ == "__main__":
# parse args
args = parse_args()
# call main function
main(args)

Просмотреть файл

@ -36,7 +36,6 @@ def main(args):
jobs += sorted(
glob.glob("jobs/pipelines-with-components/**/*pipeline*.yml", recursive=True)
)
jobs += sorted(glob.glob("jobs/*/basics/**/*pipeline*.yml", recursive=True))
jobs = [
job.replace(".yml", "")
for job in jobs
@ -290,6 +289,11 @@ def parse_path(path):
def write_job_workflow(job):
filename, project_dir, hyphenated = parse_path(job)
creds = "${{secrets.AZ_CREDS}}"
run_pipeline_job_path = (
"\n - cli/run-pipeline-jobs.sh"
if hyphenated.startswith("jobs-pipelines")
else ""
)
workflow_yaml = f"""name: cli-{hyphenated}
on:
workflow_dispatch:
@ -300,7 +304,7 @@ on:
- main
paths:
- cli/{project_dir}/**
- .github/workflows/cli-{hyphenated}.yml
- .github/workflows/cli-{hyphenated}.yml{run_pipeline_job_path}
- cli/setup.sh
jobs:
build:

Просмотреть файл

@ -37,6 +37,11 @@ pwd
az ml job create --file pipeline.yml
cd ../../../../
cd jobs/pipelines-with-components/basics/3c_pipeline_with_hyperparameter_sweep
pwd
az ml job create --file pipeline.yml
cd ../../../../
cd jobs/pipelines-with-components/basics/4a_local_data_input
pwd
az ml job create --file pipeline.yml
@ -47,12 +52,6 @@ pwd
az ml job create --file pipeline.yml
cd ../../../../
# cd jobs/pipelines-with-components/basics/4c_dataset_input
# pwd
# az ml data create --file data.yml --version $target_version
# az ml job create --file pipeline.yml
# cd ../../../../
cd jobs/pipelines-with-components/basics/4c_web_url_input
pwd
az ml job create --file pipeline.yml
@ -103,9 +102,9 @@ pwd
az ml job create --file pipeline.yml
cd ../../../
# cd jobs/pipelines/cifar-10
# pwd
# az ml job create --file pipeline.yml --web
# cd ../../../
cd jobs/pipelines/cifar-10
pwd
az ml job create --file pipeline.yml --web
cd ../../../
az --version