feat: re-enable image/cifar-10 pipeline tests & update readme.py (#1112)

* feat: test ci

* feat: enable cifar-10 & image classification

* feat: add sample for hyperparameter sweep

* fix: fix ci

* fix: fix ci

* fix ci: update readme.py

* fix: black reformat

* Revert "fix: black reformat"

This reverts commit d47755571f.

* fix: image pipeline yml has been renamed

* fix: update output mode

* fix: update output mode

* change setting to use artifact store and fix typo

Co-authored-by: lochen <cloga0216@gmail.com>
elliotzh 2022-04-08 09:47:34 +08:00 committed by GitHub
Parent 55eb858324
Commit c46a0de2b2
72 changed files with 5102 additions and 40 deletions

View file

@ -1,4 +1,4 @@
name: cli-scripts-run-pipeline-jobs
name: cli-jobs-pipelines-cifar-10-pipeline
on:
workflow_dispatch:
schedule:
@ -7,8 +7,9 @@ on:
branches:
- main
paths:
- cli/jobs/pipelines/cifar-10/**
- .github/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml
- cli/run-pipeline-jobs.sh
- .github/workflows/cli-scripts-run-pipeline-jobs.yml
- cli/setup.sh
jobs:
build:
@ -24,8 +25,6 @@ jobs:
run: bash setup.sh
working-directory: cli
continue-on-error: true
- name: scripts installs
run: sudo apt-get upgrade -y && sudo apt-get install uuid-runtime jq -y
- name: test script script
run: set -e; bash -x run-pipeline-jobs.sh
- name: run job
run: bash -x run-job.sh jobs/pipelines/cifar-10/pipeline.yml
working-directory: cli

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines/nyc-taxi/**
- .github/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/1a_e2e_local_components/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/1b_e2e_registered_components/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/2a_basic_component/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/2b_component_with_input_output/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/3a_basic_pipeline/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/3b_pipeline_with_data/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4a_local_data_input/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/4c_web_url_input/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5a_env_public_docker_image/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5b_env_registered/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/5c_env_conda_file/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6a_tf_hello_world/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/basics/6c_r_iris/**
- .github/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -0,0 +1,30 @@
name: cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline
on:
workflow_dispatch:
schedule:
- cron: "0 0/4 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/pipelines-with-components/image_classification_with_densenet/**
- .github/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZ_CREDS}}
- name: setup
run: bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
run: bash -x run-job.sh jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml
working-directory: cli

View file

@ -9,6 +9,7 @@ on:
paths:
- cli/jobs/pipelines-with-components/nyc_taxi_data_regression/**
- .github/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml
- cli/run-pipeline-jobs.sh
- cli/setup.sh
jobs:
build:

View file

@ -47,7 +47,6 @@ path|status|
[deploy-triton-managed-online-endpoint.sh](deploy-triton-managed-online-endpoint.sh)|[![deploy-triton-managed-online-endpoint](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-triton-managed-online-endpoint/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-triton-managed-online-endpoint.yml)
[misc.sh](misc.sh)|[![misc](https://github.com/Azure/azureml-examples/workflows/cli-scripts-misc/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-misc.yml)
[mlflow-uri.sh](mlflow-uri.sh)|[![mlflow-uri](https://github.com/Azure/azureml-examples/workflows/cli-scripts-mlflow-uri/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-mlflow-uri.yml)
[run-pipeline-jobs.sh](run-pipeline-jobs.sh)|[![run-pipeline-jobs](https://github.com/Azure/azureml-examples/workflows/cli-scripts-run-pipeline-jobs/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-run-pipeline-jobs.yml)
[train-rest.sh](train-rest.sh)|[![train-rest](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train-rest/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train-rest.yml)
[train.sh](train.sh)|[![train](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train.yml)
@ -94,6 +93,7 @@ path|status|description
[jobs/basics/hello-world-output-data.yml](jobs/basics/hello-world-output-data.yml)|[![jobs/basics/hello-world-output-data](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output-data/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output-data.yml)|*no description*
[jobs/basics/hello-world-output.yml](jobs/basics/hello-world-output.yml)|[![jobs/basics/hello-world-output](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output.yml)|*no description*
[jobs/basics/hello-world.yml](jobs/basics/hello-world.yml)|[![jobs/basics/hello-world](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world.yml)|*no description*
[jobs/pipelines/cifar-10/pipeline.yml](jobs/pipelines/cifar-10/pipeline.yml)|[![jobs/pipelines/cifar-10/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-cifar-10-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml)|*no description*
[jobs/pipelines/nyc-taxi/pipeline.yml](jobs/pipelines/nyc-taxi/pipeline.yml)|[![jobs/pipelines/nyc-taxi/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-nyc-taxi-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
@ -110,22 +110,8 @@ path|status|description
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
[jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml](jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml)|[![jobs/pipelines-with-components/image_classification_with_densenet/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml)|*no description*
[jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml](jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml)|[![jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
[jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml](jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml)|[![jobs/pipelines-with-components/basics/2a_basic_component/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml)|"Hello World component example"
[jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml](jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml)|[![jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml)|"Component with inputs and outputs"
[jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml](jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml)|[![jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml)|"Basic Pipeline Job with 3 Hello World components"
[jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml](jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml)|[![jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml](jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4a_local_data_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml)|"Example of using data in a local folder as pipeline input"
[jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml](jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml)|[![jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml)|"Example of using data folder from a Workspace Datastore as pipeline input"
[jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml](jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4c_web_url_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml)|"Example of using a file hosted at a web URL as pipeline input"
[jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml](jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml)|[![jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml](jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml)|[![jobs/pipelines-with-components/basics/5b_env_registered/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml](jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml)|[![jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml)|*no description*
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
**Endpoints** ([endpoints](endpoints))
@ -184,4 +170,3 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
- [Documentation](https://docs.microsoft.com/azure/machine-learning)
- [Private previews](https://github.com/Azure/azureml-previews)

View file

@ -2,12 +2,19 @@ $schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
settings:
default_datestore: azureml:workspaceblobstore
default_datastore: azureml:workspaceartifactstore
default_compute: azureml:cpu-cluster
jobs:
hello_job:
command: echo "hello"
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
command: echo "hello-world" > ${{outputs.world_output}}/world.txt
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
compute: azureml:cpu-cluster
outputs:
world_output:
world_job:
command: echo "world"
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
command: cat ${{inputs.world_input}}/world.txt
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
compute: azureml:cpu-cluster
inputs:
world_input: ${{parent.jobs.hello_job.outputs.world_output}}

View file

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules[pytorch]==0.0.41

View file

@ -0,0 +1,35 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_apply_image_transformation
display_name: Apply Image Transformation
description: Applies an image transformation to an image directory.
version: 0.0.1
inputs:
input_image_transform_path:
description: Input image transformation
type: uri_folder
input_image_dir_path:
description: Input image directory
type: uri_folder
mode:
description: Should exclude 'Random' transform operations in inference but keep them in training
type: string
default: For training
enum: ['For training', 'For inference']
outputs:
output_path:
type: uri_folder
description: Output image directory
command: >-
python -m azureml.designer.modules.computer_vision.transform.apply_image_transformation.apply_image_transformation
--input-image-transform-path ${{inputs.input_image_transform_path}}
--input-image-dir-path ${{inputs.input_image_dir_path}}
--mode "For training"
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

View file

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules==0.0.41

View file

@ -0,0 +1,25 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_convert_to_image_directory
display_name: Convert to Image Directory
description: Convert dataset to image directory format.
version: 1
inputs:
input_path:
type: uri_folder
description: Input dataset
outputs:
output_path:
type: uri_folder
description: Output image directory
command: >-
python -m azureml.designer.modules.computer_vision.preprocess.convert_to_image_directory.convert_to_image_directory
--input-path ${{inputs.input_path}}
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,89 @@
# Convolutional Networks for Image Classification in PyTorch
In this repository you will find implementations of various image classification models.
## Table Of Contents
* [Models](#models)
* [Validation accuracy results](#validation-accuracy-results)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Model comparison](#model-comparison)
* [Accuracy vs FLOPS](#accuracy-vs-flops)
* [Latency vs Throughput on different batch sizes](#latency-vs-throughput-on-different-batch-sizes)
## Models
The following table provides links to where you can find additional information on each model:
| **Model** | **Link**|
|:-:|:-:|
| resnet50 | [README](./resnet50v1.5/README.md) |
| resnext101-32x4d | [README](./resnext101-32x4d/README.md) |
| se-resnext101-32x4d | [README](./se-resnext101-32x4d/README.md) |
## Validation accuracy results
Our results were obtained by running the applicable
training scripts in the [framework-container-name] NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the validation accuracy results of the
three classification models side-by-side.
| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 78.46 | 94.15 | 78.50 | 94.11 |
| resnext101-32x4d | 80.08 | 94.89 | 80.14 | 95.02 |
| se-resnext101-32x4d | 81.01 | 95.52 | 81.12 | 95.54 |
## Training performance results
### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 6888.75 img/s | 2945.37 img/s | 2.34x |
| resnext101-32x4d | 2384.85 img/s | 1116.58 img/s | 2.14x |
| se-resnext101-32x4d | 2031.17 img/s | 977.45 img/s | 2.08x |
### Training performance: NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.
The following table shows the training performance results of the
three classification models side-by-side.
| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 13443.82 img/s | 6263.41 img/s | 2.15x |
| resnext101-32x4d | 4473.37 img/s | 2261.97 img/s | 1.98x |
| se-resnext101-32x4d | 3776.03 img/s | 1953.13 img/s | 1.93x |
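The "Mixed Precision speedup" column in the two tables above is simply the ratio of mixed-precision to FP32 throughput. A quick sketch of the arithmetic for the DGX-1 numbers:

# Speedup = mixed-precision img/s divided by FP32 img/s (DGX-1 rows above)
for arch, amp_ips, fp32_ips in [
    ("resnet50", 6888.75, 2945.37),
    ("resnext101-32x4d", 2384.85, 1116.58),
    ("se-resnext101-32x4d", 2031.17, 977.45),
]:
    print(f"{arch}: {amp_ips / fp32_ips:.2f}x")  # 2.34x, 2.14x, 2.08x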

View file

@ -0,0 +1,42 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def add_parser_arguments(parser):
parser.add_argument(
"--checkpoint-path", metavar="<path>", help="checkpoint filename"
)
parser.add_argument(
"--weight-path", metavar="<path>", help="name of file in which to store weights"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
checkpoint = torch.load(args.checkpoint_path)
model_state_dict = {
k[len("module.1.") :] if "module.1." in k else k: v
for k, v in checkpoint["state_dict"].items()
}
print(f"Loaded {checkpoint['arch']} : {checkpoint['best_prec1']}")
torch.save(model_state_dict, args.weight_path)
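The key-renaming step above strips the "module.1." prefix (presumably left by the wrapper the training script saves checkpoints from) so the remaining weights load into a bare model. A small illustrative sketch with a hypothetical toy state dict:

# Hypothetical toy state dict; same dict comprehension as in the script above
toy_state_dict = {"module.1.conv1.weight": "w", "module.1.fc.bias": "b", "epoch": 90}
renamed = {
    k[len("module.1.") :] if "module.1." in k else k: v
    for k, v in toy_state_dict.items()
}
assert renamed == {"conv1.weight": "w", "fc.bias": "b", "epoch": 90}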

View file

@ -0,0 +1,96 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from PIL import Image
import argparse
import numpy as np
import json
import torch
import torch.backends.cudnn as cudnn
import torchvision.transforms as transforms
import image_classification.resnet as models
from image_classification.dataloaders import load_jpeg_from_file
try:
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument("--image-size", default="224", type=int)
parser.add_argument(
"--arch",
"-a",
metavar="ARCH",
default="resnet50",
choices=model_names,
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
)
parser.add_argument(
"--model-config",
"-c",
metavar="CONF",
default="classic",
choices=model_configs,
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
)
parser.add_argument("--weights", metavar="<path>", help="file with model weights")
parser.add_argument(
"--precision", metavar="PREC", default="FP16", choices=["AMP", "FP16", "FP32"]
)
parser.add_argument("--image", metavar="<path>", help="path to classified image")
def main(args):
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
model = models.build_resnet(args.arch, args.model_config, verbose=False)
if args.weights is not None:
weights = torch.load(args.weights)
model.load_state_dict(weights)
model = model.cuda()
if args.precision == "FP16":
model = network_to_half(model)
model.eval()
with torch.no_grad():
input = load_jpeg_from_file(
args.image, cuda=True, fp16=args.precision != "FP32"
)
output = torch.nn.functional.softmax(model(input), dim=1).cpu().view(-1).numpy()
top5 = np.argsort(output)[-5:][::-1]
print(args.image)
for c, v in zip(imgnet_classes[top5], output[top5]):
print(f"{c}: {100*v:.1f}%")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
main(args)

View file

@ -0,0 +1,21 @@
name: train_environment
channels:
- defaults
- conda-forge
dependencies:
- python=3.8.12
- pip=21.2.2
- pip:
- azure-ml==0.0.58938149
- --extra-index-url https://pypi.org/simple
- --extra-index-url=https://azuremlsdktestpypi.azureedge.net/test-sdk-cli-v2
- git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
- watchdog==0.10.3
- torch==1.8.1
- torchvision==0.9.1
- tensorboard==2.5.0
- pillow==8.2.0
- numpy==1.19.5
- --extra-index-url=https://developer.download.nvidia.com/compute/redist/
- nvidia-dali-cuda100
- azureml-mlflow

View file

@ -0,0 +1,175 @@
from pathlib import Path
import sys
import runpy
import json
import shutil
from multiprocessing.pool import ThreadPool
from multiprocessing import cpu_count
import functools
from enum import Enum
from azure.ml import dsl
from azure.ml.dsl._component import ComponentExecutor
from azure.ml.dsl._types import DataInput, NumberInput
class Data_BackendEnum(Enum):
pytorch = "pytorch"
syntetic = "syntetic"
dali_gpu = "dali-gpu"
dali_cpu = "dali-cpu"
class ArchEnum(Enum):
resnet18 = "resnet18"
resnet34 = "resnet34"
resnet50 = "resnet50"
resnet101 = "resnet101"
resnet152 = "resnet152"
resnext101_32x4d = "resnext101-32x4d"
se_resnext101_32x4d = "se-resnext101-32x4d"
class Model_ConfigEnum(Enum):
classic = "classic"
fanin = "fanin"
grp_fanin = "grp-fanin"
grp_fanout = "grp-fanout"
class Lr_ScheduleEnum(Enum):
step = "step"
linear = "linear"
cosine = "cosine"
def convert_image_directory_to_specific_format(
image_dir_path, output_root, is_train=False
):
# convert image directory to train component input data format
image_dir_path = Path(image_dir_path)
image_list_path = image_dir_path / "images.lst"
output_data_path = output_root / ("train" if is_train else "val")
category_list = []
file_name_list = []
with open(image_list_path, "r") as fin:
for line in fin:
line = json.loads(line)
# print(line)
category_list.append(line["category"])
file_name_list.append(line["image_info"]["file_name"])
(output_data_path / line["category"]).mkdir(parents=True, exist_ok=True)
print(
f"file number {len(file_name_list)}, category number {len(set(category_list))}."
)
def copy_file(index):
target_dir = output_data_path / category_list[index]
shutil.copyfile(
str(image_dir_path / file_name_list[index]),
str(target_dir / Path(file_name_list[index]).name),
)
with ThreadPool(cpu_count()) as p:
p.map(functools.partial(copy_file), range(len(file_name_list)))
print(
f"output path {output_data_path} has {len(list(output_data_path.glob('**/*')))} files."
)
return output_root
@dsl.command_component(
name="imagecnn_train", description="imagecnn_train main function"
)
def main(
train_data: DataInput(description="path to train dataset") = None,
val_data: DataInput(description="path to valid dataset") = None,
data_backend="dali-cpu",
arch="resnet50",
model_config="classic",
workers: int = 5,
epochs: int = 90,
batch_size: int = 256,
optimizer_batch_size: int = -1,
lr: float = 0.1,
lr_schedule="step",
warmup: int = 0,
label_smoothing: float = 0.0,
mixup: float = 0.0,
momentum: float = 0.9,
weight_decay: float = 0.0001,
print_freq: int = 10,
resume="",
pretrained_weights="",
static_loss_scale: float = 1,
prof: int = -1,
seed: int = None,
raport_file="experiment_raport.json",
workspace="./",
save_checkpoint_epochs: int = 10,
):
new_data_path = Path(train_data).parent / "new_dataset"
convert_image_directory_to_specific_format(
image_dir_path=train_data, output_root=new_data_path, is_train=True
)
convert_image_directory_to_specific_format(
image_dir_path=val_data, output_root=new_data_path
)
print(f"new data path {new_data_path}")
sys.argv = [
"main",
"--data",
str(new_data_path),
"--data-backend",
data_backend,
"--arch",
arch,
"--model-config",
model_config,
"-j",
str(workers),
"--epochs",
str(epochs),
"-b",
str(batch_size),
"--optimizer-batch-size",
str(optimizer_batch_size),
"--lr",
str(lr),
"--lr-schedule",
lr_schedule,
"--warmup",
str(warmup),
"--label-smoothing",
str(label_smoothing),
"--mixup",
str(mixup),
"--momentum",
str(momentum),
"--weight-decay",
str(weight_decay),
"--print-freq",
str(print_freq),
"--resume",
str(resume),
"--pretrained-weights",
str(pretrained_weights),
"--static-loss-scale",
str(static_loss_scale),
"--prof",
str(prof),
"--seed",
str(seed),
"--raport-file",
str(raport_file),
"--workspace",
str(workspace),
"--save-checkpoint-epochs",
str(save_checkpoint_epochs),
]
print(" ".join(sys.argv))
runpy.run_path("main.py", run_name="__main__")
if __name__ == "__main__":
ComponentExecutor(main).execute(sys.argv)
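For reference, a minimal sketch (file names and categories are hypothetical) of the images.lst format that convert_image_directory_to_specific_format above expects: one JSON object per line carrying a category label and a relative file path, which the helper rearranges into the train/<category>/ and val/<category>/ layout consumed by the training entry point.

import json
from pathlib import Path

image_dir = Path("sample_image_dir")  # hypothetical input image directory
image_dir.mkdir(parents=True, exist_ok=True)
with open(image_dir / "images.lst", "w") as fout:
    for record in [
        {"category": "cat", "image_info": {"file_name": "images/cat_001.jpg"}},
        {"category": "dog", "image_info": {"file_name": "images/dog_001.jpg"}},
    ]:
        fout.write(json.dumps(record) + "\n")
# With matching files present under sample_image_dir/images/, the helper copies them to
# <output_root>/train/cat/cat_001.jpg and <output_root>/train/dog/dog_001.jpg when is_train=True.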

View file

@ -0,0 +1,147 @@
$schema: https://azuremlschemas.azureedge.net/development/commandComponent.schema.json
type: command
name: train_image_classification
version: 0.0.1
display_name: Train Image Classification
tags: {}
inputs:
train_data:
type: path
description: "path to train dataset"
optional: false
valid_data:
type: path
description: "path to valid dataset"
optional: false
data_backend:
type: string
description: "data backend: pytorch | syntetic | dali-gpu | dali-cpu (default: dali-cpu)"
default: "dali-cpu"
optional: true
arch:
type: string
description: "model architecture: resnet18 | resnet34 | resnet50 | resnet101 | resnet152 | resnext101_32x4d | se_resnext101_32x4d (default: resnet50)"
default: "resnet50"
optional: true
model_config:
type: string
description: "model configs: classic | fanin | grp_fanin | grp_fanout(default: classic)"
default: "classic"
optional: true
workers:
type: integer
description: "number of data loading workers (default: 5)"
default: 5
optional: true
epochs:
type: integer
description: number of total epochs to run
default: 90
optional: true
batch_size:
type: integer
description: "mini-batch size (default: 256) per gpu"
default: 256
optional: true
optimizer_batch_size:
type: integer
description: size of a total batch size, for simulating bigger batches using gradient accumulation
default: -1
optional: true
lr:
type: number
description: initial learning rate
default: 0.1
optional: true
lr_schedule:
type: string
description: "Type of LR schedule: step, linear, cosine"
default: "step"
optional: true
warmup:
type: integer
description: number of warmup epochs
default: 0
optional: true
label_smoothing:
type: number
description: label smoothing
default: 0.0
optional: true
mixup:
type: number
description: mixup alpha
default: 0.0
optional: true
momentum:
type: number
description: momentum
default: 0.9
optional: true
weight_decay:
type: number
description: "weight decay (default: 1e-4)"
default: 0.0001
optional: true
print_freq:
type: integer
description: "print frequency (default: 10)"
default: 10
optional: true
resume:
type: string
description: "path to latest checkpoint (default: none)"
default: ""
optional: true
pretrained_weights:
type: string
description: load weights from here
default: ""
optional: true
static_loss_scale:
type: number
description: Static loss scale, positive power of 2 values can improve fp16 convergence.
default: 1.0
optional: true
prof:
type: integer
description: Run only N iterations
default: -1
optional: true
seed:
type: integer
description: random seed used for numpy and pytorch
default: 123
optional: true
raport_file:
type: string
description: file in which to store JSON experiment raport
default: experiment_raport.json
optional: true
save_checkpoint_epochs:
type: integer
description: how many epochs run between saving checkpoints
default: 2
optional: true
outputs:
workspace:
type: uri_folder
description: path to directory where checkpoints will be stored
code: ./
environment:
image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn7-ubuntu18.04
conda_file: ./conda.yaml
resources:
instance_count: 2
distribution:
type: mpi
process_count_per_instance: 1
command: >-
git clone https://github.com/NVIDIA/apex && cd apex && git checkout 3303b3e7174383312a3468ef390060c26e640cb1 && python setup.py install && cd .. && python entry.py --train_data ${{inputs.train_data}} --val_data ${{inputs.valid_data}} [--data_backend ${{inputs.data_backend}}] [--arch ${{inputs.arch}}] [--model_config ${{inputs.model_config}}] [--workers ${{inputs.workers}}] [--epochs ${{inputs.epochs}}] [--batch_size ${{inputs.batch_size}}] [--optimizer_batch_size ${{inputs.optimizer_batch_size}}] [--lr ${{inputs.lr}}] [--lr_schedule ${{inputs.lr_schedule}}] [--warmup ${{inputs.warmup}}] [--label_smoothing ${{inputs.label_smoothing}}] [--mixup ${{inputs.mixup}}] [--momentum ${{inputs.momentum}}] [--weight_decay ${{inputs.weight_decay}}] [--print_freq ${{inputs.print_freq}}] [--resume ${{inputs.resume}}] [--pretrained_weights ${{inputs.pretrained_weights}}] [--static_loss_scale ${{inputs.static_loss_scale}}] [--prof ${{inputs.prof}}] [--seed ${{inputs.seed}}] [--raport_file ${{inputs.raport_file}}] [--save_checkpoint_epochs ${{inputs.save_checkpoint_epochs}}] --workspace ${{outputs.workspace}}

View file

@ -0,0 +1,20 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import logger
from . import dataloaders
from . import training
from . import utils
from . import mixup
from . import resnet
from . import smoothing

View file

@ -0,0 +1,489 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import torch
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
DATA_BACKEND_CHOICES = ["pytorch", "syntetic"]
try:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
DATA_BACKEND_CHOICES.append("dali-gpu")
DATA_BACKEND_CHOICES.append("dali-cpu")
except ImportError:
print(
"Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
)
def load_jpeg_from_file(path, cuda=True, fp16=False):
img_transforms = transforms.Compose(
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
)
img = img_transforms(Image.open(path))
with torch.no_grad():
# mean and std are not multiplied by 255 as they are in training script
# torch dataloader reads data into bytes whereas loading directly
# through PIL creates a tensor with floats in [0,1] range
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
if cuda:
mean = mean.cuda()
std = std.cuda()
img = img.cuda()
if fp16:
mean = mean.half()
std = std.half()
img = img.half()
else:
img = img.float()
input = img.unsqueeze(0).sub_(mean).div_(std)
return input
class HybridTrainPipe(Pipeline):
def __init__(
self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False
):
super(HybridTrainPipe, self).__init__(
batch_size, num_threads, device_id, seed=12 + device_id
)
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root=data_dir,
shard_id=rank,
num_shards=world_size,
random_shuffle=True,
)
if dali_cpu:
dali_device = "cpu"
self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
else:
dali_device = "gpu"
# This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
# without additional reallocations
self.decode = ops.ImageDecoder(
device="mixed",
output_type=types.RGB,
device_memory_padding=211025920,
host_memory_padding=140544512,
)
self.res = ops.RandomResizedCrop(
device=dali_device,
size=[crop, crop],
interp_type=types.INTERP_LINEAR,
random_aspect_ratio=[0.75, 4.0 / 3.0],
random_area=[0.08, 1.0],
num_attempts=100,
)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=types.FLOAT,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
)
self.coin = ops.CoinFlip(probability=0.5)
def define_graph(self):
rng = self.coin()
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.res(images)
output = self.cmnp(images.gpu(), mirror=rng)
return [output, self.labels]
class HybridValPipe(Pipeline):
def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
super(HybridValPipe, self).__init__(
batch_size, num_threads, device_id, seed=12 + device_id
)
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
self.input = ops.FileReader(
file_root=data_dir,
shard_id=rank,
num_shards=world_size,
random_shuffle=False,
)
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
self.res = ops.Resize(device="gpu", resize_shorter=size)
self.cmnp = ops.CropMirrorNormalize(
device="gpu",
output_dtype=types.FLOAT,
output_layout=types.NCHW,
crop=(crop, crop),
image_type=types.RGB,
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
)
def define_graph(self):
self.jpegs, self.labels = self.input(name="Reader")
images = self.decode(self.jpegs)
images = self.res(images)
output = self.cmnp(images)
return [output, self.labels]
class DALIWrapper(object):
def gen_wrapper(dalipipeline, num_classes, one_hot):
for data in dalipipeline:
input = data[0]["data"]
target = torch.reshape(data[0]["label"], [-1]).cuda().long()
if one_hot:
target = expand(num_classes, torch.float, target)
yield input, target
dalipipeline.reset()
def __init__(self, dalipipeline, num_classes, one_hot):
self.dalipipeline = dalipipeline
self.num_classes = num_classes
self.one_hot = one_hot
def __iter__(self):
return DALIWrapper.gen_wrapper(
self.dalipipeline, self.num_classes, self.one_hot
)
def get_dali_train_loader(dali_cpu=False):
def gdtl(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
traindir = os.path.join(data_path, "train")
pipe = HybridTrainPipe(
batch_size=batch_size,
num_threads=workers,
device_id=rank % torch.cuda.device_count(),
data_dir=traindir,
crop=224,
dali_cpu=dali_cpu,
)
pipe.build()
train_loader = DALIClassificationIterator(
pipe, size=int(pipe.epoch_size("Reader") / world_size)
)
return DALIWrapper(train_loader, num_classes, one_hot), int(
pipe.epoch_size("Reader") / (world_size * batch_size)
)
return gdtl
def get_dali_val_loader():
def gdvl(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
if torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
else:
rank = 0
world_size = 1
valdir = os.path.join(data_path, "val")
pipe = HybridValPipe(
batch_size=batch_size,
num_threads=workers,
device_id=rank % torch.cuda.device_count(),
data_dir=valdir,
crop=224,
size=256,
)
pipe.build()
val_loader = DALIClassificationIterator(
pipe, size=int(pipe.epoch_size("Reader") / world_size)
)
return DALIWrapper(val_loader, num_classes, one_hot), int(
pipe.epoch_size("Reader") / (world_size * batch_size)
)
return gdvl
def fast_collate(batch):
imgs = [img[0] for img in batch]
targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
w = imgs[0].size[0]
h = imgs[0].size[1]
tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
for i, img in enumerate(imgs):
nump_array = np.asarray(img, dtype=np.uint8)
tens = torch.from_numpy(nump_array)
if nump_array.ndim < 3:
nump_array = np.expand_dims(nump_array, axis=-1)
nump_array = np.rollaxis(nump_array, 2)
tensor[i] += torch.from_numpy(nump_array)
return tensor, targets
def expand(num_classes, dtype, tensor):
e = torch.zeros(
tensor.size(0), num_classes, dtype=dtype, device=torch.device("cuda")
)
e = e.scatter(1, tensor.unsqueeze(1), 1.0)
return e
class PrefetchedWrapper(object):
def prefetched_loader(loader, num_classes, fp16, one_hot):
mean = (
torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
.cuda()
.view(1, 3, 1, 1)
)
std = (
torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
.cuda()
.view(1, 3, 1, 1)
)
if fp16:
mean = mean.half()
std = std.half()
stream = torch.cuda.Stream()
first = True
for next_input, next_target in loader:
with torch.cuda.stream(stream):
next_input = next_input.cuda(non_blocking=True)
next_target = next_target.cuda(non_blocking=True)
if fp16:
next_input = next_input.half()
if one_hot:
next_target = expand(num_classes, torch.half, next_target)
else:
next_input = next_input.float()
if one_hot:
next_target = expand(num_classes, torch.float, next_target)
next_input = next_input.sub_(mean).div_(std)
if not first:
yield input, target
else:
first = False
torch.cuda.current_stream().wait_stream(stream)
input = next_input
target = next_target
yield input, target
def __init__(self, dataloader, num_classes, fp16, one_hot):
self.dataloader = dataloader
self.fp16 = fp16
self.epoch = 0
self.one_hot = one_hot
self.num_classes = num_classes
def __iter__(self):
if self.dataloader.sampler is not None and isinstance(
self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler
):
self.dataloader.sampler.set_epoch(self.epoch)
self.epoch += 1
return PrefetchedWrapper.prefetched_loader(
self.dataloader, self.num_classes, self.fp16, self.one_hot
)
def get_pytorch_train_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
traindir = os.path.join(data_path, "train")
train_dataset = datasets.ImageFolder(
traindir,
transforms.Compose(
[
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
]
),
)
if torch.distributed.is_initialized():
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=(train_sampler is None),
num_workers=workers,
worker_init_fn=_worker_init_fn,
pin_memory=True,
sampler=train_sampler,
collate_fn=fast_collate,
drop_last=True,
)
return PrefetchedWrapper(train_loader, num_classes, fp16, one_hot), len(
train_loader
)
def get_pytorch_val_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=5,
_worker_init_fn=None,
fp16=False,
):
valdir = os.path.join(data_path, "val")
val_dataset = datasets.ImageFolder(
valdir,
transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
]
),
)
if torch.distributed.is_initialized():
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
else:
val_sampler = None
val_loader = torch.utils.data.DataLoader(
val_dataset,
sampler=val_sampler,
batch_size=batch_size,
shuffle=False,
num_workers=workers,
worker_init_fn=_worker_init_fn,
pin_memory=True,
collate_fn=fast_collate,
)
return PrefetchedWrapper(val_loader, num_classes, fp16, one_hot), len(val_loader)
class SynteticDataLoader(object):
def __init__(
self, fp16, batch_size, num_classes, num_channels, height, width, one_hot
):
input_data = (
torch.empty(batch_size, num_channels, height, width).cuda().normal_(0, 1.0)
)
if one_hot:
input_target = torch.empty(batch_size, num_classes).cuda()
input_target[:, 0] = 1.0
else:
input_target = torch.randint(0, num_classes, (batch_size,))
input_target = input_target.cuda()
if fp16:
input_data = input_data.half()
self.input_data = input_data
self.input_target = input_target
def __iter__(self):
while True:
yield self.input_data, self.input_target
def get_syntetic_loader(
data_path,
batch_size,
num_classes,
one_hot,
workers=None,
_worker_init_fn=None,
fp16=False,
):
return SynteticDataLoader(fp16, batch_size, 1000, 3, 224, 224, one_hot), -1
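A minimal consumption sketch for the loader factories above, assuming an ImageNet-style folder with train/ and val/ subdirectories and an available GPU: each factory returns an iterable wrapper plus the number of batches per epoch, and the wrapper yields already-normalized CUDA tensors.

# Hypothetical data path; get_pytorch_train_loader is defined above
train_loader, steps_per_epoch = get_pytorch_train_loader(
    "./data", batch_size=32, num_classes=1000, one_hot=False
)
print(f"{steps_per_epoch} batches per epoch")
for input, target in train_loader:
    # input: (32, 3, 224, 224) float CUDA tensor normalized with the ImageNet mean/std
    # target: (32,) int64 CUDA tensor of class indices
    break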

View file

@ -0,0 +1,311 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from collections import OrderedDict
import dllogger
import numpy as np
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
if len(step) == 0:
s = "Summary:"
return s
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
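# Meter aggregates a metric at three levels: per iteration, per epoch, and over the
# whole run. reset_iteration() folds the iteration value into the epoch aggregator,
# and reset_epoch() folds the epoch value into the run aggregator.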
class Meter(object):
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
self.run_aggregator = run_aggregator
self.epoch_aggregator = epoch_aggregator
self.iteration_aggregator = iteration_aggregator
def record(self, val, n=1):
self.iteration_aggregator.record(val, n=n)
def get_iteration(self):
v, n = self.iteration_aggregator.get_val()
return v
def reset_iteration(self):
v, n = self.iteration_aggregator.get_data()
self.iteration_aggregator.reset()
if v is not None:
self.epoch_aggregator.record(v, n=n)
def get_epoch(self):
v, n = self.epoch_aggregator.get_val()
return v
def reset_epoch(self):
v, n = self.epoch_aggregator.get_data()
self.epoch_aggregator.reset()
if v is not None:
self.run_aggregator.record(v, n=n)
def get_run(self):
v, n = self.run_aggregator.get_val()
return v
def reset_run(self):
self.run_aggregator.reset()
class QuantileMeter(object):
def __init__(self, q):
self.q = q
self.reset()
def reset(self):
self.vals = []
self.n = 0
def record(self, val, n=1):
if isinstance(val, list):
self.vals += val
self.n += len(val)
else:
self.vals += [val] * n
self.n += n
def get_val(self):
if not self.vals:
return None, self.n
return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
def get_data(self):
return self.vals, self.n
class MaxMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.max = None
self.n = 0
def record(self, val, n=1):
if self.max is None:
self.max = val
else:
self.max = max(self.max, val)
self.n = n
def get_val(self):
return self.max, self.n
def get_data(self):
return self.max, self.n
class MinMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.min = None
self.n = 0
def record(self, val, n=1):
if self.min is None:
self.min = val
else:
self.min = min(self.min, val)
self.n = n
def get_val(self):
return self.min, self.n
def get_data(self):
return self.min, self.n
class LastMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.last = None
self.n = 0
def record(self, val, n=1):
self.last = val
self.n = n
def get_val(self):
return self.last, self.n
def get_data(self):
return self.last, self.n
class AverageMeter(object):
def __init__(self):
self.reset()
def reset(self):
self.n = 0
self.val = 0
def record(self, val, n=1):
self.n += n
self.val += val * n
def get_val(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
def get_data(self):
if self.n == 0:
return None, 0
return self.val / self.n, self.n
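# Logger wraps dllogger: metrics are registered with a Meter and a verbosity level,
# values are recorded per iteration, and aggregated results are flushed every
# `print_interval` iterations as well as at epoch end and run end.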
class Logger(object):
def __init__(self, print_interval, backends, verbose=False, last_epoch=-1):
self.epoch = last_epoch
self.iteration = -1
self.val_iteration = -1
self.metrics = OrderedDict()
self.backends = backends
self.print_interval = print_interval
self.verbose = verbose
dllogger.init(backends)
def log_parameter(self, data, verbosity=0):
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
if self.verbose:
print("Registering metric: {}".format(metric_name))
self.metrics[metric_name] = {"meter": meter, "level": verbosity}
dllogger.metadata(metric_name, metadata)
def log_metric(self, metric_name, val, n=1):
self.metrics[metric_name]["meter"].record(val, n=n)
def start_iteration(self, val=False):
if val:
self.val_iteration += 1
else:
self.iteration += 1
def end_iteration(self, val=False):
it = self.val_iteration if val else self.iteration
if it % self.print_interval == 0:
metrics = {
n: m for n, m in self.metrics.items() if n.startswith("val") == val
}
step = (
(self.epoch, self.iteration)
if not val
else (self.epoch, self.iteration, self.val_iteration)
)
verbositys = {m["level"] for _, m in metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in metrics.items() if m["level"] == ll}
dllogger.log(
step=step,
data={n: m["meter"].get_iteration() for n, m in llm.items()},
verbosity=ll,
)
for n, m in metrics.items():
m["meter"].reset_iteration()
dllogger.flush()
def start_epoch(self):
self.epoch += 1
self.iteration = 0
self.val_iteration = 0
for n, m in self.metrics.items():
m["meter"].reset_epoch()
def end_epoch(self):
for n, m in self.metrics.items():
m["meter"].reset_iteration()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=(self.epoch,),
data={n: m["meter"].get_epoch() for n, m in llm.items()},
)
def end(self):
for n, m in self.metrics.items():
m["meter"].reset_epoch()
verbositys = {m["level"] for _, m in self.metrics.items()}
for ll in verbositys:
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
dllogger.log(
step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
)
for n, m in self.metrics.items():
m["meter"].reset_epoch()
dllogger.flush()
def iteration_generator_wrapper(self, gen, val=False):
for g in gen:
self.start_iteration(val=val)
yield g
self.end_iteration(val=val)
def epoch_generator_wrapper(self, gen):
for g in gen:
self.start_epoch()
yield g
self.end_epoch()


@ -0,0 +1,67 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import numpy as np
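# mixup blends each sample (and its one-hot target) with a randomly permuted sample
# from the same batch, using a coefficient drawn from Beta(alpha, alpha).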
def mixup(alpha, num_classes, data, target):
with torch.no_grad():
bs = data.size(0)
c = np.random.beta(alpha, alpha)
perm = torch.randperm(bs).cuda()
md = c * data + (1 - c) * data[perm, :]
mt = c * target + (1 - c) * target[perm, :]
return md, mt
class MixUpWrapper(object):
def __init__(self, alpha, num_classes, dataloader):
self.alpha = alpha
self.dataloader = dataloader
self.num_classes = num_classes
def mixup_loader(self, loader):
for input, target in loader:
i, t = mixup(self.alpha, self.num_classes, input, target)
yield i, t
def __iter__(self):
return self.mixup_loader(self.dataloader)
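# NLLMultiLabelSmooth: label-smoothed negative log-likelihood against soft (e.g. mixup)
# targets while training; falls back to plain cross-entropy with hard labels in eval mode.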
class NLLMultiLabelSmooth(nn.Module):
def __init__(self, smoothing=0.0):
super(NLLMultiLabelSmooth, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
if self.training:
x = x.float()
target = target.float()
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs * target
nll_loss = nll_loss.sum(-1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()
else:
return torch.nn.functional.cross_entropy(x, target)


@ -0,0 +1,411 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import math
import torch
import torch.nn as nn
import numpy as np
__all__ = ["ResNet", "build_resnet", "resnet_versions", "resnet_configs"]
# ResNetBuilder {{{
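# ResNetBuilder constructs conv/batchnorm/activation layers according to the chosen
# version (e.g. cardinality for ResNeXt variants) and config (weight init mode,
# activation, whether the last BN gamma in each block is zero-initialized).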
class ResNetBuilder(object):
def __init__(self, version, config):
self.conv3x3_cardinality = (
1 if "cardinality" not in version.keys() else version["cardinality"]
)
self.config = config
def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
conv = nn.Conv2d(
in_planes,
out_planes,
kernel_size=kernel_size,
groups=groups,
stride=stride,
padding=int((kernel_size - 1) / 2),
bias=False,
)
if self.config["nonlinearity"] == "relu":
nn.init.kaiming_normal_(
conv.weight,
mode=self.config["conv_init"],
nonlinearity=self.config["nonlinearity"],
)
return conv
def conv3x3(self, in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
c = self.conv(
3, in_planes, out_planes, groups=self.conv3x3_cardinality, stride=stride
)
return c
def conv1x1(self, in_planes, out_planes, stride=1):
"""1x1 convolution with padding"""
c = self.conv(1, in_planes, out_planes, stride=stride)
return c
def conv7x7(self, in_planes, out_planes, stride=1):
"""7x7 convolution with padding"""
c = self.conv(7, in_planes, out_planes, stride=stride)
return c
def conv5x5(self, in_planes, out_planes, stride=1):
"""5x5 convolution with padding"""
c = self.conv(5, in_planes, out_planes, stride=stride)
return c
def batchnorm(self, planes, last_bn=False):
bn = nn.BatchNorm2d(planes)
gamma_init_val = 0 if last_bn and self.config["last_bn_0_init"] else 1
nn.init.constant_(bn.weight, gamma_init_val)
nn.init.constant_(bn.bias, 0)
return bn
def activation(self):
return self.config["activation"]()
# ResNetBuilder }}}
# BasicBlock {{{
class BasicBlock(nn.Module):
def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = builder.conv3x3(inplanes, planes, stride)
self.bn1 = builder.batchnorm(planes)
self.relu = builder.activation()
self.conv2 = builder.conv3x3(planes, planes * expansion)
self.bn2 = builder.batchnorm(planes * expansion, last_bn=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
if self.bn1 is not None:
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
if self.bn2 is not None:
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
# BasicBlock }}}
# SqueezeAndExcitation {{{
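# Squeeze-and-Excitation: global average pooling followed by a squeeze FC, ReLU,
# expand FC and sigmoid, producing per-channel scaling factors for the block output.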
class SqueezeAndExcitation(nn.Module):
def __init__(self, planes, squeeze):
super(SqueezeAndExcitation, self).__init__()
self.squeeze = nn.Linear(planes, squeeze)
self.expand = nn.Linear(squeeze, planes)
self.relu = nn.ReLU(inplace=True)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = torch.mean(x.view(x.size(0), x.size(1), -1), 2)
out = self.squeeze(out)
out = self.relu(out)
out = self.expand(out)
out = self.sigmoid(out)
out = out.unsqueeze(2).unsqueeze(3)
return out
# }}}
# Bottleneck {{{
class Bottleneck(nn.Module):
def __init__(
self,
builder,
inplanes,
planes,
expansion,
stride=1,
se=False,
se_squeeze=16,
downsample=None,
):
super(Bottleneck, self).__init__()
self.conv1 = builder.conv1x1(inplanes, planes)
self.bn1 = builder.batchnorm(planes)
self.conv2 = builder.conv3x3(planes, planes, stride=stride)
self.bn2 = builder.batchnorm(planes)
self.conv3 = builder.conv1x1(planes, planes * expansion)
self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
self.relu = builder.activation()
self.downsample = downsample
self.stride = stride
self.squeeze = (
SqueezeAndExcitation(planes * expansion, se_squeeze) if se else None
)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.squeeze is None:
out += residual
else:
out = torch.addcmul(residual, 1.0, out, self.squeeze(out))
out = self.relu(out)
return out
def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
return Bottleneck(
builder,
inplanes,
planes,
expansion,
stride=stride,
se=True,
se_squeeze=16,
downsample=downsample,
)
# Bottleneck }}}
# ResNet {{{
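# ResNet assembles the standard stem (7x7 conv, BN, ReLU, max pool), four stages of
# residual blocks with the given widths/depths, adaptive average pooling and a final
# fully connected classifier.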
class ResNet(nn.Module):
def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = builder.conv7x7(3, 64, stride=2)
self.bn1 = builder.batchnorm(64)
self.relu = builder.activation()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
self.layer2 = self._make_layer(
builder, block, expansion, widths[1], layers[1], stride=2
)
self.layer3 = self._make_layer(
builder, block, expansion, widths[2], layers[2], stride=2
)
self.layer4 = self._make_layer(
builder, block, expansion, widths[3], layers[3], stride=2
)
self.avgpool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(widths[3] * expansion, num_classes)
def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * expansion:
dconv = builder.conv1x1(self.inplanes, planes * expansion, stride=stride)
dbn = builder.batchnorm(planes * expansion)
if dbn is not None:
downsample = nn.Sequential(dconv, dbn)
else:
downsample = dconv
layers = []
layers.append(
block(
builder,
self.inplanes,
planes,
expansion,
stride=stride,
downsample=downsample,
)
)
self.inplanes = planes * expansion
for i in range(1, blocks):
layers.append(block(builder, self.inplanes, planes, expansion))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
if self.bn1 is not None:
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
# ResNet }}}
resnet_configs = {
"classic": {
"conv": nn.Conv2d,
"conv_init": "fan_out",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"fanin": {
"conv": nn.Conv2d,
"conv_init": "fan_in",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"grp-fanin": {
"conv": nn.Conv2d,
"conv_init": "fan_in",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
"grp-fanout": {
"conv": nn.Conv2d,
"conv_init": "fan_out",
"nonlinearity": "relu",
"last_bn_0_init": False,
"activation": lambda: nn.ReLU(inplace=True),
},
}
resnet_versions = {
"resnet18": {
"net": ResNet,
"block": BasicBlock,
"layers": [2, 2, 2, 2],
"widths": [64, 128, 256, 512],
"expansion": 1,
"num_classes": 1000,
},
"resnet34": {
"net": ResNet,
"block": BasicBlock,
"layers": [3, 4, 6, 3],
"widths": [64, 128, 256, 512],
"expansion": 1,
"num_classes": 1000,
},
"resnet50": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 4, 6, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnet101": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 4, 23, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnet152": {
"net": ResNet,
"block": Bottleneck,
"layers": [3, 8, 36, 3],
"widths": [64, 128, 256, 512],
"expansion": 4,
"num_classes": 1000,
},
"resnext101-32x4d": {
"net": ResNet,
"block": Bottleneck,
"cardinality": 32,
"layers": [3, 4, 23, 3],
"widths": [128, 256, 512, 1024],
"expansion": 2,
"num_classes": 1000,
},
"se-resnext101-32x4d": {
"net": ResNet,
"block": SEBottleneck,
"cardinality": 32,
"layers": [3, 4, 23, 3],
"widths": [128, 256, 512, 1024],
"expansion": 2,
"num_classes": 1000,
},
}
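# build_resnet looks up the architecture (layers, widths, block type) and the weight
# initialization config by name and instantiates the corresponding ResNet.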
def build_resnet(version, config, verbose=True):
version = resnet_versions[version]
config = resnet_configs[config]
builder = ResNetBuilder(version, config)
if verbose:
print("Version: {}".format(version))
print("Config: {}".format(config))
model = version["net"](
builder,
version["block"],
version["expansion"],
version["layers"],
version["widths"],
version["num_classes"],
)
return model


@ -0,0 +1,40 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
class LabelSmoothing(nn.Module):
"""
NLL loss with label smoothing.
"""
def __init__(self, smoothing=0.0):
"""
Constructor for the LabelSmoothing module.
:param smoothing: label smoothing factor
"""
super(LabelSmoothing, self).__init__()
self.confidence = 1.0 - smoothing
self.smoothing = smoothing
def forward(self, x, target):
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
nll_loss = nll_loss.squeeze(1)
smooth_loss = -logprobs.mean(dim=-1)
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
return loss.mean()


@ -0,0 +1,745 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import time
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from . import logger as log
from . import resnet as models
from . import utils
import dllogger
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
DIRECTORY_TO_WATCH = "/usr/share"
checkpoint_file_name = "checkpoint_backup.pth.tar"
from multiprocessing import Value
from ctypes import c_bool
from azureml.core.run import Run
run = Run.get_context()
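# Preemption handling: a watchdog observer on DIRECTORY_TO_WATCH flips a shared flag
# when a file named "to-be-preempted" appears, so the training and validation loops can
# save a checkpoint and exit cleanly.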
class PreemptHandler(FileSystemEventHandler):
def __init__(self):
super(PreemptHandler, self).__init__()
self.is_preempted = Value(c_bool, False)
def on_any_event(self, event):
if not event.is_directory and event.src_path.endswith("/to-be-preempted"):
print(datetime.utcnow(), "Detected Preempt Signal, should stop and return.")
self.is_preempted.value = True
class PreemptDetector:
def __init__(self):
self.observer = Observer()
self.event_handler = PreemptHandler()
def run(self):
self.observer.schedule(self.event_handler, DIRECTORY_TO_WATCH, recursive=False)
self.observer.start()
def is_preempted(self):
return self.event_handler.is_preempted.value == True
def stop(self):
self.observer.stop()
ACC_METADATA = {"unit": "%", "format": ":.2f"}
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
TIME_METADATA = {"unit": "s", "format": ":.5f"}
LOSS_METADATA = {"format": ":.5f"}
class ModelAndLoss(nn.Module):
def __init__(self, arch, loss, pretrained_weights=None, cuda=True, fp16=False):
super(ModelAndLoss, self).__init__()
self.arch = arch
print("=> creating model '{}'".format(arch))
model = models.build_resnet(arch[0], arch[1])
if pretrained_weights is not None:
print("=> using pre-trained model from a file '{}'".format(arch))
model.load_state_dict(pretrained_weights)
if cuda:
model = model.cuda()
if fp16:
model = network_to_half(model)
# define loss function (criterion) and optimizer
criterion = loss()
if cuda:
criterion = criterion.cuda()
self.model = model
self.loss = criterion
def forward(self, data, target):
output = self.model(data)
loss = self.loss(output, target)
return loss, output
def distributed(self):
self.model = DDP(self.model)
def load_model_state(self, state):
if state is not None:
self.model.load_state_dict(state)
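# get_optimizer builds SGD, optionally excluding batch-norm parameters from weight
# decay; with fp16 enabled the optimizer is wrapped in apex's FP16_Optimizer for loss
# scaling.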
def get_optimizer(
parameters,
fp16,
lr,
momentum,
weight_decay,
nesterov=False,
state=None,
static_loss_scale=1.0,
dynamic_loss_scale=False,
bn_weight_decay=False,
):
if bn_weight_decay:
print(" ! Weight decay applied to BN parameters ")
optimizer = torch.optim.SGD(
[v for n, v in parameters],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov,
)
else:
print(" ! Weight decay NOT applied to BN parameters ")
bn_params = [v for n, v in parameters if "bn" in n]
rest_params = [v for n, v in parameters if "bn" not in n]
print(len(bn_params))
print(len(rest_params))
optimizer = torch.optim.SGD(
[
{"params": bn_params, "weight_decay": 0},
{"params": rest_params, "weight_decay": weight_decay},
],
lr,
momentum=momentum,
weight_decay=weight_decay,
nesterov=nesterov,
)
if fp16:
optimizer = FP16_Optimizer(
optimizer,
static_loss_scale=static_loss_scale,
dynamic_loss_scale=dynamic_loss_scale,
verbose=False,
)
if state is not None:
optimizer.load_state_dict(state)
return optimizer
def lr_policy(lr_fn, logger=None):
if logger is not None:
logger.register_metric(
"lr", log.LR_METER(), verbosity=dllogger.Verbosity.VERBOSE
)
def _alr(optimizer, iteration, epoch):
lr = lr_fn(iteration, epoch)
if logger is not None:
logger.log_metric("lr", lr)
for param_group in optimizer.param_groups:
param_group["lr"] = lr
return lr
return _alr
def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
lr = base_lr
for s in steps:
if epoch >= s:
lr *= decay_factor
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = base_lr * (1 - (e / es))
return lr
return lr_policy(_lr_fn, logger=logger)
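# Cosine schedule: linear warmup for `warmup_length` epochs, then
# lr = 0.5 * (1 + cos(pi * e / es)) * base_lr, where e counts epochs after warmup.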
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
es = epochs - warmup_length
lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
return lr
return lr_policy(_lr_fn, logger=logger)
def lr_exponential_policy(
base_lr, warmup_length, epochs, final_multiplier=0.001, logger=None
):
es = epochs - warmup_length
epoch_decay = np.power(2, np.log2(final_multiplier) / es)
def _lr_fn(iteration, epoch):
if epoch < warmup_length:
lr = base_lr * (epoch + 1) / warmup_length
else:
e = epoch - warmup_length
lr = base_lr * (epoch_decay**e)
return lr
return lr_policy(_lr_fn, logger=logger)
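# get_train_step returns a closure performing one forward/backward pass. Gradients are
# accumulated across `batch_size_multiplier` iterations (scaled before the optimizer
# step), and the backward path dispatches to apex FP16, AMP loss scaling, or plain FP32.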
def get_train_step(
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
):
def _step(input, target, optimizer_step=True):
input_var = Variable(input)
target_var = Variable(target)
loss, output = model_and_loss(input_var, target_var)
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
else:
reduced_loss = loss.data
if fp16:
optimizer.backward(loss)
elif use_amp:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if optimizer_step:
opt = (
optimizer.optimizer
if isinstance(optimizer, FP16_Optimizer)
else optimizer
)
for param_group in opt.param_groups:
for param in param_group["params"]:
param.grad /= batch_size_multiplier
optimizer.step()
optimizer.zero_grad()
torch.cuda.synchronize()
return reduced_loss
return _step
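# train runs one epoch: registers metrics on the first epoch, logs loss/throughput to
# dllogger, TensorBoard and the AzureML run, and exits early if a preemption signal is
# detected.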
def train(
train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
detector,
use_amp=False,
prof=-1,
batch_size_multiplier=1,
register_metrics=True,
total_train_step=0,
writer=None,
):
print(f"training...")
print(f"register_metrics {register_metrics}, logger {logger}.")
if register_metrics and logger is not None:
logger.register_metric(
"train.loss",
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA,
)
logger.register_metric(
"train.compute_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA,
)
logger.register_metric(
"train.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
"train.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"train.compute_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
step = get_train_step(
model_and_loss,
optimizer,
fp16,
use_amp=use_amp,
batch_size_multiplier=batch_size_multiplier,
)
model_and_loss.train()
end = time.time()
optimizer.zero_grad()
last_train_step = total_train_step
data_iter = enumerate(train_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
for i, (input, target) in data_iter:
bs = input.size(0)
lr = lr_scheduler(optimizer, i, epoch)
data_time = time.time() - end
optimizer_step = ((i + 1) % batch_size_multiplier) == 0
loss = step(input, target, optimizer_step=optimizer_step)
it_time = time.time() - end
if optimizer_step:
if writer:
writer.add_scalar("train/summary/scalar/learning_rate", lr, epoch)
writer.add_scalar(
"train/summary/scalar/loss", to_python_float(loss), total_train_step
)
writer.add_scalar(
"perf/summary/scalar/compute_ips",
calc_ips(bs, it_time - data_time),
total_train_step,
)
writer.add_scalar(
"perf/summary/scalar/train_total_ips",
calc_ips(bs, it_time),
total_train_step,
)
run.log_row("train/learning_rate", x=epoch, y=lr)
run.log_row("train/loss", x=total_train_step, y=to_python_float(loss))
run.log_row(
"perf/compute_ips",
x=total_train_step,
y=calc_ips(bs, it_time - data_time),
)
run.log_row(
"perf/train_total_ips", x=total_train_step, y=calc_ips(bs, it_time)
)
total_train_step += 1
if logger is not None:
logger.log_metric("train.loss", to_python_float(loss), bs)
logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
logger.log_metric("train.total_ips", calc_ips(bs, it_time))
logger.log_metric("train.data_time", data_time)
logger.log_metric("train.compute_time", it_time - data_time)
end = time.time()
if writer:
writer.flush()
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit training loop detecting is_preempted changed to True",
)
return last_train_step
return total_train_step
def get_val_step(model_and_loss):
def _step(input, target):
input_var = Variable(input)
target_var = Variable(target)
with torch.no_grad():
loss, output = model_and_loss(input_var, target_var)
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
prec1 = utils.reduce_tensor(prec1)
prec5 = utils.reduce_tensor(prec5)
else:
reduced_loss = loss.data
torch.cuda.synchronize()
return reduced_loss, prec1, prec5
return _step
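# validate runs one pass over the validation set, recording top-1/top-5 accuracy, loss
# and latency metrics; it also breaks out early on preemption and returns the top-1
# meter together with the mean validation loss.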
def validate(
val_loader,
model_and_loss,
fp16,
logger,
epoch,
detector,
prof=-1,
register_metrics=True,
):
print(f"validating...")
print(f"register_metrics {register_metrics}, logger {logger}.")
if register_metrics and logger is not None:
logger.register_metric(
"val.top1",
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA,
)
logger.register_metric(
"val.top5",
log.ACC_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=ACC_METADATA,
)
logger.register_metric(
"val.loss",
log.LOSS_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=LOSS_METADATA,
)
logger.register_metric(
"val.compute_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=IPS_METADATA,
)
logger.register_metric(
"val.total_ips",
log.PERF_METER(),
verbosity=dllogger.Verbosity.DEFAULT,
metadata=IPS_METADATA,
)
logger.register_metric(
"val.data_time",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency",
log.PERF_METER(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at100",
log.LAT_100(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at99",
log.LAT_99(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
logger.register_metric(
"val.compute_latency_at95",
log.LAT_95(),
verbosity=dllogger.Verbosity.VERBOSE,
metadata=TIME_METADATA,
)
step = get_val_step(model_and_loss)
top1 = log.AverageMeter()
# switch to evaluate mode
model_and_loss.eval()
end = time.time()
data_iter = enumerate(val_loader)
if logger is not None:
data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
if prof > 0:
data_iter = utils.first_n(prof, data_iter)
loss_sum = 0.0
total_val_step = 0
for i, (input, target) in data_iter:
bs = input.size(0)
data_time = time.time() - end
loss, prec1, prec5 = step(input, target)
it_time = time.time() - end
top1.record(to_python_float(prec1), bs)
if logger is not None:
logger.log_metric("val.top1", to_python_float(prec1), bs)
logger.log_metric("val.top5", to_python_float(prec5), bs)
logger.log_metric("val.loss", to_python_float(loss), bs)
logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
logger.log_metric("val.total_ips", calc_ips(bs, it_time))
logger.log_metric("val.data_time", data_time)
logger.log_metric("val.compute_latency", it_time - data_time)
logger.log_metric("val.compute_latency_at95", it_time - data_time)
logger.log_metric("val.compute_latency_at99", it_time - data_time)
logger.log_metric("val.compute_latency_at100", it_time - data_time)
loss_sum += to_python_float(loss)
total_val_step += 1
end = time.time()
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit validation loop detecting is_preempted changed to True",
)
break
return [top1, loss_sum / total_val_step]
# Train loop {{{
def calc_ips(batch_size, time):
world_size = (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
tbs = world_size * batch_size
return tbs / time
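# train_loop drives the whole run: rank 0 creates a TensorBoard writer, each epoch is
# trained and validated, and a checkpoint is saved every `save_checkpoint_epochs`
# epochs or immediately when the preemption detector fires.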
def train_loop(
model_and_loss,
optimizer,
lr_scheduler,
train_loader,
val_loader,
epochs,
fp16,
logger,
should_backup_checkpoint,
save_checkpoint_epochs,
use_amp=False,
batch_size_multiplier=1,
best_prec1=0,
start_epoch=0,
prof=-1,
skip_training=False,
skip_validation=False,
save_checkpoints=True,
checkpoint_dir="./",
total_train_step=0,
):
is_first_rank = (
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
)
if is_first_rank:
ts = str(time.time())
# logdir = os.path.expanduser('~/tensorboard/{}/logs/'.format(os.environ['DLTS_JOB_ID']) + ts)
logdir = os.path.expanduser(
"~/tensorboard/{}/logs/".format(os.environ["AZ_BATCH_JOB_ID"]) + ts
)
print("tensorboard at ", logdir)
if not os.path.exists(logdir):
os.makedirs(logdir)
writer = SummaryWriter(log_dir=logdir)
else:
writer = None
prec1 = -1
detector = PreemptDetector()
detector.run()
epoch_iter = range(start_epoch, epochs)
for epoch in epoch_iter:
world_size = (
torch.distributed.get_world_size()
if torch.distributed.is_initialized()
else 1
)
if writer:
writer.add_scalar("train/summary/scalar/world_size", world_size, epoch)
run.log_row("train/world_size", x=epoch, y=world_size)
if logger is not None:
logger.start_epoch()
if not skip_training:
total_train_step = train(
train_loader,
model_and_loss,
optimizer,
lr_scheduler,
fp16,
logger,
epoch,
detector,
use_amp=use_amp,
prof=prof,
register_metrics=epoch == start_epoch,
batch_size_multiplier=batch_size_multiplier,
total_train_step=total_train_step,
writer=writer,
)
if not skip_validation and not detector.is_preempted():
top1, val_loss = validate(
val_loader,
model_and_loss,
fp16,
logger,
epoch,
detector,
prof=prof,
register_metrics=epoch == start_epoch,
)
if not detector.is_preempted():
prec1, nimg = top1.get_val()
if writer:
writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
writer.add_scalar("val/summary/scalar/prec1", prec1, epoch)
run.log_row("val/loss", x=epoch, y=val_loss)
run.log_row("val/prec1", x=epoch, y=prec1)
if logger is not None:
print(
"Epoch ", epoch, " complete with is_preempted ", detector.is_preempted()
)
logger.end_epoch()
save_ckpt = is_first_rank and (
detector.is_preempted() or (epoch + 1) % save_checkpoint_epochs == 0
)
if detector.is_preempted() and start_epoch == epoch:
print(
"Skipping save checkpoint since no complete epoch finishes till now. ",
start_epoch,
"-->",
epoch,
)
save_ckpt = False
print(f"save ckpt {save_ckpt}, ckpt dir {checkpoint_dir}.")
if save_ckpt:
if not skip_validation and not detector.is_preempted():
is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
best_prec1 = max(
logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
)
else:
is_best = False
best_prec1 = 0
ckpt_epoch_index = epoch + 1 if not detector.is_preempted() else epoch
utils.save_checkpoint(
{
"epoch": ckpt_epoch_index,
"arch": model_and_loss.arch,
"state_dict": model_and_loss.model.state_dict(),
"best_prec1": best_prec1,
"optimizer": optimizer.state_dict(),
"total_train_step": total_train_step,
},
is_best,
checkpoint_dir=checkpoint_dir,
backup_filename=checkpoint_file_name,
)
if detector.is_preempted():
print(
datetime.utcnow(),
"Exit epoch loop detecting is_preempted changed to True, save_ckpt:",
save_ckpt,
)
break
if writer:
writer.close()
detector.stop()
print(
datetime.utcnow(), "Training exits with is_preempted: ", detector.is_preempted()
)
# }}}


@ -0,0 +1,121 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import numpy as np
import torch
import shutil
import torch.distributed as dist
def should_backup_checkpoint(args):
def _sbc(epoch):
return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
return _sbc
import time
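# save_checkpoint writes the state dict on rank 0, copies it to model_best.pth.tar when
# it is the best so far, and also copies it to a backup file used for resuming after
# preemption.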
def save_checkpoint(
state,
is_best,
filename="checkpoint.pth.tar",
checkpoint_dir="./",
backup_filename=None,
):
if (not torch.distributed.is_initialized()) or torch.distributed.get_rank() == 0:
start_time = time.time()
# filename = os.path.join('/tmp/', filename)
filename = os.path.join(checkpoint_dir, filename)
print(f"filename {filename}, ckpt dir {checkpoint_dir}")
torch.save(state, filename)
elapsed_time = time.time() - start_time
# print("save checkpoint time on local /tmp ", elapsed_time)
if is_best:
start_time = time.time()
shutil.copyfile(
filename, os.path.join(checkpoint_dir, "model_best.pth.tar")
)
elapsed_time = time.time() - start_time
print("save best checkpoint time (copy to blob) ", elapsed_time)
if backup_filename is not None:
start_time = time.time()
shutil.copyfile(filename, os.path.join(checkpoint_dir, backup_filename))
elapsed_time = time.time() - start_time
print("save checkpoint time (copy to blob) ", elapsed_time)
def timed_generator(gen):
start = time.time()
for g in gen:
end = time.time()
t = end - start
yield g, t
start = time.time()
def timed_function(f):
def _timed_function(*args, **kwargs):
start = time.time()
ret = f(*args, **kwargs)
return ret, time.time() - start
return _timed_function
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
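# reduce_tensor sums a tensor across all ranks and divides by the world size, giving
# the average value for distributed metric reporting.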
def reduce_tensor(tensor):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
rt /= (
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
)
return rt
def first_n(n, generator):
for i, d in zip(range(n), generator):
yield d


@ -0,0 +1,603 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION
# Copyright (c) 2017- Facebook, Inc
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import os
import shutil
import time
import random
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
import image_classification.resnet as models
import image_classification.logger as log
from image_classification.smoothing import LabelSmoothing
from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
from image_classification.dataloaders import *
from image_classification.training import *
from image_classification.utils import *
import dllogger
import torch.multiprocessing as mp
import os
import os.path as op
import re
from datetime import datetime
import sys
# cluster aware logic start
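# get_master_ip parses the worker-0 IP exported in the DLTS init.env file; the launcher
# code below actually derives the master address from AZ_BATCHAI_MPI_MASTER_NODE instead.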
def get_master_ip():
regexp = "[\s\S]*export[\s]*DLTS_SD_worker0_IP=([0-9.]+)[\s|s]*"
with open("/dlts-runtime/env/init.env", "r") as f:
line = f.read()
match = re.match(regexp, line)
if match:
ip = str(match.group(1))
print("master node ip is " + ip)
return ip
else:
raise ValueError("did not find master node ip")
# cluster aware logic end
checkpoint_file_name = "checkpoint_backup.pth.tar"
def add_parser_arguments(parser):
model_names = models.resnet_versions.keys()
model_configs = models.resnet_configs.keys()
parser.add_argument("--data", metavar="DIR", help="path to dataset")
parser.add_argument(
"--data-backend",
metavar="BACKEND",
default="dali-cpu",
choices=DATA_BACKEND_CHOICES,
help="data backend: "
+ " | ".join(DATA_BACKEND_CHOICES)
+ " (default: dali-cpu)",
)
parser.add_argument(
"--arch",
"-a",
metavar="ARCH",
default="resnet50",
choices=model_names,
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
)
parser.add_argument(
"--model-config",
"-c",
metavar="CONF",
default="classic",
choices=model_configs,
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
)
parser.add_argument(
"-j",
"--workers",
default=5,
type=int,
metavar="N",
help="number of data loading workers (default: 5)",
)
parser.add_argument(
"--epochs",
default=90,
type=int,
metavar="N",
help="number of total epochs to run",
)
parser.add_argument(
"-b",
"--batch-size",
default=256,
type=int,
metavar="N",
help="mini-batch size (default: 256) per gpu",
)
parser.add_argument(
"--optimizer-batch-size",
default=-1,
type=int,
metavar="N",
help="size of a total batch size, for simulating bigger batches using gradient accumulation",
)
parser.add_argument(
"--lr",
"--learning-rate",
default=0.1,
type=float,
metavar="LR",
help="initial learning rate",
)
parser.add_argument(
"--lr-schedule",
default="step",
type=str,
metavar="SCHEDULE",
choices=["step", "linear", "cosine"],
help="Type of LR schedule: {}, {}, {}".format("step", "linear", "cosine"),
)
parser.add_argument(
"--warmup", default=0, type=int, metavar="E", help="number of warmup epochs"
)
parser.add_argument(
"--label-smoothing",
default=0.0,
type=float,
metavar="S",
help="label smoothing",
)
parser.add_argument(
"--mixup", default=0.0, type=float, metavar="ALPHA", help="mixup alpha"
)
parser.add_argument(
"--momentum", default=0.9, type=float, metavar="M", help="momentum"
)
parser.add_argument(
"--weight-decay",
"--wd",
default=1e-4,
type=float,
metavar="W",
help="weight decay (default: 1e-4)",
)
parser.add_argument(
"--bn-weight-decay",
action="store_true",
help="use weight_decay on batch normalization learnable parameters, (default: false)",
)
parser.add_argument(
"--nesterov",
action="store_true",
help="use nesterov momentum, (default: false)",
)
parser.add_argument(
"--print-freq",
"-p",
default=10,
type=int,
metavar="N",
help="print frequency (default: 10)",
)
parser.add_argument(
"--resume",
default="",
type=str,
metavar="PATH",
help="path to latest checkpoint (default: none)",
)
parser.add_argument(
"--pretrained-weights",
default="",
type=str,
metavar="PATH",
help="load weights from here",
)
parser.add_argument("--fp16", action="store_true", help="Run model fp16 mode.")
parser.add_argument(
"--static-loss-scale",
type=float,
default=1,
help="Static loss scale, positive power of 2 values can improve fp16 convergence.",
)
parser.add_argument(
"--dynamic-loss-scale",
action="store_true",
help="Use dynamic loss scaling. If supplied, this argument supersedes "
+ "--static-loss-scale.",
)
parser.add_argument(
"--prof", type=int, default=-1, metavar="N", help="Run only N iterations"
)
parser.add_argument(
"--amp",
action="store_true",
help="Run model AMP (automatic mixed precision) mode.",
)
parser.add_argument(
"--seed", default=None, type=int, help="random seed used for numpy and pytorch"
)
parser.add_argument(
"--gather-checkpoints",
action="store_true",
help="Gather checkpoints throughout the training, without this flag only best and last checkpoints will be stored",
)
parser.add_argument(
"--raport-file",
default="experiment_raport.json",
type=str,
help="file in which to store JSON experiment raport",
)
parser.add_argument(
"--evaluate", action="store_true", help="evaluate checkpoint/model"
)
parser.add_argument("--training-only", action="store_true", help="do not evaluate")
parser.add_argument(
"--no-checkpoints",
action="store_false",
dest="save_checkpoints",
help="do not store any checkpoints, useful for benchmarking",
)
parser.add_argument(
"--workspace",
type=str,
default="./",
metavar="DIR",
help="path to directory where checkpoints will be stored",
)
parser.add_argument(
"--save-checkpoint-epochs",
default=10,
type=int,
metavar="N",
help="how many epochs run between saving checkpoints",
)
parser.add_argument(
"--log_redirect", action="store_true", help="Redirect log to files."
)
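# main is executed once per GPU via mp.spawn: it computes the process rank, initializes
# the NCCL process group, resumes from a backup checkpoint if present, builds the data
# loaders, model, optimizer and LR schedule, and then calls train_loop.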
def main(gpu_index, args):
if args.log_redirect:
sys.stdout = open(
"./outputs_"
+ str(args.rank * args.ngpus_per_node + gpu_index)
+ str(time.time()),
"w",
)
exp_start_time = time.time()
global best_prec1
best_prec1 = 0
args.distributed = False
args.gpu = 0
args.local_rank = gpu_index
args.distributed = args.world_size > 1
if args.distributed:
args.gpu = args.local_rank % torch.cuda.device_count()
print("using gpu ", args.gpu)
torch.cuda.set_device(args.gpu)
args.rank = args.rank * args.ngpus_per_node + gpu_index
dist.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank,
)
if args.amp and args.fp16:
print("Please use only one of the --fp16/--amp flags")
exit(1)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed + args.local_rank)
torch.cuda.manual_seed(args.seed + args.local_rank)
np.random.seed(seed=args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
def _worker_init_fn(id):
np.random.seed(seed=args.seed + args.local_rank + id)
random.seed(args.seed + args.local_rank + id)
else:
def _worker_init_fn(id):
pass
if args.fp16:
assert (
torch.backends.cudnn.enabled
), "fp16 mode requires cudnn backend to be enabled."
if args.static_loss_scale != 1.0:
if not args.fp16:
print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")
if args.optimizer_batch_size < 0:
batch_size_multiplier = 1
else:
tbs = args.world_size * args.batch_size
if args.optimizer_batch_size % tbs != 0:
print(
"Warning: simulated batch size {} is not divisible by actual batch size {}".format(
args.optimizer_batch_size, tbs
)
)
batch_size_multiplier = int(round(args.optimizer_batch_size / tbs))
print("BSM: {}".format(batch_size_multiplier))
print("Real effective batch size is: ", batch_size_multiplier * tbs)
pretrained_weights = None
if args.pretrained_weights:
if os.path.isfile(args.pretrained_weights):
print(
"=> loading pretrained weights from '{}'".format(
args.pretrained_weights
)
)
pretrained_weights = torch.load(args.pretrained_weights)
else:
print("=> no pretrained weights found at '{}'".format(args.resume))
start_epoch = 0
args.total_train_step = 0
# check previous saved checkpoint first
# if there is none, then resume from user specified checkpoint if there is
target_ckpt_path = args.workspace + "/" + checkpoint_file_name
ckpt_path = target_ckpt_path
if not os.path.isfile(ckpt_path):
print("=> no checkpoint found at '{}'".format(ckpt_path))
ckpt_path = args.resume
# optionally resume from a checkpoint
if ckpt_path:
if os.path.isfile(ckpt_path):
print("=> loading checkpoint '{}'".format(ckpt_path))
checkpoint = torch.load(
ckpt_path, map_location=lambda storage, loc: storage.cuda(args.gpu)
)
start_epoch = checkpoint["epoch"]
best_prec1 = checkpoint["best_prec1"]
model_state = checkpoint["state_dict"]
optimizer_state = checkpoint["optimizer"]
args.total_train_step = checkpoint["total_train_step"]
print(
"=> loaded checkpoint '{}' (epoch {})".format(
ckpt_path, checkpoint["epoch"]
)
)
else:
print("=> no checkpoint found at '{}'".format(ckpt_path))
model_state = None
optimizer_state = None
else:
model_state = None
optimizer_state = None
loss = nn.CrossEntropyLoss
if args.mixup > 0.0:
loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
elif args.label_smoothing > 0.0:
loss = lambda: LabelSmoothing(args.label_smoothing)
model_and_loss = ModelAndLoss(
(args.arch, args.model_config),
loss,
pretrained_weights=pretrained_weights,
cuda=True,
fp16=args.fp16,
)
# Create data loaders and optimizers as needed
if args.data_backend == "pytorch":
get_train_loader = get_pytorch_train_loader
get_val_loader = get_pytorch_val_loader
elif args.data_backend == "dali-gpu":
get_train_loader = get_dali_train_loader(dali_cpu=False)
get_val_loader = get_dali_val_loader()
elif args.data_backend == "dali-cpu":
get_train_loader = get_dali_train_loader(dali_cpu=True)
get_val_loader = get_dali_val_loader()
elif args.data_backend == "syntetic":
get_val_loader = get_syntetic_loader
get_train_loader = get_syntetic_loader
train_loader, train_loader_len = get_train_loader(
args.data,
args.batch_size,
1000,
args.mixup > 0.0,
workers=args.workers,
fp16=args.fp16,
)
if args.mixup != 0.0:
train_loader = MixUpWrapper(args.mixup, 1000, train_loader)
val_loader, val_loader_len = get_val_loader(
args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16
)
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
logger = log.Logger(
args.print_freq,
[
dllogger.StdOutBackend(
dllogger.Verbosity.DEFAULT, step_format=log.format_step
),
dllogger.JSONStreamBackend(
dllogger.Verbosity.VERBOSE,
os.path.join(args.workspace, args.raport_file),
),
],
last_epoch=start_epoch - 1,
)
else:
logger = log.Logger(args.print_freq, [], last_epoch=start_epoch - 1)
logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
optimizer = get_optimizer(
list(model_and_loss.model.named_parameters()),
args.fp16,
args.lr,
args.momentum,
args.weight_decay,
nesterov=args.nesterov,
bn_weight_decay=args.bn_weight_decay,
state=optimizer_state,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale,
)
if args.lr_schedule == "step":
lr_policy = lr_step_policy(
args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger
)
elif args.lr_schedule == "cosine":
lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger)
elif args.lr_schedule == "linear":
lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
if args.amp:
model_and_loss, optimizer = amp.initialize(
model_and_loss,
optimizer,
opt_level="O2",
loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
)
if args.distributed:
model_and_loss.distributed()
model_and_loss.load_model_state(model_state)
train_loop(
model_and_loss,
optimizer,
lr_policy,
train_loader,
val_loader,
args.epochs,
args.fp16,
logger,
should_backup_checkpoint(args),
args.save_checkpoint_epochs,
use_amp=args.amp,
batch_size_multiplier=batch_size_multiplier,
start_epoch=start_epoch,
best_prec1=best_prec1,
prof=args.prof,
skip_training=args.evaluate,
skip_validation=args.training_only,
save_checkpoints=args.save_checkpoints and not args.evaluate,
checkpoint_dir=args.workspace,
total_train_step=args.total_train_step,
)
exp_duration = time.time() - exp_start_time
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
logger.end()
print("Experiment ended")
sys.stdout.flush()
if __name__ == "__main__":
# print(f'os env: {os.environ}')
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
add_parser_arguments(parser)
args = parser.parse_args()
cudnn.benchmark = True
import socket
print("started training scripts on ", socket.gethostname())
args = parser.parse_args()
args.world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) # node count
args.rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) # node world rank
print(f"world size {args.world_size}, rank {args.rank}")
import os
if not os.path.exists(args.workspace) and args.rank == 0:
print("workspace ", args.workspace, " does not exist, creating one.")
os.makedirs(args.workspace)
# override the master node ip by intention
# args.dist_url = 'tcp://' + get_master_ip() + ':23456'
# extract master ip from os env as a workaround
args.dist_url = "tcp://" + os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + ":23456"
ngpus_per_node = torch.cuda.device_count()
args.distributed = args.world_size > 1
# Since we have ngpus_per_node processes per node, the total world_size
# needs to be adjusted accordingly
args.world_size = ngpus_per_node * args.world_size
args.ngpus_per_node = ngpus_per_node
print(f"world size {args.world_size}, ngpus per node {ngpus_per_node}.")
# Use torch.multiprocessing.spawn to launch distributed processes: the
# main_worker process function
mp.spawn(main, nprocs=ngpus_per_node, args=(args,))
# notify DLTS to collect the std output asap.
log_collect_hook = "/var/log/compute/00_stdout.txt.exit"
if os.path.isfile(log_collect_hook):
open(log_collect_hook, "w").close()


@ -0,0 +1,214 @@
# From PyTorch:
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import sys
import subprocess
import os
import socket
import time
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(
description="PyTorch distributed training launch "
"helper utilty that will spawn up "
"multiple distributed processes"
)
# Optional arguments for the launch helper
parser.add_argument(
"--nnodes",
type=int,
default=1,
help="The number of nodes to use for distributed " "training",
)
parser.add_argument(
"--node_rank",
type=int,
default=0,
help="The rank of the node for multi-node distributed " "training",
)
parser.add_argument(
"--nproc_per_node",
type=int,
default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.",
)
parser.add_argument(
"--master_addr",
default="127.0.0.1",
type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1",
)
parser.add_argument(
"--master_port",
default=29500,
type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training",
)
# positional
parser.add_argument(
"training_script",
type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script",
)
# rest from the training program
parser.add_argument("training_script_args", nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
current_env["LOCAL_RANK"] = str(local_rank)
# spawn the processes
cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
print(cmd)
stdout = (
None if local_rank == 0 else open("GPU_" + str(local_rank) + ".log", "w")
)
process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
processes.append(process)
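# monitor the spawned workers: keep polling until every process has exited,
# and terminate all remaining workers (then exit non-zero) as soon as any of them fails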
try:
up = True
error = False
while up and not error:
up = False
for p in processes:
ret = p.poll()
if ret is None:
up = True
elif ret != 0:
error = True
time.sleep(1)
if error:
for p in processes:
if p.poll() is None:
p.terminate()
exit(1)
except KeyboardInterrupt:
for p in processes:
p.terminate()
raise
except SystemExit:
for p in processes:
p.terminate()
raise
except:
for p in processes:
p.terminate()
raise
if __name__ == "__main__":
main()

Просмотреть файл

@ -0,0 +1 @@
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

Просмотреть файл

@ -0,0 +1,688 @@
# ResNet50 v1.5 For PyTorch
This repository provides a script and recipe to train the ResNet50 model to
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
* [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Optimizer](#optimizer)
* [Data augmentation](#data-augmentation)
* [DALI](#dali)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
* [Example plots](#example-plots)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
* [Training time for 90 epochs](#training-time-for-90-epochs)
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
The difference between v1 and v1.5 is that, in the bottleneck blocks which require
downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
The model is initialized as described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)
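Schematically, the stride placement difference described above can be sketched as follows (a simplified illustration, not the repository's actual implementation):

```python
import torch.nn as nn

def bottleneck_convs(in_ch, mid_ch, out_ch, stride, v1_5=True):
    # v1:   downsampling stride sits on the first 1x1 convolution
    # v1.5: downsampling stride moves to the 3x3 convolution
    s1, s2 = (1, stride) if v1_5 else (stride, 1)
    return nn.Sequential(
        nn.Conv2d(in_ch, mid_ch, kernel_size=1, stride=s1, bias=False),
        nn.BatchNorm2d(mid_ch),
        nn.ReLU(inplace=True),
        nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=s2, padding=1, bias=False),
        nn.BatchNorm2d(mid_ch),
        nn.ReLU(inplace=True),
        nn.Conv2d(mid_ch, out_ch, kernel_size=1, bias=False),
        nn.BatchNorm2d(out_ch),
    )
```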
### Default configuration
The following sections highlight the default configurations for the ResNet50 model.
#### Optimizer
This model uses SGD with momentum as the optimizer, with the following hyperparameters:
* Momentum (0.875)
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes we linearly scale the learning rate.
* Learning rate schedule - we use cosine LR schedule
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
during the first couple of epochs
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
Warmup length depends on the total training length.
* Weight decay (WD) = 3.0517578125e-05 (1/32768).
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
* Label smoothing = 0.1
* We train for:
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
* 90 Epochs -> 90 epochs is a standard for ImageNet networks
* 250 Epochs -> best possible accuracy.
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
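A minimal sketch of how this recipe maps onto plain PyTorch (not the repository's exact code; the module-type check used to exclude Batch Norm parameters from weight decay, and the 8-epoch warmup for a 90-epoch run, are illustrative assumptions):

```python
import math
import torch
import torch.nn as nn

def build_optimizer_and_schedule(model, base_lr=0.256, momentum=0.875,
                                 weight_decay=3.0517578125e-05,
                                 warmup_epochs=8, total_epochs=90):
    # split parameters so that weight decay is not applied to Batch Norm gamma/beta
    bn_params, other_params = [], []
    for module in model.modules():
        bucket = bn_params if isinstance(module, nn.modules.batchnorm._BatchNorm) else other_params
        bucket.extend(p for p in module.parameters(recurse=False) if p.requires_grad)

    optimizer = torch.optim.SGD(
        [{"params": bn_params, "weight_decay": 0.0},
         {"params": other_params, "weight_decay": weight_decay}],
        lr=base_lr, momentum=momentum)

    # linear warmup for the first epochs, then cosine decay towards zero
    def lr_factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs
        progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_factor)
    return optimizer, scheduler
```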
#### Data augmentation
This model uses the following data augmentation:
* For training:
* Normalization
* Random resized crop to 224x224
* Scale from 8% to 100%
* Aspect ratio from 3/4 to 4/3
* Random horizontal flip
* For inference:
* Normalization
* Scale to 256x256
* Center crop to 224x224
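For the PyTorch data backend, the above roughly corresponds to the following torchvision transforms (a sketch; the ImageNet mean/std values and the exact composition are assumptions, and the DALI backends implement the equivalent pipeline on their own):

```python
import torchvision.transforms as transforms

# commonly used ImageNet channel statistics (assumed here for illustration)
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# training: random resized crop (scale 8%-100%, aspect ratio 3/4-4/3) + horizontal flip
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# inference: resize, then center crop to 224x224
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
```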
#### Other training recipes
This script does not target any specific benchmark.
There are changes that others have made which can speed up convergence and/or increase accuracy.
One of the more popular training recipes is provided by [fast.ai](https://github.com/fastai/imagenet-fast).
The fast.ai recipe introduces many changes to the training procedure, one of which is progressive resizing of the training images.
The first part of training uses 128px images, the middle part uses 224px images, and the last part uses 288px images.
The final validation is performed on 288px images.
The training script in this repository performs validation on 224px images, just as the original paper describes.
These two approaches can't be directly compared, since the fast.ai recipe requires validation on 288px images,
and this recipe keeps the original assumption that validation is done on 224px images.
Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
### Feature support matrix
The following features are supported by this model:
| Feature | ResNet50 |
|-----------------------|--------------------------|
|[DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes |
|[APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
#### Features
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
### DALI
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
which speeds up data loading when the CPU becomes a bottleneck.
DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
For DGX1 we recommend `--data-backend dali-cpu`, for DGX2 we recommend `--data-backend dali-gpu`.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
In PyTorch, loss scaling can be easily applied by using the scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walkthrough of AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Cores performance.
To enable mixed precision, you can:
- Import AMP from APEX, for example:
```
from apex import amp
```
- Initialize an AMP handle, for example:
```
amp_handle = amp.init(enabled=True, verbose=True)
```
- Wrap your optimizer with the AMP handle, for example:
```
optimizer = amp_handle.wrap_optimizer(optimizer)
```
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
- Default backpropagate for FP32:
```
losses.backward()
```
- Scale loss and backpropagate with AMP:
```
with optimizer.scale_loss(losses) as scaled_losses:
scaled_losses.backward()
```
## Setup
The following section lists the requirements that you need to meet in order to start training the ResNet50 model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
DGX Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
### 1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Classification/
```
### 2. Download and preprocess the dataset.
The ResNet50 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
PyTorch can work directly on JPEGs, therefore, preprocessing/augmentation is not needed.
1. [Download the images](http://image-net.org/download-images).
2. Extract the training data:
```bash
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
cd ..
```
3. Extract the validation data and move the images to subfolders:
```bash
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
```
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
### 3. Build the RN50v1.5 PyTorch NGC container.
```
docker build . -t nvidia_rn50
```
### 4. Start an interactive session in the NGC container to run training/inference.
```
nvidia-docker run --rm -it -v <path to imagenet>:/data/imagenet --ipc=host nvidia_rn50
```
### 5. Start training
To run training for a standard configuration (DGX1V/DGX2V, FP16/FP32, 50/90/250 Epochs),
run one of the scripts in the `./resnet50v1.5/training` directory
called `./resnet50v1.5/training/{DGX1, DGX2}_RN50_{AMP, FP16, FP32}_{50,90,250}E.sh`.
Ensure ImageNet is mounted in the `/data/imagenet` directory.
Example:
`bash ./resnet50v1.5/training/DGX1_RN50_FP16_250E.sh <path where to store checkpoints and logs>`
### 6. Start inference
To run inference on ImageNet on a checkpointed model, run:
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
To run a non-standard configuration use:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
### Command-line options:
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python main.py -h`
```
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
[--model-config CONF] [-j N] [--epochs N] [-b N]
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
[--momentum M] [--weight-decay W] [--bn-weight-decay]
[--nesterov] [--print-freq N] [--resume PATH]
[--pretrained-weights PATH] [--fp16]
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
[--training-only] [--no-checkpoints] [--workspace DIR]
DIR
PyTorch ImageNet Training
positional arguments:
DIR path to dataset
optional arguments:
-h, --help show this help message and exit
--data-backend BACKEND
data backend: pytorch | syntetic | dali-gpu | dali-cpu
(default: dali-cpu)
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
resnet101 | resnet152 | resnet50 | se-
resnet50 (default: resnet50)
--model-config CONF, -c CONF
model configs: classic | fanin | grp-fanin | grp-
fanout(default: classic)
-j N, --workers N number of data loading workers (default: 5)
--epochs N number of total epochs to run
-b N, --batch-size N mini-batch size (default: 256) per gpu
--optimizer-batch-size N
size of a total batch size, for simulating bigger
batches using gradient accumulation
--lr LR, --learning-rate LR
initial learning rate
--lr-schedule SCHEDULE
Type of LR schedule: step, linear, cosine
--warmup E number of warmup epochs
--label-smoothing S label smoothing
--mixup ALPHA mixup alpha
--momentum M momentum
--weight-decay W, --wd W
weight decay (default: 1e-4)
--bn-weight-decay use weight_decay on batch normalization learnable
parameters, (default: false)
--nesterov use nesterov momentum, (default: false)
--print-freq N, -p N print frequency (default: 10)
--resume PATH path to latest checkpoint (default: none)
--pretrained-weights PATH
load weights from here
--fp16 Run model fp16 mode.
--static-loss-scale STATIC_LOSS_SCALE
Static loss scale, positive power of 2 values can
improve fp16 convergence.
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
supersedes --static-loss-scale.
--prof N Run only N iterations
--amp Run model AMP (automatic mixed precision) mode.
--local_rank LOCAL_RANK
Local rank of python process. Set up by distributed
launcher
--seed SEED random seed used for numpy and pytorch
--gather-checkpoints Gather checkpoints throughout the training, without
this flag only best and last checkpoints will be
stored
--raport-file RAPORT_FILE
file in which to store JSON experiment raport
--evaluate evaluate checkpoint/model
--training-only do not evaluate
--no-checkpoints do not store any checkpoints, useful for benchmarking
--workspace DIR path to directory where checkpoints will be stored
```
### Dataset guidelines
To use your own dataset, divide it in directories as in the following scheme:
- Training images - `train/<class id>/<image>`
- Validation images - `val/<class id>/<image>`
If your dataset has a number of classes different from 1000, you need to add a custom config
in the `image_classification/resnet.py` file.
```python
resnet_versions = {
...
'resnet50-custom' : {
'net' : ResNet,
'block' : Bottleneck,
'layers' : [3, 4, 6, 3],
'widths' : [64, 128, 256, 512],
'expansion' : 4,
'num_classes' : <custom number of classes>,
}
}
```
After adding the config, run the training script with the `--arch resnet50-custom` flag.
### Training process
All the results of the training will be stored in the directory specified with the `--workspace` argument.
The script will store:
- the most recent checkpoint - `checkpoint.pth.tar` (unless the `--no-checkpoints` flag is used).
- the checkpoint with the best validation accuracy - `model_best.pth.tar` (unless the `--no-checkpoints` flag is used).
- the JSON log - in the file specified with the `--raport-file` flag.
Metrics gathered through training:
- `train.loss` - training loss
- `train.total_ips` - training speed measured in images/second
- `train.compute_ips` - training speed measured in images/second, not counting data loading
- `train.data_time` - time spent on waiting on data
- `train.compute_time` - time spent in forward/backward pass
### Inference process
Validation is done every epoch, and can also be run separately on a checkpointed model.
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
Metrics gathered during evaluation:
- `val.loss` - validation loss
- `val.top1` - validation top1 accuracy
- `val.top5` - validation top5 accuracy
- `val.total_ips` - inference speed measured in images/second
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
- `val.data_time` - time spent on waiting on data
- `val.compute_time` - time spent on inference
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
Then run the classification script:
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
Example output:
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark training, run:
* For 1 GPU
* FP32
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
* For multiple GPUs
* FP32
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
* FP16
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
* AMP
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
#### Inference performance benchmark
To benchmark inference, run:
* FP32
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
* FP16
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
* AMP
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
### Results
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
| 90 | 77.23 +/- 0.04 | 77.08 +/- 0.08 |
| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|:-:|:-:|:-:|
| 50 | 75.81 +/- 0.08 | 76.04 +/- 0.05 |
| 90 | 77.10 +/- 0.06 | 77.23 +/- 0.04 |
| 250 | 78.59 +/- 0.13 | 78.46 +/- 0.03 |
##### Example plots
The following images show a 250 epochs configuration on a DGX-1V.
![ValidationLoss](./img/loss_plot.png)
![ValidationTop1](./img/top1_plot.png)
![ValidationTop5](./img/top5_plot.png)
#### Training performance results
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 893.09 img/s | 380.44 img/s | 2.35x | 1.00x | 1.00x |
| 8 | 6888.75 img/s | 2945.37 img/s | 2.34x | 7.71x | 7.74x |
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 849.63 img/s | 373.93 img/s | 2.27x | 1.00x | 1.00x |
| 8 | 6614.15 img/s | 2911.22 img/s | 2.27x | 7.78x | 7.79x |
##### Training performance: NVIDIA DGX2 (16x V100 32G)
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 894.41 img/s | 402.23 img/s | 2.22x | 1.00x | 1.00x |
| 16 | 13443.82 img/s | 6263.41 img/s | 2.15x | 15.03x | 15.57x |
#### Training time for 90 epochs
##### Training time: NVIDIA DGX-1 (8x V100 16G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 95 h |
| 8 | ~ 7 h | ~ 14 h |
##### Training time: NVIDIA DGX-2 (16x V100 32G)
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|:-:|:-:|:-:|
| 1 | ~ 41 h | ~ 90 h |
| 16 | ~ 5 h | ~ 8 h |
#### Inference performance results
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 136.82 img/s | 7.12ms | 7.25ms | 8.36ms | 10.92ms |
| 2 | 266.86 img/s | 7.27ms | 7.41ms | 7.85ms | 9.11ms |
| 4 | 521.76 img/s | 7.44ms | 7.58ms | 8.14ms | 10.09ms |
| 8 | 766.22 img/s | 10.18ms | 10.46ms | 10.97ms | 12.75ms |
| 16 | 976.36 img/s | 15.79ms | 15.88ms | 15.95ms | 16.63ms |
| 32 | 1092.27 img/s | 28.63ms | 28.71ms | 28.76ms | 29.30ms |
| 64 | 1161.55 img/s | 53.69ms | 53.86ms | 53.90ms | 54.23ms |
| 128 | 1209.12 img/s | 104.24ms | 104.68ms | 104.80ms | 105.00ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 114.97 img/s | 8.56ms | 9.32ms | 11.43ms | 12.79ms |
| 2 | 238.70 img/s | 8.20ms | 8.75ms | 9.49ms | 12.31ms |
| 4 | 448.69 img/s | 8.67ms | 9.20ms | 9.97ms | 10.60ms |
| 8 | 875.00 img/s | 8.88ms | 9.31ms | 9.80ms | 10.82ms |
| 16 | 1746.07 img/s | 8.89ms | 9.05ms | 9.56ms | 12.81ms |
| 32 | 2004.28 img/s | 14.07ms | 14.14ms | 14.31ms | 14.92ms |
| 64 | 2254.60 img/s | 25.93ms | 26.05ms | 26.07ms | 26.17ms |
| 128 | 2360.14 img/s | 50.14ms | 50.28ms | 50.34ms | 50.68ms |
| 256 | 2342.13 img/s | 96.74ms | 96.91ms | 96.99ms | 97.14ms |
##### Inference performance: NVIDIA T4
###### FP32 Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 179.85 img/s | 5.51ms | 5.65ms | 7.34ms | 10.97ms |
| 2 | 348.12 img/s | 5.67ms | 5.95ms | 6.33ms | 9.81ms |
| 4 | 556.27 img/s | 7.03ms | 7.34ms | 8.13ms | 9.65ms |
| 8 | 740.43 img/s | 10.32ms | 10.33ms | 10.60ms | 13.87ms |
| 16 | 909.17 img/s | 17.19ms | 17.15ms | 18.13ms | 21.06ms |
| 32 | 999.07 img/s | 31.07ms | 31.12ms | 31.17ms | 32.41ms |
| 64 | 1090.47 img/s | 57.62ms | 57.84ms | 57.91ms | 58.05ms |
| 128 | 1142.46 img/s | 110.94ms | 111.15ms | 111.23ms | 112.16ms |
| 256 | N/A | N/A | N/A | N/A | N/A |
###### Mixed Precision Inference Latency
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|:-:|:-:|:-:|:-:|:-:|:-:|
| 1 | 163.78 img/s | 6.05ms | 5.92ms | 7.98ms | 11.58ms |
| 2 | 333.43 img/s | 5.91ms | 6.05ms | 6.63ms | 11.52ms |
| 4 | 645.45 img/s | 6.04ms | 6.33ms | 7.01ms | 8.90ms |
| 8 | 1164.15 img/s | 6.73ms | 7.31ms | 8.04ms | 12.41ms |
| 16 | 1606.42 img/s | 9.53ms | 9.86ms | 10.52ms | 17.01ms |
| 32 | 1857.29 img/s | 15.67ms | 15.61ms | 16.14ms | 18.66ms |
| 64 | 2011.62 img/s | 28.64ms | 28.69ms | 28.82ms | 31.06ms |
| 128 | 2083.90 img/s | 54.87ms | 54.96ms | 54.99ms | 55.27ms |
| 256 | 2043.72 img/s | 106.51ms | 106.62ms | 106.68ms | 107.03ms |
## Release notes
### Changelog
1. September 2018
* Initial release
2. January 2019
* Added options Label Smoothing, fan-in initialization, skipping weight decay on batch norm gamma and bias.
3. May 2019
* Cosine LR schedule
* MixUp regularization
* DALI support
* DGX2 configurations
* gradients accumulation
4. July 2019
* DALI-CPU dataloader
* Updated README
### Known issues
There are no known issues with this model.

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50

Просмотреть файл

@ -0,0 +1 @@
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90

Просмотреть файл

@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
- defaults
dependencies:
- pip=20.2
- python=3.7.9
- pip:
- azureml-designer-cv-modules==0.0.41

Просмотреть файл

@ -0,0 +1,127 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command
name: microsoftsamples_init_image_transformation
display_name: Init Image Transformation
description: Initialize image transformation.
version: 1
inputs:
resize:
description: Resize the input PIL Image to the given size
type: string
default: True
enum: ['True', 'False']
size:
description: Desired output size
type: integer
default: 256
center_crop:
description: Crops the given PIL Image at the center
type: string
default: False
enum: ['True', 'False']
crop_size:
description: Desired output size of the crop
type: integer
default: 224
pad:
description: Pad the given PIL Image on all sides with the given "pad" value
type: string
default: False
enum: ['True', 'False']
padding:
description: Padding on each border
type: integer
default: 0
color_jitter:
description: Randomly change the brightness, contrast and saturation of an image
type: boolean
default: false
grayscale:
description: Convert image to grayscale
type: boolean
default: false
random_resized_crop:
description: Crop the given PIL Image to random size and aspect ratio
type: string
default: False
enum: ['True', 'False']
random_resized_crop_size:
description: Expected output size of each edge
type: integer
default: 256
random_crop:
description: Crop the given PIL Image at a random location
type: string
default: False
enum: ['True', 'False']
random_crop_size:
description: Desired output size of the crop
type: integer
default: 224
random_horizontal_flip:
description: Horizontally flip the given PIL Image randomly with a given probability
type: boolean
default: false
random_vertical_flip:
description: Vertically flip the given PIL Image randomly with a given probability
type: boolean
default: false
random_rotation:
description: Rotate the image by angle
type: boolean
default: false
random_rotation_degrees:
description: Range of degrees to select from
type: integer
default: 0
random_affine:
description: Random affine transformation of the image keeping center invariant
type: boolean
default: false
random_affine_degrees:
description: Range of degrees to select from
type: integer
default: 0
random_grayscale:
description: Randomly convert image to grayscale with a probability of p (default 0.1)
type: boolean
default: false
random_perspective:
description: Performs Perspective transformation of the given PIL Image randomly with a given probability
type: boolean
default: false
outputs:
output_path:
type: uri_folder
description: Output image transformation
command: >-
python -m azureml.designer.modules.computer_vision.transform.init_image_transformation.init_image_transformation
--resize ${{inputs.resize}}
--size ${{inputs.size}}
--center-crop ${{inputs.center_crop}}
--crop-size ${{inputs.crop_size}}
--pad ${{inputs.pad}}
--padding ${{inputs.padding}}
--color-jitter ${{inputs.color_jitter}}
--grayscale ${{inputs.grayscale}}
--random-resized-crop ${{inputs.random_resized_crop}}
--random-resized-crop-size ${{inputs.random_resized_crop_size}}
--random-crop ${{inputs.random_crop}}
--random-crop-size ${{inputs.random_crop_size}}
--random-horizontal-flip ${{inputs.random_horizontal_flip}}
--random-vertical-flip ${{inputs.random_vertical_flip}}
--random-rotation ${{inputs.random_rotation}}
--random-rotation-degrees ${{inputs.random_rotation_degrees}}
--random-affine ${{inputs.random_affine}}
--random-affine-degrees ${{inputs.random_affine_degrees}}
--random-grayscale ${{inputs.random_grayscale}}
--random-perspective ${{inputs.random_perspective}}
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1

Просмотреть файл

@ -0,0 +1,95 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
# <inputs_and_outputs>
inputs:
training_image: # using local data, will create an anonymous data asset
type: uri_folder
path: ./data/train
validation_image:
type: uri_folder
path: ./data/val
# </inputs_and_outputs>
# <jobs>
settings:
default_datastore: azureml:workspaceblobstore
default_compute: azureml:cpu-cluster
continue_on_step_failure: false
jobs:
convert_training_image:
type: command
component: file:./convert_to_image_directory/entry.spec.yaml
inputs:
input_path: ${{parent.inputs.training_image}}
convert_evaluation_image:
type: command
component: file:./convert_to_image_directory/entry.spec.yaml
inputs:
input_path: ${{parent.inputs.validation_image}}
init_transformation:
type: command
component: file:./init_image_transformation/entry.spec.yaml
inputs:
resize: "False"
size: 256
center_crop: 224
pad: "False"
padding: 0
color_jitter: "False"
grayscale: "False"
random_resized_crop: "False"
random_resized_crop_size: 256
random_crop: "False"
random_crop_size: 224
random_horizontal_flip: "True"
random_vertical_flip: "True"
random_rotation: "False"
random_rotation_degrees: 0
random_affine: "False"
random_affine_degrees: 0
random_grayscale: "False"
random_perspective: "False"
transform_on_training_image:
type: command
component: file:./apply_image_transformation/entry.spec.yaml
inputs:
mode: "For training"
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
input_image_dir_path: ${{parent.jobs.convert_training_image.outputs.output_path}}
transform_on_evaluation_image:
type: command
component: file:./apply_image_transformation/entry.spec.yaml
inputs:
mode: "For inference"
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
input_image_dir_path: ${{parent.jobs.convert_evaluation_image.outputs.output_path}}
train:
type: command
component: file:./image_cnn_train/entry.spec.yaml
compute: azureml:gpu-cluster
inputs:
train_data: ${{parent.jobs.transform_on_training_image.outputs.output_path}}
valid_data: ${{parent.jobs.transform_on_evaluation_image.outputs.output_path}}
data_backend: "pytorch"
epochs: 4
seed: 123
batch_size: 16
save_checkpoint_epochs: 2
outputs:
workspace:
type: uri_folder
mode: upload
distribution:
type: mpi
process_count_per_instance: 1
resources:
instance_count: 2
# </jobs>

Просмотреть файл

@ -0,0 +1,15 @@
---
page_type: sample
languages:
- azurecli
- python
products:
- azure-machine-learning
description: This sample shows how to use a distributed job on an Azure ML compute cluster. It uses the cifar-10 dataset to process data, train a model and then evaluate the output model.
---
# Submit pipeline job
This example shows how to build a three-step pipeline. You need to use a GPU SKU, or a powerful CPU SKU like `STANDARD_D15_V2`, for the train and eval steps in this pipeline.
Please change `process_count_per_instance` to the number of GPU cards you have so that you fully utilize your compute resources.
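The pipeline job can be submitted from this directory with `az ml job create --file pipeline.yml`, which is the same command the `cli/run-pipeline-jobs.sh` test script runs in CI.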

Просмотреть файл

@ -0,0 +1,56 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
display_name: cifar-10-pipeline-example
experiment_name: cifar-10-pipeline-example
jobs:
get-data:
type: command
command: >-
wget https://azuremlexamples.blob.core.windows.net/datasets/cifar-10-python.tar.gz;
tar -xvzf cifar-10-python.tar.gz -C ${{outputs.cifar}};
rm cifar-10-python.tar.gz;
compute: azureml:gpu-cluster
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
outputs:
cifar:
type: uri_folder
mode: upload
train-model:
type: command
command: >-
python main.py
--data-dir ${{inputs.cifar}}
--epochs ${{inputs.epochs}}
--model-dir ${{outputs.model_dir}}
code: src/train-model
inputs:
epochs: 1
cifar: ${{parent.jobs.get-data.outputs.cifar}}
outputs:
model_dir:
type: uri_folder
mode: upload
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
compute: azureml:gpu-cluster
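# 2 instances x 1 process per instance -> 2 workers in total (the training script reads WORLD_SIZE/RANK/LOCAL_RANK set by this PyTorch distribution)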
distribution:
type: pytorch
process_count_per_instance: 1
resources:
instance_count: 2
eval-model:
type: command
command: >-
python main.py
--data-dir ${{inputs.cifar}}
--model-dir ${{inputs.model_dir}}/model
code: src/eval-model
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
compute: azureml:gpu-cluster
distribution:
type: pytorch
process_count_per_instance: 2
resources:
instance_count: 1
inputs:
cifar: ${{parent.jobs.get-data.outputs.cifar}}
model_dir: ${{parent.jobs.train-model.outputs.model_dir}}

Просмотреть файл

@ -0,0 +1,147 @@
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
# BSD 3-Clause License
#
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
# ==============================================================================
# imports
import os
import mlflow
import argparse
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# TODO - add mlflow logging
# define functions
def evaluate(test_loader, model, device):
classes = (
"plane",
"car",
"bird",
"cat",
"deer",
"dog",
"frog",
"horse",
"ship",
"truck",
)
model.eval()
correct = 0
total = 0
class_correct = list(0.0 for i in range(10))
class_total = list(0.0 for i in range(10))
with torch.no_grad():
for data in test_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
c = (predicted == labels).squeeze()
for i in range(labels.size(0)):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
# print total test set accuracy
print(
"Accuracy of the network on the 10000 test images: %d %%"
% (100 * correct / total)
)
# print test accuracy for each of the classes
for i in range(10):
print(
"Accuracy of %5s : %2d %%"
% (classes[i], 100 * class_correct[i] / class_total[i])
)
def main(args):
# get PyTorch environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
distributed = world_size > 1
# set device
if distributed and torch.cuda.is_available():
device = torch.device("cuda", local_rank)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize distributed process group using default env:// method
if distributed:
torch.distributed.init_process_group(
backend="nccl" if torch.cuda.is_available() else "gloo"
)
# define test dataset DataLoaders
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
test_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=False, download=False, transform=transform
)
test_loader = torch.utils.data.DataLoader(
test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers
)
# load model
model = mlflow.pytorch.load_model(args.model_dir)
model = model.to(device)
# evaluate on full test dataset
if not distributed or rank == 0:
evaluate(test_loader, model, device)
def parse_args():
# setup argparse
parser = argparse.ArgumentParser()
# add arguments
parser.add_argument(
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
)
parser.add_argument(
"--model-dir", type=str, default="./", help="input directory for model"
)
parser.add_argument(
"--batch-size",
default=16,
type=int,
help="mini batch size for each gpu/process",
)
parser.add_argument(
"--workers",
default=2,
type=int,
help="number of data loading workers for each gpu/process",
)
# parse args
args = parser.parse_args()
# return args
return args
# run script
if __name__ == "__main__":
# parse args
args = parse_args()
# call main function
main(args)

Просмотреть файл

@ -0,0 +1,199 @@
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
# BSD 3-Clause License
#
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
# ==============================================================================
# imports
import os
import mlflow
import argparse
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# TODO - add mlflow logging
# define network architecture
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 32, 3)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
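# with 32x32 CIFAR-10 inputs, three 3x3 convs (no padding) and two 2x2 max-pools
# leave a 6x6x128 feature map, hence the 128 * 6 * 6 input features of fc1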
self.fc1 = nn.Linear(128 * 6 * 6, 120)
self.dropout = nn.Dropout(p=0.2)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(F.relu(self.conv2(x)))
x = self.pool(F.relu(self.conv3(x)))
x = x.view(-1, 128 * 6 * 6)
x = self.dropout(F.relu(self.fc1(x)))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
# define functions
def train(train_loader, model, criterion, optimizer, epoch, device, print_freq, rank):
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % print_freq == 0: # print every print_freq mini-batches
print(
"Rank %d: [%d, %5d] loss: %.3f"
% (rank, epoch + 1, i + 1, running_loss / print_freq)
)
running_loss = 0.0
def main(args):
# get PyTorch environment variables
world_size = int(os.environ["WORLD_SIZE"])
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
distributed = world_size > 1
# set device
if distributed and torch.cuda.is_available():
device = torch.device("cuda", local_rank)
else:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize distributed process group using default env:// method
if distributed:
torch.distributed.init_process_group(
backend="nccl" if torch.cuda.is_available() else "gloo"
)
# define train and dataset DataLoaders
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
train_set = torchvision.datasets.CIFAR10(
root=args.data_dir, train=True, download=False, transform=transform
)
if distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
else:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=args.batch_size,
shuffle=(train_sampler is None),
num_workers=args.workers,
sampler=train_sampler,
)
model = Net().to(device)
# wrap model with DDP
if distributed and torch.cuda.is_available():
model = nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank
)
# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
model.parameters(), lr=args.learning_rate, momentum=args.momentum
)
# train the model
for epoch in range(args.epochs):
print("Rank %d: Starting epoch %d" % (rank, epoch))
if distributed:
train_sampler.set_epoch(epoch)
model.train()
train(
train_loader,
model,
criterion,
optimizer,
epoch,
device,
args.print_freq,
rank,
)
print("Rank %d: Finished Training" % (rank))
if not distributed or rank == 0:
# log model
mlflow.pytorch.save_model(model, f"{args.model_dir}/model")
def parse_args():
# setup argparse
parser = argparse.ArgumentParser()
# add arguments
parser.add_argument(
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
)
parser.add_argument(
"--model-dir", type=str, default="./", help="output directory for model"
)
parser.add_argument("--epochs", default=10, type=int, help="number of epochs")
parser.add_argument(
"--batch-size",
default=16,
type=int,
help="mini batch size for each gpu/process",
)
parser.add_argument(
"--workers",
default=2,
type=int,
help="number of data loading workers for each gpu/process",
)
parser.add_argument(
"--learning-rate", default=0.001, type=float, help="learning rate"
)
parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
parser.add_argument(
"--print-freq",
default=200,
type=int,
help="frequency of printing training statistics",
)
# parse args
args = parser.parse_args()
# return args
return args
# run script
if __name__ == "__main__":
# parse args
args = parse_args()
# call main function
main(args)

Просмотреть файл

@ -36,7 +36,6 @@ def main(args):
jobs += sorted(
glob.glob("jobs/pipelines-with-components/**/*pipeline*.yml", recursive=True)
)
jobs += sorted(glob.glob("jobs/*/basics/**/*pipeline*.yml", recursive=True))
jobs = [
job.replace(".yml", "")
for job in jobs
@ -290,6 +289,11 @@ def parse_path(path):
def write_job_workflow(job):
filename, project_dir, hyphenated = parse_path(job)
creds = "${{secrets.AZ_CREDS}}"
run_pipeline_job_path = (
"\n - cli/run-pipeline-jobs.sh"
if hyphenated.startswith("jobs-pipelines")
else ""
)
workflow_yaml = f"""name: cli-{hyphenated}
on:
workflow_dispatch:
@ -300,7 +304,7 @@ on:
- main
paths:
- cli/{project_dir}/**
- .github/workflows/cli-{hyphenated}.yml
- .github/workflows/cli-{hyphenated}.yml{run_pipeline_job_path}
- cli/setup.sh
jobs:
build:

Просмотреть файл

@ -37,6 +37,11 @@ pwd
az ml job create --file pipeline.yml
cd ../../../../
cd jobs/pipelines-with-components/basics/3c_pipeline_with_hyperparameter_sweep
pwd
az ml job create --file pipeline.yml
cd ../../../../
cd jobs/pipelines-with-components/basics/4a_local_data_input
pwd
az ml job create --file pipeline.yml
@ -47,12 +52,6 @@ pwd
az ml job create --file pipeline.yml
cd ../../../../
# cd jobs/pipelines-with-components/basics/4c_dataset_input
# pwd
# az ml data create --file data.yml --version $target_version
# az ml job create --file pipeline.yml
# cd ../../../../
cd jobs/pipelines-with-components/basics/4c_web_url_input
pwd
az ml job create --file pipeline.yml
@ -103,9 +102,9 @@ pwd
az ml job create --file pipeline.yml
cd ../../../
# cd jobs/pipelines/cifar-10
# pwd
# az ml job create --file pipeline.yml --web
# cd ../../../
cd jobs/pipelines/cifar-10
pwd
az ml job create --file pipeline.yml --web
cd ../../../
az --version