feat: re-enable image/cifar-10 pipeline tests & update readme.py (#1112)
* feat: test ci
* feat: enable cifar-10 & image classification
* feat: add sample for hyperparameter sweep
* fix: fix ci
* fix: fix ci
* fix ci: update readme.py
* fix: black reformat
* Revert "fix: black reformat"
  This reverts commit d47755571f.
* fix: image pipeline yml has been renamed
* fix: update output mode
* fix: update output mode
* change setting to use artifact store and fix typo
Co-authored-by: lochen <cloga0216@gmail.com>
This commit is contained in:
Parent: 55eb858324
Commit: c46a0de2b2
@@ -1,4 +1,4 @@
name: cli-scripts-run-pipeline-jobs
name: cli-jobs-pipelines-cifar-10-pipeline
on:
  workflow_dispatch:
  schedule:

@@ -7,8 +7,9 @@ on:
    branches:
      - main
    paths:
      - cli/jobs/pipelines/cifar-10/**
      - .github/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - .github/workflows/cli-scripts-run-pipeline-jobs.yml
      - cli/setup.sh
jobs:
  build:

@@ -24,8 +25,6 @@ jobs:
        run: bash setup.sh
        working-directory: cli
        continue-on-error: true
      - name: scripts installs
        run: sudo apt-get upgrade -y && sudo apt-get install uuid-runtime jq -y
      - name: test script script
        run: set -e; bash -x run-pipeline-jobs.sh
      - name: run job
        run: bash -x run-job.sh jobs/pipelines/cifar-10/pipeline.yml
        working-directory: cli
@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines/nyc-taxi/**
      - .github/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:
@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/1a_e2e_local_components/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/1b_e2e_registered_components/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/2a_basic_component/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/2b_component_with_input_output/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/3a_basic_pipeline/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/3b_pipeline_with_data/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/4a_local_data_input/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/4c_web_url_input/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/5a_env_public_docker_image/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/5b_env_registered/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/5c_env_conda_file/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/6a_tf_hello_world/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:

@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/basics/6c_r_iris/**
      - .github/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:
.github/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml (new file, 30 lines, vendored)
@@ -0,0 +1,30 @@
name: cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline
on:
  workflow_dispatch:
  schedule:
    - cron: "0 0/4 * * *"
  pull_request:
    branches:
      - main
    paths:
      - cli/jobs/pipelines-with-components/image_classification_with_densenet/**
      - .github/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZ_CREDS}}
      - name: setup
        run: bash setup.sh
        working-directory: cli
        continue-on-error: true
      - name: run job
        run: bash -x run-job.sh jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml
        working-directory: cli
@@ -9,6 +9,7 @@ on:
    paths:
      - cli/jobs/pipelines-with-components/nyc_taxi_data_regression/**
      - .github/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml
      - cli/run-pipeline-jobs.sh
      - cli/setup.sh
jobs:
  build:
@@ -47,7 +47,6 @@ path|status|
|
|||
[deploy-triton-managed-online-endpoint.sh](deploy-triton-managed-online-endpoint.sh)|[![deploy-triton-managed-online-endpoint](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-triton-managed-online-endpoint/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-triton-managed-online-endpoint.yml)
|
||||
[misc.sh](misc.sh)|[![misc](https://github.com/Azure/azureml-examples/workflows/cli-scripts-misc/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-misc.yml)
|
||||
[mlflow-uri.sh](mlflow-uri.sh)|[![mlflow-uri](https://github.com/Azure/azureml-examples/workflows/cli-scripts-mlflow-uri/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-mlflow-uri.yml)
|
||||
[run-pipeline-jobs.sh](run-pipeline-jobs.sh)|[![run-pipeline-jobs](https://github.com/Azure/azureml-examples/workflows/cli-scripts-run-pipeline-jobs/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-run-pipeline-jobs.yml)
|
||||
[train-rest.sh](train-rest.sh)|[![train-rest](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train-rest/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train-rest.yml)
|
||||
[train.sh](train.sh)|[![train](https://github.com/Azure/azureml-examples/workflows/cli-scripts-train/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-train.yml)
|
||||
|
||||
|
@@ -94,6 +93,7 @@ path|status|description
|
|||
[jobs/basics/hello-world-output-data.yml](jobs/basics/hello-world-output-data.yml)|[![jobs/basics/hello-world-output-data](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output-data/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output-data.yml)|*no description*
|
||||
[jobs/basics/hello-world-output.yml](jobs/basics/hello-world-output.yml)|[![jobs/basics/hello-world-output](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world-output/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world-output.yml)|*no description*
|
||||
[jobs/basics/hello-world.yml](jobs/basics/hello-world.yml)|[![jobs/basics/hello-world](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-world/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-world.yml)|*no description*
|
||||
[jobs/pipelines/cifar-10/pipeline.yml](jobs/pipelines/cifar-10/pipeline.yml)|[![jobs/pipelines/cifar-10/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-cifar-10-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-cifar-10-pipeline.yml)|*no description*
|
||||
[jobs/pipelines/nyc-taxi/pipeline.yml](jobs/pipelines/nyc-taxi/pipeline.yml)|[![jobs/pipelines/nyc-taxi/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-nyc-taxi-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-nyc-taxi-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
|
||||
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
|
||||
|
@@ -110,22 +110,8 @@ path|status|description
|
|||
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
|
||||
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
|
||||
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
|
||||
[jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml](jobs/pipelines-with-components/image_classification_with_densenet/pipeline.yml)|[![jobs/pipelines-with-components/image_classification_with_densenet/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-image_classification_with_densenet-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml](jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline.yml)|[![jobs/pipelines-with-components/nyc_taxi_data_regression/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml](jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1a_e2e_local_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1a_e2e_local_components-pipeline.yml)|"Dummy train-score-eval pipeline with local components"
|
||||
[jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml](jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline.yml)|[![jobs/pipelines-with-components/basics/1b_e2e_registered_components/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-1b_e2e_registered_components-pipeline.yml)|"E2E dummy train-score-eval pipeline with registered components"
|
||||
[jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml](jobs/pipelines-with-components/basics/2a_basic_component/pipeline.yml)|[![jobs/pipelines-with-components/basics/2a_basic_component/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2a_basic_component-pipeline.yml)|"Hello World component example"
|
||||
[jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml](jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline.yml)|[![jobs/pipelines-with-components/basics/2b_component_with_input_output/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-2b_component_with_input_output-pipeline.yml)|"Component with inputs and outputs"
|
||||
[jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml](jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline.yml)|[![jobs/pipelines-with-components/basics/3a_basic_pipeline/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3a_basic_pipeline-pipeline.yml)|"Basic Pipeline Job with 3 Hello World components"
|
||||
[jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml](jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline.yml)|[![jobs/pipelines-with-components/basics/3b_pipeline_with_data/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-3b_pipeline_with_data-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml](jobs/pipelines-with-components/basics/4a_local_data_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4a_local_data_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4a_local_data_input-pipeline.yml)|"Example of using data in a local folder as pipeline input"
|
||||
[jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml](jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline.yml)|[![jobs/pipelines-with-components/basics/4b_datastore_datapath_uri/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4b_datastore_datapath_uri-pipeline.yml)|"Example of using data folder from a Workspace Datastore as pipeline input"
|
||||
[jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml](jobs/pipelines-with-components/basics/4c_web_url_input/pipeline.yml)|[![jobs/pipelines-with-components/basics/4c_web_url_input/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-4c_web_url_input-pipeline.yml)|"Example of using a file hosted at a web URL as pipeline input"
|
||||
[jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml](jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline.yml)|[![jobs/pipelines-with-components/basics/5a_env_public_docker_image/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5a_env_public_docker_image-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml](jobs/pipelines-with-components/basics/5b_env_registered/pipeline.yml)|[![jobs/pipelines-with-components/basics/5b_env_registered/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5b_env_registered-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml](jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline.yml)|[![jobs/pipelines-with-components/basics/5c_env_conda_file/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-5c_env_conda_file-pipeline.yml)|*no description*
|
||||
[jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6a_tf_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6a_tf_hello_world-pipeline.yml)|"Prints the environment variable ($TF_CONFIG) useful for scripts running in a Tensorflow training environment"
|
||||
[jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml](jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline.yml)|[![jobs/pipelines-with-components/basics/6b_pytorch_hello_world/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6b_pytorch_hello_world-pipeline.yml)|"Prints the environment variables useful for scripts running in a PyTorch training environment"
|
||||
[jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml](jobs/pipelines-with-components/basics/6c_r_iris/pipeline.yml)|[![jobs/pipelines-with-components/basics/6c_r_iris/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-basics-6c_r_iris-pipeline.yml)|Train an R model on the Iris dataset.
|
||||
|
||||
**Endpoints** ([endpoints](endpoints))
|
||||
|
||||
|
@@ -184,4 +170,3 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope

- [Documentation](https://docs.microsoft.com/azure/machine-learning)
- [Private previews](https://github.com/Azure/azureml-previews)
@@ -2,12 +2,19 @@ $schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

settings:
  default_datestore: azureml:workspaceblobstore
  default_datastore: azureml:workspaceartifactstore
  default_compute: azureml:cpu-cluster

jobs:
  hello_job:
    command: echo "hello"
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
    command: echo "hello-world" > ${{outputs.world_output}}/world.txt
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
    compute: azureml:cpu-cluster
    outputs:
      world_output:
  world_job:
    command: echo "world"
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
    command: cat ${{inputs.world_input}}/world.txt
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
    compute: azureml:cpu-cluster
    inputs:
      world_input: ${{parent.jobs.hello_job.outputs.world_output}}
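For readability, here is a minimal sketch of how the pipeline job in this hunk reads once the change is applied (the surrounding unchanged lines are assumed from context, so treat it as illustrative rather than the exact file):

$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

settings:
  default_datastore: azureml:workspaceartifactstore
  default_compute: azureml:cpu-cluster

jobs:
  hello_job:
    command: echo "hello-world" > ${{outputs.world_output}}/world.txt
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
    compute: azureml:cpu-cluster
    outputs:
      world_output:
  world_job:
    command: cat ${{inputs.world_input}}/world.txt
    environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:23
    compute: azureml:cpu-cluster
    inputs:
      world_input: ${{parent.jobs.hello_job.outputs.world_output}}

hello_job writes world.txt into its world_output folder and world_job reads it back through the ${{parent.jobs.hello_job.outputs.world_output}} binding; the default_datastore setting (now azureml:workspaceartifactstore) is what backs that output folder by default.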
@@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
  - defaults
dependencies:
  - pip=20.2
  - python=3.7.9
  - pip:
      - azureml-designer-cv-modules[pytorch]==0.0.41
@@ -0,0 +1,35 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command

name: microsoftsamples_apply_image_transformation
display_name: Apply Image Transformation
description: Applies a image transformation to a image directory.

version: 0.0.1

inputs:
  input_image_transform_path:
    description: Input image transformation
    type: uri_folder
  input_image_dir_path:
    description: Input image directory
    type: uri_folder
  mode:
    description: Should exclude 'Random' transform operations in inference but keep them in training
    type: string
    default: For training
    enum: ['For training', 'For inference']
outputs:
  output_path:
    type: uri_folder
    description: Output image directory

command: >-
  python -m azureml.designer.modules.computer_vision.transform.apply_image_transformation.apply_image_transformation
  --input-image-transform-path ${{inputs.input_image_transform_path}}
  --input-image-dir-path ${{inputs.input_image_dir_path}}
  --mode "For training"
  --output-path ${{outputs.output_path}}
environment:
  conda_file: ./conda.yaml
  image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1
@@ -0,0 +1,8 @@
name: designer-cv-transform
channels:
  - defaults
dependencies:
  - pip=20.2
  - python=3.7.9
  - pip:
      - azureml-designer-cv-modules==0.0.41
@@ -0,0 +1,25 @@
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
type: command

name: microsoftsamples_convert_to_image_directory
display_name: Convert to Image Directory
description: Convert dataset to image directory format.

version: 1

inputs:
  input_path:
    type: uri_folder
    description: Input dataset
outputs:
  output_path:
    type: uri_folder
    description: Output image directory

command: >-
  python -m azureml.designer.modules.computer_vision.preprocess.convert_to_image_directory.convert_to_image_directory
  --input-path ${{inputs.input_path}}
  --output-path ${{outputs.output_path}}
environment:
  conda_file: ./conda.yaml
  image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1
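For context, a minimal, hypothetical sketch of how a pipeline job could reference the two component specs above and chain them (the job names, relative paths, and data bindings are illustrative assumptions, not part of this diff):

jobs:
  convert_to_image_directory:
    type: command
    component: file:./convert_to_image_directory/component_spec.yaml
    inputs:
      input_path: ${{parent.inputs.training_data}}
  apply_image_transformation:
    type: command
    component: file:./apply_image_transformation/component_spec.yaml
    inputs:
      # the transform folder would come from another component that is not part of this excerpt
      input_image_transform_path: ${{parent.jobs.init_image_transformation.outputs.output_path}}
      input_image_dir_path: ${{parent.jobs.convert_to_image_directory.outputs.output_path}}
      mode: "For training"

The actual densenet pipeline.yml is not shown in this excerpt; the sketch only illustrates how command components with conda_file/image environments are wired into a pipeline.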
cli/jobs/pipelines-with-components/image_classification_with_densenet/data/train/train.zip (binary data, new file; binary file not shown)
cli/jobs/pipelines-with-components/image_classification_with_densenet/data/val/val.zip (binary data, new file; binary file not shown)
@@ -0,0 +1,89 @@
# Convolutional Networks for Image Classification in PyTorch

In this repository you will find implementations of various image classification models.

## Table Of Contents

* [Models](#models)
* [Validation accuracy results](#validation-accuracy-results)
* [Training performance results](#training-performance-results)
  * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
  * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-(16x-v100-32G))
* [Model comparison](#model-comparison)
  * [Accuracy vs FLOPS](#accuracy-vs-flops)
  * [Latency vs Throughput on different batch sizes](#latency-vs-throughput-on-different-batch-sizes)

## Models

The following table provides links to where you can find additional information on each model:

| **Model** | **Link**|
|:-:|:-:|
| resnet50 | [README](./resnet50v1.5/README.md) |
| resnext101-32x4d | [README](./resnext101-32x4d/README.md) |
| se-resnext101-32x4d | [README](./se-resnext101-32x4d/README.md) |

## Validation accuracy results

Our results were obtained by running the applicable
training scripts in the [framework-container-name] NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
The specific training script that was run is documented
in the corresponding model's README.

The following table shows the validation accuracy results of the
three classification models side-by-side.

| **arch** | **AMP Top1** | **AMP Top5** | **FP32 Top1** | **FP32 Top5** |
|:-:|:-:|:-:|:-:|:-:|
| resnet50 | 78.46 | 94.15 | 78.50 | 94.11 |
| resnext101-32x4d | 80.08 | 94.89 | 80.14 | 95.02 |
| se-resnext101-32x4d | 81.01 | 95.52 | 81.12 | 95.54 |

## Training performance results

### Training performance: NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-1 with (8x V100 16G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.

The following table shows the training performance results of the
three classification models side-by-side.

| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 6888.75 img/s | 2945.37 img/s | 2.34x |
| resnext101-32x4d | 2384.85 img/s | 1116.58 img/s | 2.14x |
| se-resnext101-32x4d | 2031.17 img/s | 977.45 img/s | 2.08x |

### Training performance: NVIDIA DGX-2 (16x V100 32G)

Our results were obtained by running the applicable
training scripts in the pytorch-19.10 NGC container
on NVIDIA DGX-2 with (16x V100 32G) GPUs.
Performance numbers (in images per second)
were averaged over an entire training epoch.
The specific training script that was run is documented
in the corresponding model's README.

The following table shows the training performance results of the
three classification models side-by-side.

| **arch** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** |
|:-:|:-:|:-:|:-:|
| resnet50 | 13443.82 img/s | 6263.41 img/s | 2.15x |
| resnext101-32x4d | 4473.37 img/s | 2261.97 img/s | 1.98x |
| se-resnext101-32x4d | 3776.03 img/s | 1953.13 img/s | 1.93x |
@@ -0,0 +1,42 @@
|
|||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import torch
|
||||
|
||||
|
||||
def add_parser_arguments(parser):
|
||||
parser.add_argument(
|
||||
"--checkpoint-path", metavar="<path>", help="checkpoint filename"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight-path", metavar="<path>", help="name of file in which to store weights"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
|
||||
|
||||
add_parser_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
checkpoint = torch.load(args.checkpoint_path)
|
||||
|
||||
model_state_dict = {
|
||||
k[len("module.1.") :] if "module.1." in k else k: v
|
||||
for k, v in checkpoint["state_dict"].items()
|
||||
}
|
||||
|
||||
print(f"Loaded {checkpoint['arch']} : {checkpoint['best_prec1']}")
|
||||
|
||||
torch.save(model_state_dict, args.weight_path)
|
|
@@ -0,0 +1,96 @@
|
|||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from PIL import Image
|
||||
import argparse
|
||||
import numpy as np
|
||||
import json
|
||||
import torch
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torchvision.transforms as transforms
|
||||
import image_classification.resnet as models
|
||||
from image_classification.dataloaders import load_jpeg_from_file
|
||||
|
||||
try:
|
||||
from apex.fp16_utils import *
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install apex from https://www.github.com/nvidia/apex to run this example."
|
||||
)
|
||||
|
||||
|
||||
def add_parser_arguments(parser):
|
||||
model_names = models.resnet_versions.keys()
|
||||
model_configs = models.resnet_configs.keys()
|
||||
parser.add_argument("--image-size", default="224", type=int)
|
||||
parser.add_argument(
|
||||
"--arch",
|
||||
"-a",
|
||||
metavar="ARCH",
|
||||
default="resnet50",
|
||||
choices=model_names,
|
||||
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-config",
|
||||
"-c",
|
||||
metavar="CONF",
|
||||
default="classic",
|
||||
choices=model_configs,
|
||||
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
|
||||
)
|
||||
parser.add_argument("--weights", metavar="<path>", help="file with model weights")
|
||||
parser.add_argument(
|
||||
"--precision", metavar="PREC", default="FP16", choices=["AMP", "FP16", "FP32"]
|
||||
)
|
||||
parser.add_argument("--image", metavar="<path>", help="path to classified image")
|
||||
|
||||
|
||||
def main(args):
|
||||
imgnet_classes = np.array(json.load(open("./LOC_synset_mapping.json", "r")))
|
||||
model = models.build_resnet(args.arch, args.model_config, verbose=False)
|
||||
|
||||
if args.weights is not None:
|
||||
weights = torch.load(args.weights)
|
||||
model.load_state_dict(weights)
|
||||
|
||||
model = model.cuda()
|
||||
|
||||
if args.precision == "FP16":
|
||||
model = network_to_half(model)
|
||||
|
||||
model.eval()
|
||||
|
||||
with torch.no_grad():
|
||||
input = load_jpeg_from_file(
|
||||
args.image, cuda=True, fp16=args.precision != "FP32"
|
||||
)
|
||||
|
||||
output = torch.nn.functional.softmax(model(input), dim=1).cpu().view(-1).numpy()
|
||||
top5 = np.argsort(output)[-5:][::-1]
|
||||
|
||||
print(args.image)
|
||||
for c, v in zip(imgnet_classes[top5], output[top5]):
|
||||
print(f"{c}: {100*v:.1f}%")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
|
||||
|
||||
add_parser_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
cudnn.benchmark = True
|
||||
|
||||
main(args)
|
|
@@ -0,0 +1,21 @@
|
|||
name: train_environment
|
||||
channels:
|
||||
- defaults
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8.12
|
||||
- pip=21.2.2
|
||||
- pip:
|
||||
- azure-ml==0.0.58938149
|
||||
- --extra-index-url https://pypi.org/simple
|
||||
- --extra-index-url=https://azuremlsdktestpypi.azureedge.net/test-sdk-cli-v2
|
||||
- git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
|
||||
- watchdog==0.10.3
|
||||
- torch==1.8.1
|
||||
- torchvision==0.9.1
|
||||
- tensorboard==2.5.0
|
||||
- pillow==8.2.0
|
||||
- numpy==1.19.5
|
||||
- --extra-index-url=https://developer.download.nvidia.com/compute/redist/
|
||||
- nvidia-dali-cuda100
|
||||
- azureml-mlflow
|
|
@@ -0,0 +1,175 @@
|
|||
from pathlib import Path
|
||||
import sys
|
||||
import runpy
|
||||
import json
|
||||
import shutil
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from multiprocessing import cpu_count
|
||||
import functools
|
||||
from enum import Enum
|
||||
from azure.ml import dsl
|
||||
from azure.ml.dsl._component import ComponentExecutor
|
||||
from azure.ml.dsl._types import DataInput, NumberInput
|
||||
|
||||
|
||||
class Data_BackendEnum(Enum):
|
||||
pytorch = "pytorch"
|
||||
syntetic = "syntetic"
|
||||
dali_gpu = "dali-gpu"
|
||||
dali_cpu = "dali-cpu"
|
||||
|
||||
|
||||
class ArchEnum(Enum):
|
||||
resnet18 = "resnet18"
|
||||
resnet34 = "resnet34"
|
||||
resnet50 = "resnet50"
|
||||
resnet101 = "resnet101"
|
||||
resnet152 = "resnet152"
|
||||
resnext101_32x4d = "resnext101-32x4d"
|
||||
se_resnext101_32x4d = "se-resnext101-32x4d"
|
||||
|
||||
|
||||
class Model_ConfigEnum(Enum):
|
||||
classic = "classic"
|
||||
fanin = "fanin"
|
||||
grp_fanin = "grp-fanin"
|
||||
grp_fanout = "grp-fanout"
|
||||
|
||||
|
||||
class Lr_ScheduleEnum(Enum):
|
||||
step = "step"
|
||||
linear = "linear"
|
||||
cosine = "cosine"
|
||||
|
||||
|
||||
def convert_image_directory_to_specific_format(
|
||||
image_dir_path, output_root, is_train=False
|
||||
):
|
||||
# convert image directory to train component input data format
|
||||
image_dir_path = Path(image_dir_path)
|
||||
image_list_path = image_dir_path / "images.lst"
|
||||
output_data_path = output_root / ("train" if is_train else "val")
|
||||
category_list = []
|
||||
file_name_list = []
|
||||
with open(image_list_path, "r") as fin:
|
||||
for line in fin:
|
||||
line = json.loads(line)
|
||||
# print(line)
|
||||
category_list.append(line["category"])
|
||||
file_name_list.append(line["image_info"]["file_name"])
|
||||
(output_data_path / line["category"]).mkdir(parents=True, exist_ok=True)
|
||||
print(
|
||||
f"file number {len(file_name_list)}, category number {len(set(category_list))}."
|
||||
)
|
||||
|
||||
def copy_file(index):
|
||||
target_dir = output_data_path / category_list[index]
|
||||
shutil.copyfile(
|
||||
str(image_dir_path / file_name_list[index]),
|
||||
str(target_dir / Path(file_name_list[index]).name),
|
||||
)
|
||||
|
||||
with ThreadPool(cpu_count()) as p:
|
||||
p.map(functools.partial(copy_file), range(len(file_name_list)))
|
||||
|
||||
print(
|
||||
f"output path {output_data_path} has {len(list(output_data_path.glob('**/*')))} files."
|
||||
)
|
||||
return output_root
|
||||
|
||||
|
||||
@dsl.command_component(
|
||||
name="imagecnn_train", description="imagecnn_train main function"
|
||||
)
|
||||
def main(
|
||||
train_data: DataInput(description="path to train dataset") = None,
|
||||
val_data: DataInput(description="path to valid dataset") = None,
|
||||
data_backend="dali-cpu",
|
||||
arch="resnet50",
|
||||
model_config="classic",
|
||||
workers: int = 5,
|
||||
epochs: int = 90,
|
||||
batch_size: int = 256,
|
||||
optimizer_batch_size: int = -1,
|
||||
lr: float = 0.1,
|
||||
lr_schedule="step",
|
||||
warmup: int = 0,
|
||||
label_smoothing: float = 0.0,
|
||||
mixup: float = 0.0,
|
||||
momentum: float = 0.9,
|
||||
weight_decay: float = 0.0001,
|
||||
print_freq: int = 10,
|
||||
resume="",
|
||||
pretrained_weights="",
|
||||
static_loss_scale: float = 1,
|
||||
prof: int = -1,
|
||||
seed: int = None,
|
||||
raport_file="experiment_raport.json",
|
||||
workspace="./",
|
||||
save_checkpoint_epochs: int = 10,
|
||||
):
|
||||
new_data_path = Path(train_data).parent / "new_dataset"
|
||||
convert_image_directory_to_specific_format(
|
||||
image_dir_path=train_data, output_root=new_data_path, is_train=True
|
||||
)
|
||||
convert_image_directory_to_specific_format(
|
||||
image_dir_path=val_data, output_root=new_data_path
|
||||
)
|
||||
print(f"new data path {new_data_path}")
|
||||
sys.argv = [
|
||||
"main",
|
||||
"--data",
|
||||
str(new_data_path),
|
||||
"--data-backend",
|
||||
data_backend,
|
||||
"--arch",
|
||||
arch,
|
||||
"--model-config",
|
||||
model_config,
|
||||
"-j",
|
||||
str(workers),
|
||||
"--epochs",
|
||||
str(epochs),
|
||||
"-b",
|
||||
str(batch_size),
|
||||
"--optimizer-batch-size",
|
||||
str(optimizer_batch_size),
|
||||
"--lr",
|
||||
str(lr),
|
||||
"--lr-schedule",
|
||||
lr_schedule,
|
||||
"--warmup",
|
||||
str(warmup),
|
||||
"--label-smoothing",
|
||||
str(label_smoothing),
|
||||
"--mixup",
|
||||
str(mixup),
|
||||
"--momentum",
|
||||
str(momentum),
|
||||
"--weight-decay",
|
||||
str(weight_decay),
|
||||
"--print-freq",
|
||||
str(print_freq),
|
||||
"--resume",
|
||||
str(resume),
|
||||
"--pretrained-weights",
|
||||
str(pretrained_weights),
|
||||
"--static-loss-scale",
|
||||
str(static_loss_scale),
|
||||
"--prof",
|
||||
str(prof),
|
||||
"--seed",
|
||||
str(seed),
|
||||
"--raport-file",
|
||||
str(raport_file),
|
||||
"--workspace",
|
||||
str(workspace),
|
||||
"--save-checkpoint-epochs",
|
||||
str(save_checkpoint_epochs),
|
||||
]
|
||||
print(" ".join(sys.argv))
|
||||
runpy.run_path("main.py", run_name="__main__")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ComponentExecutor(main).execute(sys.argv)
|
|
@@ -0,0 +1,147 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/development/commandComponent.schema.json
|
||||
type: command
|
||||
|
||||
name: train_image_classification
|
||||
version: 0.0.1
|
||||
display_name: Train Image Classification
|
||||
|
||||
tags: {}
|
||||
inputs:
|
||||
train_data:
|
||||
type: path
|
||||
description: "path to train dataset"
|
||||
optional: false
|
||||
valid_data:
|
||||
type: path
|
||||
description: "path to valid dataset"
|
||||
optional: false
|
||||
data_backend:
|
||||
type: string
|
||||
description: "data backend: pytorch | syntetic | dali-gpu | dali-cpu (default: dali-cpu)"
|
||||
default: "dali-cpu"
|
||||
optional: true
|
||||
arch:
|
||||
type: string
|
||||
description: "model architecture: resnet18 | resnet34 | resnet50 | resnet101 | resnet152 | resnext101_32x4d | se_resnext101_32x4d (default: resnet50)"
|
||||
default: "resnet50"
|
||||
optional: true
|
||||
model_config:
|
||||
type: string
|
||||
description: "model configs: classic | fanin | grp_fanin | grp_fanout(default: classic)"
|
||||
default: "classic"
|
||||
optional: true
|
||||
workers:
|
||||
type: integer
|
||||
description: "number of data loading workers (default: 5)"
|
||||
default: 5
|
||||
optional: true
|
||||
epochs:
|
||||
type: integer
|
||||
description: number of total epochs to run
|
||||
default: 90
|
||||
optional: true
|
||||
batch_size:
|
||||
type: integer
|
||||
description: "mini-batch size (default: 256) per gpu"
|
||||
default: 256
|
||||
optional: true
|
||||
optimizer_batch_size:
|
||||
type: integer
|
||||
description: size of a total batch size, for simulating bigger batches using gradient accumulation
|
||||
default: -1
|
||||
optional: true
|
||||
lr:
|
||||
type: number
|
||||
description: initial learning rate
|
||||
default: 0.1
|
||||
optional: true
|
||||
lr_schedule:
|
||||
type: string
|
||||
description: "Type of LR schedule: step, linear, cosine"
|
||||
default: "step"
|
||||
optional: true
|
||||
warmup:
|
||||
type: integer
|
||||
description: number of warmup epochs
|
||||
default: 0
|
||||
optional: true
|
||||
label_smoothing:
|
||||
type: number
|
||||
description: label smoothing
|
||||
default: 0.0
|
||||
optional: true
|
||||
mixup:
|
||||
type: number
|
||||
description: mixup alpha
|
||||
default: 0.0
|
||||
optional: true
|
||||
momentum:
|
||||
type: number
|
||||
description: momentum
|
||||
default: 0.9
|
||||
optional: true
|
||||
weight_decay:
|
||||
type: number
|
||||
description: "weight decay (default: 1e-4)"
|
||||
default: 0.0001
|
||||
optional: true
|
||||
print_freq:
|
||||
type: integer
|
||||
description: "print frequency (default: 10)"
|
||||
default: 10
|
||||
optional: true
|
||||
resume:
|
||||
type: string
|
||||
description: "path to latest checkpoint (default: none)"
|
||||
default: ""
|
||||
optional: true
|
||||
pretrained_weights:
|
||||
type: string
|
||||
description: load weights from here
|
||||
default: ""
|
||||
optional: true
|
||||
static_loss_scale:
|
||||
type: number
|
||||
description: Static loss scale, positive power of 2 values can improve fp16 convergence.
|
||||
default: 1.0
|
||||
optional: true
|
||||
prof:
|
||||
type: integer
|
||||
description: Run only N iterations
|
||||
default: -1
|
||||
optional: true
|
||||
seed:
|
||||
type: integer
|
||||
description: random seed used for numpy and pytorch
|
||||
default: 123
|
||||
optional: true
|
||||
raport_file:
|
||||
type: string
|
||||
description: file in which to store JSON experiment raport
|
||||
default: experiment_raport.json
|
||||
optional: true
|
||||
save_checkpoint_epochs:
|
||||
type: integer
|
||||
description: how many epochs run between saving checkpoints
|
||||
default: 2
|
||||
optional: true
|
||||
outputs:
|
||||
workspace:
|
||||
type: uri_folder
|
||||
description: path to directory where checkpoints will be stored
|
||||
|
||||
code: ./
|
||||
|
||||
environment:
|
||||
image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn7-ubuntu18.04
|
||||
conda_file: ./conda.yaml
|
||||
|
||||
resources:
|
||||
instance_count: 2
|
||||
distribution:
|
||||
type: mpi
|
||||
process_count_per_instance: 1
|
||||
|
||||
command: >-
|
||||
git clone https://github.com/NVIDIA/apex && cd apex && git checkout 3303b3e7174383312a3468ef390060c26e640cb1 && python setup.py install && cd .. && python entry.py --train_data ${{inputs.train_data}} --val_data ${{inputs.valid_data}} [--data_backend ${{inputs.data_backend}}] [--arch ${{inputs.arch}}] [--model_config ${{inputs.model_config}}] [--workers ${{inputs.workers}}] [--epochs ${{inputs.epochs}}] [--batch_size ${{inputs.batch_size}}] [--optimizer_batch_size ${{inputs.optimizer_batch_size}}] [--lr ${{inputs.lr}}] [--lr_schedule ${{inputs.lr_schedule}}] [--warmup ${{inputs.warmup}}] [--label_smoothing ${{inputs.label_smoothing}}] [--mixup ${{inputs.mixup}}] [--momentum ${{inputs.momentum}}] [--weight_decay ${{inputs.weight_decay}}] [--print_freq ${{inputs.print_freq}}] [--resume ${{inputs.resume}}] [--pretrained_weights ${{inputs.pretrained_weights}}] [--static_loss_scale ${{inputs.static_loss_scale}}] [--prof ${{inputs.prof}}] [--seed ${{inputs.seed}}] [--raport_file ${{inputs.raport_file}}] [--save_checkpoint_epochs ${{inputs.save_checkpoint_epochs}}] --workspace ${{outputs.workspace}}
|
||||
|
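One detail worth noting in the command above: the square-bracketed segments wrap optional inputs. As far as this schema suggests, a bracketed segment is dropped from the rendered command when its optional input is not supplied; a minimal sketch with illustrative names (not taken from this diff):

inputs:
  train_data:
    type: path
    optional: false
  epochs:
    type: integer
    default: 90
    optional: true
command: >-
  python train.py --data ${{inputs.train_data}} [--epochs ${{inputs.epochs}}]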
|
@@ -0,0 +1,20 @@
|
|||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from . import logger
|
||||
from . import dataloaders
|
||||
from . import training
|
||||
from . import utils
|
||||
from . import mixup
|
||||
from . import resnet
|
||||
from . import smoothing
|
|
@@ -0,0 +1,489 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
import os
|
||||
import torch
|
||||
import numpy as np
|
||||
import torchvision.datasets as datasets
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
|
||||
DATA_BACKEND_CHOICES = ["pytorch", "syntetic"]
|
||||
try:
|
||||
from nvidia.dali.plugin.pytorch import DALIClassificationIterator
|
||||
from nvidia.dali.pipeline import Pipeline
|
||||
import nvidia.dali.ops as ops
|
||||
import nvidia.dali.types as types
|
||||
|
||||
DATA_BACKEND_CHOICES.append("dali-gpu")
|
||||
DATA_BACKEND_CHOICES.append("dali-cpu")
|
||||
except ImportError:
|
||||
print(
|
||||
"Please install DALI from https://www.github.com/NVIDIA/DALI to run this example."
|
||||
)
|
||||
|
||||
|
||||
def load_jpeg_from_file(path, cuda=True, fp16=False):
|
||||
img_transforms = transforms.Compose(
|
||||
[transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()]
|
||||
)
|
||||
|
||||
img = img_transforms(Image.open(path))
|
||||
with torch.no_grad():
|
||||
# mean and std are not multiplied by 255 as they are in training script
|
||||
# torch dataloader reads data into bytes whereas loading directly
|
||||
# through PIL creates a tensor with floats in [0,1] range
|
||||
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
|
||||
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
|
||||
|
||||
if cuda:
|
||||
mean = mean.cuda()
|
||||
std = std.cuda()
|
||||
img = img.cuda()
|
||||
if fp16:
|
||||
mean = mean.half()
|
||||
std = std.half()
|
||||
img = img.half()
|
||||
else:
|
||||
img = img.float()
|
||||
|
||||
input = img.unsqueeze(0).sub_(mean).div_(std)
|
||||
|
||||
return input
|
||||
|
||||
|
||||
class HybridTrainPipe(Pipeline):
|
||||
def __init__(
|
||||
self, batch_size, num_threads, device_id, data_dir, crop, dali_cpu=False
|
||||
):
|
||||
super(HybridTrainPipe, self).__init__(
|
||||
batch_size, num_threads, device_id, seed=12 + device_id
|
||||
)
|
||||
if torch.distributed.is_initialized():
|
||||
rank = torch.distributed.get_rank()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
else:
|
||||
rank = 0
|
||||
world_size = 1
|
||||
|
||||
self.input = ops.FileReader(
|
||||
file_root=data_dir,
|
||||
shard_id=rank,
|
||||
num_shards=world_size,
|
||||
random_shuffle=True,
|
||||
)
|
||||
|
||||
if dali_cpu:
|
||||
dali_device = "cpu"
|
||||
self.decode = ops.ImageDecoder(device=dali_device, output_type=types.RGB)
|
||||
else:
|
||||
dali_device = "gpu"
|
||||
# This padding sets the size of the internal nvJPEG buffers to be able to handle all images from full-sized ImageNet
|
||||
# without additional reallocations
|
||||
self.decode = ops.ImageDecoder(
|
||||
device="mixed",
|
||||
output_type=types.RGB,
|
||||
device_memory_padding=211025920,
|
||||
host_memory_padding=140544512,
|
||||
)
|
||||
|
||||
self.res = ops.RandomResizedCrop(
|
||||
device=dali_device,
|
||||
size=[crop, crop],
|
||||
interp_type=types.INTERP_LINEAR,
|
||||
random_aspect_ratio=[0.75, 4.0 / 3.0],
|
||||
random_area=[0.08, 1.0],
|
||||
num_attempts=100,
|
||||
)
|
||||
|
||||
self.cmnp = ops.CropMirrorNormalize(
|
||||
device="gpu",
|
||||
output_dtype=types.FLOAT,
|
||||
output_layout=types.NCHW,
|
||||
crop=(crop, crop),
|
||||
image_type=types.RGB,
|
||||
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
|
||||
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
|
||||
)
|
||||
self.coin = ops.CoinFlip(probability=0.5)
|
||||
|
||||
def define_graph(self):
|
||||
rng = self.coin()
|
||||
self.jpegs, self.labels = self.input(name="Reader")
|
||||
images = self.decode(self.jpegs)
|
||||
images = self.res(images)
|
||||
output = self.cmnp(images.gpu(), mirror=rng)
|
||||
return [output, self.labels]
|
||||
|
||||
|
||||
class HybridValPipe(Pipeline):
|
||||
def __init__(self, batch_size, num_threads, device_id, data_dir, crop, size):
|
||||
super(HybridValPipe, self).__init__(
|
||||
batch_size, num_threads, device_id, seed=12 + device_id
|
||||
)
|
||||
if torch.distributed.is_initialized():
|
||||
rank = torch.distributed.get_rank()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
else:
|
||||
rank = 0
|
||||
world_size = 1
|
||||
|
||||
self.input = ops.FileReader(
|
||||
file_root=data_dir,
|
||||
shard_id=rank,
|
||||
num_shards=world_size,
|
||||
random_shuffle=False,
|
||||
)
|
||||
|
||||
self.decode = ops.ImageDecoder(device="mixed", output_type=types.RGB)
|
||||
self.res = ops.Resize(device="gpu", resize_shorter=size)
|
||||
self.cmnp = ops.CropMirrorNormalize(
|
||||
device="gpu",
|
||||
output_dtype=types.FLOAT,
|
||||
output_layout=types.NCHW,
|
||||
crop=(crop, crop),
|
||||
image_type=types.RGB,
|
||||
mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
|
||||
std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
|
||||
)
|
||||
|
||||
def define_graph(self):
|
||||
self.jpegs, self.labels = self.input(name="Reader")
|
||||
images = self.decode(self.jpegs)
|
||||
images = self.res(images)
|
||||
output = self.cmnp(images)
|
||||
return [output, self.labels]
|
||||
|
||||
|
||||
class DALIWrapper(object):
|
||||
def gen_wrapper(dalipipeline, num_classes, one_hot):
|
||||
for data in dalipipeline:
|
||||
input = data[0]["data"]
|
||||
target = torch.reshape(data[0]["label"], [-1]).cuda().long()
|
||||
if one_hot:
|
||||
target = expand(num_classes, torch.float, target)
|
||||
yield input, target
|
||||
dalipipeline.reset()
|
||||
|
||||
def __init__(self, dalipipeline, num_classes, one_hot):
|
||||
self.dalipipeline = dalipipeline
|
||||
self.num_classes = num_classes
|
||||
self.one_hot = one_hot
|
||||
|
||||
def __iter__(self):
|
||||
return DALIWrapper.gen_wrapper(
|
||||
self.dalipipeline, self.num_classes, self.one_hot
|
||||
)
|
||||
|
||||
|
||||
def get_dali_train_loader(dali_cpu=False):
|
||||
def gdtl(
|
||||
data_path,
|
||||
batch_size,
|
||||
num_classes,
|
||||
one_hot,
|
||||
workers=5,
|
||||
_worker_init_fn=None,
|
||||
fp16=False,
|
||||
):
|
||||
if torch.distributed.is_initialized():
|
||||
rank = torch.distributed.get_rank()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
else:
|
||||
rank = 0
|
||||
world_size = 1
|
||||
|
||||
traindir = os.path.join(data_path, "train")
|
||||
|
||||
pipe = HybridTrainPipe(
|
||||
batch_size=batch_size,
|
||||
num_threads=workers,
|
||||
device_id=rank % torch.cuda.device_count(),
|
||||
data_dir=traindir,
|
||||
crop=224,
|
||||
dali_cpu=dali_cpu,
|
||||
)
|
||||
|
||||
pipe.build()
|
||||
train_loader = DALIClassificationIterator(
|
||||
pipe, size=int(pipe.epoch_size("Reader") / world_size)
|
||||
)
|
||||
|
||||
return DALIWrapper(train_loader, num_classes, one_hot), int(
|
||||
pipe.epoch_size("Reader") / (world_size * batch_size)
|
||||
)
|
||||
|
||||
return gdtl
|
||||
|
||||
|
||||
def get_dali_val_loader():
|
||||
def gdvl(
|
||||
data_path,
|
||||
batch_size,
|
||||
num_classes,
|
||||
one_hot,
|
||||
workers=5,
|
||||
_worker_init_fn=None,
|
||||
fp16=False,
|
||||
):
|
||||
if torch.distributed.is_initialized():
|
||||
rank = torch.distributed.get_rank()
|
||||
world_size = torch.distributed.get_world_size()
|
||||
else:
|
||||
rank = 0
|
||||
world_size = 1
|
||||
|
||||
valdir = os.path.join(data_path, "val")
|
||||
|
||||
pipe = HybridValPipe(
|
||||
batch_size=batch_size,
|
||||
num_threads=workers,
|
||||
device_id=rank % torch.cuda.device_count(),
|
||||
data_dir=valdir,
|
||||
crop=224,
|
||||
size=256,
|
||||
)
|
||||
|
||||
pipe.build()
|
||||
val_loader = DALIClassificationIterator(
|
||||
pipe, size=int(pipe.epoch_size("Reader") / world_size)
|
||||
)
|
||||
|
||||
return DALIWrapper(val_loader, num_classes, one_hot), int(
|
||||
pipe.epoch_size("Reader") / (world_size * batch_size)
|
||||
)
|
||||
|
||||
return gdvl
|
||||
|
||||
|
||||
def fast_collate(batch):
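# Collate PIL images into one uint8 NCHW tensor without normalizing; mean/std normalization happens later on the GPU.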
|
||||
imgs = [img[0] for img in batch]
|
||||
targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
|
||||
w = imgs[0].size[0]
|
||||
h = imgs[0].size[1]
|
||||
tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
|
||||
for i, img in enumerate(imgs):
|
||||
nump_array = np.asarray(img, dtype=np.uint8)
|
||||
tens = torch.from_numpy(nump_array)
|
||||
if nump_array.ndim < 3:
|
||||
nump_array = np.expand_dims(nump_array, axis=-1)
|
||||
nump_array = np.rollaxis(nump_array, 2)
|
||||
|
||||
tensor[i] += torch.from_numpy(nump_array)
|
||||
|
||||
return tensor, targets
|
||||
|
||||
|
||||
def expand(num_classes, dtype, tensor):
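# Expand integer class labels into one-hot vectors on the GPU.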
|
||||
e = torch.zeros(
|
||||
tensor.size(0), num_classes, dtype=dtype, device=torch.device("cuda")
|
||||
)
|
||||
e = e.scatter(1, tensor.unsqueeze(1), 1.0)
|
||||
return e
|
||||
|
||||
|
||||
class PrefetchedWrapper(object):
|
||||
def prefetched_loader(loader, num_classes, fp16, one_hot):
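# Prefetch the next batch on a side CUDA stream so host-to-device copies and normalization overlap with compute on the current batch.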
|
||||
mean = (
|
||||
torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
|
||||
.cuda()
|
||||
.view(1, 3, 1, 1)
|
||||
)
|
||||
std = (
|
||||
torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
|
||||
.cuda()
|
||||
.view(1, 3, 1, 1)
|
||||
)
|
||||
if fp16:
|
||||
mean = mean.half()
|
||||
std = std.half()
|
||||
|
||||
stream = torch.cuda.Stream()
|
||||
first = True
|
||||
|
||||
for next_input, next_target in loader:
|
||||
with torch.cuda.stream(stream):
|
||||
next_input = next_input.cuda(non_blocking=True)
|
||||
next_target = next_target.cuda(non_blocking=True)
|
||||
if fp16:
|
||||
next_input = next_input.half()
|
||||
if one_hot:
|
||||
next_target = expand(num_classes, torch.half, next_target)
|
||||
else:
|
||||
next_input = next_input.float()
|
||||
if one_hot:
|
||||
next_target = expand(num_classes, torch.float, next_target)
|
||||
|
||||
next_input = next_input.sub_(mean).div_(std)
|
||||
|
||||
if not first:
|
||||
yield input, target
|
||||
else:
|
||||
first = False
|
||||
|
||||
torch.cuda.current_stream().wait_stream(stream)
|
||||
input = next_input
|
||||
target = next_target
|
||||
|
||||
yield input, target
|
||||
|
||||
def __init__(self, dataloader, num_classes, fp16, one_hot):
|
||||
self.dataloader = dataloader
|
||||
self.fp16 = fp16
|
||||
self.epoch = 0
|
||||
self.one_hot = one_hot
|
||||
self.num_classes = num_classes
|
||||
|
||||
def __iter__(self):
|
||||
if self.dataloader.sampler is not None and isinstance(
|
||||
self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler
|
||||
):
|
||||
|
||||
self.dataloader.sampler.set_epoch(self.epoch)
|
||||
self.epoch += 1
|
||||
return PrefetchedWrapper.prefetched_loader(
|
||||
self.dataloader, self.num_classes, self.fp16, self.one_hot
|
||||
)
|
||||
|
||||
|
||||
def get_pytorch_train_loader(
|
||||
data_path,
|
||||
batch_size,
|
||||
num_classes,
|
||||
one_hot,
|
||||
workers=5,
|
||||
_worker_init_fn=None,
|
||||
fp16=False,
|
||||
):
|
||||
traindir = os.path.join(data_path, "train")
|
||||
train_dataset = datasets.ImageFolder(
|
||||
traindir,
|
||||
transforms.Compose(
|
||||
[
|
||||
transforms.RandomResizedCrop(224),
|
||||
transforms.RandomHorizontalFlip(),
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
if torch.distributed.is_initialized():
|
||||
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
|
||||
else:
|
||||
train_sampler = None
|
||||
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
train_dataset,
|
||||
batch_size=batch_size,
|
||||
shuffle=(train_sampler is None),
|
||||
num_workers=workers,
|
||||
worker_init_fn=_worker_init_fn,
|
||||
pin_memory=True,
|
||||
sampler=train_sampler,
|
||||
collate_fn=fast_collate,
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
return PrefetchedWrapper(train_loader, num_classes, fp16, one_hot), len(
|
||||
train_loader
|
||||
)
|
||||
|
||||
|
||||
def get_pytorch_val_loader(
|
||||
data_path,
|
||||
batch_size,
|
||||
num_classes,
|
||||
one_hot,
|
||||
workers=5,
|
||||
_worker_init_fn=None,
|
||||
fp16=False,
|
||||
):
|
||||
valdir = os.path.join(data_path, "val")
|
||||
val_dataset = datasets.ImageFolder(
|
||||
valdir,
|
||||
transforms.Compose(
|
||||
[
|
||||
transforms.Resize(256),
|
||||
transforms.CenterCrop(224),
|
||||
]
|
||||
),
|
||||
)
|
||||
|
||||
if torch.distributed.is_initialized():
|
||||
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
|
||||
else:
|
||||
val_sampler = None
|
||||
|
||||
val_loader = torch.utils.data.DataLoader(
|
||||
val_dataset,
|
||||
sampler=val_sampler,
|
||||
batch_size=batch_size,
|
||||
shuffle=False,
|
||||
num_workers=workers,
|
||||
worker_init_fn=_worker_init_fn,
|
||||
pin_memory=True,
|
||||
collate_fn=fast_collate,
|
||||
)
|
||||
|
||||
return PrefetchedWrapper(val_loader, num_classes, fp16, one_hot), len(val_loader)
|
||||
|
||||
|
||||
class SynteticDataLoader(object):
|
||||
def __init__(
|
||||
self, fp16, batch_size, num_classes, num_channels, height, width, one_hot
|
||||
):
|
||||
input_data = (
|
||||
torch.empty(batch_size, num_channels, height, width).cuda().normal_(0, 1.0)
|
||||
)
|
||||
if one_hot:
|
||||
input_target = torch.empty(batch_size, num_classes).cuda()
|
||||
input_target[:, 0] = 1.0
|
||||
else:
|
||||
input_target = torch.randint(0, num_classes, (batch_size,))
|
||||
input_target = input_target.cuda()
|
||||
if fp16:
|
||||
input_data = input_data.half()
|
||||
|
||||
self.input_data = input_data
|
||||
self.input_target = input_target
|
||||
|
||||
def __iter__(self):
|
||||
while True:
|
||||
yield self.input_data, self.input_target
|
||||
|
||||
|
||||
def get_syntetic_loader(
|
||||
data_path,
|
||||
batch_size,
|
||||
num_classes,
|
||||
one_hot,
|
||||
workers=None,
|
||||
_worker_init_fn=None,
|
||||
fp16=False,
|
||||
):
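# Note: the synthetic loader is hard-wired to 1000 classes and 3x224x224 inputs, regardless of the num_classes argument.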
|
||||
return SynteticDataLoader(fp16, batch_size, 1000, 3, 224, 224, one_hot), -1
|
|
@ -0,0 +1,311 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
from collections import OrderedDict
|
||||
import dllogger
|
||||
import numpy as np
|
||||
|
||||
|
||||
def format_step(step):
|
||||
if isinstance(step, str):
|
||||
return step
|
||||
s = ""
|
||||
if len(step) > 0:
|
||||
s += "Epoch: {} ".format(step[0])
|
||||
if len(step) > 1:
|
||||
s += "Iteration: {} ".format(step[1])
|
||||
if len(step) > 2:
|
||||
s += "Validation Iteration: {} ".format(step[2])
|
||||
if len(step) == 0:
|
||||
s = "Summary:"
|
||||
return s
|
||||
|
||||
|
||||
PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
|
||||
LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
|
||||
ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
|
||||
LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
|
||||
|
||||
LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
|
||||
LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
|
||||
LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
|
||||
|
||||
|
||||
class Meter(object):
|
||||
def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
|
||||
self.run_aggregator = run_aggregator
|
||||
self.epoch_aggregator = epoch_aggregator
|
||||
self.iteration_aggregator = iteration_aggregator
|
||||
|
||||
def record(self, val, n=1):
|
||||
self.iteration_aggregator.record(val, n=n)
|
||||
|
||||
def get_iteration(self):
|
||||
v, n = self.iteration_aggregator.get_val()
|
||||
return v
|
||||
|
||||
def reset_iteration(self):
|
||||
v, n = self.iteration_aggregator.get_data()
|
||||
self.iteration_aggregator.reset()
|
||||
if v is not None:
|
||||
self.epoch_aggregator.record(v, n=n)
|
||||
|
||||
def get_epoch(self):
|
||||
v, n = self.epoch_aggregator.get_val()
|
||||
return v
|
||||
|
||||
def reset_epoch(self):
|
||||
v, n = self.epoch_aggregator.get_data()
|
||||
self.epoch_aggregator.reset()
|
||||
if v is not None:
|
||||
self.run_aggregator.record(v, n=n)
|
||||
|
||||
def get_run(self):
|
||||
v, n = self.run_aggregator.get_val()
|
||||
return v
|
||||
|
||||
def reset_run(self):
|
||||
self.run_aggregator.reset()
|
||||
|
||||
|
||||
class QuantileMeter(object):
|
||||
def __init__(self, q):
|
||||
self.q = q
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.vals = []
|
||||
self.n = 0
|
||||
|
||||
def record(self, val, n=1):
|
||||
if isinstance(val, list):
|
||||
self.vals += val
|
||||
self.n += len(val)
|
||||
else:
|
||||
self.vals += [val] * n
|
||||
self.n += n
|
||||
|
||||
def get_val(self):
|
||||
if not self.vals:
|
||||
return None, self.n
|
||||
return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
|
||||
|
||||
def get_data(self):
|
||||
return self.vals, self.n
|
||||
|
||||
|
||||
class MaxMeter(object):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.max = None
|
||||
self.n = 0
|
||||
|
||||
def record(self, val, n=1):
|
||||
if self.max is None:
|
||||
self.max = val
|
||||
else:
|
||||
self.max = max(self.max, val)
|
||||
self.n = n
|
||||
|
||||
def get_val(self):
|
||||
return self.max, self.n
|
||||
|
||||
def get_data(self):
|
||||
return self.max, self.n
|
||||
|
||||
|
||||
class MinMeter(object):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.min = None
|
||||
self.n = 0
|
||||
|
||||
def record(self, val, n=1):
|
||||
if self.min is None:
|
||||
self.min = val
|
||||
else:
|
||||
self.min = max(self.min, val)
|
||||
self.n = n
|
||||
|
||||
def get_val(self):
|
||||
return self.min, self.n
|
||||
|
||||
def get_data(self):
|
||||
return self.min, self.n
|
||||
|
||||
|
||||
class LastMeter(object):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.last = None
|
||||
self.n = 0
|
||||
|
||||
def record(self, val, n=1):
|
||||
self.last = val
|
||||
self.n = n
|
||||
|
||||
def get_val(self):
|
||||
return self.last, self.n
|
||||
|
||||
def get_data(self):
|
||||
return self.last, self.n
|
||||
|
||||
|
||||
class AverageMeter(object):
|
||||
def __init__(self):
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.n = 0
|
||||
self.val = 0
|
||||
|
||||
def record(self, val, n=1):
|
||||
self.n += n
|
||||
self.val += val * n
|
||||
|
||||
def get_val(self):
|
||||
if self.n == 0:
|
||||
return None, 0
|
||||
return self.val / self.n, self.n
|
||||
|
||||
def get_data(self):
|
||||
if self.n == 0:
|
||||
return None, 0
|
||||
return self.val / self.n, self.n
|
||||
|
||||
|
||||
class Logger(object):
|
||||
def __init__(self, print_interval, backends, verbose=False, last_epoch=-1):
|
||||
self.epoch = last_epoch
|
||||
self.iteration = -1
|
||||
self.val_iteration = -1
|
||||
self.metrics = OrderedDict()
|
||||
self.backends = backends
|
||||
self.print_interval = print_interval
|
||||
self.verbose = verbose
|
||||
dllogger.init(backends)
|
||||
|
||||
def log_parameter(self, data, verbosity=0):
|
||||
dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
|
||||
|
||||
def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
|
||||
if self.verbose:
|
||||
print("Registering metric: {}".format(metric_name))
|
||||
self.metrics[metric_name] = {"meter": meter, "level": verbosity}
|
||||
dllogger.metadata(metric_name, metadata)
|
||||
|
||||
def log_metric(self, metric_name, val, n=1):
|
||||
self.metrics[metric_name]["meter"].record(val, n=n)
|
||||
|
||||
def start_iteration(self, val=False):
|
||||
if val:
|
||||
self.val_iteration += 1
|
||||
else:
|
||||
self.iteration += 1
|
||||
|
||||
def end_iteration(self, val=False):
|
||||
it = self.val_iteration if val else self.iteration
|
||||
if it % self.print_interval == 0:
|
||||
metrics = {
|
||||
n: m for n, m in self.metrics.items() if n.startswith("val") == val
|
||||
}
|
||||
step = (
|
||||
(self.epoch, self.iteration)
|
||||
if not val
|
||||
else (self.epoch, self.iteration, self.val_iteration)
|
||||
)
|
||||
|
||||
verbositys = {m["level"] for _, m in metrics.items()}
|
||||
for ll in verbositys:
|
||||
llm = {n: m for n, m in metrics.items() if m["level"] == ll}
|
||||
|
||||
dllogger.log(
|
||||
step=step,
|
||||
data={n: m["meter"].get_iteration() for n, m in llm.items()},
|
||||
verbosity=ll,
|
||||
)
|
||||
|
||||
for n, m in metrics.items():
|
||||
m["meter"].reset_iteration()
|
||||
|
||||
dllogger.flush()
|
||||
|
||||
def start_epoch(self):
|
||||
self.epoch += 1
|
||||
self.iteration = 0
|
||||
self.val_iteration = 0
|
||||
|
||||
for n, m in self.metrics.items():
|
||||
m["meter"].reset_epoch()
|
||||
|
||||
def end_epoch(self):
|
||||
for n, m in self.metrics.items():
|
||||
m["meter"].reset_iteration()
|
||||
|
||||
verbositys = {m["level"] for _, m in self.metrics.items()}
|
||||
for ll in verbositys:
|
||||
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
|
||||
dllogger.log(
|
||||
step=(self.epoch,),
|
||||
data={n: m["meter"].get_epoch() for n, m in llm.items()},
|
||||
)
|
||||
|
||||
def end(self):
|
||||
for n, m in self.metrics.items():
|
||||
m["meter"].reset_epoch()
|
||||
|
||||
verbositys = {m["level"] for _, m in self.metrics.items()}
|
||||
for ll in verbositys:
|
||||
llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
|
||||
dllogger.log(
|
||||
step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
|
||||
)
|
||||
|
||||
for n, m in self.metrics.items():
|
||||
m["meter"].reset_epoch()
|
||||
|
||||
dllogger.flush()
|
||||
|
||||
def iteration_generator_wrapper(self, gen, val=False):
|
||||
for g in gen:
|
||||
self.start_iteration(val=val)
|
||||
yield g
|
||||
self.end_iteration(val=val)
|
||||
|
||||
def epoch_generator_wrapper(self, gen):
|
||||
for g in gen:
|
||||
self.start_epoch()
|
||||
yield g
|
||||
self.end_epoch()
|
|
@ -0,0 +1,67 @@
|
|||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
|
||||
def mixup(alpha, num_classes, data, target):
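# Mix each sample with a randomly permuted partner using a Beta(alpha, alpha) coefficient; targets are mixed with the same weights.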
|
||||
with torch.no_grad():
|
||||
bs = data.size(0)
|
||||
c = np.random.beta(alpha, alpha)
|
||||
|
||||
perm = torch.randperm(bs).cuda()
|
||||
|
||||
md = c * data + (1 - c) * data[perm, :]
|
||||
mt = c * target + (1 - c) * target[perm, :]
|
||||
return md, mt
|
||||
|
||||
|
||||
class MixUpWrapper(object):
|
||||
def __init__(self, alpha, num_classes, dataloader):
|
||||
self.alpha = alpha
|
||||
self.dataloader = dataloader
|
||||
self.num_classes = num_classes
|
||||
|
||||
def mixup_loader(self, loader):
|
||||
for input, target in loader:
|
||||
i, t = mixup(self.alpha, self.num_classes, input, target)
|
||||
yield i, t
|
||||
|
||||
def __iter__(self):
|
||||
return self.mixup_loader(self.dataloader)
|
||||
|
||||
|
||||
class NLLMultiLabelSmooth(nn.Module):
|
||||
def __init__(self, smoothing=0.0):
|
||||
super(NLLMultiLabelSmooth, self).__init__()
|
||||
self.confidence = 1.0 - smoothing
|
||||
self.smoothing = smoothing
|
||||
|
||||
def forward(self, x, target):
|
||||
if self.training:
|
||||
x = x.float()
|
||||
target = target.float()
|
||||
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
|
||||
|
||||
nll_loss = -logprobs * target
|
||||
nll_loss = nll_loss.sum(-1)
|
||||
|
||||
smooth_loss = -logprobs.mean(dim=-1)
|
||||
|
||||
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
|
||||
|
||||
return loss.mean()
|
||||
else:
|
||||
return torch.nn.functional.cross_entropy(x, target)
|
|
@ -0,0 +1,411 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
__all__ = ["ResNet", "build_resnet", "resnet_versions", "resnet_configs"]
|
||||
|
||||
# ResNetBuilder {{{
|
||||
|
||||
|
||||
class ResNetBuilder(object):
|
||||
def __init__(self, version, config):
|
||||
self.conv3x3_cardinality = (
|
||||
1 if "cardinality" not in version.keys() else version["cardinality"]
|
||||
)
|
||||
self.config = config
|
||||
|
||||
def conv(self, kernel_size, in_planes, out_planes, groups=1, stride=1):
|
||||
conv = nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=kernel_size,
|
||||
groups=groups,
|
||||
stride=stride,
|
||||
padding=int((kernel_size - 1) / 2),
|
||||
bias=False,
|
||||
)
|
||||
|
||||
if self.config["nonlinearity"] == "relu":
|
||||
nn.init.kaiming_normal_(
|
||||
conv.weight,
|
||||
mode=self.config["conv_init"],
|
||||
nonlinearity=self.config["nonlinearity"],
|
||||
)
|
||||
|
||||
return conv
|
||||
|
||||
def conv3x3(self, in_planes, out_planes, stride=1):
|
||||
"""3x3 convolution with padding"""
|
||||
c = self.conv(
|
||||
3, in_planes, out_planes, groups=self.conv3x3_cardinality, stride=stride
|
||||
)
|
||||
return c
|
||||
|
||||
def conv1x1(self, in_planes, out_planes, stride=1):
|
||||
"""1x1 convolution with padding"""
|
||||
c = self.conv(1, in_planes, out_planes, stride=stride)
|
||||
return c
|
||||
|
||||
def conv7x7(self, in_planes, out_planes, stride=1):
|
||||
"""7x7 convolution with padding"""
|
||||
c = self.conv(7, in_planes, out_planes, stride=stride)
|
||||
return c
|
||||
|
||||
def conv5x5(self, in_planes, out_planes, stride=1):
|
||||
"""5x5 convolution with padding"""
|
||||
c = self.conv(5, in_planes, out_planes, stride=stride)
|
||||
return c
|
||||
|
||||
def batchnorm(self, planes, last_bn=False):
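# When last_bn is set and the config enables last_bn_0_init, gamma starts at zero so each residual block begins as an identity mapping.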
|
||||
bn = nn.BatchNorm2d(planes)
|
||||
gamma_init_val = 0 if last_bn and self.config["last_bn_0_init"] else 1
|
||||
nn.init.constant_(bn.weight, gamma_init_val)
|
||||
nn.init.constant_(bn.bias, 0)
|
||||
|
||||
return bn
|
||||
|
||||
def activation(self):
|
||||
return self.config["activation"]()
|
||||
|
||||
|
||||
# ResNetBuilder }}}
|
||||
|
||||
# BasicBlock {{{
|
||||
class BasicBlock(nn.Module):
|
||||
def __init__(self, builder, inplanes, planes, expansion, stride=1, downsample=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv1 = builder.conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = builder.batchnorm(planes)
|
||||
self.relu = builder.activation()
|
||||
self.conv2 = builder.conv3x3(planes, planes * expansion)
|
||||
self.bn2 = builder.batchnorm(planes * expansion, last_bn=True)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
if self.bn1 is not None:
|
||||
out = self.bn1(out)
|
||||
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
|
||||
if self.bn2 is not None:
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
# BasicBlock }}}
|
||||
|
||||
# SqueezeAndExcitation {{{
|
||||
class SqueezeAndExcitation(nn.Module):
|
||||
def __init__(self, planes, squeeze):
|
||||
super(SqueezeAndExcitation, self).__init__()
|
||||
self.squeeze = nn.Linear(planes, squeeze)
|
||||
self.expand = nn.Linear(squeeze, planes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.sigmoid = nn.Sigmoid()
|
||||
|
||||
def forward(self, x):
|
||||
out = torch.mean(x.view(x.size(0), x.size(1), -1), 2)
|
||||
out = self.squeeze(out)
|
||||
out = self.relu(out)
|
||||
out = self.expand(out)
|
||||
out = self.sigmoid(out)
|
||||
out = out.unsqueeze(2).unsqueeze(3)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
# }}}
|
||||
|
||||
# Bottleneck {{{
|
||||
class Bottleneck(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
builder,
|
||||
inplanes,
|
||||
planes,
|
||||
expansion,
|
||||
stride=1,
|
||||
se=False,
|
||||
se_squeeze=16,
|
||||
downsample=None,
|
||||
):
|
||||
super(Bottleneck, self).__init__()
|
||||
self.conv1 = builder.conv1x1(inplanes, planes)
|
||||
self.bn1 = builder.batchnorm(planes)
|
||||
self.conv2 = builder.conv3x3(planes, planes, stride=stride)
|
||||
self.bn2 = builder.batchnorm(planes)
|
||||
self.conv3 = builder.conv1x1(planes, planes * expansion)
|
||||
self.bn3 = builder.batchnorm(planes * expansion, last_bn=True)
|
||||
self.relu = builder.activation()
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
self.squeeze = (
|
||||
SqueezeAndExcitation(planes * expansion, se_squeeze) if se else None
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
if self.squeeze is None:
|
||||
out += residual
|
||||
else:
|
||||
out = torch.addcmul(residual, out, self.squeeze(out), value=1.0)
|
||||
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def SEBottleneck(builder, inplanes, planes, expansion, stride=1, downsample=None):
|
||||
return Bottleneck(
|
||||
builder,
|
||||
inplanes,
|
||||
planes,
|
||||
expansion,
|
||||
stride=stride,
|
||||
se=True,
|
||||
se_squeeze=16,
|
||||
downsample=downsample,
|
||||
)
|
||||
|
||||
|
||||
# Bottleneck }}}
|
||||
|
||||
# ResNet {{{
|
||||
class ResNet(nn.Module):
|
||||
def __init__(self, builder, block, expansion, layers, widths, num_classes=1000):
|
||||
self.inplanes = 64
|
||||
super(ResNet, self).__init__()
|
||||
self.conv1 = builder.conv7x7(3, 64, stride=2)
|
||||
self.bn1 = builder.batchnorm(64)
|
||||
self.relu = builder.activation()
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(builder, block, expansion, widths[0], layers[0])
|
||||
self.layer2 = self._make_layer(
|
||||
builder, block, expansion, widths[1], layers[1], stride=2
|
||||
)
|
||||
self.layer3 = self._make_layer(
|
||||
builder, block, expansion, widths[2], layers[2], stride=2
|
||||
)
|
||||
self.layer4 = self._make_layer(
|
||||
builder, block, expansion, widths[3], layers[3], stride=2
|
||||
)
|
||||
self.avgpool = nn.AdaptiveAvgPool2d(1)
|
||||
self.fc = nn.Linear(widths[3] * expansion, num_classes)
|
||||
|
||||
def _make_layer(self, builder, block, expansion, planes, blocks, stride=1):
|
||||
downsample = None
|
||||
if stride != 1 or self.inplanes != planes * expansion:
|
||||
dconv = builder.conv1x1(self.inplanes, planes * expansion, stride=stride)
|
||||
dbn = builder.batchnorm(planes * expansion)
|
||||
if dbn is not None:
|
||||
downsample = nn.Sequential(dconv, dbn)
|
||||
else:
|
||||
downsample = dconv
|
||||
|
||||
layers = []
|
||||
layers.append(
|
||||
block(
|
||||
builder,
|
||||
self.inplanes,
|
||||
planes,
|
||||
expansion,
|
||||
stride=stride,
|
||||
downsample=downsample,
|
||||
)
|
||||
)
|
||||
self.inplanes = planes * expansion
|
||||
for i in range(1, blocks):
|
||||
layers.append(block(builder, self.inplanes, planes, expansion))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
if self.bn1 is not None:
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
x = self.layer2(x)
|
||||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
|
||||
x = self.avgpool(x)
|
||||
x = x.view(x.size(0), -1)
|
||||
x = self.fc(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
# ResNet }}}
|
||||
|
||||
resnet_configs = {
|
||||
"classic": {
|
||||
"conv": nn.Conv2d,
|
||||
"conv_init": "fan_out",
|
||||
"nonlinearity": "relu",
|
||||
"last_bn_0_init": False,
|
||||
"activation": lambda: nn.ReLU(inplace=True),
|
||||
},
|
||||
"fanin": {
|
||||
"conv": nn.Conv2d,
|
||||
"conv_init": "fan_in",
|
||||
"nonlinearity": "relu",
|
||||
"last_bn_0_init": False,
|
||||
"activation": lambda: nn.ReLU(inplace=True),
|
||||
},
|
||||
"grp-fanin": {
|
||||
"conv": nn.Conv2d,
|
||||
"conv_init": "fan_in",
|
||||
"nonlinearity": "relu",
|
||||
"last_bn_0_init": False,
|
||||
"activation": lambda: nn.ReLU(inplace=True),
|
||||
},
|
||||
"grp-fanout": {
|
||||
"conv": nn.Conv2d,
|
||||
"conv_init": "fan_out",
|
||||
"nonlinearity": "relu",
|
||||
"last_bn_0_init": False,
|
||||
"activation": lambda: nn.ReLU(inplace=True),
|
||||
},
|
||||
}
|
||||
|
||||
resnet_versions = {
|
||||
"resnet18": {
|
||||
"net": ResNet,
|
||||
"block": BasicBlock,
|
||||
"layers": [2, 2, 2, 2],
|
||||
"widths": [64, 128, 256, 512],
|
||||
"expansion": 1,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"resnet34": {
|
||||
"net": ResNet,
|
||||
"block": BasicBlock,
|
||||
"layers": [3, 4, 6, 3],
|
||||
"widths": [64, 128, 256, 512],
|
||||
"expansion": 1,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"resnet50": {
|
||||
"net": ResNet,
|
||||
"block": Bottleneck,
|
||||
"layers": [3, 4, 6, 3],
|
||||
"widths": [64, 128, 256, 512],
|
||||
"expansion": 4,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"resnet101": {
|
||||
"net": ResNet,
|
||||
"block": Bottleneck,
|
||||
"layers": [3, 4, 23, 3],
|
||||
"widths": [64, 128, 256, 512],
|
||||
"expansion": 4,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"resnet152": {
|
||||
"net": ResNet,
|
||||
"block": Bottleneck,
|
||||
"layers": [3, 8, 36, 3],
|
||||
"widths": [64, 128, 256, 512],
|
||||
"expansion": 4,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"resnext101-32x4d": {
|
||||
"net": ResNet,
|
||||
"block": Bottleneck,
|
||||
"cardinality": 32,
|
||||
"layers": [3, 4, 23, 3],
|
||||
"widths": [128, 256, 512, 1024],
|
||||
"expansion": 2,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
"se-resnext101-32x4d": {
|
||||
"net": ResNet,
|
||||
"block": SEBottleneck,
|
||||
"cardinality": 32,
|
||||
"layers": [3, 4, 23, 3],
|
||||
"widths": [128, 256, 512, 1024],
|
||||
"expansion": 2,
|
||||
"num_classes": 1000,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def build_resnet(version, config, verbose=True):
|
||||
version = resnet_versions[version]
|
||||
config = resnet_configs[config]
|
||||
|
||||
builder = ResNetBuilder(version, config)
|
||||
if verbose:
|
||||
print("Version: {}".format(version))
|
||||
print("Config: {}".format(config))
|
||||
model = version["net"](
|
||||
builder,
|
||||
version["block"],
|
||||
version["expansion"],
|
||||
version["layers"],
|
||||
version["widths"],
|
||||
version["num_classes"],
|
||||
)
|
||||
|
||||
return model
|
|
@ -0,0 +1,40 @@
|
|||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class LabelSmoothing(nn.Module):
|
||||
"""
|
||||
NLL loss with label smoothing.
|
||||
"""
|
||||
|
||||
def __init__(self, smoothing=0.0):
|
||||
"""
|
||||
Constructor for the LabelSmoothing module.
|
||||
|
||||
:param smoothing: label smoothing factor
|
||||
"""
|
||||
super(LabelSmoothing, self).__init__()
|
||||
self.confidence = 1.0 - smoothing
|
||||
self.smoothing = smoothing
|
||||
|
||||
def forward(self, x, target):
|
||||
logprobs = torch.nn.functional.log_softmax(x, dim=-1)
|
||||
|
||||
nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
|
||||
nll_loss = nll_loss.squeeze(1)
|
||||
smooth_loss = -logprobs.mean(dim=-1)
|
||||
loss = self.confidence * nll_loss + self.smoothing * smooth_loss
|
||||
return loss.mean()
|
|
@ -0,0 +1,745 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
import os
|
||||
import time
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.autograd import Variable
|
||||
from . import logger as log
|
||||
from . import resnet as models
|
||||
from . import utils
|
||||
import dllogger
|
||||
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
from apex.fp16_utils import *
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install apex from https://www.github.com/nvidia/apex to run this example."
|
||||
)
|
||||
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from datetime import datetime
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
|
||||
DIRECTORY_TO_WATCH = "/usr/share"
|
||||
checkpoint_file_name = "checkpoint_backup.pth.tar"
|
||||
|
||||
from multiprocessing import Value
|
||||
from ctypes import c_bool
|
||||
|
||||
from azureml.core.run import Run
|
||||
|
||||
run = Run.get_context()
|
||||
|
||||
|
||||
class PreemptHandler(FileSystemEventHandler):
|
||||
def __init__(self):
|
||||
super(PreemptHandler, self).__init__()
|
||||
self.is_preempted = Value(c_bool, False)
|
||||
|
||||
def on_any_event(self, event):
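# A file named "to-be-preempted" appearing under the watched directory signals imminent preemption; set the shared flag.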
|
||||
if not event.is_directory and event.src_path.endswith("/to-be-preempted"):
|
||||
print(datetime.utcnow(), "Detected preempt signal, should stop and return.")
|
||||
self.is_preempted.value = True
|
||||
|
||||
|
||||
class PreemptDetector:
|
||||
def __init__(self):
|
||||
self.observer = Observer()
|
||||
self.event_handler = PreemptHandler()
|
||||
|
||||
def run(self):
|
||||
self.observer.schedule(self.event_handler, DIRECTORY_TO_WATCH, recursive=False)
|
||||
self.observer.start()
|
||||
|
||||
def is_preempted(self):
|
||||
return self.event_handler.is_preempted.value
|
||||
|
||||
def stop(self):
|
||||
self.observer.stop()
|
||||
|
||||
|
||||
ACC_METADATA = {"unit": "%", "format": ":.2f"}
|
||||
IPS_METADATA = {"unit": "img/s", "format": ":.2f"}
|
||||
TIME_METADATA = {"unit": "s", "format": ":.5f"}
|
||||
LOSS_METADATA = {"format": ":.5f"}
|
||||
|
||||
|
||||
class ModelAndLoss(nn.Module):
|
||||
def __init__(self, arch, loss, pretrained_weights=None, cuda=True, fp16=False):
|
||||
super(ModelAndLoss, self).__init__()
|
||||
self.arch = arch
|
||||
|
||||
print("=> creating model '{}'".format(arch))
|
||||
model = models.build_resnet(arch[0], arch[1])
|
||||
if pretrained_weights is not None:
|
||||
print("=> using pre-trained model from a file '{}'".format(arch))
|
||||
model.load_state_dict(pretrained_weights)
|
||||
|
||||
if cuda:
|
||||
model = model.cuda()
|
||||
if fp16:
|
||||
model = network_to_half(model)
|
||||
|
||||
# define loss function (criterion) and optimizer
|
||||
criterion = loss()
|
||||
|
||||
if cuda:
|
||||
criterion = criterion.cuda()
|
||||
|
||||
self.model = model
|
||||
self.loss = criterion
|
||||
|
||||
def forward(self, data, target):
|
||||
output = self.model(data)
|
||||
loss = self.loss(output, target)
|
||||
|
||||
return loss, output
|
||||
|
||||
def distributed(self):
|
||||
self.model = DDP(self.model)
|
||||
|
||||
def load_model_state(self, state):
|
||||
if state is not None:
|
||||
self.model.load_state_dict(state)
|
||||
|
||||
|
||||
def get_optimizer(
|
||||
parameters,
|
||||
fp16,
|
||||
lr,
|
||||
momentum,
|
||||
weight_decay,
|
||||
nesterov=False,
|
||||
state=None,
|
||||
static_loss_scale=1.0,
|
||||
dynamic_loss_scale=False,
|
||||
bn_weight_decay=False,
|
||||
):
|
||||
|
||||
if bn_weight_decay:
|
||||
print(" ! Weight decay applied to BN parameters ")
|
||||
optimizer = torch.optim.SGD(
|
||||
[v for n, v in parameters],
|
||||
lr,
|
||||
momentum=momentum,
|
||||
weight_decay=weight_decay,
|
||||
nesterov=nesterov,
|
||||
)
|
||||
else:
|
||||
print(" ! Weight decay NOT applied to BN parameters ")
|
||||
bn_params = [v for n, v in parameters if "bn" in n]
|
||||
rest_params = [v for n, v in parameters if "bn" not in n]
|
||||
print(len(bn_params))
|
||||
print(len(rest_params))
|
||||
optimizer = torch.optim.SGD(
|
||||
[
|
||||
{"params": bn_params, "weight_decay": 0},
|
||||
{"params": rest_params, "weight_decay": weight_decay},
|
||||
],
|
||||
lr,
|
||||
momentum=momentum,
|
||||
weight_decay=weight_decay,
|
||||
nesterov=nesterov,
|
||||
)
|
||||
if fp16:
|
||||
optimizer = FP16_Optimizer(
|
||||
optimizer,
|
||||
static_loss_scale=static_loss_scale,
|
||||
dynamic_loss_scale=dynamic_loss_scale,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
if state is not None:
|
||||
optimizer.load_state_dict(state)
|
||||
|
||||
return optimizer
|
||||
|
||||
|
||||
def lr_policy(lr_fn, logger=None):
|
||||
if logger is not None:
|
||||
logger.register_metric(
|
||||
"lr", log.LR_METER(), verbosity=dllogger.Verbosity.VERBOSE
|
||||
)
|
||||
|
||||
def _alr(optimizer, iteration, epoch):
|
||||
lr = lr_fn(iteration, epoch)
|
||||
|
||||
if logger is not None:
|
||||
logger.log_metric("lr", lr)
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group["lr"] = lr
|
||||
return lr
|
||||
|
||||
return _alr
|
||||
|
||||
|
||||
def lr_step_policy(base_lr, steps, decay_factor, warmup_length, logger=None):
|
||||
def _lr_fn(iteration, epoch):
|
||||
if epoch < warmup_length:
|
||||
lr = base_lr * (epoch + 1) / warmup_length
|
||||
else:
|
||||
lr = base_lr
|
||||
for s in steps:
|
||||
if epoch >= s:
|
||||
lr *= decay_factor
|
||||
return lr
|
||||
|
||||
return lr_policy(_lr_fn, logger=logger)
|
||||
|
||||
|
||||
def lr_linear_policy(base_lr, warmup_length, epochs, logger=None):
|
||||
def _lr_fn(iteration, epoch):
|
||||
if epoch < warmup_length:
|
||||
lr = base_lr * (epoch + 1) / warmup_length
|
||||
else:
|
||||
e = epoch - warmup_length
|
||||
es = epochs - warmup_length
|
||||
lr = base_lr * (1 - (e / es))
|
||||
return lr
|
||||
|
||||
return lr_policy(_lr_fn, logger=logger)
|
||||
|
||||
|
||||
def lr_cosine_policy(base_lr, warmup_length, epochs, logger=None):
|
||||
def _lr_fn(iteration, epoch):
|
||||
if epoch < warmup_length:
|
||||
lr = base_lr * (epoch + 1) / warmup_length
|
||||
else:
|
||||
e = epoch - warmup_length
|
||||
es = epochs - warmup_length
|
||||
lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr
|
||||
return lr
|
||||
|
||||
return lr_policy(_lr_fn, logger=logger)
|
||||
|
||||
|
||||
def lr_exponential_policy(
|
||||
base_lr, warmup_length, epochs, final_multiplier=0.001, logger=None
|
||||
):
|
||||
es = epochs - warmup_length
|
||||
epoch_decay = np.power(2, np.log2(final_multiplier) / es)
|
||||
|
||||
def _lr_fn(iteration, epoch):
|
||||
if epoch < warmup_length:
|
||||
lr = base_lr * (epoch + 1) / warmup_length
|
||||
else:
|
||||
e = epoch - warmup_length
|
||||
lr = base_lr * (epoch_decay**e)
|
||||
return lr
|
||||
|
||||
return lr_policy(_lr_fn, logger=logger)
|
||||
|
||||
|
||||
def get_train_step(
|
||||
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
|
||||
):
|
||||
def _step(input, target, optimizer_step=True):
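# One optimization step: forward pass, all-reduce the loss for logging, FP16/AMP-aware backward, then (optionally) scale accumulated gradients and step the optimizer.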
|
||||
input_var = Variable(input)
|
||||
target_var = Variable(target)
|
||||
loss, output = model_and_loss(input_var, target_var)
|
||||
if torch.distributed.is_initialized():
|
||||
reduced_loss = utils.reduce_tensor(loss.data)
|
||||
else:
|
||||
reduced_loss = loss.data
|
||||
|
||||
if fp16:
|
||||
optimizer.backward(loss)
|
||||
elif use_amp:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
if optimizer_step:
|
||||
opt = (
|
||||
optimizer.optimizer
|
||||
if isinstance(optimizer, FP16_Optimizer)
|
||||
else optimizer
|
||||
)
|
||||
for param_group in opt.param_groups:
|
||||
for param in param_group["params"]:
|
||||
param.grad /= batch_size_multiplier
|
||||
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
return reduced_loss
|
||||
|
||||
return _step
|
||||
|
||||
|
||||
def train(
|
||||
train_loader,
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
fp16,
|
||||
logger,
|
||||
epoch,
|
||||
detector,
|
||||
use_amp=False,
|
||||
prof=-1,
|
||||
batch_size_multiplier=1,
|
||||
register_metrics=True,
|
||||
total_train_step=0,
|
||||
writer=None,
|
||||
):
|
||||
print(f"training...")
|
||||
print(f"register_metrics {register_metrics}, logger {logger}.")
|
||||
if register_metrics and logger is not None:
|
||||
logger.register_metric(
|
||||
"train.loss",
|
||||
log.LOSS_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=LOSS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"train.compute_ips",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=IPS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"train.total_ips",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=IPS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"train.data_time",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"train.compute_time",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
|
||||
step = get_train_step(
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
fp16,
|
||||
use_amp=use_amp,
|
||||
batch_size_multiplier=batch_size_multiplier,
|
||||
)
|
||||
|
||||
model_and_loss.train()
|
||||
end = time.time()
|
||||
|
||||
optimizer.zero_grad()
|
||||
last_train_step = total_train_step
|
||||
data_iter = enumerate(train_loader)
|
||||
if logger is not None:
|
||||
data_iter = logger.iteration_generator_wrapper(data_iter)
|
||||
if prof > 0:
|
||||
data_iter = utils.first_n(prof, data_iter)
|
||||
|
||||
for i, (input, target) in data_iter:
|
||||
bs = input.size(0)
|
||||
lr = lr_scheduler(optimizer, i, epoch)
|
||||
data_time = time.time() - end
|
||||
|
||||
optimizer_step = ((i + 1) % batch_size_multiplier) == 0
|
||||
loss = step(input, target, optimizer_step=optimizer_step)
|
||||
|
||||
it_time = time.time() - end
|
||||
|
||||
if optimizer_step:
|
||||
if writer:
|
||||
writer.add_scalar("train/summary/scalar/learning_rate", lr, epoch)
|
||||
writer.add_scalar(
|
||||
"train/summary/scalar/loss", to_python_float(loss), total_train_step
|
||||
)
|
||||
writer.add_scalar(
|
||||
"perf/summary/scalar/compute_ips",
|
||||
calc_ips(bs, it_time - data_time),
|
||||
total_train_step,
|
||||
)
|
||||
writer.add_scalar(
|
||||
"perf/summary/scalar/train_total_ips",
|
||||
calc_ips(bs, it_time),
|
||||
total_train_step,
|
||||
)
|
||||
run.log_row("train/learning_rate", x=epoch, y=lr)
|
||||
run.log_row("train/loss", x=total_train_step, y=to_python_float(loss))
|
||||
run.log_row(
|
||||
"perf/compute_ips",
|
||||
x=total_train_step,
|
||||
y=calc_ips(bs, it_time - data_time),
|
||||
)
|
||||
run.log_row(
|
||||
"perf/train_total_ips", x=total_train_step, y=calc_ips(bs, it_time)
|
||||
)
|
||||
|
||||
total_train_step += 1
|
||||
if logger is not None:
|
||||
logger.log_metric("train.loss", to_python_float(loss), bs)
|
||||
logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
|
||||
logger.log_metric("train.total_ips", calc_ips(bs, it_time))
|
||||
logger.log_metric("train.data_time", data_time)
|
||||
logger.log_metric("train.compute_time", it_time - data_time)
|
||||
|
||||
end = time.time()
|
||||
|
||||
if writer:
|
||||
writer.flush()
|
||||
|
||||
if detector.is_preempted():
|
||||
print(
|
||||
datetime.utcnow(),
|
||||
"Exit training loop detecting is_preempted changed to True",
|
||||
)
|
||||
return last_train_step
|
||||
|
||||
return total_train_step
|
||||
|
||||
|
||||
def get_val_step(model_and_loss):
|
||||
def _step(input, target):
|
||||
input_var = Variable(input)
|
||||
target_var = Variable(target)
|
||||
|
||||
with torch.no_grad():
|
||||
loss, output = model_and_loss(input_var, target_var)
|
||||
|
||||
prec1, prec5 = utils.accuracy(output.data, target, topk=(1, 5))
|
||||
|
||||
if torch.distributed.is_initialized():
|
||||
reduced_loss = utils.reduce_tensor(loss.data)
|
||||
prec1 = utils.reduce_tensor(prec1)
|
||||
prec5 = utils.reduce_tensor(prec5)
|
||||
else:
|
||||
reduced_loss = loss.data
|
||||
|
||||
torch.cuda.synchronize()
|
||||
|
||||
return reduced_loss, prec1, prec5
|
||||
|
||||
return _step
|
||||
|
||||
|
||||
def validate(
|
||||
val_loader,
|
||||
model_and_loss,
|
||||
fp16,
|
||||
logger,
|
||||
epoch,
|
||||
detector,
|
||||
prof=-1,
|
||||
register_metrics=True,
|
||||
):
|
||||
print(f"validating...")
|
||||
print(f"register_metrics {register_metrics}, logger {logger}.")
|
||||
if register_metrics and logger is not None:
|
||||
logger.register_metric(
|
||||
"val.top1",
|
||||
log.ACC_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=ACC_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.top5",
|
||||
log.ACC_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=ACC_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.loss",
|
||||
log.LOSS_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=LOSS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.compute_ips",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=IPS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.total_ips",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.DEFAULT,
|
||||
metadata=IPS_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.data_time",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.compute_latency",
|
||||
log.PERF_METER(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.compute_latency_at100",
|
||||
log.LAT_100(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.compute_latency_at99",
|
||||
log.LAT_99(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
logger.register_metric(
|
||||
"val.compute_latency_at95",
|
||||
log.LAT_95(),
|
||||
verbosity=dllogger.Verbosity.VERBOSE,
|
||||
metadata=TIME_METADATA,
|
||||
)
|
||||
|
||||
step = get_val_step(model_and_loss)
|
||||
|
||||
top1 = log.AverageMeter()
|
||||
# switch to evaluate mode
|
||||
model_and_loss.eval()
|
||||
|
||||
end = time.time()
|
||||
|
||||
data_iter = enumerate(val_loader)
|
||||
if logger is not None:
|
||||
data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
|
||||
if prof > 0:
|
||||
data_iter = utils.first_n(prof, data_iter)
|
||||
|
||||
loss_sum = 0.0
|
||||
total_val_step = 0
|
||||
for i, (input, target) in data_iter:
|
||||
bs = input.size(0)
|
||||
data_time = time.time() - end
|
||||
|
||||
loss, prec1, prec5 = step(input, target)
|
||||
|
||||
it_time = time.time() - end
|
||||
|
||||
top1.record(to_python_float(prec1), bs)
|
||||
if logger is not None:
|
||||
logger.log_metric("val.top1", to_python_float(prec1), bs)
|
||||
logger.log_metric("val.top5", to_python_float(prec5), bs)
|
||||
logger.log_metric("val.loss", to_python_float(loss), bs)
|
||||
logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
|
||||
logger.log_metric("val.total_ips", calc_ips(bs, it_time))
|
||||
logger.log_metric("val.data_time", data_time)
|
||||
logger.log_metric("val.compute_latency", it_time - data_time)
|
||||
logger.log_metric("val.compute_latency_at95", it_time - data_time)
|
||||
logger.log_metric("val.compute_latency_at99", it_time - data_time)
|
||||
logger.log_metric("val.compute_latency_at100", it_time - data_time)
|
||||
|
||||
loss_sum += to_python_float(loss)
|
||||
total_val_step += 1
|
||||
|
||||
end = time.time()
|
||||
if detector.is_preempted():
|
||||
print(
|
||||
datetime.utcnow(),
|
||||
"Exit validation loop detecting is_preempted changed to True",
|
||||
)
|
||||
break
|
||||
|
||||
return [top1, loss_sum / total_val_step]
|
||||
|
||||
|
||||
# Train loop {{{
|
||||
def calc_ips(batch_size, time):
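# Throughput in images per second, aggregated across all distributed workers.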
|
||||
world_size = (
|
||||
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
|
||||
)
|
||||
tbs = world_size * batch_size
|
||||
return tbs / time
|
||||
|
||||
|
||||
def train_loop(
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
train_loader,
|
||||
val_loader,
|
||||
epochs,
|
||||
fp16,
|
||||
logger,
|
||||
should_backup_checkpoint,
|
||||
save_checkpoint_epochs,
|
||||
use_amp=False,
|
||||
batch_size_multiplier=1,
|
||||
best_prec1=0,
|
||||
start_epoch=0,
|
||||
prof=-1,
|
||||
skip_training=False,
|
||||
skip_validation=False,
|
||||
save_checkpoints=True,
|
||||
checkpoint_dir="./",
|
||||
total_train_step=0,
|
||||
):
|
||||
is_first_rank = (
|
||||
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
|
||||
)
|
||||
if is_first_rank:
|
||||
ts = str(time.time())
|
||||
# logdir = os.path.expanduser('~/tensorboard/{}/logs/'.format(os.environ['DLTS_JOB_ID']) + ts)
|
||||
logdir = os.path.expanduser(
|
||||
"~/tensorboard/{}/logs/".format(os.environ["AZ_BATCH_JOB_ID"]) + ts
|
||||
)
|
||||
print("tensorboard at ", logdir)
|
||||
if not os.path.exists(logdir):
|
||||
os.makedirs(logdir)
|
||||
writer = SummaryWriter(log_dir=logdir)
|
||||
else:
|
||||
writer = None
|
||||
|
||||
prec1 = -1
|
||||
detector = PreemptDetector()
|
||||
detector.run()
|
||||
|
||||
epoch_iter = range(start_epoch, epochs)
|
||||
for epoch in epoch_iter:
|
||||
world_size = (
|
||||
torch.distributed.get_world_size()
|
||||
if torch.distributed.is_initialized()
|
||||
else 1
|
||||
)
|
||||
if writer:
|
||||
writer.add_scalar("train/summary/scalar/world_size", world_size, epoch)
|
||||
run.log_row("train/world_size", x=epoch, y=world_size)
|
||||
|
||||
if logger is not None:
|
||||
logger.start_epoch()
|
||||
if not skip_training:
|
||||
total_train_step = train(
|
||||
train_loader,
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
fp16,
|
||||
logger,
|
||||
epoch,
|
||||
detector,
|
||||
use_amp=use_amp,
|
||||
prof=prof,
|
||||
register_metrics=epoch == start_epoch,
|
||||
batch_size_multiplier=batch_size_multiplier,
|
||||
total_train_step=total_train_step,
|
||||
writer=writer,
|
||||
)
|
||||
|
||||
if not skip_validation and not detector.is_preempted():
|
||||
top1, val_loss = validate(
|
||||
val_loader,
|
||||
model_and_loss,
|
||||
fp16,
|
||||
logger,
|
||||
epoch,
|
||||
detector,
|
||||
prof=prof,
|
||||
register_metrics=epoch == start_epoch,
|
||||
)
|
||||
if not detector.is_preempted():
|
||||
prec1, nimg = top1.get_val()
|
||||
if writer:
|
||||
writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
|
||||
writer.add_scalar("val/summary/scalar/prec1", prec1, epoch)
|
||||
run.log_row("val/loss", x=epoch, y=val_loss)
|
||||
run.log_row("val/prec1", x=epoch, y=prec1)
|
||||
|
||||
if logger is not None:
|
||||
print(
|
||||
"Epoch ", epoch, " complete with is_preempted ", detector.is_preempted()
|
||||
)
|
||||
logger.end_epoch()
|
||||
|
||||
save_ckpt = is_first_rank and (
|
||||
detector.is_preempted() or (epoch + 1) % save_checkpoint_epochs == 0
|
||||
)
|
||||
|
||||
if detector.is_preempted() and start_epoch == epoch:
|
||||
print(
|
||||
"Skipping save checkpoint since no complete epoch finishes till now. ",
|
||||
start_epoch,
|
||||
"-->",
|
||||
epoch,
|
||||
)
|
||||
save_ckpt = False
|
||||
print(f"save ckpt {save_ckpt}, ckpt dir {checkpoint_dir}.")
|
||||
if save_ckpt:
|
||||
if not skip_validation and not detector.is_preempted():
|
||||
is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
|
||||
best_prec1 = max(
|
||||
logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
|
||||
)
|
||||
else:
|
||||
is_best = False
|
||||
best_prec1 = 0
|
||||
|
||||
ckpt_epoch_index = epoch + 1 if not detector.is_preempted() else epoch
|
||||
utils.save_checkpoint(
|
||||
{
|
||||
"epoch": ckpt_epoch_index,
|
||||
"arch": model_and_loss.arch,
|
||||
"state_dict": model_and_loss.model.state_dict(),
|
||||
"best_prec1": best_prec1,
|
||||
"optimizer": optimizer.state_dict(),
|
||||
"total_train_step": total_train_step,
|
||||
},
|
||||
is_best,
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
backup_filename=checkpoint_file_name,
|
||||
)
|
||||
|
||||
if detector.is_preempted():
|
||||
print(
|
||||
datetime.utcnow(),
|
||||
"Exit epoch loop detecting is_preempted changed to True, save_ckpt:",
|
||||
save_ckpt,
|
||||
)
|
||||
break
|
||||
|
||||
if writer:
|
||||
writer.close()
|
||||
detector.stop()
|
||||
print(
|
||||
datetime.utcnow(), "Training exits with is_preempted: ", detector.is_preempted()
|
||||
)
|
||||
|
||||
|
||||
# }}}
|
|
@ -0,0 +1,121 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
import os
|
||||
import numpy as np
|
||||
import torch
|
||||
import shutil
import time
|
||||
import torch.distributed as dist
|
||||
|
||||
|
||||
def should_backup_checkpoint(args):
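    # Returns a closure that keeps extra checkpoint copies early in training
    # (epoch < 10) or on every 10th epoch, but only when --gather-checkpoints is set.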
|
||||
def _sbc(epoch):
|
||||
return args.gather_checkpoints and (epoch < 10 or epoch % 10 == 0)
|
||||
|
||||
return _sbc
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def save_checkpoint(
|
||||
state,
|
||||
is_best,
|
||||
filename="checkpoint.pth.tar",
|
||||
checkpoint_dir="./",
|
||||
backup_filename=None,
|
||||
):
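    # Only rank 0 (or a non-distributed run) writes checkpoints, so concurrent
    # workers do not clobber the same files.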
|
||||
if (not torch.distributed.is_initialized()) or torch.distributed.get_rank() == 0:
|
||||
start_time = time.time()
|
||||
# filename = os.path.join('/tmp/', filename)
|
||||
filename = os.path.join(checkpoint_dir, filename)
|
||||
print(f"filename {filename}, ckpt dir {checkpoint_dir}")
|
||||
torch.save(state, filename)
|
||||
elapsed_time = time.time() - start_time
|
||||
# print("save checkpoint time on local /tmp ", elapsed_time)
|
||||
if is_best:
|
||||
start_time = time.time()
|
||||
shutil.copyfile(
|
||||
filename, os.path.join(checkpoint_dir, "model_best.pth.tar")
|
||||
)
|
||||
elapsed_time = time.time() - start_time
|
||||
print("save best checkpoint time (copy to blob) ", elapsed_time)
|
||||
if backup_filename is not None:
|
||||
start_time = time.time()
|
||||
shutil.copyfile(filename, os.path.join(checkpoint_dir, backup_filename))
|
||||
elapsed_time = time.time() - start_time
|
||||
print("save checkpoint time (copy to blob) ", elapsed_time)
|
||||
|
||||
|
||||
def timed_generator(gen):
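    # Yields (item, seconds) pairs, timing how long each item of the wrapped
    # generator took to produce.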
|
||||
start = time.time()
|
||||
for g in gen:
|
||||
end = time.time()
|
||||
t = end - start
|
||||
yield g, t
|
||||
start = time.time()
|
||||
|
||||
|
||||
def timed_function(f):
|
||||
def _timed_function(*args, **kwargs):
|
||||
start = time.time()
|
||||
ret = f(*args, **kwargs)
|
||||
return ret, time.time() - start
|
||||
|
||||
return _timed_function
|
||||
|
||||
|
||||
def accuracy(output, target, topk=(1,)):
|
||||
"""Computes the precision@k for the specified values of k"""
|
||||
maxk = max(topk)
|
||||
batch_size = target.size(0)
|
||||
|
||||
_, pred = output.topk(maxk, 1, True, True)
|
||||
pred = pred.t()
|
||||
correct = pred.eq(target.view(1, -1).expand_as(pred))
|
||||
|
||||
res = []
|
||||
for k in topk:
|
||||
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
|
||||
res.append(correct_k.mul_(100.0 / batch_size))
|
||||
return res
|
||||
|
||||
|
||||
def reduce_tensor(tensor):
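    # All-reduce (sum) across workers, then divide by the world size to get the mean.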
|
||||
rt = tensor.clone()
|
||||
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
|
||||
rt /= (
|
||||
torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
|
||||
)
|
||||
return rt
|
||||
|
||||
|
||||
def first_n(n, generator):
|
||||
for i, d in zip(range(n), generator):
|
||||
yield d
|
|
@ -0,0 +1,603 @@
|
|||
# Copyright (c) 2018-2019, NVIDIA CORPORATION
|
||||
# Copyright (c) 2017- Facebook, Inc
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
import torch.nn as nn
|
||||
import torch.nn.parallel
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.distributed as dist
|
||||
import torch.optim
|
||||
import torch.utils.data
|
||||
import torch.utils.data.distributed
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.datasets as datasets
|
||||
|
||||
try:
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
from apex.fp16_utils import *
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install apex from https://www.github.com/nvidia/apex to run this example."
|
||||
)
|
||||
|
||||
import image_classification.resnet as models
|
||||
import image_classification.logger as log
|
||||
|
||||
from image_classification.smoothing import LabelSmoothing
|
||||
from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
|
||||
from image_classification.dataloaders import *
|
||||
from image_classification.training import *
|
||||
from image_classification.utils import *
|
||||
|
||||
import dllogger
|
||||
|
||||
import torch.multiprocessing as mp
|
||||
import os
|
||||
import os.path as op
|
||||
import re
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
# cluster aware logic start
|
||||
def get_master_ip():
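    # Reads the DLTS runtime init file and extracts the IP address of worker 0 (the master node).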
|
||||
regexp = "[\s\S]*export[\s]*DLTS_SD_worker0_IP=([0-9.]+)[\s|s]*"
|
||||
with open("/dlts-runtime/env/init.env", "r") as f:
|
||||
line = f.read()
|
||||
match = re.match(regexp, line)
|
||||
if match:
|
||||
ip = str(match.group(1))
|
||||
print("master node ip is " + ip)
|
||||
return ip
|
||||
else:
|
||||
raise ValueError("did not find master node ip")
|
||||
|
||||
|
||||
# cluster aware logic end
|
||||
|
||||
checkpoint_file_name = "checkpoint_backup.pth.tar"
|
||||
|
||||
|
||||
def add_parser_arguments(parser):
|
||||
model_names = models.resnet_versions.keys()
|
||||
model_configs = models.resnet_configs.keys()
|
||||
|
||||
parser.add_argument("--data", metavar="DIR", help="path to dataset")
|
||||
parser.add_argument(
|
||||
"--data-backend",
|
||||
metavar="BACKEND",
|
||||
default="dali-cpu",
|
||||
choices=DATA_BACKEND_CHOICES,
|
||||
help="data backend: "
|
||||
+ " | ".join(DATA_BACKEND_CHOICES)
|
||||
+ " (default: dali-cpu)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--arch",
|
||||
"-a",
|
||||
metavar="ARCH",
|
||||
default="resnet50",
|
||||
choices=model_names,
|
||||
help="model architecture: " + " | ".join(model_names) + " (default: resnet50)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model-config",
|
||||
"-c",
|
||||
metavar="CONF",
|
||||
default="classic",
|
||||
choices=model_configs,
|
||||
help="model configs: " + " | ".join(model_configs) + "(default: classic)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-j",
|
||||
"--workers",
|
||||
default=5,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="number of data loading workers (default: 5)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--epochs",
|
||||
default=90,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="number of total epochs to run",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b",
|
||||
"--batch-size",
|
||||
default=256,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="mini-batch size (default: 256) per gpu",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--optimizer-batch-size",
|
||||
default=-1,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="size of a total batch size, for simulating bigger batches using gradient accumulation",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--lr",
|
||||
"--learning-rate",
|
||||
default=0.1,
|
||||
type=float,
|
||||
metavar="LR",
|
||||
help="initial learning rate",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lr-schedule",
|
||||
default="step",
|
||||
type=str,
|
||||
metavar="SCHEDULE",
|
||||
choices=["step", "linear", "cosine"],
|
||||
help="Type of LR schedule: {}, {}, {}".format("step", "linear", "cosine"),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--warmup", default=0, type=int, metavar="E", help="number of warmup epochs"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--label-smoothing",
|
||||
default=0.0,
|
||||
type=float,
|
||||
metavar="S",
|
||||
help="label smoothing",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mixup", default=0.0, type=float, metavar="ALPHA", help="mixup alpha"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--momentum", default=0.9, type=float, metavar="M", help="momentum"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--weight-decay",
|
||||
"--wd",
|
||||
default=1e-4,
|
||||
type=float,
|
||||
metavar="W",
|
||||
help="weight decay (default: 1e-4)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bn-weight-decay",
|
||||
action="store_true",
|
||||
help="use weight_decay on batch normalization learnable parameters, (default: false)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nesterov",
|
||||
action="store_true",
|
||||
help="use nesterov momentum, (default: false)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--print-freq",
|
||||
"-p",
|
||||
default=10,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="print frequency (default: 10)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--resume",
|
||||
default="",
|
||||
type=str,
|
||||
metavar="PATH",
|
||||
help="path to latest checkpoint (default: none)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pretrained-weights",
|
||||
default="",
|
||||
type=str,
|
||||
metavar="PATH",
|
||||
help="load weights from here",
|
||||
)
|
||||
|
||||
parser.add_argument("--fp16", action="store_true", help="Run model fp16 mode.")
|
||||
parser.add_argument(
|
||||
"--static-loss-scale",
|
||||
type=float,
|
||||
default=1,
|
||||
help="Static loss scale, positive power of 2 values can improve fp16 convergence.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dynamic-loss-scale",
|
||||
action="store_true",
|
||||
help="Use dynamic loss scaling. If supplied, this argument supersedes "
|
||||
+ "--static-loss-scale.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prof", type=int, default=-1, metavar="N", help="Run only N iterations"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--amp",
|
||||
action="store_true",
|
||||
help="Run model AMP (automatic mixed precision) mode.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--seed", default=None, type=int, help="random seed used for numpy and pytorch"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--gather-checkpoints",
|
||||
action="store_true",
|
||||
help="Gather checkpoints throughout the training, without this flag only best and last checkpoints will be stored",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--raport-file",
|
||||
default="experiment_raport.json",
|
||||
type=str,
|
||||
help="file in which to store JSON experiment raport",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--evaluate", action="store_true", help="evaluate checkpoint/model"
|
||||
)
|
||||
parser.add_argument("--training-only", action="store_true", help="do not evaluate")
|
||||
|
||||
parser.add_argument(
|
||||
"--no-checkpoints",
|
||||
action="store_false",
|
||||
dest="save_checkpoints",
|
||||
help="do not store any checkpoints, useful for benchmarking",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--workspace",
|
||||
type=str,
|
||||
default="./",
|
||||
metavar="DIR",
|
||||
help="path to directory where checkpoints will be stored",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--save-checkpoint-epochs",
|
||||
default=10,
|
||||
type=int,
|
||||
metavar="N",
|
||||
help="how many epochs run between saving checkpoints",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--log_redirect", action="store_true", help="Redirect log to files."
|
||||
)
|
||||
|
||||
|
||||
def main(gpu_index, args):
|
||||
if args.log_redirect:
|
||||
sys.stdout = open(
|
||||
"./outputs_"
|
||||
+ str(args.rank * args.ngpus_per_node + gpu_index)
|
||||
+ str(time.time()),
|
||||
"w",
|
||||
)
|
||||
|
||||
exp_start_time = time.time()
|
||||
global best_prec1
|
||||
best_prec1 = 0
|
||||
|
||||
args.distributed = False
|
||||
|
||||
args.gpu = 0
|
||||
|
||||
args.local_rank = gpu_index
|
||||
args.distributed = args.world_size > 1
|
||||
if args.distributed:
|
||||
args.gpu = args.local_rank % torch.cuda.device_count()
|
||||
print("using gpu ", args.gpu)
|
||||
torch.cuda.set_device(args.gpu)
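        # Global rank = node rank * GPUs per node + local GPU index; used below to
        # initialize the NCCL process group.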
|
||||
|
||||
args.rank = args.rank * args.ngpus_per_node + gpu_index
|
||||
dist.init_process_group(
|
||||
backend="nccl",
|
||||
init_method=args.dist_url,
|
||||
world_size=args.world_size,
|
||||
rank=args.rank,
|
||||
)
|
||||
|
||||
if args.amp and args.fp16:
|
||||
print("Please use only one of the --fp16/--amp flags")
|
||||
exit(1)
|
||||
|
||||
if args.seed is not None:
|
||||
print("Using seed = {}".format(args.seed))
|
||||
torch.manual_seed(args.seed + args.local_rank)
|
||||
torch.cuda.manual_seed(args.seed + args.local_rank)
|
||||
np.random.seed(seed=args.seed + args.local_rank)
|
||||
random.seed(args.seed + args.local_rank)
|
||||
|
||||
def _worker_init_fn(id):
|
||||
np.random.seed(seed=args.seed + args.local_rank + id)
|
||||
random.seed(args.seed + args.local_rank + id)
|
||||
|
||||
else:
|
||||
|
||||
def _worker_init_fn(id):
|
||||
pass
|
||||
|
||||
if args.fp16:
|
||||
assert (
|
||||
torch.backends.cudnn.enabled
|
||||
), "fp16 mode requires cudnn backend to be enabled."
|
||||
|
||||
if args.static_loss_scale != 1.0:
|
||||
if not args.fp16:
|
||||
print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")
|
||||
|
||||
if args.optimizer_batch_size < 0:
|
||||
batch_size_multiplier = 1
|
||||
else:
|
||||
tbs = args.world_size * args.batch_size
|
||||
if args.optimizer_batch_size % tbs != 0:
|
||||
print(
|
||||
"Warning: simulated batch size {} is not divisible by actual batch size {}".format(
|
||||
args.optimizer_batch_size, tbs
|
||||
)
|
||||
)
|
||||
batch_size_multiplier = int(round(args.optimizer_batch_size / tbs))
|
||||
print("BSM: {}".format(batch_size_multiplier))
|
||||
print("Real effective batch size is: ", batch_size_multiplier * tbs)
|
||||
|
||||
pretrained_weights = None
|
||||
if args.pretrained_weights:
|
||||
if os.path.isfile(args.pretrained_weights):
|
||||
print(
|
||||
"=> loading pretrained weights from '{}'".format(
|
||||
args.pretrained_weights
|
||||
)
|
||||
)
|
||||
pretrained_weights = torch.load(args.pretrained_weights)
|
||||
else:
|
||||
print("=> no pretrained weights found at '{}'".format(args.resume))
|
||||
|
||||
start_epoch = 0
|
||||
args.total_train_step = 0
|
||||
# check previous saved checkpoint first
|
||||
# if there is none, then resume from user specified checkpoint if there is
|
||||
target_ckpt_path = args.workspace + "/" + checkpoint_file_name
|
||||
ckpt_path = target_ckpt_path
|
||||
if not os.path.isfile(ckpt_path):
|
||||
print("=> no checkpoint found at '{}'".format(ckpt_path))
|
||||
ckpt_path = args.resume
|
||||
|
||||
# optionally resume from a checkpoint
|
||||
if ckpt_path:
|
||||
if os.path.isfile(ckpt_path):
|
||||
print("=> loading checkpoint '{}'".format(ckpt_path))
|
||||
checkpoint = torch.load(
|
||||
ckpt_path, map_location=lambda storage, loc: storage.cuda(args.gpu)
|
||||
)
|
||||
start_epoch = checkpoint["epoch"]
|
||||
best_prec1 = checkpoint["best_prec1"]
|
||||
model_state = checkpoint["state_dict"]
|
||||
optimizer_state = checkpoint["optimizer"]
|
||||
args.total_train_step = checkpoint["total_train_step"]
|
||||
print(
|
||||
"=> loaded checkpoint '{}' (epoch {})".format(
|
||||
ckpt_path, checkpoint["epoch"]
|
||||
)
|
||||
)
|
||||
else:
|
||||
print("=> no checkpoint found at '{}'".format(ckpt_path))
|
||||
model_state = None
|
||||
optimizer_state = None
|
||||
else:
|
||||
model_state = None
|
||||
optimizer_state = None
|
||||
|
||||
loss = nn.CrossEntropyLoss
|
||||
if args.mixup > 0.0:
|
||||
loss = lambda: NLLMultiLabelSmooth(args.label_smoothing)
|
||||
elif args.label_smoothing > 0.0:
|
||||
loss = lambda: LabelSmoothing(args.label_smoothing)
|
||||
|
||||
model_and_loss = ModelAndLoss(
|
||||
(args.arch, args.model_config),
|
||||
loss,
|
||||
pretrained_weights=pretrained_weights,
|
||||
cuda=True,
|
||||
fp16=args.fp16,
|
||||
)
|
||||
|
||||
# Create data loaders and optimizers as needed
|
||||
if args.data_backend == "pytorch":
|
||||
get_train_loader = get_pytorch_train_loader
|
||||
get_val_loader = get_pytorch_val_loader
|
||||
elif args.data_backend == "dali-gpu":
|
||||
get_train_loader = get_dali_train_loader(dali_cpu=False)
|
||||
get_val_loader = get_dali_val_loader()
|
||||
elif args.data_backend == "dali-cpu":
|
||||
get_train_loader = get_dali_train_loader(dali_cpu=True)
|
||||
get_val_loader = get_dali_val_loader()
|
||||
elif args.data_backend == "syntetic":
|
||||
get_val_loader = get_syntetic_loader
|
||||
get_train_loader = get_syntetic_loader
|
||||
|
||||
train_loader, train_loader_len = get_train_loader(
|
||||
args.data,
|
||||
args.batch_size,
|
||||
1000,
|
||||
args.mixup > 0.0,
|
||||
workers=args.workers,
|
||||
fp16=args.fp16,
|
||||
)
|
||||
if args.mixup != 0.0:
|
||||
train_loader = MixUpWrapper(args.mixup, 1000, train_loader)
|
||||
|
||||
val_loader, val_loader_len = get_val_loader(
|
||||
args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16
|
||||
)
|
||||
|
||||
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
|
||||
logger = log.Logger(
|
||||
args.print_freq,
|
||||
[
|
||||
dllogger.StdOutBackend(
|
||||
dllogger.Verbosity.DEFAULT, step_format=log.format_step
|
||||
),
|
||||
dllogger.JSONStreamBackend(
|
||||
dllogger.Verbosity.VERBOSE,
|
||||
os.path.join(args.workspace, args.raport_file),
|
||||
),
|
||||
],
|
||||
last_epoch=start_epoch - 1,
|
||||
)
|
||||
|
||||
else:
|
||||
logger = log.Logger(args.print_freq, [], last_epoch=start_epoch - 1)
|
||||
|
||||
logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
|
||||
|
||||
optimizer = get_optimizer(
|
||||
list(model_and_loss.model.named_parameters()),
|
||||
args.fp16,
|
||||
args.lr,
|
||||
args.momentum,
|
||||
args.weight_decay,
|
||||
nesterov=args.nesterov,
|
||||
bn_weight_decay=args.bn_weight_decay,
|
||||
state=optimizer_state,
|
||||
static_loss_scale=args.static_loss_scale,
|
||||
dynamic_loss_scale=args.dynamic_loss_scale,
|
||||
)
|
||||
|
||||
if args.lr_schedule == "step":
|
||||
lr_policy = lr_step_policy(
|
||||
args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger
|
||||
)
|
||||
elif args.lr_schedule == "cosine":
|
||||
lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger)
|
||||
elif args.lr_schedule == "linear":
|
||||
lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
|
||||
|
||||
if args.amp:
|
||||
model_and_loss, optimizer = amp.initialize(
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
opt_level="O2",
|
||||
loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
|
||||
)
|
||||
|
||||
if args.distributed:
|
||||
model_and_loss.distributed()
|
||||
|
||||
model_and_loss.load_model_state(model_state)
|
||||
|
||||
train_loop(
|
||||
model_and_loss,
|
||||
optimizer,
|
||||
lr_policy,
|
||||
train_loader,
|
||||
val_loader,
|
||||
args.epochs,
|
||||
args.fp16,
|
||||
logger,
|
||||
should_backup_checkpoint(args),
|
||||
args.save_checkpoint_epochs,
|
||||
use_amp=args.amp,
|
||||
batch_size_multiplier=batch_size_multiplier,
|
||||
start_epoch=start_epoch,
|
||||
best_prec1=best_prec1,
|
||||
prof=args.prof,
|
||||
skip_training=args.evaluate,
|
||||
skip_validation=args.training_only,
|
||||
save_checkpoints=args.save_checkpoints and not args.evaluate,
|
||||
checkpoint_dir=args.workspace,
|
||||
total_train_step=args.total_train_step,
|
||||
)
|
||||
exp_duration = time.time() - exp_start_time
|
||||
if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
|
||||
logger.end()
|
||||
print("Experiment ended")
|
||||
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# print(f'os env: {os.environ}')
|
||||
parser = argparse.ArgumentParser(description="PyTorch ImageNet Training")
|
||||
|
||||
add_parser_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
cudnn.benchmark = True
|
||||
|
||||
import socket
|
||||
|
||||
print("started training scripts on ", socket.gethostname())
|
||||
|
||||
args.world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) # node count
|
||||
args.rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) # node world rank
|
||||
print(f"world size {args.world_size}, rank {args.rank}")
|
||||
|
||||
import os
|
||||
|
||||
if not os.path.exists(args.workspace) and args.rank == 0:
|
||||
print("workspace ", args.workspace, " does not exist, creating one.")
|
||||
os.makedirs(args.workspace)
|
||||
|
||||
# override the master node ip by intention
|
||||
# args.dist_url = 'tcp://' + get_master_ip() + ':23456'
|
||||
# extract master ip from os env as a workaround
|
||||
args.dist_url = "tcp://" + os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] + ":23456"
|
||||
|
||||
ngpus_per_node = torch.cuda.device_count()
|
||||
args.distributed = args.world_size > 1
|
||||
|
||||
# Since we have ngpus_per_node processes per node, the total world_size
|
||||
# needs to be adjusted accordingly
|
||||
args.world_size = ngpus_per_node * args.world_size
|
||||
args.ngpus_per_node = ngpus_per_node
|
||||
print(f"world size {args.world_size}, ngpus per node {ngpus_per_node}.")
|
||||
|
||||
# Use torch.multiprocessing.spawn to launch distributed processes: the
|
||||
# main_worker process function
|
||||
mp.spawn(main, nprocs=ngpus_per_node, args=(args,))
|
||||
|
||||
# notify DLTS to collect the std output asap.
|
||||
log_collect_hook = "/var/log/compute/00_stdout.txt.exit"
|
||||
if os.path.isfile(log_collect_hook):
|
||||
open(log_collect_hook, "w").close()
|
|
@ -0,0 +1,214 @@
|
|||
# From PyTorch:
|
||||
#
|
||||
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
# Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
#
|
||||
# From Caffe2:
|
||||
#
|
||||
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
||||
#
|
||||
# All contributions by Facebook:
|
||||
# Copyright (c) 2016 Facebook Inc.
|
||||
#
|
||||
# All contributions by Google:
|
||||
# Copyright (c) 2015 Google Inc.
|
||||
# All rights reserved.
|
||||
#
|
||||
# All contributions by Yangqing Jia:
|
||||
# Copyright (c) 2015 Yangqing Jia
|
||||
# All rights reserved.
|
||||
#
|
||||
# All contributions from Caffe:
|
||||
# Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||
# All rights reserved.
|
||||
#
|
||||
# All other contributions:
|
||||
# Copyright(c) 2015, 2016 the respective contributors
|
||||
# All rights reserved.
|
||||
#
|
||||
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
||||
# copyright over their contributions to Caffe2. The project versioning records
|
||||
# all such contribution and copyright details. If a contributor wants to further
|
||||
# mark their specific copyright on a particular contribution, they should
|
||||
# indicate their copyright solely in the commit message of the change when it is
|
||||
# committed.
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
||||
# and IDIAP Research Institute nor the names of its contributors may be
|
||||
# used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
import sys
|
||||
import subprocess
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
from argparse import ArgumentParser, REMAINDER
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""
|
||||
Helper function parsing the command line options
|
||||
@retval ArgumentParser
|
||||
"""
|
||||
parser = ArgumentParser(
|
||||
description="PyTorch distributed training launch "
|
||||
"helper utilty that will spawn up "
|
||||
"multiple distributed processes"
|
||||
)
|
||||
|
||||
# Optional arguments for the launch helper
|
||||
parser.add_argument(
|
||||
"--nnodes",
|
||||
type=int,
|
||||
default=1,
|
||||
help="The number of nodes to use for distributed " "training",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--node_rank",
|
||||
type=int,
|
||||
default=0,
|
||||
help="The rank of the node for multi-node distributed " "training",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nproc_per_node",
|
||||
type=int,
|
||||
default=1,
|
||||
help="The number of processes to launch on each node, "
|
||||
"for GPU training, this is recommended to be set "
|
||||
"to the number of GPUs in your system so that "
|
||||
"each process can be bound to a single GPU.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--master_addr",
|
||||
default="127.0.0.1",
|
||||
type=str,
|
||||
help="Master node (rank 0)'s address, should be either "
|
||||
"the IP address or the hostname of node 0, for "
|
||||
"single node multi-proc training, the "
|
||||
"--master_addr can simply be 127.0.0.1",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--master_port",
|
||||
default=29500,
|
||||
type=int,
|
||||
help="Master node (rank 0)'s free port that needs to "
|
||||
"be used for communciation during distributed "
|
||||
"training",
|
||||
)
|
||||
|
||||
# positional
|
||||
parser.add_argument(
|
||||
"training_script",
|
||||
type=str,
|
||||
help="The full path to the single GPU training "
|
||||
"program/script to be launched in parallel, "
|
||||
"followed by all the arguments for the "
|
||||
"training script",
|
||||
)
|
||||
|
||||
# rest from the training program
|
||||
parser.add_argument("training_script_args", nargs=REMAINDER)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# world size in terms of number of processes
|
||||
dist_world_size = args.nproc_per_node * args.nnodes
|
||||
|
||||
# set PyTorch distributed related environmental variables
|
||||
current_env = os.environ.copy()
|
||||
current_env["MASTER_ADDR"] = args.master_addr
|
||||
current_env["MASTER_PORT"] = str(args.master_port)
|
||||
current_env["WORLD_SIZE"] = str(dist_world_size)
|
||||
|
||||
processes = []
|
||||
|
||||
for local_rank in range(0, args.nproc_per_node):
|
||||
# each process's rank
|
||||
dist_rank = args.nproc_per_node * args.node_rank + local_rank
|
||||
current_env["RANK"] = str(dist_rank)
|
||||
current_env["LOCAL_RANK"] = str(local_rank)
|
||||
|
||||
# spawn the processes
|
||||
cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
|
||||
|
||||
print(cmd)
|
||||
|
||||
stdout = (
|
||||
None if local_rank == 0 else open("GPU_" + str(local_rank) + ".log", "w")
|
||||
)
|
||||
|
||||
process = subprocess.Popen(cmd, env=current_env, stdout=stdout)
|
||||
processes.append(process)
|
||||
|
||||
try:
|
||||
up = True
|
||||
error = False
|
||||
while up and not error:
|
||||
up = False
|
||||
for p in processes:
|
||||
ret = p.poll()
|
||||
if ret is None:
|
||||
up = True
|
||||
elif ret != 0:
|
||||
error = True
|
||||
time.sleep(1)
|
||||
|
||||
if error:
|
||||
for p in processes:
|
||||
if p.poll() is None:
|
||||
p.terminate()
|
||||
exit(1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
for p in processes:
|
||||
p.terminate()
|
||||
raise
|
||||
except SystemExit:
|
||||
for p in processes:
|
||||
p.terminate()
|
||||
raise
|
||||
except:
|
||||
for p in processes:
|
||||
p.terminate()
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1 @@
|
|||
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
|
|
@ -0,0 +1,688 @@
|
|||
# ResNet50 v1.5 For PyTorch
|
||||
|
||||
This repository provides a script and recipe to train the ResNet50 model to
|
||||
achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
|
||||
|
||||
## Table Of Contents
|
||||
|
||||
* [Model overview](#model-overview)
|
||||
* [Model architecture](#model-architecture)
|
||||
* [Default configuration](#default-configuration)
|
||||
* [Optimizer](#optimizer)
|
||||
* [Data augmentation](#data-augmentation)
|
||||
* [DALI](#dali)
|
||||
* [Feature support matrix](#feature-support-matrix)
|
||||
* [Features](#features)
|
||||
* [Mixed precision training](#mixed-precision-training)
|
||||
* [Enabling mixed precision](#enabling-mixed-precision)
|
||||
* [Setup](#setup)
|
||||
* [Requirements](#requirements)
|
||||
* [Quick Start Guide](#quick-start-guide)
|
||||
* [Advanced](#advanced)
|
||||
* [Scripts and sample code](#scripts-and-sample-code)
|
||||
* [Parameters](#parameters)
|
||||
* [Command-line options](#command-line-options)
|
||||
* [Getting the data](#getting-the-data)
|
||||
* [Dataset guidelines](#dataset-guidelines)
|
||||
* [Multi-dataset](#multi-dataset)
|
||||
* [Training process](#training-process)
|
||||
* [Inference process](#inference-process)
|
||||
|
||||
* [Performance](#performance)
|
||||
* [Benchmarking](#benchmarking)
|
||||
* [Training performance benchmark](#training-performance-benchmark)
|
||||
* [Inference performance benchmark](#inference-performance-benchmark)
|
||||
* [Results](#results)
|
||||
* [Training accuracy results](#training-accuracy-results)
|
||||
* [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-(8x-v100-16G))
|
||||
        * [Example plots](#example-plots)
|
||||
* [Training performance results](#training-performance-results)
|
||||
* [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-(8x-v100-16G))
|
||||
* [Training time for 90 epochs](#training-time-for-90-epochs)
|
||||
* [Training time: NVIDIA DGX-1 (8x V100 16G)](#training-time-nvidia-dgx-1-(8x-v100-16G))
|
||||
* [Inference performance results](#inference-performance-results)
|
||||
* [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-(1x-v100-16G))
|
||||
* [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
|
||||
* [Release notes](#release-notes)
|
||||
* [Changelog](#changelog)
|
||||
* [Known issues](#known-issues)
|
||||
|
||||
## Model overview
|
||||
The ResNet50 v1.5 model is a modified version of the [original ResNet50 v1 model](https://arxiv.org/abs/1512.03385).
|
||||
|
||||
The difference between v1 and v1.5 is that, in the bottleneck blocks that require
|
||||
downsampling, v1 has stride = 2 in the first 1x1 convolution, whereas v1.5 has stride = 2 in the 3x3 convolution.
|
||||
|
||||
This difference makes ResNet50 v1.5 slightly more accurate (~0.5% top1) than v1, but comes with a small performance drawback (~5% imgs/sec).
|
||||
|
||||
The model is initialized as described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf)
|
||||
|
||||
### Default configuration
|
||||
|
||||
The following sections highlight the default configurations for the ResNet50 model.
|
||||
|
||||
#### Optimizer
|
||||
|
||||
This model uses the SGD optimizer with momentum and the following hyperparameters:
|
||||
|
||||
* Momentum (0.875)
|
||||
|
||||
* Learning rate (LR) = 0.256 for a batch size of 256; for other batch sizes we linearly
scale the learning rate (see the sketch after this list).
|
||||
|
||||
* Learning rate schedule - we use cosine LR schedule
|
||||
|
||||
* For bigger batch sizes (512 and up) we use linear warmup of the learning rate
|
||||
during the first couple of epochs
|
||||
according to [Training ImageNet in 1 hour](https://arxiv.org/abs/1706.02677).
|
||||
Warmup length depends on the total training length.
|
||||
|
||||
* Weight decay (WD) = 3.0517578125e-05 (1/32768).
|
||||
|
||||
* We do not apply WD on Batch Norm trainable parameters (gamma/bias)
|
||||
|
||||
* Label smoothing = 0.1
|
||||
|
||||
* We train for:
|
||||
|
||||
* 50 Epochs -> configuration that reaches 75.9% top1 accuracy
|
||||
|
||||
* 90 Epochs -> 90 epochs is a standard for ImageNet networks
|
||||
|
||||
* 250 Epochs -> best possible accuracy.
|
||||
|
||||
* For 250 epoch training we also use [MixUp regularization](https://arxiv.org/pdf/1710.09412.pdf).
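
For example, the linear scaling rule works out as below. This is a minimal sketch; the helper name `scaled_lr` is illustrative and not part of the training scripts.

```python
def scaled_lr(batch_size, base_lr=0.256, base_batch=256):
    # Linear LR scaling: the learning rate grows in proportion to the global batch size.
    return base_lr * batch_size / base_batch

print(scaled_lr(512))   # 0.512
print(scaled_lr(1024))  # 1.024
```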
|
||||
|
||||
|
||||
#### Data augmentation
|
||||
|
||||
This model uses the following data augmentation (a torchvision sketch follows the lists below):
|
||||
|
||||
* For training:
|
||||
* Normalization
|
||||
* Random resized crop to 224x224
|
||||
* Scale from 8% to 100%
|
||||
* Aspect ratio from 3/4 to 4/3
|
||||
* Random horizontal flip
|
||||
|
||||
* For inference:
|
||||
* Normalization
|
||||
* Scale to 256x256
|
||||
* Center crop to 224x224
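
A minimal sketch of the pipelines above, assuming plain torchvision; the actual loaders live in `image_classification/dataloaders.py` (and may use DALI instead), and the ImageNet mean/std constants below are the usual torchvision values rather than something taken from this repository.

```python
import torchvision.transforms as transforms

# Training: random resized crop (8%-100% scale, 3/4-4/3 aspect ratio),
# random horizontal flip, then normalization.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Inference: scale to 256, center crop to 224, then the same normalization.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
```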
|
||||
|
||||
#### Other training recipes
|
||||
|
||||
This script does not target any specific benchmark.
|
||||
There are changes that others have made which can speed up convergence and/or increase accuracy.
|
||||
|
||||
One of the more popular training recipes is provided by [fast.ai](https://github.com/fastai/imagenet-fast).
|
||||
|
||||
The fast.ai recipe introduces many changes to the training procedure, one of which is progressive resizing of the training images.
|
||||
|
||||
The first part of training uses 128px images, the middle part uses 224px images, and the last part uses 288px images.
|
||||
The final validation is performed on 288px images.
|
||||
|
||||
The training script in this repository performs validation on 224px images, as the original paper describes.
|
||||
|
||||
These two approaches can't be directly compared, since the fast.ai recipe requires validation on 288px images,
|
||||
and this recipe keeps the original assumption that validation is done on 224px images.
|
||||
|
||||
Using 288px images means that a lot more FLOPs are needed during inference to reach the same accuracy.
|
||||
|
||||
### Feature support matrix
|
||||
|
||||
The following features are supported by this model:
|
||||
|
||||
| Feature | ResNet50 |
|---------|----------|
| [DALI](https://docs.nvidia.com/deeplearning/sdk/dali-release-notes/index.html) | Yes |
| [APEX AMP](https://nvidia.github.io/apex/amp.html) | Yes |
|
||||
|
||||
#### Features
|
||||
|
||||
- NVIDIA DALI - DALI is a library that accelerates the data preparation pipeline. To accelerate your input pipeline, you only need to define your data loader
|
||||
with the DALI library. For more information about DALI, refer to the [DALI product documentation](https://docs.nvidia.com/deeplearning/sdk/index.html#data-loading).
|
||||
|
||||
- [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as [Automatic Mixed Precision (AMP)](https://nvidia.github.io/apex/amp.html), which require minimal network code changes to leverage Tensor Cores performance. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
|
||||
|
||||
### DALI
|
||||
|
||||
We use [NVIDIA DALI](https://github.com/NVIDIA/DALI),
|
||||
which speeds up data loading when the CPU becomes a bottleneck.
|
||||
DALI can use CPU or GPU, and outperforms the PyTorch native dataloader.
|
||||
|
||||
Run training with `--data-backend dali-gpu` or `--data-backend dali-cpu` to enable DALI.
|
||||
For DGX1 we recommend `--data-backend dali-cpu`, for DGX2 we recommend `--data-backend dali-gpu`.
|
||||
|
||||
### Mixed precision training
|
||||
|
||||
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
|
||||
1. Porting the model to use the FP16 data type where appropriate.
|
||||
2. Adding loss scaling to preserve small gradient values.
|
||||
|
||||
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
|
||||
|
||||
For information about:
|
||||
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
|
||||
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
|
||||
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
|
||||
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
|
||||
|
||||
#### Enabling mixed precision
|
||||
|
||||
Mixed precision is enabled in PyTorch by using Automatic Mixed Precision (AMP), a library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval,
|
||||
while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients.
|
||||
In PyTorch, loss scaling can be applied easily by using the `scale_loss()` method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
|
||||
|
||||
For an in-depth walkthrough of AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Cores performance.
|
||||
|
||||
To enable mixed precision, you can:
|
||||
- Import AMP from APEX, for example:
|
||||
|
||||
```
|
||||
from apex import amp
|
||||
```
|
||||
- Initialize an AMP handle, for example:
|
||||
|
||||
```
|
||||
amp_handle = amp.init(enabled=True, verbose=True)
|
||||
```
|
||||
- Wrap your optimizer with the AMP handle, for example:
|
||||
|
||||
```
|
||||
optimizer = amp_handle.wrap_optimizer(optimizer)
|
||||
```
|
||||
- Scale loss before backpropagation (assuming loss is stored in a variable called losses)
|
||||
- Default backpropagate for FP32:
|
||||
|
||||
```
|
||||
losses.backward()
|
||||
```
|
||||
- Scale loss and backpropagate with AMP:
|
||||
|
||||
```
|
||||
with optimizer.scale_loss(losses) as scaled_losses:
|
||||
scaled_losses.backward()
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
The following section lists the requirements that you need to meet in order to start training the ResNet50 model.
|
||||
|
||||
### Requirements
|
||||
|
||||
This repository contains a Dockerfile that extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
|
||||
|
||||
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
||||
* [PyTorch 19.10-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
|
||||
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
|
||||
|
||||
For more information about how to get started with NGC containers, see the
|
||||
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
|
||||
DGX Documentation:
|
||||
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
|
||||
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
|
||||
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
|
||||
|
||||
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
|
||||
|
||||
## Quick Start Guide
|
||||
|
||||
### 1. Clone the repository.
|
||||
```
|
||||
git clone https://github.com/NVIDIA/DeepLearningExamples
|
||||
cd DeepLearningExamples/PyTorch/Classification/
|
||||
```
|
||||
|
||||
### 2. Download and preprocess the dataset.
|
||||
|
||||
The ResNet50 script operates on ImageNet 1k, a widely popular image classification dataset from the ILSVRC challenge.
|
||||
|
||||
PyTorch can work directly on JPEGs; therefore, preprocessing/augmentation is not needed.
|
||||
|
||||
1. [Download the images](http://image-net.org/download-images).
|
||||
|
||||
2. Extract the training data:
|
||||
```bash
|
||||
mkdir train && mv ILSVRC2012_img_train.tar train/ && cd train
|
||||
tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
|
||||
find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
|
||||
cd ..
|
||||
```
|
||||
|
||||
3. Extract the validation data and move the images to subfolders:
|
||||
```bash
|
||||
mkdir val && mv ILSVRC2012_img_val.tar val/ && cd val && tar -xvf ILSVRC2012_img_val.tar
|
||||
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
|
||||
```
|
||||
|
||||
The directory in which the `train/` and `val/` directories are placed is referred to as `<path to imagenet>` in this document.
|
||||
|
||||
### 3. Build the RN50v1.5 PyTorch NGC container.
|
||||
|
||||
```
|
||||
docker build . -t nvidia_rn50
|
||||
```
|
||||
|
||||
### 4. Start an interactive session in the NGC container to run training/inference.
|
||||
```
|
||||
nvidia-docker run --rm -it -v <path to imagenet>:/data/imagenet --ipc=host nvidia_rn50
|
||||
```
|
||||
|
||||
### 5. Start training
|
||||
|
||||
To run training for a standard configuration (DGX1V/DGX2V, FP16/FP32, 50/90/250 Epochs),
|
||||
run one of the scripts in the `./resnet50v1.5/training` directory
|
||||
called `./resnet50v1.5/training/{DGX1, DGX2}_RN50_{AMP, FP16, FP32}_{50,90,250}E.sh`.
|
||||
|
||||
Ensure ImageNet is mounted in the `/data/imagenet` directory.
|
||||
|
||||
Example:
|
||||
`bash ./resnet50v1.5/training/DGX1_RN50_FP16_250E.sh <path where to store checkpoints and logs>`
|
||||
|
||||
### 6. Start inference
|
||||
|
||||
To run inference on ImageNet on a checkpointed model, run:
|
||||
|
||||
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
|
||||
|
||||
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
|
||||
|
||||
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
|
||||
|
||||
Then run the classification script:
|
||||
|
||||
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
|
||||
|
||||
|
||||
## Advanced
|
||||
|
||||
The following sections provide greater details of the dataset, running training and inference, and the training results.
|
||||
|
||||
### Scripts and sample code
|
||||
|
||||
To run a non-standard configuration, use:
|
||||
|
||||
* For 1 GPU
|
||||
* FP32
|
||||
`python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
|
||||
  * AMP
    `python ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
|
||||
|
||||
* For multiple GPUs
|
||||
* FP32
|
||||
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 <path to imagenet>`
|
||||
* AMP
|
||||
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 -c fanin --label-smoothing 0.1 --amp --static-loss-scale 256 <path to imagenet>`
|
||||
|
||||
Use `python ./main.py -h` to obtain the list of available options in the `main.py` script.
|
||||
|
||||
|
||||
### Command-line options
|
||||
|
||||
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
|
||||
|
||||
`python main.py -h`
|
||||
|
||||
|
||||
```
|
||||
usage: main.py [-h] [--data-backend BACKEND] [--arch ARCH]
|
||||
[--model-config CONF] [-j N] [--epochs N] [-b N]
|
||||
[--optimizer-batch-size N] [--lr LR] [--lr-schedule SCHEDULE]
|
||||
[--warmup E] [--label-smoothing S] [--mixup ALPHA]
|
||||
[--momentum M] [--weight-decay W] [--bn-weight-decay]
|
||||
[--nesterov] [--print-freq N] [--resume PATH]
|
||||
[--pretrained-weights PATH] [--fp16]
|
||||
[--static-loss-scale STATIC_LOSS_SCALE] [--dynamic-loss-scale]
|
||||
[--prof N] [--amp] [--local_rank LOCAL_RANK] [--seed SEED]
|
||||
[--gather-checkpoints] [--raport-file RAPORT_FILE] [--evaluate]
|
||||
[--training-only] [--no-checkpoints] [--workspace DIR]
|
||||
DIR
|
||||
|
||||
PyTorch ImageNet Training
|
||||
|
||||
positional arguments:
|
||||
DIR path to dataset
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--data-backend BACKEND
|
||||
data backend: pytorch | syntetic | dali-gpu | dali-cpu
|
||||
(default: dali-cpu)
|
||||
--arch ARCH, -a ARCH model architecture: resnet18 | resnet34 | resnet50 |
|
||||
resnet101 | resnet152 | resnet50 | se-
|
||||
resnet50 (default: resnet50)
|
||||
--model-config CONF, -c CONF
|
||||
model configs: classic | fanin | grp-fanin | grp-
|
||||
fanout(default: classic)
|
||||
-j N, --workers N number of data loading workers (default: 5)
|
||||
--epochs N number of total epochs to run
|
||||
-b N, --batch-size N mini-batch size (default: 256) per gpu
|
||||
--optimizer-batch-size N
|
||||
size of a total batch size, for simulating bigger
|
||||
batches using gradient accumulation
|
||||
--lr LR, --learning-rate LR
|
||||
initial learning rate
|
||||
--lr-schedule SCHEDULE
|
||||
Type of LR schedule: step, linear, cosine
|
||||
--warmup E number of warmup epochs
|
||||
--label-smoothing S label smoothing
|
||||
--mixup ALPHA mixup alpha
|
||||
--momentum M momentum
|
||||
--weight-decay W, --wd W
|
||||
weight decay (default: 1e-4)
|
||||
--bn-weight-decay use weight_decay on batch normalization learnable
|
||||
parameters, (default: false)
|
||||
--nesterov use nesterov momentum, (default: false)
|
||||
--print-freq N, -p N print frequency (default: 10)
|
||||
--resume PATH path to latest checkpoint (default: none)
|
||||
--pretrained-weights PATH
|
||||
load weights from here
|
||||
--fp16 Run model fp16 mode.
|
||||
--static-loss-scale STATIC_LOSS_SCALE
|
||||
Static loss scale, positive power of 2 values can
|
||||
improve fp16 convergence.
|
||||
--dynamic-loss-scale Use dynamic loss scaling. If supplied, this argument
|
||||
supersedes --static-loss-scale.
|
||||
--prof N Run only N iterations
|
||||
--amp Run model AMP (automatic mixed precision) mode.
|
||||
--local_rank LOCAL_RANK
|
||||
Local rank of python process. Set up by distributed
|
||||
launcher
|
||||
--seed SEED random seed used for numpy and pytorch
|
||||
--gather-checkpoints Gather checkpoints throughout the training, without
|
||||
this flag only best and last checkpoints will be
|
||||
stored
|
||||
--raport-file RAPORT_FILE
|
||||
file in which to store JSON experiment raport
|
||||
--evaluate evaluate checkpoint/model
|
||||
--training-only do not evaluate
|
||||
--no-checkpoints do not store any checkpoints, useful for benchmarking
|
||||
--workspace DIR path to directory where checkpoints will be stored
|
||||
```
|
||||
|
||||
|
||||
### Dataset guidelines
|
||||
|
||||
To use your own dataset, divide it into directories using the following scheme:
|
||||
|
||||
- Training images - `train/<class id>/<image>`
|
||||
- Validation images - `val/<class id>/<image>`
|
||||
|
||||
If your dataset has a number of classes different from 1000, you need to add a custom config
|
||||
in the `image_classification/resnet.py` file.
|
||||
|
||||
```python
|
||||
resnet_versions = {
|
||||
...
|
||||
'resnet50-custom' : {
|
||||
'net' : ResNet,
|
||||
'block' : Bottleneck,
|
||||
'layers' : [3, 4, 6, 3],
|
||||
'widths' : [64, 128, 256, 512],
|
||||
'expansion' : 4,
|
||||
'num_classes' : <custom number of classes>,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
After adding the config, run the training script with the `--arch resnet50-custom` flag.
|
||||
|
||||
### Training process
|
||||
|
||||
All the results of the training will be stored in the directory specified with `--workspace` argument.
|
||||
The script will store the following (a checkpoint-loading sketch follows this list):
|
||||
- most recent checkpoint - `checkpoint.pth.tar` (unless `--no-checkpoints` flag is used).
|
||||
- checkpoint with best validation accuracy - `model_best.pth.tar` (unless `--no-checkpoints` flag is used).
|
||||
- JSON log - in the file specified with `--raport-file` flag.
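
A minimal sketch for inspecting one of these checkpoints; the dictionary keys are the ones written by `utils.save_checkpoint` in this repository.

```python
import torch

# Load on CPU so no GPU is needed just to inspect the checkpoint.
ckpt = torch.load("model_best.pth.tar", map_location="cpu")

print(ckpt["epoch"], ckpt["arch"], ckpt["best_prec1"])
state_dict = ckpt["state_dict"]      # model weights, keyed by parameter name
optimizer_state = ckpt["optimizer"]  # optimizer state, used when resuming
```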
|
||||
|
||||
Metrics gathered through training:
|
||||
|
||||
- `train.loss` - training loss
|
||||
- `train.total_ips` - training speed measured in images/second
|
||||
- `train.compute_ips` - training speed measured in images/second, not counting data loading
|
||||
- `train.data_time` - time spent waiting for data
|
||||
- `train.compute_time` - time spent in forward/backward pass
|
||||
|
||||
### Inference process
|
||||
|
||||
Validation is done every epoch, and can also be run separately on a checkpointed model.
|
||||
|
||||
`python ./main.py --arch resnet50 --evaluate --epochs 1 --resume <path to checkpoint> -b <batch size> <path to imagenet>`
|
||||
|
||||
Metrics gathered during validation:
|
||||
|
||||
- `val.loss` - validation loss
|
||||
- `val.top1` - validation top1 accuracy
|
||||
- `val.top5` - validation top5 accuracy
|
||||
- `val.total_ips` - inference speed measured in images/second
|
||||
- `val.compute_ips` - inference speed measured in images/second, not counting data loading
|
||||
- `val.data_time` - time spent waiting for data
|
||||
- `val.compute_time` - time spent on inference
|
||||
|
||||
|
||||
To run inference on a JPEG image, you first have to extract the model weights from a checkpoint:
|
||||
|
||||
`python checkpoint2model.py --checkpoint-path <path to checkpoint> --weight-path <path where weights will be stored>`
|
||||
|
||||
Then run the classification script:
|
||||
|
||||
`python classify.py --arch resnet50 -c fanin --weights <path to weights from previous step> --precision AMP|FP16|FP32 --image <path to JPEG image>`
|
||||
|
||||
Example output:
|
||||
|
||||
|
||||
|
||||
## Performance
|
||||
|
||||
### Benchmarking
|
||||
|
||||
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
|
||||
|
||||
#### Training performance benchmark
|
||||
|
||||
To benchmark training, run:
|
||||
|
||||
* For 1 GPU
|
||||
* FP32
|
||||
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
|
||||
* FP16
|
||||
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --fp16 --static-loss-scale 256 <path to imagenet>`
|
||||
* AMP
|
||||
`python ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --amp --static-loss-scale 256 <path to imagenet>`
|
||||
* For multiple GPUs
|
||||
* FP32
|
||||
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --epochs 1 --prof 100 <path to imagenet>`
|
||||
* FP16
|
||||
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --fp16 --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
|
||||
* AMP
|
||||
`python ./multiproc.py --nproc_per_node 8 ./main.py --arch resnet50 --training-only -p 1 --raport-file benchmark.json --amp --static-loss-scale 256 --epochs 1 --prof 100 <path to imagenet>`
|
||||
|
||||
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
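Since the report is plain JSON, it can be inspected with standard tooling once the run finishes; the exact schema depends on the logger, so the snippet below only pretty-prints the file rather than assuming any particular fields:

```
python -m json.tool benchmark.json | head -n 40
```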
|
||||
|
||||
#### Inference performance benchmark
|
||||
|
||||
To benchmark inference, run:
|
||||
|
||||
* FP32
|
||||
|
||||
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate <path to imagenet>`
|
||||
|
||||
* FP16
|
||||
|
||||
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --fp16 <path to imagenet>`
|
||||
|
||||
* AMP
|
||||
|
||||
`python ./main.py --arch resnet50 -p 1 --raport-file benchmark.json --epochs 1 --prof 100 --evaluate --amp <path to imagenet>`
|
||||
|
||||
Each of these scripts will run 100 iterations and save results in the `benchmark.json` file.
|
||||
|
||||
|
||||
### Results
|
||||
|
||||
Our results were obtained by running the applicable training script in the pytorch-19.10 NGC container.
|
||||
|
||||
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|
||||
|
||||
#### Training accuracy results
|
||||
|
||||
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
|
||||
|
||||
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|
||||
|:-:|:-:|:-:|
|
||||
| 50 | 76.25 +/- 0.04 | 76.26 +/- 0.07 |
|
||||
| 90 | 77.23 +/- 0.04 | 77.08 +/- 0.08 |
|
||||
| 250 | 78.42 +/- 0.04 | 78.30 +/- 0.16 |
|
||||
|
||||
##### Training accuracy: NVIDIA DGX-2 (16x V100 32G)
|
||||
|
||||
| **epochs** | **Mixed Precision Top1** | **FP32 Top1** |
|
||||
|:-:|:-:|:-:|
|
||||
| 50 | 75.81 +/- 0.08 | 76.04 +/- 0.05 |
|
||||
| 90 | 77.10 +/- 0.06 | 77.23 +/- 0.04 |
|
||||
| 250 | 78.59 +/- 0.13 | 78.46 +/- 0.03 |
|
||||
|
||||
|
||||
|
||||
##### Example plots
|
||||
|
||||
The following images show a 250-epoch configuration on a DGX-1V.
|
||||
|
||||
![ValidationLoss](./img/loss_plot.png)
|
||||
|
||||
![ValidationTop1](./img/top1_plot.png)
|
||||
|
||||
![ValidationTop5](./img/top5_plot.png)
|
||||
|
||||
#### Training performance results
|
||||
|
||||
##### Training performance: NVIDIA DGX1-16G (8x V100 16G)
|
||||
|
||||
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 893.09 img/s | 380.44 img/s | 2.35x | 1.00x | 1.00x |
|
||||
| 8 | 6888.75 img/s | 2945.37 img/s | 2.34x | 7.71x | 7.74x |
|
||||
|
||||
##### Training performance: NVIDIA DGX1-32G (8x V100 32G)
|
||||
|
||||
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 849.63 img/s | 373.93 img/s | 2.27x | 1.00x | 1.00x |
|
||||
| 8 | 6614.15 img/s | 2911.22 img/s | 2.27x | 7.78x | 7.79x |
|
||||
|
||||
##### Training performance: NVIDIA DGX2 (16x V100 32G)
|
||||
|
||||
| **GPUs** | **Mixed Precision** | **FP32** | **Mixed Precision speedup** | **Mixed Precision Strong Scaling** | **FP32 Strong Scaling** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 894.41 img/s | 402.23 img/s | 2.22x | 1.00x | 1.00x |
|
||||
| 16 | 13443.82 img/s | 6263.41 img/s | 2.15x | 15.03x | 15.57x |
|
||||
|
||||
#### Training Time for 90 Epochs
|
||||
|
||||
##### Training time: NVIDIA DGX-1 (8x V100 16G)
|
||||
|
||||
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|
||||
|:-:|:-:|:-:|
|
||||
| 1 | ~ 41 h | ~ 95 h |
|
||||
| 8 | ~ 7 h | ~ 14 h |
|
||||
|
||||
##### Training time: NVIDIA DGX-2 (16x V100 32G)
|
||||
|
||||
| **GPUs** | **Mixed Precision training time** | **FP32 training time** |
|
||||
|:-:|:-:|:-:|
|
||||
| 1 | ~ 41 h | ~ 90 h |
|
||||
| 16 | ~ 5 h | ~ 8 h |
|
||||
|
||||
|
||||
|
||||
#### Inference performance results
|
||||
|
||||
##### Inference performance: NVIDIA DGX-1 (1x V100 16G)
|
||||
|
||||
###### FP32 Inference Latency
|
||||
|
||||
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 136.82 img/s | 7.12ms | 7.25ms | 8.36ms | 10.92ms |
|
||||
| 2 | 266.86 img/s | 7.27ms | 7.41ms | 7.85ms | 9.11ms |
|
||||
| 4 | 521.76 img/s | 7.44ms | 7.58ms | 8.14ms | 10.09ms |
|
||||
| 8 | 766.22 img/s | 10.18ms | 10.46ms | 10.97ms | 12.75ms |
|
||||
| 16 | 976.36 img/s | 15.79ms | 15.88ms | 15.95ms | 16.63ms |
|
||||
| 32 | 1092.27 img/s | 28.63ms | 28.71ms | 28.76ms | 29.30ms |
|
||||
| 64 | 1161.55 img/s | 53.69ms | 53.86ms | 53.90ms | 54.23ms |
|
||||
| 128 | 1209.12 img/s | 104.24ms | 104.68ms | 104.80ms | 105.00ms |
|
||||
| 256 | N/A | N/A | N/A | N/A | N/A |
|
||||
|
||||
###### Mixed Precision Inference Latency
|
||||
|
||||
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 114.97 img/s | 8.56ms | 9.32ms | 11.43ms | 12.79ms |
|
||||
| 2 | 238.70 img/s | 8.20ms | 8.75ms | 9.49ms | 12.31ms |
|
||||
| 4 | 448.69 img/s | 8.67ms | 9.20ms | 9.97ms | 10.60ms |
|
||||
| 8 | 875.00 img/s | 8.88ms | 9.31ms | 9.80ms | 10.82ms |
|
||||
| 16 | 1746.07 img/s | 8.89ms | 9.05ms | 9.56ms | 12.81ms |
|
||||
| 32 | 2004.28 img/s | 14.07ms | 14.14ms | 14.31ms | 14.92ms |
|
||||
| 64 | 2254.60 img/s | 25.93ms | 26.05ms | 26.07ms | 26.17ms |
|
||||
| 128 | 2360.14 img/s | 50.14ms | 50.28ms | 50.34ms | 50.68ms |
|
||||
| 256 | 2342.13 img/s | 96.74ms | 96.91ms | 96.99ms | 97.14ms |
|
||||
|
||||
|
||||
|
||||
##### Inference performance: NVIDIA T4
|
||||
|
||||
###### FP32 Inference Latency
|
||||
|
||||
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 179.85 img/s | 5.51ms | 5.65ms | 7.34ms | 10.97ms |
|
||||
| 2 | 348.12 img/s | 5.67ms | 5.95ms | 6.33ms | 9.81ms |
|
||||
| 4 | 556.27 img/s | 7.03ms | 7.34ms | 8.13ms | 9.65ms |
|
||||
| 8 | 740.43 img/s | 10.32ms | 10.33ms | 10.60ms | 13.87ms |
|
||||
| 16 | 909.17 img/s | 17.19ms | 17.15ms | 18.13ms | 21.06ms |
|
||||
| 32 | 999.07 img/s | 31.07ms | 31.12ms | 31.17ms | 32.41ms |
|
||||
| 64 | 1090.47 img/s | 57.62ms | 57.84ms | 57.91ms | 58.05ms |
|
||||
| 128 | 1142.46 img/s | 110.94ms | 111.15ms | 111.23ms | 112.16ms |
|
||||
| 256 | N/A | N/A | N/A | N/A | N/A |
|
||||
|
||||
###### Mixed Precision Inference Latency
|
||||
|
||||
| **batch size** | **Throughput Avg** | **Latency Avg** | **Latency 90%** | **Latency 95%** | **Latency 99%** |
|
||||
|:-:|:-:|:-:|:-:|:-:|:-:|
|
||||
| 1 | 163.78 img/s | 6.05ms | 5.92ms | 7.98ms | 11.58ms |
|
||||
| 2 | 333.43 img/s | 5.91ms | 6.05ms | 6.63ms | 11.52ms |
|
||||
| 4 | 645.45 img/s | 6.04ms | 6.33ms | 7.01ms | 8.90ms |
|
||||
| 8 | 1164.15 img/s | 6.73ms | 7.31ms | 8.04ms | 12.41ms |
|
||||
| 16 | 1606.42 img/s | 9.53ms | 9.86ms | 10.52ms | 17.01ms |
|
||||
| 32 | 1857.29 img/s | 15.67ms | 15.61ms | 16.14ms | 18.66ms |
|
||||
| 64 | 2011.62 img/s | 28.64ms | 28.69ms | 28.82ms | 31.06ms |
|
||||
| 128 | 2083.90 img/s | 54.87ms | 54.96ms | 54.99ms | 55.27ms |
|
||||
| 256 | 2043.72 img/s | 106.51ms | 106.62ms | 106.68ms | 107.03ms |
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Release notes
|
||||
|
||||
### Changelog
|
||||
|
||||
1. September 2018
|
||||
* Initial release
|
||||
2. January 2019
|
||||
* Added options: label smoothing, fan-in initialization, skipping weight decay on batch norm gamma and bias.
|
||||
3. May 2019
|
||||
* Cosine LR schedule
|
||||
* MixUp regularization
|
||||
* DALI support
|
||||
* DGX2 configurations
|
||||
* gradient accumulation
|
||||
4. July 2019
|
||||
* DALI-CPU dataloader
|
||||
* Updated README
|
||||
|
||||
### Known issues
|
||||
|
||||
There are no known issues with this model.
|
||||
|
||||
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --amp --static-loss-scale 128 --epochs 90
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 256 --fp16 --static-loss-scale 128 --epochs 90
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 8 ./main.py /imagenet --data-backend dali-cpu --raport-file raport.json -j5 -p 100 --lr 2.048 --optimizer-batch-size 2048 --warmup 8 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 250 --mixup 0.2
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 50
|
|
@ -0,0 +1 @@
|
|||
python ./multiproc.py --nproc_per_node 16 ./main.py /imagenet --data-backend dali-gpu --raport-file raport.json -j5 -p 100 --lr 4.096 --optimizer-batch-size 4096 --warmup 16 --arch resnet50 -c fanin --label-smoothing 0.1 --lr-schedule cosine --mom 0.875 --wd 3.0517578125e-05 --workspace ${1:-./} -b 128 --epochs 90
|
|
@ -0,0 +1,8 @@
|
|||
name: designer-cv-transform
|
||||
channels:
|
||||
- defaults
|
||||
dependencies:
|
||||
- pip=20.2
|
||||
- python=3.7.9
|
||||
- pip:
|
||||
- azureml-designer-cv-modules==0.0.41
|
|
@ -0,0 +1,127 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/development/CommandComponent.schema.json
|
||||
type: command
|
||||
|
||||
name: microsoftsamples_init_image_transformation
|
||||
display_name: Init Image Transformation
|
||||
description: Initialize image transformation.
|
||||
|
||||
version: 1
|
||||
|
||||
inputs:
|
||||
resize:
|
||||
description: Resize the input PIL Image to the given size
|
||||
type: string
|
||||
default: True
|
||||
enum: ['True', 'False']
|
||||
size:
|
||||
description: Desired output size
|
||||
type: integer
|
||||
default: 256
|
||||
center_crop:
|
||||
description: Crops the given PIL Image at the center
|
||||
type: string
|
||||
default: False
|
||||
enum: ['True', 'False']
|
||||
crop_size:
|
||||
description: Desired output size of the crop
|
||||
type: integer
|
||||
default: 224
|
||||
pad:
|
||||
description: Pad the given PIL Image on all sides with the given "pad" value
|
||||
type: string
|
||||
default: False
|
||||
enum: ['True', 'False']
|
||||
padding:
|
||||
description: Padding on each border
|
||||
type: integer
|
||||
default: 0
|
||||
color_jitter:
|
||||
description: Randomly change the brightness, contrast and saturation of an image
|
||||
type: boolean
|
||||
default: false
|
||||
grayscale:
|
||||
description: Convert image to grayscale
|
||||
type: boolean
|
||||
default: false
|
||||
random_resized_crop:
|
||||
description: Crop the given PIL Image to random size and aspect ratio
|
||||
type: string
|
||||
default: False
|
||||
enum: ['True', 'False']
|
||||
random_resized_crop_size:
|
||||
description: Expected output size of each edge
|
||||
type: integer
|
||||
default: 256
|
||||
random_crop:
|
||||
description: Crop the given PIL Image at a random location
|
||||
type: string
|
||||
default: False
|
||||
enum: ['True', 'False']
|
||||
random_crop_size:
|
||||
description: Desired output size of the crop
|
||||
type: integer
|
||||
default: 224
|
||||
random_horizontal_flip:
|
||||
description: Horizontally flip the given PIL Image randomly with a given probability
|
||||
type: boolean
|
||||
default: false
|
||||
random_vertical_flip:
|
||||
description: Vertically flip the given PIL Image randomly with a given probability
|
||||
type: boolean
|
||||
default: false
|
||||
random_rotation:
|
||||
description: Rotate the image by angle
|
||||
type: boolean
|
||||
default: false
|
||||
random_rotation_degrees:
|
||||
description: Range of degrees to select from
|
||||
type: integer
|
||||
default: 0
|
||||
random_affine:
|
||||
description: Random affine transformation of the image keeping center invariant
|
||||
type: boolean
|
||||
default: false
|
||||
random_affine_degrees:
|
||||
description: Range of degrees to select from
|
||||
type: integer
|
||||
default: 0
|
||||
random_grayscale:
|
||||
description: Randomly convert image to grayscale with a probability of p (default 0.1)
|
||||
type: boolean
|
||||
default: false
|
||||
random_perspective:
|
||||
description: Perform a perspective transformation of the given PIL Image randomly with a given probability
|
||||
type: boolean
|
||||
default: false
|
||||
outputs:
|
||||
output_path:
|
||||
type: uri_folder
|
||||
description: Output image transformation
|
||||
|
||||
command: >-
|
||||
python -m azureml.designer.modules.computer_vision.transform.init_image_transformation.init_image_transformation
|
||||
--resize ${{inputs.resize}}
|
||||
--size ${{inputs.size}}
|
||||
--center-crop ${{inputs.center_crop}}
|
||||
--crop-size ${{inputs.crop_size}}
|
||||
--pad ${{inputs.pad}}
|
||||
--padding ${{inputs.padding}}
|
||||
--color-jitter ${{inputs.color_jitter}}
|
||||
--grayscale ${{inputs.grayscale}}
|
||||
--random-resized-crop ${{inputs.random_resized_crop}}
|
||||
--random-resized-crop-size ${{inputs.random_resized_crop_size}}
|
||||
--random-crop ${{inputs.random_crop}}
|
||||
--random-crop-size ${{inputs.random_crop_size}}
|
||||
--random-horizontal-flip ${{inputs.random_horizontal_flip}}
|
||||
--random-vertical-flip ${{inputs.random_vertical_flip}}
|
||||
--random-rotation ${{inputs.random_rotation}}
|
||||
--random-rotation-degrees ${{inputs.random_rotation_degrees}}
|
||||
--random-affine ${{inputs.random_affine}}
|
||||
--random-affine-degrees ${{inputs.random_affine_degrees}}
|
||||
--random-grayscale ${{inputs.random_grayscale}}
|
||||
--random-perspective ${{inputs.random_perspective}}
|
||||
--output-path ${{outputs.output_path}}
|
||||
|
||||
environment:
|
||||
conda_file: ./conda.yaml
|
||||
image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211124.v1
|
|
@ -0,0 +1,95 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
training_image: # using local data will create an anonymous data asset
|
||||
type: uri_folder
|
||||
path: ./data/train
|
||||
validation_image:
|
||||
type: uri_folder
|
||||
path: ./data/val
|
||||
|
||||
# </inputs_and_outputs>
|
||||
|
||||
# <jobs>
|
||||
settings:
|
||||
default_datastore: azureml:workspaceblobstore
|
||||
default_compute: azureml:cpu-cluster
|
||||
continue_on_step_failure: false
|
||||
|
||||
jobs:
|
||||
convert_training_image:
|
||||
type: command
|
||||
component: file:./convert_to_image_directory/entry.spec.yaml
|
||||
inputs:
|
||||
input_path: ${{parent.inputs.training_image}}
|
||||
|
||||
convert_evaluation_image:
|
||||
type: command
|
||||
component: file:./convert_to_image_directory/entry.spec.yaml
|
||||
inputs:
|
||||
input_path: ${{parent.inputs.validation_image}}
|
||||
|
||||
init_transformation:
|
||||
type: command
|
||||
component: file:./init_image_transformation/entry.spec.yaml
|
||||
inputs:
|
||||
resize: "False"
|
||||
size: 256
|
||||
center_crop: 224
|
||||
pad: "False"
|
||||
padding: 0
|
||||
color_jitter: "False"
|
||||
grayscale: "False"
|
||||
random_resized_crop: "False"
|
||||
random_resized_crop_size: 256
|
||||
random_crop: "False"
|
||||
random_crop_size: 224
|
||||
random_horizontal_flip: "True"
|
||||
random_vertical_flip: "True"
|
||||
random_rotation: "False"
|
||||
random_rotation_degrees: 0
|
||||
random_affine: "False"
|
||||
random_affine_degrees: 0
|
||||
random_grayscale: "False"
|
||||
random_perspective: "False"
|
||||
|
||||
transform_on_training_image:
|
||||
type: command
|
||||
component: file:./apply_image_transformation/entry.spec.yaml
|
||||
inputs:
|
||||
mode: "For training"
|
||||
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
|
||||
input_image_dir_path: ${{parent.jobs.convert_training_image.outputs.output_path}}
|
||||
|
||||
transform_on_evaluation_image:
|
||||
type: command
|
||||
component: file:./apply_image_transformation/entry.spec.yaml
|
||||
inputs:
|
||||
mode: "For inference"
|
||||
input_image_transform_path: ${{parent.jobs.init_transformation.outputs.output_path}}
|
||||
input_image_dir_path: ${{parent.jobs.convert_evaluation_image.outputs.output_path}}
|
||||
|
||||
train:
|
||||
type: command
|
||||
component: file:./image_cnn_train/entry.spec.yaml
|
||||
compute: azureml:gpu-cluster
|
||||
inputs:
|
||||
train_data: ${{parent.jobs.transform_on_training_image.outputs.output_path}}
|
||||
valid_data: ${{parent.jobs.transform_on_evaluation_image.outputs.output_path}}
|
||||
data_backend: "pytorch"
|
||||
epochs: 4
|
||||
seed: 123
|
||||
batch_size: 16
|
||||
save_checkpoint_epochs: 2
|
||||
outputs:
|
||||
workspace:
|
||||
type: uri_folder
|
||||
mode: upload
|
||||
distribution:
|
||||
type: mpi
|
||||
process_count_per_instance: 1
|
||||
resources:
|
||||
instance_count: 2
|
||||
# </jobs>
|
|
@ -0,0 +1,15 @@
|
|||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- azurecli
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: This sample shows how to run a distributed job on an Azure ML compute cluster. It uses the CIFAR-10 dataset, processes the data, trains a model, and then evaluates the output model.
|
||||
---
|
||||
|
||||
# Submit pipeline job
|
||||
|
||||
This example shows how to build a three-step pipeline. You need to use a GPU SKU or a powerful CPU SKU such as `STANDARD_D15_V2` for the train and eval steps in this pipeline.
|
||||
|
||||
Please set `process_count_per_instance` to the number of GPU cards you have to fully utilize your compute resources.
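Assuming the Azure ML CLI v2 extension is installed and your workspace has the `cpu-cluster` and `gpu-cluster` computes referenced by the pipeline (see `cli/setup.sh`), the pipeline can be submitted along these lines:

```
az ml job create --file pipeline.yml --web
```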
|
|
@ -0,0 +1,56 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
display_name: cifar-10-pipeline-example
|
||||
experiment_name: cifar-10-pipeline-example
|
||||
jobs:
|
||||
get-data:
|
||||
type: command
|
||||
command: >-
|
||||
wget https://azuremlexamples.blob.core.windows.net/datasets/cifar-10-python.tar.gz;
|
||||
tar -xvzf cifar-10-python.tar.gz -C ${{outputs.cifar}};
|
||||
rm cifar-10-python.tar.gz;
|
||||
compute: azureml:gpu-cluster
|
||||
environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest
|
||||
outputs:
|
||||
cifar:
|
||||
type: uri_folder
|
||||
mode: upload
|
||||
train-model:
|
||||
type: command
|
||||
command: >-
|
||||
python main.py
|
||||
--data-dir ${{inputs.cifar}}
|
||||
--epochs ${{inputs.epochs}}
|
||||
--model-dir ${{outputs.model_dir}}
|
||||
code: src/train-model
|
||||
inputs:
|
||||
epochs: 1
|
||||
cifar: ${{parent.jobs.get-data.outputs.cifar}}
|
||||
outputs:
|
||||
model_dir:
|
||||
type: uri_folder
|
||||
mode: upload
|
||||
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
|
||||
compute: azureml:gpu-cluster
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 1
|
||||
resources:
|
||||
instance_count: 2
|
||||
eval-model:
|
||||
type: command
|
||||
command: >-
|
||||
python main.py
|
||||
--data-dir ${{inputs.cifar}}
|
||||
--model-dir ${{inputs.model_dir}}/model
|
||||
code: src/eval-model
|
||||
environment: azureml:AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu@latest
|
||||
compute: azureml:gpu-cluster
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 2
|
||||
resources:
|
||||
instance_count: 1
|
||||
inputs:
|
||||
cifar: ${{parent.jobs.get-data.outputs.cifar}}
|
||||
model_dir: ${{parent.jobs.train-model.outputs.model_dir}}
|
|
@ -0,0 +1,147 @@
|
|||
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
|
||||
# BSD 3-Clause License
|
||||
#
|
||||
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
|
||||
# ==============================================================================
|
||||
|
||||
# imports
|
||||
import os
|
||||
import mlflow
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
import torchvision
|
||||
import torchvision.transforms as transforms
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
|
||||
# TODO - add mlflow logging
|
||||
|
||||
# define functions
|
||||
def evaluate(test_loader, model, device):
|
||||
classes = (
|
||||
"plane",
|
||||
"car",
|
||||
"bird",
|
||||
"cat",
|
||||
"deer",
|
||||
"dog",
|
||||
"frog",
|
||||
"horse",
|
||||
"ship",
|
||||
"truck",
|
||||
)
|
||||
|
||||
model.eval()
|
||||
|
||||
correct = 0
|
||||
total = 0
|
||||
class_correct = list(0.0 for i in range(10))
|
||||
class_total = list(0.0 for i in range(10))
|
||||
with torch.no_grad():
|
||||
for data in test_loader:
|
||||
images, labels = data[0].to(device), data[1].to(device)
|
||||
outputs = model(images)
|
||||
_, predicted = torch.max(outputs.data, 1)
|
||||
total += labels.size(0)
|
||||
correct += (predicted == labels).sum().item()
|
||||
c = (predicted == labels).squeeze()
|
||||
for i in range(10):
|
||||
label = labels[i]
|
||||
class_correct[label] += c[i].item()
|
||||
class_total[label] += 1
|
||||
|
||||
# print total test set accuracy
|
||||
print(
|
||||
"Accuracy of the network on the 10000 test images: %d %%"
|
||||
% (100 * correct / total)
|
||||
)
|
||||
|
||||
# print test accuracy for each of the classes
|
||||
for i in range(10):
|
||||
print(
|
||||
"Accuracy of %5s : %2d %%"
|
||||
% (classes[i], 100 * class_correct[i] / class_total[i])
|
||||
)
|
||||
|
||||
|
||||
def main(args):
|
||||
# get PyTorch environment variables
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
rank = int(os.environ["RANK"])
|
||||
local_rank = int(os.environ["LOCAL_RANK"])
|
||||
|
||||
distributed = world_size > 1
|
||||
|
||||
# set device
|
||||
if distributed and torch.cuda.is_available():
|
||||
device = torch.device("cuda", local_rank)
|
||||
else:
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# initialize distributed process group using default env:// method
|
||||
if distributed:
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl" if torch.cuda.is_available() else "gloo"
|
||||
)
|
||||
|
||||
# define test dataset DataLoaders
|
||||
transform = transforms.Compose(
|
||||
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
|
||||
)
|
||||
|
||||
test_set = torchvision.datasets.CIFAR10(
|
||||
root=args.data_dir, train=False, download=False, transform=transform
|
||||
)
|
||||
test_loader = torch.utils.data.DataLoader(
|
||||
test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers
|
||||
)
|
||||
|
||||
# load model
|
||||
model = mlflow.pytorch.load_model(args.model_dir)
|
||||
model = model.to(device)
|
||||
|
||||
# evaluate on full test dataset
|
||||
if not distributed or rank == 0:
|
||||
evaluate(test_loader, model, device)
|
||||
|
||||
|
||||
def parse_args():
|
||||
# setup argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# add arguments
|
||||
parser.add_argument(
|
||||
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-dir", type=str, default="./", help="input directory for model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
default=16,
|
||||
type=int,
|
||||
help="mini batch size for each gpu/process",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
default=2,
|
||||
type=int,
|
||||
help="number of data loading workers for each gpu/process",
|
||||
)
|
||||
|
||||
# parse args
|
||||
args = parser.parse_args()
|
||||
|
||||
# return args
|
||||
return args
|
||||
|
||||
|
||||
# run script
|
||||
if __name__ == "__main__":
|
||||
# parse args
|
||||
args = parse_args()
|
||||
|
||||
# call main function
|
||||
main(args)
|
|
@ -0,0 +1,199 @@
|
|||
# Copyright (c) 2017 Facebook, Inc. All rights reserved.
|
||||
# BSD 3-Clause License
|
||||
#
|
||||
# Script adapted from: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py
|
||||
# ==============================================================================
|
||||
|
||||
# imports
|
||||
import os
|
||||
import mlflow
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
import torchvision
|
||||
import torchvision.transforms as transforms
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
|
||||
# TODO - add mlflow logging
|
||||
|
||||
# define network architecture
|
||||
class Net(nn.Module):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
self.conv1 = nn.Conv2d(3, 32, 3)
|
||||
self.pool = nn.MaxPool2d(2, 2)
|
||||
self.conv2 = nn.Conv2d(32, 64, 3)
|
||||
self.conv3 = nn.Conv2d(64, 128, 3)
|
||||
self.fc1 = nn.Linear(128 * 6 * 6, 120)
|
||||
self.dropout = nn.Dropout(p=0.2)
|
||||
self.fc2 = nn.Linear(120, 84)
|
||||
self.fc3 = nn.Linear(84, 10)
|
||||
|
||||
def forward(self, x):
|
||||
x = F.relu(self.conv1(x))
|
||||
x = self.pool(F.relu(self.conv2(x)))
|
||||
x = self.pool(F.relu(self.conv3(x)))
|
||||
x = x.view(-1, 128 * 6 * 6)
|
||||
x = self.dropout(F.relu(self.fc1(x)))
|
||||
x = F.relu(self.fc2(x))
|
||||
x = self.fc3(x)
|
||||
return x
|
||||
|
||||
|
||||
# define functions
|
||||
def train(train_loader, model, criterion, optimizer, epoch, device, print_freq, rank):
|
||||
running_loss = 0.0
|
||||
for i, data in enumerate(train_loader, 0):
|
||||
# get the inputs; data is a list of [inputs, labels]
|
||||
inputs, labels = data[0].to(device), data[1].to(device)
|
||||
|
||||
# zero the parameter gradients
|
||||
optimizer.zero_grad()
|
||||
|
||||
# forward + backward + optimize
|
||||
outputs = model(inputs)
|
||||
loss = criterion(outputs, labels)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# print statistics
|
||||
running_loss += loss.item()
|
||||
if i % print_freq == 0: # print every print_freq mini-batches
|
||||
print(
|
||||
"Rank %d: [%d, %5d] loss: %.3f"
|
||||
% (rank, epoch + 1, i + 1, running_loss / print_freq)
|
||||
)
|
||||
running_loss = 0.0
|
||||
|
||||
|
||||
def main(args):
|
||||
# get PyTorch environment variables
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
rank = int(os.environ["RANK"])
|
||||
local_rank = int(os.environ["LOCAL_RANK"])
|
||||
|
||||
distributed = world_size > 1
|
||||
|
||||
# set device
|
||||
if distributed and torch.cuda.is_available():
|
||||
device = torch.device("cuda", local_rank)
|
||||
else:
|
||||
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# initialize distributed process group using default env:// method
|
||||
if distributed:
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl" if torch.cuda.is_available() else "gloo"
|
||||
)
|
||||
|
||||
# define train and dataset DataLoaders
|
||||
transform = transforms.Compose(
|
||||
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
|
||||
)
|
||||
|
||||
train_set = torchvision.datasets.CIFAR10(
|
||||
root=args.data_dir, train=True, download=False, transform=transform
|
||||
)
|
||||
|
||||
if distributed:
|
||||
train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
|
||||
else:
|
||||
train_sampler = None
|
||||
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
train_set,
|
||||
batch_size=args.batch_size,
|
||||
shuffle=(train_sampler is None),
|
||||
num_workers=args.workers,
|
||||
sampler=train_sampler,
|
||||
)
|
||||
|
||||
model = Net().to(device)
|
||||
|
||||
# wrap model with DDP
|
||||
if distributed and torch.cuda.is_available():
|
||||
model = nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[local_rank], output_device=local_rank
|
||||
)
|
||||
|
||||
# define loss function and optimizer
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
optimizer = optim.SGD(
|
||||
model.parameters(), lr=args.learning_rate, momentum=args.momentum
|
||||
)
|
||||
|
||||
# train the model
|
||||
for epoch in range(args.epochs):
|
||||
print("Rank %d: Starting epoch %d" % (rank, epoch))
|
||||
if distributed:
|
||||
train_sampler.set_epoch(epoch)
|
||||
model.train()
|
||||
train(
|
||||
train_loader,
|
||||
model,
|
||||
criterion,
|
||||
optimizer,
|
||||
epoch,
|
||||
device,
|
||||
args.print_freq,
|
||||
rank,
|
||||
)
|
||||
|
||||
print("Rank %d: Finished Training" % (rank))
|
||||
|
||||
if not distributed or rank == 0:
|
||||
# log model
|
||||
mlflow.pytorch.save_model(model, f"{args.model_dir}/model")
|
||||
|
||||
|
||||
def parse_args():
|
||||
# setup argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# add arguments
|
||||
parser.add_argument(
|
||||
"--data-dir", type=str, help="directory containing CIFAR-10 dataset"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-dir", type=str, default="./", help="output directory for model"
|
||||
)
|
||||
parser.add_argument("--epochs", default=10, type=int, help="number of epochs")
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
default=16,
|
||||
type=int,
|
||||
help="mini batch size for each gpu/process",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
default=2,
|
||||
type=int,
|
||||
help="number of data loading workers for each gpu/process",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--learning-rate", default=0.001, type=float, help="learning rate"
|
||||
)
|
||||
parser.add_argument("--momentum", default=0.9, type=float, help="momentum")
|
||||
parser.add_argument(
|
||||
"--print-freq",
|
||||
default=200,
|
||||
type=int,
|
||||
help="frequency of printing training statistics",
|
||||
)
|
||||
|
||||
# parse args
|
||||
args = parser.parse_args()
|
||||
|
||||
# return args
|
||||
return args
|
||||
|
||||
|
||||
# run script
|
||||
if __name__ == "__main__":
|
||||
# parse args
|
||||
args = parse_args()
|
||||
|
||||
# call main function
|
||||
main(args)
|
|
@ -36,7 +36,6 @@ def main(args):
|
|||
jobs += sorted(
|
||||
glob.glob("jobs/pipelines-with-components/**/*pipeline*.yml", recursive=True)
|
||||
)
|
||||
jobs += sorted(glob.glob("jobs/*/basics/**/*pipeline*.yml", recursive=True))
|
||||
jobs = [
|
||||
job.replace(".yml", "")
|
||||
for job in jobs
|
||||
|
@ -290,6 +289,11 @@ def parse_path(path):
|
|||
def write_job_workflow(job):
|
||||
filename, project_dir, hyphenated = parse_path(job)
|
||||
creds = "${{secrets.AZ_CREDS}}"
|
||||
run_pipeline_job_path = (
|
||||
"\n - cli/run-pipeline-jobs.sh"
|
||||
if hyphenated.startswith("jobs-pipelines")
|
||||
else ""
|
||||
)
|
||||
workflow_yaml = f"""name: cli-{hyphenated}
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
@ -300,7 +304,7 @@ on:
|
|||
- main
|
||||
paths:
|
||||
- cli/{project_dir}/**
|
||||
- .github/workflows/cli-{hyphenated}.yml
|
||||
- .github/workflows/cli-{hyphenated}.yml{run_pipeline_job_path}
|
||||
- cli/setup.sh
|
||||
jobs:
|
||||
build:
|
||||
|
|
|
@ -37,6 +37,11 @@ pwd
|
|||
az ml job create --file pipeline.yml
|
||||
cd ../../../../
|
||||
|
||||
cd jobs/pipelines-with-components/basics/3c_pipeline_with_hyperparameter_sweep
|
||||
pwd
|
||||
az ml job create --file pipeline.yml
|
||||
cd ../../../../
|
||||
|
||||
cd jobs/pipelines-with-components/basics/4a_local_data_input
|
||||
pwd
|
||||
az ml job create --file pipeline.yml
|
||||
|
@ -47,12 +52,6 @@ pwd
|
|||
az ml job create --file pipeline.yml
|
||||
cd ../../../../
|
||||
|
||||
# cd jobs/pipelines-with-components/basics/4c_dataset_input
|
||||
# pwd
|
||||
# az ml data create --file data.yml --version $target_version
|
||||
# az ml job create --file pipeline.yml
|
||||
# cd ../../../../
|
||||
|
||||
cd jobs/pipelines-with-components/basics/4c_web_url_input
|
||||
pwd
|
||||
az ml job create --file pipeline.yml
|
||||
|
@ -103,9 +102,9 @@ pwd
|
|||
az ml job create --file pipeline.yml
|
||||
cd ../../../
|
||||
|
||||
# cd jobs/pipelines/cifar-10
|
||||
# pwd
|
||||
# az ml job create --file pipeline.yml --web
|
||||
# cd ../../../
|
||||
cd jobs/pipelines/cifar-10
|
||||
pwd
|
||||
az ml job create --file pipeline.yml --web
|
||||
cd ../../../
|
||||
|
||||
az --version
|