msalvaris 2018-08-13 15:59:38 +00:00
Parent 7e948b25a0
Commit c09ac66272
3 changed files: 65 additions and 50 deletions

View file

@@ -4,8 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Tensorflow Model Distributed on Batch AI\n",
"In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
"# Train Keras Model Distributed on Batch AI\n",
"In this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
" * [Create Azure Resources](#azure_resources)\n",
" * [Create Fileserver(NFS)](#create_fileshare)\n",
" * [Configure Batch AI Cluster](#configure_cluster)\n",
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -26,7 +26,7 @@
"from getpass import getpass\n",
"import os\n",
"import json\n",
"from utils import get_password, write_json_to_file"
"from utils import get_password, write_json_to_file, dotenv_for"
]
},
{
@@ -49,7 +49,7 @@
"outputs": [],
"source": [
"# Variables for Batch AI - change as necessary\n",
"ID = \"ddtf2\"\n",
"ID = \"ddkeras\"\n",
"GROUP_NAME = f\"batch{ID}rg\"\n",
"STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n",
"FILE_SHARE_NAME = f\"batch{ID}share\"\n",
@@ -62,7 +62,7 @@
"PROCESSES_PER_NODE = 4\n",
"LOCATION = \"eastus\"\n",
"NFS_NAME = f\"batch{ID}nfs\"\n",
"EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n",
"EXPERIMENT = f\"distributed_keras_{GPU_TYPE}\"\n",
"USERNAME = \"batchai_user\"\n",
"USE_FAKE = False\n",
"DOCKERHUB = \"caia\" #\"<YOUR DOCKERHUB>\""
@@ -74,7 +74,8 @@
"metadata": {},
"outputs": [],
"source": [
"FAKE='-env FAKE=True' if USE_FAKE else ''"
"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
]
},
{
@@ -244,7 +245,7 @@
"outputs": [],
"source": [
"!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n",
"-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS"
"-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
]
},
{
@@ -324,7 +325,7 @@
"metadata": {},
"outputs": [],
"source": [
"!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
"!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
]
},
{
@@ -333,7 +334,7 @@
"metadata": {},
"outputs": [],
"source": [
"!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
"!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
]
},
{
@@ -367,8 +368,8 @@
"metadata": {},
"outputs": [],
"source": [
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
]
},
@@ -394,7 +395,7 @@
" --afs-name $FILE_SHARE_NAME \\\n",
" --afs-mount-path extfs \\\n",
" --user-name $USERNAME \\\n",
" --password {get_password(dotenv_path)} \\\n",
" --password {get_password(dotenv_for())} \\\n",
" --storage-account-name $STORAGE_ACCOUNT_NAME \\\n",
" --storage-account-key $storage_account_key \\\n",
" --nfs $NFS_NAME \\\n",
@@ -457,7 +458,7 @@
" \"customToolkitSettings\": {\n",
" \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
" echo $AZ_BATCH_HOST_LIST; \\\n",
" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
" mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
" -env I_MPI_FABRICS=dapl \\\n",
" -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
" -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
@@ -465,7 +466,7 @@
" -env I_MPI_HYDRA_DEBUG=on \\\n",
" -env DISTRIBUTED=True \\\n",
" {FAKE} \\\n",
" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n",
" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py\"\n",
" },\n",
" \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
" \"inputDirectories\": [{\n",
@@ -488,7 +489,7 @@
" }],\n",
" \"containerSettings\": {\n",
" \"imageSourceRegistry\": {\n",
" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-keras\"\n",
" }\n",
" }\n",
" }\n",
@@ -510,7 +511,7 @@
"metadata": {},
"outputs": [],
"source": [
"JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
"JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
]
},
{
@@ -523,7 +524,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
@@ -571,7 +574,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"

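For reference, the Keras job defined above runs imagenet_keras_horovod.py, which is uploaded to the file share but not included in this diff. Below is a minimal sketch of the standard Horovod/Keras pattern such a script follows; the model, optimizer and hyperparameters are illustrative assumptions, not the repository's code.

    import keras
    import tensorflow as tf
    import horovod.keras as hvd

    hvd.init()                                                        # one MPI rank per GPU
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())   # pin each rank to its own GPU
    keras.backend.set_session(tf.Session(config=config))

    model = keras.applications.ResNet50(weights=None)
    opt = keras.optimizers.SGD(lr=0.1 * hvd.size())                  # scale the learning rate with worker count
    opt = hvd.DistributedOptimizer(opt)                              # average gradients across ranks
    model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]  # start all ranks from rank 0's weights
    if hvd.rank() == 0:                                               # only one rank writes checkpoints
        callbacks.append(keras.callbacks.ModelCheckpoint("checkpoint-{epoch}.h5"))

mpirun launches TOTAL_PROCESSES copies of this script, PROCESSES_PER_NODE per node, which is why the -n argument in the job's commandLine now receives TOTAL_PROCESSES rather than NUM_NODES.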
View file

@@ -4,8 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Tensorflow Model Distributed on Batch AI\n",
"In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
"# Train PyTorch Model Distributed on Batch AI\n",
"In this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
" * [Create Azure Resources](#azure_resources)\n",
" * [Create Fileserver(NFS)](#create_fileshare)\n",
" * [Configure Batch AI Cluster](#configure_cluster)\n",
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -26,7 +26,7 @@
"from getpass import getpass\n",
"import os\n",
"import json\n",
"from utils import get_password, write_json_to_file"
"from utils import get_password, write_json_to_file, dotenv_for"
]
},
{
@@ -49,7 +49,7 @@
"outputs": [],
"source": [
"# Variables for Batch AI - change as necessary\n",
"ID = \"ddtf2\"\n",
"ID = \"ddpytorch\"\n",
"GROUP_NAME = f\"batch{ID}rg\"\n",
"STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n",
"FILE_SHARE_NAME = f\"batch{ID}share\"\n",
@@ -62,7 +62,7 @@
"PROCESSES_PER_NODE = 4\n",
"LOCATION = \"eastus\"\n",
"NFS_NAME = f\"batch{ID}nfs\"\n",
"EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n",
"EXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\n",
"USERNAME = \"batchai_user\"\n",
"USE_FAKE = False\n",
"DOCKERHUB = \"caia\" #\"<YOUR DOCKERHUB>\""
@@ -74,7 +74,8 @@
"metadata": {},
"outputs": [],
"source": [
"FAKE='-env FAKE=True' if USE_FAKE else ''"
"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
]
},
{
@@ -244,7 +245,7 @@
"outputs": [],
"source": [
"!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n",
"-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS"
"-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
]
},
{
@@ -324,7 +325,7 @@
"metadata": {},
"outputs": [],
"source": [
"!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
"!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
]
},
{
@@ -333,7 +334,7 @@
"metadata": {},
"outputs": [],
"source": [
"!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
"!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
]
},
{
@@ -367,9 +368,10 @@
"metadata": {},
"outputs": [],
"source": [
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
"!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/docker.service --path scripts\n",
"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/nodeprep.sh --path scripts"
]
},
{
@@ -394,11 +396,12 @@
" --afs-name $FILE_SHARE_NAME \\\n",
" --afs-mount-path extfs \\\n",
" --user-name $USERNAME \\\n",
" --password {get_password(dotenv_path)} \\\n",
" --password {get_password(dotenv_for())} \\\n",
" --storage-account-name $STORAGE_ACCOUNT_NAME \\\n",
" --storage-account-key $storage_account_key \\\n",
" --nfs $NFS_NAME \\\n",
" --nfs-mount-path nfs "
" --nfs-mount-path nfs \\\n",
" --config-file cluster_config/cluster.json"
]
},
{
@@ -455,17 +458,21 @@
" \"properties\": {\n",
" \"nodeCount\": NUM_NODES,\n",
" \"customToolkitSettings\": {\n",
" \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
" echo $AZ_BATCH_HOST_LIST; \\\n",
" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
" -env I_MPI_FABRICS=dapl \\\n",
" -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
" -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
" -env I_MPI_DEBUG=6 \\\n",
" -env I_MPI_HYDRA_DEBUG=on \\\n",
" -env DISTRIBUTED=True \\\n",
" \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n",
" cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n",
" mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n",
" -bind-to none -map-by slot \\\n",
" -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n",
" -mca btl_tcp_if_include eth0 \\\n",
" -x NCCL_SOCKET_IFNAME=eth0 \\\n",
" -mca btl ^openib \\\n",
" -x NCCL_IB_DISABLE=1 \\\n",
" -x DISTRIBUTED=True \\\n",
" -x AZ_BATCHAI_INPUT_TRAIN \\\n",
" -x AZ_BATCHAI_INPUT_TEST \\\n",
" --allow-run-as-root \\\n",
" {FAKE} \\\n",
" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n",
" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n",
" },\n",
" \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
" \"inputDirectories\": [{\n",
@@ -488,7 +495,7 @@
" }],\n",
" \"containerSettings\": {\n",
" \"imageSourceRegistry\": {\n",
" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-pytorch\"\n",
" }\n",
" }\n",
" }\n",
@@ -510,7 +517,7 @@
"metadata": {},
"outputs": [],
"source": [
"JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
"JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
]
},
{
@@ -580,7 +587,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"

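Similarly, the PyTorch job runs imagenet_pytorch_horovod.py, which is also outside this diff. A minimal sketch of the usual Horovod/PyTorch setup it is expected to follow (illustrative only; FakeData stands in for ImageNet the same way the USE_FAKE flag does in these notebooks):

    import torch
    import torch.utils.data.distributed
    import horovod.torch as hvd
    from torchvision import datasets, models, transforms

    hvd.init()
    torch.cuda.set_device(hvd.local_rank())                          # one GPU per MPI rank

    model = models.resnet50().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * hvd.size())
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)        # all ranks start from identical weights

    dataset = datasets.FakeData(size=1280, transform=transforms.ToTensor())
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())           # each rank trains on its own shard
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, sampler=sampler)

The -x NCCL_SOCKET_IFNAME=eth0 and -x NCCL_IB_DISABLE=1 flags added to the job above pin NCCL to the VMs' eth0 interface and keep it off InfiniBand; they are exported through mpirun so every rank sees them rather than being set inside the script.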
View file

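The third file (the TensorFlow notebook) receives the same two fixes as the hunks below show: mpirun's -n now gets the total number of MPI ranks instead of the node count, and the image value gains the f prefix so {DOCKERHUB} is actually interpolated. A small illustration with assumed values (PROCESSES_PER_NODE = 4 comes from the notebooks; NUM_NODES is set elsewhere in them, 2 is only an example here):

    NUM_NODES = 2                                       # assumed example value
    PROCESSES_PER_NODE = 4                              # as set in the notebooks; one rank per GPU
    TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES    # 8 ranks in total

    DOCKERHUB = "caia"
    image = f"{DOCKERHUB}/distributed-training.horovod-tf"   # without the f prefix the literal "{DOCKERHUB}" is sent
    cmd = f"mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST ..."

With Intel MPI, -n is the total process count, so the old -n {NUM_NODES} launched only NUM_NODES ranks in all and left most of the cluster's GPUs idle.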
@@ -74,7 +74,8 @@
"metadata": {},
"outputs": [],
"source": [
"FAKE='-env FAKE=True' if USE_FAKE else ''"
"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
]
},
{
@@ -457,7 +458,7 @@
" \"customToolkitSettings\": {\n",
" \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
" echo $AZ_BATCH_HOST_LIST; \\\n",
" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
" mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
" -env I_MPI_FABRICS=dapl \\\n",
" -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
" -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
@@ -488,7 +489,7 @@
" }],\n",
" \"containerSettings\": {\n",
" \"imageSourceRegistry\": {\n",
" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
" }\n",
" }\n",
" }\n",