Updates notebooks
Parent: 7e948b25a0
Commit: c09ac66272
@@ -4,8 +4,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Train Tensorflow Model Distributed on Batch AI\n",
-"In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
+"# Train Keras Model Distributed on Batch AI\n",
+"In this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
 " * [Create Azure Resources](#azure_resources)\n",
 " * [Create Fileserver(NFS)](#create_fileshare)\n",
 " * [Configure Batch AI Cluster](#configure_cluster)\n",
@@ -15,7 +15,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -26,7 +26,7 @@
 "from getpass import getpass\n",
 "import os\n",
 "import json\n",
-"from utils import get_password, write_json_to_file"
+"from utils import get_password, write_json_to_file, dotenv_for"
 ]
 },
 {
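The dotenv_for helper imported here is what the later cells use in place of find_dotenv() and the old dotenv_path variable when calling get_password. Its implementation lives in the repository's utils module and is not part of this diff; a purely hypothetical sketch of the kind of helper being assumed:

    # Hypothetical illustration only -- the real utils.dotenv_for is not shown in this diff.
    from dotenv import find_dotenv

    def dotenv_for():
        # Resolve the project's .env file once so callers no longer pass
        # find_dotenv() or a dotenv_path variable themselves.
        return find_dotenv(raise_error_if_not_found=True)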
@@ -49,7 +49,7 @@
 "outputs": [],
 "source": [
 "# Variables for Batch AI - change as necessary\n",
-"ID = \"ddtf2\"\n",
+"ID = \"ddkeras\"\n",
 "GROUP_NAME = f\"batch{ID}rg\"\n",
 "STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n",
 "FILE_SHARE_NAME = f\"batch{ID}share\"\n",
@@ -62,7 +62,7 @@
 "PROCESSES_PER_NODE = 4\n",
 "LOCATION = \"eastus\"\n",
 "NFS_NAME = f\"batch{ID}nfs\"\n",
-"EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n",
+"EXPERIMENT = f\"distributed_keras_{GPU_TYPE}\"\n",
 "USERNAME = \"batchai_user\"\n",
 "USE_FAKE = False\n",
 "DOCKERHUB = \"caia\" #\"<YOUR DOCKERHUB>\""
@@ -74,7 +74,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"FAKE='-env FAKE=True' if USE_FAKE else ''"
+"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
+"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 ]
 },
 {
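The new TOTAL_PROCESSES variable is the total number of MPI ranks the job launches, and it is what the mpirun line later in the notebook now receives instead of NUM_NODES. A minimal worked example of the arithmetic, assuming NUM_NODES = 2 (its actual value is set in a cell outside this diff):

    # Assumed example values: PROCESSES_PER_NODE comes from the variables cell above,
    # NUM_NODES is defined elsewhere in the notebook.
    NUM_NODES = 2
    PROCESSES_PER_NODE = 4
    TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES
    print(TOTAL_PROCESSES)  # 8 -> one MPI rank per GPU across the cluster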
@@ -244,7 +245,7 @@
 "outputs": [],
 "source": [
 "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n",
-"-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS"
+"-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
 ]
 },
 {
@@ -324,7 +325,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
+"!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
 ]
 },
 {
@@ -333,7 +334,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
+"!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
 ]
 },
 {
@@ -367,8 +368,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n",
-"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n",
+"!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n",
+"!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n",
 "!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
 ]
 },
@@ -394,7 +395,7 @@
 " --afs-name $FILE_SHARE_NAME \\\n",
 " --afs-mount-path extfs \\\n",
 " --user-name $USERNAME \\\n",
-" --password {get_password(dotenv_path)} \\\n",
+" --password {get_password(dotenv_for())} \\\n",
 " --storage-account-name $STORAGE_ACCOUNT_NAME \\\n",
 " --storage-account-key $storage_account_key \\\n",
 " --nfs $NFS_NAME \\\n",
@@ -457,7 +458,7 @@
 " \"customToolkitSettings\": {\n",
 " \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
 " echo $AZ_BATCH_HOST_LIST; \\\n",
-" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
+" mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
 " -env I_MPI_FABRICS=dapl \\\n",
 " -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
 " -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
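With Intel MPI, -n is the total rank count while -ppn is the per-node count, so substituting NUM_NODES into -n launched only NUM_NODES ranks in total; the replacement substitutes TOTAL_PROCESSES. A small sketch of the resulting f-string substitution under the same assumed values (NUM_NODES = 2, PROCESSES_PER_NODE = 4):

    # Sketch of the substitution performed by the job's commandLine f-string above.
    NUM_NODES, PROCESSES_PER_NODE = 2, 4            # assumed example values
    TOTAL_PROCESSES = NUM_NODES * PROCESSES_PER_NODE
    old_line = f"mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST ..."
    new_line = f"mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST ..."
    print(old_line)  # mpirun -n 2 -ppn 4 ...  -> only 2 ranks in total
    print(new_line)  # mpirun -n 8 -ppn 4 ...  -> 4 ranks on each of 2 nodes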
@@ -465,7 +466,7 @@
 " -env I_MPI_HYDRA_DEBUG=on \\\n",
 " -env DISTRIBUTED=True \\\n",
 " {FAKE} \\\n",
-" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n",
+" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py\"\n",
 " },\n",
 " \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
 " \"inputDirectories\": [{\n",
@@ -488,7 +489,7 @@
 " }],\n",
 " \"containerSettings\": {\n",
 " \"imageSourceRegistry\": {\n",
-" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
+" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-keras\"\n",
 " }\n",
 " }\n",
 " }\n",
@@ -510,7 +511,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
+"JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
 ]
 },
 {
@@ -523,7 +524,9 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {},
+"metadata": {
+"scrolled": true
+},
 "outputs": [],
 "source": [
 "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
@@ -571,7 +574,9 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {},
+"metadata": {
+"scrolled": true
+},
 "outputs": [],
 "source": [
 "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"

@@ -4,8 +4,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Train Tensorflow Model Distributed on Batch AI\n",
-"In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
+"# Train PyTorch Model Distributed on Batch AI\n",
+"In this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
 " * [Create Azure Resources](#azure_resources)\n",
 " * [Create Fileserver(NFS)](#create_fileshare)\n",
 " * [Configure Batch AI Cluster](#configure_cluster)\n",
@@ -15,7 +15,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -26,7 +26,7 @@
 "from getpass import getpass\n",
 "import os\n",
 "import json\n",
-"from utils import get_password, write_json_to_file"
+"from utils import get_password, write_json_to_file, dotenv_for"
 ]
 },
 {
@@ -49,7 +49,7 @@
 "outputs": [],
 "source": [
 "# Variables for Batch AI - change as necessary\n",
-"ID = \"ddtf2\"\n",
+"ID = \"ddpytorch\"\n",
 "GROUP_NAME = f\"batch{ID}rg\"\n",
 "STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n",
 "FILE_SHARE_NAME = f\"batch{ID}share\"\n",
@@ -62,7 +62,7 @@
 "PROCESSES_PER_NODE = 4\n",
 "LOCATION = \"eastus\"\n",
 "NFS_NAME = f\"batch{ID}nfs\"\n",
-"EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n",
+"EXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\n",
 "USERNAME = \"batchai_user\"\n",
 "USE_FAKE = False\n",
 "DOCKERHUB = \"caia\" #\"<YOUR DOCKERHUB>\""
@@ -74,7 +74,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"FAKE='-env FAKE=True' if USE_FAKE else ''"
+"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
+"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 ]
 },
 {
@@ -244,7 +245,7 @@
 "outputs": [],
 "source": [
 "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n",
-"-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS"
+"-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
 ]
 },
 {
@@ -324,7 +325,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
+"!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
 ]
 },
 {
@@ -333,7 +334,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
+"!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
 ]
 },
 {
@@ -367,9 +368,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n",
-"!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n",
-"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
+"!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n",
+"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts\n",
+"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/docker.service --path scripts\n",
+"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/nodeprep.sh --path scripts"
 ]
 },
 {
@@ -394,11 +396,12 @@
 " --afs-name $FILE_SHARE_NAME \\\n",
 " --afs-mount-path extfs \\\n",
 " --user-name $USERNAME \\\n",
-" --password {get_password(dotenv_path)} \\\n",
+" --password {get_password(dotenv_for())} \\\n",
 " --storage-account-name $STORAGE_ACCOUNT_NAME \\\n",
 " --storage-account-key $storage_account_key \\\n",
 " --nfs $NFS_NAME \\\n",
-" --nfs-mount-path nfs "
+" --nfs-mount-path nfs \\\n",
+" --config-file cluster_config/cluster.json"
 ]
 },
 {
@@ -455,17 +458,21 @@
 " \"properties\": {\n",
 " \"nodeCount\": NUM_NODES,\n",
 " \"customToolkitSettings\": {\n",
-" \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
-" echo $AZ_BATCH_HOST_LIST; \\\n",
-" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
-" -env I_MPI_FABRICS=dapl \\\n",
-" -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
-" -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
-" -env I_MPI_DEBUG=6 \\\n",
-" -env I_MPI_HYDRA_DEBUG=on \\\n",
-" -env DISTRIBUTED=True \\\n",
+" \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n",
+" cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n",
+" mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n",
+" -bind-to none -map-by slot \\\n",
+" -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n",
+" -mca btl_tcp_if_include eth0 \\\n",
+" -x NCCL_SOCKET_IFNAME=eth0 \\\n",
+" -mca btl ^openib \\\n",
+" -x NCCL_IB_DISABLE=1 \\\n",
+" -x DISTRIBUTED=True \\\n",
+" -x AZ_BATCHAI_INPUT_TRAIN \\\n",
+" -x AZ_BATCHAI_INPUT_TEST \\\n",
+" --allow-run-as-root \\\n",
 " {FAKE} \\\n",
-" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n",
+" python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n",
 " },\n",
 " \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
 " \"inputDirectories\": [{\n",
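The PyTorch job switches from Intel MPI to an Open MPI style launch: environment variables are forwarded to every rank with -x instead of -env, NCCL is pointed at eth0, and InfiniBand is disabled for NCCL. The training script itself is not part of this diff; a hypothetical sketch of how such a script would typically read the forwarded variables:

    # Hypothetical sketch only -- imagenet_pytorch_horovod.py is not shown in this diff.
    import os

    distributed = os.environ.get("DISTRIBUTED", "False") == "True"  # forwarded via -x DISTRIBUTED=True
    train_dir = os.environ.get("AZ_BATCHAI_INPUT_TRAIN")            # forwarded via -x AZ_BATCHAI_INPUT_TRAIN
    test_dir = os.environ.get("AZ_BATCHAI_INPUT_TEST")              # forwarded via -x AZ_BATCHAI_INPUT_TEST
    print(distributed, train_dir, test_dir)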
@@ -488,7 +495,7 @@
 " }],\n",
 " \"containerSettings\": {\n",
 " \"imageSourceRegistry\": {\n",
-" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
+" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-pytorch\"\n",
 " }\n",
 " }\n",
 " }\n",
@@ -510,7 +517,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
+"JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
 ]
 },
 {
@@ -580,7 +587,9 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {},
+"metadata": {
+"scrolled": true
+},
 "outputs": [],
 "source": [
 "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"

@@ -74,7 +74,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"FAKE='-env FAKE=True' if USE_FAKE else ''"
+"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
+"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
 ]
 },
 {
@@ -457,7 +458,7 @@
 " \"customToolkitSettings\": {\n",
 " \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n",
 " echo $AZ_BATCH_HOST_LIST; \\\n",
-" mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
+" mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n",
 " -env I_MPI_FABRICS=dapl \\\n",
 " -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n",
 " -env I_MPI_DYNAMIC_CONNECTION=0 \\\n",
@@ -488,7 +489,7 @@
 " }],\n",
 " \"containerSettings\": {\n",
 " \"imageSourceRegistry\": {\n",
-" \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
+" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-tf\"\n",
 " }\n",
 " }\n",
 " }\n",