diff --git a/HorovodKeras/train_keras_model.ipynb b/HorovodKeras/train_keras_model.ipynb index 88b4672..30148e2 100644 --- a/HorovodKeras/train_keras_model.ipynb +++ b/HorovodKeras/train_keras_model.ipynb @@ -4,8 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Train Tensorflow Model Distributed on Batch AI\n", - "In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n", + "# Train Keras Model Distributed on Batch AI\n", + "In this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n", " * [Create Azure Resources](#azure_resources)\n", " * [Create Fileserver(NFS)](#create_fileshare)\n", " * [Configure Batch AI Cluster](#configure_cluster)\n", @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ "from getpass import getpass\n", "import os\n", "import json\n", - "from utils import get_password, write_json_to_file" + "from utils import get_password, write_json_to_file, dotenv_for" ] }, { @@ -49,7 +49,7 @@ "outputs": [], "source": [ "# Variables for Batch AI - change as necessary\n", - "ID = \"ddtf2\"\n", + "ID = \"ddkeras\"\n", "GROUP_NAME = f\"batch{ID}rg\"\n", "STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n", "FILE_SHARE_NAME = f\"batch{ID}share\"\n", @@ -62,7 +62,7 @@ "PROCESSES_PER_NODE = 4\n", "LOCATION = \"eastus\"\n", "NFS_NAME = f\"batch{ID}nfs\"\n", - "EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n", + "EXPERIMENT = f\"distributed_keras_{GPU_TYPE}\"\n", "USERNAME = \"batchai_user\"\n", "USE_FAKE = False\n", "DOCKERHUB = \"caia\" #\"\"" @@ -74,7 +74,8 @@ "metadata": {}, "outputs": [], "source": [ - "FAKE='-env FAKE=True' if USE_FAKE else ''" + "FAKE='-env FAKE=True' if USE_FAKE else ''\n", + "TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" ] }, { @@ -244,7 +245,7 @@ "outputs": [], "source": [ "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n", - "-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS" + "-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS" ] }, { @@ -324,7 +325,7 @@ "metadata": {}, "outputs": [], "source": [ - "!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/" + "!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/" ] }, { @@ -333,7 +334,7 @@ "metadata": {}, "outputs": [], "source": [ - "!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\"" + "!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\"" ] }, { @@ -367,8 +368,8 @@ "metadata": {}, "outputs": [], "source": [ - "!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n", - "!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n", + "!az storage file upload 
--share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n", + "!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n", "!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts" ] }, @@ -394,7 +395,7 @@ " --afs-name $FILE_SHARE_NAME \\\n", " --afs-mount-path extfs \\\n", " --user-name $USERNAME \\\n", - " --password {get_password(dotenv_path)} \\\n", + " --password {get_password(dotenv_for())} \\\n", " --storage-account-name $STORAGE_ACCOUNT_NAME \\\n", " --storage-account-key $storage_account_key \\\n", " --nfs $NFS_NAME \\\n", @@ -457,7 +458,7 @@ " \"customToolkitSettings\": {\n", " \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n", " echo $AZ_BATCH_HOST_LIST; \\\n", - " mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n", + " mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n", " -env I_MPI_FABRICS=dapl \\\n", " -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n", " -env I_MPI_DYNAMIC_CONNECTION=0 \\\n", @@ -465,7 +466,7 @@ " -env I_MPI_HYDRA_DEBUG=on \\\n", " -env DISTRIBUTED=True \\\n", " {FAKE} \\\n", - " python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n", + " python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py\"\n", " },\n", " \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n", " \"inputDirectories\": [{\n", @@ -488,7 +489,7 @@ " }],\n", " \"containerSettings\": {\n", " \"imageSourceRegistry\": {\n", - " \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n", + " \"image\": f\"{DOCKERHUB}/distributed-training.horovod-keras\"\n", " }\n", " }\n", " }\n", @@ -510,7 +511,7 @@ "metadata": {}, "outputs": [], "source": [ - "JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" + "JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" ] }, { @@ -523,7 +524,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json" @@ -571,7 +574,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt" diff --git a/HorovodPytorch/train_pytorch_model.ipynb b/HorovodPytorch/train_pytorch_model.ipynb index 88b4672..9d837b7 100644 --- a/HorovodPytorch/train_pytorch_model.ipynb +++ b/HorovodPytorch/train_pytorch_model.ipynb @@ -4,8 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Train Tensorflow Model Distributed on Batch AI\n", - "In this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n", + "# Train PyTorch Model Distributed on Batch AI\n", + "In this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. 
This tutorial will take you through the following steps:\n", " * [Create Azure Resources](#azure_resources)\n", " * [Create Fileserver(NFS)](#create_fileshare)\n", " * [Configure Batch AI Cluster](#configure_cluster)\n", @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ "from getpass import getpass\n", "import os\n", "import json\n", - "from utils import get_password, write_json_to_file" + "from utils import get_password, write_json_to_file, dotenv_for" ] }, { @@ -49,7 +49,7 @@ "outputs": [], "source": [ "# Variables for Batch AI - change as necessary\n", - "ID = \"ddtf2\"\n", + "ID = \"ddpytorch\"\n", "GROUP_NAME = f\"batch{ID}rg\"\n", "STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n", "FILE_SHARE_NAME = f\"batch{ID}share\"\n", @@ -62,7 +62,7 @@ "PROCESSES_PER_NODE = 4\n", "LOCATION = \"eastus\"\n", "NFS_NAME = f\"batch{ID}nfs\"\n", - "EXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\n", + "EXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\n", "USERNAME = \"batchai_user\"\n", "USE_FAKE = False\n", "DOCKERHUB = \"caia\" #\"\"" ] }, { @@ -74,7 +74,8 @@ "metadata": {}, "outputs": [], "source": [ - "FAKE='-env FAKE=True' if USE_FAKE else ''" + "FAKE='-x FAKE=True' if USE_FAKE else ''\n", + "TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" ] }, { @@ -244,7 +245,7 @@ "outputs": [], "source": [ "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n", - "-s Standard_DS4_v2 -u $USERNAME -p {get_password(find_dotenv())} -g $GROUP_NAME --storage-sku Premium_LRS" + "-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS" ] }, { @@ -324,7 +325,7 @@ "metadata": {}, "outputs": [], "source": [ - "!sshpass -p {get_password(dotenv_path)} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/" + "!sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/" ] }, { @@ -333,7 +334,7 @@ "metadata": {}, "outputs": [], "source": [ - "!sshpass -p {get_password(dotenv_path)} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\"" + "!sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\"" ] }, { @@ -367,9 +368,10 @@ "metadata": {}, "outputs": [], "source": [ - "!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/imagenet_estimator_tf_horovod.py --path scripts\n", - "!az storage file upload --share-name $FILE_SHARE_NAME --source ../src/resnet_model.py --path scripts\n", - "!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts" + "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n", + "!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts\n", + "!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/docker.service --path scripts\n", + "!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/nodeprep.sh --path scripts" ] }, { @@ -394,11 +396,12 @@ " --afs-name $FILE_SHARE_NAME \\\n", " --afs-mount-path extfs \\\n", " --user-name $USERNAME \\\n", - " --password {get_password(dotenv_path)} \\\n", + " --password {get_password(dotenv_for())} \\\n", " --storage-account-name $STORAGE_ACCOUNT_NAME \\\n", " --storage-account-key 
$storage_account_key \\\n", " --nfs $NFS_NAME \\\n", - " --nfs-mount-path nfs " + " --nfs-mount-path nfs \\\n", + " --config-file cluster_config/cluster.json" ] }, { @@ -455,17 +458,21 @@ " \"properties\": {\n", " \"nodeCount\": NUM_NODES,\n", " \"customToolkitSettings\": {\n", - " \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n", - " echo $AZ_BATCH_HOST_LIST; \\\n", - " mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n", - " -env I_MPI_FABRICS=dapl \\\n", - " -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n", - " -env I_MPI_DYNAMIC_CONNECTION=0 \\\n", - " -env I_MPI_DEBUG=6 \\\n", - " -env I_MPI_HYDRA_DEBUG=on \\\n", - " -env DISTRIBUTED=True \\\n", + " \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n", + " cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n", + " mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n", + " -bind-to none -map-by slot \\\n", + " -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n", + " -mca btl_tcp_if_include eth0 \\\n", + " -x NCCL_SOCKET_IFNAME=eth0 \\\n", + " -mca btl ^openib \\\n", + " -x NCCL_IB_DISABLE=1 \\\n", + " -x DISTRIBUTED=True \\\n", + " -x AZ_BATCHAI_INPUT_TRAIN \\\n", + " -x AZ_BATCHAI_INPUT_TEST \\\n", + " --allow-run-as-root \\\n", " {FAKE} \\\n", - " python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n", + " python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n", " },\n", " \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n", " \"inputDirectories\": [{\n", @@ -488,7 +495,7 @@ " }],\n", " \"containerSettings\": {\n", " \"imageSourceRegistry\": {\n", - " \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n", + " \"image\": f\"{DOCKERHUB}/distributed-training.horovod-pytorch\"\n", " }\n", " }\n", " }\n", @@ -510,7 +517,7 @@ "metadata": {}, "outputs": [], "source": [ - "JOB_NAME='tf-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" + "JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)" ] }, { @@ -580,7 +587,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt" diff --git a/HorovodTF/train_tensorflow_model.ipynb b/HorovodTF/train_tensorflow_model.ipynb index 88b4672..4ffbabe 100644 --- a/HorovodTF/train_tensorflow_model.ipynb +++ b/HorovodTF/train_tensorflow_model.ipynb @@ -74,7 +74,8 @@ "metadata": {}, "outputs": [], "source": [ - "FAKE='-env FAKE=True' if USE_FAKE else ''" + "FAKE='-env FAKE=True' if USE_FAKE else ''\n", + "TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES" ] }, { @@ -457,7 +458,7 @@ " \"customToolkitSettings\": {\n", " \"commandLine\": f\"source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; \\\n", " echo $AZ_BATCH_HOST_LIST; \\\n", - " mpirun -n {NUM_NODES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n", + " mpirun -n {TOTAL_PROCESSES} -ppn {PROCESSES_PER_NODE} -hosts $AZ_BATCH_HOST_LIST \\\n", " -env I_MPI_FABRICS=dapl \\\n", " -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 \\\n", " -env I_MPI_DYNAMIC_CONNECTION=0 \\\n", @@ -488,7 +489,7 @@ " }],\n", " \"containerSettings\": {\n", " \"imageSourceRegistry\": {\n", - " \"image\": \"{DOCKERHUB}/distributed-training.horovod-tf\"\n", + " \"image\": f\"{DOCKERHUB}/distributed-training.horovod-tf\"\n", " }\n", " }\n", " }\n",
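
Note on the training scripts: the files invoked above (imagenet_keras_horovod.py, imagenet_pytorch_horovod.py, imagenet_estimator_tf_horovod.py) are not part of this diff. For orientation, below is a minimal sketch of the Horovod pattern such a PyTorch script would follow under the Open MPI command in the PyTorch job above. It is illustrative only, assumes horovod, torch and torchvision are installed, and feeds synthetic batches (cf. USE_FAKE) instead of the ImageNet directories mounted at $AZ_BATCHAI_INPUT_TRAIN/$AZ_BATCHAI_INPUT_TEST.

    # Illustrative sketch, not the repository's imagenet_pytorch_horovod.py.
    import torch
    import torch.nn.functional as F
    import torchvision.models as models
    import horovod.torch as hvd

    hvd.init()                               # one process per GPU, launched by mpirun
    torch.cuda.set_device(hvd.local_rank())  # pin each process to its local GPU

    model = models.resnet50().cuda()
    # Scale the base learning rate by the worker count, the usual Horovod convention.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1 * hvd.size(), momentum=0.9)
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())

    # Start every worker from identical weights and optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    for step in range(5):                    # stand-in for the real ImageNet epoch loop
        images = torch.randn(32, 3, 224, 224).cuda()  # synthetic batch (cf. FAKE env var)
        labels = torch.randint(0, 1000, (32,)).cuda()
        optimizer.zero_grad()
        loss = F.cross_entropy(model(images), labels)
        loss.backward()                      # gradients are allreduced across workers here
        optimizer.step()
        if hvd.rank() == 0:
            print(f"step {step} loss {loss.item():.3f}")

Each of the NUM_NODES * PROCESSES_PER_NODE processes started by mpirun runs this same script, and Horovod averages gradients during backward; that is why the jobs pass TOTAL_PROCESSES, not NUM_NODES, to mpirun's -np/-n.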