Updates notebooks
Tested the PyTorch path end to end; both fake and real data confirmed as working. Added tags for stripping out information.
Parent
e58e0d8587
Commit
7b3f645a4a
|
@ -1,5 +1,15 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Processing\n",
|
||||
"In this notebook we convert the ImageNet data to the appropriate format so that we can use it for training.\n",
|
||||
"\n",
|
||||
"The dataset has many versions, the one commonly used for image classification is ILSVRC 2012. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account), and find the page for ILSVRC2012. You will need to download two files ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -15,25 +25,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DATA=Path(\"/data/imagenet\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!rm -r {DATA / \"train.tar.gz\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!ls {DATA}"
|
||||
"DATA=Path(\"/data\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -46,15 +38,6 @@
|
|||
"!tar -C {DATA/\"train\"} -xf {DATA/\"ILSVRC2012_img_train.tar\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install tqdm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -105,6 +88,13 @@
|
|||
"!tar -C {DATA/\"validation\"} -xf {DATA/\"ILSVRC2012_img_val.tar\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The validation data comes without labels so wee ned to run a script to asign the images to the appropriate classes."
|
||||
]
|
||||
},
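The preparation script (valprep.sh) essentially moves each validation image into a folder named after its ImageNet class. A minimal Python sketch of the same idea (the mapping file name val_labels.txt and its "<filename> <wnid>" line format are illustrative assumptions, not part of the original script):

import shutil
from pathlib import Path

validation_path = Path("/data/validation")
with open("val_labels.txt") as mapping:           # hypothetical "<filename> <wnid>" pairs
    for line in mapping:
        filename, wnid = line.split()
        class_dir = validation_path / wnid
        class_dir.mkdir(exist_ok=True)            # one folder per class
        shutil.move(str(validation_path / filename), str(class_dir / filename))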
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -124,6 +114,13 @@
|
|||
"!bash -c \"cd {validation_path} && {validation_preparation_script}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally we package the processed directories so that we can upload them quicker."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -141,36 +138,17 @@
|
|||
"source": [
|
||||
"!cd {DATA} && tar -czvf validation.tar.gz validation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"SUBSCRIPTION=\"Boston Team Danielle\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!az login -o table"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!az account set -s ${SUBSCRIPTION}"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"jupytext": {
|
||||
"text_representation": {
|
||||
"extension": ".py",
|
||||
"format_name": "light",
|
||||
"format_version": "1.3",
|
||||
"jupytext_version": "0.8.6"
|
||||
}
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
File diff suppressed because one or more lines are too long
|
@ -13,15 +13,15 @@
|
|||
# ---
|
||||
|
||||
# # Create Azure and Batch AI Resources
|
||||
# In this notebook we will create the necessary resources to train a ResNet50 model([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. If you plan on using fake data then the sections marked optional can be skipped. This notebook will take you through the following steps:
|
||||
# In this notebook we will create the necessary resources to train a ResNet50 model([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the ImageNet dataset. If you plan on using fake data then the sections marked optional can be skipped. This notebook will take you through the following steps:
|
||||
# * [Create Azure Resources](#azure_resources)
|
||||
# * [Create Fileserver(NFS)(Optional)](#create_fileshare)
|
||||
# * [Create Fileserver(NFS)](#create_fileshare)
|
||||
# * [Upload Data to Blob (Optional)](#upload_data)
|
||||
# * [Configure Batch AI Cluster](#configure_cluster)
|
||||
|
||||
# +
|
||||
import sys
|
||||
sys.path.append("../common")
|
||||
sys.path.append("common")
|
||||
|
||||
from dotenv import set_key
|
||||
import os
|
||||
|
@ -40,7 +40,7 @@ ID = "dtdemo"
|
|||
GROUP_NAME = f"batch{ID}rg"
|
||||
STORAGE_ACCOUNT_NAME = f"batch{ID}st"
|
||||
FILE_SHARE_NAME = f"batch{ID}share"
|
||||
SELECTED_SUBSCRIPTION = "Boston Team Danielle" #"<YOUR SUBSCRIPTION>"
|
||||
SELECTED_SUBSCRIPTION = "Boston Team Danielle"
|
||||
WORKSPACE = "workspace"
|
||||
NUM_NODES = 2
|
||||
CLUSTER_NAME = "msv100"
|
||||
|
@ -50,34 +50,39 @@ PROCESSES_PER_NODE = 4
|
|||
LOCATION = "eastus"
|
||||
NFS_NAME = f"batch{ID}nfs"
|
||||
USERNAME = "batchai_user"
|
||||
USE_FAKE = True
|
||||
DOCKERHUB = os.getenv('DOCKER_REPOSITORY', "masalvar") #"<YOUR DOCKERHUB>"
|
||||
USE_FAKE = False
|
||||
DOCKERHUB = os.getenv('DOCKER_REPOSITORY', "masalvar")
|
||||
DATA = Path("/data")
|
||||
CONTAINER_NAME = f"batch{ID}container"
|
||||
DOCKER_PWD = ""
|
||||
DOCKER_PWD = "<YOUR_DOCKER_PWD>"
|
||||
|
||||
dotenv_path = dotenv_for()
|
||||
set_key(dotenv_path, 'DOCKER_PWD', DOCKER_PWD)
|
||||
set_key(dotenv_path, 'GROUP_NAME', GROUP_NAME)
|
||||
set_key(dotenv_path, 'FILE_SHARE_NAME', FILE_SHARE_NAME)
|
||||
set_key(dotenv_path, 'WORKSPACE', WORKSPACE)
|
||||
set_key(dotenv_path, 'NUM_NODES', NUM_NODES)
|
||||
set_key(dotenv_path, 'NUM_NODES', str(NUM_NODES))
|
||||
set_key(dotenv_path, 'CLUSTER_NAME', CLUSTER_NAME)
|
||||
set_key(dotenv_path, 'GPU_TYPE', GPU_TYPE)
|
||||
set_key(dotenv_path, 'PROCESSES_PER_NODE', PROCESSES_PER_NODE)
|
||||
set_key(dotenv_path, 'PROCESSES_PER_NODE', str(PROCESSES_PER_NODE))
|
||||
set_key(dotenv_path, 'STORAGE_ACCOUNT_NAME', STORAGE_ACCOUNT_NAME)
|
||||
# -
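# A note on the str() casts above: python-dotenv's set_key writes values into a plain-text
# .env file, so numeric values such as NUM_NODES are stored as strings and have to be parsed
# back when read, as the training notebook later does with get_key. A minimal sketch of the
# round trip:

# +
from dotenv import set_key, get_key
set_key(dotenv_path, 'NUM_NODES', str(NUM_NODES))            # written as text
assert int(get_key(dotenv_path, 'NUM_NODES')) == NUM_NODES   # parsed back to an int
# -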
|
||||
|
||||
# <a id='azure_resources'></a>
|
||||
# ## Create Azure Resources
|
||||
# First we need to log in to our Azure account.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az login -o table
|
||||
# -
|
||||
|
||||
# If you have more than one Azure account you will need to select it with the command below. If you only have one account you can skip this step.
|
||||
|
||||
!az account set --subscription "$SELECTED_SUBSCRIPTION"
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az account list -o table
|
||||
# -
|
||||
|
||||
# Next we create the group that will hold all our Azure resources.
|
||||
|
||||
|
@ -103,38 +108,52 @@ storage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not
|
|||
!az configure --defaults location=$LOCATION
|
||||
!az configure --defaults group=$GROUP_NAME
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
# %env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME
|
||||
# %env AZURE_STORAGE_KEY=$storage_account_key
|
||||
# -
|
||||
|
||||
# #### Create Workspace
|
||||
# Batch AI has the concept of workspaces and experiments. Below we will create the workspace for our work.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az batchai workspace create -n $WORKSPACE -g $GROUP_NAME
|
||||
# -
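# Experiments live under the workspace we just created. They are created later, in the
# training notebook, with a command along the following lines (EXPERIMENT is defined there,
# so this is shown only for reference):
#
# !az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE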
|
||||
|
||||
# <a id='upload_data'></a>
|
||||
# ## Upload Data to Blob (Optional)
|
||||
# In this section we will create a blob container and upload the ImageNet data we prepared locally in the previous notebook.
|
||||
if USE_FAKE:
|
||||
raise Warning("You should not be running this section if you simply want to use fake data")
|
||||
!az storage container create --account-name {STORAGE_ACCOUNT_NAME} \
|
||||
--account-key {storage_account_key} \
|
||||
--name {CONTAINER_NAME}
|
||||
#
|
||||
# **You only need to run this section if you want to use real data. If USE_FAKE is set to True the commands below won't be executed.**
|
||||
#
|
||||
|
||||
# Should take about 20 minnutes
|
||||
!azcopy --source {DATA/"train.tar.gz"} \
|
||||
--destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \
|
||||
--dest-key {storage_account_key} --quiet
|
||||
if USE_FAKE is False:
|
||||
!az storage container create --account-name {STORAGE_ACCOUNT_NAME} \
|
||||
--account-key {storage_account_key} \
|
||||
--name {CONTAINER_NAME}
|
||||
|
||||
!azcopy --source {DATA/"validation.tar.gz"} \
|
||||
--destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \
|
||||
--dest-key {storage_account_key} --quiet
|
||||
# + {"tags": ["stripout"]}
|
||||
if USE_FAKE is False:
|
||||
# Should take about 20 minutes
|
||||
!azcopy --source {DATA/"train.tar.gz"} \
|
||||
--destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \
|
||||
--dest-key {storage_account_key} --quiet
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
if USE_FAKE is False:
|
||||
!azcopy --source {DATA/"validation.tar.gz"} \
|
||||
--destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \
|
||||
--dest-key {storage_account_key} --quiet
|
||||
# -
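# As an optional sanity check you can list the blobs in the container once the copies
# finish; this assumes the AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY environment
# variables set earlier in this notebook are still in effect.

# + {"tags": ["stripout"]}
if USE_FAKE is False:
    !az storage blob list --container-name {CONTAINER_NAME} -o table
# -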
|
||||
|
||||
# <a id='create_fileshare'></a>
|
||||
# ## Create Fileserver (Optional)
|
||||
# In this example we will store the data on an NFS fileshare. It is possible to use many storage solutions with Batch AI. NFS offers the best traideoff between performance and ease of use. The best performance is achieved by loading the data locally but this can be cumbersome since it requires that the data is download by the all the nodes which with the imagenet dataset can take hours.
|
||||
# ## Create Fileserver
|
||||
# In this example we will store the data on an NFS fileshare. It is possible to use many storage solutions with Batch AI; NFS offers the best tradeoff between performance and ease of use. The best performance is achieved by loading the data locally on each node, but this can be cumbersome since it requires that the data be downloaded by all the nodes, which with the ImageNet dataset can take hours. If you are using fake data we won't use the fileserver, but we create one anyway so that the server is ready if you want to run on the real ImageNet data later.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \
|
||||
-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS
|
||||
# -
|
||||
|
||||
!az batchai file-server list -o table -w $WORKSPACE -g $GROUP_NAME
|
||||
|
||||
|
@ -176,16 +195,25 @@ with open('nodeprep.sh', 'w') as f:
|
|||
if USE_FAKE:
|
||||
raise Warning("You should not be running this section if you simply want to use fake data")
|
||||
|
||||
!sshpass -p {get_password(dotenv_for())} scp -o "StrictHostKeyChecking=no" nodeprep.sh $USERNAME@{nfs_ip}:~/
|
||||
# + {"tags": ["stripout"]}
|
||||
if USE_FAKE is False:
|
||||
!sshpass -p {get_password(dotenv_for())} scp -o "StrictHostKeyChecking=no" nodeprep.sh $USERNAME@{nfs_ip}:~/
|
||||
|
||||
!sshpass -p {get_password(dotenv_for())} ssh -o "StrictHostKeyChecking=no" $USERNAME@{nfs_ip} "sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh"
|
||||
# + {"tags": ["stripout"]}
|
||||
if USE_FAKE is False:
|
||||
!sshpass -p {get_password(dotenv_for())} ssh -o "StrictHostKeyChecking=no" $USERNAME@{nfs_ip} "sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh"
|
||||
# -
|
||||
|
||||
# <a id='configure_cluster'></a>
|
||||
# ## Configure Batch AI Cluster
|
||||
# We then upload the scripts we wish to execute onto the fileshare. The fileshare will later be mounted by Batch AI. An alternative to uploading the scripts would be to embedd them inside the Docker container.
|
||||
# We then upload the scripts we wish to execute onto the fileshare. The fileshare will later be mounted by Batch AI. An alternative to uploading the scripts would be to embed them inside the Docker image.
|
||||
|
||||
# Below is the command to create the cluster.
|
||||
!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/docker.service --path scripts
|
||||
!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/nodeprep.sh --path scripts
|
||||
|
||||
# Below is the command to create the cluster.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az batchai cluster create \
|
||||
-w $WORKSPACE \
|
||||
--name $CLUSTER_NAME \
|
||||
|
@ -200,11 +228,14 @@ if USE_FAKE:
|
|||
--storage-account-key $storage_account_key \
|
||||
--nfs $NFS_NAME \
|
||||
--nfs-mount-path nfs \
|
||||
--config-file cluster_config/cluster.json
|
||||
--config-file HorovodPytorch/cluster_config/cluster.json
|
||||
# -
|
||||
|
||||
# Let's check that the cluster was created successfully.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
!az batchai cluster show -n $CLUSTER_NAME -w $WORKSPACE
|
||||
# -
|
||||
|
||||
!az batchai cluster list -w $WORKSPACE -o table
|
||||
|
||||
|
|
|
@ -1,57 +0,0 @@
|
|||
# ---
|
||||
# jupyter:
|
||||
# jupytext:
|
||||
# text_representation:
|
||||
# extension: .py
|
||||
# format_name: light
|
||||
# format_version: '1.3'
|
||||
# jupytext_version: 0.8.6
|
||||
# kernelspec:
|
||||
# display_name: Python 3
|
||||
# language: python
|
||||
# name: python3
|
||||
# ---
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
DATA=Path("/data")
|
||||
|
||||
!rm -r {DATA / "train.tar.gz"}
|
||||
|
||||
!ls {DATA}
|
||||
|
||||
!mkdir -p {DATA/"train"}
|
||||
!tar -C {DATA/"train"} -xf {DATA/"ILSVRC2012_img_train.tar"}
|
||||
|
||||
!pip install tqdm
|
||||
|
||||
import tarfile
|
||||
from tqdm import tqdm_notebook
|
||||
import os
|
||||
|
||||
filenames = list((DATA/"train").glob("*.tar"))
|
||||
pbar = tqdm_notebook(total=len(filenames))
|
||||
for class_tar in filenames:
|
||||
pbar.set_description('Extracting '+class_tar.name+ '...')
|
||||
class_dir = os.path.splitext(class_tar)[0]
|
||||
os.mkdir(class_dir)
|
||||
with tarfile.open(class_tar) as f:
|
||||
f.extractall(class_dir)
|
||||
os.remove(class_tar)
|
||||
pbar.update(1)
|
||||
|
||||
!rm -r {DATA/"validation"}
|
||||
|
||||
!mkdir -p {DATA/"validation"}
|
||||
!tar -C {DATA/"validation"} -xf {DATA/"ILSVRC2012_img_val.tar"}
|
||||
|
||||
validation_path = DATA/"validation"
|
||||
validation_preparation_script = Path(os.getcwd())/"valprep.sh"
|
||||
|
||||
!bash -c "cd {validation_path} && {validation_preparation_script}"
|
||||
|
||||
!cd {DATA} && tar -czvf train.tar.gz train
|
||||
|
||||
!cd {DATA} && tar -czvf validation.tar.gz validation
|
||||
|
||||
|
|
@ -5,23 +5,23 @@
|
|||
"metadata": {},
|
||||
"source": [
|
||||
"# Create Docker Image for PyTorch\n",
|
||||
"In this notebook we will create the image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming."
|
||||
"In this notebook we will create the Docker image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n",
|
||||
" \n",
|
||||
"**You will need to be running everything on a GPU enabled VM to run this notebook.** "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"../common\") \n",
|
||||
"\n",
|
||||
"from dotenv import dotenv_values, set_key, find_dotenv, get_key\n",
|
||||
"from getpass import getpass\n",
|
||||
"from dotenv import get_key\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from utils import get_password, write_json_to_file, dotenv_for\n",
|
||||
"from utils import dotenv_for\n",
|
||||
"import docker"
|
||||
]
|
||||
},
|
||||
|
@ -29,14 +29,12 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.\n",
|
||||
"\n",
|
||||
"Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
|
||||
"We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"execution_count": 22,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
|
@ -44,14 +42,16 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"USE_FAKE = False\n",
|
||||
"DOCKERHUB = \"masalvar\" #\"<YOUR DOCKERHUB>\"\n",
|
||||
"NUM_PROCESSES = 2"
|
||||
"dotenv_path = dotenv_for()\n",
|
||||
"USE_FAKE = True\n",
|
||||
"DOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\n",
|
||||
"NUM_PROCESSES = 2\n",
|
||||
"DOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -60,7 +60,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -70,27 +70,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'masalvar/caia-horovod-pytorch:latest'"
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"image.tags[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -118,7 +98,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -136,9 +116,20 @@
|
|||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"stripout"
|
||||
]
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
@ -147,43 +138,49 @@
|
|||
"INFO:__main__:0: Runnin Distributed\n",
|
||||
"INFO:__main__:1: Runnin Distributed\n",
|
||||
"INFO:__main__:0: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:0: Setting up loaders\n",
|
||||
"INFO:__main__:0: Setting up fake loaders\n",
|
||||
"INFO:__main__:1: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:1: Setting up loaders\n",
|
||||
"INFO:__main__:0: Loading model\n",
|
||||
"INFO:__main__:1: Setting up fake loaders\n",
|
||||
"INFO:__main__:1: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:1: Loading model\n",
|
||||
"INFO:__main__:0: Training ...\n",
|
||||
"INFO:__main__:0: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:0: Loading model\n",
|
||||
"INFO:__main__:1: Training ...\n",
|
||||
"INFO:__main__:0: Training ...\n",
|
||||
"\n",
|
||||
"4f92c5ed3cd3:13:68 [0] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Using internal Network Socket\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO NET : Using interface eth0:172.17.0.3<0>\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO NET/Socket : 1 interfaces found\n",
|
||||
"41afbf31e948:13:65 [0] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]\n",
|
||||
"41afbf31e948:13:65 [0] INFO Using internal Network Socket\n",
|
||||
"41afbf31e948:13:65 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"41afbf31e948:13:65 [0] INFO NET : Using interface eth0:172.17.0.3<0>\n",
|
||||
"41afbf31e948:13:65 [0] INFO NET/Socket : 1 interfaces found\n",
|
||||
"NCCL version 2.2.13+cuda9.0\n",
|
||||
"\n",
|
||||
"4f92c5ed3cd3:14:65 [1] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO Using internal Network Socket\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"Unexpected end of /proc/mounts line `overlay / overlay rw,relatime,lowerdir=/var/lib/docker/overlay2/l/GTTEVW365EQA2GDOGZVOD37J32:/var/lib/docker/overlay2/l/P3UTSC3GD5U2KE5CZH4M2NHMBG:/var/lib/docker/overlay2/l/VD7EUCZSMECTFOW7IIPSCEX4JM:/var/lib/docker/overlay2/l/JYXYPILZXXAYHR7WRVQZCD6K3Q:/var/lib/docker/overlay2/l/WVA3JNFTU4ODUE62JCC7AVMUM6:/var/lib/docker/overlay2/l/MRY4XN6FMUFF4BQUPGJ7YLLHRP:/var/lib/docker/overlay2/l/ITJFFFEWFL7E7OGEEZKTF6N7LD:/var/lib/docker/overlay2/l/FQDWWHY3VM74QBLUFN5CF5IDYO:/var/lib/docker/overlay2/l/U5OVVSIWE2MDZ'\n",
|
||||
"41afbf31e948:14:66 [1] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]\n",
|
||||
"41afbf31e948:14:66 [1] INFO Using internal Network Socket\n",
|
||||
"41afbf31e948:14:66 [1] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"Unexpected end of /proc/mounts line `overlay / overlay rw,relatime,lowerdir=/var/lib/docker/overlay2/l/DK62GMJ7YWD352N7EBN6Z7NW4X:/var/lib/docker/overlay2/l/P3UTSC3GD5U2KE5CZH4M2NHMBG:/var/lib/docker/overlay2/l/VD7EUCZSMECTFOW7IIPSCEX4JM:/var/lib/docker/overlay2/l/JYXYPILZXXAYHR7WRVQZCD6K3Q:/var/lib/docker/overlay2/l/WVA3JNFTU4ODUE62JCC7AVMUM6:/var/lib/docker/overlay2/l/MRY4XN6FMUFF4BQUPGJ7YLLHRP:/var/lib/docker/overlay2/l/ITJFFFEWFL7E7OGEEZKTF6N7LD:/var/lib/docker/overlay2/l/FQDWWHY3VM74QBLUFN5CF5IDYO:/var/lib/docker/overlay2/l/U5OVVSIWE2MDZ'\n",
|
||||
"Unexpected end of /proc/mounts line `EAH4TF2PEAAE6:/var/lib/docker/overlay2/l/EGF6PA2CWXADDQJESE2446D56S:/var/lib/docker/overlay2/l/IB2WD76YXTMZG2QOWR7GUG7IYT:/var/lib/docker/overlay2/l/IIMLGMJ4JINXEWK27EDCERZELX:/var/lib/docker/overlay2/l/L7CTL35E6XMSEWF7QPLHH5TOW2:/var/lib/docker/overlay2/l/3DS6RVJMFFGK3UYSLNER7RUM4Q:/var/lib/docker/overlay2/l/DGINKAV4FMSJX44G2RIJJ3Z42O:/var/lib/docker/overlay2/l/DOCM6YYK7SLPTZ6RM4CMED7YLV:/var/lib/docker/overlay2/l/WKITE5QHJLEASZYONTVW3DQRFD:/var/lib/docker/overlay2/l/LGF4BIN6WXOSKTESB6PUQVKD3W:/var/lib/do'\n",
|
||||
"Unexpected end of /proc/mounts line `overlay / overlay rw,relatime,lowerdir=/var/lib/docker/overlay2/l/GTTEVW365EQA2GDOGZVOD37J32:/var/lib/docker/overlay2/l/P3UTSC3GD5U2KE5CZH4M2NHMBG:/var/lib/docker/overlay2/l/VD7EUCZSMECTFOW7IIPSCEX4JM:/var/lib/docker/overlay2/l/JYXYPILZXXAYHR7WRVQZCD6K3Q:/var/lib/docker/overlay2/l/WVA3JNFTU4ODUE62JCC7AVMUM6:/var/lib/docker/overlay2/l/MRY4XN6FMUFF4BQUPGJ7YLLHRP:/var/lib/docker/overlay2/l/ITJFFFEWFL7E7OGEEZKTF6N7LD:/var/lib/docker/overlay2/l/FQDWWHY3VM74QBLUFN5CF5IDYO:/var/lib/docker/overlay2/l/U5OVVSIWE2MDZ'\n",
|
||||
"Unexpected end of /proc/mounts line `overlay / overlay rw,relatime,lowerdir=/var/lib/docker/overlay2/l/DK62GMJ7YWD352N7EBN6Z7NW4X:/var/lib/docker/overlay2/l/P3UTSC3GD5U2KE5CZH4M2NHMBG:/var/lib/docker/overlay2/l/VD7EUCZSMECTFOW7IIPSCEX4JM:/var/lib/docker/overlay2/l/JYXYPILZXXAYHR7WRVQZCD6K3Q:/var/lib/docker/overlay2/l/WVA3JNFTU4ODUE62JCC7AVMUM6:/var/lib/docker/overlay2/l/MRY4XN6FMUFF4BQUPGJ7YLLHRP:/var/lib/docker/overlay2/l/ITJFFFEWFL7E7OGEEZKTF6N7LD:/var/lib/docker/overlay2/l/FQDWWHY3VM74QBLUFN5CF5IDYO:/var/lib/docker/overlay2/l/U5OVVSIWE2MDZ'\n",
|
||||
"Unexpected end of /proc/mounts line `EAH4TF2PEAAE6:/var/lib/docker/overlay2/l/EGF6PA2CWXADDQJESE2446D56S:/var/lib/docker/overlay2/l/IB2WD76YXTMZG2QOWR7GUG7IYT:/var/lib/docker/overlay2/l/IIMLGMJ4JINXEWK27EDCERZELX:/var/lib/docker/overlay2/l/L7CTL35E6XMSEWF7QPLHH5TOW2:/var/lib/docker/overlay2/l/3DS6RVJMFFGK3UYSLNER7RUM4Q:/var/lib/docker/overlay2/l/DGINKAV4FMSJX44G2RIJJ3Z42O:/var/lib/docker/overlay2/l/DOCM6YYK7SLPTZ6RM4CMED7YLV:/var/lib/docker/overlay2/l/WKITE5QHJLEASZYONTVW3DQRFD:/var/lib/docker/overlay2/l/LGF4BIN6WXOSKTESB6PUQVKD3W:/var/lib/do'\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO comm 0x7f20502291a0 rank 1 nranks 2\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO NET : Using interface eth0:172.17.0.3<0>\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO NET/Socket : 1 interfaces found\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO comm 0x7f5fd0226150 rank 0 nranks 2\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Using 256 threads\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Min Comp Cap 7\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO NCCL_SINGLE_RING_THRESHOLD=262144\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Ring 00 : 0 1\n",
|
||||
"4f92c5ed3cd3:14:65 [1] INFO 1[14] -> 0[13] via direct shared memory\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO 0[13] -> 1[14] via direct shared memory\n",
|
||||
"4f92c5ed3cd3:13:68 [0] INFO Launch mode Parallel\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(6.247324528027093) loss:7.080879211425781 total-samples: 0\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(6.245516683993628) loss:7.201418399810791 total-samples: 0\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(22.298739871999715) loss:6.897201061248779 total-samples: 6400\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(22.29826568599674) loss:7.035123348236084 total-samples: 6400\n"
|
||||
"41afbf31e948:13:65 [0] INFO comm 0x7fb4742272e0 rank 0 nranks 2\n",
|
||||
"41afbf31e948:14:66 [1] INFO comm 0x7fec54227800 rank 1 nranks 2\n",
|
||||
"41afbf31e948:14:66 [1] INFO NET : Using interface eth0:172.17.0.3<0>\n",
|
||||
"41afbf31e948:14:66 [1] INFO NET/Socket : 1 interfaces found\n",
|
||||
"41afbf31e948:13:65 [0] INFO Using 256 threads\n",
|
||||
"41afbf31e948:13:65 [0] INFO Min Comp Cap 7\n",
|
||||
"41afbf31e948:13:65 [0] INFO NCCL_SINGLE_RING_THRESHOLD=262144\n",
|
||||
"41afbf31e948:13:65 [0] INFO Ring 00 : 0 1\n",
|
||||
"41afbf31e948:14:66 [1] INFO 1[14] -> 0[13] via direct shared memory\n",
|
||||
"41afbf31e948:13:65 [0] INFO 0[13] -> 1[14] via direct shared memory\n",
|
||||
"41afbf31e948:13:65 [0] INFO Launch mode Parallel\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(5.657348500099033) loss:6.9815826416015625 total-samples: 0\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(5.659670127090067) loss:7.070679187774658 total-samples: 0\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(22.52876269700937) loss:6.24755334854126 total-samples: 6400\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(22.528516705147922) loss:6.189745903015137 total-samples: 6400\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(22.495547740953043) loss:6.0138092041015625 total-samples: 12800\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(22.49454707186669) loss:6.170968055725098 total-samples: 12800\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(22.48468095716089) loss:6.01400089263916 total-samples: 19200\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(22.4861887099687) loss:6.05994176864624 total-samples: 19200\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -193,9 +190,9 @@
|
|||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-33-58c14d27e2ca>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/site-packages/docker/types/daemon.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__next__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0murllib3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProtocolError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/site-packages/docker/api/client.py\u001b[0m in \u001b[0;36m_multiplexed_response_stream_helper\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 348\u001b[0;31m \u001b[0mheader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSTREAM_HEADER_SIZE_BYTES\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m<ipython-input-18-58c14d27e2ca>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcontainer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstderr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/site-packages/docker/types/daemon.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__next__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0murllib3\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mProtocolError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/site-packages/docker/api/client.py\u001b[0m in \u001b[0;36m_multiplexed_response_stream_helper\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 367\u001b[0;31m \u001b[0mheader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mSTREAM_HEADER_SIZE_BYTES\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 368\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/site-packages/urllib3/response.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0mcache_content\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 442\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 443\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mamt\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# Platform-specific: Buggy versions of Python.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 444\u001b[0m \u001b[0;31m# Close the connection when no data is returned\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/opt/conda/envs/py3.6/lib/python3.6/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 482\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunked\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 483\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_readinto_chunked\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 484\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlength\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
|
@ -214,7 +211,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -225,8 +222,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"stripout"
|
||||
]
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
@ -249,47 +250,49 @@
|
|||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"65dd347e5346\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"39d39e5e9701\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"621c51016399\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"4246124ac3fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"12302f8bd2e6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"145eb658aaf0\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"39d39e5e9701\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"e6c3a9c7c79e\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"a917bc2d0f96\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"879c4ef3d9fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"f1dfa8049aa6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"79109c0f8a0b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"4246124ac3fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"a917bc2d0f96\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"145eb658aaf0\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"9b68e6935e56\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"39d39e5e9701\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"c7cfa177d51a\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"65dd347e5346\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"6e8ce585c22b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"e6c3a9c7c79e\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"f1dfa8049aa6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"33db8ccd260b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Preparing\",\"progressDetail\":{},\"id\":\"b8c891f0ffec\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"79109c0f8a0b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"9be10ccfe4da\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"c7cfa177d51a\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"12302f8bd2e6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"621c51016399\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"b8c891f0ffec\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"33db8ccd260b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"65dd347e5346\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Waiting\",\"progressDetail\":{},\"id\":\"621c51016399\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"4936625d6fff\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"7e6c8b5d5783\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"d9038574f55a\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"7e6c8b5d5783\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"eeb659df3cc8\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"4936625d6fff\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"27aab996f8cd\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"9be10ccfe4da\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"4246124ac3fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"12302f8bd2e6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"a917bc2d0f96\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"879c4ef3d9fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"12302f8bd2e6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"4246124ac3fb\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"c7cfa177d51a\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"9b68e6935e56\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"65dd347e5346\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"6e8ce585c22b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"39d39e5e9701\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"621c51016399\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"6e8ce585c22b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"145eb658aaf0\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"79109c0f8a0b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"f1dfa8049aa6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"e6c3a9c7c79e\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"b8c891f0ffec\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"f1dfa8049aa6\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"79109c0f8a0b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"9b68e6935e56\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"33db8ccd260b\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"Layer already exists\",\"progressDetail\":{},\"id\":\"b8c891f0ffec\"}\\r\\n'\n",
|
||||
"b'{\"status\":\"latest: digest: sha256:fbb148f239f4120dda9c4637a6542c37dc36b8d57291d11a94b257b62e5257c4 size: 4926\"}\\r\\n'\n",
|
||||
"b'{\"progressDetail\":{},\"aux\":{\"Tag\":\"latest\",\"Digest\":\"sha256:fbb148f239f4120dda9c4637a6542c37dc36b8d57291d11a94b257b62e5257c4\",\"Size\":4926}}\\r\\n'\n"
|
||||
]
|
||||
|
@ -298,13 +301,21 @@
|
|||
"source": [
|
||||
"for line in dc.images.push(image.tags[0], \n",
|
||||
" stream=True,\n",
|
||||
" auth_config={\"username\":DOCKERHUB,\n",
|
||||
" \"password\": \"d13NHAL!\"}):\n",
|
||||
" auth_config={\"username\": DOCKERHUB,\n",
|
||||
" \"password\": DOCKER_PWD}):\n",
|
||||
" print(line)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"jupytext": {
|
||||
"text_representation": {
|
||||
"extension": ".py",
|
||||
"format_name": "light",
|
||||
"format_version": "1.3",
|
||||
"jupytext_version": "0.8.6"
|
||||
}
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
|
|
|
@ -13,7 +13,9 @@
|
|||
# ---
|
||||
|
||||
# # Create Docker Image for PyTorch
|
||||
# In this notebook we will create the image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.
|
||||
# In this notebook we will create the Docker image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.
|
||||
#
|
||||
# **You will need to be running everything on a GPU-enabled VM to run this notebook.**
|
||||
|
||||
# +
|
||||
import sys
|
||||
|
@ -25,14 +27,12 @@ from utils import dotenv_for
|
|||
import docker
|
||||
# -
|
||||
|
||||
# Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.
|
||||
#
|
||||
# Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is.
|
||||
# We will use fake data here since we don't want to have to download the data, etc. Using fake data is often a good way to debug your models as well as to check what the IO overhead is. Here we set the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU, set NUM_PROCESSES to 1.
|
||||
|
||||
# + {"tags": ["parameters"]}
|
||||
dotenv_path = dotenv_for()
|
||||
USE_FAKE = True
|
||||
DOCKERHUB = os.getenv('DOCKER_REPOSITORY', "masalvar") #"<YOUR DOCKERHUB>"
|
||||
DOCKERHUB = os.getenv('DOCKER_REPOSITORY', "masalvar")
|
||||
NUM_PROCESSES = 2
|
||||
DOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')
|
||||
# -
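# If you are not sure how many GPUs the VM you are testing on has, a small sketch like the
# one below can set NUM_PROCESSES for you (it assumes PyTorch is importable in this
# environment, which the rest of the notebook does not otherwise require):

# +
import torch
NUM_PROCESSES = max(1, torch.cuda.device_count())  # one worker process per GPU
print(f"Running with {NUM_PROCESSES} process(es)")
# -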
|
||||
|
@ -42,8 +42,6 @@ dc = docker.from_env()
|
|||
image, log_iter = dc.images.build(path='Docker',
|
||||
tag='{}/caia-horovod-pytorch'.format(DOCKERHUB))
|
||||
|
||||
image.tags[0]
|
||||
|
||||
# +
|
||||
container_labels = {'containerName': 'pytorchgpu'}
|
||||
environment ={
|
||||
|
@ -79,13 +77,18 @@ container = dc.containers.run(image.tags[0],
|
|||
shm_size='8G',
|
||||
privileged=True)
|
||||
|
||||
# With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working.
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
for line in container.logs(stderr=True, stream=True):
|
||||
print(line.decode("utf-8"),end ="")
|
||||
# -
|
||||
|
||||
container.reload() # Refresh state
|
||||
if container.status == 'running':
|
||||
container.kill()
|
||||
|
||||
# + {"tags": ["stripout"]}
|
||||
for line in dc.images.push(image.tags[0],
|
||||
stream=True,
|
||||
auth_config={"username": DOCKERHUB,
|
||||
|
|
|
@ -6,41 +6,37 @@
|
|||
"source": [
|
||||
"# Train PyTorch Model Distributed on Batch AI\n",
|
||||
"In this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n",
|
||||
" * [Create Azure Resources](#azure_resources)\n",
|
||||
" * [Create Fileserver(NFS)](#create_fileshare)\n",
|
||||
" * [Configure Batch AI Cluster](#configure_cluster)\n",
|
||||
" * [Create Experiment](#experiment)\n",
|
||||
" * [Upload Training Scripts](#training_scripts)\n",
|
||||
" * [Submit and Monitor Job](#job)\n",
|
||||
" * [Clean Up Resources](#clean_up)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append(\"../common\") \n",
|
||||
"\n",
|
||||
"from dotenv import dotenv_values, set_key, find_dotenv, get_key\n",
|
||||
"from getpass import getpass\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"from utils import get_password, write_json_to_file, dotenv_for"
|
||||
"from dotenv import get_key\n",
|
||||
"import os\n",
|
||||
"from utils import write_json_to_file, dotenv_for"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.\n",
|
||||
"\n",
|
||||
"Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 45,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
|
@ -49,32 +45,28 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Variables for Batch AI - change as necessary\n",
|
||||
"ID = \"ddpytorch\"\n",
|
||||
"GROUP_NAME = f\"batch{ID}rg\"\n",
|
||||
"STORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\n",
|
||||
"FILE_SHARE_NAME = f\"batch{ID}share\"\n",
|
||||
"SELECTED_SUBSCRIPTION = \"Team Danielle Internal\" #\"<YOUR SUBSCRIPTION>\"\n",
|
||||
"WORKSPACE = \"workspace\"\n",
|
||||
"NUM_NODES = 2\n",
|
||||
"CLUSTER_NAME = \"msv100\"\n",
|
||||
"VM_SIZE = \"Standard_NC24rs_v3\"\n",
|
||||
"GPU_TYPE = \"V100\"\n",
|
||||
"PROCESSES_PER_NODE = 4\n",
|
||||
"LOCATION = \"eastus\"\n",
|
||||
"NFS_NAME = f\"batch{ID}nfs\"\n",
|
||||
"dotenv_path = dotenv_for()\n",
|
||||
"GROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\n",
|
||||
"FILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\n",
|
||||
"WORKSPACE = get_key(dotenv_path, 'WORKSPACE')\n",
|
||||
"NUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\n",
|
||||
"CLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\n",
|
||||
"GPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\n",
|
||||
"PROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\n",
|
||||
"STORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n",
|
||||
"\n",
|
||||
"EXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\n",
|
||||
"USERNAME = \"batchai_user\"\n",
|
||||
"USE_FAKE = False\n",
|
||||
"DOCKERHUB = \"caia\" #\"<YOUR DOCKERHUB>\""
|
||||
"DOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\") #\"<YOUR DOCKERHUB>\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"FAKE='-env FAKE=True' if USE_FAKE else ''\n",
|
||||
"FAKE='-x FAKE=True' if USE_FAKE else ''\n",
|
||||
"TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
|
||||
]
|
||||
},
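Worth noting: -x NAME=value is Open MPI's mpirun flag for exporting an environment variable to every launched rank, so the FAKE fragment built above makes the FAKE variable visible to all training processes. Illustratively (the real command line is assembled in the job configuration below, so this is only a sketch), the fragment ends up in something like:

mpirun -np {TOTAL_PROCESSES} {FAKE} python imagenet_pytorch_horovod.py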
|
||||
|
@ -82,14 +74,33 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a id='experiment'></a>\n",
|
||||
"# Create Experiment\n",
|
||||
"Next we create our experiment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[K{- Finished ..\n",
|
||||
" \"creationTime\": \"2018-12-17T13:19:30.658000+00:00\",\n",
|
||||
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/batchdtdemorg/providers/Microsoft.BatchAI/workspaces/workspace/experiments/distributed_pytorch_v100\",\n",
|
||||
" \"name\": \"distributed_pytorch_v100\",\n",
|
||||
" \"provisioningState\": \"succeeded\",\n",
|
||||
" \"provisioningStateTransitionTime\": \"2018-12-17T13:19:30.658000+00:00\",\n",
|
||||
" \"resourceGroup\": \"batchdtdemorg\",\n",
|
||||
" \"type\": \"Microsoft.BatchAI/workspaces/experiments\"\n",
|
||||
"}\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
|
||||
]
|
||||
|
@ -98,19 +109,68 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Upload the relevant scripts"
|
||||
"<a id='training_scripts'></a>\n",
|
||||
"# Upload Training Scripts\n",
|
||||
"We need to upload our training scripts and associated files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\n",
|
||||
"storage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"stripout"
|
||||
]
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"env: AZURE_STORAGE_ACCOUNT=batchdtdemost\n",
|
||||
"env: AZURE_STORAGE_KEY=AtQA2uvmxTSvo0SXnI5FjMOXl+qp5fKwNcPL+Y2N0N/0+EhcRt4RhFuXf+YKvG9qDSrB6ZrgNmJ8fgloABMtSQ==\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n",
|
||||
"%env AZURE_STORAGE_KEY=$storage_account_key"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Upload our training scripts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Finished[#############################################################] 100.0000%\n",
|
||||
"Finished[#############################################################] 100.0000%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n",
|
||||
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts\n",
|
||||
"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/docker.service --path scripts\n",
|
||||
"!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/nodeprep.sh --path scripts"
|
||||
"!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@@ -122,9 +182,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name    Resource Group    Workspace    VM Size             State    Idle    Running    Preparing    Leaving    Unusable\r\n",
"------  ----------------  -----------  ------------------  -------  ------  ---------  -----------  ---------  ----------\r\n",
"msv100  batchdtdemorg     workspace    STANDARD_NC24RS_V3  steady   2       0          0            0          0\r\n"
]
}
],
"source": [
"!az batchai cluster list -w $WORKSPACE -o table"
]
@@ -140,7 +210,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@@ -186,7 +256,7 @@
" }],\n",
" \"containerSettings\": {\n",
" \"imageSourceRegistry\": {\n",
" \"image\": f\"{DOCKERHUB}/distributed-training.horovod-pytorch\"\n",
" \"image\": f\"{DOCKERHUB}/caia-horovod-pytorch\"\n",
" }\n",
" }\n",
" }\n",
@@ -195,7 +265,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@@ -204,7 +274,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@@ -220,9 +290,94 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 64,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K{- Finished ..\n",
|
||||
" \"caffe2Settings\": null,\n",
|
||||
" \"caffeSettings\": null,\n",
|
||||
" \"chainerSettings\": null,\n",
|
||||
" \"cluster\": {\n",
|
||||
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/batchdtdemorg/providers/Microsoft.BatchAI/workspaces/workspace/clusters/msv100\",\n",
|
||||
" \"resourceGroup\": \"batchdtdemorg\"\n",
|
||||
" },\n",
|
||||
" \"cntkSettings\": null,\n",
|
||||
" \"constraints\": {\n",
|
||||
" \"maxWallClockTime\": \"7 days, 0:00:00\"\n",
|
||||
" },\n",
|
||||
" \"containerSettings\": {\n",
|
||||
" \"imageSourceRegistry\": {\n",
|
||||
" \"credentials\": null,\n",
|
||||
" \"image\": \"masalvar/caia-horovod-pytorch\",\n",
|
||||
" \"serverUrl\": null\n",
|
||||
" },\n",
|
||||
" \"shmSize\": null\n",
|
||||
" },\n",
|
||||
" \"creationTime\": \"2018-12-17T13:47:50.202000+00:00\",\n",
|
||||
" \"customMpiSettings\": null,\n",
|
||||
" \"customToolkitSettings\": {\n",
|
||||
" \"commandLine\": \"echo $AZ_BATCH_HOST_LIST; cat $AZ_BATCHAI_MPI_HOST_FILE; mpirun -np 8 --hostfile $AZ_BATCHAI_MPI_HOST_FILE -bind-to none -map-by slot -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -mca btl_tcp_if_include eth0 -x NCCL_SOCKET_IFNAME=eth0 -mca btl ^openib -x NCCL_IB_DISABLE=1 -x DISTRIBUTED=True -x AZ_BATCHAI_INPUT_TRAIN -x AZ_BATCHAI_INPUT_TEST --allow-run-as-root -x FAKE=True python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n",
|
||||
" },\n",
|
||||
" \"environmentVariables\": null,\n",
|
||||
" \"executionInfo\": {\n",
|
||||
" \"endTime\": null,\n",
|
||||
" \"errors\": null,\n",
|
||||
" \"exitCode\": null,\n",
|
||||
" \"startTime\": \"2018-12-17T13:47:54.570000+00:00\"\n",
|
||||
" },\n",
|
||||
" \"executionState\": \"running\",\n",
|
||||
" \"executionStateTransitionTime\": \"2018-12-17T13:47:54.570000+00:00\",\n",
|
||||
" \"horovodSettings\": null,\n",
|
||||
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/batchdtdemorg/providers/Microsoft.BatchAI/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8\",\n",
|
||||
" \"inputDirectories\": [\n",
|
||||
" {\n",
|
||||
" \"id\": \"SCRIPTS\",\n",
|
||||
" \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"TRAIN\",\n",
|
||||
" \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"id\": \"TEST\",\n",
|
||||
" \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"jobOutputDirectoryPathSegment\": \"edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/43d5d58c-2ecd-4aa4-a459-93be2f302b7e\",\n",
|
||||
" \"jobPreparation\": null,\n",
|
||||
" \"mountVolumes\": null,\n",
|
||||
" \"name\": \"pytorch-horovod-8\",\n",
|
||||
" \"nodeCount\": 2,\n",
|
||||
" \"outputDirectories\": [\n",
|
||||
" {\n",
|
||||
" \"id\": \"MODEL\",\n",
|
||||
" \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
|
||||
" \"pathSuffix\": \"Models\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"provisioningState\": \"succeeded\",\n",
|
||||
" \"provisioningStateTransitionTime\": \"2018-12-17T13:47:50.484000+00:00\",\n",
|
||||
" \"pyTorchSettings\": null,\n",
|
||||
" \"resourceGroup\": \"batchdtdemorg\",\n",
|
||||
" \"schedulingPriority\": \"normal\",\n",
|
||||
" \"secrets\": null,\n",
|
||||
" \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n",
|
||||
" \"tensorFlowSettings\": null,\n",
|
||||
" \"toolType\": \"custom\",\n",
|
||||
" \"type\": \"Microsoft.BatchAI/workspaces/experiments/jobs\"\n",
|
||||
"}\n",
|
||||
"\u001b[0m"
]
}
],
"source": [
"!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
]
@@ -236,9 +391,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 67,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Name               Cluster    Cluster RG     Cluster Workspace    Tool    Nodes    State    Exit code\r\n",
"-----------------  ---------  -------------  -------------------  ------  -------  -------  -----------\r\n",
"pytorch-horovod-8  msv100     batchdtdemorg  workspace            custom  2        running\r\n"
]
}
],
"source": [
"!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
]
@@ -252,9 +417,50 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 37,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\r\n",
|
||||
" {\r\n",
|
||||
" \"contentLength\": 9925,\r\n",
|
||||
" \"downloadUrl\": \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/ffa69c05-3f59-41b3-bfc4-370ad3022d9a/stdouterr/execution-tvm-829305193_1-20181217t125904z.log?sv=2016-05-31&sr=f&sig=RDpy9UMuftOa1w2TM6fROekEqc6ISPRmAwsoQufRzig%3D&se=2018-12-17T14%3A29%3A33Z&sp=rl\",\r\n",
|
||||
" \"fileType\": \"file\",\r\n",
|
||||
" \"lastModified\": \"2018-12-17T13:26:52+00:00\",\r\n",
|
||||
" \"name\": \"execution-tvm-829305193_1-20181217t125904z.log\"\r\n",
|
||||
" },\r\n",
|
||||
" {\r\n",
|
||||
" \"contentLength\": 14343,\r\n",
|
||||
" \"downloadUrl\": \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/ffa69c05-3f59-41b3-bfc4-370ad3022d9a/stdouterr/execution-tvm-829305193_2-20181217t125904z.log?sv=2016-05-31&sr=f&sig=i1S6%2BgVSpK%2BX1o%2BXOLuNBFJ%2FZrRK8W1d7ZEbR8a1NJU%3D&se=2018-12-17T14%3A29%3A33Z&sp=rl\",\r\n",
|
||||
" \"fileType\": \"file\",\r\n",
|
||||
" \"lastModified\": \"2018-12-17T13:26:53+00:00\",\r\n",
|
||||
" \"name\": \"execution-tvm-829305193_2-20181217t125904z.log\"\r\n",
|
||||
" },\r\n",
|
||||
" {\r\n",
|
||||
" \"contentLength\": 80,\r\n",
|
||||
" \"downloadUrl\": \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/ffa69c05-3f59-41b3-bfc4-370ad3022d9a/stdouterr/stderr.txt?sv=2016-05-31&sr=f&sig=%2BeJhQCBEu4tCdRiUMpNfAd2fwtuOFLacYWBQdsGp80g%3D&se=2018-12-17T14%3A29%3A33Z&sp=rl\",\r\n",
|
||||
" \"fileType\": \"file\",\r\n",
|
||||
" \"lastModified\": \"2018-12-17T13:26:53+00:00\",\r\n",
|
||||
" \"name\": \"stderr.txt\"\r\n",
|
||||
" },\r\n",
|
||||
" {\r\n",
|
||||
" \"contentLength\": 988,\r\n",
|
||||
" \"downloadUrl\": \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/ffa69c05-3f59-41b3-bfc4-370ad3022d9a/stdouterr/stdout.txt?sv=2016-05-31&sr=f&sig=pua2p8PQ0h5VE51Nn%2BBgUrw7rXP8HuDlSf75MDqxJ84%3D&se=2018-12-17T14%3A29%3A33Z&sp=rl\",\r\n",
|
||||
" \"fileType\": \"file\",\r\n",
|
||||
" \"lastModified\": \"2018-12-17T13:27:28+00:00\",\r\n",
|
||||
" \"name\": \"stdout.txt\"\r\n",
|
||||
" }\r\n",
|
||||
"]\r\n"
]
}
],
"source": [
"!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
]
@@ -268,20 +474,189 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"execution_count": 68,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mFile found with URL \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/43d5d58c-2ecd-4aa4-a459-93be2f302b7e/stdouterr/stdout.txt?sv=2016-05-31&sr=f&sig=%2F3GTrC%2BW73ccZZ82QFvYxHmjrayV0pquj6EYSeqC%2B0I%3D&se=2018-12-17T14%3A51%3A24Z&sp=rl\". Start streaming\u001b[0m\n",
|
||||
"10.0.0.5,10.0.0.6\n",
|
||||
"10.0.0.5 slots=4 max-slots=4\n",
|
||||
"10.0.0.6 slots=4 max-slots=4\n",
|
||||
"INFO:__main__:1: Runnin Distributed\n",
|
||||
"INFO:__main__:0: Runnin Distributed\n",
|
||||
"INFO:__main__:5: Runnin Distributed\n",
|
||||
"INFO:__main__:2: Runnin Distributed\n",
|
||||
"INFO:__main__:3: Runnin Distributed\n",
|
||||
"INFO:__main__:6: Runnin Distributed\n",
|
||||
"INFO:__main__:7: Runnin Distributed\n",
|
||||
"INFO:__main__:4: Runnin Distributed\n",
|
||||
"INFO:__main__:0: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:0: Setting up fake loaders\n",
|
||||
"INFO:__main__:2: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:2: Setting up fake loaders\n",
|
||||
"INFO:__main__:1: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:1: Setting up fake loaders\n",
|
||||
"INFO:__main__:3: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:3: Setting up fake loaders\n",
|
||||
"INFO:__main__:4: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:4: Setting up fake loaders\n",
|
||||
"INFO:__main__:5: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:5: Setting up fake loaders\n",
|
||||
"INFO:__main__:7: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:7: Setting up fake loaders\n",
|
||||
"INFO:__main__:6: PyTorch version 0.4.0\n",
|
||||
"INFO:__main__:6: Setting up fake loaders\n",
|
||||
"INFO:__main__:0: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:0: Loading model\n",
|
||||
"INFO:__main__:3: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:3: Loading model\n",
|
||||
"INFO:__main__:1: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:1: Loading model\n",
|
||||
"INFO:__main__:2: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:2: Loading model\n",
|
||||
"INFO:__main__:4: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:4: Loading model\n",
|
||||
"INFO:__main__:7: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:7: Loading model\n",
|
||||
"INFO:__main__:6: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:6: Loading model\n",
|
||||
"INFO:__main__:5: Creating fake data 1000 labels and 640 images\n",
|
||||
"INFO:__main__:5: Loading model\n",
|
||||
"INFO:__main__:0: Training ...\n",
|
||||
"INFO:__main__:2: Training ...\n",
|
||||
"INFO:__main__:3: Training ...\n",
|
||||
"INFO:__main__:1: Training ...\n",
|
||||
"INFO:__main__:6: Training ...\n",
|
||||
"INFO:__main__:7: Training ...\n",
|
||||
"INFO:__main__:5: Training ...\n",
|
||||
"INFO:__main__:4: Training ...\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO NET : Using interface eth0:10.0.0.5<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO NET/Socket : 1 interfaces found\n",
|
||||
"NCCL version 2.2.13+cuda9.0\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:469:572 [3] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:469:572 [3] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:450:552 [3] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:450:552 [3] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO Using internal Network Socket\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO Using NCCL Low-latency algorithm for sizes below 16384\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:450:552 [3] INFO comm 0x7f97042d7c90 rank 7 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:450:552 [3] INFO NET : Using interface eth0:10.0.0.6<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:450:552 [3] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO comm 0x7f9474347240 rank 4 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO NET : Using interface eth0:10.0.0.6<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO comm 0x7f4d7827fe00 rank 0 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO comm 0x7f1e6c2c6850 rank 5 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO NET : Using interface eth0:10.0.0.6<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:469:572 [3] INFO comm 0x7f26e0290fb0 rank 3 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:469:572 [3] INFO NET : Using interface eth0:10.0.0.5<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:469:572 [3] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO comm 0x7ff4ac2c1f70 rank 6 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO NET : Using interface eth0:10.0.0.6<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO comm 0x7f4ddc271930 rank 1 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO NET : Using interface eth0:10.0.0.5<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO comm 0x7f039025d7c0 rank 2 nranks 8\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO NET : Using interface eth0:10.0.0.5<0>\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO NET/Socket : 1 interfaces found\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Using 256 threads\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Min Comp Cap 7\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO NCCL_SINGLE_RING_THRESHOLD=262144\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Ring 00 : 0 1 2 3 4 5 6 7\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:448:553 [1] INFO 5[448] -> 6[449] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO 3 -> 4 via NET/Socket/0\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:447:554 [0] INFO 4[447] -> 5[448] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000001:449:551 [2] INFO 6[449] -> 7[450] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO 7 -> 0 via NET/Socket/0\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:467:571 [1] INFO 1[467] -> 2[468] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO 0[466] -> 1[467] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:468:570 [2] INFO 2[468] -> 3[469] via direct shared memory\n",
|
||||
"974d10b55464480199a0bc9c4b4615cf000000:466:579 [0] INFO Launch mode Parallel\n",
|
||||
"INFO:__main__:3: [Epoch 0] duration(6.689605174999997) loss:7.052699089050293 total-samples: 0\n",
|
||||
"INFO:__main__:7: [Epoch 0] duration(6.689712283000063) loss:7.05151891708374 total-samples: 0\n",
|
||||
"INFO:__main__:6: [Epoch 0] duration(6.691189228999974) loss:6.988680362701416 total-samples: 0\n",
|
||||
"INFO:__main__:5: [Epoch 0] duration(6.692733973000031) loss:6.947587013244629 total-samples: 0\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(6.693754228999751) loss:7.041652679443359 total-samples: 0\n",
|
||||
"INFO:__main__:2: [Epoch 0] duration(6.697406198999943) loss:7.09742546081543 total-samples: 0\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(6.699599520999982) loss:7.070676803588867 total-samples: 0\n",
|
||||
"INFO:__main__:4: [Epoch 0] duration(6.695543971000006) loss:6.981583118438721 total-samples: 0\n",
|
||||
"INFO:__main__:3: [Epoch 0] duration(57.50169312499975) loss:5.711731433868408 total-samples: 6400\n",
|
||||
"INFO:__main__:5: [Epoch 0] duration(57.49870837199978) loss:5.794982433319092 total-samples: 6400\n",
|
||||
"INFO:__main__:7: [Epoch 0] duration(57.50195155399979) loss:5.764939785003662 total-samples: 6400\n",
|
||||
"INFO:__main__:6: [Epoch 0] duration(57.50037861200008) loss:5.75480318069458 total-samples: 6400\n",
|
||||
"INFO:__main__:4: [Epoch 0] duration(57.49586707499975) loss:5.767154216766357 total-samples: 6400\n",
|
||||
"INFO:__main__:2: [Epoch 0] duration(57.497045193000304) loss:5.684810638427734 total-samples: 6400\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(57.49853723899969) loss:5.811939239501953 total-samples: 6400\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(57.499225016999844) loss:5.770937919616699 total-samples: 6400\n",
|
||||
"INFO:__main__:2: [Epoch 0] duration(57.3714186279999) loss:3.925326347351074 total-samples: 12800\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(57.37188071199989) loss:3.969285726547241 total-samples: 12800\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(57.373041275000105) loss:3.922102451324463 total-samples: 12800\n",
|
||||
"INFO:__main__:3: [Epoch 0] duration(57.37743352200005) loss:3.7698488235473633 total-samples: 12800\n",
|
||||
"INFO:__main__:7: [Epoch 0] duration(57.375299133000226) loss:3.8675193786621094 total-samples: 12800\n",
|
||||
"INFO:__main__:6: [Epoch 0] duration(57.37926719500001) loss:3.9498977661132812 total-samples: 12800\n",
|
||||
"INFO:__main__:5: [Epoch 0] duration(57.381546716999765) loss:3.8839111328125 total-samples: 12800\n",
|
||||
"INFO:__main__:4: [Epoch 0] duration(57.384796402999655) loss:3.7828776836395264 total-samples: 12800\n",
|
||||
"INFO:__main__:3: [Epoch 0] duration(57.56508574600002) loss:0.4888748526573181 total-samples: 19200\n",
|
||||
"INFO:__main__:2: [Epoch 0] duration(57.572057309999764) loss:0.5112927556037903 total-samples: 19200\n",
|
||||
"INFO:__main__:7: [Epoch 0] duration(57.568645871999706) loss:0.4705663323402405 total-samples: 19200\n",
|
||||
"INFO:__main__:1: [Epoch 0] duration(57.57279408600016) loss:0.48321157693862915 total-samples: 19200\n",
|
||||
"INFO:__main__:0: [Epoch 0] duration(57.56971709100026) loss:0.5088376402854919 total-samples: 19200\n",
|
||||
"INFO:__main__:6: [Epoch 0] duration(57.56528169000012) loss:0.49410247802734375 total-samples: 19200\n",
|
||||
"INFO:__main__:5: [Epoch 0] duration(57.564610613999776) loss:0.45060762763023376 total-samples: 19200\n",
|
||||
"INFO:__main__:4: [Epoch 0] duration(57.560950141000376) loss:0.48489123582839966 total-samples: 19200\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"^C\r\n"
]
}
],
"source": [
"!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 69,
"metadata": {
"scrolled": true
"scrolled": true,
"tags": [
"stripout"
]
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mFile found with URL \"https://batchdtdemost.file.core.windows.net/batchdtdemoshare/edf507a2-6235-46c5-b560-fd463ba2e771/batchdtdemorg/workspaces/workspace/experiments/distributed_pytorch_v100/jobs/pytorch-horovod-8/43d5d58c-2ecd-4aa4-a459-93be2f302b7e/stdouterr/stderr.txt?sv=2016-05-31&sr=f&sig=9HSNbBWc0aGcQINWHJz508JAKw935Miy%2BkMwEj184NQ%3D&se=2018-12-17T14%3A51%3A44Z&sp=rl\". Start streaming\u001b[0m\n",
"Warning: Permanently added '[10.0.0.6]:23' (ECDSA) to the list of known hosts.\n",
"^C\n"
]
}
],
"source": [
"!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
]
@@ -295,9 +670,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 70,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
]
@@ -314,7 +697,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -331,9 +714,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 72,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
]
@@ -347,18 +738,34 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 73,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 74,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
]
@@ -372,15 +779,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 75,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az group delete --name $GROUP_NAME -y"
]
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",

@@ -14,9 +14,8 @@

# # Train PyTorch Model Distributed on Batch AI
# In this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the ImageNet dataset. This tutorial will take you through the following steps:
# * [Create Azure Resources](#azure_resources)
# * [Create Fileserver(NFS)](#create_fileshare)
# * [Configure Batch AI Cluster](#configure_cluster)
# * [Create Experiment](#experiment)
# * [Upload Training Scripts](#training_scripts)
# * [Submit and Monitor Job](#job)
# * [Clean Up Resources](#clean_up)

@@ -24,13 +23,12 @@
import sys
sys.path.append("../common")

import json
from dotenv import get_key
import os
from utils import write_json_to_file, dotenv_for
# -

# Below are the variables that describe our experiment. By default we use NC24rs_v3 (Standard_NC24rs_v3) VMs, which have V100 GPUs and InfiniBand, and 2 nodes with 4 GPUs each, which equates to 8 GPUs. Feel free to increase the number of nodes, but be aware of any limitations your subscription may have.
#
# Set USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as to check what the IO overhead is.

|
||||
|
@ -39,29 +37,42 @@ dotenv_path = dotenv_for()
|
|||
GROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')
|
||||
FILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')
|
||||
WORKSPACE = get_key(dotenv_path, 'WORKSPACE')
|
||||
NUM_NODES = get_key(dotenv_path, 'NUM_NODES')
|
||||
NUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))
|
||||
CLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')
|
||||
GPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')
|
||||
PROCESSES_PER_NODE = get_key(dotenv_path, 'PROCESSES_PER_NODE')
|
||||
PROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))
|
||||
STORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')
|
||||
|
||||
EXPERIMENT = f"distributed_pytorch_{GPU_TYPE}"
|
||||
USE_FAKE = False
|
||||
DOCKERHUB = os.getenv('DOCKER_REPOSITORY', "masalvar") #"<YOUR DOCKERHUB>"
|
||||
# -
|
||||
|
||||
FAKE='-env FAKE=True' if USE_FAKE else ''
|
||||
FAKE='-x FAKE=True' if USE_FAKE else ''
|
||||
TOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES
|
||||
|
||||
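# These two values feed the mpirun invocation that runs inside the container: one process per GPU across all nodes. A rough sketch of how they combine (the full command written into job.json below also sets the NCCL and network-interface flags that appear in the job output):

# +
# Sketch only: the real command is assembled in the job definition further down
mpi_command = (f"mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE "
               f"{FAKE} python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py")
print(mpi_command)
# -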
# <a id='experiment'></a>
# # Create Experiment
# Next we create our experiment.

!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE

# Upload the relevant scripts
# <a id='training_scripts'></a>
# # Upload Training Scripts
# We need to upload our training scripts and associated files

json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME
storage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']

# + {"tags": ["stripout"]}
# %env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME
# %env AZURE_STORAGE_KEY=$storage_account_key
# -

# Upload our training scripts

!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts
!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts
!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/docker.service --path scripts
!az storage file upload --share-name $FILE_SHARE_NAME --source cluster_config/nodeprep.sh --path scripts

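# If you want to double-check that the files landed in the share (this relies on the storage account and key exported above), one way is to list the scripts directory:

!az storage file list --share-name $FILE_SHARE_NAME --path scripts -o table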
# Let's check the cluster we created earlier

@@ -125,7 +136,9 @@ JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)

# We now submit the job to Batch AI

# + {"tags": ["stripout"]}
!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json
# -

# With the command below we can check the status of the job

@@ -133,13 +146,18 @@ JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)

# To view the files that the job has generated, use the command below

# + {"tags": ["stripout"]}
!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr
# -

# We are also able to stream the stdout and stderr that our job produces. This is a great way to check the progress of our job as well as to debug issues.

# + {"tags": ["stripout"]}
!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt

# + {"tags": ["stripout"]}
!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt
# -

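# If you would rather poll than stream, the job's execution state can also be queried directly (an optional check, using the same job identifiers as above):

!az batchai job show -n $JOB_NAME -w $WORKSPACE -e $EXPERIMENT --query executionState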
# We can either wait for the job to complete or delete it with the command below.