merged master and fixed commits
This commit is contained in:
Коммит
24b52baee4
|
@ -99,3 +99,9 @@ ENV/
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|
||||||
|
# Pycharm
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
#################
|
||||||
|
job.json
|
|
@ -0,0 +1,3 @@
|
||||||
|
FROM pytorch/pytorch:0.4_cuda9_cudnn7
|
||||||
|
|
||||||
|
RUN pip install --no-cache-dir h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
|
|
@ -0,0 +1,11 @@
|
||||||
|
DATA_DIR:=/mnt/imagenet
|
||||||
|
PWD:=$(shell pwd)
|
||||||
|
FAKE:='False'
|
||||||
|
FAKE_DATA_LENGTH:=1281167
|
||||||
|
name_prefix:=iliauk
|
||||||
|
tag:=latest
|
||||||
|
image-open:=$(name_prefix)/pytorch_gloo:$(tag)
|
||||||
|
open-path:=$(PWD)/Docker
|
||||||
|
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py
|
||||||
|
include ../include/build.mk
|
||||||
|
|
|
@ -0,0 +1,283 @@
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from os import path
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import multiprocessing
|
||||||
|
from toolz import pipe
|
||||||
|
from timer import Timer
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.backends.cudnn as cudnn
|
||||||
|
import torch.optim
|
||||||
|
import torch.utils.data
|
||||||
|
import torchvision.transforms as transforms
|
||||||
|
from torch.utils.data import DataLoader, Dataset
|
||||||
|
import torchvision.models as models
|
||||||
|
import torch.distributed as dist
|
||||||
|
import torch.utils.data.distributed
|
||||||
|
|
||||||
|
print("PyTorch: ", torch.__version__)
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
# Distributed training settings
|
||||||
|
parser = argparse.ArgumentParser(description='PyTorch ResNet Example')
|
||||||
|
parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes')
|
||||||
|
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training')
|
||||||
|
parser.add_argument('--dist-backend', default='gloo', type=str, help='distributed backend')
|
||||||
|
parser.add_argument('--rank', default=-1, type=int, help='rank of the worker')
|
||||||
|
|
||||||
|
_WIDTH = 224
|
||||||
|
_HEIGHT = 224
|
||||||
|
_LR = 0.001
|
||||||
|
_EPOCHS = 1
|
||||||
|
_NUM_GPU = int(torch.cuda.device_count())
|
||||||
|
_BATCHSIZE = 64*_NUM_GPU
|
||||||
|
_RGB_MEAN = [0.485, 0.456, 0.406]
|
||||||
|
_RGB_SD = [0.229, 0.224, 0.225]
|
||||||
|
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
def _str_to_bool(in_str):
|
||||||
|
if 't' in in_str.lower():
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
|
_FAKE = _str_to_bool(os.getenv('FAKE', 'True'))
|
||||||
|
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
|
||||||
|
|
||||||
|
#_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
|
||||||
|
_DISTRIBUTED = True
|
||||||
|
_CPU_COUNT = 8
|
||||||
|
logger.info("Distributed mode: ", _DISTRIBUTED)
|
||||||
|
logger.info("CPU Count: ", _CPU_COUNT)
|
||||||
|
|
||||||
|
|
||||||
|
def _append_path_to(data_path, data_series):
|
||||||
|
return data_series.apply(lambda x: path.join(data_path, x))
|
||||||
|
|
||||||
|
|
||||||
|
def _load_training(data_dir):
|
||||||
|
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
|
||||||
|
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
|
||||||
|
train_df.filenames))
|
||||||
|
|
||||||
|
|
||||||
|
def _load_validation(data_dir):
|
||||||
|
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
|
||||||
|
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
|
||||||
|
train_df.filenames))
|
||||||
|
|
||||||
|
|
||||||
|
def _create_data_fn(train_path, test_path):
|
||||||
|
logger.info('Reading training data info')
|
||||||
|
train_df = _load_training(train_path)
|
||||||
|
logger.info('Reading validation data info')
|
||||||
|
validation_df = _load_validation(test_path)
|
||||||
|
# File-path
|
||||||
|
train_X = train_df['filenames'].values
|
||||||
|
validation_X = validation_df['filenames'].values
|
||||||
|
# One-hot encoded labels for torch
|
||||||
|
train_labels = train_df[['num_id']].values.ravel()
|
||||||
|
validation_labels = validation_df[['num_id']].values.ravel()
|
||||||
|
# Index starts from 0
|
||||||
|
train_labels -= 1
|
||||||
|
validation_labels -= 1
|
||||||
|
return train_X, train_labels, validation_X, validation_labels
|
||||||
|
|
||||||
|
|
||||||
|
class ImageNet(Dataset):
|
||||||
|
def __init__(self, img_locs, img_labels, transform=None):
|
||||||
|
self.img_locs, self.labels = img_locs, img_labels
|
||||||
|
self.transform = transform
|
||||||
|
logger.info("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
im_file = self.img_locs[idx]
|
||||||
|
label = self.labels[idx]
|
||||||
|
with open(im_file, 'rb') as f:
|
||||||
|
im_rgb = Image.open(f)
|
||||||
|
# Make sure 3-channel (RGB)
|
||||||
|
im_rgb = im_rgb.convert('RGB')
|
||||||
|
if self.transform is not None:
|
||||||
|
im_rgb = self.transform(im_rgb)
|
||||||
|
return im_rgb, label
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.img_locs)
|
||||||
|
|
||||||
|
|
||||||
|
class FakeData(Dataset):
|
||||||
|
def __init__(self,
|
||||||
|
batch_size=32,
|
||||||
|
num_batches=20,
|
||||||
|
dim=(224, 224),
|
||||||
|
n_channels=3,
|
||||||
|
n_classes=10,
|
||||||
|
length=_DATA_LENGTH,
|
||||||
|
seed=42,
|
||||||
|
data_transform=None):
|
||||||
|
self.dim = dim
|
||||||
|
self.n_channels = n_channels
|
||||||
|
self.n_classes = n_classes
|
||||||
|
self.num_batches = num_batches
|
||||||
|
self._data = _create_data(batch_size, self.num_batches, self.dim, self.n_channels)
|
||||||
|
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
|
||||||
|
self.translation_index = np.random.choice(len(self._labels), length)
|
||||||
|
self._length=length
|
||||||
|
|
||||||
|
self._data_transform = data_transform
|
||||||
|
#logger = _get_logger()
|
||||||
|
logger.info("Creating fake data {} labels and {} images".format(n_classes, len(self._data)))
|
||||||
|
|
||||||
|
def __getitem__(self, idx):
|
||||||
|
#logger = _get_logger()
|
||||||
|
logger.debug('Retrieving samples')
|
||||||
|
logger.debug(str(idx))
|
||||||
|
tr_index_array = self.translation_index[idx]
|
||||||
|
|
||||||
|
if self._data_transform is not None:
|
||||||
|
data=self._data_transform(self._data[tr_index_array])
|
||||||
|
else:
|
||||||
|
data=self._data[tr_index_array]
|
||||||
|
|
||||||
|
return data, self._labels[tr_index_array]
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return self._length
|
||||||
|
|
||||||
|
|
||||||
|
def _log_summary(data_length, duration):
|
||||||
|
#logger = _get_logger()
|
||||||
|
images_per_second = data_length / duration
|
||||||
|
logger.info('Data length: {}'.format(data_length))
|
||||||
|
logger.info('Total duration: {:.3f}'.format(duration))
|
||||||
|
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
|
||||||
|
logger.info('Batch size: (Per GPU {}: Total {})'.format(int(_BATCHSIZE/_NUM_GPU), _BATCHSIZE))
|
||||||
|
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
|
||||||
|
logger.info('Num GPUs: {:.3f}'.format(_NUM_GPU)) # May need to pass in argument to get this
|
||||||
|
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
|
||||||
|
|
||||||
|
def _create_data(batch_size, num_batches, dim, channels, seed=42):
|
||||||
|
np.random.seed(seed)
|
||||||
|
return np.random.rand(batch_size * num_batches,
|
||||||
|
channels,
|
||||||
|
dim[0],
|
||||||
|
dim[1]).astype(np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_labels(batch_size, num_batches, n_classes):
|
||||||
|
return np.random.choice(n_classes, batch_size * num_batches)
|
||||||
|
|
||||||
|
|
||||||
|
def train(train_loader, model, criterion, optimizer, epoch):
|
||||||
|
logger.info("Training ...")
|
||||||
|
model.train()
|
||||||
|
for i, (input, target) in enumerate(train_loader):
|
||||||
|
input, target = input.cuda(non_blocking=True), target.cuda(non_blocking=True)
|
||||||
|
# compute output
|
||||||
|
output = model(input)
|
||||||
|
loss = criterion(output, target)
|
||||||
|
# compute gradient and do SGD step
|
||||||
|
optimizer.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
|
||||||
|
def validate(val_loader, model, criterion):
|
||||||
|
logger.info("Validating ...")
|
||||||
|
model.eval()
|
||||||
|
correct = 0
|
||||||
|
total = 0
|
||||||
|
with torch.no_grad():
|
||||||
|
for i, (input, target) in enumerate(val_loader):
|
||||||
|
target = target.cuda(non_blocking=True)
|
||||||
|
# compute output
|
||||||
|
output = model(input)
|
||||||
|
_, predicted = torch.max(output.data, 1)
|
||||||
|
total += target.size(0)
|
||||||
|
correct += (predicted == target).sum().item()
|
||||||
|
logger.info('Top-1 Accuracy: %.2f %%' % (100 * correct / total))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Autotune
|
||||||
|
cudnn.benchmark = True
|
||||||
|
# Load symbol
|
||||||
|
model = models.__dict__['resnet50'](pretrained=False)
|
||||||
|
if _DISTRIBUTED:
|
||||||
|
logger.info('Running in distributed mode')
|
||||||
|
dist.init_process_group(
|
||||||
|
backend=args.dist_backend,
|
||||||
|
init_method=args.dist_url,
|
||||||
|
world_size=args.world_size,
|
||||||
|
rank=args.rank)
|
||||||
|
model.cuda()
|
||||||
|
model = torch.nn.parallel.DistributedDataParallel(model)
|
||||||
|
else:
|
||||||
|
model = torch.nn.DataParallel(model).cuda()
|
||||||
|
# Optimisers
|
||||||
|
criterion = nn.CrossEntropyLoss().cuda()
|
||||||
|
optimizer = torch.optim.SGD(model.parameters(), lr=_LR)
|
||||||
|
# Data-sets
|
||||||
|
if _FAKE:
|
||||||
|
logger.info("Setting up fake loaders")
|
||||||
|
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
|
||||||
|
else:
|
||||||
|
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
|
||||||
|
train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
|
||||||
|
os.getenv('AZ_BATCHAI_INPUT_TEST'))
|
||||||
|
train_dataset = ImageNet(
|
||||||
|
train_X,
|
||||||
|
train_y,
|
||||||
|
transforms.Compose([
|
||||||
|
transforms.RandomResizedCrop(_WIDTH),
|
||||||
|
transforms.RandomHorizontalFlip(),
|
||||||
|
transforms.ToTensor(),
|
||||||
|
normalize]))
|
||||||
|
|
||||||
|
|
||||||
|
if _DISTRIBUTED:
|
||||||
|
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
|
||||||
|
else:
|
||||||
|
train_sampler = None
|
||||||
|
|
||||||
|
# Data-loaders
|
||||||
|
train_loader = torch.utils.data.DataLoader(
|
||||||
|
train_dataset, batch_size=_BATCHSIZE, shuffle=(train_sampler is None), num_workers=_CPU_COUNT, sampler=train_sampler)
|
||||||
|
|
||||||
|
#val_loader = torch.utils.data.DataLoader(
|
||||||
|
# ImageNet(
|
||||||
|
# valid_X,
|
||||||
|
# valid_y,
|
||||||
|
# transforms.Compose([
|
||||||
|
# transforms.Resize(256),
|
||||||
|
# transforms.CenterCrop(_WIDTH),
|
||||||
|
# transforms.ToTensor(),
|
||||||
|
# normalize])), batch_size=_BATCHSIZE, shuffle=False,
|
||||||
|
# num_workers=_CPU_COUNT)
|
||||||
|
|
||||||
|
# Main training-loop
|
||||||
|
for epoch in range(_EPOCHS):
|
||||||
|
if _DISTRIBUTED:
|
||||||
|
train_sampler.set_epoch(epoch)
|
||||||
|
# Train
|
||||||
|
with Timer(output=logger.info, prefix="Training") as t:
|
||||||
|
train(train_loader, model, criterion, optimizer, epoch)
|
||||||
|
_log_summary(len(train_dataset), t.elapsed)
|
||||||
|
|
||||||
|
# Validate
|
||||||
|
#with Timer(output=logger.info, prefix="Testing"):
|
||||||
|
# validate(val_loader, model, criterion)
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print("Pytorch")
|
||||||
|
main()
|
|
@ -1,5 +1,5 @@
|
||||||
# Variables for Batch AI - change as necessary
|
# Variables for Batch AI - change as necessary
|
||||||
ID:=disdl
|
ID:=iliadl2
|
||||||
LOCATION:=eastus
|
LOCATION:=eastus
|
||||||
GROUP_NAME:=batch${ID}rg
|
GROUP_NAME:=batch${ID}rg
|
||||||
STORAGE_ACCOUNT_NAME:=batch${ID}st
|
STORAGE_ACCOUNT_NAME:=batch${ID}st
|
||||||
|
@ -8,8 +8,8 @@ SELECTED_SUBSCRIPTION:="Team Danielle Internal"
|
||||||
WORKSPACE:=workspace
|
WORKSPACE:=workspace
|
||||||
|
|
||||||
VM_SIZE:=Standard_NC24rs_v3
|
VM_SIZE:=Standard_NC24rs_v3
|
||||||
NUM_NODES:=8
|
NUM_NODES:=2
|
||||||
CLUSTER_NAME:=msv100
|
CLUSTER_NAME:=ikv100
|
||||||
|
|
||||||
|
|
||||||
GPU_TYPE:=V100
|
GPU_TYPE:=V100
|
||||||
|
|
|
@ -8,36 +8,36 @@ logger = logging.getLogger(__name__)
|
||||||
#
|
#
|
||||||
# Config for Intel
|
# Config for Intel
|
||||||
cmd_for_intel = \
|
cmd_for_intel = \
|
||||||
"""source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh;
|
"""source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh;
|
||||||
echo $AZ_BATCH_HOST_LIST;
|
echo $AZ_BATCH_HOST_LIST;
|
||||||
mpirun -n {total_processes} -ppn {processes_per_node} {hosts}
|
mpirun -n {total_processes} -ppn {processes_per_node} {hosts}
|
||||||
-env I_MPI_FABRICS=dapl
|
-env I_MPI_FABRICS=dapl
|
||||||
-env I_MPI_DAPL_PROVIDER=ofa-v2-ib0
|
-env I_MPI_DAPL_PROVIDER=ofa-v2-ib0
|
||||||
-env I_MPI_DYNAMIC_CONNECTION=0
|
-env I_MPI_DYNAMIC_CONNECTION=0
|
||||||
-env I_MPI_DEBUG=6
|
-env I_MPI_DEBUG=6
|
||||||
-env I_MPI_HYDRA_DEBUG=on
|
-env I_MPI_HYDRA_DEBUG=on
|
||||||
-env DISTRIBUTED=True
|
-env DISTRIBUTED=True
|
||||||
{fake}
|
{fake}
|
||||||
{fake_length}
|
{fake_length}
|
||||||
python -u {script}""".replace('\n', '')
|
python -u {script}""".replace('\n', '')
|
||||||
|
|
||||||
# Config for OpenMPI
|
# Config for OpenMPI
|
||||||
cmd_for_openmpi = \
|
cmd_for_openmpi = \
|
||||||
"""echo $AZ_BATCH_HOST_LIST;
|
"""echo $AZ_BATCH_HOST_LIST;
|
||||||
cat $AZ_BATCHAI_MPI_HOST_FILE;
|
cat $AZ_BATCHAI_MPI_HOST_FILE;
|
||||||
mpirun -np {total_processes} {hosts}
|
mpirun -np {total_processes} {hosts}
|
||||||
-bind-to none -map-by slot
|
-bind-to none -map-by slot
|
||||||
-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH
|
-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH
|
||||||
-mca btl_tcp_if_include eth0
|
-mca btl_tcp_if_include eth0
|
||||||
-x NCCL_SOCKET_IFNAME=eth0
|
-x NCCL_SOCKET_IFNAME=eth0
|
||||||
-mca btl ^openib
|
-mca btl ^openib
|
||||||
-x NCCL_IB_DISABLE=1
|
-x NCCL_IB_DISABLE=1
|
||||||
-x DISTRIBUTED=True
|
-x DISTRIBUTED=True
|
||||||
-x AZ_BATCHAI_INPUT_TRAIN
|
-x AZ_BATCHAI_INPUT_TRAIN
|
||||||
-x AZ_BATCHAI_INPUT_TEST
|
-x AZ_BATCHAI_INPUT_TEST
|
||||||
{fake}
|
{fake}
|
||||||
{fake_length}
|
{fake_length}
|
||||||
--allow-run-as-root
|
--allow-run-as-root
|
||||||
python -u {script}""".replace('\n', '')
|
python -u {script}""".replace('\n', '')
|
||||||
|
|
||||||
# Running on single node without mpi
|
# Running on single node without mpi
|
||||||
|
@ -89,7 +89,8 @@ def _fake_length_for(mpitype, fake_length, data):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def _prepare_command(mpitype, total_processes, processes_per_node, script, node_count, data=None, synthetic_length=1281167):
|
def _prepare_command(mpitype, total_processes, processes_per_node, script, node_count, data=None,
|
||||||
|
synthetic_length=1281167):
|
||||||
command = cmd_choice_dict.get(mpitype, cmd_for_intel)
|
command = cmd_choice_dict.get(mpitype, cmd_for_intel)
|
||||||
return command.format(total_processes=total_processes,
|
return command.format(total_processes=total_processes,
|
||||||
processes_per_node=processes_per_node,
|
processes_per_node=processes_per_node,
|
||||||
|
@ -141,6 +142,48 @@ def generate_job_dict(image_name,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def generate_job_dict_gloo(image_name,
|
||||||
|
script,
|
||||||
|
node_count=2):
|
||||||
|
# Command is hard-coded for time-being
|
||||||
|
# Not sure what world-size is?? Probably node_count but check
|
||||||
|
return {
|
||||||
|
"$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2018-05-01/job.json",
|
||||||
|
"properties": {
|
||||||
|
"pyTorchSettings": {
|
||||||
|
"pythonScriptFilePath": script,
|
||||||
|
"commandLineArgs": "--world-size 2 --dist-backend $AZ_BATCHAI_PYTORCH_BACKEND --dist-url $AZ_BATCHAI_PYTORCH_INIT_METHOD --rank $AZ_BATCHAI_TASK_INDEX",
|
||||||
|
"communicationBackend": "gloo"
|
||||||
|
},
|
||||||
|
"nodeCount": node_count,
|
||||||
|
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
|
||||||
|
"inputDirectories": [{
|
||||||
|
"id": "SCRIPTS",
|
||||||
|
"path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TRAIN",
|
||||||
|
"path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TEST",
|
||||||
|
"path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"outputDirectories": [{
|
||||||
|
"id": "MODEL",
|
||||||
|
"pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
|
||||||
|
"pathSuffix": "Models"
|
||||||
|
}],
|
||||||
|
"containerSettings": {
|
||||||
|
"imageSourceRegistry": {
|
||||||
|
"image": image_name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def generate_job_dict_cntk(image_name,
|
def generate_job_dict_cntk(image_name,
|
||||||
command,
|
command,
|
||||||
node_count=2,
|
node_count=2,
|
||||||
|
@ -203,15 +246,21 @@ def synthetic_data_job(image_name,
|
||||||
filename, image_name))
|
filename, image_name))
|
||||||
total_processes = processes_per_node * \
|
total_processes = processes_per_node * \
|
||||||
node_count if total_processes is None else total_processes
|
node_count if total_processes is None else total_processes
|
||||||
command = _prepare_command(mpitype,
|
if mpitype == "gloo":
|
||||||
total_processes,
|
job_template = generate_job_dict_gloo(image_name,
|
||||||
processes_per_node,
|
script,
|
||||||
script,
|
node_count=node_count)
|
||||||
node_count,
|
else:
|
||||||
synthetic_length=synthetic_length)
|
command = _prepare_command(mpitype,
|
||||||
job_template = generate_job_dict(image_name,
|
total_processes,
|
||||||
command,
|
processes_per_node,
|
||||||
node_count=node_count)
|
script,
|
||||||
|
node_count,
|
||||||
|
synthetic_length=synthetic_length)
|
||||||
|
job_template = generate_job_dict(image_name,
|
||||||
|
command,
|
||||||
|
node_count=node_count)
|
||||||
|
|
||||||
write_json_to_file(job_template, filename)
|
write_json_to_file(job_template, filename)
|
||||||
logger.info('Done')
|
logger.info('Done')
|
||||||
|
|
||||||
|
@ -228,6 +277,7 @@ def imagenet_data_job(image_name,
|
||||||
filename, image_name))
|
filename, image_name))
|
||||||
total_processes = processes_per_node * \
|
total_processes = processes_per_node * \
|
||||||
node_count if total_processes is None else total_processes
|
node_count if total_processes is None else total_processes
|
||||||
|
# non-synthetic gloo to add
|
||||||
command = _prepare_command(mpitype,
|
command = _prepare_command(mpitype,
|
||||||
total_processes,
|
total_processes,
|
||||||
processes_per_node,
|
processes_per_node,
|
||||||
|
|
|
@ -45,6 +45,11 @@ define submit_pytorch_local
|
||||||
$(call submit_job, $(2))
|
$(call submit_job, $(2))
|
||||||
endef
|
endef
|
||||||
|
|
||||||
|
define submit_pytorch_gloo
|
||||||
|
$(call generate_job_gloo,iliauk/pytorch_gloo,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
|
||||||
|
$(call submit_job, $(3))
|
||||||
|
endef
|
||||||
|
|
||||||
define submit_cntk
|
define submit_cntk
|
||||||
$(call generate_job_openmpi,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
|
$(call generate_job_openmpi,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
|
||||||
$(call submit_job, $(3))
|
$(call submit_job, $(3))
|
||||||
|
@ -75,6 +80,7 @@ create-cluster: upload-nodeprep-scripts
|
||||||
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
|
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
|
||||||
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
|
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
|
||||||
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
|
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
|
||||||
|
submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
|
||||||
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
|
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
|
||||||
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
|
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
|
||||||
|
|
||||||
|
@ -139,4 +145,17 @@ submit-cntk4:
|
||||||
$(call submit_cntk,1,$(PROCESSES_PER_NODE),cntk-4)
|
$(call submit_cntk,1,$(PROCESSES_PER_NODE),cntk-4)
|
||||||
|
|
||||||
submit-cntk-local:
|
submit-cntk-local:
|
||||||
$(call submit_cntk_local,1,cntk-local)
|
$(call submit_cntk_local,1,cntk-local)
|
||||||
|
|
||||||
|
|
||||||
|
submit-pytorch_gloo32:
|
||||||
|
$(call submit_pytorch_gloo,8,$(PROCESSES_PER_NODE),pytorch_gloo-32)
|
||||||
|
|
||||||
|
submit-pytorch_gloo16:
|
||||||
|
$(call submit_pytorch_gloo,4,$(PROCESSES_PER_NODE),pytorch_gloo-16)
|
||||||
|
|
||||||
|
submit-pytorch_gloo8:
|
||||||
|
$(call submit_pytorch_gloo,2,$(PROCESSES_PER_NODE),pytorch_gloo-8)
|
||||||
|
|
||||||
|
submit-pytorch_gloo4:
|
||||||
|
$(call submit_pytorch_gloo,1,$(PROCESSES_PER_NODE),pytorch_gloo-4)
|
||||||
|
|
|
@ -51,6 +51,16 @@ define generate_job_local
|
||||||
endef
|
endef
|
||||||
|
|
||||||
|
|
||||||
|
define generate_job_gloo
|
||||||
|
python ../generate_job_spec.py $(1) gloo \
|
||||||
|
$(2) \
|
||||||
|
--filename job.json \
|
||||||
|
--node_count $(3) \
|
||||||
|
--ppn $(4) \
|
||||||
|
$(5)
|
||||||
|
endef
|
||||||
|
|
||||||
|
|
||||||
define stream_stdout
|
define stream_stdout
|
||||||
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) \
|
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) \
|
||||||
--j $(1) --output-directory-id stdouterr -f stdout.txt
|
--j $(1) --output-directory-id stdouterr -f stdout.txt
|
||||||
|
@ -113,6 +123,7 @@ upload-scripts: set-storage
|
||||||
$(call upload_script, ../../HorovodPytorch/src/imagenet_pytorch_horovod.py)
|
$(call upload_script, ../../HorovodPytorch/src/imagenet_pytorch_horovod.py)
|
||||||
$(call upload_script, ../../CNTK/src/imagenet_cntk.py)
|
$(call upload_script, ../../CNTK/src/imagenet_cntk.py)
|
||||||
$(call upload_script, ../../CNTK/src/resnet_models.py)
|
$(call upload_script, ../../CNTK/src/resnet_models.py)
|
||||||
|
$(call upload_script, ../../Pytorch/src/imagenet_pytorch_gloo.py)
|
||||||
$(call upload_script, ../../common/timer.py)
|
$(call upload_script, ../../common/timer.py)
|
||||||
|
|
||||||
upload-nodeprep-scripts: set-storage
|
upload-nodeprep-scripts: set-storage
|
||||||
|
@ -160,7 +171,7 @@ delete: delete-cluster
|
||||||
az group delete --name ${GROUP_NAME} -y
|
az group delete --name ${GROUP_NAME} -y
|
||||||
|
|
||||||
|
|
||||||
setup: select-subscription create-resource-group create-workspace create-storage set-storage set-az-defaults create-fileshare create-cluster list-clusters
|
setup: select-subscription create-resource-group create-workspace create-storage set-storage set-az-defaults create-fileshare create-directory upload-scripts create-cluster list-clusters create-experiment
|
||||||
@echo "Cluster created"
|
@echo "Cluster created"
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -169,6 +180,7 @@ setup: select-subscription create-resource-group create-workspace create-storage
|
||||||
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
|
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
|
||||||
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
|
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
|
||||||
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
|
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
|
||||||
|
submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
|
||||||
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
|
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
|
||||||
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
|
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
|
||||||
|
|
||||||
|
@ -191,6 +203,11 @@ clean-jobs:
|
||||||
$(call delete_job, pytorch-16)
|
$(call delete_job, pytorch-16)
|
||||||
$(call delete_job, pytorch-32)
|
$(call delete_job, pytorch-32)
|
||||||
|
|
||||||
|
$(call delete_job, pytorch_gloo-4)
|
||||||
|
$(call delete_job, pytorch_gloo-8)
|
||||||
|
$(call delete_job, pytorch_gloo-16)
|
||||||
|
$(call delete_job, pytorch_gloo-32)
|
||||||
|
|
||||||
$(call delete_job, cntk-local)
|
$(call delete_job, cntk-local)
|
||||||
$(call delete_job, cntk-4)
|
$(call delete_job, cntk-4)
|
||||||
$(call delete_job, cntk-8)
|
$(call delete_job, cntk-8)
|
||||||
|
@ -198,6 +215,7 @@ clean-jobs:
|
||||||
$(call delete_job, cntk-32)
|
$(call delete_job, cntk-32)
|
||||||
|
|
||||||
####### Gather Results ######
|
####### Gather Results ######
|
||||||
|
# TODO for PyTorch_Gloo
|
||||||
|
|
||||||
gather-results:results.json
|
gather-results:results.json
|
||||||
@echo "All results gathered"
|
@echo "All results gathered"
|
||||||
|
@ -205,6 +223,9 @@ gather-results:results.json
|
||||||
results.json: pytorch_1gpulocal_$(GPU_TYPE)_local.results pytorch_4gpuopen_$(GPU_TYPE)_open.results \
|
results.json: pytorch_1gpulocal_$(GPU_TYPE)_local.results pytorch_4gpuopen_$(GPU_TYPE)_open.results \
|
||||||
pytorch_8gpuopen_$(GPU_TYPE)_open.results pytorch_16gpuopen_$(GPU_TYPE)_open.results \
|
pytorch_8gpuopen_$(GPU_TYPE)_open.results pytorch_16gpuopen_$(GPU_TYPE)_open.results \
|
||||||
pytorch_32gpuopen_$(GPU_TYPE)_open.results \
|
pytorch_32gpuopen_$(GPU_TYPE)_open.results \
|
||||||
|
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results \
|
||||||
|
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results \
|
||||||
|
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results \
|
||||||
tf_1gpulocal_$(GPU_TYPE)_local.results tf_4gpuintel_$(GPU_TYPE)_intel.results \
|
tf_1gpulocal_$(GPU_TYPE)_local.results tf_4gpuintel_$(GPU_TYPE)_intel.results \
|
||||||
tf_8gpuintel_$(GPU_TYPE)_intel.results tf_16gpuintel_$(GPU_TYPE)_intel.results \
|
tf_8gpuintel_$(GPU_TYPE)_intel.results tf_16gpuintel_$(GPU_TYPE)_intel.results \
|
||||||
tf_32gpuintel_$(GPU_TYPE)_intel.results \
|
tf_32gpuintel_$(GPU_TYPE)_intel.results \
|
||||||
|
@ -233,7 +254,20 @@ pytorch_32gpuopen_$(GPU_TYPE)_open.results:
|
||||||
$(call stream_stdout, pytorch-32)>pytorch_32gpuopen_$(GPU_TYPE)_open.results
|
$(call stream_stdout, pytorch-32)>pytorch_32gpuopen_$(GPU_TYPE)_open.results
|
||||||
|
|
||||||
|
|
||||||
|
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results:
|
||||||
|
$(call stream_stdout, pytorch_gloo-local)>pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results
|
||||||
|
|
||||||
|
pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results:
|
||||||
|
$(call stream_stdout, pytorch_gloo-4)>pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results
|
||||||
|
|
||||||
|
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results:
|
||||||
|
$(call stream_stdout, pytorch_gloo-8)>pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results
|
||||||
|
|
||||||
|
pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results:
|
||||||
|
$(call stream_stdout, pytorch_gloo-16)>pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results
|
||||||
|
|
||||||
|
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results:
|
||||||
|
$(call stream_stdout, pytorch_gloo-32)>pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results
|
||||||
|
|
||||||
tf_1gpulocal_$(GPU_TYPE)_local.results:
|
tf_1gpulocal_$(GPU_TYPE)_local.results:
|
||||||
$(call stream_stdout, tf-local)>tf_1gpulocal_$(GPU_TYPE)_local.results
|
$(call stream_stdout, tf-local)>tf_1gpulocal_$(GPU_TYPE)_local.results
|
||||||
|
|
Загрузка…
Ссылка в новой задаче