Adds initial version of template (#33)

This commit is contained in:
Mat 2019-05-21 13:49:50 +01:00 committed by GitHub
Parent 3a99ab8e35
Commit b3c31b6e22
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
110 changed files with 58298 additions and 35951 deletions

.gitignore vendored
View File

@@ -8,7 +8,6 @@ __pycache__/
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
@@ -24,6 +23,7 @@ wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
@@ -45,6 +45,7 @@ nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
@@ -53,6 +54,7 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
@@ -79,13 +81,14 @@ celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
# Environments
.env
# virtualenv
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
@@ -100,8 +103,9 @@ ENV/
# mypy
.mypy_cache/
# Pycharm
.idea/
*/TensorFlow_benchmark/src/*
#################
job.json
*/.vscode/*
.vscode/settings.json
.dev_env

View File

@@ -1,73 +0,0 @@
# Ubuntu 16.04, CUDA 9.0
FROM nvidia/cuda:9.0-runtime-ubuntu16.04
ARG CNTK_VERSION="2.5.1"
LABEL maintainer "MICROSOFT CORPORATION" \
com.microsoft.cntk.version="$CNTK_VERSION"
ENV CNTK_VERSION="$CNTK_VERSION"
# Install CNTK as the default backend for Keras
ENV KERAS_BACKEND=cntk
RUN apt-get update && apt-get install -y --no-install-recommends \
# General
ca-certificates \
wget \
sudo \
build-essential \
openssh-client \
openssh-server \
&& \
# Clean-up
apt-get -y autoremove \
&& \
rm -rf /var/lib/apt/lists/*
# Get CNTK Binary Distribution
RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
([ "$CNTK_VERSION" != "2.4" ] || VERIFY_SHA256="true") && \
CNTK_SHA256="8eebff81ef4111b2be5804303f1254cd20de5911a7678c8e64689e5c288dde40" && \
wget -q https://cntk.ai/BinaryDrop/CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
([ "$VERIFY_SHA256" != "true" ] || (echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict -)) && \
tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
/bin/bash /cntk/Scripts/install/linux/install-cntk.sh --py-version 35 --docker
WORKDIR /root
ENV PATH /cntk/cntk/bin:/root/anaconda3/envs/cntk-py35/bin:$PATH
ENV LD_LIBRARY_PATH /cntk/cntk/lib:/cntk/cntk/dependencies/lib:$LD_LIBRARY_PATH
# Install Open MPI
RUN mkdir /tmp/openmpi && \
cd /tmp/openmpi && \
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
tar zxf openmpi-3.0.0.tar.gz && \
cd openmpi-3.0.0 && \
./configure --enable-orterun-prefix-by-default && \
make -j $(nproc) all && \
make install && \
ldconfig && \
rm -rf /tmp/openmpi
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
chmod a+x /usr/local/bin/mpirun
# Configure OpenMPI with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

View File

@@ -1,9 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
image-open:=hoaphumanoid/cntk:distributed-openmpi3
open-path:=$(PWD)/Docker/cntk
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py
include ../include/build.mk

View File

@@ -1,309 +0,0 @@
"""
Trains ResNet50 using CNTK.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_OUTPUT_MODEL
This code is based on this example:
https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py
"""
from __future__ import print_function
import os
import numpy as np
import cntk as C
from cntk import input, cross_entropy_with_softmax, classification_error, Trainer, cntk_py
from cntk import data_parallel_distributed_learner, Communicator
from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule, UnitType
from cntk.io import UserMinibatchSource, StreamInformation, MinibatchData
from cntk.train.training_session import *
from cntk.debugging import *
from cntk.logging import *
import cntk.io.transforms as xforms
from resnet_models import create_imagenet_model_bottleneck
from sklearn.preprocessing import OneHotEncoder
import logging
logger = logging.getLogger(__name__)
def _str_to_bool(in_str):
if 't' in in_str.lower():
return True
else:
return False
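# Note: this heuristic treats any string containing the letter 't' as True,
# e.g. _str_to_bool('True') -> True and _str_to_bool('not set') -> True,
# while _str_to_bool('false') -> False.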
# model dimensions
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 32
_MOMENTUM = 0.9
_NUMCLASSES = 1000
_MODELNAME = 'ResNet_ImageNet.model'
_NUMQUANTIZEDBITS = 32
_WD = 0.0001
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
# How much fake data to simulate, default to size of imagenet dataset
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167))
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
def _get_progress_printer():
pp = ProgressPrinter(
freq=100,
tag='Training',
log_to_file=None,
rank=Communicator.rank(),
gen_heartbeat=False,
num_epochs=_EPOCHS)
return pp
def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
if not os.path.exists(map_file) or not os.path.exists(mean_file):
raise RuntimeError("File '%s' or '%s' does not exist." %
(map_file, mean_file))
# transformation pipeline for the features has jitter/crop only when training
transforms = []
if train:
transforms += [
xforms.crop(crop_type='randomarea',
area_ratio=(0.08, 1.0),
aspect_ratio=(0.75, 1.3333),
jitter_type='uniratio')
]
else:
transforms += [
# test has no jitter
C.io.transforms.crop(crop_type='center', side_ratio=0.875)
]
transforms += [
xforms.scale(width=_WIDTH, height=_HEIGHT,
channels=_CHANNELS, interpolations='cubic'),
xforms.mean(mean_file)
]
# deserializer
return C.io.MinibatchSource(
C.io.ImageDeserializer(map_file, C.io.StreamDefs(
# 1st col in mapfile referred to as 'image'
features=C.io.StreamDef(field='image', transforms=transforms),
labels=C.io.StreamDef(field='label', shape=_NUMCLASSES))), # and second as 'label'
randomize=train,
max_samples=total_number_of_samples,
multithreaded_deserializer=True)
class FakeDataSource(UserMinibatchSource):
"""Fake data source
https://cntk.ai/pythondocs/Manual_How_to_create_user_minibatch_sources.html
"""
def __init__(self, total_n_images, dim, channels, n_classes, seed=42):
self.dim = dim
self.total_n_images = total_n_images
self.channels = channels
self.n_classes = n_classes
self.seed = seed
self.fsi = StreamInformation(name='features', stream_id=0, storage_format='dense',
dtype=np.float32, shape=(self.channels, self.dim[0], self.dim[0],))
self.lsi = StreamInformation(
name='labels', stream_id=1, storage_format='dense', dtype=np.float32, shape=(self.n_classes,))
self.sample_count = 0
self.next_seq_idx = 0
super(FakeDataSource, self).__init__()
def stream_infos(self):
"""
Override the stream_infos method of the base UserMinibatchSource class
to provide stream meta information.
"""
return [self.fsi, self.lsi]
def next_minibatch(self, num_samples, number_of_workers=1, worker_rank=0, device=None):
"""
Override the next_minibatch method of the base UserMinibatchSource class
to provide minibatch data.
"""
np.random.seed(self.seed)
x = np.random.rand(num_samples, self.channels,
self.dim[0], self.dim[1]).astype(np.float32)
y = np.random.choice(self.n_classes, num_samples)
y = np.expand_dims(y, axis=-1)
enc = OneHotEncoder(n_values=self.n_classes, dtype=np.float32,
categorical_features='all')
fit = enc.fit(y)
y = fit.transform(y).toarray()
if self.sample_count + num_samples <= self.total_n_images:
self.sample_count += num_samples
self.next_seq_idx += num_samples
feature_data = C.Value(batch=x, device=device)
label_data = C.Value(batch=y, device=device)
res = {
self.fsi: MinibatchData(feature_data, num_samples, num_samples, False),
self.lsi: MinibatchData(label_data, num_samples, num_samples, False)
}
else:
res = {}
return res
def get_checkpoint_state(self):
return {'next_seq_idx': self.next_seq_idx}
def restore_from_checkpoint(self, state):
self.next_seq_idx = state['next_seq_idx']
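# A minimal usage sketch (hypothetical, for illustration only):
#   source = FakeDataSource(total_n_images=64, dim=(_HEIGHT, _WIDTH),
#                           channels=_CHANNELS, n_classes=_NUMCLASSES)
#   mb = source.next_minibatch(32)
#   features, labels = mb[source.fsi], mb[source.lsi]  # (32, 3, 224, 224) / (32, 1000)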
def model_fn():
# Input variables denoting the features and label data
graph_input = C.input_variable((_CHANNELS, _HEIGHT, _WIDTH))
graph_label = C.input_variable((_NUMCLASSES))
with C.default_options(dtype=np.float32):
stride1x1 = (1, 1)
stride3x3 = (2, 2)
# create model, and configure learning parameters for ResNet50
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2],
_NUMCLASSES, stride1x1, stride3x3)
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
errs = classification_error(z, graph_label, topN=1)
return {
'name': 'resnet50',
'feature': graph_input,
'label': graph_label,
'ce': ce,
'errs': errs,
'output': z
}
# Create trainer
def create_trainer(network, minibatch_size, epoch_size,
learning_rate, momentum, l2_reg_weight,
num_quantization_bits):
lr_per_mb = [learning_rate]
# Set learning parameters
lr_schedule = learning_rate_schedule(
lr_per_mb, epoch_size=epoch_size, unit=UnitType.minibatch)
mm_schedule = momentum_schedule(momentum)
local_learner = momentum_sgd(network['output'].parameters,
lr_schedule,
mm_schedule,
l2_regularization_weight=l2_reg_weight)
# learner object
if _DISTRIBUTED:
learner = data_parallel_distributed_learner(
local_learner,
num_quantization_bits=num_quantization_bits,
distributed_after=0)
else:
learner = local_learner
# logger
progress_printer = _get_progress_printer()
return Trainer(network['output'], (network['ce'], network['errs']), learner, progress_printer)
def train_and_test(network, trainer, train_source, test_source, minibatch_size,
epoch_size, model_path):
# define mapping from input streams to network inputs
input_map = {
network['feature']: train_source.streams.features,
network['label']: train_source.streams.labels
}
if _DISTRIBUTED:
start_profiler(sync_gpu=True)
training_session(
trainer=trainer,
mb_source=train_source,
mb_size=minibatch_size,
model_inputs_to_streams=input_map,
progress_frequency=epoch_size,
checkpoint_config=CheckpointConfig(frequency=epoch_size,
filename=os.path.join(
model_path, _MODELNAME),
restore=False) # ,
# test_config=TestConfig(test_source, minibatch_size)
).train()
if _DISTRIBUTED:
stop_profiler()
def main():
model_path = os.getenv('AZ_BATCHAI_OUTPUT_MODEL')
if _DISTRIBUTED:
minibatch_size = _BATCHSIZE * Communicator.num_workers()
else:
minibatch_size = _BATCHSIZE
logger.info("Creating model ...")
network = model_fn()
logger.info("Creating trainer ...")
trainer = create_trainer(network,
minibatch_size,
_DATA_LENGTH,
learning_rate=_LR,
momentum=_MOMENTUM,
l2_reg_weight=_WD,
num_quantization_bits=_NUMQUANTIZEDBITS)
logger.info('Creating data sources ...')
if _FAKE:
logger.info("Using {} images of fake data".format(_DATA_LENGTH))
train_source = FakeDataSource(total_n_images=_DATA_LENGTH,
dim=(_HEIGHT, _WIDTH),
channels=_CHANNELS,
n_classes=_NUMCLASSES)
test_source = None
else:
logging.info(
"Using ImageNet dataset with {} images".format(_DATA_LENGTH))
data_path = os.getenv('AZ_BATCHAI_INPUT_TRAIN')
logger.info("model_path: {}".format(model_path))
logger.info("data_path: {}".format(data_path))
mean_data = os.path.join(data_path, 'ImageNet1K_mean.xml')
train_data = os.path.join(data_path, 'train_map.txt')
test_data = os.path.join(data_path, 'val_map.txt')
train_source = create_image_mb_source(
train_data, mean_data, train=True, total_number_of_samples=_EPOCHS*_DATA_LENGTH)
test_source = create_image_mb_source(
test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
logger.info("Training with minibatch size of {}".format(minibatch_size))
train_and_test(network, trainer, train_source, test_source,
minibatch_size, _DATA_LENGTH, model_path)
if _DISTRIBUTED:
# Must call MPI finalize when the process exits without exceptions
Communicator.finalize()
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logger.info("Starting routine. Distributed mode={}".format(_DISTRIBUTED))
main()
logger.info("Routine finished")

View File

@@ -1,137 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import numpy as np
from cntk.initializer import he_normal, normal
from cntk.layers import AveragePooling, MaxPooling, BatchNormalization, Convolution, Dense
from cntk.ops import element_times, relu
#
# assembly components
#
def conv_bn(input, filter_size, num_filters, strides=(1, 1), init=he_normal(), bn_init_scale=1):
c = Convolution(filter_size, num_filters, activation=None, init=init, pad=True, strides=strides, bias=False)(input)
r = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=False, init_scale=bn_init_scale, disable_regularization=True)(c)
return r
def conv_bn_relu(input, filter_size, num_filters, strides=(1, 1), init=he_normal()):
r = conv_bn(input, filter_size, num_filters, strides, init, 1)
return relu(r)
#
# ResNet components
#
def resnet_basic(input, num_filters):
c1 = conv_bn_relu(input, (3, 3), num_filters)
c2 = conv_bn(c1, (3, 3), num_filters, bn_init_scale=1)
p = c2 + input
return relu(p)
def resnet_basic_inc(input, num_filters, strides=(2, 2)):
c1 = conv_bn_relu(input, (3, 3), num_filters, strides)
c2 = conv_bn(c1, (3, 3), num_filters, bn_init_scale=1)
s = conv_bn(input, (1, 1), num_filters, strides) # Shortcut
p = c2 + s
return relu(p)
def resnet_basic_stack(input, num_stack_layers, num_filters):
assert(num_stack_layers >= 0)
l = input
for _ in range(num_stack_layers):
l = resnet_basic(l, num_filters)
return l
def resnet_bottleneck(input, out_num_filters, inter_out_num_filters):
c1 = conv_bn_relu(input, (1, 1), inter_out_num_filters)
c2 = conv_bn_relu(c1, (3, 3), inter_out_num_filters)
c3 = conv_bn(c2, (1, 1), out_num_filters, bn_init_scale=0)
p = c3 + input
return relu(p)
def resnet_bottleneck_inc(input, out_num_filters, inter_out_num_filters, stride1x1, stride3x3):
c1 = conv_bn_relu(input, (1, 1), inter_out_num_filters, strides=stride1x1)
c2 = conv_bn_relu(c1, (3, 3), inter_out_num_filters, strides=stride3x3)
c3 = conv_bn(c2, (1, 1), out_num_filters, bn_init_scale=0)
stride = np.multiply(stride1x1, stride3x3)
s = conv_bn(input, (1, 1), out_num_filters, strides=stride) # Shortcut
p = c3 + s
return relu(p)
def resnet_bottleneck_stack(input, num_stack_layers, out_num_filters, inter_out_num_filters):
assert(num_stack_layers >= 0)
l = input
for _ in range(num_stack_layers):
l = resnet_bottleneck(l, out_num_filters, inter_out_num_filters)
return l
#
# Defines the residual network model for classifying images
#
def create_cifar10_model(input, num_stack_layers, num_classes):
c_map = [16, 32, 64]
conv = conv_bn_relu(input, (3, 3), c_map[0])
r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])
r2_1 = resnet_basic_inc(r1, c_map[1])
r2_2 = resnet_basic_stack(r2_1, num_stack_layers-1, c_map[1])
r3_1 = resnet_basic_inc(r2_2, c_map[2])
r3_2 = resnet_basic_stack(r3_1, num_stack_layers-1, c_map[2])
# Global average pooling and output
pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(r3_2)
z = Dense(num_classes, init=normal(0.01))(pool)
return z
def create_imagenet_model_basic(input, num_stack_layers, num_classes):
c_map = [64, 128, 256, 512]
conv = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
pool1 = MaxPooling((3, 3), strides=(2, 2), pad=True)(conv)
r1 = resnet_basic_stack(pool1, num_stack_layers[0], c_map[0])
r2_1 = resnet_basic_inc(r1, c_map[1])
r2_2 = resnet_basic_stack(r2_1, num_stack_layers[1], c_map[1])
r3_1 = resnet_basic_inc(r2_2, c_map[2])
r3_2 = resnet_basic_stack(r3_1, num_stack_layers[2], c_map[2])
r4_1 = resnet_basic_inc(r3_2, c_map[3])
r4_2 = resnet_basic_stack(r4_1, num_stack_layers[3], c_map[3])
# Global average pooling and output
pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r4_2)
z = Dense(num_classes, init=normal(0.01))(pool)
return z
def create_imagenet_model_bottleneck(input, num_stack_layers, num_classes, stride1x1, stride3x3):
c_map = [64, 128, 256, 512, 1024, 2048]
# conv1 and max pooling
conv1 = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
pool1 = MaxPooling((3,3), strides=(2,2), pad=True)(conv1)
# conv2_x
r2_1 = resnet_bottleneck_inc(pool1, c_map[2], c_map[0], (1, 1), (1, 1))
r2_2 = resnet_bottleneck_stack(r2_1, num_stack_layers[0], c_map[2], c_map[0])
# conv3_x
r3_1 = resnet_bottleneck_inc(r2_2, c_map[3], c_map[1], stride1x1, stride3x3)
r3_2 = resnet_bottleneck_stack(r3_1, num_stack_layers[1], c_map[3], c_map[1])
# conv4_x
r4_1 = resnet_bottleneck_inc(r3_2, c_map[4], c_map[2], stride1x1, stride3x3)
r4_2 = resnet_bottleneck_stack(r4_1, num_stack_layers[2], c_map[4], c_map[2])
# conv5_x
r5_1 = resnet_bottleneck_inc(r4_2, c_map[5], c_map[3], stride1x1, stride3x3)
r5_2 = resnet_bottleneck_stack(r5_1, num_stack_layers[3], c_map[5], c_map[3])
# Global average pooling and output
pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r5_2)
z = Dense(num_classes, init=normal(0.01))(pool)
return z

View File

@@ -1,48 +0,0 @@
FROM ubuntu:16.04
COPY environment.yml .
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
vim \
wget \
curl \
gfortran \
apt-transport-https \
jq \
locales \
git \
openssh-client && \
rm -rf /var/lib/apt/lists/*
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV ENV_NAME=py3.6
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH
# Install Azure CLI
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
tee /etc/apt/sources.list.d/azure-cli.list && \
curl -L https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \
apt-get update && \
apt-get install -y --no-install-recommends \
azure-cli
# Install AzCopy
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\
cp ./azure.list /etc/apt/sources.list.d/ &&\
apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\
apt-get update &&\
apt-get install -y --no-install-recommends azcopy

View File

@@ -1,16 +0,0 @@
channels:
- conda-forge
dependencies:
- python=3.6
- numpy
- pyyaml
- scipy
- ipython
- pandas
- jupyter
- ipykernel
- scikit-learn
- selenium
- phantomjs
- pillow
- bokeh=0.13.0

View File

@@ -1,63 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV PYTHON_VERSION=3.5
ENV NCCL_VERSION=2.2.12-1+cuda9.0
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
cpio \
git \
curl \
wget \
ca-certificates \
libdapl2 \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
libmlx4-1 \
libsm6 \
libxext6 \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
# install intel MPI
RUN cd /tmp && \
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
tar zxvf l_mpi_2017.3.196.tgz && \
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
/tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
cd /tmp/l_mpi_2017.3.196 && \
./install.sh -s silent.cfg && \
cd .. && \
rm -rf l_mpi_2017.3.196* && \
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow and Keras
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
scikit-learn keras pillow
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig

View File

@@ -1,79 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
nano \
wget \
ca-certificates \
libcudnn7=$CUDNN_VERSION \
libnccl2=$NCCL_VERSION \
libnccl-dev=$NCCL_VERSION \
libjpeg-dev \
libpng-dev \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn keras pillow
# Install Open MPI
RUN mkdir /tmp/openmpi && \
cd /tmp/openmpi && \
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
tar zxf openmpi-3.0.0.tar.gz && \
cd openmpi-3.0.0 && \
./configure --enable-orterun-prefix-by-default && \
make -j $(nproc) all && \
make install && \
ldconfig && \
rm -rf /tmp/openmpi
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
chmod a+x /usr/local/bin/mpirun
# Configure OpenMPI with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
WORKDIR "/examples"

View File

@@ -1,13 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=masalvar
tag:=9-1.8-.13.2 # Cuda - TF version - Horovod version
image-intel:=$(name_prefix)/horovod-intel-keras:$(tag)
intel-path:=$(PWD)/Docker/horovod-intel
image-open:=$(name_prefix)/horovod-keras:$(tag)
open-path:=$(PWD)/Docker/horovod
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py
include ../include/build.mk

View File

@@ -1,53 +0,0 @@
import numpy as np
import keras
import logging
def _get_logger():
return logging.getLogger(__name__)
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches,
dim[0],
dim[1],
channels).astype(np.float32)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
class FakeDataGenerator(keras.preprocessing.image.Iterator):
def __init__(self,
batch_size=32,
num_batches=20,
dim=(224, 224),
n_channels=3,
n_classes=10,
length=1000,
shuffle=True,
seed=42):
'Initialization'
super(FakeDataGenerator, self).__init__(length,
batch_size,
shuffle,
seed)
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.num_batches = num_batches
self._data = _create_data(self.batch_size, self.num_batches, self.dim, self.n_channels)
self._labels = _create_labels(self.batch_size, self.num_batches, self.n_classes)
self.translation_index = np.random.choice(len(self._labels), length)
def _get_batches_of_transformed_samples(self, index_array):
logger = _get_logger()
logger.debug('Retrieving samples')
logger.debug(str(index_array))
tr_index_array = self.translation_index[index_array]
return self._data[tr_index_array], keras.utils.to_categorical(self._labels[tr_index_array], num_classes=self.n_classes)
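# A minimal usage sketch (hypothetical): the generator behaves like any other
# Keras image iterator.
#   gen = FakeDataGenerator(batch_size=8, n_classes=1000, length=32)
#   x, y = next(gen)  # x: (8, 224, 224, 3) float32, y: (8, 1000) one-hot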

View File

@@ -1,314 +0,0 @@
"""
Trains ResNet50 in Keras using Horovod.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import sys
from functools import lru_cache
from data_generator import FakeDataGenerator
from timer import Timer
import keras
from keras import backend as K
from keras.preprocessing import image
import tensorflow as tf
import os
def _str_to_bool(in_str):
if 't' in in_str.lower():
return True
else:
return False
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 64
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
# Settings from https://arxiv.org/abs/1706.02677.
_WARMUP_EPOCHS = 5
_WEIGHT_DECAY = 0.00005
_NUM_WORKERS=int(os.getenv('NUM_WORKERS', 10))
_MAX_QUEUE_SIZE=int(os.getenv('MAX_QUEUE_SIZE', 10))
_MULTIPROCESSING=_str_to_bool(os.getenv('MULTIPROCESSING', 'False'))
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
_VALIDATION = _str_to_bool(os.getenv('VALIDATION', 'False'))
if _DISTRIBUTED:
import horovod.keras as hvd
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch=''
self._gpu_rank=0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch='[Epoch {}]'.format(epoch)
def process(self, msg, kwargs):
kwargs['extra'] = {
'gpurank': _get_rank(),
'epoch': self._str_epoch
}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _create_model():
logger = _get_logger()
logger.info('Creating model')
# Set up standard ResNet-50 model.
model = keras.applications.resnet50.ResNet50(weights=None)
# ResNet-50 model that is included with Keras is optimized for inference.
# Add L2 weight decay & adjust BN settings.
model_config = model.get_config()
for layer, layer_config in zip(model.layers, model_config['layers']):
if hasattr(layer, 'kernel_regularizer'):
regularizer = keras.regularizers.l2(_WEIGHT_DECAY)
layer_config['config']['kernel_regularizer'] = \
{'class_name': regularizer.__class__.__name__,
'config': regularizer.get_config()}
if type(layer) == keras.layers.BatchNormalization:
layer_config['config']['momentum'] = 0.9
layer_config['config']['epsilon'] = 1e-5
model = keras.models.Model.from_config(model_config)
return model
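# A quick sanity check (hypothetical): rebuilding the model from the edited
# config is what actually attaches the regularizers, e.g.
#   m = _create_model()
#   assert any(getattr(l, 'kernel_regularizer', None) for l in m.layers)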
def _validation_data_iterator_from():
# Validation data iterator.
raise NotImplementedError('The flow_from_directory method expects data to be in directories and this is not implemented yet')
# test_gen = image.ImageDataGenerator(
# zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
# test_iter = test_gen.flow_from_directory(os.path.join(os.getenv('AZ_BATCHAI_INPUT_TEST'), 'validation'), batch_size=_BATCHSIZE,
# target_size=(224, 224))
# return test_iter
def _training_data_iterator_from():
# Training data iterator.
train_gen = image.ImageDataGenerator(
width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(os.path.join(os.getenv('AZ_BATCHAI_INPUT_TRAIN'), 'train'), batch_size=_BATCHSIZE,
target_size=(224, 224))
return train_iter
def _fake_data_iterator_from(length=_DATA_LENGTH):
return FakeDataGenerator(batch_size=_BATCHSIZE, n_classes=1000, length=length)
def _get_optimizer(params, is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.SGD(lr=params['learning_rate'] * hvd.size(), momentum=params['momentum'])
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(opt)
else:
return keras.optimizers.SGD(lr=params['learning_rate'], momentum=params['momentum'])
def _get_runconfig(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
return config
def _get_model_dir(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else os.getenv('AZ_BATCHAI_JOB_TEMP_DIR')
else:
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL')
def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
logger = _get_logger()
if is_distributed:
logger.info('Rank: {} Cluster Size {}'.format(hvd.local_rank(), hvd.size()))
return [
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
# Horovod: average metrics among workers at the end of every epoch.
#
# Note: This callback must be in the list before the ReduceLROnPlateau,
# TensorBoard, or other metrics-based callbacks.
hvd.callbacks.MetricAverageCallback(),
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=_WARMUP_EPOCHS, verbose=verbose),
# Horovod: after the warmup, reduce the learning rate by a factor of 10 at epochs 30, 60 and 80.
hvd.callbacks.LearningRateScheduleCallback(start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.),
hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
]
else:
return []
class LoggerCallback(keras.callbacks.Callback):
def __init__(self, logger, data_length):
self._timer = Timer(output=logger.info, prefix="Epoch duration: ", fmt="{:.3f} seconds")
self._data_length=data_length
def on_epoch_begin(self, epoch, logs):
logger = _get_logger()
logger.set_epoch(epoch)
self._timer.start()
def on_epoch_end(self, epoch, logs):
duration = self._timer.elapsed
_log_summary(self._data_length, duration)
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info('Data length: {}'.format(data_length))
logger.info('Total duration: {:.3f}'.format(duration))
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE, hvd.size()*_BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
logger.info('Num GPUs: {:.3f}'.format(hvd.size() if _DISTRIBUTED else 1))
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
def main():
verbose=1
logger = _get_logger()
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Runnin Distributed")
verbose = 1 if hvd.rank() == 0 else 0
logger.info("Tensorflow version {}".format(tf.__version__))
K.set_session(tf.Session(config=_get_runconfig()))
# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = 0
if _DISTRIBUTED:
resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
if _FAKE:
train_iter = _fake_data_iterator_from()
else:
train_iter = _training_data_iterator_from()
test_iter = _validation_data_iterator_from() if _VALIDATION else None
model = _create_model()
params = {
'learning_rate':_LR,
'momentum': 0.9
}
opt = _get_optimizer(params)
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=opt,
metrics=['accuracy', 'top_k_categorical_accuracy'])
model_dir = _get_model_dir()
checkpoint_format = os.path.join(model_dir, 'checkpoint-{epoch}.h5')
callbacks = _get_hooks()
callbacks.append(LoggerCallback(logger, len(train_iter)*_BATCHSIZE))
# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if _is_master():
callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
# callbacks.append(keras.callbacks.TensorBoard(log_dir))
# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast weights to other workers.
if resume_from_epoch > 0 and _is_master():
model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))
logger.info('Training...')
# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
num_workers = hvd.size() if _DISTRIBUTED else 1
model.fit_generator(train_iter,
steps_per_epoch=len(train_iter) // num_workers,
callbacks=callbacks,
epochs=_EPOCHS,
verbose=verbose,
workers=_NUM_WORKERS,
max_queue_size=_MAX_QUEUE_SIZE,
use_multiprocessing=_MULTIPROCESSING,
initial_epoch=resume_from_epoch)
if _FAKE is False and _VALIDATION:
# Evaluate the model on the full data set.
with Timer(output=logger.info, prefix="Testing"):
logger.info('Testing...')
score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=10))
if verbose:
print('Test loss:', score[0])
print('Test accuracy:', score[1])
if __name__ == '__main__':
main()

View File

@@ -1,69 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# PyTorch version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV PYTORCH_VERSION=0.4.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
cpio \
git \
curl \
wget \
ca-certificates \
libdapl2 \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
libmlx4-1 \
libsm6 \
libxext6 \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev
# install intel MPI
RUN cd /tmp && \
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
tar zxvf l_mpi_2017.3.196.tgz && \
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
cd /tmp/l_mpi_2017.3.196 && \
./install.sh -s silent.cfg && \
cd .. && \
rm -rf l_mpi_2017.3.196* && \
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
## Install PyTorch
#RUN PY=$(echo ${PYTHON_VERSION} | sed s/\\.//); \
# if [[ ${PYTHON_VERSION} == 3* ]]; then \
# pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp${PY}-cp${PY}m-linux_x86_64.whl; \
# else \
# pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp${PY}-cp${PY}mu-linux_x86_64.whl; \
# fi; \
# Install PyTorch
RUN pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp35-cp35m-linux_x86_64.whl && \
pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.8 && \
ldconfig

View File

@@ -1,82 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV PYTORCH_VERSION=0.4.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
vim \
wget \
ca-certificates \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install PyTorch
RUN pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp35-cp35m-linux_x86_64.whl && \
pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
# Install Open MPI
RUN mkdir /tmp/openmpi && \
cd /tmp/openmpi && \
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
tar zxf openmpi-3.0.0.tar.gz && \
cd openmpi-3.0.0 && \
./configure --enable-orterun-prefix-by-default && \
make -j $(nproc) all && \
make install && \
ldconfig && \
rm -rf /tmp/openmpi
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
chmod a+x /usr/local/bin/mpirun
# Configure OpenMPI with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf #&& \
# echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
WORKDIR "/examples"

View File

@@ -1,13 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=masalvar
tag:=9-1.8-.13.2 # Cuda - TF version - Horovod version
image-intel:=$(name_prefix)/horovod-intel-pytorch:$(tag)
intel-path:=$(PWD)/Docker/horovod-intel
image-open:=$(name_prefix)/horovod-pytorch:$(tag)
open-path:=$(PWD)/Docker/horovod
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py
include ../include/build.mk

View File

@@ -1,328 +0,0 @@
"""
Trains ResNet50 in PyTorch using Horovod.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import sys
from functools import lru_cache
from timer import Timer
import numpy as np
import os
from PIL import Image
import torch.optim as optim
from torchvision import transforms
import torch.utils.data.distributed
import torch.backends.cudnn as cudnn
import torchvision.models as models
from os import path
import pandas as pd
from torch.utils.data import Dataset
import torch.nn.functional as F
def _str_to_bool(in_str):
if 't' in in_str.lower():
return True
else:
return False
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 64
_RGB_MEAN = [0.485, 0.456, 0.406]
_RGB_SD = [0.229, 0.224, 0.225]
_SEED=42
# Settings from https://arxiv.org/abs/1706.02677.
_WARMUP_EPOCHS = 5
_WEIGHT_DECAY = 0.00005
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
if _DISTRIBUTED:
import horovod.torch as hvd
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch=''
self._gpu_rank=0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch='[Epoch {}]'.format(epoch)
def process(self, msg, kwargs):
kwargs['extra'] = {
'gpurank': _get_rank(),
'epoch': self._str_epoch
}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _append_path_to(data_path, data_series):
return data_series.apply(lambda x: path.join(data_path, x))
def _load_training(data_dir):
logger = _get_logger()
logger.info('Reading training data from {}'.format(data_dir))
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
train_df.filenames))
def _load_validation(data_dir):
logger = _get_logger()
logger.info('Reading validation data from {}'.format(data_dir))
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
train_df.filenames))
def _create_data_fn(train_path, test_path):
train_df = _load_training(train_path)
validation_df = _load_validation(test_path)
# File-path
train_X = train_df['filenames'].values
validation_X = validation_df['filenames'].values
# One-hot encoded labels for torch
train_labels = train_df[['num_id']].values.ravel()
validation_labels = validation_df[['num_id']].values.ravel()
# Index starts from 0
train_labels -= 1
validation_labels -= 1
return train_X, train_labels, validation_X, validation_labels
class ImageNet(Dataset):
def __init__(self, img_locs, img_labels, transform=None):
logger = _get_logger()
self.img_locs, self.labels = img_locs, img_labels
self.transform = transform
logger.info("Loaded {} labels and {} images".format(len(np.unique(self.labels)), len(self.img_locs)))
def __getitem__(self, idx):
im_file = self.img_locs[idx]
label = self.labels[idx]
with open(im_file, 'rb') as f:
im_rgb = Image.open(f)
# Make sure 3-channel (RGB)
im_rgb = im_rgb.convert('RGB')
if self.transform is not None:
im_rgb = self.transform(im_rgb)
return im_rgb, label
def __len__(self):
return len(self.img_locs)
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches,
channels,
dim[0],
dim[1]).astype(np.float32)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
class FakeData(Dataset):
def __init__(self,
batch_size=32,
num_batches=20,
dim=(224, 224),
n_channels=3,
n_classes=10,
length=_DATA_LENGTH,
seed=42,
data_transform=None):
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.num_batches = num_batches
self._data = _create_data(batch_size, self.num_batches, self.dim, self.n_channels)
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
self.translation_index = np.random.choice(len(self._labels), length)
self._length=length
self._data_transform = data_transform
logger = _get_logger()
logger.info("Creating fake data {} labels and {} images".format(n_classes, len(self._data)))
def __getitem__(self, idx):
logger = _get_logger()
logger.debug('Retrieving samples')
logger.debug(str(idx))
tr_index_array = self.translation_index[idx]
if self._data_transform is not None:
data=self._data_transform(self._data[tr_index_array])
else:
data=self._data[tr_index_array]
return data, self._labels[tr_index_array]
def __len__(self):
return self._length
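# A minimal usage sketch (hypothetical): FakeData plugs into a standard
# DataLoader.
#   ds = FakeData(n_classes=1000, length=256, data_transform=torch.FloatTensor)
#   loader = torch.utils.data.DataLoader(ds, batch_size=32)
#   x, y = next(iter(loader))  # x: (32, 3, 224, 224) float tensor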
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def train(train_loader, model, criterion, optimizer, epoch):
logger = _get_logger()
msg = ' duration({}) loss:{} total-samples: {}'
t=Timer()
t.start()
logger.set_epoch(epoch)
for i, (data, target) in enumerate(train_loader):
data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
optimizer.zero_grad()
# compute output
output = model(data)
loss = criterion(output, target)
# compute gradient and do SGD step
loss.backward()
optimizer.step()
if i % 100 == 0:
logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
t.start()
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info('Data length: {}'.format(data_length))
logger.info('Total duration: {:.3f}'.format(duration))
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE, hvd.size()*_BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
logger.info('Num GPUs: {:.3f}'.format(hvd.size() if _DISTRIBUTED else 1))
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
def _get_sampler(dataset, is_distributed=_DISTRIBUTED):
if is_distributed:
return torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=hvd.size(), rank=hvd.rank())
else:
return torch.utils.data.sampler.RandomSampler(dataset)
def main():
logger = _get_logger()
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Runnin Distributed")
torch.manual_seed(_SEED)
# Horovod: pin GPU to local rank.
torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(_SEED)
logger.info("PyTorch version {}".format(torch.__version__))
if _FAKE:
logger.info("Setting up fake loaders")
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
else:
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'), os.getenv('AZ_BATCHAI_INPUT_TEST'))
logger.info("Setting up loaders")
train_dataset = ImageNet(
train_X,
train_y,
transforms.Compose([
transforms.RandomResizedCrop(_WIDTH),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize]))
train_sampler=_get_sampler(train_dataset)
kwargs = {'num_workers': 5, 'pin_memory': True}
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs)
# Autotune
cudnn.benchmark = True
logger.info("Loading model")
# Load symbol
model = models.__dict__['resnet50'](pretrained=False)
model.cuda()
if _DISTRIBUTED:
# Horovod: broadcast parameters.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
num_gpus= hvd.size() if _DISTRIBUTED else 1
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus,
momentum=0.9)
if _DISTRIBUTED:
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer, named_parameters=model.named_parameters())
criterion=F.cross_entropy
# Main training-loop
logger.info("Training ...")
for epoch in range(_EPOCHS):
with Timer(output=logger.info, prefix="Training") as t:
model.train()
if _DISTRIBUTED:
train_sampler.set_epoch(epoch)
train(train_loader, model, criterion, optimizer, epoch)
_log_summary(len(train_dataset), t.elapsed)
if __name__ == '__main__':
main()

View File

@@ -1,59 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
cpio \
git \
curl \
wget \
ca-certificates \
libdapl2 \
libcudnn7=$CUDNN_VERSION \
libjpeg-dev \
libpng-dev \
libmlx4-1 \
libsm6 \
libxext6 \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
# install intel MPI
RUN cd /tmp && \
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
tar zxvf l_mpi_2017.3.196.tgz && \
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
/tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
cd /tmp/l_mpi_2017.3.196 && \
./install.sh -s silent.cfg && \
cd .. && \
rm -rf l_mpi_2017.3.196* && \
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
scikit-learn
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig

View File

@@ -1,78 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
nano \
wget \
ca-certificates \
libcudnn7=$CUDNN_VERSION \
libnccl2=$NCCL_VERSION \
libnccl-dev=$NCCL_VERSION \
libjpeg-dev \
libpng-dev \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn
# Install Open MPI
RUN mkdir /tmp/openmpi && \
cd /tmp/openmpi && \
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
tar zxf openmpi-3.0.0.tar.gz && \
cd openmpi-3.0.0 && \
./configure --enable-orterun-prefix-by-default && \
make -j $(nproc) all && \
make install && \
ldconfig && \
rm -rf /tmp/openmpi
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
chmod a+x /usr/local/bin/mpirun
# Configure OpenMPI with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

View File

@@ -1,14 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=masalvar
tag:=9-1.8-.13.2 # Cuda - TF version - Horovod version
image-intel:=$(name_prefix)/horovod-intel:$(tag)
intel-path:=$(PWD)/Docker/horovod-intel
image-open:=$(name_prefix)/horovod:$(tag)
open-path:=$(PWD)/Docker/horovod
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py
include ../include/build.mk

View File

@@ -1,435 +0,0 @@
"""
Trains ResNet50 using Horovod.
It requires the following environment variables:
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import sys
from functools import lru_cache
import os
from os import path
import pandas as pd
import tensorflow as tf
from resnet_model import resnet_v1
from toolz import pipe
from timer import Timer
import numpy as np
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 64
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_BUFFER = 256
def _str_to_bool(in_str):
return 't' in in_str.lower()
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
_DATA_LENGTH = int(
os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
_VALIDATION = _str_to_bool(os.getenv('VALIDATION', 'False'))
if _DISTRIBUTED:
import horovod.tensorflow as hvd
tf_logger = logging.getLogger('tensorflow')
tf_logger.setLevel(logging.INFO)
stout = logging.StreamHandler(stream=sys.stdout)
tf_logger.addHandler(stout)
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch=''
self._gpu_rank=0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch='[Epoch {}]'.format(epoch)
def process(self, msg, kwargs):
kwargs['extra'] = {
'gpurank': _get_rank(),
'epoch': self._str_epoch
}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _load_image(filename, channels=_CHANNELS):
# The imagenet files listed in train.csv are JPEGs, so decode_jpeg rather than decode_png
return tf.to_float(tf.image.decode_jpeg(tf.read_file(filename), channels=channels))
def _resize(img, width=_WIDTH, height=_HEIGHT):
return tf.image.resize_images(img, [height, width])
def _centre(img, mean_subtraction=(_R_MEAN, _G_MEAN, _B_MEAN)):
return tf.subtract(img, list(mean_subtraction))
def _random_crop(img, width=_WIDTH, height=_HEIGHT, channels=_CHANNELS):
return tf.random_crop(img, [height, width, channels])
def _random_horizontal_flip(img):
return tf.image.random_flip_left_right(img)
def _preprocess_images(filename):
return pipe(filename,
_load_image,
_resize,
_centre)
def _preprocess_labels(label):
return tf.cast(label, dtype=tf.int32)
def _transform_to_NCHW(img):
return tf.transpose(img, [2, 0, 1]) # Transform from NHWC to NCHW
def _parse_function_train(tensor, label):
img_rgb = pipe(tensor,
_random_crop,
_random_horizontal_flip,
_transform_to_NCHW)
return img_rgb, label
def _prep(filename, label):
return tf.data.Dataset.from_tensor_slices(([_preprocess_images(filename)], [_preprocess_labels(label)]))
def _parse_function_eval(filename, label):
return pipe(filename,
_preprocess_images,
_transform_to_NCHW), _preprocess_labels(label)
def _get_optimizer(params, is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(tf.train.MomentumOptimizer(learning_rate=params["learning_rate"] * hvd.size(),
momentum=0.9))
else:
return tf.train.MomentumOptimizer(learning_rate=params["learning_rate"], momentum=0.9)
def build_network(features, mode, params):
network = resnet_v1(
resnet_depth=50,
num_classes=params['classes'],
data_format='channels_first')
return network(
inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
def model_fn(features, labels, mode, params):
"""
features: This is the x-arg from the input_fn.
labels: This is the y-arg from the input_fn,
see e.g. train_input_fn for these two.
mode: Either TRAIN, EVAL, or PREDICT
params: User-defined hyper-parameters, e.g. learning-rate.
"""
logger=_get_logger()
logger.info('Creating model in {} mode'.format(mode))
logits = build_network(features, mode, params)
if mode == tf.estimator.ModeKeys.PREDICT:
# Softmax output of the neural network.
y_pred = tf.nn.softmax(logits=logits)
# Classification output of the neural network.
y_pred_cls = tf.argmax(y_pred, axis=1)
predictions = {
'class_ids': y_pred_cls,
'probabilities': y_pred,
'logits': logits,
}
return tf.estimator.EstimatorSpec(mode=mode,
predictions=predictions)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
loss = tf.reduce_mean(cross_entropy)
if mode == tf.estimator.ModeKeys.EVAL:
# Softmax output of the neural network.
y_pred = tf.nn.softmax(logits=logits)
# Classification output of the neural network.
y_pred_cls = tf.argmax(y_pred, axis=1)
# Labels are sparse class ids (see sparse_softmax_cross_entropy above),
# so compare them to the predicted classes directly rather than taking an argmax
accuracy = tf.metrics.accuracy(labels=labels,
predictions=y_pred_cls,
name='acc_op')
metrics = {'accuracy': accuracy}
tf.summary.scalar('accuracy', accuracy[1])
return tf.estimator.EstimatorSpec(mode=mode,
eval_metric_ops=metrics,
loss=loss)
optimizer = _get_optimizer(params)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode,
loss=loss,
train_op=train_op)
def _append_path_to(data_path, data_series):
return data_series.apply(lambda x: path.join(data_path, x))
def _load_training(data_dir):
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
train_df.filenames))
def _load_validation(data_dir):
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
train_df.filenames))
def _create_data_fn(train_path, test_path):
logger = _get_logger()
logger.info('Reading training data info')
train_df = _load_training(train_path)
logger.info('Reading validation data info')
validation_df = _load_validation(test_path)
train_labels = train_df[['num_id']].values.ravel() - 1
validation_labels = validation_df[['num_id']].values.ravel() - 1
train_data = tf.data.Dataset.from_tensor_slices((train_df['filenames'].values, train_labels))
train_data_transform = tf.contrib.data.map_and_batch(_parse_function_train, _BATCHSIZE, num_parallel_batches=5)
train_data = train_data.apply(tf.contrib.data.parallel_interleave(
_prep, cycle_length=5, buffer_output_elements=1024))
train_data = (train_data.shuffle(1024)
.repeat()
.apply(train_data_transform)
.prefetch(_BUFFER))
validation_data = tf.data.Dataset.from_tensor_slices((validation_df['filenames'].values, validation_labels))
validation_data_transform = tf.contrib.data.map_and_batch(_parse_function_eval, _BATCHSIZE, num_parallel_batches=4)
validation_data = (validation_data.apply(validation_data_transform)
.prefetch(_BUFFER))
def _train_input_fn():
return train_data.make_one_shot_iterator().get_next()
def _validation_input_fn():
return validation_data.make_one_shot_iterator().get_next()
_train_input_fn.length = len(train_df)
_validation_input_fn.length = len(validation_df)
_train_input_fn.classes = 1000
_validation_input_fn.classes = 1000
return _train_input_fn, _validation_input_fn
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches,
channels,
dim[0],
dim[1]).astype(np.float32)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
def _create_fake_data_fn(train_length=_DATA_LENGTH, valid_length=50000, num_batches=40):
""" Creates fake dataset
Data is returned in NCHW since this tends to be faster on GPUs
"""
logger = _get_logger()
logger.info('Creating fake data')
data_array = _create_data(_BATCHSIZE, num_batches, (_HEIGHT, _WIDTH), _CHANNELS)
labels_array = _create_labels(_BATCHSIZE, num_batches, 1000)
def fake_data_generator():
for i in range(num_batches):
yield data_array[i * _BATCHSIZE:(i + 1) * _BATCHSIZE], labels_array[i * _BATCHSIZE:(i + 1) * _BATCHSIZE]
train_data = tf.data.Dataset.from_generator(fake_data_generator,
output_types=(tf.float32, tf.int32),
output_shapes=(tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
tf.TensorShape([None])))
train_data = (train_data.shuffle(40 * _BATCHSIZE)
.repeat()
.prefetch(_BUFFER))
validation_data = tf.data.Dataset.from_generator(fake_data_generator,
output_types=(tf.float32, tf.int32),
output_shapes=(
tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
tf.TensorShape([None])))
validation_data = (validation_data.prefetch(_BUFFER))
def _train_input_fn():
return train_data.make_one_shot_iterator().get_next()
def _validation_input_fn():
return validation_data.make_one_shot_iterator().get_next()
_train_input_fn.length = train_length
_validation_input_fn.length = valid_length
_train_input_fn.classes = 1000
_validation_input_fn.classes = 1000
return _train_input_fn, _validation_input_fn
def _get_runconfig(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
return tf.estimator.RunConfig(save_checkpoints_steps=None,
save_checkpoints_secs=None,
session_config=config)
else:
return tf.estimator.RunConfig(save_checkpoints_steps=None)
def _get_model_dir(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else os.getenv('AZ_BATCHAI_JOB_TEMP_DIR')
else:
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL')
def _get_hooks(is_distributed=_DISTRIBUTED):
logger = _get_logger()
if is_distributed:
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
logger.info('Rank: {} Cluster Size {}'.format(hvd.local_rank(), hvd.size()))
return [bcast_hook]
else:
return []
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info('Data length: {}'.format(data_length))
logger.info('Total duration: {:.3f}'.format(duration))
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE,
hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
logger.info('Num GPUs: {}'.format(hvd.size() if _DISTRIBUTED else 1))
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
def main():
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger = _get_logger()
logger.info("Runnin Distributed")
else:
logger = _get_logger()
logger.info("Tensorflow version {}".format(tf.__version__))
if _FAKE:
train_input_fn, validation_input_fn = _create_fake_data_fn()
else:
train_input_fn, validation_input_fn = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
os.getenv('AZ_BATCHAI_INPUT_TEST'))
run_config = _get_runconfig()
model_dir = _get_model_dir()
params = {"learning_rate": _LR,
"classes": train_input_fn.classes}
logger.info('Creating estimator with params: {}'.format(params))
model = tf.estimator.Estimator(model_fn=model_fn,
params=params,
model_dir=model_dir,
config=run_config)
hooks = _get_hooks()
num_gpus = hvd.size() if _DISTRIBUTED else 1
with Timer(output=logger.info, prefix="Training") as t:
logger.info('Training...')
model.train(input_fn=train_input_fn,
steps=_EPOCHS * train_input_fn.length // (_BATCHSIZE * num_gpus),
hooks=hooks)
_log_summary(_EPOCHS * train_input_fn.length, t.elapsed)
if _is_master() and _FAKE is False and _VALIDATION:
with Timer(output=logger.info, prefix="Testing"):
logger.info('Testing...')
model.evaluate(input_fn=validation_input_fn)
if __name__ == '__main__':
main()
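As a rough sketch of how this estimator script is meant to be launched inside the image above (the flags and script location are assumptions based on the Makefile, not commands taken from the repo):

```bash
# Smoke test on synthetic data across 4 local GPUs; DISTRIBUTED, FAKE and
# EPOCHS are the environment variables the script reads, and -x forwards
# each of them from the launching shell to every MPI rank
DISTRIBUTED=True FAKE=True EPOCHS=1 \
mpirun -np 4 -x DISTRIBUTED -x FAKE -x EPOCHS \
python imagenet_estimator_tf_horovod.py
```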

View file

@ -1,27 +1,19 @@
define PROJECT_HELP_MSG
Usage:
make help show this message
make build build docker image
make push push container
make run run benchmarking container
endef
export PROJECT_HELP_MSG
PWD:=$(shell pwd)
# This makefile is used to test the cookiecutter
# To use this you will need to create a .dev_env file and add the subscription_id to it
include .dev_env
image_name:=masalvar/batchai-ddl
cookiecutter:
ifdef subscription_id
cd ../ && cookiecutter AMLDistCC --no-input \
subscription_id=${subscription_id} \
resource_group=mstestdistrg \
data=/mnt/imagenet_test \
vm_size=Standard_NC24rs_v3 \
project_name=mstestdist \
image_name=mstestdist
else
@echo "You need to create a .dev_env file with subscription_id in it"
endif
help:
echo "$$PROJECT_HELP_MSG" | less
build:
docker build -t $(image_name) Docker
run:
docker run -v $(PWD):/workspace -it $(image_name) bash
push:
docker push $(image_name)
.PHONY: help build push
clean:
rm -rf ../mstestdist

View file

@ -1,3 +0,0 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7
RUN pip install --no-cache-dir h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow

View file

@ -1,11 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=iliauk
tag:=latest
image-open:=$(name_prefix)/pytorch_gloo:$(tag)
open-path:=$(PWD)/Docker
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py
include ../include/build.mk

View file

@ -1,283 +0,0 @@
import argparse
import logging
import os
from os import path
import numpy as np
import pandas as pd
import multiprocessing
from toolz import pipe
from timer import Timer
from PIL import Image
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import torch.distributed as dist
import torch.utils.data.distributed
print("PyTorch: ", torch.__version__)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Distributed training settings
parser = argparse.ArgumentParser(description='PyTorch ResNet Example')
parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='gloo', type=str, help='distributed backend')
parser.add_argument('--rank', default=-1, type=int, help='rank of the worker')
_WIDTH = 224
_HEIGHT = 224
_LR = 0.001
_EPOCHS = 1
_NUM_GPU = int(torch.cuda.device_count())
_BATCHSIZE = 64*_NUM_GPU
_RGB_MEAN = [0.485, 0.456, 0.406]
_RGB_SD = [0.229, 0.224, 0.225]
args = parser.parse_args()
def _str_to_bool(in_str):
return 't' in in_str.lower()
_FAKE = _str_to_bool(os.getenv('FAKE', 'True'))
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
#_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
_DISTRIBUTED = True
_CPU_COUNT = 8
logger.info("Distributed mode: ", _DISTRIBUTED)
logger.info("CPU Count: ", _CPU_COUNT)
def _append_path_to(data_path, data_series):
return data_series.apply(lambda x: path.join(data_path, x))
def _load_training(data_dir):
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
train_df.filenames))
def _load_validation(data_dir):
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
train_df.filenames))
def _create_data_fn(train_path, test_path):
logger.info('Reading training data info')
train_df = _load_training(train_path)
logger.info('Reading validation data info')
validation_df = _load_validation(test_path)
# File-path
train_X = train_df['filenames'].values
validation_X = validation_df['filenames'].values
# One-hot encoded labels for torch
train_labels = train_df[['num_id']].values.ravel()
validation_labels = validation_df[['num_id']].values.ravel()
# Index starts from 0
train_labels -= 1
validation_labels -= 1
return train_X, train_labels, validation_X, validation_labels
class ImageNet(Dataset):
def __init__(self, img_locs, img_labels, transform=None):
self.img_locs, self.labels = img_locs, img_labels
self.transform = transform
logger.info("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
def __getitem__(self, idx):
im_file = self.img_locs[idx]
label = self.labels[idx]
with open(im_file, 'rb') as f:
im_rgb = Image.open(f)
# Make sure 3-channel (RGB)
im_rgb = im_rgb.convert('RGB')
if self.transform is not None:
im_rgb = self.transform(im_rgb)
return im_rgb, label
def __len__(self):
return len(self.img_locs)
class FakeData(Dataset):
def __init__(self,
batch_size=32,
num_batches=20,
dim=(224, 224),
n_channels=3,
n_classes=10,
length=_DATA_LENGTH,
seed=42,
data_transform=None):
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.num_batches = num_batches
self._data = _create_data(batch_size, self.num_batches, self.dim, self.n_channels)
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
self.translation_index = np.random.choice(len(self._labels), length)
self._length=length
self._data_transform = data_transform
#logger = _get_logger()
logger.info("Creating fake data {} labels and {} images".format(n_classes, len(self._data)))
def __getitem__(self, idx):
#logger = _get_logger()
logger.debug('Retrieving samples')
logger.debug(str(idx))
tr_index_array = self.translation_index[idx]
if self._data_transform is not None:
data=self._data_transform(self._data[tr_index_array])
else:
data=self._data[tr_index_array]
return data, self._labels[tr_index_array]
def __len__(self):
return self._length
def _log_summary(data_length, duration):
#logger = _get_logger()
images_per_second = data_length / duration
logger.info('Data length: {}'.format(data_length))
logger.info('Total duration: {:.3f}'.format(duration))
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
logger.info('Batch size: (Per GPU {}: Total {})'.format(int(_BATCHSIZE/_NUM_GPU), _BATCHSIZE))
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
logger.info('Num GPUs: {}'.format(_NUM_GPU)) # May need to pass in argument to get this
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches,
channels,
dim[0],
dim[1]).astype(np.float32)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
def train(train_loader, model, criterion, optimizer, epoch):
logger.info("Training ...")
model.train()
for i, (input, target) in enumerate(train_loader):
input, target = input.cuda(non_blocking=True), target.cuda(non_blocking=True)
# compute output
output = model(input)
loss = criterion(output, target)
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
def validate(val_loader, model, criterion):
logger.info("Validating ...")
model.eval()
correct = 0
total = 0
with torch.no_grad():
for i, (input, target) in enumerate(val_loader):
target = target.cuda(non_blocking=True)
# compute output
output = model(input)
_, predicted = torch.max(output.data, 1)
total += target.size(0)
correct += (predicted == target).sum().item()
logger.info('Top-1 Accuracy: %.2f %%' % (100 * correct / total))
def main():
# Autotune
cudnn.benchmark = True
# Load symbol
model = models.__dict__['resnet50'](pretrained=False)
if _DISTRIBUTED:
logger.info('Running in distributed mode')
dist.init_process_group(
backend=args.dist_backend,
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank)
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model)
else:
model = torch.nn.DataParallel(model).cuda()
# Optimisers
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=_LR)
# Data-sets
if _FAKE:
logger.info("Setting up fake loaders")
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
else:
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
os.getenv('AZ_BATCHAI_INPUT_TEST'))
train_dataset = ImageNet(
train_X,
train_y,
transforms.Compose([
transforms.RandomResizedCrop(_WIDTH),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize]))
if _DISTRIBUTED:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
train_sampler = None
# Data-loaders
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=_BATCHSIZE, shuffle=(train_sampler is None), num_workers=_CPU_COUNT, sampler=train_sampler)
#val_loader = torch.utils.data.DataLoader(
# ImageNet(
# valid_X,
# valid_y,
# transforms.Compose([
# transforms.Resize(256),
# transforms.CenterCrop(_WIDTH),
# transforms.ToTensor(),
# normalize])), batch_size=_BATCHSIZE, shuffle=False,
# num_workers=_CPU_COUNT)
# Main training-loop
for epoch in range(_EPOCHS):
if _DISTRIBUTED:
train_sampler.set_epoch(epoch)
# Train
with Timer(output=logger.info, prefix="Training") as t:
train(train_loader, model, criterion, optimizer, epoch)
_log_summary(len(train_dataset), t.elapsed)
# Validate
#with Timer(output=logger.info, prefix="Testing"):
# validate(val_loader, model, criterion)
print("Finished")
if __name__ == '__main__':
print("Pytorch")
main()
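A minimal sketch of launching this script on two nodes with the argparse flags defined above; the rendezvous address is an illustrative assumption:

```bash
# On node 0 (the gloo backend rendezvouses at --dist-url)
python imagenet_pytorch_gloo.py --world-size 2 --rank 0 \
--dist-backend gloo --dist-url tcp://10.0.0.4:23456
# On node 1, the same command with --rank 1
python imagenet_pytorch_gloo.py --world-size 2 --rank 1 \
--dist-backend gloo --dist-url tcp://10.0.0.4:23456
```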

243
README.md
View file

@ -1,14 +1,245 @@
# Distributed Deep Learning
# Introduction
This repo contains a cookiecutter template for running distributed training of deep learning models using
Azure Machine Learning. You can create clusters with 0 nodes, which incur no cost, and scale up to hundreds of nodes. It is also possible to use low priority nodes to reduce costs even further.
This repo contains a number of examples of training a ResNet50 network with the Imagenet dataset in various Deep Learning frameworks.
The project contains the following:
#### Tensorflow Benchmark
This is a demo template that allows you to easily run [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) on Azure ML. This is a great way to test performance as well as to compare against other platforms.
#### Tensorflow Imagenet
This is another demo template that shows you how to train a ResNet50 model using Imagenet on Azure. We include scripts for processing the imagenet data, transforming it to TF Records, as well as leveraging AzCopy to quickly upload the data to the cloud.
#### Tensorflow Template
This is a blank template you can use for your own distributed training projects. It allows you to leverage all the tooling built around the previous two demos to speed up the time it takes to run your model in a distributed fashion on Azure.
[Horovod + Keras](HorovodKeras)
[Horovod + Tensorflow](HorovodTF)
[Horovod + PyTorch](HorovodPytorch)
# Prerequisites
Before you get started you need a PC running Ubuntu and the following installed:
* [Docker installed](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
* [Nvidia runtime for docker](https://github.com/NVIDIA/nvidia-container-runtime) [Required for local execution]
* [Cookiecutter installed](https://cookiecutter.readthedocs.io/en/latest/)
* [Git installed](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)
> **Note:**
> You will need to run docker without sudo; to do this run:
> ```
> sudo usermod -aG docker $USER
> newgrp docker
>```
# Setup
## Using the template
Once you have Cookiecutter installed you can either directly invoke project creation as follows:
```bash
cookiecutter gh:Microsoft/AMLDistCC
```
or clone locally and then invoke
```bash
git clone https://github.com/Microsoft/AMLDistCC.git
cookiecutter AMLDistCC
```
Cookiecutter will then ask you about a number of fields which it will use to construct your project.
If you simply want to accept the defaults, don't type anything and just press enter. Many of the fields can be left at their default values; the ones that are absolutely necessary are _highlighted_.
**project_title:** The title of your project
**project_name:** The folder in which your project will be created. Make sure it is a valid linux folder name
**resource_group:** The name of the resource group in Azure under which all the resources will be created.
It is fine if it already exists
**workspace:** The AML workspace that the project will use. If it doesn't already exist it will create it
**sub_id:** The subscription id for your project; you can look this up on the portal or run a command in the
cloud shell to get it. It isn't mandatory though: the application will give you an option to select it later.
**vm_size:** The VM type to use for distributed training
**minimum_number_nodes:** The minimum number of nodes in the cluster. Set to 0 if you want it to scale down
after use to reduce costs
**maximum_number_nodes:** The maximum number of nodes in the cluster
**cluster_name:** The name of the cluster to use. It will create it if it doesn't exist
**container_registry:** The name of your dockerhub or other account to which you may want to push your control plane docker container. If you don't have one, or don't want to push the container, simply leave the default
**type:** The type of project you want:
* all: All of them
* template: Just create a template for distributed training
* benchmark: Create project that will run the Tensorflow benchmarks
* imagenet: Create an example project that will run against the imagenet data. (You will need to download the imagenet data)
**region:** Which region to create Azure resources in
**experiment_name:** The name of the experiment
**data:** The absolute path on your computer where you will store the imagenet data. The location needs around 400GB of free space
**image_name:** The name to give the control plane docker image
**datastore_name:** Name of the datastore that will be created as part of the project
**container_name:** The name of the container in your storage account that will hold the data
Once the project is created you will still be able to change many of the above options, as they will be present in the .env file that will be created.
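As with the test Makefile earlier in this commit, the fields can also be supplied non-interactively; every value below is a placeholder you should replace:

```bash
cookiecutter gh:Microsoft/AMLDistCC --no-input \
subscription_id=<your-subscription-id> \
resource_group=mydistrg \
vm_size=Standard_NC24rs_v3 \
project_name=mydistproject \
image_name=mydistproject \
data=/mnt/imagenet
```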
## Building environment
Distributed training is complex and often has a number of moving parts. To reduce the overhead of installing packages and managing environments we use a docker container to encapsulate our environment. So once you have created the project, simply navigate to the root folder created by cookiecutter and run:
```bash
make build
```
This will build your docker container. Inside the container you will find an appropriately set up conda environment, a number of utilities such as AzCopy, and everything you will need to run your distributed training job.
Once your container is built run:
```bash
make run
```
This will put you in an environment inside your container in a tmux session (for a tutorial on tmux see [here](https://www.hamvocke.com/blog/a-quick-and-easy-guide-to-tmux/)). The tmux control key has been mapped to **ctrl+a** rather than the standard ctrl+b so as not to interfere with an outer tmux session if you are already a tmux user. You can alter this in the tmux.conf file in the Docker folder; a sketch of the remap is shown below. The docker container maps the location you launched it from to /workspace inside the container, so you can edit files in the project folder outside the container and the changes will be reflected inside it.
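The relevant lines in Docker/tmux.conf would look roughly like the following (a sketch of a standard prefix remap, not copied from the repo):

```
unbind C-b
set -g prefix C-a
bind C-a send-prefix
```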
## Imagenet data
If you have selected **all** or **imagenet** in the type question during cookiecutter invocation then you will need to have **ILSVRC2012_img_train.tar** and **ILSVRC2012_img_val.tar** present in the directory you specified as your data directory. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account), and find the page for ILSVRC2012. You will need to download the two files mentioned earlier.
## Template selection
Based on the option you selected for **type** during the cookiecutter invocation you will get all or one of the options below. Cookiecutter will create your project folder, which will contain the template folders. Once inside your project folder, make sure you have run the **make build** and **make run** commands as described in the Building environment section above. After the run command you will be greeted by a prompt; this is now your control plane. First you will need to set everything up. To do this run
```bash
inv setup
```
It will ask you to log in so follow the prompts in the terminal. If you selected **all** in the template type it will also prepare the imagenet data.
Now you will be ready to run the templates.
#### Tensorflow Benchmark
This is a demo template that allows you to easily run tf_cnn_benchmarks on Azure ML. This is a great way to test performance as well as to compare against other platforms. To use this you must select either **benchmark** or **all** when invoking cookiecutter.
Once setup is complete, simply run:
```bash
inv tf-benchmark.submit.local.synthetic
```
to run things locally on a single GPU. Note that the first time you run things you will have to build the environment.
To run things on a cluster simply run:
```bash
inv tf-benchmark.submit.remote.synthetic
```
Note that this will create the cluster if it wasn't created earlier and create the appropriate environment.
#### Tensorflow Imagenet
This is the second demo template; it will train a ResNet50 model on imagenet. It offers the options of using synthetic data, image data, or tfrecords. To use this you must select either **imagenet** or **all** when cookiecutter asks what type of project you want to create.
To run things locally using synthetic data, simply run:
```
inv tf-imagenet.submit.local.synthetic
```
To run things on a remote cluster with real data in tfrecords format simply run:
```
inv tf-imagenet.submit.remote.tfrecords
```
This only covers a small number of commands; to see the full list, simply run `inv --list`.
#### Tensorflow Experiment
This is the option to use if you want to run your own training script. It is up to you to add the appropriate training scripts and modify the tensorflow_experiment.py file to run the appropriate commands. If you want to see how to invoke things, simply look at the other examples; an example submission is shown below.
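The command below is one of the stubs listed under Options; it will need your modifications before it does anything useful:

```bash
inv tf-experiment.submit.remote.images
```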
# Architecture
Below is a diagram that shows how the project is set up.
<p align="center">
<img width="1000" src="./images/architecture1.png">
</p>
The docker container you created using **make build** is the control plane, and from there you can invoke jobs to execute either locally or in the cloud. Local execution is meant for debugging and will run on a single GPU. The mapping of data locations is handled by the control scripts. During local execution the appropriate location is mapped into the container; during remote execution the data store created during setup is mounted onto each of the VMs in the cluster.
## Project structure
The original project structure is as shown below.
```
.
├── cookiecutter.json <-- Cookiecutter json that holds all the variables for the projects
├── hooks
│ ├── post_gen_project.py
│ └── pre_gen_project.py
├── images
│ └── demo.svg
├── LICENSE
├── README.md <-- This readme
└── {{cookiecutter.project_name}}
├── _dotenv_template <-- Template that is read and translated into .env file
├── control <-- Holds all files for the control plane
│ ├── Docker <-- Contains the files used to build the control plane docker container
│ │ ├── azure_requirements.txt <-- Azure python requirements
│ │ ├── bash.completion <-- Completion script for invoke
│ │ ├── dockerfile
│ │ ├── environment.yml <-- Conda environment specification for control plane
│ │ ├── jupyter_notebook_config.py
│ │ └── tmux.conf <-- Tmux configuration
│ └── src
│ ├── aml_compute.py <-- Module that holds methods for creating cluster and submitting experiments using Azure ML
│ ├── config.py <-- Module for loading and working with .env config
│ └── logging.conf <-- Logging configuration for control plane
├── Makefile <-- Makefile to build and run control plane
├── scripts
│ ├── convert_imagenet_to_tf_records.py <-- Script for transforming imagenet data to tf records
│ ├── image.py <-- Invoke module for working with images
│ ├── imagenet_nounid_to_class.json <-- Imagenet nounid lookup
│ ├── prepare_imagenet.py <-- Script for preparing imagenet data
│ ├── storage.py <-- Invoke module for using Azure storage
│ └── tfrecords.py <-- Invoke module for working with tf records
├── tasks.py <-- Main invoke module
├── TensorFlow_benchmark <-- Template for running Tensorflow benchmarks
│ ├── environment_cpu.yml
│ ├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
│ ├── src <-- Folder where tensorflow benchmarks code will be cloned into
│   └── tensorflow_benchmark.py <-- Invoke module for running benchmarks
├── TensorFlow_experiment <-- Tensorflow distributed training template [Put your code here]
│ ├── environment_cpu.yml
│ ├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
│ ├── src
│ │ ├── logging.conf
│ │ └── train_model.py <-- Template file
│ └── tensorflow_experiment.py <-- Invoke module for running template
└── TensorFlow_imagenet
├── environment_cpu.yml
├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
├── src <-- Code for training ResNet50 model on imagenet
│ ├── data
│ │ ├── __init__.py
│ │ ├── images.py
│ │ ├── synthetic.py
│ │ └── tfrecords.py
│ ├── defaults.py
│ ├── imagenet_preprocessing.py
│ ├── logging.conf
│ ├── resnet_main.py <-- Main entry script
│ ├── resnet_model.py
│ ├── resnet_run_loop.py
│ ├── timer.py
│ └── utils.py
└── tensorflow_imagenet.py <-- Invoke module for running imagenet experiment
```
Depending on the options chosen, only certain branches will be moved over to your project.
## Options
These are the options available when using the template. They can differ depending on the type of project you choose to create. To see this list yourself simply run:
```
inv --list
```
```
delete Delete the resource group and all associated resources
experiments Prints list of experiments
interactive (i) Open IPython terminal and load in modules to work with AzureML
login Log in to Azure CLI
runs Prints information on last N runs in specified experiment
select-subscription Select Azure subscription to use
setup Setup the environment and process the imagenet data
tensorboard Runs tensorboard in a separate tmux session
storage.create-resource-group
storage.store-key Retrieves premium storage account key from Azure and stores it in .env file
storage.image.create-container Creates container based on the parameters found in the .env file
storage.image.download-data Download training and validation data from blob container specified in .env file
storage.image.download-training Download training data from blob container specified in .env file
storage.image.download-validation Download validation data from blob container specified in .env file
storage.image.prepare-imagenet Prepare imagenet data found in download_dir and push results to target_dir
storage.image.upload-data Upload training and validation data to container specified in .env file
storage.image.upload-training-data Upload training data to container specified in .env file
storage.image.upload-validation-data Upload validation data to container specified in .env file
storage.create-container Creates container based on the parameters found in the .env file
storage.create-premium-storage Creates premium storage account. By default the values are loaded from the local .env file
storage.tfrecords.upload-validation-data Upload tfrecords validation data to container specified in .env file
tf-benchmark.submit.local.synthetic Submits TensorFlow benchmark job using synthetic data for local execution
tf-benchmark.submit.remote.synthetic Submits TensorFlow benchmark job using synthetic data on remote cluster
tf-experiment.submit.local.images This command isn't implemented please modify to use.
tf-experiment.submit.local.synthetic This command isn't implemented please modify to use.
tf-experiment.submit.remote.images This command isn't implemented please modify to use.
tf-experiment.submit.remote.synthetic This command isn't implemented please modify to use.
tf-imagenet.submit.local.images Submit TensorFlow training job using real imagenet data for local execution
tf-imagenet.submit.local.synthetic Submit TensorFlow training job using synthetic imagenet data for local execution
tf-imagenet.submit.local.tfrecords Submit TensorFlow training job using real imagenet data as tfrecords for local execution
tf-imagenet.submit.remote.images Submit TensorFlow training job using real imagenet data to remote cluster
tf-imagenet.submit.remote.synthetic Submit TensorFlow training job using synthetic imagenet data to remote cluster
tf-imagenet.submit.remote.tfrecords Submit TensorFlow training job using real imagenet data as tfrecords to remote cluster
```
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff not shown because of its large size

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff not shown because of its large size

View file

@ -1,562 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from scipy.io import loadmat\n",
"from os import path\n",
"import os\n",
"from toolz import juxt, compose\n",
"import pandas as pd\n",
"from glob import iglob\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_dir = path.join(os.getenv('AZ_BATCHAI_INPUT_DATASET'), 'imagenet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = loadmat(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'meta.mat'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _index_from(synset):\n",
" return synset[0][0][0][0]\n",
"\n",
"def _wnid_from(synset):\n",
" return str(synset[0][1][0])\n",
"\n",
"def _name_from(synset):\n",
" return str(synset[0][2][0])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _extract_from(synset):\n",
" extract = juxt(_index_from, _wnid_from, _name_from)\n",
" return extract(synset)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"label_list = list(map(_extract_from, data['synsets']))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(label_list, columns=('num_index', 'wnid', 'label')).set_index('num_index')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_dir=path.join(data_dir,'train')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _extract_to_directory(wnid):\n",
" out_dir = path.join(train_dir, wnid)\n",
" tar_file = path.join(train_dir, '{}.tar'.format(wnid))\n",
" print(out_dir)\n",
" !mkdir -p $out_dir\n",
" !tar -C $out_dir -xf $tar_file"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filenames = [iglob(path.join(train_dir, wnid, '*.*')) for wnid in df.loc[1:1000]['wnid'].tolist()]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ff = list(chain(*filenames))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = pd.DataFrame({'filenames':ff})"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"index_to_wnid_dict = df.loc[1:1000]['wnid'].to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"wnid_labels = [path.split(name)[-1].split('_')[0] for name in ff]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = data_df.assign(wnid=wnid_labels)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = data_df.assign(num_id=data_df['wnid'].replace(to_replace=list(index_to_wnid_dict.values()), \n",
" value=list(index_to_wnid_dict.keys())))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"extract_wnid_dir = compose(path.basename, path.dirname)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"convert_filename = lambda x: path.join(extract_wnid_dir(x), path.basename(x))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df=data_df.assign(filenames=data_df['filenames'].apply(convert_filename))"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>wnid</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n02119789/n02119789_12009.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>n02119789/n02119789_4083.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>n02119789/n02119789_14450.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>n02119789/n02119789_11832.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>n02119789/n02119789_5459.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames wnid num_id\n",
"0 n02119789/n02119789_12009.JPEG n02119789 1\n",
"1 n02119789/n02119789_4083.JPEG n02119789 1\n",
"2 n02119789/n02119789_14450.JPEG n02119789 1\n",
"3 n02119789/n02119789_11832.JPEG n02119789 1\n",
"4 n02119789/n02119789_5459.JPEG n02119789 1"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df.to_csv(path.join(data_dir, 'train.csv'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Validation data "
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!mkdir -p {path.join(data_dir, 'validation')}"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!tar -C {path.join(data_dir, 'validation')} -xf {path.join(data_dir, 'ILSVRC2012_img_val.tar')}"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f=open(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'ILSVRC2012_validation_ground_truth.txt'))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"convert_label = compose(int, str.strip)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"labels = list(map(convert_label, f.readlines()))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"files = list(sorted(iglob(path.join(data_dir, 'validation', '*.JPEG'))))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df=pd.DataFrame({'filenames':files, 'num_id':labels})"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df=valid_df.assign(filenames=valid_df['filenames'].apply(path.basename))"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ILSVRC2012_val_00000001.JPEG</td>\n",
" <td>490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ILSVRC2012_val_00000002.JPEG</td>\n",
" <td>361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ILSVRC2012_val_00000003.JPEG</td>\n",
" <td>171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ILSVRC2012_val_00000004.JPEG</td>\n",
" <td>822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ILSVRC2012_val_00000005.JPEG</td>\n",
" <td>297</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 ILSVRC2012_val_00000001.JPEG 490\n",
"1 ILSVRC2012_val_00000002.JPEG 361\n",
"2 ILSVRC2012_val_00000003.JPEG 171\n",
"3 ILSVRC2012_val_00000004.JPEG 822\n",
"4 ILSVRC2012_val_00000005.JPEG 297"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ILSVRC2012_devkit_t12\t ILSVRC2012_img_train.tar\ttrain.csv\r\n",
"ILSVRC2012_devkit_t12.tar.gz ILSVRC2012_img_val.tar\tvalidation\r\n",
"ILSVRC2012_img_test.tar train\t\t\tvalidation.csv\r\n"
]
}
],
"source": [
"!ls {data_dir}"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df.to_csv(path.join(data_dir, 'validation.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff not shown because of its large size


File diff not shown because one or more lines are too long.

@@ -1,482 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \\\n",
" --destination /data/imagenet/train.csv\\\n",
" --source-sas \"?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=EUcahDDZcefOKtHoVWDh7voAC1BoxYNM512spFmjmDU%3D\"\\\n",
" --quiet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \\\n",
" --destination /data/imagenet/validation.csv\\\n",
" --source-sas \"?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=7x3rN7c/nlXbnZ0gAFywd5Er3r6MdwCq97Vwvda25WE%3D\"\\\n",
" --quiet"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"root_path = '/data/imagenet/'\n",
"train_path = root_path + 'train.csv'\n",
"val_path = root_path + 'validation.csv'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Train set"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1281167, 2)\n",
"1\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n02119789/n02119789_12009.JPEG</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>n02119789/n02119789_4083.JPEG</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>n02119789/n02119789_14450.JPEG</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>n02119789/n02119789_11832.JPEG</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>n02119789/n02119789_5459.JPEG</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 n02119789/n02119789_12009.JPEG 1\n",
"1 n02119789/n02119789_4083.JPEG 1\n",
"2 n02119789/n02119789_14450.JPEG 1\n",
"3 n02119789/n02119789_11832.JPEG 1\n",
"4 n02119789/n02119789_5459.JPEG 1"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = pd.read_csv(train_path, usecols=['filenames','num_id'])\n",
"print(train.shape)\n",
"print(min(train['num_id']))\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 0\n",
"1 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 0\n",
"2 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 0\n",
"3 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 0\n",
"4 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mnt_path = '/mnt/batch/tasks/shared/LS_root/mounts/imagenet/'\n",
"train['filenames'] = mnt_path + 'train/' + train['filenames']\n",
"train['num_id'] -= 1\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"train.to_csv('train_map.txt', header=False, index=False, sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Validation set"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(50000, 2)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ILSVRC2012_val_00000001.JPEG</td>\n",
" <td>490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ILSVRC2012_val_00000002.JPEG</td>\n",
" <td>361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ILSVRC2012_val_00000003.JPEG</td>\n",
" <td>171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ILSVRC2012_val_00000004.JPEG</td>\n",
" <td>822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ILSVRC2012_val_00000005.JPEG</td>\n",
" <td>297</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 ILSVRC2012_val_00000001.JPEG 490\n",
"1 ILSVRC2012_val_00000002.JPEG 361\n",
"2 ILSVRC2012_val_00000003.JPEG 171\n",
"3 ILSVRC2012_val_00000004.JPEG 822\n",
"4 ILSVRC2012_val_00000005.JPEG 297"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val = pd.read_csv(val_path, usecols=['filenames','num_id'])\n",
"print(val.shape)\n",
"val.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>360</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
" <td>296</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 489\n",
"1 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 360\n",
"2 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 170\n",
"3 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 821\n",
"4 /mnt/batch/tasks/shared/LS_root/mounts/imagene... 296"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val['filenames'] = mnt_path + 'validation/' + val['filenames']\n",
"val['num_id'] -= 1\n",
"val.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"val.to_csv('val_map.txt', header=False, index=False, sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### ImageNet mean values"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2018-05-21 14:22:19-- https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/DataSets/ImageNet/ImageNet1K_mean.xml\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2559185 (2.4M) [text/plain]\n",
"Saving to: ImageNet1K_mean.xml\n",
"\n",
"ImageNet1K_mean.xml 100%[===================>] 2.44M --.-KB/s in 0.1s \n",
"\n",
"2018-05-21 14:22:19 (18.8 MB/s) - ImageNet1K_mean.xml saved [2559185/2559185]\n",
"\n"
]
}
],
"source": [
"!wget https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/DataSets/ImageNet/ImageNet1K_mean.xml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff not shown because one or more lines are too long.

File diff not shown because one or more lines are too long.

File diff not shown because of its large size.

@@ -1,562 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from scipy.io import loadmat\n",
"from os import path\n",
"import os\n",
"from toolz import juxt, compose\n",
"import pandas as pd\n",
"from glob import iglob\n",
"from itertools import chain"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_dir = path.join(os.getenv('AZ_BATCHAI_INPUT_DATASET'), 'imagenet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = loadmat(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'meta.mat'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _index_from(synset):\n",
" return synset[0][0][0][0]\n",
"\n",
"def _wnid_from(synset):\n",
" return str(synset[0][1][0])\n",
"\n",
"def _name_from(synset):\n",
" return str(synset[0][2][0])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _extract_from(synset):\n",
" extract = juxt(_index_from, _wnid_from, _name_from)\n",
" return extract(synset)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"label_list = list(map(_extract_from, data['synsets']))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame(label_list, columns=('num_index', 'wnid', 'label')).set_index('num_index')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_dir=path.join(data_dir,'train')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def _extract_to_directory(wnid):\n",
" out_dir = path.join(train_dir, wnid)\n",
" tar_file = path.join(train_dir, '{}.tar'.format(wnid))\n",
" print(out_dir)\n",
" !mkdir -p $out_dir\n",
" !tar -C $out_dir -xf $tar_file"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filenames = [iglob(path.join(train_dir, wnid, '*.*')) for wnid in df.loc[1:1000]['wnid'].tolist()]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ff = list(chain(*filenames))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = pd.DataFrame({'filenames':ff})"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"index_to_wnid_dict = df.loc[1:1000]['wnid'].to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"wnid_labels = [path.split(name)[-1].split('_')[0] for name in ff]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = data_df.assign(wnid=wnid_labels)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df = data_df.assign(num_id=data_df['wnid'].replace(to_replace=list(index_to_wnid_dict.values()), \n",
" value=list(index_to_wnid_dict.keys())))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"extract_wnid_dir = compose(path.basename, path.dirname)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"convert_filename = lambda x: path.join(extract_wnid_dir(x), path.basename(x))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df=data_df.assign(filenames=data_df['filenames'].apply(convert_filename))"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>wnid</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>n02119789/n02119789_12009.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>n02119789/n02119789_4083.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>n02119789/n02119789_14450.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>n02119789/n02119789_11832.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>n02119789/n02119789_5459.JPEG</td>\n",
" <td>n02119789</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames wnid num_id\n",
"0 n02119789/n02119789_12009.JPEG n02119789 1\n",
"1 n02119789/n02119789_4083.JPEG n02119789 1\n",
"2 n02119789/n02119789_14450.JPEG n02119789 1\n",
"3 n02119789/n02119789_11832.JPEG n02119789 1\n",
"4 n02119789/n02119789_5459.JPEG n02119789 1"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data_df.to_csv(path.join(data_dir, 'train.csv'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Validation data "
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!mkdir -p {path.join(data_dir, 'validation')}"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!tar -C {path.join(data_dir, 'validation')} -xf {path.join(data_dir, 'ILSVRC2012_img_val.tar')}"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"f=open(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'ILSVRC2012_validation_ground_truth.txt'))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"convert_label = compose(int, str.strip)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"labels = list(map(convert_label, f.readlines()))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"files = list(sorted(iglob(path.join(data_dir, 'validation', '*.JPEG'))))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df=pd.DataFrame({'filenames':files, 'num_id':labels})"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df=valid_df.assign(filenames=valid_df['filenames'].apply(path.basename))"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>filenames</th>\n",
" <th>num_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ILSVRC2012_val_00000001.JPEG</td>\n",
" <td>490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ILSVRC2012_val_00000002.JPEG</td>\n",
" <td>361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ILSVRC2012_val_00000003.JPEG</td>\n",
" <td>171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ILSVRC2012_val_00000004.JPEG</td>\n",
" <td>822</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ILSVRC2012_val_00000005.JPEG</td>\n",
" <td>297</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" filenames num_id\n",
"0 ILSVRC2012_val_00000001.JPEG 490\n",
"1 ILSVRC2012_val_00000002.JPEG 361\n",
"2 ILSVRC2012_val_00000003.JPEG 171\n",
"3 ILSVRC2012_val_00000004.JPEG 822\n",
"4 ILSVRC2012_val_00000005.JPEG 297"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ILSVRC2012_devkit_t12\t ILSVRC2012_img_train.tar\ttrain.csv\r\n",
"ILSVRC2012_devkit_t12.tar.gz ILSVRC2012_img_val.tar\tvalidation\r\n",
"ILSVRC2012_img_test.tar train\t\t\tvalidation.csv\r\n"
]
}
],
"source": [
"!ls {data_dir}"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"valid_df.to_csv(path.join(data_dir, 'validation.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff not shown because one or more lines are too long.

@@ -1,11 +0,0 @@
{
"properties": {
"nodeSetup": {
"setupTask": {
"commandLine": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/nodeprep.sh",
"runElevated": "True",
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs"
}
}
}
}

@@ -1,35 +0,0 @@
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target docker.socket firewalld.service
Wants=network-online.target
Requires=docker.socket
[Service]
EnvironmentFile=/etc/default/docker
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exist and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd --default-shm-size 8G -g /mnt/docker/ -H fd://
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target

@@ -1,4 +0,0 @@
#!/usr/bin/env bash
sudo cp $AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/docker.service /lib/systemd/system
sudo systemctl daemon-reload
sudo systemctl restart docker

36
cookiecutter.json Normal file
@@ -0,0 +1,36 @@
{
"_project_short_description": "A set of templates for running distributed training on AML",
"project_title": "Template for Distributed Deep Learning using Azure Machine Learning ",
"project_name": "aml_dist",
"resource_group": "amldistrg",
"workspace": "workspace",
"subscription_id": "",
"vm_size": [
"Standard_NC24r",
"Standard_NC24rs_v2",
"Standard_NC24rs_v3",
"Standard_ND24rs"
],
"minimum_number_nodes": 2,
"maximum_number_nodes": 2,
"cluster_name":"gpucluster",
"container_registry": "dockerhub",
"type": [
"all",
"template",
"benchmark",
"imagenet"
],
"region": [
"eastus",
"southcentralus"
],
"experiment_name": "experiment",
"data":"/mnt/imagenet",
"image_name":"aml_dist",
"_remove_unused_projects": false,
"account_name": "distpremstorage",
"account_key": "",
"datastore_name":"datastore",
"container_name":"container"
}
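
Since this file doubles as the prompt manifest for cookiecutter, the template can also be generated non-interactively through cookiecutter's Python API. A minimal sketch, assuming a local checkout of this repository as the template source (the overridden fields are illustrative):

from cookiecutter.main import cookiecutter

# Render the template, taking defaults from cookiecutter.json and overriding a few prompts.
cookiecutter(
    ".",            # path to the template checkout (assumed to be the working directory)
    no_input=True,  # use cookiecutter.json/extra_context values instead of prompting
    extra_context={
        "project_name": "aml_dist",
        "vm_size": "Standard_NC24r",
        "type": "template",
    },
)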

@@ -1,143 +0,0 @@
include ../experiments_config.mk
CONTAINER_NAME=batch${ID}blob
EXPERIMENT:=experiment_imagenet_blob_${GPU_TYPE}
include ../../include/control.mk
define submit_keras_intel
$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras
$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras_local
$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
$(call submit_job,$(3))
endef
define submit_tf_intel
$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf
$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf_local
$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
define submit_pytorch
$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_pytorch_local
$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
upload-data: upload-training upload-validation upload-csv
@echo 'All data uploaded'
upload-training: set-storage
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train \
--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/train \
--dest-key ${azure_storage_key} --quiet --recursive --exclude-older
upload-validation: set-storage
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation \
--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/validation \
--dest-key ${azure_storage_key} --quiet --recursive
upload-csv: set-storage
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \
--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/train.csv \
--dest-key ${azure_storage_key} --quiet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \
--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/validation.csv \
--dest-key ${azure_storage_key} --quiet
create-cluster: upload-nodeprep-scripts
az batchai cluster create \
-w $(WORKSPACE) \
--name ${CLUSTER_NAME} \
--image UbuntuLTS \
--vm-size ${VM_SIZE} \
--min ${NUM_NODES} --max ${NUM_NODES} \
--afs-name ${FILE_SHARE_NAME} \
--afs-mount-path extfs \
--user-name mat \
--password dnstvxrz \
--storage-account-name $(STORAGE_ACCOUNT_NAME) \
--storage-account-key $(azure_storage_key) \
--bfs-name $(CONTAINER_NAME) \
--bfs-mount-path extcn \
--config-file ../../cluster_config/cluster.json
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 submit-tf-intel32 \
submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
submit-keras-local submit-tf-local submit-pytorch-local
submit-keras-intel32:
$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-32)
submit-keras-intel16:
$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-16)
submit-keras-intel8:
$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-8)
submit-keras-intel4:
$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-4)
submit-keras-local:
$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-local)
submit-tf-intel32:
$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-32)
submit-tf-intel16:
$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-16)
submit-tf-intel8:
$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-8)
submit-tf-intel4:
$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-4)
submit-tf-local:
$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-local)
submit-pytorch32:
$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-32)
submit-pytorch16:
$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-16)
submit-pytorch8:
$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-8)
submit-pytorch4:
$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-4)
submit-pytorch-local:
$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-local)

@@ -1,16 +0,0 @@
# Variables for Batch AI - change as necessary
ID:=iliadl3
LOCATION:=eastus
GROUP_NAME:=batch${ID}rg
STORAGE_ACCOUNT_NAME:=batch${ID}st
FILE_SHARE_NAME:=batch${ID}share
SELECTED_SUBSCRIPTION:="Team Danielle Internal"
WORKSPACE:=workspace
VM_SIZE:=Standard_NC24rs_v3
NUM_NODES:=2
CLUSTER_NAME:=ikv100
GPU_TYPE:=V100
PROCESSES_PER_NODE:=4

@@ -1,35 +0,0 @@
import json
import logging
from glob import iglob
from itertools import chain
import os
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def append_storage_type(json_data, filename):
    for record in json_data:  # each results.json holds a list of result records
        record['Storage Type'] = os.path.dirname(filename)
    return json_data
def read_json(filename):
logger.info('Reading {}...'.format(filename))
with open(filename) as f:
return json.load(f)
def write_json_to_file(json_data, filename):
with open(filename, 'w') as outfile:
json.dump(json_data, outfile)
def main(filename='all_results.json'):
    # Materialise the glob first: zipping a lazy iglob generator against the same
    # iterator that read_json is consuming would pair contents with the wrong filenames.
    files = list(iglob('**/results.json', recursive=True))
    json_data = (read_json(f) for f in files)
    augmented_json_data = (append_storage_type(j, f) for j, f in zip(json_data, files))
    write_json_to_file(list(chain.from_iterable(augmented_json_data)), filename)
logger.info('All results written to {}'.format(filename))
if __name__ == "__main__":
main()

@@ -1,343 +0,0 @@
import argparse
import json
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
#
# Config for Intel
cmd_for_intel = \
"""source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh;
echo $AZ_BATCH_HOST_LIST;
mpirun -n {total_processes} -ppn {processes_per_node} {hosts}
-env I_MPI_FABRICS=dapl
-env I_MPI_DAPL_PROVIDER=ofa-v2-ib0
-env I_MPI_DYNAMIC_CONNECTION=0
-env I_MPI_DEBUG=6
-env I_MPI_HYDRA_DEBUG=on
-env DISTRIBUTED=True
{fake}
{fake_length}
python -u {script}""".replace('\n', '')
# Config for OpenMPI
cmd_for_openmpi = \
"""echo $AZ_BATCH_HOST_LIST;
cat $AZ_BATCHAI_MPI_HOST_FILE;
mpirun -np {total_processes} {hosts}
-bind-to none -map-by slot
-x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH
-mca btl_tcp_if_include eth0
-x NCCL_SOCKET_IFNAME=eth0
-mca btl ^openib
-x NCCL_IB_DISABLE=1
-x DISTRIBUTED=True
-x AZ_BATCHAI_INPUT_TRAIN
-x AZ_BATCHAI_INPUT_TEST
{fake}
{fake_length}
--allow-run-as-root
python -u {script}""".replace('\n', '')
# Running on single node without mpi
cmd_local = """{fake} {fake_length} python -u {script}""".replace('\n', '')
cmd_choice_dict = {
'openmpi': cmd_for_openmpi,
'intelmpi': cmd_for_intel,
'local': cmd_local
}
hosts_param = {
'openmpi': '--hostfile $AZ_BATCHAI_MPI_HOST_FILE ',
'intelmpi': '-hosts $AZ_BATCH_HOST_LIST ',
'local': ''
}
fake_param = {
'openmpi': '-x FAKE=True ',
'intelmpi': '-env FAKE=True ',
'local': ' FAKE=True '
}
fake_length_param = {
'openmpi': '-x FAKE_DATA_LENGTH={} ',
'intelmpi': '-env FAKE_DATA_LENGTH={} ',
'local': ' FAKE_DATA_LENGTH={} '
}
def _hosts_for(mpitype, node_count):
if node_count > 1:
return hosts_param.get(mpitype, '')
else:
return hosts_param.get('local')
def _fake_for(mpitype, data):
if data is None:
return fake_param.get(mpitype, '')
else:
return ''
def _fake_length_for(mpitype, fake_length, data):
if data is None:
return fake_length_param.get(mpitype, '').format(fake_length)
else:
return ''
def _prepare_command(mpitype, total_processes, processes_per_node, script, node_count, data=None,
synthetic_length=1281167):
command = cmd_choice_dict.get(mpitype, cmd_for_intel)
return command.format(total_processes=total_processes,
processes_per_node=processes_per_node,
script=script,
hosts=_hosts_for(mpitype, node_count),
fake=_fake_for(mpitype, data),
fake_length=_fake_length_for(mpitype, synthetic_length, data))
def append_data_paths(job_template_dict, data_path):
job_template_dict['properties']['inputDirectories'].extend([{
"id": "TRAIN",
"path": data_path,
},
{
"id": "TEST",
"path": data_path,
}])
return job_template_dict
def generate_job_dict(image_name,
command,
node_count=2):
return {
"$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json",
"properties": {
"nodeCount": node_count,
"customToolkitSettings": {
"commandLine": command
},
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"inputDirectories": [{
"id": "SCRIPTS",
"path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
},
],
"outputDirectories": [{
"id": "MODEL",
"pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"pathSuffix": "Models"
}],
"containerSettings": {
"imageSourceRegistry": {
"image": image_name
}
}
}
}
def generate_job_dict_gloo(image_name,
script,
node_count=2):
    # Command is hard-coded for the time being.
    # Not sure what world-size should be; probably node_count, but check.
return {
"$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2018-05-01/job.json",
"properties": {
"pyTorchSettings": {
"pythonScriptFilePath": script,
"commandLineArgs": "--world-size 2 --dist-backend $AZ_BATCHAI_PYTORCH_BACKEND --dist-url $AZ_BATCHAI_PYTORCH_INIT_METHOD --rank $AZ_BATCHAI_TASK_INDEX",
"communicationBackend": "gloo"
},
"nodeCount": node_count,
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"inputDirectories": [{
"id": "SCRIPTS",
"path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
},
{
"id": "TRAIN",
"path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
},
{
"id": "TEST",
"path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
},
],
"outputDirectories": [{
"id": "MODEL",
"pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"pathSuffix": "Models"
}],
"containerSettings": {
"imageSourceRegistry": {
"image": image_name
}
}
}
}
def generate_job_dict_cntk(image_name,
node_count=2,
processes_per_node=4,
env_var=[]):
return {
"$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2018-03-01/job.json",
"properties": {
"nodeCount": node_count,
"cntkSettings": {
"pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py",
"processCount": processes_per_node
},
"environmentVariables": env_var,
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"inputDirectories": [{
"id": "SCRIPTS",
"path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
},
{
"id": "TRAIN",
"path": "$AZ_BATCHAI_MOUNT_ROOT/imagenet",
},
{
"id": "TEST",
"path": "$AZ_BATCHAI_MOUNT_ROOT/imagenet",
},
],
"outputDirectories": [{
"id": "MODEL",
"pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
"pathSuffix": "Models"
}],
"containerSettings": {
"imageSourceRegistry": {
"image": image_name
}
}
}
}
def write_json_to_file(json_dict, filename, mode='w'):
with open(filename, mode) as outfile:
json.dump(json_dict, outfile, indent=4, sort_keys=True)
outfile.write('\n\n')
def synthetic_data_job(image_name,
mpitype,
script,
filename='job.json',
node_count=2,
total_processes=None,
processes_per_node=4,
synthetic_length=1281167,
framework='horovod'):
    logger.info('Creating manifest {} for synthetic-data job with image {}...'.format(
        filename, image_name))
total_processes = processes_per_node * \
node_count if total_processes is None else total_processes
if framework == 'gloo':
job_template = generate_job_dict_gloo(image_name,
script,
node_count=node_count)
elif framework == 'cntk':
env_var = [{"name": "DISTRIBUTED", "value": "True"},
{"name": "FAKE", "value": "True"},
{"name": "FAKE_DATA_LENGTH", "value": str(synthetic_length)}]
job_template = generate_job_dict_cntk(image_name,
node_count,
processes_per_node,
env_var)
elif framework == 'horovod':
command = _prepare_command(mpitype,
total_processes,
processes_per_node,
script,
node_count,
synthetic_length=synthetic_length)
job_template = generate_job_dict(image_name,
command,
node_count=node_count)
else:
raise ValueError("Wrong framework argument {}".format(framework))
write_json_to_file(job_template, filename)
logger.info('Done')
def imagenet_data_job(image_name,
mpitype,
script,
data_path,
filename='job.json',
node_count=2,
total_processes=None,
processes_per_node=4):
    logger.info('Creating manifest {} for real-data job with image {}...'.format(
        filename, image_name))
total_processes = processes_per_node * \
node_count if total_processes is None else total_processes
    # TODO: add support for non-synthetic gloo jobs
command = _prepare_command(mpitype,
total_processes,
processes_per_node,
script,
node_count,
data=data_path)
job_template = generate_job_dict(image_name,
command,
node_count=node_count)
job_template = append_data_paths(job_template, data_path)
write_json_to_file(job_template, filename)
logger.info('Done')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Generate manifest')
parser.add_argument('docker_image', type=str,
help='docker image to use')
parser.add_argument('mpi', type=str,
                        help='mpi to use; must be installed in the docker image provided. Options: [intelmpi, openmpi, local]')
parser.add_argument('script', type=str,
help='script to run')
parser.add_argument('--filename', '-f', dest='filename', type=str, nargs='?',
default='job.json',
help='name of the file to save job spec to')
parser.add_argument('--node_count', '-n', dest='node_count', type=int, nargs='?',
default=1, help='the number of nodes to run the job across')
parser.add_argument('--ppn', dest='processes_per_node', type=int, nargs='?',
default=4,
                        help='number of GPU processes to run per node')
parser.add_argument('--data', dest='data', type=str, nargs='?',
default=None,
help='the path where the imagenet data is stored')
    parser.add_argument('--synthetic_length', '-l', dest='synthetic_length', type=int, nargs='?',
default=1281167,
help='the length of the fake data [default=size of imagenet 1281167]')
parser.add_argument('--framework', '-fw', type=str, nargs='?', default='horovod',
help='the framework used to generate the configuration, options:[horovod, gloo, cntk]')
args = parser.parse_args()
if args.data is None:
synthetic_data_job(args.docker_image,
args.mpi,
args.script,
filename=args.filename,
node_count=args.node_count,
processes_per_node=args.processes_per_node,
synthetic_length=args.synthetic_length,
framework=args.framework)
else:
imagenet_data_job(args.docker_image,
args.mpi,
args.script,
args.data,
filename=args.filename,
node_count=args.node_count,
processes_per_node=args.processes_per_node)
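
For a rough sense of what this generator assembles (the script's file name is hidden in this diff, so the call below is a hedged sketch, not a documented entry point), a two-node OpenMPI Horovod job could be produced either via the CLI above or programmatically:

# Values mirror the Makefile targets elsewhere in this commit (2 nodes x 4 processes per node).
synthetic_data_job("masalvar/horovod:9-1.8-.13.2",
                   "openmpi",
                   "$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py",
                   filename="job.json",
                   node_count=2,
                   processes_per_node=4)
# The resulting job.json carries a customToolkitSettings.commandLine built from
# cmd_for_openmpi, with FAKE=True set because no --data path was supplied.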

@@ -1,137 +0,0 @@
include ../experiments_config.mk
NFS_NAME=batch${ID}nfs
EXPERIMENT:=experiment_imagenet_local_${GPU_TYPE}
NFS_IP:=""
include ../../include/control.mk
define submit_keras_intel
$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras
$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras_local
$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
$(call submit_job,$(3))
endef
define submit_tf_intel
$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf
$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf_local
$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
define submit_pytorch
$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_pytorch_intel
$(call generate_job_intel,masalvar/horovod-intel-pytorch:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_pytorch_local
$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
define upload-data-nfs
scp nodeprep.sh mat@$(1):~/
ssh mat@$(1) sudo chmod 777 ~/nodeprep.sh
ssh mat@$(1) ./nodeprep.sh
endef
upload-nodeprep-scripts: set-storage
$(call upload_script, ../../cluster_config/docker.service)
$(call upload_script, ../../cluster_config/nodeprep.sh)
upload-download-script: set-storage
$(call upload_script, downloaddata.sh)
create-cluster: upload-nodeprep-scripts upload-download-script
az batchai cluster create \
-w $(WORKSPACE) \
--name ${CLUSTER_NAME} \
--image UbuntuLTS \
--vm-size ${VM_SIZE} \
--min ${NUM_NODES} --max ${NUM_NODES} \
--afs-name ${FILE_SHARE_NAME} \
--afs-mount-path extfs \
--user-name mat \
--password dnstvxrz \
--storage-account-name $(STORAGE_ACCOUNT_NAME) \
--storage-account-key $(azure_storage_key) \
--nfs ${NFS_NAME} \
--nfs-mount-path nfs \
--config-file ../../cluster_config/cluster.json
upload-data-nfs:
$(call upload-data-nfs,$(NFS_IP))
submit-keras-intel32:
$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-32)
submit-keras-intel16:
$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-16)
submit-keras-intel8:
$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-8)
submit-keras-intel4:
$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-4)
submit-keras-local:
$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-local)
submit-tf-intel32:
$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-32)
submit-tf-intel16:
$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-16)
submit-tf-intel8:
$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-8)
submit-tf-intel4:
$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-4)
submit-tf-local:
$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-local)
submit-pytorch32:
$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-32)
submit-pytorch16:
$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-16)
submit-pytorch8:
$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-8)
submit-pytorch4:
$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-4)
submit-pytorch-intel4:
$(call submit_pytorch_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-intel-4)
submit-pytorch-local:
$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-local)

@@ -1,10 +0,0 @@
#!/usr/bin/env bash
# Download data
mkdir -p $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/train.tar.gz $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/validation.tar.gz $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/train.csv $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/validation.csv $AZ_BATCHAI_MOUNT_ROOT/imagenet
cd $AZ_BATCHAI_MOUNT_ROOT/imagenet
tar -xzf train.tar.gz
tar -xzf validation.tar.gz

@@ -1,125 +0,0 @@
include ../experiments_config.mk
NFS_NAME=batch${ID}nfs
EXPERIMENT:=experiment_imagenet_${GPU_TYPE}
NFS_IP:=""
include ../../include/control.mk
define submit_keras_intel
$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras
$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_keras_local
$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
$(call submit_job,$(3))
endef
define submit_tf_intel
$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf
$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_tf_local
$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
define submit_pytorch
$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
$(call submit_job,$(4))
endef
define submit_pytorch_local
$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
$(call submit_job, $(3))
endef
define upload-data-nfs
scp nodeprep.sh mat@$(1):~/
ssh mat@$(1) sudo chmod 777 ~/nodeprep.sh
ssh mat@$(1) ./nodeprep.sh
endef
create-cluster: upload-nodeprep-scripts
az batchai cluster create \
-w $(WORKSPACE) \
--name ${CLUSTER_NAME} \
--image UbuntuLTS \
--vm-size ${VM_SIZE} \
--min ${NUM_NODES} --max ${NUM_NODES} \
--afs-name ${FILE_SHARE_NAME} \
--afs-mount-path extfs \
--user-name mat \
--password dnstvxrz \
--storage-account-name $(STORAGE_ACCOUNT_NAME) \
--storage-account-key $(azure_storage_key) \
--nfs ${NFS_NAME} \
--nfs-mount-path nfs \
--config-file ../../cluster_config/cluster.json
upload-data-nfs:
$(call upload-data-nfs,$(NFS_IP))
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 submit-tf-intel32 \
submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
submit-keras-local submit-tf-local submit-pytorch-local
submit-keras-intel32:
$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-32)
submit-keras-intel16:
$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-16)
submit-keras-intel8:
$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-8)
submit-keras-intel4:
$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-4)
submit-keras-local:
$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-local)
submit-tf-intel32:
$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-32)
submit-tf-intel16:
$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-16)
submit-tf-intel8:
$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-8)
submit-tf-intel4:
$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-4)
submit-tf-local:
$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-local)
submit-pytorch32:
$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-32)
submit-pytorch16:
$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-16)
submit-pytorch8:
$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-8)
submit-pytorch4:
$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-4)
submit-pytorch-local:
$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-local)

@@ -1,29 +0,0 @@
#!/usr/bin/env bash
wget https://gist.githubusercontent.com/msalvaris/073c28a9993d58498957294d20d74202/raw/916eefe763c71da49d8ed41cb8474bdc8021af33/install_azcopy
chmod 777 install_azcopy
sudo ./install_azcopy
mkdir -p /data/imagenet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \
--destination /data/imagenet/validation.csv\
--source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=7x3rN7c/nlXbnZ0gAFywd5Er3r6MdwCq97Vwvda25WE%3D"\
--quiet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.tar.gz \
--destination /data/imagenet/validation.tar.gz\
--source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=zy8L4shZa3XXBe152hPnhXsyfBqCufDOz01a9ZHWU28%3D"\
--quiet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \
--destination /data/imagenet/train.csv\
--source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=EUcahDDZcefOKtHoVWDh7voAC1BoxYNM512spFmjmDU%3D"\
--quiet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.tar.gz \
--destination /data/imagenet/train.tar.gz\
--source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=qP%2B7lQuFKHo5UhQKpHcKt6p5fHT21lPaLz1O/vv4FNU%3D"\
--quiet
cd /data/imagenet
tar -xvzf train.tar.gz
tar -xzvf validation.tar.gz

@@ -1,106 +0,0 @@
from glob import glob
import numpy as np
import json
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_mpi_type(file):
return file.split('_')[-1].split('.')[0]
def extract_gpu_type(file):
return file.split('_')[-2]
def extract_framework(file):
return file.split('_')[0]
def extract_images_per_second(data):
def _extract(line_string):
if 'Total images/sec: ' in line_string:
return float(line_string.split(':')[-1].strip())
return np.array(list(
filter(None,
map(_extract, data))
)).mean()
def extract_total_duration(data):
def _extract(line_string):
if 'Total duration: ' in line_string:
return float(line_string.split(':')[-1].strip())
return np.array(list(
filter(None,
map(_extract, data))
)).mean()
def extract_data_length(data):
for line in data:
if 'Data length: ' in line:
return int(line.split(':')[-1].strip())
def extract_batch_size(data):
for line in data:
if 'Batch size: ' in line:
return int(line.split(':')[-1].strip().split(' ')[-1].strip(')'))
def extract_dataset(data):
for line in data:
if 'Dataset: ' in line:
return line.split(':')[-1].strip()
def extract_num_devices(data):
for line in data:
if 'Num GPUs: ' in line:
return int(float(line.split(': ')[-1].strip()))
extraction_funcs = {
'Images/Second': extract_images_per_second,
'Batch Size': extract_batch_size,
    'Data Length': extract_data_length,
    'Total Duration': extract_total_duration,
    'Dataset': extract_dataset,
'GPUs': extract_num_devices,
}
def parse_results(file):
logger.info('Processing {}'.format(file))
with open(file) as f:
data = f.readlines()
results_dict = {key: func(data) for key, func in extraction_funcs.items()}
results_dict['MPI'] = extract_mpi_type(file)
results_dict['GPU Type'] = extract_gpu_type(file)
results_dict['Framework'] = extract_framework(file)
return results_dict
def write_json_to_file(json_dict, filename):
""" Simple function to write JSON dictionaries to files
"""
with open(filename, 'w') as outfile:
json.dump(json_dict, outfile)
def main(path='*.results', filename='results.json'):
logger.info('Reading files from {} and writing to {}'.format(path, filename))
    files = glob(path)
logger.info('Found {} files'.format(len(files)))
results = [parse_results(file) for file in files]
logger.info('Writing results to {}'.format(filename))
write_json_to_file(results, filename)
if __name__ == '__main__':
main()
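
Note that the filename helpers above assume results files named <framework>_<gpu>_<mpi>.results, matching the experiment targets elsewhere in this commit. A quick check under that assumption:

# e.g. a file produced by a TF + Intel MPI run on V100s:
print(extract_framework('tf_V100_intelmpi.results'))  # -> 'tf'
print(extract_gpu_type('tf_V100_intelmpi.results'))   # -> 'V100'
print(extract_mpi_type('tf_V100_intelmpi.results'))   # -> 'intelmpi'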

@@ -1,161 +0,0 @@
include ../experiments_config.mk
FAKE_DATA_LENGTH:=1281167
EXPERIMENT:=experiment_synthetic_${GPU_TYPE}
include ../../include/control.mk
define submit_keras_intel
$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH} )
$(call submit_job, $(3))
endef
define submit_keras
$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_keras_local
$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(2))
endef
define submit_tf_intel
$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_tf
$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_tf_local
$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(2))
endef
define submit_pytorch
$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_pytorch_local
$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(2))
endef
define submit_pytorch_gloo
$(call generate_job_gloo,iliauk/pytorch_gloo,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_cntk
$(call generate_job_cntk,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(3))
endef
define submit_cntk_local
$(call generate_job_local,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
$(call submit_job, $(2))
endef
create-cluster: upload-nodeprep-scripts
az batchai cluster create \
-w $(WORKSPACE) \
--name ${CLUSTER_NAME} \
--image UbuntuLTS \
--vm-size ${VM_SIZE} \
--min ${NUM_NODES} --max ${NUM_NODES} \
--afs-name ${FILE_SHARE_NAME} \
--afs-mount-path extfs \
--user-name mat \
--password dnstvxrz \
--storage-account-name $(STORAGE_ACCOUNT_NAME) \
--storage-account-key $(azure_storage_key) \
--config-file ../../cluster_config/cluster.json
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
submit-keras-intel32:
$(call submit_keras_intel,$(NUM_NODES),$(PROCESSES_PER_NODE),keras-intel-32)
submit-keras-intel16:
$(call submit_keras_intel,4,$(PROCESSES_PER_NODE),keras-intel-16)
submit-keras-intel8:
$(call submit_keras_intel,2,$(PROCESSES_PER_NODE),keras-intel-8)
submit-keras-intel4:
$(call submit_keras_intel,1,$(PROCESSES_PER_NODE),keras-intel-4)
submit-keras-local:
$(call submit_keras_local,1,keras-local)
submit-tf-intel32:
$(call submit_tf_intel,$(NUM_NODES),$(PROCESSES_PER_NODE),tf-intel-32)
submit-tf-intel16:
$(call submit_tf_intel,4,$(PROCESSES_PER_NODE),tf-intel-16)
submit-tf-intel8:
$(call submit_tf_intel,2,$(PROCESSES_PER_NODE),tf-intel-8)
submit-tf-intel4:
$(call submit_tf_intel,1,$(PROCESSES_PER_NODE),tf-intel-4)
submit-tf-local:
$(call submit_tf_local,1,tf-local)
submit-pytorch32:
$(call submit_pytorch,8,$(PROCESSES_PER_NODE),pytorch-32)
submit-pytorch16:
$(call submit_pytorch,4,$(PROCESSES_PER_NODE),pytorch-16)
submit-pytorch8:
$(call submit_pytorch,2,$(PROCESSES_PER_NODE),pytorch-8)
submit-pytorch4:
$(call submit_pytorch,1,$(PROCESSES_PER_NODE),pytorch-4)
submit-pytorch-local:
$(call submit_pytorch_local,1,pytorch-local)
submit-cntk32:
$(call submit_cntk,8,$(PROCESSES_PER_NODE),cntk-32)
submit-cntk16:
$(call submit_cntk,4,$(PROCESSES_PER_NODE),cntk-16)
submit-cntk8:
$(call submit_cntk,2,$(PROCESSES_PER_NODE),cntk-8)
submit-cntk4:
$(call submit_cntk,1,$(PROCESSES_PER_NODE),cntk-4)
submit-cntk-local:
$(call submit_cntk_local,1,cntk-local)
submit-pytorch_gloo32:
$(call submit_pytorch_gloo,8,$(PROCESSES_PER_NODE),pytorch_gloo-32)
submit-pytorch_gloo16:
$(call submit_pytorch_gloo,4,$(PROCESSES_PER_NODE),pytorch_gloo-16)
submit-pytorch_gloo8:
$(call submit_pytorch_gloo,2,$(PROCESSES_PER_NODE),pytorch_gloo-8)
submit-pytorch_gloo4:
$(call submit_pytorch_gloo,1,$(PROCESSES_PER_NODE),pytorch_gloo-4)

29
hooks/post_gen_project.py Normal file
View file

@ -0,0 +1,29 @@
import shutil
def _copy_directories(src, dst):
try:
shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".git"))
except PermissionError:
print(f"Could not copy files from {src} to {dst}, permission error")
def _remove_directories(*directories):
for folder in directories:
shutil.rmtree(folder)
def _copy_env_file():
shutil.move("_dotenv_template", ".env")
_CHOICES_DICT = {
"template": ("TensorFlow_benchmark", "TensorFlow_imagenet"),
"benchmark": ("TensorFlow_experiment", "TensorFlow_imagenet"),
"imagenet": ("TensorFlow_benchmark", "TensorFlow_experiment")
}
if __name__ == "__main__":
_copy_env_file()
if {{cookiecutter._remove_unused_projects}}:
_remove_directories(_CHOICES_DICT.get("{{cookiecutter.type}}", tuple()))
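For illustration, a minimal sketch of what this hook amounts to after cookiecutter renders it, assuming the user selected type "benchmark" (the tuple comes from _CHOICES_DICT above; the hook first moves _dotenv_template to .env):

import shutil

# Rendered hook for type == "benchmark": remove the two unused projects.
shutil.move("_dotenv_template", ".env")
for folder in ("TensorFlow_experiment", "TensorFlow_imagenet"):
    shutil.rmtree(folder)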

23
hooks/pre_gen_project.py Normal file
View file

@ -0,0 +1,23 @@
import os
import shutil
def _remove_directory(dirpath):
if os.path.exists(dirpath):
try:
print(f"Deleting directory {dirpath}")
shutil.rmtree(dirpath)
except PermissionError:
print(
f"The directory contains files that can't be removed please delete {dirpath} and run again"
)
_remove_directory("{{cookiecutter.experiment_name}}")
print(
"""
Generating project {{cookiecutter.project_name}}
"""
)

images/architecture1.png Normal file (binary data, not displayed; 240 KiB)
images/demo.svg Normal file (diff hidden because one or more lines are too long; 57 KiB)

View file

@ -1,108 +0,0 @@
define PROJECT_HELP_MSG
Usage:
make help show this message
make build make Horovod TF image with Open MPI
make build-intel make Horovod TF image with Intel MPI
make run-mpi run training using Open MPI image
make run-mpi-intel run training using Intel MPI image
make run run training in non-distributed mode
make push push Horovod TF image with Open MPI
make push-intel push Horovod TF image with Intel MPI
endef
export PROJECT_HELP_MSG
DATA_DIR:=/mnt/imagenet
#DATA_DIR:=/mnt/rmdsk
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
ROOT:=$(shell dirname ${PWD})
setup_volumes:=-v $(PWD)/src:/mnt/script \
-v $(DATA_DIR):/mnt/input \
-v $(DATA_DIR)/temp/model:/mnt/model \
-v $(DATA_DIR)/temp/output:/mnt/output \
-v $(ROOT)/common:/mnt/common
setup_environment:=--env AZ_BATCHAI_INPUT_TRAIN='/mnt/input' \
--env AZ_BATCHAI_INPUT_TEST='/mnt/input' \
--env AZ_BATCHAI_OUTPUT_MODEL='/mnt/model' \
--env AZ_BATCHAI_JOB_TEMP_DIR='/mnt/output' \
--env AZ_BATCHAI_INPUT_SCRIPTS='/mnt/script' \
--env PYTHONPATH=/mnt/common/:$$PYTHONPATH
define execute_mpi
nvidia-docker run -it \
--shm-size="8g" \
$(setup_volumes) \
$(setup_environment) \
--env DISTRIBUTED='True' \
--env FAKE=$(FAKE) \
--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
--privileged \
$(1) bash -c "mpirun -np 2 -H localhost:2 python $(2)"
endef
define execute_mpi_intel
nvidia-docker run -it \
--shm-size="8g" \
$(setup_volumes) \
$(setup_environment) \
--env DISTRIBUTED='True' \
--env FAKE=$(FAKE) \
--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
--privileged \
$(1) bash -c " source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; mpirun -n 2 -host localhost -ppn 2 -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0 python $(2)"
endef
define execute
nvidia-docker run -it \
--shm-size="8g" \
$(setup_volumes) \
$(setup_environment) \
--env DISTRIBUTED='False' \
--env FAKE=$(FAKE) \
--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
$(1) bash -c "python $(2)"
endef
define execute_jupyter
nvidia-docker run -p 8888:8888 -it \
--shm-size="8g" \
$(setup_volumes) \
$(setup_environment) \
$(1) bash -c "jupyter notebook --ip=* --no-browser --allow-root"
endef
help:
echo "$$PROJECT_HELP_MSG" | less
build:
docker build -t $(image-open) $(open-path)
build-intel:
docker build -t $(image-intel) $(intel-path)
run-mpi:
$(call execute_mpi, $(image-open), $(script))
run-mpi-intel:
$(call execute_mpi_intel, $(image-intel), $(script))
run:
$(call execute, $(image-open), $(script))
run-jupyter:
$(call execute_jupyter, $(image-open))
push:
docker push $(image-open)
push-intel:
docker push $(image-intel)
.PHONY: help build push

View file

@ -1,341 +0,0 @@
define PROJECT_HELP_MSG
Usage:
make help show this message
make build build docker image
make push push container
make run run benchmarking container
make setup setup the cluster
make show-cluster
make list-clusters
make run-bait-intel run batch ai benchmark using intel mpi
make run-bait-openmpi run batch ai benchmark using open mpi
make run-bait-local run batch ai benchmark on one node
make list-jobs
make list-files
make stream-stdout
make stream-stderr
make delete-job
make delete-cluster
make delete delete everything including experiments, workspace and resource group
endef
export PROJECT_HELP_MSG
define generate_job_intel
python ../generate_job_spec.py $(1) intelmpi \
$(2) \
--filename job.json \
--node_count $(3) \
--ppn $(4) \
$(5)
endef
define generate_job_openmpi
python ../generate_job_spec.py $(1) openmpi \
$(2) \
--filename job.json \
--node_count $(3) \
--ppn $(4) \
$(5)
endef
define generate_job_local
python ../generate_job_spec.py $(1) local \
$(2) \
--filename job.json \
--node_count 1 \
--ppn $(3) \
$(4)
endef
define generate_job_gloo
python ../generate_job_spec.py $(1) openmpi \
$(2) \
--filename job.json \
--node_count $(3) \
--ppn $(4) \
$(5) \
--framework gloo
endef
define generate_job_cntk
python ../generate_job_spec.py $(1) openmpi \
$(2) \
--filename job.json \
--node_count $(3) \
--ppn $(4) \
$(5) \
--framework cntk
endef
define stream_stdout
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) \
--j $(1) --output-directory-id stdouterr -f stdout.txt
endef
define submit_job
az batchai job create -n $(1) --cluster ${CLUSTER_NAME} -w $(WORKSPACE) -e $(EXPERIMENT) -f job.json
endef
define delete_job
az batchai job delete -w $(WORKSPACE) -e $(EXPERIMENT) --name $(1) -y
endef
define upload_script
az storage file upload --share-name ${FILE_SHARE_NAME} --source $(1) --path scripts --account-name $(azure_storage_account) --account-key $(azure_storage_key)
endef
select-subscription:
az login -o table
az account set --subscription $(SELECTED_SUBSCRIPTION)
create-resource-group:
az group create -n $(GROUP_NAME) -l $(LOCATION) -o table
create-storage:
@echo "Creating storage account"
az storage account create -l $(LOCATION) -n $(STORAGE_ACCOUNT_NAME) -g $(GROUP_NAME) --sku Standard_LRS
set-storage:
$(eval azure_storage_key:=$(shell az storage account keys list -n $(STORAGE_ACCOUNT_NAME) -g $(GROUP_NAME) | jq '.[0]["value"]'))
$(eval azure_storage_account:= $(STORAGE_ACCOUNT_NAME))
$(eval file_share_name:= $(FILE_SHARE_NAME))
set-az-defaults:
az configure --defaults location=${LOCATION}
az configure --defaults group=${GROUP_NAME}
create-fileshare: set-storage
@echo "Creating fileshare"
az storage share create -n $(file_share_name) --account-name $(azure_storage_account) --account-key $(azure_storage_key)
create-directory: set-storage
az storage directory create --share-name $(file_share_name) --name scripts --account-name $(azure_storage_account) --account-key $(azure_storage_key)
create-nfs:
az batchai file-server create -n $(NFS_NAME) -w ${WORKSPACE} --disk-count 4 --disk-size 250 -s Standard_DS4_v2 -u mat -p d13NHAL! -g ${GROUP_NAME} --storage-sku Premium_LRS
list-nfs:
az batchai file-server list -o table -w ${WORKSPACE} -g ${GROUP_NAME}
create-container: set-storage
az storage container create --account-name $(azure_storage_account) --account-key $(azure_storage_key) --name ${CONTAINER_NAME}
upload-scripts: set-storage
$(call upload_script, ../../HorovodKeras/src/data_generator.py)
$(call upload_script, ../../HorovodKeras/src/imagenet_keras_horovod.py)
$(call upload_script, ../../HorovodTF/src/imagenet_estimator_tf_horovod.py)
$(call upload_script, ../../HorovodTF/src/resnet_model.py)
$(call upload_script, ../../HorovodPytorch/src/imagenet_pytorch_horovod.py)
$(call upload_script, ../../CNTK/src/imagenet_cntk.py)
$(call upload_script, ../../CNTK/src/resnet_models.py)
$(call upload_script, ../../Pytorch/src/imagenet_pytorch_gloo.py)
$(call upload_script, ../../common/timer.py)
upload-nodeprep-scripts: set-storage
$(call upload_script, ../../cluster_config/docker.service)
$(call upload_script, ../../cluster_config/nodeprep.sh)
create-workspace:
az batchai workspace create -n $(WORKSPACE) -g $(GROUP_NAME)
create-experiment:
az batchai experiment create -n $(EXPERIMENT) -g $(GROUP_NAME) -w $(WORKSPACE)
show-cluster:
az batchai cluster show -n ${CLUSTER_NAME} -w $(WORKSPACE)
list-clusters:
az batchai cluster list -w $(WORKSPACE) -o table
list-nodes:
az batchai cluster node list -c ${CLUSTER_NAME} -w $(WORKSPACE)
list-jobs:
az batchai job list -w $(WORKSPACE) -e $(EXPERIMENT) -o table
list-files:
az batchai job file list -w $(WORKSPACE) -e $(EXPERIMENT) --j ${JOB_NAME} --output-directory-id stdouterr
stream-stdout:
$(call stream_stdout, ${JOB_NAME})
stream-stderr:
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) --j ${JOB_NAME} --output-directory-id stdouterr -f stderr.txt
delete-job:
$(call delete_job, ${JOB_NAME})
delete-cluster:
az configure --defaults group=''
az configure --defaults location=''
az batchai cluster delete -w $(WORKSPACE) --name ${CLUSTER_NAME} -g ${GROUP_NAME} -y
delete: delete-cluster
az batchai experiment delete -w $(WORKSPACE) --name ${EXPERIMENT} -g ${GROUP_NAME} -y
az batchai workspace delete -w ${WORKSPACE} -g ${GROUP_NAME} -y
az group delete --name ${GROUP_NAME} -y
setup: select-subscription create-resource-group create-workspace create-storage set-storage set-az-defaults create-fileshare create-directory upload-scripts create-cluster list-clusters create-experiment
@echo "Cluster created"
#
####### Submit Jobs ######
#
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
submit-keras-local submit-tf-local submit-pytorch-local submit-cntk-local
clean-jobs:
$(call delete_job, tf-local)
$(call delete_job, tf-intel-4)
$(call delete_job, tf-intel-8)
$(call delete_job, tf-intel-16)
$(call delete_job, tf-intel-32)
$(call delete_job, keras-local)
$(call delete_job, keras-intel-4)
$(call delete_job, keras-intel-8)
$(call delete_job, keras-intel-16)
$(call delete_job, keras-intel-32)
$(call delete_job, pytorch-local)
$(call delete_job, pytorch-4)
$(call delete_job, pytorch-8)
$(call delete_job, pytorch-16)
$(call delete_job, pytorch-32)
$(call delete_job, pytorch_gloo-4)
$(call delete_job, pytorch_gloo-8)
$(call delete_job, pytorch_gloo-16)
$(call delete_job, pytorch_gloo-32)
$(call delete_job, cntk-local)
$(call delete_job, cntk-4)
$(call delete_job, cntk-8)
$(call delete_job, cntk-16)
$(call delete_job, cntk-32)
####### Gather Results ######
# TODO for PyTorch_Gloo
gather-results: results.json
@echo "All results gathered"
results.json: pytorch_1gpulocal_$(GPU_TYPE)_local.results pytorch_4gpuopen_$(GPU_TYPE)_open.results \
pytorch_8gpuopen_$(GPU_TYPE)_open.results pytorch_16gpuopen_$(GPU_TYPE)_open.results \
pytorch_32gpuopen_$(GPU_TYPE)_open.results \
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results \
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results \
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results \
tf_1gpulocal_$(GPU_TYPE)_local.results tf_4gpuintel_$(GPU_TYPE)_intel.results \
tf_8gpuintel_$(GPU_TYPE)_intel.results tf_16gpuintel_$(GPU_TYPE)_intel.results \
tf_32gpuintel_$(GPU_TYPE)_intel.results \
keras_1gpulocal_$(GPU_TYPE)_local.results keras_4gpuintel_$(GPU_TYPE)_intel.results \
keras_8gpuintel_$(GPU_TYPE)_intel.results keras_16gpuintel_$(GPU_TYPE)_intel.results \
keras_32gpuintel_$(GPU_TYPE)_intel.results \
cntk_1gpulocal_$(GPU_TYPE)_local.results cntk_4gpuintel_$(GPU_TYPE)_intel.results \
cntk_8gpuintel_$(GPU_TYPE)_intel.results cntk_16gpuintel_$(GPU_TYPE)_intel.results \
cntk_32gpuintel_$(GPU_TYPE)_intel.results
python ../parse_results.py
pytorch_1gpulocal_$(GPU_TYPE)_local.results:
$(call stream_stdout, pytorch-local)>pytorch_1gpulocal_$(GPU_TYPE)_local.results
pytorch_4gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch-4)>pytorch_4gpuopen_$(GPU_TYPE)_open.results
pytorch_8gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch-8)>pytorch_8gpuopen_$(GPU_TYPE)_open.results
pytorch_16gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch-16)>pytorch_16gpuopen_$(GPU_TYPE)_open.results
pytorch_32gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch-32)>pytorch_32gpuopen_$(GPU_TYPE)_open.results
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results:
$(call stream_stdout, pytorch_gloo-local)>pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results
pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch_gloo-4)>pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch_gloo-8)>pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results
pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch_gloo-16)>pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results:
$(call stream_stdout, pytorch_gloo-32)>pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results
tf_1gpulocal_$(GPU_TYPE)_local.results:
$(call stream_stdout, tf-local)>tf_1gpulocal_$(GPU_TYPE)_local.results
tf_4gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, tf-intel-4)>tf_4gpuintel_$(GPU_TYPE)_intel.results
tf_8gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, tf-intel-8)>tf_8gpuintel_$(GPU_TYPE)_intel.results
tf_16gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, tf-intel-16)>tf_16gpuintel_$(GPU_TYPE)_intel.results
tf_32gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, tf-intel-32)>tf_32gpuintel_$(GPU_TYPE)_intel.results
keras_1gpulocal_$(GPU_TYPE)_local.results:
$(call stream_stdout, keras-local)>keras_1gpulocal_$(GPU_TYPE)_local.results
keras_4gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, keras-intel-4)>keras_4gpuintel_$(GPU_TYPE)_intel.results
keras_8gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, keras-intel-8)>keras_8gpuintel_$(GPU_TYPE)_intel.results
keras_16gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, keras-intel-16)>keras_16gpuintel_$(GPU_TYPE)_intel.results
keras_32gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, keras-intel-32)>keras_32gpuintel_$(GPU_TYPE)_intel.results
cntk_1gpulocal_$(GPU_TYPE)_local.results:
$(call stream_stdout, cntk-local)>cntk_1gpulocal_$(GPU_TYPE)_local.results
cntk_4gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, cntk-4)>cntk_4gpuintel_$(GPU_TYPE)_intel.results
cntk_8gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, cntk-8)>cntk_8gpuintel_$(GPU_TYPE)_intel.results
cntk_16gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, cntk-16)>cntk_16gpuintel_$(GPU_TYPE)_intel.results
cntk_32gpuintel_$(GPU_TYPE)_intel.results:
$(call stream_stdout, cntk-32)>cntk_32gpuintel_$(GPU_TYPE)_intel.results
clean-results:
rm results.json
rm *.results
plot: results.json
python ../produce_plot.py
.PHONY: help build push

View file

@ -0,0 +1 @@
{"Id": null, "Scope": "/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/amldistrg/providers/Microsoft.MachineLearningServices/workspaces/workspace"}

View file

@ -0,0 +1,55 @@
define PROJECT_HELP_MSG
Makefile to control project aml_dist
Usage:
help show this message
build build docker image to use as control plane
bash run bash inside running docker container
stop stop running docker container
endef
export PROJECT_HELP_MSG
PWD:=$(shell pwd)
PORT:=9999
TBOARD_PORT:=6006
# Name of running container
NAME:=aml_dist
setup_environment_file:=--env-file .env
include .env
local_code_volume:=-v $(PWD):/workspace
volumes:=-v /tmp/azureml_runs:/tmp/azureml_runs \
-v $(DATA):/data \
-v ${HOME}/.bash_history:/root/.bash_history
help:
echo "$$PROJECT_HELP_MSG" | less
build:
docker build -t $(IMAGE_NAME) -f control/Docker/dockerfile control/Docker
/tmp/azureml_runs:
mkdir -p /tmp/azureml_runs
run: /tmp/azureml_runs
# Start docker running as daemon
docker run $(local_code_volume) $(volumes) $(setup_environment_file) \
--name $(NAME) \
-p $(PORT):$(PORT) \
-p $(TBOARD_PORT):$(TBOARD_PORT) \
-d \
-v /var/run/docker.sock:/var/run/docker.sock \
-e HIST_FILE=/root/.bash_history \
-it $(IMAGE_NAME)
# Attach to running container and create new tmux session
docker exec -it $(NAME) bash -c "tmux new -s dist -n control"
bash:
docker exec -it $(NAME) bash -c "tmux a -t dist"
stop:
docker stop $(NAME)
docker rm $(NAME)
.PHONY: help build run bash stop

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow-gpu==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,91 @@
from invoke import task, Collection
import os
from config import load_config
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
env_values = load_config()
def _benchmark_code_exists():
dir_path = os.path.dirname(os.path.realpath(__file__))
return os.path.exists(os.path.join(dir_path, "src", "tf_cnn_benchmarks.py"))
@task
def clone_benchmarks(c):
"""Clones the Tensorflow benchmarks from https://github.com/tensorflow/benchmarks.git into the src folder
"""
if _benchmark_code_exists():
return None
c.run(
"git clone -b cnn_tf_v1.12_compatible https://github.com/tensorflow/benchmarks.git"
)
dir_path = os.path.dirname(os.path.realpath(__file__))
c.run(
f"cp -r benchmarks/scripts/tf_cnn_benchmarks/* {os.path.join(dir_path, 'src')}"
)
c.run("rm -r benchmarks")
@task(pre=[clone_benchmarks])
def submit_tf_benchmark(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
"""Submits TensorFlow benchmark job using synthetic data on remote cluster
Args:
node_count (int, optional): The number of nodes to use in cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
Note:
Runs ResNet 50 model with batch size of 256 and mixed precision
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("tf_benchmark")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"tf_cnn_benchmarks.py",
{
"--model": "resnet50",
"--batch_size": 256,
"--variable_update": "horovod",
"--use_fp16": "",
},
node_count=node_count,
dependencies_file="TensorFlow_benchmark/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task(pre=[clone_benchmarks])
def submit_tf_benchmark_local(c):
"""Submits TensorFlow benchmark job using synthetic data for local execution
Note:
Runs ResNet 50 model with batch size of 256 and mixed precision
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("tf_benchmark")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"tf_cnn_benchmarks.py",
{
"--model": "resnet50",
"--batch_size": 256,
"--variable_update": "horovod",
"--use_fp16": "",
},
dependencies_file="TensorFlow_benchmark/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
remote_collection = Collection("remote")
remote_collection.add_task(submit_tf_benchmark, "synthetic")
local_collection = Collection("local")
local_collection.add_task(submit_tf_benchmark_local, "synthetic")
submit_collection = Collection("submit", local_collection, remote_collection)
namespace = Collection("tf_benchmark", submit_collection)

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow-gpu==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,7 @@
.ipynb_checkpoints
azureml-logs
.azureml
.git
outputs
azureml-setup
docs

View file

@ -0,0 +1,33 @@
[loggers]
keys=root,__main__,tensorflow
[handlers]
keys=consoleHandler
[formatters]
keys=simpleFormatter
[logger_root]
level=WARNING
handlers=consoleHandler
[logger___main__]
level=INFO
handlers=consoleHandler
qualname=__main__
propagate=0
[logger_tensorflow]
level=INFO
handlers=consoleHandler
qualname=tensorflow
propagate=0
[handler_consoleHandler]
class=StreamHandler
level=INFO
formatter=simpleFormatter
args=(sys.stdout,)
[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
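This configuration is consumed by the training scripts later in this commit via logging.config.fileConfig; a minimal sketch of the loading pattern:

import logging
import logging.config
import os

# Fall back to the bundled logging.conf when LOG_CONFIG is not set.
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
logger = logging.getLogger(__name__)
logger.info("Logging configured")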

View file

@ -0,0 +1,152 @@
"""Script to train model using TensorFlow and Horovod
Please complete the necessary functions and assign values to the required variables
For instructions on using TensorFlow see: https://www.tensorflow.org/
For instructions on using Horovod see: https://github.com/horovod/horovod
"""
import logging.config
import fire
import os
import tensorflow as tf
DISTRIBUTED = False
LR = 0.001
MOMENTUM = 0.9
NUM_CLASSES = None  # TODO: set the number of classes for your dataset
if DISTRIBUTED:
import horovod.tensorflow as hvd
def _get_rank():
if DISTRIBUTED:
try:
return hvd.rank()
except:
return 0
else:
return 0
def _get_optimizer(params, is_distributed=DISTRIBUTED):
if is_distributed:
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(
tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"] * hvd.size(),
momentum=params["momentum"],
)
)
else:
return tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"], momentum=params["momentum"]
)
def build_network(features, mode, params):
""" Build Model
Args:
features:
mode:
params:
Returns:
Model function
"""
return None
def model_fn(features, labels, mode, params):
"""Model function that returns the estimator spec
Args:
features: This is the x-arg from the input_fn.
labels: This is the y-arg from the input_fn,
see e.g. train_input_fn for these two.
mode: Either TRAIN, EVAL, or PREDICT
params: User-defined hyper-parameters, e.g. learning-rate.
Returns:
tf.estimator.EstimatorSpec: Estimator specification
"""
return None
def input_fn():
"""Input function which provides batches for train or eval.
Returns:
A dataset that can be used for iteration.
"""
return None
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
return tf.estimator.RunConfig(
save_checkpoints_steps=save_checkpoints_steps,
save_checkpoints_secs=None,
session_config=config,
log_step_count_steps=100,
)
else:
return tf.estimator.RunConfig(
save_checkpoints_steps=save_checkpoints_steps,
save_checkpoints_secs=None,
log_step_count_steps=100,
)
def _get_hooks(is_distributed=DISTRIBUTED):
logger = logging.getLogger(__name__)
if is_distributed:
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
return [bcast_hook]
else:
return []
def main():
"""Train your model
"""
logger = logging.getLogger(__name__)
if DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Running Distributed")
logger.info("Num GPUs: {:.3f}".format(hvd.size()))
input_function = input_fn
run_config = _get_runconfig()
params = {
"learning_rate": LR,
"momentum": MOMENTUM,
"classes": NUM_CLASSES,
}
logger.info("Creating estimator with params: {}".format(params))
model = tf.estimator.Estimator(
model_fn=model_fn, params=params, config=run_config
)
hooks = _get_hooks()
model.train(input_fn=input_function, hooks=hooks)
model.evaluate(input_fn=input_function)
if __name__ == "__main__":
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
fire.Fire(main)
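As a hedged illustration of how the placeholders above might be completed, here is a minimal sketch using a toy fully connected network and an assumed NUM_CLASSES of 10; it reuses _get_optimizer from the template and is not the project's ResNet implementation (that appears later in this commit):

import tensorflow as tf

NUM_CLASSES = 10  # assumption for this sketch

def build_network(features, mode, params):
    # Toy stand-in for a real architecture: flatten plus one hidden layer.
    x = tf.layers.flatten(features)
    x = tf.layers.dense(x, 128, activation=tf.nn.relu)
    return tf.layers.dense(x, params["classes"])

def model_fn(features, labels, mode, params):
    logits = build_network(features, mode, params)
    class_ids = tf.argmax(logits, axis=1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions={"class_ids": class_ids})
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    )
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels=labels, predictions=class_ids)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={"accuracy": accuracy})
    optimizer = _get_optimizer(params)  # defined in the template above
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

def input_fn():
    # Random images and labels; replace with a real tf.data pipeline.
    features = tf.random.uniform([256, 28, 28, 1])
    labels = tf.random.uniform([256], maxval=NUM_CLASSES, dtype=tf.int32)
    return tf.data.Dataset.from_tensor_slices((features, labels)).batch(32).repeat()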

View file

@ -0,0 +1,126 @@
""" This is an example template that you can use to create functions that you can call with invoke
"""
from invoke import task, Collection
import os
from config import load_config
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
env_values = load_config()
@task
def submit_local(c):
"""This command isn't implemented please modify to use.
The call below will work for submitting jobs to execute locally on a GPU.
"""
raise NotImplementedError(
"You need to modify this call before being able to use it"
)
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"<YOUR-TRAINING-SCRIPT>",
{"YOUR": "ARGS"},
dependencies_file="TensorFlow/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_remote(c):
"""This command isn't implemented please modify to use.
The call below will work for submitting jobs to execute on a remote cluster using GPUs.
"""
raise NotImplementedError(
"You need to modify this call before being able to use it"
)
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"<YOUR-TRAINING-SCRIPT>",
{"YOUR": "ARGS"},
node_count=4,
dependencies_file="TensorFlow/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_images(c):
"""This command isn't implemented please modify to use.
The call below will work for submitting jobs to execute on a remote cluster using GPUs.
Notice that we are passing in a {datastore} parameter to the path. This tells the submit
method that we want the location as mapped by the datastore to be inserted here. Upon
execution the appropriate path will be prepended to the training_data_path and validation_data_path.
"""
raise NotImplementedError(
"You need to modify this call before being able to use it"
)
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"<YOUR-TRAINING-SCRIPT>",
{
"--training_data_path": "{datastore}/train",
"--validation_data_path": "{datastore}/validation",
"--epochs": "1",
"--data_type": "images",
"--data-format": "channels_first",
},
node_count=4,
dependencies_file="TensorFlow/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_images_local(c):
"""This command isn't implemented please modify to use.
The call below will work for submitting jobs to execute locally on a GPU.
Here we also map a volume to the docker container executing locally. This is the
location where our script looks for the training and validation data. Feel free to
adjust the other arguments as required by your training script.
"""
raise NotImplementedError(
"You need to modify this call before being able to use it"
)
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"<YOUR-TRAINING-SCRIPT>",
{
"--training_data_path": "/data/train",
"--validation_data_path": "/data/validation",
"--epochs": "1",
"--data_type": "images",
"--data-format": "channels_first",
},
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
docker_args=["-v", f"{env_values['data']}:/data"],
wait_for_completion=True,
)
print(run)
remote_collection = Collection("remote")
remote_collection.add_task(submit_images, "images")
remote_collection.add_task(submit_remote, "synthetic")
local_collection = Collection("local")
local_collection.add_task(submit_images_local, "images")
local_collection.add_task(submit_local, "synthetic")
submit_collection = Collection("submit", local_collection, remote_collection)
namespace = Collection("tf_experiment", submit_collection)

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,14 @@
name: project_environment
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- pandas
- numpy
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-defaults
- tensorflow-gpu==1.12.0
- horovod==0.15.2
- fire
- toolz

View file

@ -0,0 +1,7 @@
.ipynb_checkpoints
azureml-logs
.azureml
.git
outputs
azureml-setup
docs

View file

View file

@ -0,0 +1,209 @@
import json
import logging
from pathlib import Path
import horovod.tensorflow as hvd
import tensorflow as tf
from toolz import pipe
import defaults
import imagenet_preprocessing
_NOUNID_LOOKUP_FILE = "imagenet_nounid_to_class.json"
def _create_nounid_lookup(nounid_lookup_file=_NOUNID_LOOKUP_FILE):
with open(nounid_lookup_file, "r") as read_file:
nounid_lookup_dict = json.load(read_file)
def _lookup(nounid):
return nounid_lookup_dict[nounid]
return _lookup
def _load_data(data_dir):
filenames = []
labels = []
lookup = _create_nounid_lookup()
for path_obj in Path(data_dir).glob("**/*.JPEG"):
filenames.append(str(path_obj))
labels.append(lookup(path_obj.parts[-2]))
return filenames, labels
def _preprocess_labels(label):
return tf.cast(label, dtype=tf.int32)
def _preprocess_images(filename):
return pipe(filename, tf.read_file)
def _prep(filename, num_label):
return tf.data.Dataset.from_tensor_slices(
([_preprocess_images(filename)], [_preprocess_labels(num_label)])
)
def parse_record(
image_buffer,
label,
is_training,
dtype,
data_format="channels_last",
image_size=defaults.DEFAULT_IMAGE_SIZE,
num_channels=defaults.NUM_CHANNELS,
):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
Args:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
is_training: A boolean denoting whether the input is for training.
dtype: data type to use for images/features.
data_format: the axis order of the matrix, channels_last NHWC or channels_first NCHW
Returns:
Tuple with processed image tensor and one-hot-encoded label tensor.
"""
image = imagenet_preprocessing.preprocess_image(
image_buffer=image_buffer,
output_height=image_size,
output_width=image_size,
num_channels=num_channels,
is_training=is_training,
data_format=data_format,
)
image = tf.cast(image, dtype)
return image, label
def process_image_dataset(dataset,
is_training,
batch_size,
shuffle_buffer,
parse_record_fn,
num_epochs=1,
dtype=tf.float32,
data_format="channels_last",
num_parallel_batches=1):
"""Given a Dataset with raw records, return an iterator over the records.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup
time and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
num_epochs: The number of epochs to repeat the dataset.
dtype: Data type to use for images/features.
num_parallel_batches: Number of parallel batches for tf.data.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# Prefetches a batch at a time to smooth out the time taken to load input
# files for shuffling and processing.
dataset = dataset.prefetch(buffer_size=batch_size)
if is_training:
# Shuffles records before repeating to respect epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# Repeats the dataset for the number of epochs to train.
dataset = dataset.repeat(num_epochs)
# Parses the raw records into images and labels.
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
lambda image, label: parse_record_fn(image, label, is_training, dtype, data_format=data_format),
batch_size=batch_size,
num_parallel_batches=num_parallel_batches,
drop_remainder=False))
dataset = dataset.prefetch(buffer_size=100)
return dataset
def input_fn(
is_training,
data_dir,
batch_size,
num_epochs=1,
dtype=tf.float32,
num_parallel_batches=1,
parse_record_fn=parse_record,
data_format="channels_last",
distributed=False,
file_shuffle_buffer=1000,
data_shuffle_buffer=defaults.SHUFFLE_BUFFER,
):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
num_epochs: The number of epochs to repeat the dataset.
dtype: Data type to use for images/features
num_parallel_batches: Number of parallel batches for tf.data.
parse_record_fn: Function to use for parsing the records.
Returns:
A dataset that can be used for iteration.
"""
logger = logging.getLogger(__name__)
logger.info(f"Reading data info from {data_dir}")
buffer_length = 1024
parallel_num = 5
filenames, labels = _load_data(data_dir)
logger.info(f"Found {len(filenames)} files")
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
if is_training:
# Shuffle the input files
if distributed:
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.shuffle(buffer_size=file_shuffle_buffer) # _NUM_TRAIN_FILES
# Convert to individual records.
# cycle_length = 10 means 10 files will be read and deserialized in parallel.
# This number is low enough to not cause too much contention on small systems
# but high enough to provide the benefits of parallelization. You may want
# to increase this number if you have a large number of CPU cores.
dataset = dataset.apply(
tf.data.experimental.parallel_interleave(
_prep,
cycle_length=parallel_num,
buffer_output_elements=buffer_length,
sloppy=True,
)
)
return process_image_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=data_shuffle_buffer,
parse_record_fn=parse_record_fn,
num_epochs=num_epochs,
dtype=dtype,
num_parallel_batches=num_parallel_batches,
data_format=data_format,
)
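The lookup file read by _create_nounid_lookup above is assumed to map WordNet noun IDs (the per-class directory names under data_dir) to integer labels; a minimal sketch of producing such a file with hypothetical entries:

import json

# Two ImageNet synset IDs mapped to integer class labels (illustrative only).
nounid_to_class = {"n01440764": 0, "n01443537": 1}
with open("imagenet_nounid_to_class.json", "w") as f:
    json.dump(nounid_to_class, f)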

View file

@ -0,0 +1,52 @@
import tensorflow as tf
def get_synth_input_fn(height, width, num_channels, num_classes, dtype=tf.float32):
"""Returns an input function that returns a dataset with random data.
This input_fn returns a data set that iterates over a set of random data and
bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
copy is still included. This is used to find the upper throughput bound when
tuning the full input pipeline.
Args:
height: Integer height that will be used to create a fake image tensor.
width: Integer width that will be used to create a fake image tensor.
num_channels: Integer depth that will be used to create a fake image tensor.
num_classes: Number of classes that should be represented in the fake labels
tensor
dtype: Data type for features/images.
Returns:
An input_fn that can be used in place of a real one to return a dataset
that can be used for iteration.
"""
def input_fn(
is_training, data_dir, batch_size, *args, data_format="channels_last", **kwargs
):
"""Returns dataset filled with random data."""
# Synthetic input should be within [0, 255].
if data_format == "channels_last":
shape = [height, width, num_channels]
else:
shape = [num_channels, height, width]
inputs = tf.random.truncated_normal(
[batch_size] + shape,
dtype=dtype,
mean=127,
stddev=60,
name="synthetic_inputs",
)
labels = tf.random.uniform(
[batch_size],
minval=0,
maxval=num_classes - 1,
dtype=tf.int32,
name="synthetic_labels",
)
data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
data = data.prefetch(buffer_size=1024)
return data
return input_fn
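For example, a short sketch wiring the synthetic input function up with the ImageNet defaults used elsewhere in this commit (224x224x3 images, 1001 classes):

input_fn = get_synth_input_fn(224, 224, 3, 1001)
# data_dir is ignored for synthetic data but kept for signature compatibility.
dataset = input_fn(True, None, 64)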

View file

@ -0,0 +1,218 @@
import logging
import os
import horovod.tensorflow as hvd
import tensorflow as tf
import defaults
import imagenet_preprocessing
def get_filenames(is_training, data_dir, num_files=1024):
"""Return filenames for dataset."""
if is_training:
return [
os.path.join(data_dir, "train-%05d-of-01014" % i) for i in range(num_files)
]
else:
return [
os.path.join(data_dir, "validation-%05d-of-00128" % i) for i in range(128)
]
def _parse_example_proto(example_serialized):
"""Parses an Example proto containing a training example of an image.
The output of the build_image_data.py image preprocessing script is a dataset
containing serialized Example protocol buffers. Each Example proto contains
the following fields (values are included as examples):
image/height: 462
image/width: 581
image/colorspace: 'RGB'
image/channels: 3
image/class/label: 615
image/class/text: 'knee pad'
image/format: 'JPEG'
image/filename: 'ILSVRC2012_val_00041207.JPEG'
image/encoded: <JPEG encoded string>
Args:
example_serialized: scalar Tensor tf.string containing a serialized
Example protocol buffer.
Returns:
image_buffer: Tensor tf.string containing the contents of a JPEG file.
label: Tensor tf.int32 containing the label.
"""
feature_map = {
"image/encoded": tf.io.FixedLenFeature([], dtype=tf.string, default_value=""),
"image/class/label": tf.io.FixedLenFeature(
[], dtype=tf.int64, default_value=-1
),
"image/class/text": tf.io.FixedLenFeature(
[], dtype=tf.string, default_value=""
),
}
features = tf.io.parse_single_example(
serialized=example_serialized, features=feature_map
)
label = tf.cast(features["image/class/label"], dtype=tf.int32)
return features["image/encoded"], label
def parse_record(
raw_record,
is_training,
dtype,
data_format="channels_last",
image_size=defaults.DEFAULT_IMAGE_SIZE,
num_channels=defaults.NUM_CHANNELS,
):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
Args:
raw_record: scalar Tensor tf.string containing a serialized
Example protocol buffer.
is_training: A boolean denoting whether the input is for training.
dtype: data type to use for images/features.
data_format: the axis order of the matrix, channels_last NHWC or channels_first NCHW
Returns:
Tuple with processed image tensor and one-hot-encoded label tensor.
"""
image_buffer, label = _parse_example_proto(raw_record)
image = imagenet_preprocessing.preprocess_image(
image_buffer=image_buffer,
output_height=image_size,
output_width=image_size,
num_channels=num_channels,
is_training=is_training,
data_format=data_format,
)
image = tf.cast(image, dtype)
return image, label
def input_fn(
is_training,
data_dir,
batch_size,
num_epochs=1,
dtype=tf.float32,
num_parallel_batches=1,
parse_record_fn=parse_record,
data_format="channels_last",
distributed=False,
file_shuffle_buffer=10,
data_shuffle_buffer=defaults.SHUFFLE_BUFFER,
):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
num_epochs: The number of epochs to repeat the dataset.
dtype: Data type to use for images/features
num_parallel_batches: Number of parallel batches for tf.data.
parse_record_fn: Function to use for parsing the records.
Returns:
A dataset that can be used for iteration.
"""
logger = logging.getLogger(__name__)
logger.info(f"Reading data info from {data_dir}")
filenames = get_filenames(is_training, data_dir)
for f in filenames:
if not os.path.exists(f):
raise ValueError(f"{f} File doesn't exist ")
logger.info(f"Found {len(filenames)} files")
dataset = tf.data.Dataset.from_tensor_slices(filenames)
if is_training:
# Shuffle the input files
if distributed:
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.shuffle(buffer_size=file_shuffle_buffer) # _NUM_TRAIN_FILES
# Convert to individual records.
# cycle_length = 10 means 10 files will be read and deserialized in parallel.
# This number is low enough to not cause too much contention on small systems
# but high enough to provide the benefits of parallelization. You may want
# to increase this number if you have a large number of CPU cores.
dataset = dataset.apply(
tf.data.experimental.parallel_interleave(
tf.data.TFRecordDataset,
cycle_length=num_parallel_batches,
buffer_output_elements=10,
)
)
return process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=data_shuffle_buffer,
parse_record_fn=parse_record_fn,
num_epochs=num_epochs,
dtype=dtype,
num_parallel_batches=num_parallel_batches,
data_format=data_format,
)
def process_record_dataset(dataset,
is_training,
batch_size,
shuffle_buffer,
parse_record_fn,
num_epochs=1,
dtype=tf.float32,
data_format="channels_last",
num_parallel_batches=1):
"""Given a Dataset with raw records, return an iterator over the records.
Args:
dataset: A Dataset representing raw records
is_training: A boolean denoting whether the input is for training.
batch_size: The number of samples per batch.
shuffle_buffer: The buffer size to use when shuffling records. A larger
value results in better randomness, but smaller values reduce startup
time and use less memory.
parse_record_fn: A function that takes a raw record and returns the
corresponding (image, label) pair.
num_epochs: The number of epochs to repeat the dataset.
dtype: Data type to use for images/features.
num_parallel_batches: Number of parallel batches for tf.data.
Returns:
Dataset of (image, label) pairs ready for iteration.
"""
# Prefetches a batch at a time to smooth out the time taken to load input
# files for shuffling and processing.
dataset = dataset.prefetch(buffer_size=batch_size)
if is_training:
# Shuffles records before repeating to respect epoch boundaries.
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
# Repeats the dataset for the number of epochs to train.
dataset = dataset.repeat(num_epochs)
# Parses the raw records into images and labels.
dataset = dataset.apply(
tf.data.experimental.map_and_batch(
lambda value: parse_record_fn(value, is_training, dtype, data_format=data_format),
batch_size=batch_size,
num_parallel_batches=num_parallel_batches,
drop_remainder=False))
dataset = dataset.prefetch(buffer_size=1024)
return dataset

View file

@ -0,0 +1,25 @@
import os
from utils import str_to_bool
LR = 0.001
EPOCHS = int(os.getenv("EPOCHS", 5))
_BATCHSIZE = 64
R_MEAN = 123.68
G_MEAN = 116.78
B_MEAN = 103.94
BUFFER = 256
DEFAULT_IMAGE_SIZE = 224
NUM_CHANNELS = 3
NUM_CLASSES = 1001
NUM_IMAGES = {"train": 1_281_167, "validation": 50000}
NUM_TRAIN_FILES = 1024
SHUFFLE_BUFFER = 1000
DATA_LENGTH = int(
os.getenv("FAKE_DATA_LENGTH", 1_281_167)
) # How much fake data to simulate, default to size of imagenet dataset
DATASET_NAME = "ImageNet"
DISTRIBUTED = str_to_bool(os.getenv("DISTRIBUTED", "False"))
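str_to_bool is imported from a utils module that is not part of this hunk; it is assumed to behave roughly like the sketch below:

def str_to_bool(value):
    # Assumed behaviour of utils.str_to_bool: common truthy strings map to True.
    return str(value).lower() in ("true", "1", "yes")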

File diff hidden because one or more lines are too long

View file

@ -0,0 +1,222 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
Training images are sampled using the provided bounding boxes, and subsequently
cropped to the sampled bounding box. Images are additionally flipped randomly,
then resized to the target output size (without aspect-ratio preservation).
Images used during evaluation are resized (with aspect-ratio preservation) and
centrally cropped.
All images undergo mean color subtraction.
Note that these steps are colloquially referred to as "ResNet preprocessing,"
and they differ from "VGG preprocessing," which does not use bounding boxes
and instead does an aspect-preserving resize followed by random crop during
training. (These both differ from "Inception preprocessing," which introduces
color distortion steps.)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
# The lower bound for the smallest side of the image for aspect-preserving
# resizing. For example, if an image is 500 x 1000, it will be resized to
# _RESIZE_MIN x (_RESIZE_MIN * 2).
_RESIZE_MIN = 256
def _central_crop(image, crop_height, crop_width):
"""Performs central crops of the given image list.
Args:
image: a 3-D image tensor
crop_height: the height of the image following the crop.
crop_width: the width of the image following the crop.
Returns:
3-D tensor with cropped image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
amount_to_be_cropped_h = height - crop_height
crop_top = amount_to_be_cropped_h // 2
amount_to_be_cropped_w = width - crop_width
crop_left = amount_to_be_cropped_w // 2
return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
def _mean_image_subtraction(image, means, num_channels):
"""Subtracts the given means from each image channel.
For example:
means = [123.68, 116.779, 103.939]
image = _mean_image_subtraction(image, means)
Note that the rank of `image` must be known.
Args:
image: a tensor of size [height, width, C].
means: a C-vector of values to subtract from each channel.
num_channels: number of color channels in the image that will be distorted.
Returns:
the centered image.
Raises:
ValueError: If the rank of `image` is unknown, if `image` has a rank other
than three or if the number of channels in `image` doesn't match the
number of values in `means`.
"""
if image.get_shape().ndims != 3:
raise ValueError("Input must be of size [height, width, C>0]")
if len(means) != num_channels:
raise ValueError("len(means) must match the number of channels")
# We have a 1-D tensor of means; convert to 3-D.
means = tf.expand_dims(tf.expand_dims(means, 0), 0)
return image - means
def _smallest_size_at_least(height, width, resize_min):
"""Computes new shape with the smallest side equal to `smallest_side`.
Computes new shape with the smallest side equal to `smallest_side` while
preserving the original aspect ratio.
Args:
height: an int32 scalar tensor indicating the current height.
width: an int32 scalar tensor indicating the current width.
resize_min: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
new_height: an int32 scalar tensor indicating the new height.
new_width: an int32 scalar tensor indicating the new width.
"""
resize_min = tf.cast(resize_min, tf.float32)
# Convert to floats to make subsequent calculations go smoothly.
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
smaller_dim = tf.minimum(height, width)
scale_ratio = resize_min / smaller_dim
# Convert back to ints to make heights and widths that TF ops will accept.
new_height = tf.cast(height * scale_ratio, tf.int32)
new_width = tf.cast(width * scale_ratio, tf.int32)
return new_height, new_width
def _aspect_preserving_resize(image, resize_min):
"""Resize images preserving the original aspect ratio.
Args:
image: A 3-D image `Tensor`.
resize_min: A python integer or scalar `Tensor` indicating the size of
the smallest side after resize.
Returns:
resized_image: A 3-D tensor containing the resized image.
"""
shape = tf.shape(input=image)
height, width = shape[0], shape[1]
new_height, new_width = _smallest_size_at_least(height, width, resize_min)
return _resize_image(image, new_height, new_width)
def _resize_image(image, height, width):
"""Simple wrapper around tf.resize_images.
This is primarily to make sure we use the same `ResizeMethod` and other
details each time.
Args:
image: A 3-D image `Tensor`.
height: The target height for the resized image.
width: The target width for the resized image.
Returns:
resized_image: A 3-D tensor containing the resized image. The first two
dimensions have the shape [height, width].
"""
return tf.image.resize_images(
image,
[height, width],
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False,
)
def preprocess_image(
image_buffer,
output_height,
output_width,
num_channels,
is_training=False,
data_format="channels_last",
):
"""Preprocesses the given image.
Preprocessing includes decoding, cropping, and resizing for both training
and eval images. Training preprocessing, however, introduces some random
distortion of the image to improve accuracy.
Args:
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
output_height: The height of the image after preprocessing.
output_width: The width of the image after preprocessing.
num_channels: Integer depth of the image buffer for decoding.
is_training: `True` if we're preprocessing the image for training and
`False` otherwise.
Returns:
A preprocessed image.
"""
if is_training:
# For training, decode the image and resize it to the output size (random distortions are omitted here).
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
image = _resize_image(image, output_height, output_width)
else:
# For validation, we want to decode, resize, then just crop the middle.
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
image = _aspect_preserving_resize(image, _RESIZE_MIN)
image = _central_crop(image, output_height, output_width)
image.set_shape([output_height, output_width, num_channels])
image = _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
if data_format == "channels_first":
image = tf.transpose(image, [2, 0, 1]) # Transform from NHWC to NCHW
image.set_shape([num_channels, output_height, output_width])
return image
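A short usage sketch, assuming a JPEG file on disk (the path is illustrative only):

import tensorflow as tf

# Read raw bytes and apply evaluation preprocessing (resize plus central crop).
image_buffer = tf.read_file("ILSVRC2012_val_00041207.JPEG")  # hypothetical path
image = preprocess_image(image_buffer, 224, 224, 3, is_training=False)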

View file

@ -0,0 +1,33 @@
[loggers]
keys=root,__main__,tensorflow
[handlers]
keys=consoleHandler
[formatters]
keys=simpleFormatter
[logger_root]
level=INFO
handlers=consoleHandler
[logger___main__]
level=DEBUG
handlers=consoleHandler
qualname=__main__
propagate=0
[logger_tensorflow]
level=DEBUG
handlers=consoleHandler
qualname=tensorflow
propagate=0
[handler_consoleHandler]
class=StreamHandler
level=DEBUG
formatter=simpleFormatter
args=(sys.stdout,)
[formatter_simpleFormatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s

View file

@ -0,0 +1,308 @@
""" This the script is the main entry point for training ResNet model using TensorFlow with Horovod
"""
import logging
import logging.config
import os
import fire
import tensorflow as tf
from data.synthetic import get_synth_input_fn
from data import tfrecords, images
from resnet_model import resnet_v1
from timer import Timer
from utils import ExamplesPerSecondHook
import defaults
if defaults.DISTRIBUTED:
import horovod.tensorflow as hvd
def _get_rank():
if defaults.DISTRIBUTED:
try:
return hvd.rank()
except:
return 0
else:
return 0
# Model and training
###############################################################################
def _get_optimizer(params, is_distributed=defaults.DISTRIBUTED):
if is_distributed:
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(
tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"] * hvd.size(),
momentum=params["momentum"],
)
)
else:
return tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"], momentum=params["momentum"]
)
def build_network(features, mode, params):
""" Build ResNet50 Model
Args:
features:
mode:
params:
Returns:
Model function
"""
network = resnet_v1(
resnet_depth=50,
num_classes=params["classes"],
data_format=params["data_format"],
)
return network(inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
def model_fn(features, labels, mode, params):
"""Model function that returns the estimator spec
Args:
features: This is the x-arg from the input_fn.
labels: This is the y-arg from the input_fn,
see e.g. train_input_fn for these two.
mode: Either TRAIN, EVAL, or PREDICT
params: User-defined hyper-parameters, e.g. learning-rate.
Returns:
tf.estimator.EstimatorSpec: Estimator specification
"""
logger = logging.getLogger(__name__)
logger.info("Creating model in {} mode".format(mode))
logits = build_network(features, mode, params)
# Classification output of the neural network.
y_pred_cls = tf.argmax(logits, axis=1)
if mode == tf.estimator.ModeKeys.PREDICT:
# Softmax output of the neural network.
y_pred = tf.nn.softmax(logits=logits)
predictions = {
"class_ids": y_pred_cls,
"probabilities": y_pred,
"logits": logits,
}
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=labels
)
loss = tf.reduce_mean(cross_entropy, name="loss")
accuracy = tf.metrics.accuracy(labels=labels, predictions=y_pred_cls, name="acc_op")
metrics = {"accuracy": accuracy}
if mode == tf.estimator.ModeKeys.EVAL:
eval_hook_list = []
eval_tensors_log = {"acc": accuracy[1]}
eval_hook_list.append(
tf.train.LoggingTensorHook(tensors=eval_tensors_log, every_n_iter=100)
)
return tf.estimator.EstimatorSpec(
mode=mode,
eval_metric_ops=metrics,
loss=loss,
evaluation_hooks=eval_hook_list,
)
optimizer = _get_optimizer(params)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
train_hook_list = []
train_tensors_log = {"loss": loss, "acc": accuracy[1]}
train_hook_list.append(
tf.train.LoggingTensorHook(tensors=train_tensors_log, every_n_iter=100)
)
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, train_op=train_op, training_hooks=train_hook_list
)
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
return tf.estimator.RunConfig(
save_checkpoints_steps=save_checkpoints_steps,
save_checkpoints_secs=None,
session_config=config,
log_step_count_steps=100,
)
else:
return tf.estimator.RunConfig(
save_checkpoints_steps=save_checkpoints_steps,
save_checkpoints_secs=None,
log_step_count_steps=100,
)
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
logger = logging.getLogger(__name__)
if is_distributed:
exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
return [bcast_hook, exps_hook]
else:
exps_hook = ExamplesPerSecondHook(batch_size)
return [exps_hook]
def _is_master(is_distributed=defaults.DISTRIBUTED):
    if is_distributed:
        return hvd.rank() == 0
    else:
        return True
def _log_summary(total_images, batch_size, duration):
logger = logging.getLogger(__name__)
images_per_second = total_images / duration
logger.info("Data length: {}".format(total_images))
logger.info("Total duration: {:.3f}".format(duration))
logger.info("Total images/sec: {:.3f}".format(images_per_second))
logger.info(
"Batch size: (Per GPU {}: Total {})".format(
batch_size, hvd.size() * batch_size if defaults.DISTRIBUTED else batch_size
)
)
logger.info(
"Distributed: {}".format("True" if defaults.DISTRIBUTED else "False")
)
logger.info(
"Num GPUs: {:.3f}".format(hvd.size() if defaults.DISTRIBUTED else 1)
)
def main(
training_data_path=None,
validation_data_path=None,
save_filepath="logs",
epochs=defaults.EPOCHS,
batch_size=defaults._BATCHSIZE,
max_steps=None,
save_checkpoints_steps=None,
data_format="channels_last",
momentum=0.9,
data_type="tfrecords"
):
"""Run train and evaluation loop
Args:
training_data_path: Location of training data
validation_data_path: Location of validation data
save_filepath: Location where the checkpoint and events files are saved
epochs: Number of epochs to run the training for
batch_size: Number of images to run in a mini-batch
        max_steps: Maximum number of steps to run for training and validation; overrides the epochs parameter
save_checkpoints_steps: Number of steps between checkpoints
data_format: The axis order of the matrix, channels_last NHWC or channels_first NCHW
momentum: Momentum term for tf.train.MomentumOptimizer
data_type: The format that the data is in, valid values are 'images' and 'tfrecords'
"""
logger = logging.getLogger(__name__)
if defaults.DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Runnin Distributed")
logger.info("Num GPUs: {:.3f}".format(hvd.size()))
logger.info("Tensorflow version {}".format(tf.__version__))
if training_data_path is None:
input_function = get_synth_input_fn(
defaults.DEFAULT_IMAGE_SIZE,
defaults.DEFAULT_IMAGE_SIZE,
defaults.NUM_CHANNELS,
defaults.NUM_CLASSES,
)
else:
input_function = tfrecords.input_fn if "tfrecords" in data_type else images.input_fn
run_config = _get_runconfig(save_checkpoints_steps=save_checkpoints_steps)
    # Only the master process writes checkpoints and event files
    model_dir = save_filepath if _is_master() else "."
params = {
"learning_rate": defaults.LR,
"momentum": momentum,
"classes": defaults.NUM_CLASSES,
"data_format": data_format,
}
logger.info("Creating estimator with params: {}".format(params))
model = tf.estimator.Estimator(
model_fn=model_fn, params=params, model_dir=model_dir, config=run_config
)
hooks = _get_hooks(batch_size)
num_gpus = hvd.size() if defaults.DISTRIBUTED else 1
def train_input_fn():
return input_function(
True,
training_data_path,
batch_size,
num_epochs=epochs,
data_format=data_format,
num_parallel_batches=4,
distributed=defaults.DISTRIBUTED
)
with Timer(output=logger.info, prefix="Training") as t:
logger.info("Training...")
model.train(input_fn=train_input_fn, max_steps=max_steps, hooks=hooks)
if max_steps is not None:
total_images = max_steps * batch_size * num_gpus
else:
total_images = epochs * defaults.NUM_IMAGES["train"]
_log_summary(total_images, batch_size, t.elapsed)
if _is_master() and validation_data_path is not None:
def validation_input_fn():
return input_function(
False,
validation_data_path,
batch_size,
num_epochs=1,
data_format=data_format,
num_parallel_batches=4,
)
with Timer(output=logger.info, prefix="Testing"):
logger.info("Testing...")
model.evaluate(input_fn=validation_input_fn, steps=max_steps)
if __name__ == "__main__":
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
fire.Fire(main)
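# Hedged CLI sketch: python-fire exposes main's keyword arguments as flags.
# The paths below are placeholders, not paths shipped with this template:
#   python resnet_main.py --training_data_path /data/tfrecords/train \
#       --validation_data_path /data/tfrecords/validation \
#       --epochs 2 --data_type tfrecords --data_format channels_first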


@ -0,0 +1,149 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains utility and supporting functions for ResNet.
This module contains ResNet code which does not directly build layers. This
includes dataset management, hyperparameter and optimizer code, and argument
parsing. Code for defining the ResNet layers can be found in resnet_model.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
# pylint: disable=g-bad-import-order
import tensorflow as tf
import imagenet_preprocessing
################################################################################
# Functions for input processing.
################################################################################
def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
"""Serving input fn for raw jpeg images."""
def _preprocess_image(image_bytes):
"""Preprocess a single raw image."""
# Bounding box around the whole image.
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
height, width, num_channels = image_shape
image = imagenet_preprocessing.preprocess_image(
image_bytes, bbox, height, width, num_channels, is_training=False)
return image
image_bytes_list = tf.compat.v1.placeholder(
shape=[None], dtype=tf.string, name='input_tensor')
images = tf.map_fn(
_preprocess_image, image_bytes_list, back_prop=False, dtype=dtype)
return tf.estimator.export.TensorServingInputReceiver(
images, {'image_bytes': image_bytes_list})
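# Hedged usage sketch: exporting a trained Estimator with the serving fn above.
# `estimator` and `export_dir` are placeholders; TF 1.x names the export call
# Estimator.export_savedmodel.
def _example_export(estimator, export_dir="/tmp/resnet_export"):
    import functools
    serving_fn = functools.partial(
        image_bytes_serving_input_fn, (224, 224, 3), dtype=tf.float32)
    return estimator.export_savedmodel(export_dir, serving_fn)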
def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
"""Override flags and set env_vars for performance.
These settings exist to test the difference between using stock settings
and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
squeeze a few extra examples per second. These settings are defaulted to the
current platform of interest, which changes over time.
On systems with small numbers of cpu cores, e.g. under 8 logical cores,
setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
poorly.
Args:
    flags_obj: Current flags, which will be adjusted, possibly overriding
        what has been set by the user on the command line.
"""
cpu_count = multiprocessing.cpu_count()
tf.compat.v1.logging.info('Logical CPU cores: %s', cpu_count)
# Sets up thread pool for each GPU for op scheduling.
per_gpu_thread_count = 1
total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus
os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s',
os.environ['TF_GPU_THREAD_COUNT'])
tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s',
os.environ['TF_GPU_THREAD_MODE'])
# Reduces general thread pool by number of threads used for GPU pool.
main_thread_count = cpu_count - total_gpu_thread_count
flags_obj.inter_op_parallelism_threads = main_thread_count
    # Sets thread count for tf.data: logical cores minus the threads assigned
    # to the private GPU pool, minus 2 threads per GPU for event monitoring
    # and sending / receiving tensors.
num_monitoring_threads = 2 * flags_obj.num_gpus
flags_obj.datasets_num_private_threads = (cpu_count - total_gpu_thread_count
- num_monitoring_threads)
################################################################################
# Functions for running training/eval/validation loops for the model.
################################################################################
def learning_rate_with_decay(
batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
base_lr=0.1, warmup=False):
"""Get a learning rate that decays step-wise as training progresses.
Args:
batch_size: the number of examples processed in each training batch.
batch_denom: this value will be used to scale the base learning rate.
`0.1 * batch size` is divided by this number, such that when
batch_denom == batch_size, the initial learning rate will be 0.1.
num_images: total number of images that will be used for training.
boundary_epochs: list of ints representing the epochs at which we
decay the learning rate.
decay_rates: list of floats representing the decay rates to be used
for scaling the learning rate. It should have one more element
than `boundary_epochs`, and all elements should have the same type.
base_lr: Initial learning rate scaled based on batch_denom.
warmup: Run a 5 epoch warmup to the initial lr.
Returns:
    Returns a function that takes a single argument - the number of batches
    trained so far (global_step) - and returns the learning rate to be used
    for training the next batch.
"""
initial_learning_rate = base_lr * batch_size / batch_denom
batches_per_epoch = num_images / batch_size
# Reduce the learning rate at certain epochs.
# CIFAR-10: divide by 10 at epoch 100, 150, and 200
# ImageNet: divide by 10 at epoch 30, 60, 80, and 90
boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
vals = [initial_learning_rate * decay for decay in decay_rates]
def learning_rate_fn(global_step):
"""Builds scaled learning rate function with 5 epoch warm up."""
lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
if warmup:
warmup_steps = int(batches_per_epoch * 5)
warmup_lr = (
initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
warmup_steps, tf.float32))
return tf.cond(pred=global_step < warmup_steps,
true_fn=lambda: warmup_lr,
false_fn=lambda: lr)
return lr
return learning_rate_fn
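# Hedged example: a typical ImageNet-style schedule (values are illustrative
# assumptions, not necessarily what this template uses). With
# batch_size == batch_denom == 256 the initial rate equals base_lr (0.1),
# decaying 10x at epochs 30, 60, 80 and 90; 1281167 is the standard
# ImageNet-1k training-set size.
def _example_imagenet_schedule():
    return learning_rate_with_decay(
        batch_size=256, batch_denom=256, num_images=1281167,
        boundary_epochs=[30, 60, 80, 90],
        decay_rates=[1, 0.1, 0.01, 1e-3, 1e-4], warmup=True)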


@ -0,0 +1,75 @@
# Taken from https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10_estimator/cifar10_utils.py
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
def str_to_bool(in_str):
    return "t" in in_str.lower()
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
"""Hook to print out examples per second.
Total time is tracked and then divided by the total number of steps
to get the average step time and then batch_size is used to determine
the running average of examples per second. The examples per second for the
most recent interval is also logged.
"""
def __init__(self, batch_size, every_n_steps=100, every_n_secs=None):
"""Initializer for ExamplesPerSecondHook.
Args:
batch_size: Total batch size used to calculate examples/second from
global time.
every_n_steps: Log stats every n steps.
every_n_secs: Log stats every n seconds.
"""
if (every_n_steps is None) == (every_n_secs is None):
            raise ValueError(
                "exactly one of every_n_steps and every_n_secs should be provided."
            )
self._timer = basic_session_run_hooks.SecondOrStepTimer(
every_steps=every_n_steps, every_secs=every_n_secs
)
self._step_train_time = 0
self._total_steps = 0
self._batch_size = batch_size
def begin(self):
self._global_step_tensor = training_util.get_global_step()
if self._global_step_tensor is None:
raise RuntimeError("Global step should be created to use StepCounterHook.")
def before_run(self, run_context): # pylint: disable=unused-argument
return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
def after_run(self, run_context, run_values):
_ = run_context
global_step = run_values.results
if self._timer.should_trigger_for_step(global_step):
elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
global_step
)
if elapsed_time is not None:
steps_per_sec = elapsed_steps / elapsed_time
self._step_train_time += elapsed_time
self._total_steps += elapsed_steps
average_examples_per_sec = self._batch_size * (
self._total_steps / self._step_train_time
)
current_examples_per_sec = steps_per_sec * self._batch_size
# Average examples/sec followed by current examples/sec
logging.info(
"%s: %g (%g), step = %g",
"Average examples/sec",
average_examples_per_sec,
current_examples_per_sec,
self._total_steps,
)
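# Hedged usage sketch: the hook plugs into any tf.estimator training call.
# `estimator` and `input_fn` below are placeholders for illustration.
def _example_attach_hook(estimator, input_fn, batch_size=64):
    hook = ExamplesPerSecondHook(batch_size, every_n_steps=100)
    estimator.train(input_fn=input_fn, hooks=[hook])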


@ -0,0 +1,176 @@
"""Module for running TensorFlow training on Imagenet data
"""
from invoke import task, Collection
import os
from config import load_config
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
env_values = load_config()
@task
def submit_synthetic(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
"""Submit TensorFlow training job using synthetic imagenet data to remote cluster
Args:
        node_count (int, optional): The number of nodes to use in the cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("synthetic_images_remote")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{"--epochs": epochs},
node_count=node_count,
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_synthetic_local(c, epochs=1):
"""Submit TensorFlow training job using synthetic imagenet data for local execution
Args:
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("synthetic_images_local")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{"--epochs": epochs},
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_images(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
"""Submit TensorFlow training job using real imagenet data to remote cluster
Args:
        node_count (int, optional): The number of nodes to use in the cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("real_images_remote")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{
"--training_data_path": "{datastore}/train",
"--validation_data_path": "{datastore}/validation",
"--epochs": epochs,
"--data_type": "images",
"--data-format": "channels_first",
},
node_count=node_count,
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_images_local(c, epochs=1):
"""Submit TensorFlow training job using real imagenet data for local execution
Args:
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("real_images_local")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{
"--training_data_path": "/data/train",
"--validation_data_path": "/data/validation",
"--epochs": epochs,
"--data_type": "images",
"--data-format": "channels_first",
},
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
docker_args=["-v", f"{env_values['data']}:/data"],
wait_for_completion=True,
)
print(run)
@task
def submit_tfrecords(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
"""Submit TensorFlow training job using real imagenet data as tfrecords to remote cluster
Args:
        node_count (int, optional): The number of nodes to use in the cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("real_tfrecords_remote")
run = exp.submit(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{
"--training_data_path": "{datastore}/tfrecords/train",
"--validation_data_path": "{datastore}/tfrecords/validation",
"--epochs": epochs,
"--data_type": "tfrecords",
"--data-format": "channels_first",
},
node_count=node_count,
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
wait_for_completion=True,
)
print(run)
@task
def submit_tfrecords_local(c, epochs=1):
"""Submit TensorFlow training job using real imagenet data as tfrecords for local execution
Args:
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
"""
from aml_compute import TFExperimentCLI
exp = TFExperimentCLI("real_tfrecords_local")
run = exp.submit_local(
os.path.join(_BASE_PATH, "src"),
"resnet_main.py",
{
"--training_data_path": "/data/tfrecords/train",
"--validation_data_path": "/data/tfrecords/validation",
"--epochs": epochs,
"--data_type": "tfrecords",
"--data-format": "channels_first",
},
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
docker_args=["-v", f"{env_values['data']}:/data"],
wait_for_completion=True,
)
print(run)
remote_collection = Collection("remote")
remote_collection.add_task(submit_images, "images")
remote_collection.add_task(submit_tfrecords, "tfrecords")
remote_collection.add_task(submit_synthetic, "synthetic")
local_collection = Collection("local")
local_collection.add_task(submit_images_local, "images")
local_collection.add_task(submit_tfrecords_local, "tfrecords")
local_collection.add_task(submit_synthetic_local, "synthetic")
submit_collection = Collection("submit", local_collection, remote_collection)
namespace = Collection("tf_imagenet", submit_collection)


@ -0,0 +1,15 @@
CLUSTER_NAME={{cookiecutter.cluster_name}}
CLUSTER_VM_SIZE={{cookiecutter.vm_size}}
CLUSTER_MIN_NODES={{cookiecutter.minimum_number_nodes}}
CLUSTER_MAX_NODES={{cookiecutter.maximum_number_nodes}}
WORKSPACE={{cookiecutter.workspace}}
RESOURCE_GROUP={{cookiecutter.resource_group}}
REGION={{cookiecutter.region}}
LOG_CONFIG=/workspace/control/src/logging.conf
SUBSCRIPTION_ID={{cookiecutter.subscription_id}}
DATASTORE_NAME={{cookiecutter.datastore_name}}
CONTAINER_NAME={{cookiecutter.container_name}}
ACCOUNT_NAME={{cookiecutter.account_name}}
ACCOUNT_KEY={{cookiecutter.account_key}}
DATA={{cookiecutter.data}}
IMAGE_NAME={{cookiecutter.container_registry}}/{{cookiecutter.image_name}}


@ -0,0 +1,4 @@
azure
azure-cli-core
azureml-sdk[notebooks,contrib,tensorboard]
git+https://github.com/msalvaris/amltoolz.git


@ -0,0 +1,32 @@
# Invoke tab-completion script to be sourced with Bash shell.
# Known to work on Bash 3.x, untested on 4.x.
_complete_invoke() {
local candidates
# COMP_WORDS contains the entire command string up til now (including
# program name).
# We hand it to Invoke so it can figure out the current context: spit back
# core options, task names, the current task's options, or some combo.
candidates=`invoke --complete -- ${COMP_WORDS[*]}`
# `compgen -W` takes list of valid options & a partial word & spits back
# possible matches. Necessary for any partial word completions (vs
# completions performed when no partial words are present).
#
# $2 is the current word or token being tabbed on, either empty string or a
# partial word, and thus wants to be compgen'd to arrive at some subset of
# our candidate list which actually matches.
#
# COMPREPLY is the list of valid completions handed back to `complete`.
COMPREPLY=( $(compgen -W "${candidates}" -- $2) )
}
# Tell shell builtin to use the above for completing our invocations.
# * -F: use given function name to generate completions.
# * -o default: when function generates no results, use filenames.
# * positional args: program names to complete for.
complete -F _complete_invoke -o default invoke inv
# vim: set ft=sh :


@ -0,0 +1,96 @@
FROM ubuntu:16.04
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
nano \
wget \
unzip \
ca-certificates \
jq \
locales \
apt-transport-https \
software-properties-common \
sudo \
tmux
# Install Docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
apt-key fingerprint 0EBFCD88 && \
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable" &&\
apt-get update && apt-get install -y --no-install-recommends docker-ce
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
COPY environment.yml .
COPY azure_requirements.txt .
ENV ENV_NAME=py36
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
/opt/conda/bin/conda clean -ya && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate $ENV_NAME" >> ~/.bashrc
ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH
RUN ["/bin/bash", "-c", "pip install -r azure_requirements.txt --ignore-installed PyYAML"]
# Install Azure CLI
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
tee /etc/apt/sources.list.d/azure-cli.list && \
apt-key --keyring /etc/apt/trusted.gpg.d/Microsoft.gpg adv \
--keyserver packages.microsoft.com \
--recv-keys BC528686B50D79E339D3721CEB3E94ADBE1229CF && \
apt-get update && \
apt-get install -y --no-install-recommends \
azure-cli
RUN az extension add -n azure-cli-ml # Install azure ml extension
# Install AzCopy
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\
cp ./azure.list /etc/apt/sources.list.d/ &&\
apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\
apt-get update &&\
apt-get install -y --no-install-recommends azcopy
COPY jupyter_notebook_config.py /root/.jupyter/
SHELL ["/bin/bash", "-c"]
RUN jupyter nbextension install --py --user azureml.widgets && \
jupyter nbextension enable --py --user azureml.widgets
# Adding nvidia-docker alias
RUN echo -e '#!/bin/bash\ndocker "$@"' > /usr/bin/nvidia-docker && \
chmod +x /usr/bin/nvidia-docker
ENV PYTHONPATH /workspace/scripts:/workspace/control/src:$PYTHONPATH
# template {% if cookiecutter.type == "template" or cookiecutter.type == "all"%}
ENV PYTHONPATH /workspace/TensorFlow_experiment:$PYTHONPATH
# ------ {% endif %}
# benchmark {% if cookiecutter.type == "benchmark" or cookiecutter.type == "all"%}
ENV PYTHONPATH /workspace/TensorFlow_benchmark:$PYTHONPATH
# ------ {% endif %}
# imagenet {% if cookiecutter.type == "imagenet" or cookiecutter.type == "all"%}
ENV PYTHONPATH /workspace/TensorFlow_imagenet:$PYTHONPATH
# ------- {% endif %}
# Completion script
COPY bash.completion /etc/bash_completion.d/
RUN echo "source /etc/bash_completion.d/bash.completion" >> /root/.bashrc
# Tmux
COPY tmux.conf /root/.tmux.conf
WORKDIR /workspace
CMD /bin/bash


@ -0,0 +1,29 @@
name: py36
channels:
- conda-forge
dependencies:
- python=3.6
- jupyter
- ipykernel
- matplotlib
- seaborn
- numpy
- pandas
- selenium
- phantomjs
- pillow
- bokeh
- ipython
- ipdb
- pip:
- docker
- fire
- toolz
- tabulate==0.8.2
- Jinja2
- gitpython
- tensorflow # Installing for Tensorboard
- tensorboard
- tqdm
- python-dotenv[cli]==0.10.1
- invoke


@ -0,0 +1,6 @@
# Configuration file for jupyter-notebook.
c.NotebookApp.ip = "0.0.0.0"
c.NotebookApp.port = 9999
c.NotebookApp.open_browser = False
c.NotebookApp.allow_root = True


@ -0,0 +1,4 @@
# remap prefix from 'C-b' to 'C-a'
unbind C-b
set-option -g prefix C-a
bind-key C-a send-prefix


@ -0,0 +1,510 @@
import logging
import logging.config
import os
import azureml.core
import fire
from amltoolz import Workspace
from azureml import core
from azureml.core import Datastore
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import (
CondaDependencies,
TENSORFLOW_DEFAULT_VERSION,
)
from azureml.core.runconfig import EnvironmentDefinition
from azureml.tensorboard import Tensorboard
from azureml.train.dnn import TensorFlow
from config import load_config
from toolz import curry, pipe
from pprint import pformat
from time import sleep
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
config_dict = load_config()
_DEFAULT_AML_PATH = config_dict.get("DEFAULT_AML_PATH", "aml_config/azml_config.json")
_CLUSTER_NAME = config_dict.get("CLUSTER_NAME", "gpucluster24rv3")
_CLUSTER_VM_SIZE = config_dict.get("CLUSTER_VM_SIZE", "Standard_NC24rs_v3")
_CLUSTER_MIN_NODES = int(config_dict.get("CLUSTER_MIN_NODES", 0))
_CLUSTER_MAX_NODES = int(config_dict.get("CLUSTER_MAX_NODES", 2))
_WORKSPACE = config_dict.get("WORKSPACE", "workspace")
_RESOURCE_GROUP = config_dict.get("RESOURCE_GROUP", "amlccrg")
_SUBSCRIPTION_ID = config_dict.get("SUBSCRIPTION_ID", None)
_REGION = config_dict.get("REGION", "eastus")
_DEPENDENCIES_FILE = config_dict.get(
"DEPENDENCIES_FILE", "../../experiment/src/environment_gpu.yml"
)
_DATASTORE_NAME = config_dict.get("DATASTORE_NAME", "datastore")
_CONTAINER_NAME = config_dict.get("CONTAINER_NAME", "container")
_ACCOUNT_NAME = config_dict.get("ACCOUNT_NAME", None)
_ACCOUNT_KEY = config_dict.get("ACCOUNT_KEY", None)
def _create_cluster(
workspace,
cluster_name=_CLUSTER_NAME,
vm_size=_CLUSTER_VM_SIZE,
min_nodes=_CLUSTER_MIN_NODES,
max_nodes=_CLUSTER_MAX_NODES,
):
logger = logging.getLogger(__name__)
try:
compute_target = ComputeTarget(workspace=workspace, name=cluster_name)
logger.info("Found existing compute target.")
except ComputeTargetException:
logger.info("Creating a new compute target...")
compute_config = AmlCompute.provisioning_configuration(
vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes
)
# create the cluster
compute_target = ComputeTarget.create(workspace, cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True)
# use get_status() to get a detailed status for the current AmlCompute.
logger.debug(compute_target.get_status().serialize())
return compute_target
def _prepare_environment_definition(dependencies_file, distributed):
logger = logging.getLogger(__name__)
env_def = EnvironmentDefinition()
conda_dep = CondaDependencies(conda_dependencies_file_path=dependencies_file)
env_def.python.user_managed_dependencies = False
env_def.python.conda_dependencies = conda_dep
env_def.docker.enabled = True
env_def.docker.gpu_support = True
env_def.docker.base_image = azureml.core.runconfig.DEFAULT_GPU_IMAGE
env_def.docker.shm_size = "8g"
env_def.environment_variables["NCCL_SOCKET_IFNAME"] = "eth0"
env_def.environment_variables["NCCL_IB_DISABLE"] = 1
if distributed:
env_def.environment_variables["DISTRIBUTED"] = "True"
else:
env_def.environment_variables["DISTRIBUTED"] = "False"
logger.info("Adding runtime argument")
# Adds runtime argument since we aliased nvidia-docker to docker in order to be able to run them as
# sibling containers. Without this we will get CUDA library errors
env_def.docker.arguments.extend(["--runtime", "nvidia"])
return env_def
@curry
def _create_estimator(
estimator_class,
dependencies_file,
project_folder,
entry_script,
compute_target,
script_params,
node_count=_CLUSTER_MAX_NODES,
process_count_per_node=4,
docker_args=(),
):
logger = logging.getLogger(__name__)
logger.debug(f"Loading dependencies from {dependencies_file}")
# If the compute target is "local" then don't run distributed
distributed = not (isinstance(compute_target, str) and compute_target == "local")
env_def = _prepare_environment_definition(dependencies_file, distributed)
env_def.docker.arguments.extend(list(docker_args))
estimator = estimator_class(
project_folder,
entry_script=entry_script,
compute_target=compute_target,
script_params=script_params,
node_count=node_count,
process_count_per_node=process_count_per_node,
distributed_backend="mpi" if distributed else None,
environment_definition=env_def,
)
logger.debug(estimator.conda_dependencies.__dict__)
return estimator
def _create_datastore(
aml_workspace,
datastore_name,
container_name,
account_name,
account_key,
create_if_not_exists=True,
):
ds = Datastore.register_azure_blob_container(
workspace=aml_workspace,
datastore_name=datastore_name,
container_name=container_name,
account_name=account_name,
account_key=account_key,
create_if_not_exists=create_if_not_exists,
)
return ds
class ExperimentCLI(object):
def __init__(
self,
experiment_name,
workspace_name=_WORKSPACE,
resource_group=_RESOURCE_GROUP,
subscription_id=_SUBSCRIPTION_ID,
workspace_region=_REGION,
config_path=_DEFAULT_AML_PATH,
):
self._logger = logging.getLogger(__name__)
self._logger.info("SDK version:" + str(azureml.core.VERSION))
self._ws = workspace_for_user(
workspace_name=workspace_name,
resource_group=resource_group,
subscription_id=subscription_id,
workspace_region=workspace_region,
config_path=config_path,
).aml_workspace
self._experiment = core.Experiment(self._ws, name=experiment_name)
self._cluster = None
self._datastore = None
def create_cluster(
self,
name=_CLUSTER_NAME,
vm_size=_CLUSTER_VM_SIZE,
min_nodes=_CLUSTER_MIN_NODES,
max_nodes=_CLUSTER_MAX_NODES,
):
"""Creates AzureML cluster
Args:
name (string, optional): The name you wish to assign the cluster.
Defaults to _CLUSTER_NAME.
vm_size (string, optional): The type of sku to use for your vm.
Defaults to _CLUSTER_VM_SIZE.
min_nodes (int, optional): Minimum number of nodes in cluster.
Use 0 if you don't want to incur costs when it isn't being used.
Defaults to _CLUSTER_MIN_NODES.
max_nodes (int, optional): Maximum number of nodes in cluster.
Defaults to _CLUSTER_MAX_NODES.
Returns:
ExperimentCLI: Experiment object
"""
self._cluster = _create_cluster(
self._ws,
cluster_name=name,
vm_size=vm_size,
min_nodes=min_nodes,
max_nodes=max_nodes,
)
return self
def create_datastore(
self,
datastore_name=_DATASTORE_NAME,
container_name=_CONTAINER_NAME,
account_name=_ACCOUNT_NAME,
account_key=_ACCOUNT_KEY,
):
"""Creates datastore
Args:
datastore_name (string, optional): Name you wish to assign to your datastore. Defaults to _DATASTORE_NAME.
container_name (string, optional): Name of your container. Defaults to _CONTAINER_NAME.
account_name (string, optional): Storage account name. Defaults to _ACCOUNT_NAME.
account_key (string, optional): The storage account key. Defaults to _ACCOUNT_KEY.
Returns:
ExperimentCLI: Experiment object
"""
assert account_name is not None, "Account name for Datastore not set"
assert account_key is not None, "Account key for Datastore not set"
self._datastore = _create_datastore(
self._ws,
datastore_name=datastore_name,
container_name=container_name,
account_name=account_name,
account_key=account_key,
)
return self
@property
def cluster(self):
if self._cluster is None:
self.create_cluster()
return self._cluster
@property
def datastore(self):
if self._datastore is None:
self.create_datastore()
return self._datastore
def _has_key(input_dict, key):
    for v in input_dict.values():
        if key in v:
            return True
    return False
def _fill_param_with(input_dict, parameters_dict):
return {key: value.format(**parameters_dict) for key, value in input_dict.items()}
class TFExperimentCLI(ExperimentCLI):
"""Creates Experiment object that can be used to create clusters and submit experiments
Returns:
TFExperimentCLI: Experiment object
"""
def submit_local(
self,
project_folder,
entry_script,
script_params,
dependencies_file=_DEPENDENCIES_FILE,
wait_for_completion=True,
docker_args=(),
):
"""Submit experiment for local execution
Args:
            project_folder (string): Path of your source files for the experiment
entry_script (string): The filename of your script to run. Must be found in your project_folder
script_params (dict): Dictionary of script parameters
dependencies_file (string, optional): The location of your environment.yml to use to create the
environment your training script requires.
Defaults to _DEPENDENCIES_FILE.
wait_for_completion (bool, optional): Whether to block until experiment is done. Defaults to True.
docker_args (tuple, optional): Docker arguments to pass. Defaults to ().
"""
self._logger.info("Running in local mode")
self._submit(
dependencies_file,
project_folder,
entry_script,
"local",
script_params,
1,
1,
docker_args,
wait_for_completion,
)
def submit(
self,
project_folder,
entry_script,
script_params,
dependencies_file=_DEPENDENCIES_FILE,
node_count=_CLUSTER_MAX_NODES,
process_count_per_node=4,
wait_for_completion=True,
docker_args=(),
):
"""Submit experiment for remote execution on AzureML clusters
Args:
            project_folder (string): Path of your source files for the experiment
entry_script (string): The filename of your script to run. Must be found in your project_folder
script_params (dict): Dictionary of script parameters
dependencies_file (string, optional): The location of your environment.yml to use to
create the environment your training script requires.
Defaults to _DEPENDENCIES_FILE.
            node_count (int, optional): Number of nodes to run the job on. Defaults to _CLUSTER_MAX_NODES.
            process_count_per_node (int, optional): Number of processes to run on each node.
                                                    Usually should be the same as the number of GPUs per node.
                                                    Defaults to 4.
wait_for_completion (bool, optional): Whether to block until experiment is done. Defaults to True.
docker_args (tuple, optional): Docker arguments to pass. Defaults to ().
Returns:
azureml.core.Run: AzureML Run object
"""
self._logger.debug(script_params)
transformed_params = self._complete_datastore(script_params)
self._logger.debug("Transformed script params")
self._logger.debug(transformed_params)
return self._submit(
dependencies_file,
project_folder,
entry_script,
self.cluster,
transformed_params,
node_count,
process_count_per_node,
docker_args,
wait_for_completion,
)
def _submit(
self,
dependencies_file,
project_folder,
entry_script,
cluster,
script_params,
node_count,
process_count_per_node,
docker_args,
wait_for_completion,
):
self._logger.debug(script_params)
estimator = _create_estimator(
TensorFlow,
dependencies_file,
project_folder,
entry_script,
cluster,
script_params,
node_count=node_count,
process_count_per_node=process_count_per_node,
docker_args=docker_args,
)
# TEMPORARY HACK: Bugs with AML necessitate the code below, once fixed remove
estimator.conda_dependencies.remove_pip_package("horovod==0.15.2")
estimator.conda_dependencies.remove_pip_package(
"tensorflow==" + TENSORFLOW_DEFAULT_VERSION
)
estimator.conda_dependencies.add_pip_package("tensorflow-gpu==1.12.0")
estimator.conda_dependencies.add_pip_package("horovod==0.15.2")
self._logger.debug(estimator.conda_dependencies.__dict__)
run = self._experiment.submit(estimator)
if wait_for_completion:
run.wait_for_completion(show_output=True)
return run
def _complete_datastore(self, script_params):
def _replace(value):
if isinstance(value, str) and "{datastore}" in value:
data_path = value.replace("{datastore}/", "")
return self.datastore.path(data_path).as_mount()
else:
return value
return {key: _replace(value) for key, value in script_params.items()}
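# Hedged example of the {datastore} convention handled by _complete_datastore:
# a script_params entry such as {"--training_data_path": "{datastore}/train"}
# is rewritten to mount the registered blob container, roughly equivalent to
# {"--training_data_path": self.datastore.path("train").as_mount()}.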
def workspace_for_user(
workspace_name=_WORKSPACE,
resource_group=_RESOURCE_GROUP,
subscription_id=_SUBSCRIPTION_ID,
workspace_region=_REGION,
config_path=_DEFAULT_AML_PATH,
):
""" Creates or gets amltoolz.Workspace instance which represents an AML Workspace.
Args:
workspace_name (str): Name of workspace
resource_group (str): Name of Azure Resource group
subscription_id (str): Azure Subscription ID
workspace_region (str): Azure region to create resources in
config_path (str): Path to save AML config to
Returns:
amltoolz.Workspace: Either a new workspace created or gets one as identified by name, region and resource group
"""
return Workspace(
workspace_name=workspace_name,
resource_group=resource_group,
subscription_id=subscription_id,
workspace_region=workspace_region,
config_path=config_path,
)
def tensorboard(runs):
""" Returns Tensorboard object instantiated with one or more runs
You can start Tensorboard session by calling start on Tensorboard object
To stop simply call stop on same object
Args:
runs (azureml.core.script_run.ScriptRun or list):
Returns:
azureml.tensorboard.Tensorboard
Examples:
>>> tb = tensorboard(runs)
>>> tb.start() # Start Tensorboard
>>> tb.stop() # Stop Tensorboard
"""
logger = logging.getLogger(__name__)
logger.info(f"Starting tensorboard {pformat(runs)}")
if isinstance(runs, list):
return Tensorboard(runs)
else:
return Tensorboard([runs])
def _start_and_wait(tb):
logger = logging.getLogger(__name__)
try:
tb.start()
while True:
sleep(10)
except KeyboardInterrupt:
logger.info("Exiting Tensorboard")
finally:
tb.stop()
def _select_runs(experiment, runs=None, status=("Running",)):
logger = logging.getLogger(__name__)
try:
if runs:
selected_runs = [experiment.runs[run].aml_run for run in runs]
else:
selected_runs = [
run.aml_run for run in experiment.runs if run.aml_run.status in status
]
if len(selected_runs) == 0:
logger.warn("No runs found")
return selected_runs
except KeyError as e:
logger.warn(f"Did not find run!")
raise e
def tensorboard_cli(experiment, runs=None, status=("Running",)):
logger = logging.getLogger(__name__)
ws = workspace_for_user()
ws.experiments.refresh()
try:
exp_obj = ws.experiments[experiment]
exp_obj.runs.refresh()
runs = _select_runs(exp_obj, runs=runs, status=status)
logger.debug(pformat(runs))
pipe(runs, tensorboard, _start_and_wait)
except KeyError:
logger.warn(f"Did not find experiment {experiment}!")
logger.warn("Your experiments are:")
for exp in ws.experiments:
logger.warn(f"{exp}")
if __name__ == "__main__":
""" Access workspace and run TensorFlow experiments
"""
fire.Fire(
{
"workspace": workspace_for_user,
"tf-experiment": TFExperimentCLI,
"tensorboard": tensorboard_cli,
}
)
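# Hedged CLI sketches (python-fire maps the dict above onto subcommands; the
# experiment and script names are placeholders):
#   python aml_compute.py tensorboard my_experiment
#   python aml_compute.py tf-experiment my_experiment submit ./src resnet_main.py \
#       '{"--epochs": 2}'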
