Adds initial version of template (#33)
This commit is contained in:
Parent: 3a99ab8e35
Commit: b3c31b6e22
@@ -8,7 +8,6 @@ __pycache__/

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
@@ -24,6 +23,7 @@ wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
@@ -45,6 +45,7 @@ nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
@@ -53,6 +54,7 @@ coverage.xml
# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
@@ -79,13 +81,14 @@ celerybeat-schedule
# SageMath parsed files
*.sage.py

# dotenv
# Environments
.env

# virtualenv
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
@@ -100,8 +103,9 @@ ENV/
# mypy
.mypy_cache/

# Pycharm
.idea/
*/TensorFlow_benchmark/src/*

#################
job.json
*/.vscode/*
.vscode/settings.json

.dev_env
@@ -1,73 +0,0 @@
# Ubuntu 16.04, CUDA 9.0
FROM nvidia/cuda:9.0-runtime-ubuntu16.04

ARG CNTK_VERSION="2.5.1"
LABEL maintainer "MICROSOFT CORPORATION" \
      com.microsoft.cntk.version="$CNTK_VERSION"

ENV CNTK_VERSION="$CNTK_VERSION"

# Install CNTK as the default backend for Keras
ENV KERAS_BACKEND=cntk

RUN apt-get update && apt-get install -y --no-install-recommends \
    # General
    ca-certificates \
    wget \
    sudo \
    build-essential \
    openssh-client \
    openssh-server \
    && \
    # Clean-up
    apt-get -y autoremove \
    && \
    rm -rf /var/lib/apt/lists/*

# Get CNTK Binary Distribution
RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
    ([ "$CNTK_VERSION" != "2.4" ] || VERIFY_SHA256="true") && \
    CNTK_SHA256="8eebff81ef4111b2be5804303f1254cd20de5911a7678c8e64689e5c288dde40" && \
    wget -q https://cntk.ai/BinaryDrop/CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
    ([ "$VERIFY_SHA256" != "true" ] || (echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict -)) && \
    tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
    rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
    /bin/bash /cntk/Scripts/install/linux/install-cntk.sh --py-version 35 --docker

WORKDIR /root
ENV PATH /cntk/cntk/bin:/root/anaconda3/envs/cntk-py35/bin:$PATH
ENV LD_LIBRARY_PATH /cntk/cntk/lib:/cntk/cntk/dependencies/lib:$LD_LIBRARY_PATH

# Install Open MPI
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
    tar zxf openmpi-3.0.0.tar.gz && \
    cd openmpi-3.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun

# Configure OpenMPI with good defaults:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
    echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
    echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
    echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
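The download step above only verifies the archive checksum when CNTK_VERSION is 2.4, since that is the release the pinned SHA-256 belongs to. A minimal Python sketch of the same streamed-checksum pattern, using only the standard library and hypothetical archive name and digest:

import hashlib

# Hypothetical values; the Dockerfile pins a digest for the 2.4 release only.
archive = "CNTK-2-4-Linux-64bit-GPU.tar.gz"
expected_sha256 = "<pinned digest for this release>"

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file in 1 MiB chunks so large archives never sit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

if sha256_of(archive) != expected_sha256:
    raise RuntimeError("checksum mismatch for {}".format(archive))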
@@ -1,9 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
image-open:=hoaphumanoid/cntk:distributed-openmpi3
open-path:=$(PWD)/Docker/cntk
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py
include ../include/build.mk
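Note that FAKE is passed through with literal quotes ('False'), which is why the training scripts below parse it with a loose substring test rather than strict string equality. A small sketch of that parsing, assuming only the standard library:

import os

def _str_to_bool(in_str):
    # Same rule as the training scripts: any 't' counts as True, which
    # tolerates quoted values such as 'True' or "false" coming from make.
    return 't' in in_str.lower()

fake = _str_to_bool(os.getenv('FAKE', 'False'))
fake_data_length = int(os.getenv('FAKE_DATA_LENGTH', 1281167))
print(fake, fake_data_length)  # False 1281167 with the defaults above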
@@ -1,309 +0,0 @@
"""
Trains ResNet50 using CNTK.

It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_OUTPUT_MODEL

This code is based on this example:
https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py
"""

from __future__ import print_function
import os
import numpy as np
import cntk as C
from cntk import input, cross_entropy_with_softmax, classification_error, Trainer, cntk_py
from cntk import data_parallel_distributed_learner, Communicator
from cntk.learners import momentum_sgd, learning_rate_schedule, momentum_schedule, UnitType
from cntk.io import UserMinibatchSource, StreamInformation, MinibatchData
from cntk.train.training_session import *
from cntk.debugging import *
from cntk.logging import *
import cntk.io.transforms as xforms
from resnet_models import create_imagenet_model_bottleneck
from sklearn.preprocessing import OneHotEncoder

import logging


logger = logging.getLogger(__name__)


def _str_to_bool(in_str):
    if 't' in in_str.lower():
        return True
    else:
        return False


# model dimensions
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 32
_MOMENTUM = 0.9
_NUMCLASSES = 1000
_MODELNAME = 'ResNet_ImageNet.model'
_NUMQUANTIZEDBITS = 32
_WD = 0.0001


_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
# How much fake data to simulate, default to size of imagenet dataset
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167))
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))


def _get_progress_printer():
    pp = ProgressPrinter(
        freq=100,
        tag='Training',
        log_to_file=None,
        rank=Communicator.rank(),
        gen_heartbeat=False,
        num_epochs=_EPOCHS)
    return pp


def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
    if not os.path.exists(map_file) or not os.path.exists(mean_file):
        raise RuntimeError("File '%s' or '%s' does not exist." %
                           (map_file, mean_file))

    # transformation pipeline for the features has jitter/crop only when training
    transforms = []
    if train:
        transforms += [
            xforms.crop(crop_type='randomarea',
                        area_ratio=(0.08, 1.0),
                        aspect_ratio=(0.75, 1.3333),
                        jitter_type='uniratio')
        ]
    else:
        transforms += [
            # test has no jitter
            C.io.transforms.crop(crop_type='center', side_ratio=0.875)
        ]

    transforms += [
        xforms.scale(width=_WIDTH, height=_HEIGHT,
                     channels=_CHANNELS, interpolations='cubic'),
        xforms.mean(mean_file)
    ]

    # deserializer
    return C.io.MinibatchSource(
        C.io.ImageDeserializer(map_file, C.io.StreamDefs(
            # 1st col in mapfile referred to as 'image'
            features=C.io.StreamDef(field='image', transforms=transforms),
            labels=C.io.StreamDef(field='label', shape=_NUMCLASSES))),  # and second as 'label'
        randomize=train,
        max_samples=total_number_of_samples,
        multithreaded_deserializer=True)


class FakeDataSource(UserMinibatchSource):
    """Fake data source
    https://cntk.ai/pythondocs/Manual_How_to_create_user_minibatch_sources.html
    """

    def __init__(self, total_n_images, dim, channels, n_classes, seed=42):
        self.dim = dim
        self.total_n_images = total_n_images
        self.channels = channels
        self.n_classes = n_classes
        self.seed = seed
        self.fsi = StreamInformation(name='features', stream_id=0, storage_format='dense',
                                     dtype=np.float32, shape=(self.channels, self.dim[0], self.dim[0],))
        self.lsi = StreamInformation(
            name='labels', stream_id=1, storage_format='dense', dtype=np.float32, shape=(self.n_classes,))
        self.sample_count = 0
        self.next_seq_idx = 0
        super(FakeDataSource, self).__init__()

    def stream_infos(self):
        """
        Override the stream_infos method of the base UserMinibatchSource class
        to provide stream meta information.
        """
        return [self.fsi, self.lsi]

    def next_minibatch(self, num_samples, number_of_workers=1, worker_rank=0, device=None):
        """
        Override the next_minibatch method of the base UserMinibatchSource class
        to provide minibatch data.
        """
        np.random.seed(self.seed)
        x = np.random.rand(num_samples, self.channels,
                           self.dim[0], self.dim[1]).astype(np.float32)
        y = np.random.choice(self.n_classes, num_samples)
        y = np.expand_dims(y, axis=-1)
        enc = OneHotEncoder(n_values=self.n_classes, dtype=np.float32,
                            categorical_features='all')
        fit = enc.fit(y)
        y = fit.transform(y).toarray()
        if self.sample_count + num_samples <= self.total_n_images:
            self.sample_count += num_samples
            self.next_seq_idx += num_samples
            feature_data = C.Value(batch=x, device=device)
            label_data = C.Value(batch=y, device=device)
            res = {
                self.fsi: MinibatchData(feature_data, num_samples, num_samples, False),
                self.lsi: MinibatchData(label_data, num_samples, num_samples, False)
            }
        else:
            res = {}

        return res

    def get_checkpoint_state(self):
        return {'next_seq_idx': self.next_seq_idx}

    def restore_from_checkpoint(self, state):
        self.next_seq_idx = state['next_seq_idx']


def model_fn():
    # Input variables denoting the features and label data
    graph_input = C.input_variable((_CHANNELS, _HEIGHT, _WIDTH))
    graph_label = C.input_variable((_NUMCLASSES))

    with C.default_options(dtype=np.float32):
        stride1x1 = (1, 1)
        stride3x3 = (2, 2)

        # create model, and configure learning parameters for ResNet50
        z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2],
                                             _NUMCLASSES, stride1x1, stride3x3)

        # loss and metric
        ce = cross_entropy_with_softmax(z, graph_label)
        errs = classification_error(z, graph_label, topN=1)

    return {
        'name': 'resnet50',
        'feature': graph_input,
        'label': graph_label,
        'ce': ce,
        'errs': errs,
        'output': z
    }


# Create trainer
def create_trainer(network, minibatch_size, epoch_size,
                   learning_rate, momentum, l2_reg_weight,
                   num_quantization_bits):
    lr_per_mb = [learning_rate]

    # Set learning parameters
    lr_schedule = learning_rate_schedule(
        lr_per_mb, epoch_size=epoch_size, unit=UnitType.minibatch)
    mm_schedule = momentum_schedule(momentum)
    local_learner = momentum_sgd(network['output'].parameters,
                                 lr_schedule,
                                 mm_schedule,
                                 l2_regularization_weight=l2_reg_weight)

    # learner object
    if _DISTRIBUTED:
        learner = data_parallel_distributed_learner(
            local_learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=0)
    else:
        learner = local_learner

    # logger
    progress_printer = _get_progress_printer()

    return Trainer(network['output'], (network['ce'], network['errs']), learner, progress_printer)


def train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, model_path):

    # define mapping from input streams to network inputs
    input_map = {
        network['feature']: train_source.streams.features,
        network['label']: train_source.streams.labels
    }
    if _DISTRIBUTED:
        start_profiler(sync_gpu=True)

    training_session(
        trainer=trainer,
        mb_source=train_source,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        progress_frequency=epoch_size,
        checkpoint_config=CheckpointConfig(frequency=epoch_size,
                                           filename=os.path.join(
                                               model_path, _MODELNAME),
                                           restore=False)  # ,
        # test_config=TestConfig(test_source, minibatch_size)
    ).train()
    if _DISTRIBUTED:
        stop_profiler()


def main():
    model_path = os.getenv('AZ_BATCHAI_OUTPUT_MODEL')

    if _DISTRIBUTED:
        minibatch_size = _BATCHSIZE * Communicator.num_workers()
    else:
        minibatch_size = _BATCHSIZE

    logger.info("Creating model ...")
    network = model_fn()

    logger.info("Creating trainer ...")
    trainer = create_trainer(network,
                             minibatch_size,
                             _DATA_LENGTH,
                             learning_rate=_LR,
                             momentum=_MOMENTUM,
                             l2_reg_weight=_WD,
                             num_quantization_bits=_NUMQUANTIZEDBITS)

    logger.info('Creating data sources ...')
    if _FAKE:
        logger.info("Using {} images of fake data".format(_DATA_LENGTH))
        train_source = FakeDataSource(total_n_images=_DATA_LENGTH,
                                      dim=(_HEIGHT, _WIDTH),
                                      channels=_CHANNELS,
                                      n_classes=_NUMCLASSES)
        test_source = None
    else:
        logger.info(
            "Using ImageNet dataset with {} images".format(_DATA_LENGTH))
        data_path = os.getenv('AZ_BATCHAI_INPUT_TRAIN')
        logger.info("model_path: {}".format(model_path))
        logger.info("data_path: {}".format(data_path))

        mean_data = os.path.join(data_path, 'ImageNet1K_mean.xml')
        train_data = os.path.join(data_path, 'train_map.txt')
        test_data = os.path.join(data_path, 'val_map.txt')
        train_source = create_image_mb_source(
            train_data, mean_data, train=True, total_number_of_samples=_EPOCHS*_DATA_LENGTH)
        test_source = create_image_mb_source(
            test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)

    logger.info("Training with minibatch size of {}".format(minibatch_size))
    train_and_test(network, trainer, train_source, test_source,
                   minibatch_size, _DATA_LENGTH, model_path)

    if _DISTRIBUTED:
        # Must call MPI finalize when the process exits without exceptions
        Communicator.finalize()


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger.info("Starting routine. Distributed mode={}".format(_DISTRIBUTED))
    main()
    logger.info("Routine finished")
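FakeDataSource one-hot encodes its labels with OneHotEncoder(n_values=..., categorical_features=...), arguments that newer scikit-learn releases have removed. A hedged plain-NumPy equivalent of that step:

import numpy as np

def one_hot(labels, n_classes, dtype=np.float32):
    # Stand-in for the deprecated OneHotEncoder call: one row per label,
    # a single 1.0 in the column of that label's class.
    out = np.zeros((len(labels), n_classes), dtype=dtype)
    out[np.arange(len(labels)), labels] = 1
    return out

y = np.random.choice(1000, 32)            # 32 random ImageNet-style labels
assert one_hot(y, 1000).shape == (32, 1000)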
@@ -1,137 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================

import numpy as np
from cntk.initializer import he_normal, normal
from cntk.layers import AveragePooling, MaxPooling, BatchNormalization, Convolution, Dense
from cntk.ops import element_times, relu

#
# assembly components
#
def conv_bn(input, filter_size, num_filters, strides=(1, 1), init=he_normal(), bn_init_scale=1):
    c = Convolution(filter_size, num_filters, activation=None, init=init, pad=True, strides=strides, bias=False)(input)
    r = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=False, init_scale=bn_init_scale, disable_regularization=True)(c)
    return r


def conv_bn_relu(input, filter_size, num_filters, strides=(1, 1), init=he_normal()):
    r = conv_bn(input, filter_size, num_filters, strides, init, 1)
    return relu(r)


#
# ResNet components
#
def resnet_basic(input, num_filters):
    c1 = conv_bn_relu(input, (3, 3), num_filters)
    c2 = conv_bn(c1, (3, 3), num_filters, bn_init_scale=1)
    p = c2 + input
    return relu(p)


def resnet_basic_inc(input, num_filters, strides=(2, 2)):
    c1 = conv_bn_relu(input, (3, 3), num_filters, strides)
    c2 = conv_bn(c1, (3, 3), num_filters, bn_init_scale=1)
    s = conv_bn(input, (1, 1), num_filters, strides)  # Shortcut
    p = c2 + s
    return relu(p)


def resnet_basic_stack(input, num_stack_layers, num_filters):
    assert(num_stack_layers >= 0)
    l = input
    for _ in range(num_stack_layers):
        l = resnet_basic(l, num_filters)
    return l


def resnet_bottleneck(input, out_num_filters, inter_out_num_filters):
    c1 = conv_bn_relu(input, (1, 1), inter_out_num_filters)
    c2 = conv_bn_relu(c1, (3, 3), inter_out_num_filters)
    c3 = conv_bn(c2, (1, 1), out_num_filters, bn_init_scale=0)
    p = c3 + input
    return relu(p)


def resnet_bottleneck_inc(input, out_num_filters, inter_out_num_filters, stride1x1, stride3x3):
    c1 = conv_bn_relu(input, (1, 1), inter_out_num_filters, strides=stride1x1)
    c2 = conv_bn_relu(c1, (3, 3), inter_out_num_filters, strides=stride3x3)
    c3 = conv_bn(c2, (1, 1), out_num_filters, bn_init_scale=0)
    stride = np.multiply(stride1x1, stride3x3)
    s = conv_bn(input, (1, 1), out_num_filters, strides=stride)  # Shortcut
    p = c3 + s
    return relu(p)


def resnet_bottleneck_stack(input, num_stack_layers, out_num_filters, inter_out_num_filters):
    assert(num_stack_layers >= 0)
    l = input
    for _ in range(num_stack_layers):
        l = resnet_bottleneck(l, out_num_filters, inter_out_num_filters)
    return l


#
# Defines the residual network model for classifying images
#
def create_cifar10_model(input, num_stack_layers, num_classes):
    c_map = [16, 32, 64]

    conv = conv_bn_relu(input, (3, 3), c_map[0])
    r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers-1, c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers-1, c_map[2])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(8, 8), name='final_avg_pooling')(r3_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z


def create_imagenet_model_basic(input, num_stack_layers, num_classes):
    c_map = [64, 128, 256, 512]

    conv = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
    pool1 = MaxPooling((3, 3), strides=(2, 2), pad=True)(conv)
    r1 = resnet_basic_stack(pool1, num_stack_layers[0], c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers[1], c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers[2], c_map[2])

    r4_1 = resnet_basic_inc(r3_2, c_map[3])
    r4_2 = resnet_basic_stack(r4_1, num_stack_layers[3], c_map[3])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r4_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z


def create_imagenet_model_bottleneck(input, num_stack_layers, num_classes, stride1x1, stride3x3):
    c_map = [64, 128, 256, 512, 1024, 2048]

    # conv1 and max pooling
    conv1 = conv_bn_relu(input, (7, 7), c_map[0], strides=(2, 2))
    pool1 = MaxPooling((3, 3), strides=(2, 2), pad=True)(conv1)

    # conv2_x
    r2_1 = resnet_bottleneck_inc(pool1, c_map[2], c_map[0], (1, 1), (1, 1))
    r2_2 = resnet_bottleneck_stack(r2_1, num_stack_layers[0], c_map[2], c_map[0])

    # conv3_x
    r3_1 = resnet_bottleneck_inc(r2_2, c_map[3], c_map[1], stride1x1, stride3x3)
    r3_2 = resnet_bottleneck_stack(r3_1, num_stack_layers[1], c_map[3], c_map[1])

    # conv4_x
    r4_1 = resnet_bottleneck_inc(r3_2, c_map[4], c_map[2], stride1x1, stride3x3)
    r4_2 = resnet_bottleneck_stack(r4_1, num_stack_layers[2], c_map[4], c_map[2])

    # conv5_x
    r5_1 = resnet_bottleneck_inc(r4_2, c_map[5], c_map[3], stride1x1, stride3x3)
    r5_2 = resnet_bottleneck_stack(r5_1, num_stack_layers[3], c_map[5], c_map[3])

    # Global average pooling and output
    pool = AveragePooling(filter_shape=(7, 7), name='final_avg_pooling')(r5_2)
    z = Dense(num_classes, init=normal(0.01))(pool)
    return z
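A quick sanity check (not part of the original file) that the configuration [2, 3, 5, 2] passed by imagenet_cntk.py yields the standard ResNet-50 depth: each stage contributes one "inc" block plus the stacked blocks, every bottleneck block holds three convolutions, and the stem convolution and final dense layer add two more:

num_stack_layers = [2, 3, 5, 2]                       # value used by model_fn()
blocks_per_stage = [n + 1 for n in num_stack_layers]  # inc block + stacked blocks
depth = 1 + 3 * sum(blocks_per_stage) + 1             # conv1 + 3 convs/block + dense
print(depth)  # 50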
@@ -1,48 +0,0 @@
FROM ubuntu:16.04

COPY environment.yml .

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    vim \
    wget \
    curl \
    gfortran \
    apt-transport-https \
    jq \
    locales \
    git \
    openssh-client && \
    rm -rf /var/lib/apt/lists/*

RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

ENV ENV_NAME=py3.6
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh && \
    /opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
    /opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH


# Install Azure CLI
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
    tee /etc/apt/sources.list.d/azure-cli.list && \
    curl -L https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    azure-cli

# Install AzCopy
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list && \
    cp ./azure.list /etc/apt/sources.list.d/ && \
    apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 && \
    apt-get update && \
    apt-get install -y --no-install-recommends azcopy
@@ -1,16 +0,0 @@
channels:
  - conda-forge
dependencies:
  - python=3.6
  - numpy
  - pyyaml
  - scipy
  - ipython
  - pandas
  - jupyter
  - ipykernel
  - scikit-learn
  - selenium
  - phantomjs
  - pillow
  - bokeh=0.13.0
@@ -1,63 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04

# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV PYTHON_VERSION=3.5
ENV NCCL_VERSION=2.2.12-1+cuda9.0

RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    cpio \
    git \
    curl \
    wget \
    ca-certificates \
    libdapl2 \
    libcudnn7=${CUDNN_VERSION} \
    libnccl2=${NCCL_VERSION} \
    libnccl-dev=${NCCL_VERSION} \
    libjpeg-dev \
    libpng-dev \
    libmlx4-1 \
    libsm6 \
    libxext6 \
    python$PYTHON_VERSION \
    python$PYTHON_VERSION-dev


# install intel MPI
RUN cd /tmp && \
    wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
    tar zxvf l_mpi_2017.3.196.tgz && \
    sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
    sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
        /tmp/l_mpi_2017.3.196/silent.cfg && \
    sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
    cd /tmp/l_mpi_2017.3.196 && \
    ./install.sh -s silent.cfg && \
    cd .. && \
    rm -rf l_mpi_2017.3.196* && \
    echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc

ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64

RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py


# Install TensorFlow and Keras
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
    scikit-learn keras pillow

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
    /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
    HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
    ldconfig
@@ -1,79 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04

# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0

ENV PYTHON_VERSION=3.5

RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    git \
    curl \
    nano \
    wget \
    ca-certificates \
    libcudnn7=$CUDNN_VERSION \
    libnccl2=$NCCL_VERSION \
    libnccl-dev=$NCCL_VERSION \
    libjpeg-dev \
    libpng-dev \
    python$PYTHON_VERSION \
    python$PYTHON_VERSION-dev

RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

# Install TensorFlow
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn keras pillow

# Install Open MPI
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
    tar zxf openmpi-3.0.0.tar.gz && \
    cd openmpi-3.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
    ldconfig

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun

# Configure OpenMPI with good defaults:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
    echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
    echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
    echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf

# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

WORKDIR "/examples"
@@ -1,13 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=masalvar
tag:=9-1.8-.13.2 # Cuda - TF version - Horovod version
image-intel:=$(name_prefix)/horovod-intel-keras:$(tag)
intel-path:=$(PWD)/Docker/horovod-intel
image-open:=$(name_prefix)/horovod-keras:$(tag)
open-path:=$(PWD)/Docker/horovod
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py
include ../include/build.mk
@@ -1,53 +0,0 @@
import numpy as np
import keras
import logging


def _get_logger():
    return logging.getLogger(__name__)


def _create_data(batch_size, num_batches, dim, channels, seed=42):
    np.random.seed(seed)
    return np.random.rand(batch_size * num_batches,
                          dim[0],
                          dim[1],
                          channels).astype(np.float32)


def _create_labels(batch_size, num_batches, n_classes):
    return np.random.choice(n_classes, batch_size * num_batches)


class FakeDataGenerator(keras.preprocessing.image.Iterator):

    def __init__(self,
                 batch_size=32,
                 num_batches=20,
                 dim=(224, 224),
                 n_channels=3,
                 n_classes=10,
                 length=1000,
                 shuffle=True,
                 seed=42):
        """Initialization"""
        super(FakeDataGenerator, self).__init__(length,
                                                batch_size,
                                                shuffle,
                                                seed)
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.num_batches = num_batches
        self._data = _create_data(self.batch_size, self.num_batches, self.dim, self.n_channels)
        self._labels = _create_labels(self.batch_size, self.num_batches, self.n_classes)
        self.translation_index = np.random.choice(len(self._labels), length)

    def _get_batches_of_transformed_samples(self, index_array):
        logger = _get_logger()
        logger.debug('Retrieving samples')
        logger.debug(str(index_array))
        tr_index_array = self.translation_index[index_array]
        return self._data[tr_index_array], keras.utils.to_categorical(self._labels[tr_index_array], num_classes=self.n_classes)
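Because FakeDataGenerator subclasses the Keras image Iterator, it supports the generator protocol. A quick smoke test (a sketch, assuming keras is installed and this file is importable as data_generator):

from data_generator import FakeDataGenerator

gen = FakeDataGenerator(batch_size=32, n_classes=1000, length=256)
images, labels = next(gen)  # Iterator implements __next__
print(images.shape, labels.shape)  # (32, 224, 224, 3) (32, 1000)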
@@ -1,314 +0,0 @@
"""
Trains ResNet50 in Keras using Horovod.

It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import sys
from functools import lru_cache

from data_generator import FakeDataGenerator

from timer import Timer

import keras
from keras import backend as K
from keras.preprocessing import image
import tensorflow as tf
import os


def _str_to_bool(in_str):
    if 't' in in_str.lower():
        return True
    else:
        return False


_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv('EPOCHS', 1))
_BATCHSIZE = 64
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94

# Settings from https://arxiv.org/abs/1706.02677.
_WARMUP_EPOCHS = 5
_WEIGHT_DECAY = 0.00005

_NUM_WORKERS = int(os.getenv('NUM_WORKERS', 10))
_MAX_QUEUE_SIZE = int(os.getenv('MAX_QUEUE_SIZE', 10))
_MULTIPROCESSING = _str_to_bool(os.getenv('MULTIPROCESSING', 'False'))
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
# How much fake data to simulate, default to size of imagenet dataset
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167))
_VALIDATION = _str_to_bool(os.getenv('VALIDATION', 'False'))


if _DISTRIBUTED:
    import horovod.keras as hvd


def _get_rank():
    if _DISTRIBUTED:
        try:
            return hvd.rank()
        except:
            return 0
    else:
        return 0


class HorovodAdapter(logging.LoggerAdapter):
    def __init__(self, logger):
        self._str_epoch = ''
        self._gpu_rank = 0
        super(HorovodAdapter, self).__init__(logger, {})

    def set_epoch(self, epoch):
        self._str_epoch = '[Epoch {}]'.format(epoch)

    def process(self, msg, kwargs):
        kwargs['extra'] = {
            'gpurank': _get_rank(),
            'epoch': self._str_epoch
        }
        return msg, kwargs


@lru_cache()
def _get_logger():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    ch = logging.StreamHandler(stream=sys.stdout)
    formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    adapter = HorovodAdapter(logger)
    return adapter


def _create_model():
    logger = _get_logger()
    logger.info('Creating model')
    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)
    # ResNet-50 model that is included with Keras is optimized for inference.
    # Add L2 weight decay & adjust BN settings.
    model_config = model.get_config()
    for layer, layer_config in zip(model.layers, model_config['layers']):
        if hasattr(layer, 'kernel_regularizer'):
            regularizer = keras.regularizers.l2(_WEIGHT_DECAY)
            layer_config['config']['kernel_regularizer'] = \
                {'class_name': regularizer.__class__.__name__,
                 'config': regularizer.get_config()}
        if type(layer) == keras.layers.BatchNormalization:
            layer_config['config']['momentum'] = 0.9
            layer_config['config']['epsilon'] = 1e-5
    model = keras.models.Model.from_config(model_config)
    return model


def _validation_data_iterator_from():
    # Validation data iterator.
    raise NotImplementedError('The flow from directory command expects data to be in directories and this is not implemented yet')
    # test_gen = image.ImageDataGenerator(
    #     zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    # test_iter = test_gen.flow_from_directory(os.path.join(os.getenv('AZ_BATCHAI_INPUT_TEST'), 'validation'), batch_size=_BATCHSIZE,
    #                                          target_size=(224, 224))
    # return test_iter


def _training_data_iterator_from():
    # Training data iterator.
    train_gen = image.ImageDataGenerator(
        width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
        preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(os.path.join(os.getenv('AZ_BATCHAI_INPUT_TRAIN'), 'train'), batch_size=_BATCHSIZE,
                                               target_size=(224, 224))
    return train_iter


def _fake_data_iterator_from(length=_DATA_LENGTH):
    return FakeDataGenerator(batch_size=_BATCHSIZE, n_classes=1000, length=length)


def _get_optimizer(params, is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: adjust learning rate based on number of GPUs.
        opt = keras.optimizers.SGD(lr=params['learning_rate'] * hvd.size(), momentum=params['momentum'])
        # Horovod: add Horovod Distributed Optimizer.
        return hvd.DistributedOptimizer(opt)
    else:
        return keras.optimizers.SGD(lr=params['learning_rate'], momentum=params['momentum'])


def _get_runconfig(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    return config


def _get_model_dir(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: save checkpoints only on worker 0 to prevent other workers from
        # corrupting them.
        return os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else os.getenv('AZ_BATCHAI_JOB_TEMP_DIR')
    else:
        return os.getenv('AZ_BATCHAI_OUTPUT_MODEL')


def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
    logger = _get_logger()
    if is_distributed:
        logger.info('Rank: {} Cluster Size {}'.format(hvd.local_rank(), hvd.size()))
        return [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=_WARMUP_EPOCHS, verbose=verbose),

            # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=30, end_epoch=60, multiplier=1e-1),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=60, end_epoch=80, multiplier=1e-2),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
        ]
    else:
        return []


class LoggerCallback(keras.callbacks.Callback):

    def __init__(self, logger, data_length):
        super(LoggerCallback, self).__init__()
        self._timer = Timer(output=logger.info, prefix="Epoch duration: ", fmt="{:.3f} seconds")
        self._data_length = data_length

    def on_epoch_begin(self, epoch, logs):
        logger = _get_logger()
        logger.set_epoch(epoch)
        self._timer.start()

    def on_epoch_end(self, epoch, logs):
        duration = self._timer.elapsed
        _log_summary(self._data_length, duration)


def _is_master(is_distributed=_DISTRIBUTED):
    if is_distributed:
        if hvd.rank() == 0:
            return True
        else:
            return False
    else:
        return True


def _log_summary(data_length, duration):
    logger = _get_logger()
    images_per_second = data_length / duration
    logger.info('Data length: {}'.format(data_length))
    logger.info('Total duration: {:.3f}'.format(duration))
    logger.info('Total images/sec: {:.3f}'.format(images_per_second))
    logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE, hvd.size()*_BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
    logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
    logger.info('Num GPUs: {:.3f}'.format(hvd.size() if _DISTRIBUTED else 1))
    logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))


def main():
    verbose = 1
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Running Distributed")
        verbose = 1 if hvd.rank() == 0 else 0

    logger.info("Tensorflow version {}".format(tf.__version__))
    K.set_session(tf.Session(config=_get_runconfig()))

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = 0
    if _DISTRIBUTED:
        resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    if _FAKE:
        train_iter = _fake_data_iterator_from()
    else:
        train_iter = _training_data_iterator_from()
    test_iter = _validation_data_iterator_from() if _VALIDATION else None

    model = _create_model()

    params = {
        'learning_rate': _LR,
        'momentum': 0.9
    }

    opt = _get_optimizer(params)
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy', 'top_k_categorical_accuracy'])

    model_dir = _get_model_dir()
    checkpoint_format = os.path.join(model_dir, 'checkpoint-{epoch}.h5')

    callbacks = _get_hooks()
    callbacks.append(LoggerCallback(logger, len(train_iter)*_BATCHSIZE))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if _is_master():
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        # callbacks.append(keras.callbacks.TensorBoard(log_dir))

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and _is_master():
        model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))

    logger.info('Training...')
    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.
    num_workers = hvd.size() if _DISTRIBUTED else 1
    model.fit_generator(train_iter,
                        steps_per_epoch=len(train_iter) // num_workers,
                        callbacks=callbacks,
                        epochs=_EPOCHS,
                        verbose=verbose,
                        workers=_NUM_WORKERS,
                        max_queue_size=_MAX_QUEUE_SIZE,
                        use_multiprocessing=_MULTIPROCESSING,
                        initial_epoch=resume_from_epoch)

    if _FAKE is False and _VALIDATION:
        # Evaluate the model on the full data set.
        with Timer(output=logger.info, prefix="Testing"):
            logger.info('Testing...')
            score = hvd.allreduce(model.evaluate_generator(test_iter, len(test_iter), workers=10))
        if verbose:
            print('Test loss:', score[0])
            print('Test accuracy:', score[1])


if __name__ == '__main__':
    main()
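For reference, the linear learning-rate scaling rule this script applies via hvd.size() (following https://arxiv.org/abs/1706.02677) is plain arithmetic: the per-update batch grows with the number of workers, so the learning rate is scaled by the same factor after warmup. An illustration with the script's constants:

base_lr = 0.001      # _LR in the script
batch_per_gpu = 64   # _BATCHSIZE in the script

for num_workers in (1, 4, 8):
    effective_batch = batch_per_gpu * num_workers  # samples per optimizer step
    scaled_lr = base_lr * num_workers              # what _get_optimizer computes
    print(num_workers, effective_batch, scaled_lr)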
@@ -1,69 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04

# PyTorch version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV PYTORCH_VERSION=0.4.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0

ENV PYTHON_VERSION=3.5

RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    cpio \
    git \
    curl \
    wget \
    ca-certificates \
    libdapl2 \
    libcudnn7=${CUDNN_VERSION} \
    libnccl2=${NCCL_VERSION} \
    libnccl-dev=${NCCL_VERSION} \
    libjpeg-dev \
    libpng-dev \
    libmlx4-1 \
    libsm6 \
    libxext6 \
    python${PYTHON_VERSION} \
    python${PYTHON_VERSION}-dev

# install intel MPI
RUN cd /tmp && \
    wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
    tar zxvf l_mpi_2017.3.196.tgz && \
    sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
    sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' /tmp/l_mpi_2017.3.196/silent.cfg && \
    sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
    cd /tmp/l_mpi_2017.3.196 && \
    ./install.sh -s silent.cfg && \
    cd .. && \
    rm -rf l_mpi_2017.3.196* && \
    echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc

ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64

RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

## Install PyTorch
#RUN PY=$(echo ${PYTHON_VERSION} | sed s/\\.//); \
#    if [[ ${PYTHON_VERSION} == 3* ]]; then \
#        pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp${PY}-cp${PY}m-linux_x86_64.whl; \
#    else \
#        pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp${PY}-cp${PY}mu-linux_x86_64.whl; \
#    fi; \

# Install PyTorch
RUN pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp35-cp35m-linux_x86_64.whl && \
    pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
    /bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
    HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.8 && \
    ldconfig
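The commented-out block above parametrizes the wheel URL by PYTHON_VERSION, while the active RUN hardcodes the cp35 wheel. A hypothetical Python helper mirroring that shell logic, for reference only:

def torch_wheel_name(pytorch_version, python_version):
    # Build the cp-tag for the PyTorch wheel file name from the Python version.
    py = python_version.replace('.', '')  # '3.5' -> '35'
    abi = 'cp{0}m'.format(py) if python_version.startswith('3') else 'cp{0}mu'.format(py)
    return 'torch-{0}-cp{1}-{2}-linux_x86_64.whl'.format(pytorch_version, py, abi)

print(torch_wheel_name('0.4.0', '3.5'))  # torch-0.4.0-cp35-cp35m-linux_x86_64.whl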
@@ -1,82 +0,0 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04

# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.8.0
ENV PYTORCH_VERSION=0.4.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.12-1+cuda9.0

ENV PYTHON_VERSION=3.5

RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    git \
    curl \
    vim \
    wget \
    ca-certificates \
    libcudnn7=${CUDNN_VERSION} \
    libnccl2=${NCCL_VERSION} \
    libnccl-dev=${NCCL_VERSION} \
    libjpeg-dev \
    libpng-dev \
    python${PYTHON_VERSION} \
    python${PYTHON_VERSION}-dev

RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py


# Install PyTorch
RUN pip install http://download.pytorch.org/whl/cu90/torch-0.4.0-cp35-cp35m-linux_x86_64.whl && \
    pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow

# Install Open MPI
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
    tar zxf openmpi-3.0.0.tar.gz && \
    cd openmpi-3.0.0 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j $(nproc) all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.2 && \
    ldconfig

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun

# Configure OpenMPI with good defaults:
#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
    echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf #&& \
    # echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
    echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf

# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
    mkdir -p /var/run/sshd

# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

WORKDIR "/examples"
@@ -1,13 +0,0 @@
DATA_DIR:=/mnt/imagenet
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
name_prefix:=masalvar
tag:=9-1.8-.13.2 # Cuda - TF version - Horovod version
image-intel:=$(name_prefix)/horovod-intel-pytorch:$(tag)
intel-path:=$(PWD)/Docker/horovod-intel
image-open:=$(name_prefix)/horovod-pytorch:$(tag)
open-path:=$(PWD)/Docker/horovod
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py
include ../include/build.mk
@ -1,328 +0,0 @@
|
|||
"""
|
||||
Trains ResNet50 in Keras using Horovod.
|
||||
|
||||
It requires the following env variables
|
||||
AZ_BATCHAI_INPUT_TRAIN
|
||||
AZ_BATCHAI_INPUT_TEST
|
||||
AZ_BATCHAI_OUTPUT_MODEL
|
||||
AZ_BATCHAI_JOB_TEMP_DIR
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
|
||||
from timer import Timer
|
||||
import numpy as np
|
||||
import os
|
||||
from PIL import Image
|
||||
|
||||
import torch.optim as optim
|
||||
from torchvision import transforms
|
||||
import torch.utils.data.distributed
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torchvision.models as models
|
||||
from os import path
|
||||
import pandas as pd
|
||||
from torch.utils.data import Dataset
|
||||
import torch.nn.functional as F
|
||||
|
||||
def _str_to_bool(in_str):
|
||||
if 't' in in_str.lower():
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
_WIDTH = 224
|
||||
_HEIGHT = 224
|
||||
_CHANNELS = 3
|
||||
_LR = 0.001
|
||||
_EPOCHS = os.getenv('EPOCHS', 1)
|
||||
_BATCHSIZE = 64
|
||||
_RGB_MEAN = [0.485, 0.456, 0.406]
|
||||
_RGB_SD = [0.229, 0.224, 0.225]
|
||||
_SEED=42
|
||||
|
||||
# Settings from https://arxiv.org/abs/1706.02677.
|
||||
_WARMUP_EPOCHS = 5
|
||||
_WEIGHT_DECAY = 0.00005
|
||||
|
||||
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
|
||||
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
|
||||
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
|
||||
|
||||
if _DISTRIBUTED:
|
||||
import horovod.torch as hvd
|
||||
|
||||
|
||||
def _get_rank():
|
||||
if _DISTRIBUTED:
|
||||
try:
|
||||
return hvd.rank()
|
||||
except:
|
||||
return 0
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class HorovodAdapter(logging.LoggerAdapter):
|
||||
def __init__(self, logger):
|
||||
self._str_epoch=''
|
||||
self._gpu_rank=0
|
||||
super(HorovodAdapter, self).__init__(logger, {})
|
||||
|
||||
def set_epoch(self, epoch):
|
||||
self._str_epoch='[Epoch {}]'.format(epoch)
|
||||
|
||||
def process(self, msg, kwargs):
|
||||
kwargs['extra'] = {
|
||||
'gpurank': _get_rank(),
|
||||
'epoch': self._str_epoch
|
||||
}
|
||||
return msg, kwargs
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_logger():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
ch = logging.StreamHandler(stream=sys.stdout)
|
||||
formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
|
||||
ch.setFormatter(formatter)
|
||||
logger.addHandler(ch)
|
||||
adapter = HorovodAdapter(logger)
|
||||
return adapter
|
||||
|
||||
|
||||
def _append_path_to(data_path, data_series):
|
||||
return data_series.apply(lambda x: path.join(data_path, x))
|
||||
|
||||
|
||||
def _load_training(data_dir):
|
||||
logger = _get_logger()
|
||||
logger.info('Reading training data from {}'.format(data_dir))
|
||||
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
|
||||
train_df.filenames))
|
||||
|
||||
def _load_validation(data_dir):
|
||||
logger = _get_logger()
|
||||
logger.info('Reading validation data from {}'.format(data_dir))
|
||||
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
|
||||
train_df.filenames))
|
||||
|
||||
|
||||
def _create_data_fn(train_path, test_path):
|
||||
train_df = _load_training(train_path)
|
||||
validation_df = _load_validation(test_path)
|
||||
# File-path
|
||||
train_X = train_df['filenames'].values
|
||||
validation_X = validation_df['filenames'].values
|
||||
# One-hot encoded labels for torch
|
||||
train_labels = train_df[['num_id']].values.ravel()
|
||||
validation_labels = validation_df[['num_id']].values.ravel()
|
||||
# Index starts from 0
|
||||
train_labels -= 1
|
||||
validation_labels -= 1
|
||||
return train_X, train_labels, validation_X, validation_labels
|
||||
|
||||
|
||||
class ImageNet(Dataset):
|
||||
def __init__(self, img_locs, img_labels, transform=None):
|
||||
logger = _get_logger()
|
||||
self.img_locs, self.labels = img_locs, img_labels
|
||||
self.transform = transform
|
||||
logger.info("Loaded {} labels and {} images".format(len(np.unique(self.labels)), len(self.img_locs)))
|
||||
|
||||
def __getitem__(self, idx):
|
||||
im_file = self.img_locs[idx]
|
||||
label = self.labels[idx]
|
||||
with open(im_file, 'rb') as f:
|
||||
im_rgb = Image.open(f)
|
||||
# Make sure 3-channel (RGB)
|
||||
im_rgb = im_rgb.convert('RGB')
|
||||
if self.transform is not None:
|
||||
im_rgb = self.transform(im_rgb)
|
||||
return im_rgb, label
|
||||
|
||||
def __len__(self):
|
||||
return len(self.img_locs)
|
||||
|
||||
|
||||
def _create_data(batch_size, num_batches, dim, channels, seed=42):
|
||||
np.random.seed(seed)
|
||||
return np.random.rand(batch_size * num_batches,
|
||||
channels,
|
||||
dim[0],
|
||||
dim[1]).astype(np.float32)
|
||||
|
||||
|
||||
def _create_labels(batch_size, num_batches, n_classes):
|
||||
return np.random.choice(n_classes, batch_size * num_batches)
|
||||
|
||||
|
||||
|
||||
class FakeData(Dataset):
|
||||
def __init__(self,
|
||||
batch_size=32,
|
||||
num_batches=20,
|
||||
dim=(224, 224),
|
||||
n_channels=3,
|
||||
n_classes=10,
|
||||
length=_DATA_LENGTH,
|
||||
seed=42,
|
||||
data_transform=None):
|
||||
self.dim = dim
|
||||
self.n_channels = n_channels
|
||||
self.n_classes = n_classes
|
||||
self.num_batches = num_batches
|
||||
self._data = _create_data(batch_size, self.num_batches, self.dim, self.n_channels)
|
||||
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
|
||||
self.translation_index = np.random.choice(len(self._labels), length)
|
||||
self._length = length
|
||||
|
||||
self._data_transform = data_transform
|
||||
logger = _get_logger()
|
||||
logger.info("Creating fake data {} labels and {} images".format(n_classes, len(self._data)))
|
||||
|
||||
def __getitem__(self, idx):
|
||||
logger = _get_logger()
|
||||
logger.debug('Retrieving samples')
|
||||
logger.debug(str(idx))
|
||||
tr_index_array = self.translation_index[idx]
|
||||
|
||||
if self._data_transform is not None:
|
||||
data = self._data_transform(self._data[tr_index_array])
|
||||
else:
|
||||
data = self._data[tr_index_array]
|
||||
|
||||
return data, self._labels[tr_index_array]
|
||||
|
||||
def __len__(self):
|
||||
return self._length
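# Minimal usage sketch (this mirrors what main() below does): wrap FakeData in
# a DataLoader and pull one synthetic batch.
#   ds = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
#   loader = torch.utils.data.DataLoader(ds, batch_size=_BATCHSIZE)
#   x, y = next(iter(loader))  # x has shape [_BATCHSIZE, 3, 224, 224]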
|
||||
|
||||
|
||||
def _is_master(is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
if hvd.rank() == 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch):
|
||||
logger = _get_logger()
|
||||
msg = ' duration({}) loss:{} total-samples: {}'
|
||||
t = Timer()
|
||||
t.start()
|
||||
logger.set_epoch(epoch)
|
||||
for i, (data, target) in enumerate(train_loader):
|
||||
data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
|
||||
optimizer.zero_grad()
|
||||
# compute output
|
||||
output = model(data)
|
||||
loss = criterion(output, target)
|
||||
# compute gradient and do SGD step
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
if i % 100 == 0:
|
||||
logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
|
||||
t.start()
|
||||
|
||||
|
||||
def _log_summary(data_length, duration):
|
||||
logger = _get_logger()
|
||||
images_per_second = data_length / duration
|
||||
logger.info('Data length: {}'.format(data_length))
|
||||
logger.info('Total duration: {:.3f}'.format(duration))
|
||||
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
|
||||
logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE, hvd.size()*_BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
|
||||
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
|
||||
logger.info('Num GPUs: {}'.format(hvd.size() if _DISTRIBUTED else 1))
|
||||
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
|
||||
|
||||
|
||||
def _get_sampler(dataset, is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
return torch.utils.data.distributed.DistributedSampler(
|
||||
dataset, num_replicas=hvd.size(), rank=hvd.rank())
|
||||
else:
|
||||
return torch.utils.data.sampler.RandomSampler(dataset)
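# Note: DistributedSampler gives each of the hvd.size() workers a distinct
# ~1/hvd.size() shard of the dataset per epoch; main() calls set_epoch()
# each epoch so the shards are reshuffled.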
|
||||
|
||||
|
||||
def main():
|
||||
logger = _get_logger()
|
||||
if _DISTRIBUTED:
|
||||
# Horovod: initialize Horovod.
|
||||
|
||||
hvd.init()
|
||||
logger.info("Runnin Distributed")
|
||||
torch.manual_seed(_SEED)
|
||||
# Horovod: pin GPU to local rank.
|
||||
torch.cuda.set_device(hvd.local_rank())
|
||||
torch.cuda.manual_seed(_SEED)
|
||||
|
||||
logger.info("PyTorch version {}".format(torch.__version__))
|
||||
|
||||
if _FAKE:
|
||||
logger.info("Setting up fake loaders")
|
||||
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
|
||||
else:
|
||||
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
|
||||
|
||||
train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'), os.getenv('AZ_BATCHAI_INPUT_TEST'))
|
||||
|
||||
logger.info("Setting up loaders")
|
||||
train_dataset = ImageNet(
|
||||
train_X,
|
||||
train_y,
|
||||
transforms.Compose([
|
||||
transforms.RandomResizedCrop(_WIDTH),
|
||||
transforms.RandomHorizontalFlip(),
|
||||
transforms.ToTensor(),
|
||||
normalize]))
|
||||
|
||||
|
||||
train_sampler = _get_sampler(train_dataset)
|
||||
kwargs = {'num_workers': 5, 'pin_memory': True}
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs)
|
||||
|
||||
# Autotune
|
||||
cudnn.benchmark = True
|
||||
|
||||
logger.info("Loading model")
|
||||
# Load symbol
|
||||
model = models.__dict__['resnet50'](pretrained=False)
|
||||
|
||||
model.cuda()
|
||||
|
||||
if _DISTRIBUTED:
|
||||
# Horovod: broadcast parameters.
|
||||
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
|
||||
|
||||
num_gpus = hvd.size() if _DISTRIBUTED else 1
|
||||
# Horovod: scale learning rate by the number of GPUs.
|
||||
optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus,
|
||||
momentum=0.9)
|
||||
if _DISTRIBUTED:
|
||||
# Horovod: wrap optimizer with DistributedOptimizer.
|
||||
optimizer = hvd.DistributedOptimizer(
|
||||
optimizer, named_parameters=model.named_parameters())
|
||||
|
||||
criterion = F.cross_entropy
|
||||
# Main training-loop
|
||||
logger.info("Training ...")
|
||||
for epoch in range(_EPOCHS):
|
||||
with Timer(output=logger.info, prefix="Training") as t:
|
||||
model.train()
|
||||
if _DISTRIBUTED:
|
||||
train_sampler.set_epoch(epoch)
|
||||
train(train_loader, model, criterion, optimizer, epoch)
|
||||
|
||||
_log_summary(len(train_dataset), t.elapsed)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,59 +0,0 @@
|
|||
FROM nvidia/cuda:9.0-devel-ubuntu16.04
|
||||
|
||||
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
|
||||
ENV TENSORFLOW_VERSION=1.8.0
|
||||
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
|
||||
ENV PYTHON_VERSION=3.5
|
||||
|
||||
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
cmake \
|
||||
cpio \
|
||||
git \
|
||||
curl \
|
||||
wget \
|
||||
ca-certificates \
|
||||
libdapl2 \
|
||||
libcudnn7=$CUDNN_VERSION \
|
||||
libjpeg-dev \
|
||||
libpng-dev \
|
||||
libmlx4-1 \
|
||||
libsm6 \
|
||||
libxext6 \
|
||||
python$PYTHON_VERSION \
|
||||
python$PYTHON_VERSION-dev
|
||||
|
||||
|
||||
# install intel MPI
|
||||
RUN cd /tmp && \
|
||||
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
|
||||
tar zxvf l_mpi_2017.3.196.tgz && \
|
||||
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
|
||||
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
|
||||
/tmp/l_mpi_2017.3.196/silent.cfg && \
|
||||
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
|
||||
cd /tmp/l_mpi_2017.3.196 && \
|
||||
./install.sh -s silent.cfg && \
|
||||
cd .. && \
|
||||
rm -rf l_mpi_2017.3.196* && \
|
||||
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
|
||||
|
||||
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
|
||||
|
||||
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
|
||||
|
||||
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
# Install TensorFlow
|
||||
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
|
||||
scikit-learn
|
||||
|
||||
# Install Horovod, temporarily using CUDA stubs
|
||||
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
|
||||
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
|
||||
HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
|
||||
ldconfig
|
|
@ -1,78 +0,0 @@
|
|||
FROM nvidia/cuda:9.0-devel-ubuntu16.04
|
||||
|
||||
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
|
||||
ENV TENSORFLOW_VERSION=1.8.0
|
||||
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
|
||||
ENV NCCL_VERSION=2.2.12-1+cuda9.0
|
||||
|
||||
ENV PYTHON_VERSION=3.5
|
||||
|
||||
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
cmake \
|
||||
git \
|
||||
curl \
|
||||
nano \
|
||||
wget \
|
||||
ca-certificates \
|
||||
libcudnn7=$CUDNN_VERSION \
|
||||
libnccl2=$NCCL_VERSION \
|
||||
libnccl-dev=$NCCL_VERSION \
|
||||
libjpeg-dev \
|
||||
libpng-dev \
|
||||
python$PYTHON_VERSION \
|
||||
python$PYTHON_VERSION-dev
|
||||
|
||||
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
|
||||
|
||||
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py
|
||||
|
||||
# Install TensorFlow
|
||||
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn
|
||||
|
||||
# Install Open MPI
|
||||
RUN mkdir /tmp/openmpi && \
|
||||
cd /tmp/openmpi && \
|
||||
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
|
||||
tar zxf openmpi-3.0.0.tar.gz && \
|
||||
cd openmpi-3.0.0 && \
|
||||
./configure --enable-orterun-prefix-by-default && \
|
||||
make -j $(nproc) all && \
|
||||
make install && \
|
||||
ldconfig && \
|
||||
rm -rf /tmp/openmpi
|
||||
|
||||
# Install Horovod, temporarily using CUDA stubs
|
||||
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
|
||||
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
|
||||
ldconfig
|
||||
|
||||
# Create a wrapper for OpenMPI to allow running as root by default
|
||||
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
|
||||
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
|
||||
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
|
||||
chmod a+x /usr/local/bin/mpirun
|
||||
|
||||
# Configure OpenMPI to run good defaults:
|
||||
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
|
||||
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
|
||||
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
|
||||
echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
|
||||
|
||||
# Set default NCCL parameters
|
||||
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
|
||||
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
|
||||
|
||||
# Install OpenSSH for MPI to communicate between containers
|
||||
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
|
||||
mkdir -p /var/run/sshd
|
||||
|
||||
# Allow OpenSSH to talk to containers without asking for confirmation
|
||||
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
|
||||
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
|
||||
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
|
||||
DATA_DIR:=/mnt/imagenet
|
||||
PWD:=$(shell pwd)
|
||||
FAKE:='False'
|
||||
FAKE_DATA_LENGTH:=1281167
|
||||
name_prefix:=masalvar
|
||||
tag:=9-1.8-0.13.2 # CUDA - TF version - Horovod version
|
||||
image-intel:=$(name_prefix)/horovod-intel:$(tag)
|
||||
intel-path:=$(PWD)/Docker/horovod-intel
|
||||
image-open:=$(name_prefix)/horovod:$(tag)
|
||||
open-path:=$(PWD)/Docker/horovod
|
||||
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py
|
||||
include ../include/build.mk
|
||||
|
|
@ -1,435 +0,0 @@
|
|||
"""
|
||||
Trains ResNet50 using Horovod.
|
||||
|
||||
It requires the following env variables
|
||||
AZ_BATCHAI_INPUT_TRAIN
|
||||
AZ_BATCHAI_INPUT_TEST
|
||||
AZ_BATCHAI_OUTPUT_MODEL
|
||||
AZ_BATCHAI_JOB_TEMP_DIR
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
|
||||
import os
|
||||
from os import path
|
||||
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
from resnet_model import resnet_v1
|
||||
from toolz import pipe
|
||||
from timer import Timer
|
||||
import numpy as np
|
||||
|
||||
_WIDTH = 224
|
||||
_HEIGHT = 224
|
||||
_CHANNELS = 3
|
||||
_LR = 0.001
|
||||
_EPOCHS = int(os.getenv('EPOCHS', 1))
|
||||
_BATCHSIZE = 64
|
||||
_R_MEAN = 123.68
|
||||
_G_MEAN = 116.78
|
||||
_B_MEAN = 103.94
|
||||
_BUFFER = 256
|
||||
|
||||
|
||||
def _str_to_bool(in_str):
|
||||
if 't' in in_str.lower():
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
|
||||
_FAKE = _str_to_bool(os.getenv('FAKE', 'False'))
|
||||
_DATA_LENGTH = int(
|
||||
os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
|
||||
_VALIDATION = _str_to_bool(os.getenv('VALIDATION', 'False'))
|
||||
|
||||
if _DISTRIBUTED:
|
||||
import horovod.tensorflow as hvd
|
||||
|
||||
|
||||
tf_logger = logging.getLogger('tensorflow')
|
||||
tf_logger.setLevel(logging.INFO)
|
||||
stout = logging.StreamHandler(stream=sys.stdout)
|
||||
tf_logger.addHandler(stout)
|
||||
|
||||
def _get_rank():
|
||||
if _DISTRIBUTED:
|
||||
try:
|
||||
return hvd.rank()
|
||||
except ValueError:  # hvd.rank() raises if Horovod has not been initialised
|
||||
return 0
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
class HorovodAdapter(logging.LoggerAdapter):
|
||||
def __init__(self, logger):
|
||||
self._str_epoch = ''
|
||||
self._gpu_rank = 0
|
||||
super(HorovodAdapter, self).__init__(logger, {})
|
||||
|
||||
def set_epoch(self, epoch):
|
||||
self._str_epoch = '[Epoch {}]'.format(epoch)
|
||||
|
||||
def process(self, msg, kwargs):
|
||||
kwargs['extra'] = {
|
||||
'gpurank': _get_rank(),
|
||||
'epoch': self._str_epoch
|
||||
}
|
||||
return msg, kwargs
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_logger():
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
ch = logging.StreamHandler(stream=sys.stdout)
|
||||
formatter = logging.Formatter('%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s')
|
||||
ch.setFormatter(formatter)
|
||||
logger.addHandler(ch)
|
||||
adapter = HorovodAdapter(logger)
|
||||
return adapter
|
||||
|
||||
|
||||
def _load_image(filename, channels=_CHANNELS):
|
||||
return tf.to_float(tf.image.decode_png(tf.read_file(filename), channels=channels))
|
||||
|
||||
|
||||
def _resize(img, width=_WIDTH, height=_HEIGHT):
|
||||
return tf.image.resize_images(img, [height, width])
|
||||
|
||||
|
||||
def _centre(img, mean_subtraction=(_R_MEAN, _G_MEAN, _B_MEAN)):
|
||||
return tf.subtract(img, list(mean_subtraction))
|
||||
|
||||
|
||||
def _random_crop(img, width=_WIDTH, height=_HEIGHT, channels=_CHANNELS):
|
||||
return tf.random_crop(img, [height, width, channels])
|
||||
|
||||
|
||||
def _random_horizontal_flip(img):
|
||||
return tf.image.random_flip_left_right(img)
|
||||
|
||||
|
||||
def _preprocess_images(filename):
|
||||
return pipe(filename,
|
||||
_load_image,
|
||||
_resize,
|
||||
_centre)
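# toolz.pipe threads the value through the functions left to right:
# pipe(x, f, g, h) == h(g(f(x))), so this is _centre(_resize(_load_image(filename))).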
|
||||
|
||||
|
||||
def _preprocess_labels(label):
|
||||
return tf.cast(label, dtype=tf.int32)
|
||||
|
||||
def _transform_to_NCHW(img):
|
||||
return tf.transpose(img, [2, 0, 1])  # HWC -> CHW (NCHW once batched)
|
||||
|
||||
|
||||
def _parse_function_train(tensor, label):
|
||||
img_rgb = pipe(tensor,
|
||||
_random_crop,
|
||||
_random_horizontal_flip,
|
||||
_transform_to_NCHW)
|
||||
|
||||
return img_rgb, label
|
||||
|
||||
def _prep(filename, label):
|
||||
return tf.data.Dataset.from_tensor_slices(([_preprocess_images(filename)], [_preprocess_labels(label)]))
|
||||
|
||||
def _parse_function_eval(filename, label):
|
||||
return pipe(filename,
|
||||
_preprocess_images,
|
||||
_transform_to_NCHW), _preprocess_labels(label)
|
||||
|
||||
|
||||
def _get_optimizer(params, is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
# Horovod: add Horovod Distributed Optimizer.
|
||||
return hvd.DistributedOptimizer(tf.train.MomentumOptimizer(learning_rate=params["learning_rate"] * hvd.size(),
|
||||
momentum=0.9))
|
||||
else:
|
||||
return tf.train.MomentumOptimizer(learning_rate=params["learning_rate"], momentum=0.9)
|
||||
|
||||
def build_network(features, mode, params):
|
||||
network = resnet_v1(
|
||||
resnet_depth=50,
|
||||
num_classes=params['classes'],
|
||||
data_format='channels_first')
|
||||
return network(
|
||||
inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
|
||||
|
||||
def model_fn(features, labels, mode, params):
|
||||
"""
|
||||
features: This is the x-arg from the input_fn.
|
||||
labels: This is the y-arg from the input_fn,
|
||||
see e.g. train_input_fn for these two.
|
||||
mode: Either TRAIN, EVAL, or PREDICT
|
||||
params: User-defined hyper-parameters, e.g. learning-rate.
|
||||
"""
|
||||
logger = _get_logger()
|
||||
logger.info('Creating model in {} mode'.format(mode))
|
||||
|
||||
logits = build_network(features, mode, params)
|
||||
|
||||
if mode == tf.estimator.ModeKeys.PREDICT:
|
||||
# Softmax output of the neural network.
|
||||
y_pred = tf.nn.softmax(logits=logits)
|
||||
|
||||
# Classification output of the neural network.
|
||||
y_pred_cls = tf.argmax(y_pred, axis=1)
|
||||
|
||||
predictions = {
|
||||
'class_ids': y_pred_cls,
|
||||
'probabilities': y_pred,
|
||||
'logits': logits,
|
||||
}
|
||||
return tf.estimator.EstimatorSpec(mode=mode,
|
||||
predictions=predictions)
|
||||
|
||||
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
|
||||
loss = tf.reduce_mean(cross_entropy)
|
||||
|
||||
if mode == tf.estimator.ModeKeys.EVAL:
|
||||
# Softmax output of the neural network.
|
||||
y_pred = tf.nn.softmax(logits=logits)
|
||||
|
||||
# Classification output of the neural network.
|
||||
y_pred_cls = tf.argmax(y_pred, axis=1)
|
||||
|
||||
accuracy = tf.metrics.accuracy(labels=labels,  # labels are sparse class ids; no argmax needed
|
||||
predictions=y_pred_cls,
|
||||
name='acc_op')
|
||||
metrics = {'accuracy': accuracy}
|
||||
tf.summary.scalar('accuracy', accuracy[1])
|
||||
return tf.estimator.EstimatorSpec(mode=mode,
|
||||
eval_metric_ops=metrics,
|
||||
loss=loss)
|
||||
|
||||
optimizer = _get_optimizer(params)
|
||||
|
||||
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
|
||||
|
||||
return tf.estimator.EstimatorSpec(mode=mode,
|
||||
loss=loss,
|
||||
train_op=train_op)
|
||||
|
||||
|
||||
def _append_path_to(data_path, data_series):
|
||||
return data_series.apply(lambda x: path.join(data_path, x))
|
||||
|
||||
|
||||
def _load_training(data_dir):
|
||||
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
|
||||
train_df.filenames))
|
||||
|
||||
|
||||
def _load_validation(data_dir):
|
||||
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
|
||||
train_df.filenames))
|
||||
|
||||
|
||||
def _create_data_fn(train_path, test_path):
|
||||
logger = _get_logger()
|
||||
logger.info('Reading training data info')
|
||||
train_df = _load_training(train_path)
|
||||
|
||||
logger.info('Reading validation data info')
|
||||
validation_df = _load_validation(test_path)
|
||||
|
||||
train_labels = train_df[['num_id']].values.ravel() - 1
|
||||
validation_labels = validation_df[['num_id']].values.ravel() - 1
|
||||
|
||||
train_data = tf.data.Dataset.from_tensor_slices((train_df['filenames'].values, train_labels))
|
||||
train_data_transform = tf.contrib.data.map_and_batch(_parse_function_train, _BATCHSIZE, num_parallel_batches=5)
|
||||
train_data = train_data.apply(tf.contrib.data.parallel_interleave(
|
||||
_prep, cycle_length=5, buffer_output_elements=1024))
|
||||
|
||||
train_data = (train_data.shuffle(1024)
|
||||
.repeat()
|
||||
.apply(train_data_transform)
|
||||
.prefetch(_BUFFER))
|
||||
|
||||
validation_data = tf.data.Dataset.from_tensor_slices((validation_df['filenames'].values, validation_labels))
|
||||
validation_data_transform = tf.contrib.data.map_and_batch(_parse_function_eval, _BATCHSIZE, num_parallel_batches=4)
|
||||
validation_data = (validation_data.apply(validation_data_transform)
|
||||
.prefetch(_BUFFER))
|
||||
|
||||
def _train_input_fn():
|
||||
return train_data.make_one_shot_iterator().get_next()
|
||||
|
||||
def _validation_input_fn():
|
||||
return validation_data.make_one_shot_iterator().get_next()
|
||||
|
||||
_train_input_fn.length = len(train_df)
|
||||
_validation_input_fn.length = len(validation_df)
|
||||
_train_input_fn.classes = 1000
|
||||
_validation_input_fn.classes = 1000
|
||||
|
||||
return _train_input_fn, _validation_input_fn
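# Note: the .length and .classes attributes attached to the input functions
# above let main() size the training loop (steps per epoch) without
# re-reading the CSV files.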
|
||||
|
||||
|
||||
def _create_data(batch_size, num_batches, dim, channels, seed=42):
|
||||
np.random.seed(seed)
|
||||
return np.random.rand(batch_size * num_batches,
|
||||
channels,
|
||||
dim[0],
|
||||
dim[1]).astype(np.float32)
|
||||
|
||||
|
||||
def _create_labels(batch_size, num_batches, n_classes):
|
||||
return np.random.choice(n_classes, batch_size * num_batches)
|
||||
|
||||
|
||||
def _create_fake_data_fn(train_length=_DATA_LENGTH, valid_length=50000, num_batches=40):
|
||||
""" Creates fake dataset
|
||||
|
||||
Data is returned in NCHW since this tends to be faster on GPUs
|
||||
"""
|
||||
logger = _get_logger()
|
||||
logger.info('Creating fake data')
|
||||
|
||||
data_array = _create_data(_BATCHSIZE, num_batches, (_HEIGHT, _WIDTH), _CHANNELS)
|
||||
labels_array = _create_labels(_BATCHSIZE, num_batches, 1000)
|
||||
|
||||
def fake_data_generator():
|
||||
for i in range(num_batches):
|
||||
yield data_array[i * _BATCHSIZE:(i + 1) * _BATCHSIZE], labels_array[i * _BATCHSIZE:(i + 1) * _BATCHSIZE]
|
||||
|
||||
train_data = tf.data.Dataset.from_generator(fake_data_generator,
|
||||
output_types=(tf.float32, tf.int32),
|
||||
output_shapes=(tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
|
||||
tf.TensorShape([None])))
|
||||
|
||||
train_data = (train_data.shuffle(40 * _BATCHSIZE)
|
||||
.repeat()
|
||||
.prefetch(_BUFFER))
|
||||
|
||||
validation_data = tf.data.Dataset.from_generator(fake_data_generator,
|
||||
output_types=(tf.float32, tf.int32),
|
||||
output_shapes=(
|
||||
tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
|
||||
tf.TensorShape([None])))
|
||||
|
||||
validation_data = (validation_data.prefetch(_BUFFER))
|
||||
|
||||
def _train_input_fn():
|
||||
return train_data.make_one_shot_iterator().get_next()
|
||||
|
||||
def _validation_input_fn():
|
||||
return validation_data.make_one_shot_iterator().get_next()
|
||||
|
||||
_train_input_fn.length = train_length
|
||||
_validation_input_fn.length = valid_length
|
||||
_train_input_fn.classes = 1000
|
||||
_validation_input_fn.classes = 1000
|
||||
|
||||
return _train_input_fn, _validation_input_fn
|
||||
|
||||
|
||||
def _get_runconfig(is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
# Horovod: pin GPU to be used to process local rank (one GPU per process)
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
config.gpu_options.visible_device_list = str(hvd.local_rank())
|
||||
|
||||
return tf.estimator.RunConfig(save_checkpoints_steps=None,
|
||||
save_checkpoints_secs=None,
|
||||
session_config=config)
|
||||
else:
|
||||
return tf.estimator.RunConfig(save_checkpoints_steps=None)
|
||||
|
||||
|
||||
def _get_model_dir(is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
# Horovod: save checkpoints only on worker 0 to prevent other workers from
|
||||
# corrupting them.
|
||||
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL') if hvd.rank() == 0 else os.getenv('AZ_BATCHAI_JOB_TEMP_DIR')
|
||||
else:
|
||||
return os.getenv('AZ_BATCHAI_OUTPUT_MODEL')
|
||||
|
||||
|
||||
def _get_hooks(is_distributed=_DISTRIBUTED):
|
||||
logger = _get_logger()
|
||||
if is_distributed:
|
||||
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
|
||||
logger.info('Rank: {} Cluster Size {}'.format(hvd.local_rank(), hvd.size()))
|
||||
return [bcast_hook]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
def _is_master(is_distributed=_DISTRIBUTED):
|
||||
if is_distributed:
|
||||
if hvd.rank() == 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def _log_summary(data_length, duration):
|
||||
logger = _get_logger()
|
||||
images_per_second = data_length / duration
|
||||
logger.info('Data length: {}'.format(data_length))
|
||||
logger.info('Total duration: {:.3f}'.format(duration))
|
||||
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
|
||||
logger.info('Batch size: (Per GPU {}: Total {})'.format(_BATCHSIZE,
|
||||
hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE))
|
||||
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
|
||||
logger.info('Num GPUs: {}'.format(hvd.size() if _DISTRIBUTED else 1))
|
||||
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
if _DISTRIBUTED:
|
||||
# Horovod: initialize Horovod.
|
||||
hvd.init()
|
||||
logger = _get_logger()
|
||||
logger.info("Runnin Distributed")
|
||||
else:
|
||||
logger = _get_logger()
|
||||
|
||||
logger.info("Tensorflow version {}".format(tf.__version__))
|
||||
if _FAKE:
|
||||
train_input_fn, validation_input_fn = _create_fake_data_fn()
|
||||
else:
|
||||
train_input_fn, validation_input_fn = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
|
||||
os.getenv('AZ_BATCHAI_INPUT_TEST'))
|
||||
|
||||
run_config = _get_runconfig()
|
||||
model_dir = _get_model_dir()
|
||||
|
||||
params = {"learning_rate": _LR,
|
||||
"classes": train_input_fn.classes}
|
||||
logger.info('Creating estimator with params: {}'.format(params))
|
||||
model = tf.estimator.Estimator(model_fn=model_fn,
|
||||
params=params,
|
||||
model_dir=model_dir,
|
||||
config=run_config)
|
||||
|
||||
hooks = _get_hooks()
|
||||
num_gpus = hvd.size() if _DISTRIBUTED else 1
|
||||
with Timer(output=logger.info, prefix="Training") as t:
|
||||
logger.info('Training...')
|
||||
model.train(input_fn=train_input_fn,
|
||||
steps=_EPOCHS * train_input_fn.length // (_BATCHSIZE * num_gpus),
|
||||
hooks=hooks)
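# Worked example of the steps arithmetic: with the default imagenet epoch of
# 1,281,167 images, _BATCHSIZE=64 and, say, 4 GPUs, this is
# 1 * 1281167 // (64 * 4) = 5004 optimiser steps.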
|
||||
|
||||
_log_summary(_EPOCHS * train_input_fn.length, t.elapsed)
|
||||
|
||||
if _is_master() and not _FAKE and _VALIDATION:
|
||||
with Timer(output=logger.info, prefix="Testing"):
|
||||
logger.info('Testing...')
|
||||
model.evaluate(input_fn=validation_input_fn)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
42 Makefile
|
@ -1,27 +1,19 @@
|
|||
define PROJECT_HELP_MSG
|
||||
Usage:
|
||||
make help show this message
|
||||
make build build docker image
|
||||
make push push container
|
||||
make run run benchmarking container
|
||||
endef
|
||||
export PROJECT_HELP_MSG
|
||||
PWD:=$(shell pwd)
|
||||
# This makefile is used to test the cookiecutter
|
||||
# To use this you will need to create a .dev_env file and add the subscription_id to it
|
||||
include .dev_env
|
||||
|
||||
image_name:=masalvar/batchai-ddl
|
||||
cookiecutter:
|
||||
ifdef subscription_id
|
||||
cd ../ && cookiecutter AMLDistCC --no-input \
|
||||
subscription_id=${subscription_id} \
|
||||
resource_group=mstestdistrg \
|
||||
data=/mnt/imagenet_test \
|
||||
vm_size=Standard_NC24rs_v3 \
|
||||
project_name=mstestdist \
|
||||
image_name=mstestdist
|
||||
else
|
||||
@echo "You need to create a .dev_env file with subscription_id in it"
|
||||
endif
|
||||
|
||||
help:
|
||||
echo "$$PROJECT_HELP_MSG" | less
|
||||
|
||||
build:
|
||||
docker build -t $(image_name) Docker
|
||||
|
||||
run:
|
||||
docker run -v $(PWD):/workspace -it $(image_name) bash
|
||||
|
||||
push:
|
||||
docker push $(image_name)
|
||||
|
||||
|
||||
|
||||
.PHONY: help cookiecutter build run push clean
|
||||
clean:
|
||||
rm -rf ../mstestdist
|
|
@ -1,3 +0,0 @@
|
|||
FROM pytorch/pytorch:0.4_cuda9_cudnn7
|
||||
|
||||
RUN pip install --no-cache-dir h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
|
|
@ -1,11 +0,0 @@
|
|||
DATA_DIR:=/mnt/imagenet
|
||||
PWD:=$(shell pwd)
|
||||
FAKE:='False'
|
||||
FAKE_DATA_LENGTH:=1281167
|
||||
name_prefix:=iliauk
|
||||
tag:=latest
|
||||
image-open:=$(name_prefix)/pytorch_gloo:$(tag)
|
||||
open-path:=$(PWD)/Docker
|
||||
script:=\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py
|
||||
include ../include/build.mk
|
||||
|
|
@ -1,283 +0,0 @@
|
|||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from os import path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import multiprocessing
|
||||
from toolz import pipe
|
||||
from timer import Timer
|
||||
from PIL import Image
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.optim
|
||||
import torch.utils.data
|
||||
import torchvision.transforms as transforms
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
import torchvision.models as models
|
||||
import torch.distributed as dist
|
||||
import torch.utils.data.distributed
|
||||
|
||||
print("PyTorch: ", torch.__version__)
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
# Distributed training settings
|
||||
parser = argparse.ArgumentParser(description='PyTorch ResNet Example')
|
||||
parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes')
|
||||
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training')
|
||||
parser.add_argument('--dist-backend', default='gloo', type=str, help='distributed backend')
|
||||
parser.add_argument('--rank', default=-1, type=int, help='rank of the worker')
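# Example launch (hypothetical address), one process per node:
#   node 0: python imagenet_pytorch_gloo.py --world-size 2 --rank 0 --dist-url tcp://10.0.0.4:23456
#   node 1: python imagenet_pytorch_gloo.py --world-size 2 --rank 1 --dist-url tcp://10.0.0.4:23456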
|
||||
|
||||
_WIDTH = 224
|
||||
_HEIGHT = 224
|
||||
_LR = 0.001
|
||||
_EPOCHS = 1
|
||||
_NUM_GPU = int(torch.cuda.device_count())
|
||||
_BATCHSIZE = 64*_NUM_GPU
|
||||
_RGB_MEAN = [0.485, 0.456, 0.406]
|
||||
_RGB_SD = [0.229, 0.224, 0.225]
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
def _str_to_bool(in_str):
|
||||
if 't' in in_str.lower():
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
_FAKE = _str_to_bool(os.getenv('FAKE', 'True'))
|
||||
_DATA_LENGTH = int(os.getenv('FAKE_DATA_LENGTH', 1281167)) # How much fake data to simulate, default to size of imagenet dataset
|
||||
|
||||
#_DISTRIBUTED = _str_to_bool(os.getenv('DISTRIBUTED', 'False'))
|
||||
_DISTRIBUTED = True
|
||||
_CPU_COUNT = 8
|
||||
logger.info("Distributed mode: ", _DISTRIBUTED)
|
||||
logger.info("CPU Count: ", _CPU_COUNT)
|
||||
|
||||
|
||||
def _append_path_to(data_path, data_series):
|
||||
return data_series.apply(lambda x: path.join(data_path, x))
|
||||
|
||||
|
||||
def _load_training(data_dir):
|
||||
train_df = pd.read_csv(path.join(data_dir, 'train.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'train'),
|
||||
train_df.filenames))
|
||||
|
||||
|
||||
def _load_validation(data_dir):
|
||||
train_df = pd.read_csv(path.join(data_dir, 'validation.csv'))
|
||||
return train_df.assign(filenames=_append_path_to(path.join(data_dir, 'validation'),
|
||||
train_df.filenames))
|
||||
|
||||
|
||||
def _create_data_fn(train_path, test_path):
|
||||
logger.info('Reading training data info')
|
||||
train_df = _load_training(train_path)
|
||||
logger.info('Reading validation data info')
|
||||
validation_df = _load_validation(test_path)
|
||||
# File-path
|
||||
train_X = train_df['filenames'].values
|
||||
validation_X = validation_df['filenames'].values
|
||||
# Integer class ids (not one-hot) for torch
|
||||
train_labels = train_df[['num_id']].values.ravel()
|
||||
validation_labels = validation_df[['num_id']].values.ravel()
|
||||
# Index starts from 0
|
||||
train_labels -= 1
|
||||
validation_labels -= 1
|
||||
return train_X, train_labels, validation_X, validation_labels
|
||||
|
||||
|
||||
class ImageNet(Dataset):
|
||||
def __init__(self, img_locs, img_labels, transform=None):
|
||||
self.img_locs, self.labels = img_locs, img_labels
|
||||
self.transform = transform
|
||||
logger.info("Loaded {} labels and {} images".format(len(self.labels), len(self.img_locs)))
|
||||
|
||||
def __getitem__(self, idx):
|
||||
im_file = self.img_locs[idx]
|
||||
label = self.labels[idx]
|
||||
with open(im_file, 'rb') as f:
|
||||
im_rgb = Image.open(f)
|
||||
# Make sure 3-channel (RGB)
|
||||
im_rgb = im_rgb.convert('RGB')
|
||||
if self.transform is not None:
|
||||
im_rgb = self.transform(im_rgb)
|
||||
return im_rgb, label
|
||||
|
||||
def __len__(self):
|
||||
return len(self.img_locs)
|
||||
|
||||
|
||||
class FakeData(Dataset):
|
||||
def __init__(self,
|
||||
batch_size=32,
|
||||
num_batches=20,
|
||||
dim=(224, 224),
|
||||
n_channels=3,
|
||||
n_classes=10,
|
||||
length=_DATA_LENGTH,
|
||||
seed=42,
|
||||
data_transform=None):
|
||||
self.dim = dim
|
||||
self.n_channels = n_channels
|
||||
self.n_classes = n_classes
|
||||
self.num_batches = num_batches
|
||||
self._data = _create_data(batch_size, self.num_batches, self.dim, self.n_channels)
|
||||
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
|
||||
self.translation_index = np.random.choice(len(self._labels), length)
|
||||
self._length = length
|
||||
|
||||
self._data_transform = data_transform
|
||||
|
||||
logger.info("Creating fake data {} labels and {} images".format(n_classes, len(self._data)))
|
||||
|
||||
def __getitem__(self, idx):
|
||||
|
||||
logger.debug('Retrieving samples')
|
||||
logger.debug(str(idx))
|
||||
tr_index_array = self.translation_index[idx]
|
||||
|
||||
if self._data_transform is not None:
|
||||
data = self._data_transform(self._data[tr_index_array])
|
||||
else:
|
||||
data = self._data[tr_index_array]
|
||||
|
||||
return data, self._labels[tr_index_array]
|
||||
|
||||
def __len__(self):
|
||||
return self._length
|
||||
|
||||
|
||||
def _log_summary(data_length, duration):
|
||||
|
||||
images_per_second = data_length / duration
|
||||
logger.info('Data length: {}'.format(data_length))
|
||||
logger.info('Total duration: {:.3f}'.format(duration))
|
||||
logger.info('Total images/sec: {:.3f}'.format(images_per_second))
|
||||
logger.info('Batch size: (Per GPU {}: Total {})'.format(int(_BATCHSIZE/_NUM_GPU), _BATCHSIZE))
|
||||
logger.info('Distributed: {}'.format('True' if _DISTRIBUTED else 'False'))
|
||||
logger.info('Num GPUs: {}'.format(_NUM_GPU))  # May need to pass in argument to get this
|
||||
logger.info('Dataset: {}'.format('Synthetic' if _FAKE else 'Imagenet'))
|
||||
|
||||
def _create_data(batch_size, num_batches, dim, channels, seed=42):
|
||||
np.random.seed(seed)
|
||||
return np.random.rand(batch_size * num_batches,
|
||||
channels,
|
||||
dim[0],
|
||||
dim[1]).astype(np.float32)
|
||||
|
||||
|
||||
def _create_labels(batch_size, num_batches, n_classes):
|
||||
return np.random.choice(n_classes, batch_size * num_batches)
|
||||
|
||||
|
||||
def train(train_loader, model, criterion, optimizer, epoch):
|
||||
logger.info("Training ...")
|
||||
model.train()
|
||||
for i, (input, target) in enumerate(train_loader):
|
||||
input, target = input.cuda(non_blocking=True), target.cuda(non_blocking=True)
|
||||
# compute output
|
||||
output = model(input)
|
||||
loss = criterion(output, target)
|
||||
# compute gradient and do SGD step
|
||||
optimizer.zero_grad()
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
|
||||
def validate(val_loader, model, criterion):
|
||||
logger.info("Validating ...")
|
||||
model.eval()
|
||||
correct = 0
|
||||
total = 0
|
||||
with torch.no_grad():
|
||||
for i, (input, target) in enumerate(val_loader):
|
||||
target = target.cuda(non_blocking=True)
|
||||
# compute output
|
||||
output = model(input)
|
||||
_, predicted = torch.max(output.data, 1)
|
||||
total += target.size(0)
|
||||
correct += (predicted == target).sum().item()
|
||||
logger.info('Top-1 Accuracy: %.2f %%' % (100 * correct / total))
|
||||
|
||||
|
||||
def main():
|
||||
# Autotune
|
||||
cudnn.benchmark = True
|
||||
# Load symbol
|
||||
model = models.__dict__['resnet50'](pretrained=False)
|
||||
if _DISTRIBUTED:
|
||||
logger.info('Running in distributed mode')
|
||||
dist.init_process_group(
|
||||
backend=args.dist_backend,
|
||||
init_method=args.dist_url,
|
||||
world_size=args.world_size,
|
||||
rank=args.rank)
|
||||
model.cuda()
|
||||
model = torch.nn.parallel.DistributedDataParallel(model)
|
||||
else:
|
||||
model = torch.nn.DataParallel(model).cuda()
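# Note: DataParallel splits each batch across all local GPUs, which is why
# _BATCHSIZE above is defined as 64 * _NUM_GPU (i.e. 64 images per GPU).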
|
||||
# Optimisers
|
||||
criterion = nn.CrossEntropyLoss().cuda()
|
||||
optimizer = torch.optim.SGD(model.parameters(), lr=_LR)
|
||||
# Data-sets
|
||||
if _FAKE:
|
||||
logger.info("Setting up fake loaders")
|
||||
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
|
||||
else:
|
||||
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
|
||||
train_X, train_y, valid_X, valid_y = _create_data_fn(os.getenv('AZ_BATCHAI_INPUT_TRAIN'),
|
||||
os.getenv('AZ_BATCHAI_INPUT_TEST'))
|
||||
train_dataset = ImageNet(
|
||||
train_X,
|
||||
train_y,
|
||||
transforms.Compose([
|
||||
transforms.RandomResizedCrop(_WIDTH),
|
||||
transforms.RandomHorizontalFlip(),
|
||||
transforms.ToTensor(),
|
||||
normalize]))
|
||||
|
||||
|
||||
if _DISTRIBUTED:
|
||||
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
|
||||
else:
|
||||
train_sampler = None
|
||||
|
||||
# Data-loaders
|
||||
train_loader = torch.utils.data.DataLoader(
|
||||
train_dataset, batch_size=_BATCHSIZE, shuffle=(train_sampler is None), num_workers=_CPU_COUNT, sampler=train_sampler)
|
||||
|
||||
#val_loader = torch.utils.data.DataLoader(
|
||||
# ImageNet(
|
||||
# valid_X,
|
||||
# valid_y,
|
||||
# transforms.Compose([
|
||||
# transforms.Resize(256),
|
||||
# transforms.CenterCrop(_WIDTH),
|
||||
# transforms.ToTensor(),
|
||||
# normalize])), batch_size=_BATCHSIZE, shuffle=False,
|
||||
# num_workers=_CPU_COUNT)
|
||||
|
||||
# Main training-loop
|
||||
for epoch in range(_EPOCHS):
|
||||
if _DISTRIBUTED:
|
||||
train_sampler.set_epoch(epoch)
|
||||
# Train
|
||||
with Timer(output=logger.info, prefix="Training") as t:
|
||||
train(train_loader, model, criterion, optimizer, epoch)
|
||||
_log_summary(len(train_dataset), t.elapsed)
|
||||
|
||||
# Validate
|
||||
#with Timer(output=logger.info, prefix="Testing"):
|
||||
# validate(val_loader, model, criterion)
|
||||
|
||||
print("Finished")
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("Pytorch")
|
||||
main()
|
243 README.md
|
@ -1,14 +1,245 @@
|
|||
# Distributed Deep Learning
|
||||
# Introduction
|
||||
This repo contains a cookiecutter template for running distributed training of deep learning models using
|
||||
Azure Machine Learning. You can create clusters with 0 nodes, which incur no cost, and scale up to hundreds of nodes when needed. It is also possible to use low-priority nodes to reduce costs even further.
|
||||
|
||||
This repo contains a number of examples of training a ResNet50 network with the Imagenet dataset in various Deep Learning frameworks.
|
||||
The project contains the following:
|
||||
#### Tensorflow Benchmark
|
||||
This is a demo template that allows you to easily run [tf_cnn_benchmarks](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) on Azure ML. This is a great way to test performance as well as compare against other platforms.
|
||||
#### Tensorflow Imagenet
|
||||
This is another demo template that shows you how to train a ResNet50 model using Imagenet on Azure. We include scripts for processing the imagenet data, transforming it into TF Records, and leveraging AzCopy to quickly upload the data to the cloud.
|
||||
#### Tensorflow Template
|
||||
This is a blank template you can use for your own distributed training projects. It allows you to leverage all the tooling built around the previous two demos to reduce the time it takes to run your model in a distributed fashion on Azure.
|
||||
|
||||
[Horovod + Keras](HorovodKeras)
|
||||
[Horovod + Tensorflow](HorovodTF)
|
||||
[Horovod + PyTorch](HorovodPytorch)
|
||||
|
||||
# Prerequisites
|
||||
Before you get started, you need a PC running Ubuntu with the following installed:
|
||||
[Docker installed](https://docs.docker.com/install/linux/docker-ce/ubuntu/)
|
||||
[Nvidia runtime for docker](https://github.com/NVIDIA/nvidia-container-runtime) [Required for local execution]
|
||||
[Cookiecutter installed](https://cookiecutter.readthedocs.io/en/latest/)
|
||||
[Git installed](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)
|
||||
|
||||
> **Note:**
|
||||
> You will need to run docker without sudo, to do this run:
|
||||
> ```
|
||||
> sudo usermod -aG docker $USER
|
||||
> newgrp docker
|
||||
>```
|
||||
|
||||
# Setup
|
||||
## Using the template
|
||||
|
||||
Once you have Cookiecutter installed you can either directly invoke project creation as follows:
|
||||
```bash
|
||||
cookiecutter gh:Microsoft/AMLDistCC
|
||||
```
|
||||
or clone locally and then invoke
|
||||
```bash
|
||||
git clone https://github.com/Microsoft/AMLDistCC.git
|
||||
cookiecutter AMLDistCC
|
||||
```
|
||||
Cookiecutter will then ask you about a number of fields which it will use to construct your project.
|
||||
If you simply want to accept a default, don't type anything and just press enter. Many of the fields can be left at their default values; the ones that are absolutely necessary are _highlighted_.
|
||||
|
||||
**project_title:** The title of your project
|
||||
**project_name:** The folder in which your project will be created. Make sure it is a valid Linux folder name.
|
||||
**resource_group:** The name of the resource group in Azure under which all the resources will be created.
|
||||
It is fine if it already exists
|
||||
**workspace:** The AML workspace that the project will use. If it doesn't already exist, it will be created.
|
||||
**sub_id:** The subscription id for your project. You can look this up on the portal or run a command in the
|
||||
cloud shell to get it. It isn't mandatory, though; the application will give you an option to select it later.
|
||||
**vm_size:** The VM type to use for distributed training
|
||||
**minimum_number_nodes:** The minimum number of nodes in the cluster. Set to 0 if you want it to scale down
|
||||
after use to reduce costs
|
||||
**maximum_number_nodes:** The maximum number of nodes in the cluster
|
||||
**cluster_name:** The name of the cluster to use. It will be created if it doesn't exist.
|
||||
**container_registry:** The name of your Docker Hub or other registry account to which you may want to push your control-plane docker container. If you don't have one, or don't want to push the container, simply leave the default.
|
||||
**type:** The type of project you want:
|
||||
* all: All of them
|
||||
* template: Just create a template for distributed training
|
||||
* benchmark: Create project that will run the Tensorflow benchmarks
|
||||
* imagenet: Create an example project that will run against the imagenet data. (You will need to download the imagenet data)
|
||||
|
||||
**region:** Which region to create Azure resources in
|
||||
**experiment_name:** The name of the experiment
|
||||
**data:** The absolute path on your computer where you will store the imagenet data. The location needs around 400GB of free space.
|
||||
**image_name:** The name to give the control plane docker image
|
||||
**datastore_name:** Name of the datastore that will be created as part of the project
|
||||
**container_name:** The name of the container in your storage account that will hold the data
|
||||
|
||||
Once the project is created you will still be able to change many of the above options, as they will be present in the .env file that will be created.
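For illustration, the generated .env might contain entries along these lines (the key names here are indicative placeholders based on the cookiecutter questions above, not the exact contents of the generated file):
```
subscription_id=<your-subscription-id>
resource_group=mstestdistrg
vm_size=Standard_NC24rs_v3
cluster_name=<your-cluster-name>
```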
|
||||
|
||||
## Building environment
|
||||
Distributed training is complex and often has a number of moving parts. To reduce the overhead of installing packages and managing environments we use a docker container to encapsulate our environment. Once you have created the project, simply navigate to the root folder created by cookiecutter and run:
|
||||
```bash
|
||||
make build
|
||||
```
|
||||
This will build your docker container. Inside it you will find an appropriately set up conda environment, a number of utilities such as AzCopy, and everything else you need to run your distributed training job.
|
||||
Once your container is built run:
|
||||
```bash
|
||||
make run
|
||||
```
|
||||
This will drop you into an environment inside your container in a tmux session (for a tutorial on tmux see [here](https://www.hamvocke.com/blog/a-quick-and-easy-guide-to-tmux/)). The tmux prefix key has been mapped to **ctrl+a** rather than the standard ctrl+b so as not to interfere with an outer tmux session if you are already a tmux user; you can alter this in the tmux.conf file in the Docker folder. The container maps the location you launched it from to /workspace inside the container, so you can edit files in the project folder outside the container and the changes will be reflected inside it.
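A typical stanza for binding the prefix to **ctrl+a** looks like the following (a sketch of the common pattern, not necessarily the exact contents of the repo's tmux.conf):
```
unbind C-b          # drop the default ctrl+b prefix
set -g prefix C-a   # use ctrl+a as the prefix instead
bind C-a send-prefix
```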
|
||||
|
||||
## Imagenet data
|
||||
If you have selected **all** or **imagenet** in the type question during cookiecutter invocation, then you will need to have **ILSVRC2012_img_train.tar** and **ILSVRC2012_img_val.tar** present in the directory you specified as your data directory. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account) and find the page for ILSVRC2012; you will need to download the two files mentioned earlier.
|
||||
|
||||
## Template selection
|
||||
Based on the option you selected for **type** during the cookiecutter invocation you will get all or one of the options below. Cookiecutter will create your project folder, which will contain the template folders. Once inside your project folder, make sure you have run the **make build** and **make run** commands described in the _Building environment_ section above. Once you run the run command you will be greeted by a prompt; this is now your control plane. First you will need to set everything up. To do this run
|
||||
```bash
|
||||
inv setup
|
||||
```
|
||||
It will ask you to log in, so follow the prompts in the terminal. If you selected **all** as the template type, it will also prepare the imagenet data.
|
||||
Now you will be ready to run the templates.
|
||||
|
||||
#### Tensorflow Benchmark
|
||||
This is a demo template that allows you to easily run tf_cnn_benchmarks on Azure ML. This is a great way to test performance as well as compare against other platforms. To use this you must select either **benchmark** or **all** when invoking cookiecutter.
|
||||
Once setup is complete then simply run:
|
||||
```bash
|
||||
inv tf-benchmark.submit.local.synthetic
|
||||
```
|
||||
to run things locally on a single GPU. Note that the first time you run things you will have to build the environment.
|
||||
To run things on a cluster simply run:
|
||||
```bash
|
||||
inv tf-benchmark.submit.remote.synthetic
|
||||
```
|
||||
Note that this will create the cluster if it wasn't created earlier and create the appropriate environment.
|
||||
|
||||
#### Tensorflow Imagenet
|
||||
This is the second demo template; it will train a ResNet50 model on imagenet. It offers the option of using synthetic data, image data, or tfrecords. To use this you must select either **imagenet** or **all** when cookiecutter asks what type of project you want to create.
|
||||
To run things locally using synthetic data, simply run:
|
||||
```
|
||||
inv tf-imagenet.submit.local.synthetic
|
||||
```
|
||||
|
||||
To run things on a remote cluster with real data in tfrecords format simply run:
|
||||
```
|
||||
inv tf-imagenet.submit.remote.tfrecords
|
||||
```
|
||||
|
||||
This only covers a small number of commands; to see the full list of commands, simply run inv --list.
|
||||
#### Tensorflow Experiment
|
||||
This is the option to use if you want to run your own training script. It is up to you to add your training scripts and modify the tensorflow_experiment.py file to run the appropriate commands. If you want to see how things are invoked, simply look at the other examples.
|
||||
|
||||
# Architecture
|
||||
Below is a diagram that shows how the project is set up.
|
||||
|
||||
<p align="center">
|
||||
<img width="1000" src="./images/architecture1.png">
|
||||
</p>
|
||||
|
||||
The docker container you created using **make build** is the control plane, and from there we can invoke jobs to execute either locally or in the cloud. Local execution is meant for debugging and will run on a single GPU. The mapping of data locations is handled by the control scripts. During local execution the appropriate location is mapped into the container. During remote execution the data store created during setup will be mounted onto each of the VMs in the cluster.
|
||||
|
||||
## Project structure
|
||||
The original project structure is as shown below.
|
||||
|
||||
```.
|
||||
├── cookiecutter.json <-- Cookiecutter json that holds all the variables for the projects
|
||||
├── hooks
|
||||
│ ├── post_gen_project.py
|
||||
│ └── pre_gen_project.py
|
||||
├── images
|
||||
│ └── demo.svg
|
||||
├── LICENSE
|
||||
├── README.md <-- This readme
|
||||
└── {{cookiecutter.project_name}}
|
||||
├── _dotenv_template <-- Template that is read and translated into .env file
|
||||
├── control <-- Holds all files for the control plane
|
||||
│ ├── Docker <-- Contains the files used to build the control plane docker container
|
||||
│ │ ├── azure_requirements.txt <-- Azure python requirements
|
||||
│ │ ├── bash.completion <-- Completion script for invoke
|
||||
│ │ ├── dockerfile
|
||||
│ │ ├── environment.yml <-- Conda environment specification for control plane
|
||||
│ │ ├── jupyter_notebook_config.py
|
||||
│ │ └── tmux.conf <-- Tmux configuration
|
||||
│ └── src
|
||||
│ ├── aml_compute.py <-- Module that holds methods for creating cluster and submitting experiments using Azure ML
|
||||
│ ├── config.py <-- Module for loading and working with .env config
|
||||
│ └── logging.conf <-- Logging configuration for control plane
|
||||
├── Makefile <-- Makefile to build and run control plane
|
||||
├── scripts
|
||||
│ ├── convert_imagenet_to_tf_records.py <-- Script for transforming imagenet data to tf records
|
||||
│ ├── image.py <-- Invoke module for working with images
|
||||
│ ├── imagenet_nounid_to_class.json <-- Imagenet nounid lookup
|
||||
│ ├── prepare_imagenet.py <-- Script for preparing imagenet data
|
||||
│ ├── storage.py <-- Invoke module for using Azure storage
|
||||
│ └── tfrecords.py <-- Invoke module for working with tf records
|
||||
├── tasks.py <-- Main invoke module
|
||||
├── TensorFlow_benchmark <-- Template for running Tensorflow benchmarks
|
||||
│ ├── environment_cpu.yml
|
||||
│ ├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
|
||||
│ ├── src <-- Folder where tensorflow benchmarks code will be cloned into
|
||||
| └── tensorflow_benchmark.py <-- Invoke module for running benchmarks
|
||||
├── TensorFlow_experiment <-- Tensorflow distributed training template [Put your code here]
|
||||
│ ├── environment_cpu.yml
|
||||
│ ├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
|
||||
│ ├── src
|
||||
│ │ ├── logging.conf
|
||||
│ │ └── train_model.py <-- Template file
|
||||
│ └── tensorflow_experiment.py <-- Invoke module for running template
|
||||
└── TensorFlow_imagenet
|
||||
├── environment_cpu.yml
|
||||
├── environment_gpu.yml <-- Conda specification file used by Azure ML to create environment to run project in
|
||||
├── src <-- Code for training ResNet50 model on imagenet
|
||||
│ ├── data
|
||||
│ │ ├── __init__.py
|
||||
│ │ ├── images.py
|
||||
│ │ ├── synthetic.py
|
||||
│ │ └── tfrecords.py
|
||||
│ ├── defaults.py
|
||||
│ ├── imagenet_preprocessing.py
|
||||
│ ├── logging.conf
|
||||
│ ├── resnet_main.py <-- Main entry script
|
||||
│ ├── resnet_model.py
|
||||
│ ├── resnet_run_loop.py
|
||||
│ ├── timer.py
|
||||
│ └── utils.py
|
||||
└── tensorflow_imagenet.py <-- Invoke module for running imagenet experiment
|
||||
```
|
||||
Depending on the options chosen only certain branches will be moved over to your project.
|
||||
|
||||
|
||||
## Options
|
||||
These are the options when using the template. They can differ depending on the type of project you choose to create. To see this list yourself simply run:
|
||||
```
|
||||
inv --list
|
||||
```
|
||||
```
|
||||
delete Delete the resource group and all associated resources
|
||||
experiments Prints list of experiments
|
||||
interactive (i) Open IPython terminal and load in modules to work with AzureML
|
||||
login Log in to Azure CLI
|
||||
runs Prints information on last N runs in specified experiment
|
||||
select-subscription Select Azure subscription to use
|
||||
setup Setup the environment and process the imagenet data
|
||||
tensorboard            Runs tensorboard in a separate tmux session
|
||||
storage.create-resource-group
|
||||
storage.store-key Retrieves premium storage account key from Azure and stores it in .env file
|
||||
storage.image.create-container Creates container based on the parameters found in the .env file
|
||||
storage.image.download-data Download training and validation data from blob container specified in .env file
|
||||
storage.image.download-training Download training data from blob container specified in .env file
|
||||
storage.image.download-validation Download validation data from blob container specified in .env file
|
||||
storage.image.prepare-imagenet Prepare imagenet data found in download_dir and push results to target_dir
|
||||
storage.image.upload-data Upload training and validation data to container specified in .env file
|
||||
storage.image.upload-training-data Upload training data to container specified in .env file
|
||||
storage.image.upload-validation-data Upload validation data to container specified in .env file
|
||||
storage.create-container Creates container based on the parameters found in the .env file
|
||||
storage.create-premium-storage Creates premium storage account. By default the values are loaded from the local .env file
|
||||
storage.tfrecords.upload-validation-data Upload tfrecords validation data to container specified in .env file
|
||||
tf-benchmark.submit.local.synthetic Submits TensorFlow benchmark job using synthetic data for local execution
|
||||
tf-benchmark.submit.remote.synthetic Submits TensorFlow benchmark job using synthetic data on remote cluster
|
||||
tf-experiment.submit.local.images This command isn't implemented please modify to use.
|
||||
tf-experiment.submit.local.synthetic This command isn't implemented please modify to use.
|
||||
tf-experiment.submit.remote.images This command isn't implemented please modify to use.
|
||||
tf-experiment.submit.remote.synthetic This command isn't implemented please modify to use.
|
||||
tf-imagenet.submit.local.images Submit TensorFlow training job using real imagenet data for local execution
|
||||
tf-imagenet.submit.local.synthetic Submit TensorFlow training job using synthetic imagenet data for local execution
|
||||
tf-imagenet.submit.local.tfrecords Submit TensorFlow training job using real imagenet data as tfrecords for local execution
|
||||
tf-imagenet.submit.remote.images Submit TensorFlow training job using real imagenet data to remote cluster
|
||||
tf-imagenet.submit.remote.synthetic Submit TensorFlow training job using synthetic imagenet data to remote cluster
|
||||
tf-imagenet.submit.remote.tfrecords Submit TensorFlow training job using real imagenet data as tfrecords to remote cluster
|
||||
```

# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.

Several file diffs are hidden here because they contain lines that are too long or are too large to display.

@@ -1,562 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.io import loadmat\n",
    "from os import path\n",
    "import os\n",
    "from toolz import juxt, compose\n",
    "import pandas as pd\n",
    "from glob import iglob\n",
    "from itertools import chain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_dir = path.join(os.getenv('AZ_BATCHAI_INPUT_DATASET'), 'imagenet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = loadmat(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'meta.mat'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _index_from(synset):\n",
    "    return synset[0][0][0][0]\n",
    "\n",
    "def _wnid_from(synset):\n",
    "    return str(synset[0][1][0])\n",
    "\n",
    "def _name_from(synset):\n",
    "    return str(synset[0][2][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _extract_from(synset):\n",
    "    extract = juxt(_index_from, _wnid_from, _name_from)\n",
    "    return extract(synset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "label_list = list(map(_extract_from, data['synsets']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame(label_list, columns=('num_index', 'wnid', 'label')).set_index('num_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_dir=path.join(data_dir,'train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _extract_to_directory(wnid):\n",
    "    out_dir = path.join(train_dir, wnid)\n",
    "    tar_file = path.join(train_dir, '{}.tar'.format(wnid))\n",
    "    print(out_dir)\n",
    "    !mkdir -p $out_dir\n",
    "    !tar -C $out_dir -xf $tar_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "filenames = [iglob(path.join(train_dir, wnid, '*.*')) for wnid in df.loc[1:1000]['wnid'].tolist()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ff = list(chain(*filenames))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df = pd.DataFrame({'filenames':ff})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "index_to_wnid_dict = df.loc[1:1000]['wnid'].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "wnid_labels = [path.split(name)[-1].split('_')[0] for name in ff]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df = data_df.assign(wnid=wnid_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df = data_df.assign(num_id=data_df['wnid'].replace(to_replace=list(index_to_wnid_dict.values()), \n",
    "                                                        value=list(index_to_wnid_dict.keys())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "extract_wnid_dir = compose(path.basename, path.dirname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "convert_filename = lambda x: path.join(extract_wnid_dir(x), path.basename(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df=data_df.assign(filenames=data_df['filenames'].apply(convert_filename))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>wnid</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>n02119789/n02119789_12009.JPEG</td>\n",
       "      <td>n02119789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>n02119789/n02119789_4083.JPEG</td>\n",
       "      <td>n02119789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>n02119789/n02119789_14450.JPEG</td>\n",
       "      <td>n02119789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>n02119789/n02119789_11832.JPEG</td>\n",
       "      <td>n02119789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>n02119789/n02119789_5459.JPEG</td>\n",
       "      <td>n02119789</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        filenames       wnid  num_id\n",
       "0  n02119789/n02119789_12009.JPEG  n02119789       1\n",
       "1   n02119789/n02119789_4083.JPEG  n02119789       1\n",
       "2  n02119789/n02119789_14450.JPEG  n02119789       1\n",
       "3  n02119789/n02119789_11832.JPEG  n02119789       1\n",
       "4   n02119789/n02119789_5459.JPEG  n02119789       1"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_df.to_csv(path.join(data_dir, 'train.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Validation data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!mkdir -p {path.join(data_dir, 'validation')}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "!tar -C {path.join(data_dir, 'validation')} -xf {path.join(data_dir, 'ILSVRC2012_img_val.tar')}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "f=open(path.join(data_dir, 'ILSVRC2012_devkit_t12', 'data', 'ILSVRC2012_validation_ground_truth.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "convert_label = compose(int, str.strip)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "labels = list(map(convert_label, f.readlines()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "files = list(sorted(iglob(path.join(data_dir, 'validation', '*.JPEG'))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "valid_df=pd.DataFrame({'filenames':files, 'num_id':labels})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "valid_df=valid_df.assign(filenames=valid_df['filenames'].apply(path.basename))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ILSVRC2012_val_00000001.JPEG</td>\n",
       "      <td>490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ILSVRC2012_val_00000002.JPEG</td>\n",
       "      <td>361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ILSVRC2012_val_00000003.JPEG</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ILSVRC2012_val_00000004.JPEG</td>\n",
       "      <td>822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ILSVRC2012_val_00000005.JPEG</td>\n",
       "      <td>297</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      filenames  num_id\n",
       "0  ILSVRC2012_val_00000001.JPEG     490\n",
       "1  ILSVRC2012_val_00000002.JPEG     361\n",
       "2  ILSVRC2012_val_00000003.JPEG     171\n",
       "3  ILSVRC2012_val_00000004.JPEG     822\n",
       "4  ILSVRC2012_val_00000005.JPEG     297"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ILSVRC2012_devkit_t12\t      ILSVRC2012_img_train.tar\ttrain.csv\r\n",
      "ILSVRC2012_devkit_t12.tar.gz  ILSVRC2012_img_val.tar\tvalidation\r\n",
      "ILSVRC2012_img_test.tar       train\t\t\tvalidation.csv\r\n"
     ]
    }
   ],
   "source": [
    "!ls {data_dir}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "valid_df.to_csv(path.join(data_dir, 'validation.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
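The deleted notebook above expands the per-synset training tars, derives each image's numeric label from the meta.mat synset index, and persists the mappings as train.csv and validation.csv. As a short, hedged usage sketch, this is how those outputs could be read back; the column names follow the notebook, while the `/data/imagenet` path is an assumed stand-in for the notebook's `data_dir`:

```python
# Sketch: read back the CSVs written by the notebook above.
# '/data/imagenet' is an assumption; substitute the real data_dir.
import pandas as pd

train = pd.read_csv('/data/imagenet/train.csv', index_col=0)       # columns: filenames, wnid, num_id
valid = pd.read_csv('/data/imagenet/validation.csv', index_col=0)  # columns: filenames, num_id

# num_id is the 1-based synset index taken from meta.mat (df.loc[1:1000]),
# so a quick sanity check is that it spans 1..1000 on the training set.
assert train['num_id'].min() == 1 and train['num_id'].max() == 1000
```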

Some file diffs are hidden here because they contain lines that are too long or are too large to display.

@@ -1,482 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \\\n",
    "        --destination /data/imagenet/train.csv \\\n",
    "        --source-sas \"?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=EUcahDDZcefOKtHoVWDh7voAC1BoxYNM512spFmjmDU%3D\" \\\n",
    "        --quiet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \\\n",
    "        --destination /data/imagenet/validation.csv \\\n",
    "        --source-sas \"?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=7x3rN7c/nlXbnZ0gAFywd5Er3r6MdwCq97Vwvda25WE%3D\" \\\n",
    "        --quiet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "root_path = '/data/imagenet/'\n",
    "train_path = root_path + 'train.csv'\n",
    "val_path = root_path + 'validation.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1281167, 2)\n",
      "1\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>n02119789/n02119789_12009.JPEG</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>n02119789/n02119789_4083.JPEG</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>n02119789/n02119789_14450.JPEG</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>n02119789/n02119789_11832.JPEG</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>n02119789/n02119789_5459.JPEG</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        filenames  num_id\n",
       "0  n02119789/n02119789_12009.JPEG       1\n",
       "1   n02119789/n02119789_4083.JPEG       1\n",
       "2  n02119789/n02119789_14450.JPEG       1\n",
       "3  n02119789/n02119789_11832.JPEG       1\n",
       "4   n02119789/n02119789_5459.JPEG       1"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(train_path, usecols=['filenames','num_id'])\n",
    "print(train.shape)\n",
    "print(min(train['num_id']))\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           filenames  num_id\n",
       "0  /mnt/batch/tasks/shared/LS_root/mounts/imagene...       0\n",
       "1  /mnt/batch/tasks/shared/LS_root/mounts/imagene...       0\n",
       "2  /mnt/batch/tasks/shared/LS_root/mounts/imagene...       0\n",
       "3  /mnt/batch/tasks/shared/LS_root/mounts/imagene...       0\n",
       "4  /mnt/batch/tasks/shared/LS_root/mounts/imagene...       0"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnt_path = '/mnt/batch/tasks/shared/LS_root/mounts/imagenet/'\n",
    "train['filenames'] = mnt_path + 'train/' + train['filenames']\n",
    "train['num_id'] -= 1\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv('train_map.txt', header=False, index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Validation set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(50000, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ILSVRC2012_val_00000001.JPEG</td>\n",
       "      <td>490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ILSVRC2012_val_00000002.JPEG</td>\n",
       "      <td>361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ILSVRC2012_val_00000003.JPEG</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ILSVRC2012_val_00000004.JPEG</td>\n",
       "      <td>822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ILSVRC2012_val_00000005.JPEG</td>\n",
       "      <td>297</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      filenames  num_id\n",
       "0  ILSVRC2012_val_00000001.JPEG     490\n",
       "1  ILSVRC2012_val_00000002.JPEG     361\n",
       "2  ILSVRC2012_val_00000003.JPEG     171\n",
       "3  ILSVRC2012_val_00000004.JPEG     822\n",
       "4  ILSVRC2012_val_00000005.JPEG     297"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val = pd.read_csv(val_path, usecols=['filenames','num_id'])\n",
    "print(val.shape)\n",
    "val.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filenames</th>\n",
       "      <th>num_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>821</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/mnt/batch/tasks/shared/LS_root/mounts/imagene...</td>\n",
       "      <td>296</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           filenames  num_id\n",
       "0  /mnt/batch/tasks/shared/LS_root/mounts/imagene...     489\n",
       "1  /mnt/batch/tasks/shared/LS_root/mounts/imagene...     360\n",
       "2  /mnt/batch/tasks/shared/LS_root/mounts/imagene...     170\n",
       "3  /mnt/batch/tasks/shared/LS_root/mounts/imagene...     821\n",
       "4  /mnt/batch/tasks/shared/LS_root/mounts/imagene...     296"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val['filenames'] = mnt_path + 'validation/' + val['filenames']\n",
    "val['num_id'] -= 1\n",
    "val.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "val.to_csv('val_map.txt', header=False, index=False, sep='\\t')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### ImageNet mean values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2018-05-21 14:22:19--  https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/DataSets/ImageNet/ImageNet1K_mean.xml\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 2559185 (2.4M) [text/plain]\n",
      "Saving to: ‘ImageNet1K_mean.xml’\n",
      "\n",
      "ImageNet1K_mean.xml 100%[===================>]   2.44M  --.-KB/s    in 0.1s    \n",
      "\n",
      "2018-05-21 14:22:19 (18.8 MB/s) - ‘ImageNet1K_mean.xml’ saved [2559185/2559185]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://raw.githubusercontent.com/Microsoft/CNTK/master/Examples/Image/DataSets/ImageNet/ImageNet1K_mean.xml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
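The `train_map.txt` and `val_map.txt` files written above are header-less, tab-separated `<path>\t<label>` listings with zero-based labels (the `num_id -= 1` shift), the format CNTK-style map-file readers typically expect. A small sketch of loading one back, assuming the files sit in the working directory as in the notebook:

```python
# Sketch: load one of the map files produced above. No header row;
# a tab separates the absolute image path from the zero-based label.
import pandas as pd

val_map = pd.read_csv('val_map.txt', sep='\t', header=None,
                      names=['filename', 'label'])
print(val_map.shape)           # expected: (50000, 2) per the notebook output
print(val_map['label'].min())  # labels start at 0 after the num_id -= 1 shift
```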

Some file diffs are hidden here because they contain lines that are too long or are too large to display.

@@ -1,11 +0,0 @@
{
  "properties": {
    "nodeSetup": {
      "setupTask": {
        "commandLine": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/nodeprep.sh",
        "runElevated": "True",
        "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs"
      }
    }
  }
}

@@ -1,35 +0,0 @@
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target docker.socket firewalld.service
Wants=network-online.target
Requires=docker.socket

[Service]
EnvironmentFile=/etc/default/docker
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd --default-shm-size 8G -g /mnt/docker/ -H fd://
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s

[Install]
WantedBy=multi-user.target

@@ -1,4 +0,0 @@
#!/usr/bin/env bash
sudo cp $AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/docker.service /lib/systemd/system
sudo systemctl daemon-reload
sudo systemctl restart docker
@@ -0,0 +1,36 @@
{
    "_project_short_description": "A set of templates for running distributed training on AML",
    "project_title": "Template for Distributed Deep Learning using Azure Machine Learning",
    "project_name": "aml_dist",
    "resource_group": "amldistrg",
    "workspace": "workspace",
    "subscription_id": "",
    "vm_size": [
        "Standard_NC24r",
        "Standard_NC24rs_v2",
        "Standard_NC24rs_v3",
        "Standard_ND24rs"
    ],
    "minimum_number_nodes": 2,
    "maximum_number_nodes": 2,
    "cluster_name": "gpucluster",
    "container_registry": "dockerhub",
    "type": [
        "all",
        "template",
        "benchmark",
        "imagenet"
    ],
    "region": [
        "eastus",
        "southcentralus"
    ],
    "experiment_name": "experiment",
    "data": "/mnt/imagenet",
    "image_name": "aml_dist",
    "_remove_unused_projects": false,
    "account_name": "distpremstorage",
    "account_key": "",
    "datastore_name": "datastore",
    "container_name": "container"
}
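For reference, the file above is the cookiecutter context that drives the template; any key can be overridden at generation time. Below is a minimal sketch of generating a project from it with the cookiecutter Python API, assuming a local checkout of the template at ./template (the path and the overridden values are illustrative, not part of this commit):

from cookiecutter.main import cookiecutter

# Generate a project without prompting, taking defaults from cookiecutter.json
# above and overriding a few keys; "./template" is an assumed local checkout.
cookiecutter(
    "./template",
    no_input=True,
    extra_context={
        "project_name": "aml_dist",
        "type": "template",
        "region": "eastus",
    },
)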
@@ -1,143 +0,0 @@
include ../experiments_config.mk

CONTAINER_NAME=batch${ID}blob
EXPERIMENT:=experiment_imagenet_blob_${GPU_TYPE}

include ../../include/control.mk

define submit_keras_intel
	$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras
	$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras_local
	$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
	$(call submit_job,$(3))
endef

define submit_tf_intel
	$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf
	$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf_local
	$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

define submit_pytorch
	$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_pytorch_local
	$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

upload-data: upload-training upload-validation upload-csv
	@echo 'All data uploaded'

upload-training: set-storage
	azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train \
	--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
	--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/train \
	--dest-key ${azure_storage_key} --quiet --recursive --exclude-older

upload-validation: set-storage
	azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation \
	--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
	--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/validation \
	--dest-key ${azure_storage_key} --quiet --recursive


upload-csv: set-storage
	azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \
	--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
	--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/train.csv \
	--dest-key ${azure_storage_key} --quiet

	azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \
	--source-key 'owUPSqTbwAigV54BHTr8oYABEha8xi/VsA4HD06GboDgOb3pf6OFgtw/tlKYv/AlkgSIBkxqoA28hnkIeo4NFg==' \
	--destination https://${azure_storage_account}.blob.core.windows.net/${CONTAINER_NAME}/validation.csv \
	--dest-key ${azure_storage_key} --quiet

create-cluster: upload-nodeprep-scripts
	az batchai cluster create \
	-w $(WORKSPACE) \
	--name ${CLUSTER_NAME} \
	--image UbuntuLTS \
	--vm-size ${VM_SIZE} \
	--min ${NUM_NODES} --max ${NUM_NODES} \
	--afs-name ${FILE_SHARE_NAME} \
	--afs-mount-path extfs \
	--user-name mat \
	--password dnstvxrz \
	--storage-account-name $(STORAGE_ACCOUNT_NAME) \
	--storage-account-key $(azure_storage_key) \
	--bfs-name $(CONTAINER_NAME) \
	--bfs-mount-path extcn \
	--config-file ../../cluster_config/cluster.json


submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 submit-tf-intel32 \
	submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
	submit-keras-local submit-tf-local submit-pytorch-local

submit-keras-intel32:
	$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-32)

submit-keras-intel16:
	$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-16)

submit-keras-intel8:
	$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-8)

submit-keras-intel4:
	$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-intel-4)

submit-keras-local:
	$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,keras-local)



submit-tf-intel32:
	$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-32)

submit-tf-intel16:
	$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-16)

submit-tf-intel8:
	$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-8)

submit-tf-intel4:
	$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-intel-4)

submit-tf-local:
	$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,tf-local)


submit-pytorch32:
	$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-32)

submit-pytorch16:
	$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-16)

submit-pytorch8:
	$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-8)

submit-pytorch4:
	$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-4)

submit-pytorch-local:
	$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/extcn ,pytorch-local)
@@ -1,16 +0,0 @@
# Variables for Batch AI - change as necessary
ID:=iliadl3
LOCATION:=eastus
GROUP_NAME:=batch${ID}rg
STORAGE_ACCOUNT_NAME:=batch${ID}st
FILE_SHARE_NAME:=batch${ID}share
SELECTED_SUBSCRIPTION:="Team Danielle Internal"
WORKSPACE:=workspace

VM_SIZE:=Standard_NC24rs_v3
NUM_NODES:=2
CLUSTER_NAME:=ikv100


GPU_TYPE:=V100
PROCESSES_PER_NODE:=4
@@ -1,35 +0,0 @@
import json
import logging
from glob import iglob
from itertools import chain
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def append_storage_type(json_data, filename):
    # Each results.json holds a list of result dicts (see parse_results.py);
    # tag every record with the directory it came from and return the list so
    # the generator below yields the augmented data rather than None
    storage_type = os.path.dirname(filename)
    for record in json_data:
        record['Storage Type'] = storage_type
    return json_data


def read_json(filename):
    logger.info('Reading {}...'.format(filename))
    with open(filename) as f:
        return json.load(f)


def write_json_to_file(json_data, filename):
    with open(filename, 'w') as outfile:
        json.dump(json_data, outfile)


def main(filename='all_results.json'):
    # Materialise the glob so the file list can be traversed twice below
    files = list(iglob('**/results.json', recursive=True))
    json_data = (read_json(i) for i in files)
    augmented_json_data = (append_storage_type(j, f) for j, f in zip(json_data, files))
    write_json_to_file(list(chain.from_iterable(augmented_json_data)), filename)
    logger.info('All results written to {}'.format(filename))


if __name__ == "__main__":
    main()
@@ -1,343 +0,0 @@
import argparse
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

#
# Config for Intel
cmd_for_intel = \
"""source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh;
 echo $AZ_BATCH_HOST_LIST;
 mpirun -n {total_processes} -ppn {processes_per_node} {hosts}
 -env I_MPI_FABRICS=dapl
 -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0
 -env I_MPI_DYNAMIC_CONNECTION=0
 -env I_MPI_DEBUG=6
 -env I_MPI_HYDRA_DEBUG=on
 -env DISTRIBUTED=True
 {fake}
 {fake_length}
 python -u {script}""".replace('\n', '')

# Config for OpenMPI
cmd_for_openmpi = \
"""echo $AZ_BATCH_HOST_LIST;
 cat $AZ_BATCHAI_MPI_HOST_FILE;
 mpirun -np {total_processes} {hosts}
 -bind-to none -map-by slot
 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH
 -mca btl_tcp_if_include eth0
 -x NCCL_SOCKET_IFNAME=eth0
 -mca btl ^openib
 -x NCCL_IB_DISABLE=1
 -x DISTRIBUTED=True
 -x AZ_BATCHAI_INPUT_TRAIN
 -x AZ_BATCHAI_INPUT_TEST
 {fake}
 {fake_length}
 --allow-run-as-root
 python -u {script}""".replace('\n', '')

# Running on single node without mpi
cmd_local = """{fake} {fake_length} python -u {script}""".replace('\n', '')

cmd_choice_dict = {
    'openmpi': cmd_for_openmpi,
    'intelmpi': cmd_for_intel,
    'local': cmd_local
}

hosts_param = {
    'openmpi': '--hostfile $AZ_BATCHAI_MPI_HOST_FILE ',
    'intelmpi': '-hosts $AZ_BATCH_HOST_LIST ',
    'local': ''
}

fake_param = {
    'openmpi': '-x FAKE=True ',
    'intelmpi': '-env FAKE=True ',
    'local': ' FAKE=True '
}

fake_length_param = {
    'openmpi': '-x FAKE_DATA_LENGTH={} ',
    'intelmpi': '-env FAKE_DATA_LENGTH={} ',
    'local': ' FAKE_DATA_LENGTH={} '
}


def _hosts_for(mpitype, node_count):
    if node_count > 1:
        return hosts_param.get(mpitype, '')
    else:
        return hosts_param.get('local')


def _fake_for(mpitype, data):
    if data is None:
        return fake_param.get(mpitype, '')
    else:
        return ''


def _fake_length_for(mpitype, fake_length, data):
    if data is None:
        return fake_length_param.get(mpitype, '').format(fake_length)
    else:
        return ''


def _prepare_command(mpitype, total_processes, processes_per_node, script, node_count, data=None,
                     synthetic_length=1281167):
    command = cmd_choice_dict.get(mpitype, cmd_for_intel)
    return command.format(total_processes=total_processes,
                          processes_per_node=processes_per_node,
                          script=script,
                          hosts=_hosts_for(mpitype, node_count),
                          fake=_fake_for(mpitype, data),
                          fake_length=_fake_length_for(mpitype, synthetic_length, data))


def append_data_paths(job_template_dict, data_path):
    job_template_dict['properties']['inputDirectories'].extend([{
        "id": "TRAIN",
        "path": data_path,
    },
        {
        "id": "TEST",
        "path": data_path,
    }])
    return job_template_dict


def generate_job_dict(image_name,
                      command,
                      node_count=2):
    return {
        "$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json",
        "properties": {
            "nodeCount": node_count,
            "customToolkitSettings": {
                "commandLine": command
            },
            "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
            "inputDirectories": [{
                "id": "SCRIPTS",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
            },
            ],
            "outputDirectories": [{
                "id": "MODEL",
                "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
                "pathSuffix": "Models"
            }],
            "containerSettings": {
                "imageSourceRegistry": {
                    "image": image_name
                }
            }
        }
    }


def generate_job_dict_gloo(image_name,
                           script,
                           node_count=2):
    # Command is hard-coded for the time being
    # Not sure what world-size is?? Probably node_count but check
    return {
        "$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2018-05-01/job.json",
        "properties": {
            "pyTorchSettings": {
                "pythonScriptFilePath": script,
                "commandLineArgs": "--world-size 2 --dist-backend $AZ_BATCHAI_PYTORCH_BACKEND --dist-url $AZ_BATCHAI_PYTORCH_INIT_METHOD --rank $AZ_BATCHAI_TASK_INDEX",
                "communicationBackend": "gloo"
            },
            "nodeCount": node_count,
            "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
            "inputDirectories": [{
                "id": "SCRIPTS",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
            },
                {
                "id": "TRAIN",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
            },
                {
                "id": "TEST",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet",
            },
            ],
            "outputDirectories": [{
                "id": "MODEL",
                "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
                "pathSuffix": "Models"
            }],
            "containerSettings": {
                "imageSourceRegistry": {
                    "image": image_name
                }
            }
        }
    }


def generate_job_dict_cntk(image_name,
                           node_count=2,
                           processes_per_node=4,
                           env_var=[]):
    return {
        "$schema": "https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2018-03-01/job.json",
        "properties": {
            "nodeCount": node_count,
            "cntkSettings": {
                "pythonScriptFilePath": "$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py",
                "processCount": processes_per_node
            },
            "environmentVariables": env_var,
            "stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
            "inputDirectories": [{
                "id": "SCRIPTS",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts"
            },
                {
                "id": "TRAIN",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/imagenet",
            },
                {
                "id": "TEST",
                "path": "$AZ_BATCHAI_MOUNT_ROOT/imagenet",
            },
            ],
            "outputDirectories": [{
                "id": "MODEL",
                "pathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs",
                "pathSuffix": "Models"
            }],
            "containerSettings": {
                "imageSourceRegistry": {
                    "image": image_name
                }
            }
        }
    }


def write_json_to_file(json_dict, filename, mode='w'):
    with open(filename, mode) as outfile:
        json.dump(json_dict, outfile, indent=4, sort_keys=True)
        outfile.write('\n\n')


def synthetic_data_job(image_name,
                       mpitype,
                       script,
                       filename='job.json',
                       node_count=2,
                       total_processes=None,
                       processes_per_node=4,
                       synthetic_length=1281167,
                       framework='horovod'):
    logger.info('Creating manifest for job with synthetic data {} with {} image...'.format(
        filename, image_name))
    total_processes = processes_per_node * \
        node_count if total_processes is None else total_processes
    if framework == 'gloo':
        job_template = generate_job_dict_gloo(image_name,
                                              script,
                                              node_count=node_count)
    elif framework == 'cntk':
        env_var = [{"name": "DISTRIBUTED", "value": "True"},
                   {"name": "FAKE", "value": "True"},
                   {"name": "FAKE_DATA_LENGTH", "value": str(synthetic_length)}]
        job_template = generate_job_dict_cntk(image_name,
                                              node_count,
                                              processes_per_node,
                                              env_var)
    elif framework == 'horovod':
        command = _prepare_command(mpitype,
                                   total_processes,
                                   processes_per_node,
                                   script,
                                   node_count,
                                   synthetic_length=synthetic_length)
        job_template = generate_job_dict(image_name,
                                         command,
                                         node_count=node_count)
    else:
        raise ValueError("Wrong framework argument {}".format(framework))
    write_json_to_file(job_template, filename)
    logger.info('Done')


def imagenet_data_job(image_name,
                      mpitype,
                      script,
                      data_path,
                      filename='job.json',
                      node_count=2,
                      total_processes=None,
                      processes_per_node=4):
    logger.info('Creating manifest for job with real data {} with {} image...'.format(
        filename, image_name))
    total_processes = processes_per_node * \
        node_count if total_processes is None else total_processes
    # non-synthetic gloo to add
    command = _prepare_command(mpitype,
                               total_processes,
                               processes_per_node,
                               script,
                               node_count,
                               data=data_path)
    job_template = generate_job_dict(image_name,
                                     command,
                                     node_count=node_count)
    job_template = append_data_paths(job_template, data_path)
    write_json_to_file(job_template, filename)
    logger.info('Done')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate manifest')
    parser.add_argument('docker_image', type=str,
                        help='docker image to use')
    parser.add_argument('mpi', type=str,
                        help='mpi to use; must be installed in the docker image provided. Options: [intelmpi, openmpi, local]')
    parser.add_argument('script', type=str,
                        help='script to run')
    parser.add_argument('--filename', '-f', dest='filename', type=str, nargs='?',
                        default='job.json',
                        help='name of the file to save job spec to')
    parser.add_argument('--node_count', '-n', dest='node_count', type=int, nargs='?',
                        default=1, help='the number of nodes to run the job across')
    parser.add_argument('--ppn', dest='processes_per_node', type=int, nargs='?',
                        default=4,
                        help='number of GPU processes to run per node')
    parser.add_argument('--data', dest='data', type=str, nargs='?',
                        default=None,
                        help='the path where the imagenet data is stored')
    parser.add_argument('--synthetic_length', '-l', dest='synthetic_length', type=int, nargs='?',
                        default=1281167,
                        help='the length of the fake data [default=size of imagenet 1281167]')
    parser.add_argument('--framework', '-fw', type=str, nargs='?', default='horovod',
                        help='the framework used to generate the configuration; options: [horovod, gloo, cntk]')
    args = parser.parse_args()
    if args.data is None:
        synthetic_data_job(args.docker_image,
                           args.mpi,
                           args.script,
                           filename=args.filename,
                           node_count=args.node_count,
                           processes_per_node=args.processes_per_node,
                           synthetic_length=args.synthetic_length,
                           framework=args.framework)
    else:
        imagenet_data_job(args.docker_image,
                          args.mpi,
                          args.script,
                          args.data,
                          filename=args.filename,
                          node_count=args.node_count,
                          processes_per_node=args.processes_per_node)
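The Makefiles elsewhere in this commit drive the script above through the generate_job_* defines, but its entry points can also be called directly. A minimal sketch, assuming the file is importable as generate_job_spec and reusing an image/script pair that appears in the Makefiles:

from generate_job_spec import synthetic_data_job

# Equivalent to:
#   python generate_job_spec.py masalvar/horovod:9-1.8-.13.2 openmpi \
#       $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py -n 2 --ppn 4
synthetic_data_job(
    "masalvar/horovod:9-1.8-.13.2",    # docker image with Open MPI + Horovod
    "openmpi",                         # selects cmd_for_openmpi above
    "$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py",
    filename="job.json",
    node_count=2,
    processes_per_node=4,              # total_processes defaults to 2 * 4 = 8
)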
@@ -1,137 +0,0 @@
include ../experiments_config.mk

NFS_NAME=batch${ID}nfs
EXPERIMENT:=experiment_imagenet_local_${GPU_TYPE}
NFS_IP:=""

include ../../include/control.mk

define submit_keras_intel
	$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras
	$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras_local
	$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
	$(call submit_job,$(3))
endef

define submit_tf_intel
	$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf
	$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf_local
	$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

define submit_pytorch
	$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_pytorch_intel
	$(call generate_job_intel,masalvar/horovod-intel-pytorch:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_pytorch_local
	$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

define upload-data-nfs
	scp nodeprep.sh mat@$(1):~/
	ssh mat@$(1) sudo chmod 777 ~/nodeprep.sh
	ssh mat@$(1) ./nodeprep.sh
endef

upload-nodeprep-scripts: set-storage
	$(call upload_script, ../../cluster_config/docker.service)
	$(call upload_script, ../../cluster_config/nodeprep.sh)

upload-download-script: set-storage
	$(call upload_script, downloaddata.sh)

create-cluster: upload-nodeprep-scripts upload-download-script
	az batchai cluster create \
	-w $(WORKSPACE) \
	--name ${CLUSTER_NAME} \
	--image UbuntuLTS \
	--vm-size ${VM_SIZE} \
	--min ${NUM_NODES} --max ${NUM_NODES} \
	--afs-name ${FILE_SHARE_NAME} \
	--afs-mount-path extfs \
	--user-name mat \
	--password dnstvxrz \
	--storage-account-name $(STORAGE_ACCOUNT_NAME) \
	--storage-account-key $(azure_storage_key) \
	--nfs ${NFS_NAME} \
	--nfs-mount-path nfs \
	--config-file ../../cluster_config/cluster.json

upload-data-nfs:
	$(call upload-data-nfs,$(NFS_IP))


submit-keras-intel32:
	$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-32)

submit-keras-intel16:
	$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-16)

submit-keras-intel8:
	$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-8)

submit-keras-intel4:
	$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-intel-4)

submit-keras-local:
	$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,keras-local)



submit-tf-intel32:
	$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-32)

submit-tf-intel16:
	$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-16)

submit-tf-intel8:
	$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-8)

submit-tf-intel4:
	$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-intel-4)

submit-tf-local:
	$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,tf-local)


submit-pytorch32:
	$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-32)

submit-pytorch16:
	$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-16)

submit-pytorch8:
	$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-8)

submit-pytorch4:
	$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-4)

submit-pytorch-intel4:
	$(call submit_pytorch_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-intel-4)

submit-pytorch-local:
	$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/imagenet ,pytorch-local)
@@ -1,10 +0,0 @@
#!/usr/bin/env bash
# Download data
mkdir -p $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/train.tar.gz $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/validation.tar.gz $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/train.csv $AZ_BATCHAI_MOUNT_ROOT/imagenet
rsync --info=progress2 $AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet/validation.csv $AZ_BATCHAI_MOUNT_ROOT/imagenet
cd $AZ_BATCHAI_MOUNT_ROOT/imagenet
tar -xzf train.tar.gz
tar -xzf validation.tar.gz
@@ -1,125 +0,0 @@
include ../experiments_config.mk

NFS_NAME=batch${ID}nfs
EXPERIMENT:=experiment_imagenet_${GPU_TYPE}
NFS_IP:=""

include ../../include/control.mk

define submit_keras_intel
	$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras
	$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_keras_local
	$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2))
	$(call submit_job,$(3))
endef

define submit_tf_intel
	$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf
	$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_tf_local
	$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

define submit_pytorch
	$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2),$(3))
	$(call submit_job,$(4))
endef

define submit_pytorch_local
	$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), $(2))
	$(call submit_job, $(3))
endef

define upload-data-nfs
	scp nodeprep.sh mat@$(1):~/
	ssh mat@$(1) sudo chmod 777 ~/nodeprep.sh
	ssh mat@$(1) ./nodeprep.sh
endef

create-cluster: upload-nodeprep-scripts
	az batchai cluster create \
	-w $(WORKSPACE) \
	--name ${CLUSTER_NAME} \
	--image UbuntuLTS \
	--vm-size ${VM_SIZE} \
	--min ${NUM_NODES} --max ${NUM_NODES} \
	--afs-name ${FILE_SHARE_NAME} \
	--afs-mount-path extfs \
	--user-name mat \
	--password dnstvxrz \
	--storage-account-name $(STORAGE_ACCOUNT_NAME) \
	--storage-account-key $(azure_storage_key) \
	--nfs ${NFS_NAME} \
	--nfs-mount-path nfs \
	--config-file ../../cluster_config/cluster.json

upload-data-nfs:
	$(call upload-data-nfs,$(NFS_IP))

submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 submit-tf-intel32 \
	submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
	submit-keras-local submit-tf-local submit-pytorch-local

submit-keras-intel32:
	$(call submit_keras_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-32)

submit-keras-intel16:
	$(call submit_keras_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-16)

submit-keras-intel8:
	$(call submit_keras_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-8)

submit-keras-intel4:
	$(call submit_keras_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-intel-4)

submit-keras-local:
	$(call submit_keras_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,keras-local)



submit-tf-intel32:
	$(call submit_tf_intel,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-32)

submit-tf-intel16:
	$(call submit_tf_intel,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-16)

submit-tf-intel8:
	$(call submit_tf_intel,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-8)

submit-tf-intel4:
	$(call submit_tf_intel,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-intel-4)

submit-tf-local:
	$(call submit_tf_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,tf-local)


submit-pytorch32:
	$(call submit_pytorch,8,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-32)

submit-pytorch16:
	$(call submit_pytorch,4,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-16)

submit-pytorch8:
	$(call submit_pytorch,2,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-8)

submit-pytorch4:
	$(call submit_pytorch,1,$(PROCESSES_PER_NODE), --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-4)

submit-pytorch-local:
	$(call submit_pytorch_local,1, --data \$$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet ,pytorch-local)
@@ -1,29 +0,0 @@
#!/usr/bin/env bash
wget https://gist.githubusercontent.com/msalvaris/073c28a9993d58498957294d20d74202/raw/916eefe763c71da49d8ed41cb8474bdc8021af33/install_azcopy
chmod 777 install_azcopy
sudo ./install_azcopy

mkdir -p /data/imagenet
azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.csv \
    --destination /data/imagenet/validation.csv \
    --source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=7x3rN7c/nlXbnZ0gAFywd5Er3r6MdwCq97Vwvda25WE%3D" \
    --quiet

azcopy --source https://datasharesa.blob.core.windows.net/imagenet/validation.tar.gz \
    --destination /data/imagenet/validation.tar.gz \
    --source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=zy8L4shZa3XXBe152hPnhXsyfBqCufDOz01a9ZHWU28%3D" \
    --quiet

azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.csv \
    --destination /data/imagenet/train.csv \
    --source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=EUcahDDZcefOKtHoVWDh7voAC1BoxYNM512spFmjmDU%3D" \
    --quiet

azcopy --source https://datasharesa.blob.core.windows.net/imagenet/train.tar.gz \
    --destination /data/imagenet/train.tar.gz \
    --source-sas "?se=2025-01-01&sp=r&sv=2017-04-17&sr=b&sig=qP%2B7lQuFKHo5UhQKpHcKt6p5fHT21lPaLz1O/vv4FNU%3D" \
    --quiet

cd /data/imagenet
tar -xvzf train.tar.gz
tar -xvzf validation.tar.gz
@@ -1,106 +0,0 @@
from glob import glob
import numpy as np
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def extract_mpi_type(file):
    return file.split('_')[-1].split('.')[0]


def extract_gpu_type(file):
    return file.split('_')[-2]


def extract_framework(file):
    return file.split('_')[0]


def extract_images_per_second(data):
    def _extract(line_string):
        if 'Total images/sec: ' in line_string:
            return float(line_string.split(':')[-1].strip())

    return np.array(list(
        filter(None,
               map(_extract, data))
    )).mean()


def extract_total_duration(data):
    def _extract(line_string):
        if 'Total duration: ' in line_string:
            return float(line_string.split(':')[-1].strip())

    return np.array(list(
        filter(None,
               map(_extract, data))
    )).mean()


def extract_data_length(data):
    for line in data:
        if 'Data length: ' in line:
            return int(line.split(':')[-1].strip())


def extract_batch_size(data):
    for line in data:
        if 'Batch size: ' in line:
            return int(line.split(':')[-1].strip().split(' ')[-1].strip(')'))


def extract_dataset(data):
    for line in data:
        if 'Dataset: ' in line:
            return line.split(':')[-1].strip()


def extract_num_devices(data):
    for line in data:
        if 'Num GPUs: ' in line:
            return int(float(line.split(': ')[-1].strip()))


extraction_funcs = {
    'Images/Second': extract_images_per_second,
    'Batch Size': extract_batch_size,
    'Data Length': extract_data_length,
    'Total Duration': extract_total_duration,
    'Dataset': extract_dataset,
    'GPUs': extract_num_devices,
}


def parse_results(file):
    logger.info('Processing {}'.format(file))
    with open(file) as f:
        data = f.readlines()
    results_dict = {key: func(data) for key, func in extraction_funcs.items()}
    results_dict['MPI'] = extract_mpi_type(file)
    results_dict['GPU Type'] = extract_gpu_type(file)
    results_dict['Framework'] = extract_framework(file)
    return results_dict


def write_json_to_file(json_dict, filename):
    """ Simple function to write JSON dictionaries to files
    """
    with open(filename, 'w') as outfile:
        json.dump(json_dict, outfile)


def main(path='*.results', filename='results.json'):
    logger.info('Reading files from {} and writing to {}'.format(path, filename))
    files = glob(path)
    logger.info('Found {} files'.format(len(files)))
    results = [parse_results(file) for file in files]
    logger.info('Writing results to {}'.format(filename))
    write_json_to_file(results, filename)


if __name__ == '__main__':
    main()
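The extractors above key off fixed phrases in each job's captured stdout. A small sketch of the log format they expect, assuming the file is importable as parse_results; the values are made up for illustration, not real benchmark results:

from parse_results import extraction_funcs

# Hypothetical stdout excerpt in the format the extractors match
sample_log = [
    "Dataset: imagenet\n",
    "Data length: 1281167\n",
    "Batch size: (global 256)\n",
    "Num GPUs: 8.0\n",
    "Total images/sec: 1810.2\n",
    "Total duration: 425.7\n",
]
summary = {key: func(sample_log) for key, func in extraction_funcs.items()}
# e.g. summary["Images/Second"] == 1810.2 and summary["GPUs"] == 8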
@@ -1,161 +0,0 @@
include ../experiments_config.mk

FAKE_DATA_LENGTH:=1281167
EXPERIMENT:=experiment_synthetic_${GPU_TYPE}

include ../../include/control.mk

define submit_keras_intel
	$(call generate_job_intel,masalvar/horovod-intel-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH} )
	$(call submit_job, $(3))
endef

define submit_keras
	$(call generate_job_openmpi,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_keras_local
	$(call generate_job_local,masalvar/horovod-keras:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_keras_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(2))
endef

define submit_tf_intel
	$(call generate_job_intel,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_tf
	$(call generate_job_openmpi,masalvar/horovod:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_tf_local
	$(call generate_job_local,masalvar/horovod-intel:9-1.8-.13.2,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(2))
endef

define submit_pytorch
	$(call generate_job_openmpi,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_pytorch_local
	$(call generate_job_local,masalvar/horovod-pytorch,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(2))
endef

define submit_pytorch_gloo
	$(call generate_job_gloo,iliauk/pytorch_gloo,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_gloo.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_cntk
	$(call generate_job_cntk,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1),$(2), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(3))
endef

define submit_cntk_local
	$(call generate_job_local,hoaphumanoid/cntk:distributed,\$$AZ_BATCHAI_INPUT_SCRIPTS/imagenet_cntk.py,$(1), --synthetic_length ${FAKE_DATA_LENGTH})
	$(call submit_job, $(2))
endef


create-cluster: upload-nodeprep-scripts
	az batchai cluster create \
	-w $(WORKSPACE) \
	--name ${CLUSTER_NAME} \
	--image UbuntuLTS \
	--vm-size ${VM_SIZE} \
	--min ${NUM_NODES} --max ${NUM_NODES} \
	--afs-name ${FILE_SHARE_NAME} \
	--afs-mount-path extfs \
	--user-name mat \
	--password dnstvxrz \
	--storage-account-name $(STORAGE_ACCOUNT_NAME) \
	--storage-account-key $(azure_storage_key) \
	--config-file ../../cluster_config/cluster.json


submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
	submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
	submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
	submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
	submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
	submit-keras-local submit-tf-local submit-pytorch-local submit-cntk-local

submit-keras-intel32:
	$(call submit_keras_intel,$(NUM_NODES),$(PROCESSES_PER_NODE),keras-intel-32)

submit-keras-intel16:
	$(call submit_keras_intel,4,$(PROCESSES_PER_NODE),keras-intel-16)

submit-keras-intel8:
	$(call submit_keras_intel,2,$(PROCESSES_PER_NODE),keras-intel-8)

submit-keras-intel4:
	$(call submit_keras_intel,1,$(PROCESSES_PER_NODE),keras-intel-4)

submit-keras-local:
	$(call submit_keras_local,1,keras-local)


submit-tf-intel32:
	$(call submit_tf_intel,$(NUM_NODES),$(PROCESSES_PER_NODE),tf-intel-32)

submit-tf-intel16:
	$(call submit_tf_intel,4,$(PROCESSES_PER_NODE),tf-intel-16)

submit-tf-intel8:
	$(call submit_tf_intel,2,$(PROCESSES_PER_NODE),tf-intel-8)

submit-tf-intel4:
	$(call submit_tf_intel,1,$(PROCESSES_PER_NODE),tf-intel-4)

submit-tf-local:
	$(call submit_tf_local,1,tf-local)


submit-pytorch32:
	$(call submit_pytorch,8,$(PROCESSES_PER_NODE),pytorch-32)

submit-pytorch16:
	$(call submit_pytorch,4,$(PROCESSES_PER_NODE),pytorch-16)

submit-pytorch8:
	$(call submit_pytorch,2,$(PROCESSES_PER_NODE),pytorch-8)

submit-pytorch4:
	$(call submit_pytorch,1,$(PROCESSES_PER_NODE),pytorch-4)

submit-pytorch-local:
	$(call submit_pytorch_local,1,pytorch-local)


submit-cntk32:
	$(call submit_cntk,8,$(PROCESSES_PER_NODE),cntk-32)

submit-cntk16:
	$(call submit_cntk,4,$(PROCESSES_PER_NODE),cntk-16)

submit-cntk8:
	$(call submit_cntk,2,$(PROCESSES_PER_NODE),cntk-8)

submit-cntk4:
	$(call submit_cntk,1,$(PROCESSES_PER_NODE),cntk-4)

submit-cntk-local:
	$(call submit_cntk_local,1,cntk-local)


submit-pytorch_gloo32:
	$(call submit_pytorch_gloo,8,$(PROCESSES_PER_NODE),pytorch_gloo-32)

submit-pytorch_gloo16:
	$(call submit_pytorch_gloo,4,$(PROCESSES_PER_NODE),pytorch_gloo-16)

submit-pytorch_gloo8:
	$(call submit_pytorch_gloo,2,$(PROCESSES_PER_NODE),pytorch_gloo-8)

submit-pytorch_gloo4:
	$(call submit_pytorch_gloo,1,$(PROCESSES_PER_NODE),pytorch_gloo-4)
@@ -0,0 +1,29 @@
import shutil


def _copy_directories(src, dst):
    try:
        shutil.copytree(src, dst, ignore=shutil.ignore_patterns(".git"))
    except PermissionError:
        print(f"Could not copy files from {src} to {dst}, permission error")


def _remove_directories(*directories):
    for folder in directories:
        shutil.rmtree(folder)


def _copy_env_file():
    shutil.move("_dotenv_template", ".env")


_CHOICES_DICT = {
    "template": ("TensorFlow_benchmark", "TensorFlow_imagenet"),
    "benchmark": ("TensorFlow_experiment", "TensorFlow_imagenet"),
    "imagenet": ("TensorFlow_benchmark", "TensorFlow_experiment")
}

if __name__ == "__main__":
    _copy_env_file()
    if {{cookiecutter._remove_unused_projects}}:
        # Quote the rendered value and unpack the tuple so each unused
        # project directory is removed individually
        _remove_directories(*_CHOICES_DICT.get("{{cookiecutter.type}}", tuple()))
@@ -0,0 +1,23 @@
import os
import shutil


def _remove_directory(dirpath):
    if os.path.exists(dirpath):
        try:
            print(f"Deleting directory {dirpath}")
            shutil.rmtree(dirpath)
        except PermissionError:
            print(
                f"The directory contains files that can't be removed; please delete {dirpath} and run again"
            )


_remove_directory("{{cookiecutter.experiment_name}}")
print(
    """
Generating project {{cookiecutter.project_name}}
"""
)
Binary file not shown.
After Width: | Height: | Size: 240 KiB
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 57 KiB
108 include/build.mk
@@ -1,108 +0,0 @@
define PROJECT_HELP_MSG
Usage:
    make help              show this message
    make build             make Horovod TF image with Open MPI
    make build-intel       make Horovod TF image with Intel MPI
    make run-mpi           run training using Open MPI image
    make run-mpi-intel     run training using Intel MPI image
    make run               run training in non-distributed mode
    make push              push Horovod TF image with Open MPI
    make push-intel        push Horovod TF image with Intel MPI
endef
export PROJECT_HELP_MSG

DATA_DIR:=/mnt/imagenet
#DATA_DIR:=/mnt/rmdsk
PWD:=$(shell pwd)
FAKE:='False'
FAKE_DATA_LENGTH:=1281167
ROOT:=$(shell dirname ${PWD})


setup_volumes:=-v $(PWD)/src:/mnt/script \
	-v $(DATA_DIR):/mnt/input \
	-v $(DATA_DIR)/temp/model:/mnt/model \
	-v $(DATA_DIR)/temp/output:/mnt/output \
	-v $(ROOT)/common:/mnt/common


setup_environment:=--env AZ_BATCHAI_INPUT_TRAIN='/mnt/input' \
	--env AZ_BATCHAI_INPUT_TEST='/mnt/input' \
	--env AZ_BATCHAI_OUTPUT_MODEL='/mnt/model' \
	--env AZ_BATCHAI_JOB_TEMP_DIR='/mnt/output' \
	--env AZ_BATCHAI_INPUT_SCRIPTS='/mnt/script' \
	--env PYTHONPATH=/mnt/common/:$$PYTHONPATH


define execute_mpi
	nvidia-docker run -it \
	--shm-size="8g" \
	$(setup_volumes) \
	$(setup_environment) \
	--env DISTRIBUTED='True' \
	--env FAKE=$(FAKE) \
	--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
	--privileged \
	$(1) bash -c "mpirun -np 2 -H localhost:2 python $(2)"
endef

define execute_mpi_intel
	nvidia-docker run -it \
	--shm-size="8g" \
	$(setup_volumes) \
	$(setup_environment) \
	--env DISTRIBUTED='True' \
	--env FAKE=$(FAKE) \
	--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
	--privileged \
	$(1) bash -c " source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh; mpirun -n 2 -host localhost -ppn 2 -env I_MPI_DAPL_PROVIDER=ofa-v2-ib0 -env I_MPI_DYNAMIC_CONNECTION=0 python $(2)"
endef

define execute
	nvidia-docker run -it \
	--shm-size="8g" \
	$(setup_volumes) \
	$(setup_environment) \
	--env DISTRIBUTED='False' \
	--env FAKE=$(FAKE) \
	--env FAKE_DATA_LENGTH=$(FAKE_DATA_LENGTH) \
	$(1) bash -c "python $(2)"
endef

define execute_jupyter
	nvidia-docker run -p 8888:8888 -it \
	--shm-size="8g" \
	$(setup_volumes) \
	$(setup_environment) \
	$(1) bash -c "jupyter notebook --ip=* --no-browser --allow-root"
endef

help:
	echo "$$PROJECT_HELP_MSG" | less

build:
	docker build -t $(image-open) $(open-path)

build-intel:
	docker build -t $(image-intel) $(intel-path)

run-mpi:
	$(call execute_mpi, $(image-open), $(script))

run-mpi-intel:
	$(call execute_mpi_intel, $(image-intel), $(script))

run:
	$(call execute, $(image-open), $(script))

run-jupyter:
	$(call execute_jupyter, $(image-open))

push:
	docker push $(image-open)

push-intel:
	docker push $(image-intel)


.PHONY: help build push
@ -1,341 +0,0 @@
|
|||
define PROJECT_HELP_MSG
|
||||
Usage:
|
||||
make help show this message
|
||||
make build build docker image
|
||||
make push push container
|
||||
make run run benchmarking container
|
||||
make setup setup the cluster
|
||||
make show-cluster
|
||||
make list-clusters
|
||||
make run-bait-intel run batch ai benchamrk using intel mpi
|
||||
make run-bait-openmpi run batch ai benchmark using open mpi
|
||||
make run-bait-local run batch ai benchmark on one node
|
||||
make list-jobs
|
||||
make list-files
|
||||
make stream-stdout
|
||||
make stream-stderr
|
||||
make delete-job
|
||||
make delete-cluster
|
||||
make delete delete everything including experiments, workspace and resource group
|
||||
endef
|
||||
export PROJECT_HELP_MSG
|
||||
|
||||
|
||||
define generate_job_intel
|
||||
python ../generate_job_spec.py $(1) intelmpi \
|
||||
$(2) \
|
||||
--filename job.json \
|
||||
--node_count $(3) \
|
||||
--ppn $(4) \
|
||||
$(5)
|
||||
endef
|
||||
|
||||
|
||||
define generate_job_openmpi
|
||||
python ../generate_job_spec.py $(1) openmpi \
|
||||
$(2) \
|
||||
--filename job.json \
|
||||
--node_count $(3) \
|
||||
--ppn $(4) \
|
||||
$(5)
|
||||
endef
|
||||
|
||||
|
||||
define generate_job_local
|
||||
python ../generate_job_spec.py $(1) local \
|
||||
$(2) \
|
||||
--filename job.json \
|
||||
--node_count 1 \
|
||||
--ppn $(3) \
|
||||
$(4)
|
||||
endef
|
||||
|
||||
|
||||
define generate_job_gloo
|
||||
python ../generate_job_spec.py $(1) openmpi \
|
||||
$(2) \
|
||||
--filename job.json \
|
||||
--node_count $(3) \
|
||||
--ppn $(4) \
|
||||
$(5) \
|
||||
--framework gloo
|
||||
endef
|
||||
|
||||
|
||||
define generate_job_cntk
|
||||
python ../generate_job_spec.py $(1) openmpi \
|
||||
$(2) \
|
||||
--filename job.json \
|
||||
--node_count $(3) \
|
||||
--ppn $(4) \
|
||||
$(5) \
|
||||
--framework cntk
|
||||
endef
|
||||
|
||||
|
||||
define stream_stdout
|
||||
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) \
|
||||
--j $(1) --output-directory-id stdouterr -f stdout.txt
|
||||
endef
|
||||
|
||||
|
||||
define submit_job
|
||||
az batchai job create -n $(1) --cluster ${CLUSTER_NAME} -w $(WORKSPACE) -e $(EXPERIMENT) -f job.json
|
||||
endef
|
||||
|
||||
define delete_job
|
||||
az batchai job delete -w $(WORKSPACE) -e $(EXPERIMENT) --name $(1) -y
|
||||
endef
|
||||
|
||||
define upload_script
|
||||
az storage file upload --share-name ${FILE_SHARE_NAME} --source $(1) --path scripts --account-name $(azure_storage_account) --account-key $(azure_storage_key)
|
||||
endef
|
||||
|
||||
select-subscription:
|
||||
az login -o table
|
||||
az account set --subscription $(SELECTED_SUBSCRIPTION)
|
||||
|
||||
create-resource-group:
|
||||
az group create -n $(GROUP_NAME) -l $(LOCATION) -o table
|
||||
|
||||
create-storage:
|
||||
@echo "Creating storage account"
|
||||
az storage account create -l $(LOCATION) -n $(STORAGE_ACCOUNT_NAME) -g $(GROUP_NAME) --sku Standard_LRS
|
||||
|
||||
set-storage:
|
||||
$(eval azure_storage_key:=$(shell az storage account keys list -n $(STORAGE_ACCOUNT_NAME) -g $(GROUP_NAME) | jq '.[0]["value"]'))
|
||||
$(eval azure_storage_account:= $(STORAGE_ACCOUNT_NAME))
|
||||
$(eval file_share_name:= $(FILE_SHARE_NAME))
|
||||
|
||||
set-az-defaults:
|
||||
az configure --defaults location=${LOCATION}
|
||||
az configure --defaults group=${GROUP_NAME}
|
||||
|
||||
create-fileshare: set-storage
|
||||
@echo "Creating fileshare"
|
||||
az storage share create -n $(file_share_name) --account-name $(azure_storage_account) --account-key $(azure_storage_key)
|
||||
|
||||
create-directory: set-storage
|
||||
az storage directory create --share-name $(file_share_name) --name scripts --account-name $(azure_storage_account) --account-key $(azure_storage_key)
|
||||
|
||||
create-nfs:
|
||||
az batchai file-server create -n $(NFS_NAME) -w ${WORKSPACE} --disk-count 4 --disk-size 250 -s Standard_DS4_v2 -u mat -p d13NHAL! -g ${GROUP_NAME} --storage-sku Premium_LRS
|
||||
|
||||
list-nfs:
|
||||
az batchai file-server list -o table -w ${WORKSPACE} -g ${GROUP_NAME}
|
||||
|
||||
create-container: set-storage
|
||||
az storage container create --account-name $(azure_storage_account) --account-key $(azure_storage_key) --name ${CONTAINER_NAME}
|
||||
|
||||
upload-scripts: set-storage
|
||||
$(call upload_script, ../../HorovodKeras/src/data_generator.py)
|
||||
$(call upload_script, ../../HorovodKeras/src/imagenet_keras_horovod.py)
|
||||
$(call upload_script, ../../HorovodTF/src/imagenet_estimator_tf_horovod.py)
|
||||
$(call upload_script, ../../HorovodTF/src/resnet_model.py)
|
||||
$(call upload_script, ../../HorovodPytorch/src/imagenet_pytorch_horovod.py)
|
||||
$(call upload_script, ../../CNTK/src/imagenet_cntk.py)
|
||||
$(call upload_script, ../../CNTK/src/resnet_models.py)
|
||||
$(call upload_script, ../../Pytorch/src/imagenet_pytorch_gloo.py)
|
||||
$(call upload_script, ../../common/timer.py)
|
||||
|
||||
upload-nodeprep-scripts: set-storage
|
||||
$(call upload_script, ../../cluster_config/docker.service)
|
||||
$(call upload_script, ../../cluster_config/nodeprep.sh)
|
||||
|
||||
create-workspace:
|
||||
az batchai workspace create -n $(WORKSPACE) -g $(GROUP_NAME)
|
||||
|
||||
create-experiment:
|
||||
az batchai experiment create -n $(EXPERIMENT) -g $(GROUP_NAME) -w $(WORKSPACE)
|
||||
|
||||
show-cluster:
|
||||
az batchai cluster show -n ${CLUSTER_NAME} -w $(WORKSPACE)
|
||||
|
||||
list-clusters:
|
||||
az batchai cluster list -w $(WORKSPACE) -o table
|
||||
|
||||
list-nodes:
|
||||
az batchai cluster node list -c ${CLUSTER_NAME} -w $(WORKSPACE)
|
||||
|
||||
list-jobs:
|
||||
az batchai job list -w $(WORKSPACE) -e $(EXPERIMENT) -o table
|
||||
|
||||
list-files:
|
||||
az batchai job file list -w $(WORKSPACE) -e $(EXPERIMENT) --j ${JOB_NAME} --output-directory-id stdouterr
|
||||
|
||||
stream-stdout:
|
||||
$(call stream_stdout, ${JOB_NAME})
|
||||
|
||||
stream-stderr:
|
||||
az batchai job file stream -w $(WORKSPACE) -e $(EXPERIMENT) --j ${JOB_NAME} --output-directory-id stdouterr -f stderr.txt
|
||||
|
||||
delete-job:
|
||||
$(call delete_job, ${JOB_NAME})
|
||||
|
||||
delete-cluster:
|
||||
az configure --defaults group=''
|
||||
az configure --defaults location=''
|
||||
az batchai cluster delete -w $(WORKSPACE) --name ${CLUSTER_NAME} -g ${GROUP_NAME} -y
|
||||
|
||||
delete: delete-cluster
|
||||
az batchai experiment delete -w $(WORKSPACE) --name ${experiment} -g ${GROUP_NAME} -y
|
||||
az batchai workspace delete -w ${WORKSPACE} -g ${GROUP_NAME} -y
|
||||
az group delete --name ${GROUP_NAME} -y
|
||||
|
||||
|
||||
setup: select-subscription create-resource-group create-workspace create-storage set-storage set-az-defaults create-fileshare create-directory upload-scripts create-cluster list-clusters create-experiment
|
||||
@echo "Cluster created"
|
||||
|
||||
#
|
||||
####### Submit Jobs ######
|
||||
#
|
||||
submit-all: submit-keras-intel32 submit-keras-intel16 submit-keras-intel8 submit-keras-intel4 \
|
||||
submit-tf-intel32 submit-tf-intel16 submit-tf-intel8 submit-tf-intel4 \
|
||||
submit-pytorch32 submit-pytorch16 submit-pytorch8 submit-pytorch4 \
|
||||
submit-pytorch_gloo32 submit-pytorch_gloo16 submit-pytorch_gloo8 submit-pytorch_gloo4 \
|
||||
submit-cntk32 submit-cntk16 submit-cntk8 submit-cntk4 \
|
||||
submit-keras-local submit-tf-local submit-pytorch-local submit_cntk_local
|
||||
|
||||
clean-jobs:
|
||||
$(call delete_job, tf-local)
|
||||
$(call delete_job, tf-intel-4)
|
||||
$(call delete_job, tf-intel-8)
|
||||
$(call delete_job, tf-intel-16)
|
||||
$(call delete_job, tf-intel-32)
|
||||
|
||||
$(call delete_job, keras-local)
|
||||
$(call delete_job, keras-intel-4)
|
||||
$(call delete_job, keras-intel-8)
|
||||
$(call delete_job, keras-intel-16)
|
||||
$(call delete_job, keras-intel-32)
|
||||
|
||||
$(call delete_job, pytorch-local)
|
||||
$(call delete_job, pytorch-4)
|
||||
$(call delete_job, pytorch-8)
|
||||
$(call delete_job, pytorch-16)
|
||||
$(call delete_job, pytorch-32)
|
||||
|
||||
$(call delete_job, pytorch_gloo-4)
|
||||
$(call delete_job, pytorch_gloo-8)
|
||||
$(call delete_job, pytorch_gloo-16)
|
||||
$(call delete_job, pytorch_gloo-32)
|
||||
|
||||
$(call delete_job, cntk-local)
|
||||
$(call delete_job, cntk-4)
|
||||
$(call delete_job, cntk-8)
|
||||
$(call delete_job, cntk-16)
|
||||
$(call delete_job, cntk-32)
|
||||
|
||||
####### Gather Results ######
|
||||
# TODO for PyTorch_Gloo
|
||||
|
||||
gather-results: results.json
|
||||
@echo "All results gathered"
|
||||
|
||||
results.json: pytorch_1gpulocal_$(GPU_TYPE)_local.results pytorch_4gpuopen_$(GPU_TYPE)_open.results \
|
||||
pytorch_8gpuopen_$(GPU_TYPE)_open.results pytorch_16gpuopen_$(GPU_TYPE)_open.results \
|
||||
pytorch_32gpuopen_$(GPU_TYPE)_open.results \
|
||||
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results \
|
||||
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results \
|
||||
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results \
|
||||
tf_1gpulocal_$(GPU_TYPE)_local.results tf_4gpuintel_$(GPU_TYPE)_intel.results \
|
||||
tf_8gpuintel_$(GPU_TYPE)_intel.results tf_16gpuintel_$(GPU_TYPE)_intel.results \
|
||||
tf_32gpuintel_$(GPU_TYPE)_intel.results \
|
||||
keras_1gpulocal_$(GPU_TYPE)_local.results keras_4gpuintel_$(GPU_TYPE)_intel.results \
|
||||
keras_8gpuintel_$(GPU_TYPE)_intel.results keras_16gpuintel_$(GPU_TYPE)_intel.results \
|
||||
keras_32gpuintel_$(GPU_TYPE)_intel.results \
|
||||
cntk_1gpulocal_$(GPU_TYPE)_local.results cntk_4gpuintel_$(GPU_TYPE)_intel.results \
|
||||
cntk_8gpuintel_$(GPU_TYPE)_intel.results cntk_16gpuintel_$(GPU_TYPE)_intel.results \
|
||||
cntk_32gpuintel_$(GPU_TYPE)_intel.results
|
||||
python ../parse_results.py
|
||||
|
||||
|
||||
pytorch_1gpulocal_$(GPU_TYPE)_local.results:
|
||||
$(call stream_stdout, pytorch-local)>pytorch_1gpulocal_$(GPU_TYPE)_local.results
|
||||
|
||||
pytorch_4gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch-4)>pytorch_4gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_8gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch-8)>pytorch_8gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_16gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch-16)>pytorch_16gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_32gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch-32)>pytorch_32gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
|
||||
pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results:
|
||||
$(call stream_stdout, pytorch_gloo-local)>pytorch_gloo_1gpulocal_$(GPU_TYPE)_local.results
|
||||
|
||||
pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch_gloo-4)>pytorch_gloo_4gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch_gloo-8)>pytorch_gloo_8gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch_gloo-16)>pytorch_gloo_16gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results:
|
||||
$(call stream_stdout, pytorch_gloo-32)>pytorch_gloo_32gpuopen_$(GPU_TYPE)_open.results
|
||||
|
||||
tf_1gpulocal_$(GPU_TYPE)_local.results:
|
||||
$(call stream_stdout, tf-local)>tf_1gpulocal_$(GPU_TYPE)_local.results
|
||||
|
||||
tf_4gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, tf-intel-4)>tf_4gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
tf_8gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, tf-intel-8)>tf_8gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
tf_16gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, tf-intel-16)>tf_16gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
tf_32gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, tf-intel-32)>tf_32gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
|
||||
|
||||
keras_1gpulocal_$(GPU_TYPE)_local.results:
|
||||
$(call stream_stdout, keras-local)>keras_1gpulocal_$(GPU_TYPE)_local.results
|
||||
|
||||
keras_4gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, keras-intel-4)>keras_4gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
keras_8gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, keras-intel-8)>keras_8gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
keras_16gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, keras-intel-16)>keras_16gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
keras_32gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, keras-intel-32)>keras_32gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
|
||||
|
||||
cntk_1gpulocal_$(GPU_TYPE)_local.results:
|
||||
$(call stream_stdout, cntk-local)>cntk_1gpulocal_$(GPU_TYPE)_local.results
|
||||
|
||||
cntk_4gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, cntk-intel-4)>cntk_4gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
cntk_8gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, cntk-intel-8)>cntk_8gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
cntk_16gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, cntk-intel-16)>cntk_16gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
cntk_32gpuintel_$(GPU_TYPE)_intel.results:
|
||||
$(call stream_stdout, cntk-intel-32)>cntk_32gpuintel_$(GPU_TYPE)_intel.results
|
||||
|
||||
|
||||
clean-results:
|
||||
rm results.json
|
||||
rm *.results
|
||||
|
||||
plot: results.json
|
||||
python ../produce_plot.py
|
||||
|
||||
.PHONY: help build push
|
|
@ -0,0 +1 @@
|
|||
{"Id": null, "Scope": "/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/amldistrg/providers/Microsoft.MachineLearningServices/workspaces/workspace"}
|
|
@ -0,0 +1,55 @@
|
|||
define PROJECT_HELP_MSG
|
||||
Makefile to control project aml_dist
|
||||
Usage:
|
||||
help show this message
|
||||
build build docker image to use as control plane
|
||||
bash run bash inside running docker container
|
||||
stop stop running docker container
|
||||
endef
|
||||
export PROJECT_HELP_MSG
|
||||
PWD:=$(shell pwd)
|
||||
PORT:=9999
|
||||
TBOARD_PORT:=6006
|
||||
# Name of the running container
NAME:=aml_dist
|
||||
|
||||
setup_environment_file:=--env-file .env
|
||||
include .env
|
||||
|
||||
local_code_volume:=-v $(PWD):/workspace
|
||||
volumes:=-v /tmp/azureml_runs:/tmp/azureml_runs \
|
||||
-v $(DATA):/data \
|
||||
-v ${HOME}/.bash_history:/root/.bash_history
|
||||
|
||||
|
||||
help:
|
||||
echo "$$PROJECT_HELP_MSG" | less
|
||||
|
||||
build:
|
||||
docker build -t $(IMAGE_NAME) -f control/Docker/dockerfile control/Docker
|
||||
|
||||
/tmp/azureml_runs:
|
||||
mkdir -p /tmp/azureml_runs
|
||||
|
||||
run: /tmp/azureml_runs
|
||||
# Start the docker container as a daemon
|
||||
docker run $(local_code_volume) $(volumes) $(setup_environment_file) \
|
||||
--name $(NAME) \
|
||||
-p $(PORT):$(PORT) \
|
||||
-p $(TBOARD_PORT):$(TBOARD_PORT) \
|
||||
-d \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-e HIST_FILE=/root/.bash_history \
|
||||
-it $(IMAGE_NAME)
|
||||
|
||||
# Attach to running container and create new tmux session
|
||||
docker exec -it $(NAME) bash -c "tmux new -s dist -n control"
|
||||
|
||||
|
||||
bash:
|
||||
docker exec -it $(NAME) bash -c "tmux a -t dist"
|
||||
|
||||
stop:
|
||||
docker stop $(NAME)
|
||||
docker rm $(NAME)
|
||||
|
||||
.PHONY: help build run bash stop
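# Typical workflow (assumption, based on the targets above): `make build` to
# create the control image, `make run` to start it and attach a tmux session,
# `make bash` to re-attach later, and `make stop` to tear the container down.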
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow-gpu==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,91 @@
|
|||
from invoke import task, Collection
|
||||
import os
|
||||
from config import load_config
|
||||
|
||||
|
||||
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
|
||||
env_values = load_config()
|
||||
|
||||
|
||||
def _benchmark_code_exists():
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
return os.path.exists(os.path.join(dir_path, "src", "tf_cnn_benchmarks.py"))
|
||||
|
||||
|
||||
@task
|
||||
def clone_benchmarks(c):
|
||||
"""Clones the Tensorflow benchmarks from https://github.com/tensorflow/benchmarks.git into the src folder
|
||||
"""
|
||||
if _benchmark_code_exists():
|
||||
return None
|
||||
c.run(
|
||||
"git clone -b cnn_tf_v1.12_compatible https://github.com/tensorflow/benchmarks.git"
|
||||
)
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
c.run(
|
||||
f"cp -r benchmarks/scripts/tf_cnn_benchmarks/* {os.path.join(dir_path, 'src')}"
|
||||
)
|
||||
c.run("rm -r benchmarks")
|
||||
|
||||
|
||||
@task(pre=[clone_benchmarks])
|
||||
def submit_tf_benchmark(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
|
||||
"""Submits TensorFlow benchmark job using synthetic data on remote cluster
|
||||
|
||||
Args:
|
||||
node_count (int, optional): The number of nodes to use in cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
|
||||
|
||||
Note:
|
||||
Runs ResNet 50 model with batch size of 256 and mixed precision
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("tf_benchmark")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"tf_cnn_benchmarks.py",
|
||||
{
|
||||
"--model": "resnet50",
|
||||
"--batch_size": 256,
|
||||
"--variable_update": "horovod",
|
||||
"--use_fp16": "",
|
||||
},
|
||||
node_count=node_count,
|
||||
dependencies_file="TensorFlow_benchmark/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task(pre=[clone_benchmarks])
|
||||
def submit_tf_benchmark_local(c):
|
||||
"""Submits TensorFlow benchmark job using synthetic data for local execution
|
||||
|
||||
Note:
|
||||
Runs ResNet 50 model with batch size of 256 and mixed precision
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("tf_benchmark")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"tf_cnn_benchmarks.py",
|
||||
{
|
||||
"--model": "resnet50",
|
||||
"--batch_size": 256,
|
||||
"--variable_update": "horovod",
|
||||
"--use_fp16": "",
|
||||
},
|
||||
dependencies_file="TensorFlow_benchmark/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
remote_collection = Collection("remote")
|
||||
remote_collection.add_task(submit_tf_benchmark, "synthetic")
|
||||
local_collection = Collection("local")
|
||||
local_collection.add_task(submit_tf_benchmark_local, "synthetic")
|
||||
submit_collection = Collection("submit", local_collection, remote_collection)
|
||||
namespace = Collection("tf_benchmark", submit_collection)
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow-gpu==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,7 @@
|
|||
.ipynb_checkpoints
|
||||
azureml-logs
|
||||
.azureml
|
||||
.git
|
||||
outputs
|
||||
azureml-setup
|
||||
docs
|
|
@ -0,0 +1,33 @@
|
|||
[loggers]
|
||||
keys=root,__main__,tensorflow
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler
|
||||
|
||||
[formatters]
|
||||
keys=simpleFormatter
|
||||
|
||||
[logger_root]
|
||||
level=WARNING
|
||||
handlers=consoleHandler
|
||||
|
||||
[logger___main__]
|
||||
level=INFO
|
||||
handlers=consoleHandler
|
||||
qualname=__main__
|
||||
propagate=0
|
||||
|
||||
[logger_tensorflow]
|
||||
level=INFO
|
||||
handlers=consoleHandler
|
||||
qualname=tensorflow
|
||||
propagate=0
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=INFO
|
||||
formatter=simpleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[formatter_simpleFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
|
@ -0,0 +1,152 @@
|
|||
"""Script to train model using TensorFlow and Horovod
|
||||
|
||||
Please complete the necessary functions and assign values to the required variables
|
||||
|
||||
|
||||
For instructions on using TensorFlow see: https://www.tensorflow.org/
|
||||
For instructions on using Horovod see: https://github.com/horovod/horovod
|
||||
|
||||
"""
|
||||
import logging.config
|
||||
import fire
|
||||
import os
|
||||
import tensorflow as tf
|
||||
|
||||
DISTRIBUTED = False
|
||||
LR = 0.001
|
||||
MOMENTUM = 0.9
|
||||
NUM_CLASSES = None  # TODO: set to the number of classes in your dataset
|
||||
|
||||
if DISTRIBUTED:
|
||||
import horovod.tensorflow as hvd
|
||||
|
||||
|
||||
def _get_rank():
|
||||
if DISTRIBUTED:
|
||||
try:
|
||||
return hvd.rank()
|
||||
except ValueError:  # hvd.rank() raises ValueError if Horovod has not been initialized
|
||||
return 0
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def _get_optimizer(params, is_distributed=DISTRIBUTED):
|
||||
if is_distributed:
|
||||
# Horovod: add Horovod Distributed Optimizer.
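# The learning rate below is scaled linearly with the number of workers,
# following the common linear-scaling heuristic for synchronous SGD.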
|
||||
return hvd.DistributedOptimizer(
|
||||
tf.train.MomentumOptimizer(
|
||||
learning_rate=params["learning_rate"] * hvd.size(),
|
||||
momentum=params["momentum"],
|
||||
)
|
||||
)
|
||||
else:
|
||||
return tf.train.MomentumOptimizer(
|
||||
learning_rate=params["learning_rate"], momentum=params["momentum"]
|
||||
)
|
||||
|
||||
|
||||
def build_network(features, mode, params):
|
||||
""" Build Model
|
||||
|
||||
Args:
|
||||
features:
|
||||
mode:
|
||||
params:
|
||||
|
||||
Returns:
|
||||
Model function
|
||||
|
||||
"""
|
||||
return None
|
||||
|
||||
|
||||
def model_fn(features, labels, mode, params):
|
||||
"""Model function that returns the estimator spec
|
||||
|
||||
Args:
|
||||
features: This is the x-arg from the input_fn.
|
||||
labels: This is the y-arg from the input_fn,
|
||||
see e.g. train_input_fn for these two.
|
||||
mode: Either TRAIN, EVAL, or PREDICT
|
||||
params: User-defined hyper-parameters, e.g. learning-rate.
|
||||
Returns:
|
||||
tf.estimator.EstimatorSpec: Estimator specification
|
||||
"""
|
||||
return None
|
||||
|
||||
|
||||
def input_fn():
|
||||
"""Input function which provides batches for train or eval.
|
||||
|
||||
Returns:
|
||||
A dataset that can be used for iteration.
|
||||
"""
|
||||
return None
|
||||
|
||||
|
||||
def _get_runconfig(is_distributed=DISTRIBUTED, save_checkpoints_steps=None):
|
||||
if is_distributed:
|
||||
# Horovod: pin GPU to be used to process local rank (one GPU per process)
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
config.gpu_options.visible_device_list = str(hvd.local_rank())
|
||||
|
||||
return tf.estimator.RunConfig(
|
||||
save_checkpoints_steps=save_checkpoints_steps,
|
||||
save_checkpoints_secs=None,
|
||||
session_config=config,
|
||||
log_step_count_steps=100,
|
||||
)
|
||||
else:
|
||||
return tf.estimator.RunConfig(
|
||||
save_checkpoints_steps=save_checkpoints_steps,
|
||||
save_checkpoints_secs=None,
|
||||
log_step_count_steps=100,
|
||||
)
|
||||
|
||||
|
||||
def _get_hooks(is_distributed=DISTRIBUTED):
|
||||
logger = logging.getLogger(__name__)
|
||||
if is_distributed:
|
||||
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
|
||||
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
|
||||
return [bcast_hook]
|
||||
else:
|
||||
return []
|
||||
|
||||
|
||||
def main():
|
||||
"""Train your model
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
if DISTRIBUTED:
|
||||
# Horovod: initialize Horovod.
|
||||
hvd.init()
|
||||
logger.info("Running Distributed")
|
||||
logger.info("Num GPUs: {:.3f}".format(hvd.size()))
|
||||
|
||||
input_function = input_fn
|
||||
|
||||
run_config = _get_runconfig()
|
||||
|
||||
params = {
|
||||
"learning_rate": LR,
|
||||
"momentum": MOMENTUM,
|
||||
"classes": NUM_CLASSES,
|
||||
}
|
||||
logger.info("Creating estimator with params: {}".format(params))
|
||||
model = tf.estimator.Estimator(
|
||||
model_fn=model_fn, params=params, config=run_config
|
||||
)
|
||||
|
||||
hooks = _get_hooks()
|
||||
|
||||
model.train(input_fn=input_function, hooks=hooks)
|
||||
|
||||
model.evaluate(input_fn=input_function)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
|
||||
fire.Fire(main)
|
|
@ -0,0 +1,126 @@
|
|||
""" This is an example template that you can use to create functions that you can call with invoke
|
||||
"""
|
||||
from invoke import task, Collection
|
||||
import os
|
||||
from config import load_config
|
||||
|
||||
|
||||
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
|
||||
env_values = load_config()
|
||||
|
||||
|
||||
@task
|
||||
def submit_local(c):
|
||||
"""This command isn't implemented please modify to use.
|
||||
|
||||
The call below will work for submitting jobs to execute locally on a GPU.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"You need to modify this call before being able to use it"
|
||||
)
|
||||
from aml_compute import TFExperimentCLI
|
||||
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"<YOUR-TRAINING-SCRIPT>",
|
||||
{"YOUR": "ARGS"},
|
||||
dependencies_file="TensorFlow/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_remote(c):
|
||||
"""This command isn't implemented please modify to use.
|
||||
|
||||
The call below will work for submitting jobs to execute on a remote cluster using GPUs.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"You need to modify this call before being able to use it"
|
||||
)
|
||||
from aml_compute import TFExperimentCLI
|
||||
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"<YOUR-TRAINING-SCRIPT>",
|
||||
{"YOUR": "ARGS"},
|
||||
node_count=4,
|
||||
dependencies_file="TensorFlow/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_images(c):
|
||||
"""This command isn't implemented please modify to use.
|
||||
|
||||
The call below will work for submitting jobs to execute on a remote cluster using GPUs.
|
||||
Notice that we are passing in a {datastore} parameter to the path. This tells the submit
|
||||
method that we want the location as mapped by the datastore to be inserted here. Upon
|
||||
execution the appropriate path will be prepended to the training_data_path and validation_data_path.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"You need to modify this call before being able to use it"
|
||||
)
|
||||
from aml_compute import TFExperimentCLI
|
||||
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"<YOUR-TRAINING-SCRIPT>",
|
||||
{
|
||||
"--training_data_path": "{datastore}/train",
|
||||
"--validation_data_path": "{datastore}/validation",
|
||||
"--epochs": "1",
|
||||
"--data_type": "images",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
node_count=4,
|
||||
dependencies_file="TensorFlow/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_images_local(c):
|
||||
"""This command isn't implemented please modify to use.
|
||||
|
||||
The call below will work for submitting jobs to execute locally on a GPU.
|
||||
Here we also map a volume to the docker container executing locally. This is the
|
||||
location where we tell our script to look for our training and validation data. Feel free to
|
||||
adjust the other arguments as required by your training script.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"You need to modify this call before being able to use it"
|
||||
)
|
||||
from aml_compute import TFExperimentCLI
|
||||
exp = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"<YOUR-TRAINING-SCRIPT>",
|
||||
{
|
||||
"--training_data_path": "/data/train",
|
||||
"--validation_data_path": "/data/validation",
|
||||
"--epochs": "1",
|
||||
"--data_type": "images",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
docker_args=["-v", f"{env_values['data']}:/data"],
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
remote_collection = Collection("remote")
|
||||
remote_collection.add_task(submit_images, "images")
|
||||
remote_collection.add_task(submit_remote, "synthetic")
|
||||
|
||||
local_collection = Collection("local")
|
||||
local_collection.add_task(submit_images_local, "images")
|
||||
local_collection.add_task(submit_local, "synthetic")
|
||||
|
||||
submit_collection = Collection("submit", local_collection, remote_collection)
|
||||
namespace = Collection("tf_experiment", submit_collection)
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,14 @@
|
|||
name: project_environment
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.5.2 and later.
|
||||
- python=3.6.2
|
||||
- pandas
|
||||
- numpy
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-defaults
|
||||
- tensorflow-gpu==1.12.0
|
||||
- horovod==0.15.2
|
||||
- fire
|
||||
- toolz
|
|
@ -0,0 +1,7 @@
|
|||
.ipynb_checkpoints
|
||||
azureml-logs
|
||||
.azureml
|
||||
.git
|
||||
outputs
|
||||
azureml-setup
|
||||
docs
|
|
@ -0,0 +1,209 @@
|
|||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import horovod.tensorflow as hvd
|
||||
import tensorflow as tf
|
||||
from toolz import pipe
|
||||
|
||||
import defaults
|
||||
import imagenet_preprocessing
|
||||
|
||||
_NOUNID_LOOKUP_FILE = "imagenet_nounid_to_class.json"
|
||||
|
||||
|
||||
def _create_nounid_lookup(nounid_lookup_file=_NOUNID_LOOKUP_FILE):
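"""Return a lookup function mapping an ImageNet noun ID (e.g. 'n01440764') to its integer class label."""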
|
||||
|
||||
with open(nounid_lookup_file, "r") as read_file:
|
||||
nounid_lookup_dict = json.load(read_file)
|
||||
|
||||
def _lookup(nounid):
|
||||
return nounid_lookup_dict[nounid]
|
||||
|
||||
return _lookup
|
||||
|
||||
|
||||
def _load_data(data_dir):
|
||||
filenames = []
|
||||
labels = []
|
||||
lookup = _create_nounid_lookup()
|
||||
for path_obj in Path(data_dir).glob("**/*.JPEG"):
|
||||
filenames.append(str(path_obj))
|
||||
labels.append(lookup(path_obj.parts[-2]))
|
||||
return filenames, labels
|
||||
|
||||
|
||||
|
||||
def _preprocess_labels(label):
|
||||
return tf.cast(label, dtype=tf.int32)
|
||||
|
||||
|
||||
def _preprocess_images(filename):
|
||||
return pipe(filename, tf.read_file)
|
||||
|
||||
|
||||
def _prep(filename, num_label):
|
||||
return tf.data.Dataset.from_tensor_slices(
|
||||
([_preprocess_images(filename)], [_preprocess_labels(num_label)])
|
||||
)
|
||||
|
||||
|
||||
def parse_record(
|
||||
image_buffer,
|
||||
label,
|
||||
is_training,
|
||||
dtype,
|
||||
data_format="channels_last",
|
||||
image_size=defaults.DEFAULT_IMAGE_SIZE,
|
||||
num_channels=defaults.NUM_CHANNELS,
|
||||
):
|
||||
"""Parses a record containing a training example of an image.
|
||||
The input record is parsed into a label and image, and the image is passed
|
||||
through preprocessing steps (cropping, flipping, and so on).
|
||||
|
||||
Args:
|
||||
image_buffer: Tensor tf.string containing the contents of a JPEG file.
|
||||
label: Tensor tf.int32 containing the label.
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
dtype: data type to use for images/features.
|
||||
data_format: the axis order of the matrix, channels_last NHWC or channels_first NCHW
|
||||
|
||||
Returns:
|
||||
Tuple with processed image tensor and one-hot-encoded label tensor.
|
||||
"""
|
||||
|
||||
|
||||
image = imagenet_preprocessing.preprocess_image(
|
||||
image_buffer=image_buffer,
|
||||
output_height=image_size,
|
||||
output_width=image_size,
|
||||
num_channels=num_channels,
|
||||
is_training=is_training,
|
||||
data_format=data_format,
|
||||
)
|
||||
image = tf.cast(image, dtype)
|
||||
|
||||
return image, label
|
||||
|
||||
|
||||
def process_image_dataset(dataset,
|
||||
is_training,
|
||||
batch_size,
|
||||
shuffle_buffer,
|
||||
parse_record_fn,
|
||||
num_epochs=1,
|
||||
dtype=tf.float32,
|
||||
data_format="channels_last",
|
||||
num_parallel_batches=1):
|
||||
"""Given a Dataset with raw records, return an iterator over the records.
|
||||
|
||||
Args:
|
||||
dataset: A Dataset representing raw records
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
batch_size: The number of samples per batch.
|
||||
shuffle_buffer: The buffer size to use when shuffling records. A larger
|
||||
value results in better randomness, but smaller values reduce startup
|
||||
time and use less memory.
|
||||
parse_record_fn: A function that takes a raw record and returns the
|
||||
corresponding (image, label) pair.
|
||||
num_epochs: The number of epochs to repeat the dataset.
|
||||
dtype: Data type to use for images/features.
|
||||
num_parallel_batches: Number of parallel batches for tf.data.
|
||||
|
||||
Returns:
|
||||
Dataset of (image, label) pairs ready for iteration.
|
||||
"""
|
||||
|
||||
# Prefetches a batch at a time to smooth out the time taken to load input
|
||||
# files for shuffling and processing.
|
||||
dataset = dataset.prefetch(buffer_size=batch_size)
|
||||
if is_training:
|
||||
# Shuffles records before repeating to respect epoch boundaries.
|
||||
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
|
||||
|
||||
# Repeats the dataset for the number of epochs to train.
|
||||
dataset = dataset.repeat(num_epochs)
|
||||
|
||||
# Parses the raw records into images and labels.
|
||||
dataset = dataset.apply(
|
||||
tf.data.experimental.map_and_batch(
|
||||
lambda image, label: parse_record_fn(image, label, is_training, dtype, data_format=data_format),
|
||||
batch_size=batch_size,
|
||||
num_parallel_batches=num_parallel_batches,
|
||||
drop_remainder=False))
|
||||
|
||||
|
||||
dataset = dataset.prefetch(buffer_size=100)
|
||||
|
||||
return dataset
|
||||
|
||||
|
||||
def input_fn(
|
||||
is_training,
|
||||
data_dir,
|
||||
batch_size,
|
||||
num_epochs=1,
|
||||
dtype=tf.float32,
|
||||
num_parallel_batches=1,
|
||||
parse_record_fn=parse_record,
|
||||
data_format="channels_last",
|
||||
distributed=False,
|
||||
file_shuffle_buffer=1000,
|
||||
data_shuffle_buffer=defaults.SHUFFLE_BUFFER,
|
||||
):
|
||||
"""Input function which provides batches for train or eval.
|
||||
|
||||
Args:
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
data_dir: The directory containing the input data.
|
||||
batch_size: The number of samples per batch.
|
||||
num_epochs: The number of epochs to repeat the dataset.
|
||||
dtype: Data type to use for images/features
|
||||
num_parallel_batches: Number of parallel batches for tf.data.
|
||||
parse_record_fn: Function to use for parsing the records.
|
||||
|
||||
Returns:
|
||||
A dataset that can be used for iteration.
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Reading data info from {data_dir}")
|
||||
|
||||
buffer_length = 1024
|
||||
parallel_num = 5
|
||||
|
||||
filenames, labels = _load_data(data_dir)
|
||||
logger.info(f"Found {len(filenames)} files")
|
||||
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
|
||||
|
||||
if is_training:
|
||||
# Shuffle the input files
|
||||
if distributed:
|
||||
dataset = dataset.shard(hvd.size(), hvd.rank())  # shard by global rank so each worker reads a distinct slice
|
||||
|
||||
dataset = dataset.shuffle(buffer_size=file_shuffle_buffer) # _NUM_TRAIN_FILES
|
||||
|
||||
# Convert to individual records.
|
||||
# cycle_length = 10 means 10 files will be read and deserialized in parallel.
|
||||
# This number is low enough to not cause too much contention on small systems
|
||||
# but high enough to provide the benefits of parallelization. You may want
|
||||
# to increase this number if you have a large number of CPU cores.
|
||||
dataset = dataset.apply(
|
||||
tf.data.experimental.parallel_interleave(
|
||||
_prep,
|
||||
cycle_length=parallel_num,
|
||||
buffer_output_elements=buffer_length,
|
||||
sloppy=True,
|
||||
)
|
||||
)
|
||||
|
||||
return process_image_dataset(
|
||||
dataset=dataset,
|
||||
is_training=is_training,
|
||||
batch_size=batch_size,
|
||||
shuffle_buffer=data_shuffle_buffer,
|
||||
parse_record_fn=parse_record_fn,
|
||||
num_epochs=num_epochs,
|
||||
dtype=dtype,
|
||||
num_parallel_batches=num_parallel_batches,
|
||||
data_format=data_format,
|
||||
)
|
|
@ -0,0 +1,52 @@
|
|||
import tensorflow as tf
|
||||
|
||||
|
||||
def get_synth_input_fn(height, width, num_channels, num_classes, dtype=tf.float32):
|
||||
"""Returns an input function that returns a dataset with random data.
|
||||
This input_fn returns a data set that iterates over a set of random data and
|
||||
bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
|
||||
copy is still included. This is used to find the upper throughput bound when
|
||||
tuning the full input pipeline.
|
||||
|
||||
Args:
|
||||
height: Integer height that will be used to create a fake image tensor.
|
||||
width: Integer width that will be used to create a fake image tensor.
|
||||
num_channels: Integer depth that will be used to create a fake image tensor.
|
||||
num_classes: Number of classes that should be represented in the fake labels
|
||||
tensor
|
||||
dtype: Data type for features/images.
|
||||
|
||||
Returns:
|
||||
An input_fn that can be used in place of a real one to return a dataset
|
||||
that can be used for iteration.
|
||||
"""
|
||||
|
||||
def input_fn(
|
||||
is_training, data_dir, batch_size, *args, data_format="channels_last", **kwargs
|
||||
):
|
||||
"""Returns dataset filled with random data."""
|
||||
# Synthetic input should be within [0, 255].
|
||||
if data_format == "channels_last":
|
||||
shape = [height, width, num_channels]
|
||||
else:
|
||||
shape = [num_channels, height, width]
|
||||
inputs = tf.random.truncated_normal(
|
||||
[batch_size] + shape,
|
||||
dtype=dtype,
|
||||
mean=127,
|
||||
stddev=60,
|
||||
name="synthetic_inputs",
|
||||
)
|
||||
|
||||
labels = tf.random.uniform(
|
||||
[batch_size],
|
||||
minval=0,
|
||||
maxval=num_classes - 1,
|
||||
dtype=tf.int32,
|
||||
name="synthetic_labels",
|
||||
)
|
||||
data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
|
||||
data = data.prefetch(buffer_size=1024)
|
||||
return data
|
||||
|
||||
return input_fn
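# Example (assumed usage): an ImageNet-shaped synthetic input_fn for throughput tests
#   input_fn = get_synth_input_fn(224, 224, 3, 1001)
#   dataset = input_fn(True, None, 64)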
|
|
@ -0,0 +1,218 @@
|
|||
import logging
|
||||
import os
|
||||
|
||||
import horovod.tensorflow as hvd
|
||||
import tensorflow as tf
|
||||
|
||||
import defaults
|
||||
import imagenet_preprocessing
|
||||
|
||||
|
||||
def get_filenames(is_training, data_dir, num_files=1014):
|
||||
"""Return filenames for dataset."""
|
||||
if is_training:
|
||||
return [
|
||||
os.path.join(data_dir, "train-%05d-of-01014" % i) for i in range(num_files)
|
||||
]
|
||||
else:
|
||||
return [
|
||||
os.path.join(data_dir, "validation-%05d-of-00128" % i) for i in range(128)
|
||||
]
|
||||
|
||||
|
||||
def _parse_example_proto(example_serialized):
|
||||
"""Parses an Example proto containing a training example of an image.
|
||||
The output of the build_image_data.py image preprocessing script is a dataset
|
||||
containing serialized Example protocol buffers. Each Example proto contains
|
||||
the following fields (values are included as examples):
|
||||
image/height: 462
|
||||
image/width: 581
|
||||
image/colorspace: 'RGB'
|
||||
image/channels: 3
|
||||
image/class/label: 615
|
||||
image/class/text: 'knee pad'
|
||||
image/format: 'JPEG'
|
||||
image/filename: 'ILSVRC2012_val_00041207.JPEG'
|
||||
image/encoded: <JPEG encoded string>
|
||||
|
||||
Args:
|
||||
example_serialized: scalar Tensor tf.string containing a serialized
|
||||
Example protocol buffer.
|
||||
|
||||
Returns:
|
||||
image_buffer: Tensor tf.string containing the contents of a JPEG file.
|
||||
label: Tensor tf.int32 containing the label.
|
||||
"""
|
||||
feature_map = {
|
||||
"image/encoded": tf.io.FixedLenFeature([], dtype=tf.string, default_value=""),
|
||||
"image/class/label": tf.io.FixedLenFeature(
|
||||
[], dtype=tf.int64, default_value=-1
|
||||
),
|
||||
"image/class/text": tf.io.FixedLenFeature(
|
||||
[], dtype=tf.string, default_value=""
|
||||
),
|
||||
}
|
||||
|
||||
features = tf.io.parse_single_example(
|
||||
serialized=example_serialized, features=feature_map
|
||||
)
|
||||
label = tf.cast(features["image/class/label"], dtype=tf.int32)
|
||||
return features["image/encoded"], label
|
||||
|
||||
|
||||
def parse_record(
|
||||
raw_record,
|
||||
is_training,
|
||||
dtype,
|
||||
data_format="channels_last",
|
||||
image_size=defaults.DEFAULT_IMAGE_SIZE,
|
||||
num_channels=defaults.NUM_CHANNELS,
|
||||
):
|
||||
"""Parses a record containing a training example of an image.
|
||||
The input record is parsed into a label and image, and the image is passed
|
||||
through preprocessing steps (cropping, flipping, and so on).
|
||||
|
||||
Args:
|
||||
raw_record: scalar Tensor tf.string containing a serialized
|
||||
Example protocol buffer.
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
dtype: data type to use for images/features.
|
||||
data_format: the axis order of the matrix, channels_last NHWC or channels_first NCHW
|
||||
|
||||
Returns:
|
||||
Tuple with processed image tensor and one-hot-encoded label tensor.
|
||||
"""
|
||||
image_buffer, label = _parse_example_proto(raw_record)
|
||||
|
||||
image = imagenet_preprocessing.preprocess_image(
|
||||
image_buffer=image_buffer,
|
||||
output_height=image_size,
|
||||
output_width=image_size,
|
||||
num_channels=num_channels,
|
||||
is_training=is_training,
|
||||
data_format=data_format,
|
||||
)
|
||||
image = tf.cast(image, dtype)
|
||||
|
||||
return image, label
|
||||
|
||||
|
||||
def input_fn(
|
||||
is_training,
|
||||
data_dir,
|
||||
batch_size,
|
||||
num_epochs=1,
|
||||
dtype=tf.float32,
|
||||
num_parallel_batches=1,
|
||||
parse_record_fn=parse_record,
|
||||
data_format="channels_last",
|
||||
distributed=False,
|
||||
file_shuffle_buffer=10,
|
||||
data_shuffle_buffer=defaults.SHUFFLE_BUFFER,
|
||||
):
|
||||
"""Input function which provides batches for train or eval.
|
||||
|
||||
Args:
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
data_dir: The directory containing the input data.
|
||||
batch_size: The number of samples per batch.
|
||||
num_epochs: The number of epochs to repeat the dataset.
|
||||
dtype: Data type to use for images/features
|
||||
num_parallel_batches: Number of parallel batches for tf.data.
|
||||
parse_record_fn: Function to use for parsing the records.
|
||||
|
||||
Returns:
|
||||
A dataset that can be used for iteration.
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Reading data info from {data_dir}")
|
||||
filenames = get_filenames(is_training, data_dir)
|
||||
for f in filenames:
|
||||
if not os.path.exists(f):
|
||||
raise ValueError(f"File doesn't exist: {f}")
|
||||
logger.info(f"Found {len(filenames)} files")
|
||||
dataset = tf.data.Dataset.from_tensor_slices(filenames)
|
||||
|
||||
if is_training:
|
||||
# Shuffle the input files
|
||||
if distributed:
|
||||
dataset = dataset.shard(hvd.size(), hvd.rank())  # shard by global rank so each worker reads a distinct slice
|
||||
|
||||
dataset = dataset.shuffle(buffer_size=file_shuffle_buffer) # _NUM_TRAIN_FILES
|
||||
|
||||
# Convert to individual records.
|
||||
# cycle_length controls how many files will be read and deserialized in parallel.
|
||||
# This number is low enough to not cause too much contention on small systems
|
||||
# but high enough to provide the benefits of parallelization. You may want
|
||||
# to increase this number if you have a large number of CPU cores.
|
||||
dataset = dataset.apply(
|
||||
tf.data.experimental.parallel_interleave(
|
||||
tf.data.TFRecordDataset,
|
||||
cycle_length=num_parallel_batches,
|
||||
buffer_output_elements=10,
|
||||
)
|
||||
)
|
||||
|
||||
return process_record_dataset(
|
||||
dataset=dataset,
|
||||
is_training=is_training,
|
||||
batch_size=batch_size,
|
||||
shuffle_buffer=data_shuffle_buffer,
|
||||
parse_record_fn=parse_record_fn,
|
||||
num_epochs=num_epochs,
|
||||
dtype=dtype,
|
||||
num_parallel_batches=num_parallel_batches,
|
||||
data_format=data_format,
|
||||
)
|
||||
|
||||
|
||||
def process_record_dataset(dataset,
|
||||
is_training,
|
||||
batch_size,
|
||||
shuffle_buffer,
|
||||
parse_record_fn,
|
||||
num_epochs=1,
|
||||
dtype=tf.float32,
|
||||
data_format="channels_last",
|
||||
num_parallel_batches=1):
|
||||
"""Given a Dataset with raw records, return an iterator over the records.
|
||||
|
||||
Args:
|
||||
dataset: A Dataset representing raw records
|
||||
is_training: A boolean denoting whether the input is for training.
|
||||
batch_size: The number of samples per batch.
|
||||
shuffle_buffer: The buffer size to use when shuffling records. A larger
|
||||
value results in better randomness, but smaller values reduce startup
|
||||
time and use less memory.
|
||||
parse_record_fn: A function that takes a raw record and returns the
|
||||
corresponding (image, label) pair.
|
||||
num_epochs: The number of epochs to repeat the dataset.
|
||||
dtype: Data type to use for images/features.
|
||||
num_parallel_batches: Number of parallel batches for tf.data.
|
||||
|
||||
Returns:
|
||||
Dataset of (image, label) pairs ready for iteration.
|
||||
"""
|
||||
|
||||
# Prefetches a batch at a time to smooth out the time taken to load input
|
||||
# files for shuffling and processing.
|
||||
dataset = dataset.prefetch(buffer_size=batch_size)
|
||||
if is_training:
|
||||
# Shuffles records before repeating to respect epoch boundaries.
|
||||
dataset = dataset.shuffle(buffer_size=shuffle_buffer)
|
||||
|
||||
# Repeats the dataset for the number of epochs to train.
|
||||
dataset = dataset.repeat(num_epochs)
|
||||
|
||||
# Parses the raw records into images and labels.
|
||||
dataset = dataset.apply(
|
||||
tf.data.experimental.map_and_batch(
|
||||
lambda value: parse_record_fn(value, is_training, dtype, data_format=data_format),
|
||||
batch_size=batch_size,
|
||||
num_parallel_batches=num_parallel_batches,
|
||||
drop_remainder=False))
|
||||
|
||||
|
||||
dataset = dataset.prefetch(buffer_size=1024)
|
||||
|
||||
return dataset
|
|
@ -0,0 +1,25 @@
|
|||
import os
|
||||
|
||||
from utils import str_to_bool
|
||||
|
||||
LR = 0.001
|
||||
EPOCHS = int(os.getenv("EPOCHS", 5))  # coerce to int in case the env var is set as a string
|
||||
_BATCHSIZE = 64
|
||||
R_MEAN = 123.68
|
||||
G_MEAN = 116.78
|
||||
B_MEAN = 103.94
|
||||
BUFFER = 256
|
||||
DEFAULT_IMAGE_SIZE = 224
|
||||
NUM_CHANNELS = 3
|
||||
NUM_CLASSES = 1001
|
||||
NUM_IMAGES = {"train": 1_281_167, "validation": 50000}
|
||||
NUM_TRAIN_FILES = 1024
|
||||
SHUFFLE_BUFFER = 1000
|
||||
|
||||
DATA_LENGTH = int(
|
||||
os.getenv("FAKE_DATA_LENGTH", 1_281_167)
|
||||
) # How much fake data to simulate, default to size of imagenet dataset
|
||||
|
||||
DATASET_NAME = "ImageNet"
|
||||
|
||||
DISTRIBUTED = str_to_bool(os.getenv("DISTRIBUTED", "False"))
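# Example (assumed usage): export DISTRIBUTED=True in the environment before
# launching training to enable the Horovod code paths keyed off this flag.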
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,222 @@
|
|||
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Provides utilities to preprocess images.
|
||||
|
||||
Training images are sampled using the provided bounding boxes, and subsequently
|
||||
cropped to the sampled bounding box. Images are additionally flipped randomly,
|
||||
then resized to the target output size (without aspect-ratio preservation).
|
||||
|
||||
Images used during evaluation are resized (with aspect-ratio preservation) and
|
||||
centrally cropped.
|
||||
|
||||
All images undergo mean color subtraction.
|
||||
|
||||
Note that these steps are colloquially referred to as "ResNet preprocessing,"
|
||||
and they differ from "VGG preprocessing," which does not use bounding boxes
|
||||
and instead does an aspect-preserving resize followed by random crop during
|
||||
training. (These both differ from "Inception preprocessing," which introduces
|
||||
color distortion steps.)
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
_R_MEAN = 123.68
|
||||
_G_MEAN = 116.78
|
||||
_B_MEAN = 103.94
|
||||
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
|
||||
|
||||
# The lower bound for the smallest side of the image for aspect-preserving
|
||||
# resizing. For example, if an image is 500 x 1000, it will be resized to
|
||||
# _RESIZE_MIN x (_RESIZE_MIN * 2).
|
||||
_RESIZE_MIN = 256
|
||||
|
||||
|
||||
def _central_crop(image, crop_height, crop_width):
|
||||
"""Performs central crops of the given image list.
|
||||
|
||||
Args:
|
||||
image: a 3-D image tensor
|
||||
crop_height: the height of the image following the crop.
|
||||
crop_width: the width of the image following the crop.
|
||||
|
||||
Returns:
|
||||
3-D tensor with cropped image.
|
||||
"""
|
||||
shape = tf.shape(input=image)
|
||||
height, width = shape[0], shape[1]
|
||||
|
||||
amount_to_be_cropped_h = height - crop_height
|
||||
crop_top = amount_to_be_cropped_h // 2
|
||||
amount_to_be_cropped_w = width - crop_width
|
||||
crop_left = amount_to_be_cropped_w // 2
|
||||
return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1])
|
||||
|
||||
|
||||
def _mean_image_subtraction(image, means, num_channels):
|
||||
"""Subtracts the given means from each image channel.
|
||||
|
||||
For example:
|
||||
means = [123.68, 116.779, 103.939]
|
||||
image = _mean_image_subtraction(image, means)
|
||||
|
||||
Note that the rank of `image` must be known.
|
||||
|
||||
Args:
|
||||
image: a tensor of size [height, width, C].
|
||||
means: a C-vector of values to subtract from each channel.
|
||||
num_channels: number of color channels in the image that will be distorted.
|
||||
|
||||
Returns:
|
||||
the centered image.
|
||||
|
||||
Raises:
|
||||
ValueError: If the rank of `image` is unknown, if `image` has a rank other
|
||||
than three or if the number of channels in `image` doesn't match the
|
||||
number of values in `means`.
|
||||
"""
|
||||
if image.get_shape().ndims != 3:
|
||||
raise ValueError("Input must be of size [height, width, C>0]")
|
||||
|
||||
if len(means) != num_channels:
|
||||
raise ValueError("len(means) must match the number of channels")
|
||||
|
||||
# We have a 1-D tensor of means; convert to 3-D.
|
||||
means = tf.expand_dims(tf.expand_dims(means, 0), 0)
|
||||
|
||||
return image - means
|
||||
|
||||
|
||||
def _smallest_size_at_least(height, width, resize_min):
|
||||
"""Computes new shape with the smallest side equal to `smallest_side`.
|
||||
|
||||
Computes new shape with the smallest side equal to `smallest_side` while
|
||||
preserving the original aspect ratio.
|
||||
|
||||
Args:
|
||||
height: an int32 scalar tensor indicating the current height.
|
||||
width: an int32 scalar tensor indicating the current width.
|
||||
resize_min: A python integer or scalar `Tensor` indicating the size of
|
||||
the smallest side after resize.
|
||||
|
||||
Returns:
|
||||
new_height: an int32 scalar tensor indicating the new height.
|
||||
new_width: an int32 scalar tensor indicating the new width.
|
||||
"""
|
||||
resize_min = tf.cast(resize_min, tf.float32)
|
||||
|
||||
# Convert to floats to make subsequent calculations go smoothly.
|
||||
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
|
||||
|
||||
smaller_dim = tf.minimum(height, width)
|
||||
scale_ratio = resize_min / smaller_dim
|
||||
|
||||
# Convert back to ints to make heights and widths that TF ops will accept.
|
||||
new_height = tf.cast(height * scale_ratio, tf.int32)
|
||||
new_width = tf.cast(width * scale_ratio, tf.int32)
|
||||
|
||||
return new_height, new_width
|
||||
|
||||
|
||||
def _aspect_preserving_resize(image, resize_min):
|
||||
"""Resize images preserving the original aspect ratio.
|
||||
|
||||
Args:
|
||||
image: A 3-D image `Tensor`.
|
||||
resize_min: A python integer or scalar `Tensor` indicating the size of
|
||||
the smallest side after resize.
|
||||
|
||||
Returns:
|
||||
resized_image: A 3-D tensor containing the resized image.
|
||||
"""
|
||||
shape = tf.shape(input=image)
|
||||
height, width = shape[0], shape[1]
|
||||
|
||||
new_height, new_width = _smallest_size_at_least(height, width, resize_min)
|
||||
|
||||
return _resize_image(image, new_height, new_width)
|
||||
|
||||
|
||||
def _resize_image(image, height, width):
|
||||
"""Simple wrapper around tf.resize_images.
|
||||
|
||||
This is primarily to make sure we use the same `ResizeMethod` and other
|
||||
details each time.
|
||||
|
||||
Args:
|
||||
image: A 3-D image `Tensor`.
|
||||
height: The target height for the resized image.
|
||||
width: The target width for the resized image.
|
||||
|
||||
Returns:
|
||||
resized_image: A 3-D tensor containing the resized image. The first two
|
||||
dimensions have the shape [height, width].
|
||||
"""
|
||||
|
||||
return tf.image.resize_images(
|
||||
image,
|
||||
[height, width],
|
||||
method=tf.image.ResizeMethod.BILINEAR,
|
||||
align_corners=False,
|
||||
)
|
||||
|
||||
|
||||
def preprocess_image(
|
||||
image_buffer,
|
||||
output_height,
|
||||
output_width,
|
||||
num_channels,
|
||||
is_training=False,
|
||||
data_format="channels_last",
|
||||
):
|
||||
"""Preprocesses the given image.
|
||||
|
||||
Preprocessing includes decoding, cropping, and resizing for both training
|
||||
and eval images. Training preprocessing, however, introduces some random
|
||||
distortion of the image to improve accuracy.
|
||||
|
||||
Args:
|
||||
image_buffer: scalar string Tensor representing the raw JPEG image buffer.
|
||||
output_height: The height of the image after preprocessing.
|
||||
output_width: The width of the image after preprocessing.
|
||||
num_channels: Integer depth of the image buffer for decoding.
|
||||
is_training: `True` if we're preprocessing the image for training and
|
||||
`False` otherwise.
|
||||
|
||||
Returns:
|
||||
A preprocessed image.
|
||||
"""
|
||||
if is_training:
|
||||
# For training, we want to randomize some of the distortions.
|
||||
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
|
||||
image = _resize_image(image, output_height, output_width)
|
||||
else:
|
||||
# For validation, we want to decode, resize, then just crop the middle.
|
||||
image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
|
||||
image = _aspect_preserving_resize(image, _RESIZE_MIN)
|
||||
image = _central_crop(image, output_height, output_width)
|
||||
|
||||
image.set_shape([output_height, output_width, num_channels])
|
||||
image = _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
|
||||
|
||||
if data_format == "channels_first":
|
||||
image = tf.transpose(image, [2, 0, 1]) # Transform from NHWC to NCHW
|
||||
image.set_shape([num_channels, output_height, output_width])
|
||||
|
||||
return image
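# Example (assumed usage): turn a raw JPEG buffer into a 224x224 training image
#   image = preprocess_image(image_buffer, 224, 224, 3, is_training=True)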
|
|
@ -0,0 +1,33 @@
|
|||
[loggers]
|
||||
keys=root,__main__,tensorflow
|
||||
|
||||
[handlers]
|
||||
keys=consoleHandler
|
||||
|
||||
[formatters]
|
||||
keys=simpleFormatter
|
||||
|
||||
[logger_root]
|
||||
level=INFO
|
||||
handlers=consoleHandler
|
||||
|
||||
[logger___main__]
|
||||
level=DEBUG
|
||||
handlers=consoleHandler
|
||||
qualname=__main__
|
||||
propagate=0
|
||||
|
||||
[logger_tensorflow]
|
||||
level=DEBUG
|
||||
handlers=consoleHandler
|
||||
qualname=tensorflow
|
||||
propagate=0
|
||||
|
||||
[handler_consoleHandler]
|
||||
class=StreamHandler
|
||||
level=DEBUG
|
||||
formatter=simpleFormatter
|
||||
args=(sys.stdout,)
|
||||
|
||||
[formatter_simpleFormatter]
|
||||
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
|
@ -0,0 +1,308 @@
|
|||
""" This the script is the main entry point for training ResNet model using TensorFlow with Horovod
|
||||
|
||||
"""
|
||||
import logging
|
||||
import logging.config
|
||||
import os
|
||||
|
||||
import fire
|
||||
import tensorflow as tf
|
||||
|
||||
from data.synthetic import get_synth_input_fn
|
||||
from data import tfrecords, images
|
||||
from resnet_model import resnet_v1
|
||||
from timer import Timer
|
||||
from utils import ExamplesPerSecondHook
|
||||
import defaults
|
||||
|
||||
|
||||
if defaults.DISTRIBUTED:
|
||||
import horovod.tensorflow as hvd
|
||||
|
||||
|
||||
def _get_rank():
|
||||
if defaults.DISTRIBUTED:
|
||||
try:
|
||||
return hvd.rank()
|
||||
except ValueError:  # hvd.rank() raises ValueError if Horovod has not been initialized
|
||||
return 0
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
# Model definition and optimizer
|
||||
###############################################################################
|
||||
|
||||
|
||||
def _get_optimizer(params, is_distributed=defaults.DISTRIBUTED):
|
||||
if is_distributed:
|
||||
# Horovod: add Horovod Distributed Optimizer.
|
||||
return hvd.DistributedOptimizer(
|
||||
tf.train.MomentumOptimizer(
|
||||
learning_rate=params["learning_rate"] * hvd.size(),
|
||||
momentum=params["momentum"],
|
||||
)
|
||||
)
|
||||
else:
|
||||
return tf.train.MomentumOptimizer(
|
||||
learning_rate=params["learning_rate"], momentum=params["momentum"]
|
||||
)
|
||||
|
||||
|
||||
def build_network(features, mode, params):
|
||||
""" Build ResNet50 Model
|
||||
|
||||
Args:
|
||||
features:
|
||||
mode:
|
||||
params:
|
||||
|
||||
Returns:
|
||||
Model function
|
||||
"""
|
||||
network = resnet_v1(
|
||||
resnet_depth=50,
|
||||
num_classes=params["classes"],
|
||||
data_format=params["data_format"],
|
||||
)
|
||||
return network(inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
|
||||
|
||||
|
||||
def model_fn(features, labels, mode, params):
|
||||
"""Model function that returns the estimator spec
|
||||
|
||||
Args:
|
||||
features: This is the x-arg from the input_fn.
|
||||
labels: This is the y-arg from the input_fn,
|
||||
see e.g. train_input_fn for these two.
|
||||
mode: Either TRAIN, EVAL, or PREDICT
|
||||
params: User-defined hyper-parameters, e.g. learning-rate.
|
||||
Returns:
|
||||
tf.estimator.EstimatorSpec: Estimator specification
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("Creating model in {} mode".format(mode))
|
||||
|
||||
logits = build_network(features, mode, params)
|
||||
|
||||
# Classification output of the neural network.
|
||||
y_pred_cls = tf.argmax(logits, axis=1)
|
||||
|
||||
if mode == tf.estimator.ModeKeys.PREDICT:
|
||||
# Softmax output of the neural network.
|
||||
y_pred = tf.nn.softmax(logits=logits)
|
||||
|
||||
predictions = {
|
||||
"class_ids": y_pred_cls,
|
||||
"probabilities": y_pred,
|
||||
"logits": logits,
|
||||
}
|
||||
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
|
||||
|
||||
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
|
||||
logits=logits, labels=labels
|
||||
)
|
||||
|
||||
loss = tf.reduce_mean(cross_entropy, name="loss")
|
||||
|
||||
accuracy = tf.metrics.accuracy(labels=labels, predictions=y_pred_cls, name="acc_op")
|
||||
metrics = {"accuracy": accuracy}
|
||||
|
||||
if mode == tf.estimator.ModeKeys.EVAL:
|
||||
eval_hook_list = []
|
||||
eval_tensors_log = {"acc": accuracy[1]}
|
||||
eval_hook_list.append(
|
||||
tf.train.LoggingTensorHook(tensors=eval_tensors_log, every_n_iter=100)
|
||||
)
|
||||
|
||||
return tf.estimator.EstimatorSpec(
|
||||
mode=mode,
|
||||
eval_metric_ops=metrics,
|
||||
loss=loss,
|
||||
evaluation_hooks=eval_hook_list,
|
||||
)
|
||||
|
||||
optimizer = _get_optimizer(params)
|
||||
|
||||
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
|
||||
|
||||
train_hook_list = []
|
||||
train_tensors_log = {"loss": loss, "acc": accuracy[1]}
|
||||
train_hook_list.append(
|
||||
tf.train.LoggingTensorHook(tensors=train_tensors_log, every_n_iter=100)
|
||||
)
|
||||
|
||||
return tf.estimator.EstimatorSpec(
|
||||
mode=mode, loss=loss, train_op=train_op, training_hooks=train_hook_list
|
||||
)
|
||||
|
||||
|
||||
def _get_runconfig(is_distributed=defaults.DISTRIBUTED, save_checkpoints_steps=None):
|
||||
if is_distributed:
|
||||
# Horovod: pin GPU to be used to process local rank (one GPU per process)
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
config.gpu_options.visible_device_list = str(hvd.local_rank())
|
||||
|
||||
return tf.estimator.RunConfig(
|
||||
save_checkpoints_steps=save_checkpoints_steps,
|
||||
save_checkpoints_secs=None,
|
||||
session_config=config,
|
||||
log_step_count_steps=100,
|
||||
)
|
||||
else:
|
||||
return tf.estimator.RunConfig(
|
||||
save_checkpoints_steps=save_checkpoints_steps,
|
||||
save_checkpoints_secs=None,
|
||||
log_step_count_steps=100,
|
||||
)
|
||||
|
||||
|
||||
def _get_hooks(batch_size, is_distributed=defaults.DISTRIBUTED):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if is_distributed:
|
||||
exps_hook = ExamplesPerSecondHook(batch_size * hvd.size())
|
||||
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
|
||||
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
|
||||
return [bcast_hook, exps_hook]
|
||||
else:
|
||||
exps_hook = ExamplesPerSecondHook(batch_size)
|
||||
return [exps_hook]
|
||||
|
||||
|
||||
def _is_master(is_distributed=defaults.DISTRIBUTED):
|
||||
if is_distributed:
|
||||
if hvd.rank() == 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def _log_summary(total_images, batch_size, duration):
|
||||
logger = logging.getLogger(__name__)
|
||||
images_per_second = total_images / duration
|
||||
logger.info("Data length: {}".format(total_images))
|
||||
logger.info("Total duration: {:.3f}".format(duration))
|
||||
logger.info("Total images/sec: {:.3f}".format(images_per_second))
|
||||
logger.info(
|
||||
"Batch size: (Per GPU {}: Total {})".format(
|
||||
batch_size, hvd.size() * batch_size if defaults.DISTRIBUTED else batch_size
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
"Distributed: {}".format("True" if defaults.DISTRIBUTED else "False")
|
||||
)
|
||||
logger.info(
|
||||
"Num GPUs: {:.3f}".format(hvd.size() if defaults.DISTRIBUTED else 1)
|
||||
)
|
||||
|
||||
|
||||
def main(
|
||||
training_data_path=None,
|
||||
validation_data_path=None,
|
||||
save_filepath="logs",
|
||||
epochs=defaults.EPOCHS,
|
||||
batch_size=defaults._BATCHSIZE,
|
||||
max_steps=None,
|
||||
save_checkpoints_steps=None,
|
||||
data_format="channels_last",
|
||||
momentum=0.9,
|
||||
data_type="tfrecords"
|
||||
):
|
||||
"""Run train and evaluation loop
|
||||
|
||||
Args:
|
||||
training_data_path: Location of training data
|
||||
validation_data_path: Location of validation data
|
||||
save_filepath: Location where the checkpoint and events files are saved
|
||||
epochs: Number of epochs to run the training for
|
||||
batch_size: Number of images to run in a mini-batch
|
||||
max_steps: Maximum number of steps to run for training and validation. This will override the epochs parameter
|
||||
save_checkpoints_steps: Number of steps between checkpoints
|
||||
data_format: The axis order of the data, either channels_last (NHWC) or channels_first (NCHW)
|
||||
momentum: Momentum term for tf.train.MomentumOptimizer
|
||||
data_type: The format that the data is in, valid values are 'images' and 'tfrecords'
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
if defaults.DISTRIBUTED:
|
||||
# Horovod: initialize Horovod.
|
||||
hvd.init()
|
||||
logger.info("Runnin Distributed")
|
||||
logger.info("Num GPUs: {:.3f}".format(hvd.size()))
|
||||
|
||||
logger.info("Tensorflow version {}".format(tf.__version__))
|
||||
if training_data_path is None:
|
||||
input_function = get_synth_input_fn(
|
||||
defaults.DEFAULT_IMAGE_SIZE,
|
||||
defaults.DEFAULT_IMAGE_SIZE,
|
||||
defaults.NUM_CHANNELS,
|
||||
defaults.NUM_CLASSES,
|
||||
)
|
||||
else:
|
||||
input_function = tfrecords.input_fn if "tfrecords" in data_type else images.input_fn
|
||||
|
||||
run_config = _get_runconfig(save_checkpoints_steps=save_checkpoints_steps)
|
||||
if (defaults.DISTRIBUTED and hvd.rank() == 0) or not defaults.DISTRIBUTED:
|
||||
model_dir = save_filepath
|
||||
else:
|
||||
model_dir = "."
|
||||
|
||||
params = {
|
||||
"learning_rate": defaults.LR,
|
||||
"momentum": momentum,
|
||||
"classes": defaults.NUM_CLASSES,
|
||||
"data_format": data_format,
|
||||
}
|
||||
logger.info("Creating estimator with params: {}".format(params))
|
||||
model = tf.estimator.Estimator(
|
||||
model_fn=model_fn, params=params, model_dir=model_dir, config=run_config
|
||||
)
|
||||
|
||||
hooks = _get_hooks(batch_size)
|
||||
num_gpus = hvd.size() if defaults.DISTRIBUTED else 1
|
||||
|
||||
def train_input_fn():
|
||||
return input_function(
|
||||
True,
|
||||
training_data_path,
|
||||
batch_size,
|
||||
num_epochs=epochs,
|
||||
data_format=data_format,
|
||||
num_parallel_batches=4,
|
||||
distributed=defaults.DISTRIBUTED
|
||||
)
|
||||
|
||||
with Timer(output=logger.info, prefix="Training") as t:
|
||||
logger.info("Training...")
|
||||
model.train(input_fn=train_input_fn, max_steps=max_steps, hooks=hooks)
|
||||
|
||||
if max_steps is not None:
|
||||
total_images = max_steps * batch_size * num_gpus
|
||||
else:
|
||||
total_images = epochs * defaults.NUM_IMAGES["train"]
|
||||
|
||||
_log_summary(total_images, batch_size, t.elapsed)
|
||||
|
||||
if _is_master() and validation_data_path is not None:
|
||||
|
||||
def validation_input_fn():
|
||||
return input_function(
|
||||
False,
|
||||
validation_data_path,
|
||||
batch_size,
|
||||
num_epochs=1,
|
||||
data_format=data_format,
|
||||
num_parallel_batches=4,
|
||||
)
|
||||
|
||||
with Timer(output=logger.info, prefix="Testing"):
|
||||
logger.info("Testing...")
|
||||
model.evaluate(input_fn=validation_input_fn, steps=max_steps)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
|
||||
fire.Fire(main)
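# Example invocation (illustrative; flags mirror the main() signature above
# and the data paths are hypothetical):
#   python resnet_main.py --training_data_path /data/tfrecords/train \
#       --validation_data_path /data/tfrecords/validation \
#       --epochs 1 --data_type tfrecords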
|
|
@@ -0,0 +1,149 @@
|
|||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Contains utility and supporting functions for ResNet.
|
||||
|
||||
This module contains ResNet code which does not directly build layers. This
|
||||
includes dataset management, hyperparameter and optimizer code, and argument
|
||||
parsing. Code for defining the ResNet layers can be found in resnet_model.py.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
|
||||
# pylint: disable=g-bad-import-order
|
||||
import tensorflow as tf
|
||||
import imagenet_preprocessing
|
||||
|
||||
|
||||
################################################################################
|
||||
# Functions for input processing.
|
||||
################################################################################
|
||||
|
||||
|
||||
def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
|
||||
"""Serving input fn for raw jpeg images."""
|
||||
|
||||
def _preprocess_image(image_bytes):
|
||||
"""Preprocess a single raw image."""
|
||||
# Bounding box around the whole image.
|
||||
bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
|
||||
height, width, num_channels = image_shape
|
||||
image = imagenet_preprocessing.preprocess_image(
|
||||
image_bytes, bbox, height, width, num_channels, is_training=False)
|
||||
return image
|
||||
|
||||
image_bytes_list = tf.compat.v1.placeholder(
|
||||
shape=[None], dtype=tf.string, name='input_tensor')
|
||||
images = tf.map_fn(
|
||||
_preprocess_image, image_bytes_list, back_prop=False, dtype=dtype)
|
||||
return tf.estimator.export.TensorServingInputReceiver(
|
||||
images, {'image_bytes': image_bytes_list})
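# Illustrative usage (a sketch, assuming a trained classifier Estimator and
# the TF 1.x export API): export a SavedModel that accepts raw JPEG bytes.
#   classifier.export_savedmodel(
#       "export/", lambda: image_bytes_serving_input_fn((224, 224, 3)))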
|
||||
|
||||
|
||||
def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
|
||||
"""Override flags and set env_vars for performance.
|
||||
|
||||
These settings exist to test the difference between using stock settings
|
||||
and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
|
||||
squeeze a few extra examples per second. These settings are defaulted to the
|
||||
current platform of interest, which changes over time.
|
||||
|
||||
On systems with small numbers of cpu cores, e.g. under 8 logical cores,
|
||||
setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
|
||||
poorly.
|
||||
|
||||
Args:
|
||||
flags_obj: Current flags, which will be adjusted possibly overriding
|
||||
what has been set by the user on the command-line.
|
||||
"""
|
||||
cpu_count = multiprocessing.cpu_count()
|
||||
tf.compat.v1.logging.info('Logical CPU cores: %s', cpu_count)
|
||||
|
||||
# Sets up thread pool for each GPU for op scheduling.
|
||||
per_gpu_thread_count = 1
|
||||
total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus
|
||||
os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
|
||||
os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
|
||||
tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s',
|
||||
os.environ['TF_GPU_THREAD_COUNT'])
|
||||
tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s',
|
||||
os.environ['TF_GPU_THREAD_MODE'])
|
||||
|
||||
# Reduces general thread pool by number of threads used for GPU pool.
|
||||
main_thread_count = cpu_count - total_gpu_thread_count
|
||||
flags_obj.inter_op_parallelism_threads = main_thread_count
|
||||
|
||||
# Sets thread count for tf.data. Logical cores minus threads assigned to the
|
||||
# private GPU pool along with 2 threads per GPU for event monitoring and
|
||||
# sending / receiving tensors.
|
||||
num_monitoring_threads = 2 * flags_obj.num_gpus
|
||||
flags_obj.datasets_num_private_threads = (cpu_count - total_gpu_thread_count
|
||||
- num_monitoring_threads)
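# Worked example (hypothetical 24-core, 4-GPU machine, per_gpu_thread_count=1):
# total_gpu_thread_count = 4, inter_op_parallelism_threads = 24 - 4 = 20,
# datasets_num_private_threads = 24 - 4 - 2 * 4 = 12.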
|
||||
|
||||
|
||||
################################################################################
|
||||
# Functions for running training/eval/validation loops for the model.
|
||||
################################################################################
|
||||
def learning_rate_with_decay(
|
||||
batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
|
||||
base_lr=0.1, warmup=False):
|
||||
"""Get a learning rate that decays step-wise as training progresses.
|
||||
|
||||
Args:
|
||||
batch_size: the number of examples processed in each training batch.
|
||||
batch_denom: this value will be used to scale the base learning rate.
|
||||
`0.1 * batch size` is divided by this number, such that when
|
||||
batch_denom == batch_size, the initial learning rate will be 0.1.
|
||||
num_images: total number of images that will be used for training.
|
||||
boundary_epochs: list of ints representing the epochs at which we
|
||||
decay the learning rate.
|
||||
decay_rates: list of floats representing the decay rates to be used
|
||||
for scaling the learning rate. It should have one more element
|
||||
than `boundary_epochs`, and all elements should have the same type.
|
||||
base_lr: Initial learning rate scaled based on batch_denom.
|
||||
warmup: Run a 5 epoch warmup to the initial lr.
|
||||
Returns:
|
||||
Returns a function that takes a single argument - the number of batches
|
||||
trained so far (global_step) - and returns the learning rate to be used
|
||||
for training the next batch.
|
||||
"""
|
||||
initial_learning_rate = base_lr * batch_size / batch_denom
|
||||
batches_per_epoch = num_images / batch_size
|
||||
|
||||
# Reduce the learning rate at certain epochs.
|
||||
# CIFAR-10: divide by 10 at epoch 100, 150, and 200
|
||||
# ImageNet: divide by 10 at epoch 30, 60, 80, and 90
|
||||
boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
|
||||
vals = [initial_learning_rate * decay for decay in decay_rates]
|
||||
|
||||
def learning_rate_fn(global_step):
|
||||
"""Builds scaled learning rate function with 5 epoch warm up."""
|
||||
lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
|
||||
if warmup:
|
||||
warmup_steps = int(batches_per_epoch * 5)
|
||||
warmup_lr = (
|
||||
initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
|
||||
warmup_steps, tf.float32))
|
||||
return tf.cond(pred=global_step < warmup_steps,
|
||||
true_fn=lambda: warmup_lr,
|
||||
false_fn=lambda: lr)
|
||||
return lr
|
||||
|
||||
return learning_rate_fn
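# Illustrative usage (a sketch; the ImageNet-style values below are
# assumptions, not taken from this file):
#   lr_fn = learning_rate_with_decay(
#       batch_size=256, batch_denom=256, num_images=1281167,
#       boundary_epochs=[30, 60, 80, 90],
#       decay_rates=[1, 0.1, 0.01, 0.001, 1e-4], warmup=True)
#   learning_rate = lr_fn(tf.compat.v1.train.get_or_create_global_step())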
|
||||
|
|
@@ -0,0 +1,75 @@
|
|||
# Taken from https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10_estimator/cifar10_utils.py
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.python.training import basic_session_run_hooks
|
||||
from tensorflow.python.training import session_run_hook
|
||||
from tensorflow.python.training import training_util
|
||||
|
||||
|
||||
def str_to_bool(in_str):
|
||||
if "t" in in_str.lower():
|
||||
return True
|
||||
else:
|
||||
return False
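# Note: this is a deliberately loose check - any string containing "t"
# (e.g. "latest") maps to True. It is only intended for "True"/"False"
# style environment-variable flags.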
|
||||
|
||||
|
||||
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
|
||||
"""Hook to print out examples per second.
|
||||
Total time is tracked and then divided by the total number of steps
|
||||
to get the average step time; batch_size is then used to determine
|
||||
the running average of examples per second. The examples per second for the
|
||||
most recent interval is also logged.
|
||||
"""
|
||||
|
||||
def __init__(self, batch_size, every_n_steps=100, every_n_secs=None):
|
||||
"""Initializer for ExamplesPerSecondHook.
|
||||
Args:
|
||||
batch_size: Total batch size used to calculate examples/second from
|
||||
global time.
|
||||
every_n_steps: Log stats every n steps.
|
||||
every_n_secs: Log stats every n seconds.
|
||||
"""
|
||||
if (every_n_steps is None) == (every_n_secs is None):
|
||||
raise ValueError(
|
||||
"exactly one of every_n_steps" " and every_n_secs should be provided."
|
||||
)
|
||||
self._timer = basic_session_run_hooks.SecondOrStepTimer(
|
||||
every_steps=every_n_steps, every_secs=every_n_secs
|
||||
)
|
||||
|
||||
self._step_train_time = 0
|
||||
self._total_steps = 0
|
||||
self._batch_size = batch_size
|
||||
|
||||
def begin(self):
|
||||
self._global_step_tensor = training_util.get_global_step()
|
||||
if self._global_step_tensor is None:
|
||||
raise RuntimeError("Global step should be created to use StepCounterHook.")
|
||||
|
||||
def before_run(self, run_context): # pylint: disable=unused-argument
|
||||
return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
|
||||
|
||||
def after_run(self, run_context, run_values):
|
||||
_ = run_context
|
||||
|
||||
global_step = run_values.results
|
||||
if self._timer.should_trigger_for_step(global_step):
|
||||
elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
|
||||
global_step
|
||||
)
|
||||
if elapsed_time is not None:
|
||||
steps_per_sec = elapsed_steps / elapsed_time
|
||||
self._step_train_time += elapsed_time
|
||||
self._total_steps += elapsed_steps
|
||||
|
||||
average_examples_per_sec = self._batch_size * (
|
||||
self._total_steps / self._step_train_time
|
||||
)
|
||||
current_examples_per_sec = steps_per_sec * self._batch_size
|
||||
# Average examples/sec followed by current examples/sec
|
||||
logging.info(
|
||||
"%s: %g (%g), step = %g",
|
||||
"Average examples/sec",
|
||||
average_examples_per_sec,
|
||||
current_examples_per_sec,
|
||||
self._total_steps,
|
||||
)
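# Illustrative usage (a sketch; assumes an Estimator that creates a global
# step, as train() does):
#   hooks = [ExamplesPerSecondHook(batch_size=64, every_n_steps=100)]
#   estimator.train(input_fn=train_input_fn, hooks=hooks)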
|
|
@@ -0,0 +1,176 @@
|
|||
"""Module for running TensorFlow training on Imagenet data
|
||||
"""
|
||||
from invoke import task, Collection
|
||||
import os
|
||||
from config import load_config
|
||||
|
||||
|
||||
_BASE_PATH = os.path.dirname(os.path.abspath(__file__))
|
||||
env_values = load_config()
|
||||
|
||||
|
||||
@task
|
||||
def submit_synthetic(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
|
||||
"""Submit TensorFlow training job using synthetic imagenet data to remote cluster
|
||||
|
||||
Args:
|
||||
node_count (int, optional): The number of nodes to use in cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("synthetic_images_remote")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{"--epochs": epochs},
|
||||
node_count=node_count,
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_synthetic_local(c, epochs=1):
|
||||
"""Submit TensorFlow training job using synthetic imagenet data for local execution
|
||||
|
||||
Args:
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("synthetic_images_local")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{"--epochs": epochs},
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_images(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
|
||||
"""Submit TensorFlow training job using real imagenet data to remote cluster
|
||||
|
||||
Args:
|
||||
node_count (int, optional): The number of nodes to use in cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("real_images_remote")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{
|
||||
"--training_data_path": "{datastore}/train",
|
||||
"--validation_data_path": "{datastore}/validation",
|
||||
"--epochs": epochs,
|
||||
"--data_type": "images",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
node_count=node_count,
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_images_local(c, epochs=1):
|
||||
"""Submit TensorFlow training job using real imagenet data for local execution
|
||||
|
||||
Args:
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("real_images_local")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{
|
||||
"--training_data_path": "/data/train",
|
||||
"--validation_data_path": "/data/validation",
|
||||
"--epochs": epochs,
|
||||
"--data_type": "images",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
docker_args=["-v", f"{env_values['data']}:/data"],
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_tfrecords(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
|
||||
"""Submit TensorFlow training job using real imagenet data as tfrecords to remote cluster
|
||||
|
||||
Args:
|
||||
node_count (int, optional): The number of nodes to use in cluster. Defaults to env_values['CLUSTER_MAX_NODES'].
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("real_tfrecords_remote")
|
||||
run = exp.submit(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{
|
||||
"--training_data_path": "{datastore}/tfrecords/train",
|
||||
"--validation_data_path": "{datastore}/tfrecords/validation",
|
||||
"--epochs": epochs,
|
||||
"--data_type": "tfrecords",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
node_count=node_count,
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
@task
|
||||
def submit_tfrecords_local(c, epochs=1):
|
||||
"""Submit TensorFlow training job using real imagenet data as tfrecords for local execution
|
||||
|
||||
Args:
|
||||
epochs (int, optional): Number of epochs to run training for. Defaults to 1.
|
||||
"""
|
||||
from aml_compute import TFExperimentCLI
|
||||
|
||||
exp = TFExperimentCLI("real_tfrecords_local")
|
||||
run = exp.submit_local(
|
||||
os.path.join(_BASE_PATH, "src"),
|
||||
"resnet_main.py",
|
||||
{
|
||||
"--training_data_path": "/data/tfrecords/train",
|
||||
"--validation_data_path": "/data/tfrecords/validation",
|
||||
"--epochs": epochs,
|
||||
"--data_type": "tfrecords",
|
||||
"--data-format": "channels_first",
|
||||
},
|
||||
dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
|
||||
docker_args=["-v", f"{env_values['data']}:/data"],
|
||||
wait_for_completion=True,
|
||||
)
|
||||
print(run)
|
||||
|
||||
|
||||
remote_collection = Collection("remote")
|
||||
remote_collection.add_task(submit_images, "images")
|
||||
remote_collection.add_task(submit_tfrecords, "tfrecords")
|
||||
remote_collection.add_task(submit_synthetic, "synthetic")
|
||||
|
||||
local_collection = Collection("local")
|
||||
local_collection.add_task(submit_images_local, "images")
|
||||
local_collection.add_task(submit_tfrecords_local, "tfrecords")
|
||||
local_collection.add_task(submit_synthetic_local, "synthetic")
|
||||
|
||||
submit_collection = Collection("submit", local_collection, remote_collection)
|
||||
namespace = Collection("tf_imagenet", submit_collection)
|
||||
|
|
@@ -0,0 +1,15 @@
|
|||
CLUSTER_NAME={{cookiecutter.cluster_name}}
|
||||
CLUSTER_VM_SIZE={{cookiecutter.vm_size}}
|
||||
CLUSTER_MIN_NODES={{cookiecutter.minimum_number_nodes}}
|
||||
CLUSTER_MAX_NODES={{cookiecutter.maximum_number_nodes}}
|
||||
WORKSPACE={{cookiecutter.workspace}}
|
||||
RESOURCE_GROUP={{cookiecutter.resource_group}}
|
||||
REGION={{cookiecutter.region}}
|
||||
LOG_CONFIG=/workspace/control/src/logging.conf
|
||||
SUBSCRIPTION_ID={{cookiecutter.subscription_id}}
|
||||
DATASTORE_NAME={{cookiecutter.datastore_name}}
|
||||
CONTAINER_NAME={{cookiecutter.container_name}}
|
||||
ACCOUNT_NAME={{cookiecutter.account_name}}
|
||||
ACCOUNT_KEY={{cookiecutter.account_key}}
|
||||
DATA={{cookiecutter.data}}
|
||||
IMAGE_NAME={{cookiecutter.container_registry}}/{{cookiecutter.image_name}}
|
|
@@ -0,0 +1,4 @@
|
|||
azure
|
||||
azure-cli-core
|
||||
azureml-sdk[notebooks,contrib,tensorboard]
|
||||
git+https://github.com/msalvaris/amltoolz.git
|
|
@@ -0,0 +1,32 @@
|
|||
# Invoke tab-completion script to be sourced with Bash shell.
|
||||
# Known to work on Bash 3.x, untested on 4.x.
|
||||
|
||||
_complete_invoke() {
|
||||
local candidates
|
||||
|
||||
# COMP_WORDS contains the entire command string up til now (including
|
||||
# program name).
|
||||
# We hand it to Invoke so it can figure out the current context: spit back
|
||||
# core options, task names, the current task's options, or some combo.
|
||||
candidates=`invoke --complete -- ${COMP_WORDS[*]}`
|
||||
|
||||
# `compgen -W` takes list of valid options & a partial word & spits back
|
||||
# possible matches. Necessary for any partial word completions (vs
|
||||
# completions performed when no partial words are present).
|
||||
#
|
||||
# $2 is the current word or token being tabbed on, either empty string or a
|
||||
# partial word, and thus wants to be compgen'd to arrive at some subset of
|
||||
# our candidate list which actually matches.
|
||||
#
|
||||
# COMPREPLY is the list of valid completions handed back to `complete`.
|
||||
COMPREPLY=( $(compgen -W "${candidates}" -- $2) )
|
||||
}
|
||||
|
||||
|
||||
# Tell shell builtin to use the above for completing our invocations.
|
||||
# * -F: use given function name to generate completions.
|
||||
# * -o default: when function generates no results, use filenames.
|
||||
# * positional args: program names to complete for.
|
||||
complete -F _complete_invoke -o default invoke inv
|
||||
|
||||
# vim: set ft=sh :
|
|
@@ -0,0 +1,96 @@
|
|||
FROM ubuntu:16.04
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
cmake \
|
||||
git \
|
||||
curl \
|
||||
nano \
|
||||
wget \
|
||||
unzip \
|
||||
ca-certificates \
|
||||
jq \
|
||||
locales \
|
||||
apt-transport-https \
|
||||
software-properties-common \
|
||||
sudo \
|
||||
tmux
|
||||
|
||||
|
||||
# Install Docker
|
||||
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
|
||||
apt-key fingerprint 0EBFCD88 && \
|
||||
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
|
||||
$(lsb_release -cs) \
|
||||
stable" &&\
|
||||
apt-get update && apt-get install -y --no-install-recommends docker-ce
|
||||
|
||||
|
||||
RUN locale-gen en_US.UTF-8
|
||||
ENV LANG en_US.UTF-8
|
||||
ENV LANGUAGE en_US:en
|
||||
ENV LC_ALL en_US.UTF-8
|
||||
|
||||
COPY environment.yml .
|
||||
COPY azure_requirements.txt .
|
||||
|
||||
ENV ENV_NAME=py36
|
||||
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
|
||||
chmod +x ~/miniconda.sh && \
|
||||
~/miniconda.sh -b -p /opt/conda && \
|
||||
rm ~/miniconda.sh && \
|
||||
/opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
|
||||
/opt/conda/bin/conda clean -ya && \
|
||||
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
|
||||
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
|
||||
echo "conda activate $ENV_NAME" >> ~/.bashrc
|
||||
ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH
|
||||
|
||||
RUN ["/bin/bash", "-c", "pip install -r azure_requirements.txt --ignore-installed PyYAML"]
|
||||
|
||||
# Install Azure CLI
|
||||
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
|
||||
tee /etc/apt/sources.list.d/azure-cli.list && \
|
||||
apt-key --keyring /etc/apt/trusted.gpg.d/Microsoft.gpg adv \
|
||||
--keyserver packages.microsoft.com \
|
||||
--recv-keys BC528686B50D79E339D3721CEB3E94ADBE1229CF && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
azure-cli
|
||||
|
||||
RUN az extension add -n azure-cli-ml # Install azure ml extension
|
||||
|
||||
# Install AzCopy
|
||||
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\
|
||||
cp ./azure.list /etc/apt/sources.list.d/ &&\
|
||||
apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\
|
||||
apt-get update &&\
|
||||
apt-get install -y --no-install-recommends azcopy
|
||||
|
||||
COPY jupyter_notebook_config.py /root/.jupyter/
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
RUN jupyter nbextension install --py --user azureml.widgets && \
|
||||
jupyter nbextension enable --py --user azureml.widgets
|
||||
|
||||
# Adding nvidia-docker alias
|
||||
RUN echo -e '#!/bin/bash\ndocker "$@"' > /usr/bin/nvidia-docker && \
|
||||
chmod +x /usr/bin/nvidia-docker
|
||||
|
||||
ENV PYTHONPATH /workspace/scripts:/workspace/control/src:$PYTHONPATH
|
||||
# template {% if cookiecutter.type == "template" or cookiecutter.type == "all"%}
|
||||
ENV PYTHONPATH /workspace/TensorFlow_experiment:$PYTHONPATH
|
||||
# ------ {% endif %}
|
||||
# benchmark {% if cookiecutter.type == "benchmark" or cookiecutter.type == "all"%}
|
||||
ENV PYTHONPATH /workspace/TensorFlow_benchmark:$PYTHONPATH
|
||||
# ------ {% endif %}
|
||||
# imagenet {% if cookiecutter.type == "imagenet" or cookiecutter.type == "all"%}
|
||||
ENV PYTHONPATH /workspace/TensorFlow_imagenet:$PYTHONPATH
|
||||
# ------- {% endif %}
|
||||
# Completion script
|
||||
COPY bash.completion /etc/bash_completion.d/
|
||||
RUN echo "source /etc/bash_completion.d/bash.completion" >> /root/.bashrc
|
||||
# Tmux
|
||||
COPY tmux.conf /root/.tmux.conf
|
||||
WORKDIR /workspace
|
||||
CMD /bin/bash
|
|
@@ -0,0 +1,29 @@
|
|||
name: py36
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.6
|
||||
- jupyter
|
||||
- ipykernel
|
||||
- matplotlib
|
||||
- seaborn
|
||||
- numpy
|
||||
- pandas
|
||||
- selenium
|
||||
- phantomjs
|
||||
- pillow
|
||||
- bokeh
|
||||
- ipython
|
||||
- ipdb
|
||||
- pip:
|
||||
- docker
|
||||
- fire
|
||||
- toolz
|
||||
- tabulate==0.8.2
|
||||
- Jinja2
|
||||
- gitpython
|
||||
- tensorflow # Installing for Tensorboard
|
||||
- tensorboard
|
||||
- tqdm
|
||||
- python-dotenv[cli]==0.10.1
|
||||
- invoke
|
|
@@ -0,0 +1,6 @@
|
|||
# Configuration file for jupyter-notebook.
|
||||
|
||||
c.NotebookApp.ip = "0.0.0.0"
|
||||
c.NotebookApp.port = 9999
|
||||
c.NotebookApp.open_browser = False
|
||||
c.NotebookApp.allow_root = True
|
|
@@ -0,0 +1,4 @@
|
|||
# remap prefix from 'C-b' to 'C-a'
|
||||
unbind C-b
|
||||
set-option -g prefix C-a
|
||||
bind-key C-a send-prefix
|
|
@@ -0,0 +1,510 @@
|
|||
import logging
|
||||
import logging.config
|
||||
import os
|
||||
|
||||
import azureml.core
|
||||
import fire
|
||||
from amltoolz import Workspace
|
||||
from azureml import core
|
||||
from azureml.core import Datastore
|
||||
from azureml.core.compute import ComputeTarget, AmlCompute
|
||||
from azureml.core.compute_target import ComputeTargetException
|
||||
from azureml.core.conda_dependencies import (
|
||||
CondaDependencies,
|
||||
TENSORFLOW_DEFAULT_VERSION,
|
||||
)
|
||||
from azureml.core.runconfig import EnvironmentDefinition
|
||||
from azureml.tensorboard import Tensorboard
|
||||
from azureml.train.dnn import TensorFlow
|
||||
from config import load_config
|
||||
from toolz import curry, pipe
|
||||
from pprint import pformat
|
||||
from time import sleep
|
||||
|
||||
|
||||
logging.config.fileConfig(os.getenv("LOG_CONFIG", "logging.conf"))
|
||||
|
||||
config_dict = load_config()
|
||||
|
||||
_DEFAULT_AML_PATH = config_dict.get("DEFAULT_AML_PATH", "aml_config/azml_config.json")
|
||||
_CLUSTER_NAME = config_dict.get("CLUSTER_NAME", "gpucluster24rv3")
|
||||
_CLUSTER_VM_SIZE = config_dict.get("CLUSTER_VM_SIZE", "Standard_NC24rs_v3")
|
||||
_CLUSTER_MIN_NODES = int(config_dict.get("CLUSTER_MIN_NODES", 0))
|
||||
_CLUSTER_MAX_NODES = int(config_dict.get("CLUSTER_MAX_NODES", 2))
|
||||
_WORKSPACE = config_dict.get("WORKSPACE", "workspace")
|
||||
_RESOURCE_GROUP = config_dict.get("RESOURCE_GROUP", "amlccrg")
|
||||
_SUBSCRIPTION_ID = config_dict.get("SUBSCRIPTION_ID", None)
|
||||
_REGION = config_dict.get("REGION", "eastus")
|
||||
_DEPENDENCIES_FILE = config_dict.get(
|
||||
"DEPENDENCIES_FILE", "../../experiment/src/environment_gpu.yml"
|
||||
)
|
||||
_DATASTORE_NAME = config_dict.get("DATASTORE_NAME", "datastore")
|
||||
_CONTAINER_NAME = config_dict.get("CONTAINER_NAME", "container")
|
||||
_ACCOUNT_NAME = config_dict.get("ACCOUNT_NAME", None)
|
||||
_ACCOUNT_KEY = config_dict.get("ACCOUNT_KEY", None)
|
||||
|
||||
|
||||
def _create_cluster(
|
||||
workspace,
|
||||
cluster_name=_CLUSTER_NAME,
|
||||
vm_size=_CLUSTER_VM_SIZE,
|
||||
min_nodes=_CLUSTER_MIN_NODES,
|
||||
max_nodes=_CLUSTER_MAX_NODES,
|
||||
):
|
||||
logger = logging.getLogger(__name__)
|
||||
try:
|
||||
compute_target = ComputeTarget(workspace=workspace, name=cluster_name)
|
||||
logger.info("Found existing compute target.")
|
||||
except ComputeTargetException:
|
||||
logger.info("Creating a new compute target...")
|
||||
compute_config = AmlCompute.provisioning_configuration(
|
||||
vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes
|
||||
)
|
||||
|
||||
# create the cluster
|
||||
compute_target = ComputeTarget.create(workspace, cluster_name, compute_config)
|
||||
compute_target.wait_for_completion(show_output=True)
|
||||
|
||||
# use get_status() to get a detailed status for the current AmlCompute.
|
||||
logger.debug(compute_target.get_status().serialize())
|
||||
|
||||
return compute_target
|
||||
|
||||
|
||||
def _prepare_environment_definition(dependencies_file, distributed):
|
||||
logger = logging.getLogger(__name__)
|
||||
env_def = EnvironmentDefinition()
|
||||
conda_dep = CondaDependencies(conda_dependencies_file_path=dependencies_file)
|
||||
env_def.python.user_managed_dependencies = False
|
||||
env_def.python.conda_dependencies = conda_dep
|
||||
env_def.docker.enabled = True
|
||||
env_def.docker.gpu_support = True
|
||||
env_def.docker.base_image = azureml.core.runconfig.DEFAULT_GPU_IMAGE
|
||||
env_def.docker.shm_size = "8g"
|
||||
env_def.environment_variables["NCCL_SOCKET_IFNAME"] = "eth0"
|
||||
env_def.environment_variables["NCCL_IB_DISABLE"] = 1
|
||||
|
||||
if distributed:
|
||||
env_def.environment_variables["DISTRIBUTED"] = "True"
|
||||
else:
|
||||
env_def.environment_variables["DISTRIBUTED"] = "False"
|
||||
logger.info("Adding runtime argument")
|
||||
# Adds runtime argument since we aliased nvidia-docker to docker in order to be able to run them as
|
||||
# sibling containers. Without this we will get CUDA library errors
|
||||
env_def.docker.arguments.extend(["--runtime", "nvidia"])
|
||||
|
||||
return env_def
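# Note: NCCL_SOCKET_IFNAME pins NCCL traffic to the eth0 interface and
# NCCL_IB_DISABLE=1 turns off the InfiniBand transport; both are common
# settings for multi-node NCCL over plain Ethernet.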
|
||||
|
||||
|
||||
@curry
|
||||
def _create_estimator(
|
||||
estimator_class,
|
||||
dependencies_file,
|
||||
project_folder,
|
||||
entry_script,
|
||||
compute_target,
|
||||
script_params,
|
||||
node_count=_CLUSTER_MAX_NODES,
|
||||
process_count_per_node=4,
|
||||
docker_args=(),
|
||||
):
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.debug(f"Loading dependencies from {dependencies_file}")
|
||||
|
||||
# If the compute target is "local" then don't run distributed
|
||||
distributed = not (isinstance(compute_target, str) and compute_target == "local")
|
||||
env_def = _prepare_environment_definition(dependencies_file, distributed)
|
||||
env_def.docker.arguments.extend(list(docker_args))
|
||||
|
||||
estimator = estimator_class(
|
||||
project_folder,
|
||||
entry_script=entry_script,
|
||||
compute_target=compute_target,
|
||||
script_params=script_params,
|
||||
node_count=node_count,
|
||||
process_count_per_node=process_count_per_node,
|
||||
distributed_backend="mpi" if distributed else None,
|
||||
environment_definition=env_def,
|
||||
)
|
||||
|
||||
logger.debug(estimator.conda_dependencies.__dict__)
|
||||
return estimator
|
||||
|
||||
|
||||
def _create_datastore(
|
||||
aml_workspace,
|
||||
datastore_name,
|
||||
container_name,
|
||||
account_name,
|
||||
account_key,
|
||||
create_if_not_exists=True,
|
||||
):
|
||||
ds = Datastore.register_azure_blob_container(
|
||||
workspace=aml_workspace,
|
||||
datastore_name=datastore_name,
|
||||
container_name=container_name,
|
||||
account_name=account_name,
|
||||
account_key=account_key,
|
||||
create_if_not_exists=create_if_not_exists,
|
||||
)
|
||||
return ds
|
||||
|
||||
|
||||
class ExperimentCLI(object):
|
||||
def __init__(
|
||||
self,
|
||||
experiment_name,
|
||||
workspace_name=_WORKSPACE,
|
||||
resource_group=_RESOURCE_GROUP,
|
||||
subscription_id=_SUBSCRIPTION_ID,
|
||||
workspace_region=_REGION,
|
||||
config_path=_DEFAULT_AML_PATH,
|
||||
):
|
||||
|
||||
self._logger = logging.getLogger(__name__)
|
||||
self._logger.info("SDK version:" + str(azureml.core.VERSION))
|
||||
self._ws = workspace_for_user(
|
||||
workspace_name=workspace_name,
|
||||
resource_group=resource_group,
|
||||
subscription_id=subscription_id,
|
||||
workspace_region=workspace_region,
|
||||
config_path=config_path,
|
||||
).aml_workspace
|
||||
self._experiment = core.Experiment(self._ws, name=experiment_name)
|
||||
self._cluster = None
|
||||
self._datastore = None
|
||||
|
||||
def create_cluster(
|
||||
self,
|
||||
name=_CLUSTER_NAME,
|
||||
vm_size=_CLUSTER_VM_SIZE,
|
||||
min_nodes=_CLUSTER_MIN_NODES,
|
||||
max_nodes=_CLUSTER_MAX_NODES,
|
||||
):
|
||||
"""Creates AzureML cluster
|
||||
|
||||
Args:
|
||||
name (string, optional): The name you wish to assign the cluster.
|
||||
Defaults to _CLUSTER_NAME.
|
||||
vm_size (string, optional): The type of sku to use for your vm.
|
||||
Defaults to _CLUSTER_VM_SIZE.
|
||||
min_nodes (int, optional): Minimum number of nodes in cluster.
|
||||
Use 0 if you don't want to incur costs when it isn't being used.
|
||||
Defaults to _CLUSTER_MIN_NODES.
|
||||
max_nodes (int, optional): Maximum number of nodes in cluster.
|
||||
Defaults to _CLUSTER_MAX_NODES.
|
||||
|
||||
Returns:
|
||||
ExperimentCLI: Experiment object
|
||||
"""
|
||||
self._cluster = _create_cluster(
|
||||
self._ws,
|
||||
cluster_name=name,
|
||||
vm_size=vm_size,
|
||||
min_nodes=min_nodes,
|
||||
max_nodes=max_nodes,
|
||||
)
|
||||
return self
|
||||
|
||||
def create_datastore(
|
||||
self,
|
||||
datastore_name=_DATASTORE_NAME,
|
||||
container_name=_CONTAINER_NAME,
|
||||
account_name=_ACCOUNT_NAME,
|
||||
account_key=_ACCOUNT_KEY,
|
||||
):
|
||||
"""Creates datastore
|
||||
|
||||
Args:
|
||||
datastore_name (string, optional): Name you wish to assign to your datastore. Defaults to _DATASTORE_NAME.
|
||||
container_name (string, optional): Name of your container. Defaults to _CONTAINER_NAME.
|
||||
account_name (string, optional): Storage account name. Defaults to _ACCOUNT_NAME.
|
||||
account_key (string, optional): The storage account key. Defaults to _ACCOUNT_KEY.
|
||||
|
||||
Returns:
|
||||
ExperimentCLI: Experiment object
|
||||
"""
|
||||
assert account_name is not None, "Account name for Datastore not set"
|
||||
assert account_key is not None, "Account key for Datastore not set"
|
||||
|
||||
self._datastore = _create_datastore(
|
||||
self._ws,
|
||||
datastore_name=datastore_name,
|
||||
container_name=container_name,
|
||||
account_name=account_name,
|
||||
account_key=account_key,
|
||||
)
|
||||
return self
|
||||
|
||||
@property
|
||||
def cluster(self):
|
||||
if self._cluster is None:
|
||||
self.create_cluster()
|
||||
return self._cluster
|
||||
|
||||
@property
|
||||
def datastore(self):
|
||||
if self._datastore is None:
|
||||
self.create_datastore()
|
||||
return self._datastore
|
||||
|
||||
|
||||
def _has_key(input_dict, key):
|
||||
for v in input_dict.values():
|
||||
if key in v:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _fill_param_with(input_dict, parameters_dict):
|
||||
return {key: value.format(**parameters_dict) for key, value in input_dict.items()}
|
||||
|
||||
|
||||
class TFExperimentCLI(ExperimentCLI):
|
||||
"""Creates Experiment object that can be used to create clusters and submit experiments
|
||||
|
||||
Returns:
|
||||
TFExperimentCLI: Experiment object
|
||||
"""
|
||||
|
||||
def submit_local(
|
||||
self,
|
||||
project_folder,
|
||||
entry_script,
|
||||
script_params,
|
||||
dependencies_file=_DEPENDENCIES_FILE,
|
||||
wait_for_completion=True,
|
||||
docker_args=(),
|
||||
):
|
||||
"""Submit experiment for local execution
|
||||
|
||||
Args:
|
||||
project_folder (string): Path of your source files for the experiment
|
||||
entry_script (string): The filename of your script to run. Must be found in your project_folder
|
||||
script_params (dict): Dictionary of script parameters
|
||||
dependencies_file (string, optional): The location of your environment.yml to use to create the
|
||||
environment your training script requires.
|
||||
Defaults to _DEPENDENCIES_FILE.
|
||||
wait_for_completion (bool, optional): Whether to block until experiment is done. Defaults to True.
|
||||
docker_args (tuple, optional): Docker arguments to pass. Defaults to ().
|
||||
"""
|
||||
self._logger.info("Running in local mode")
|
||||
self._submit(
|
||||
dependencies_file,
|
||||
project_folder,
|
||||
entry_script,
|
||||
"local",
|
||||
script_params,
|
||||
1,
|
||||
1,
|
||||
docker_args,
|
||||
wait_for_completion,
|
||||
)
|
||||
|
||||
def submit(
|
||||
self,
|
||||
project_folder,
|
||||
entry_script,
|
||||
script_params,
|
||||
dependencies_file=_DEPENDENCIES_FILE,
|
||||
node_count=_CLUSTER_MAX_NODES,
|
||||
process_count_per_node=4,
|
||||
wait_for_completion=True,
|
||||
docker_args=(),
|
||||
):
|
||||
"""Submit experiment for remote execution on AzureML clusters
|
||||
|
||||
Args:
|
||||
project_folder (string): Path of your source files for the experiment
|
||||
entry_script (string): The filename of your script to run. Must be found in your project_folder
|
||||
script_params (dict): Dictionary of script parameters
|
||||
dependencies_file (string, optional): The location of your environment.yml to use to
|
||||
create the environment your training script requires.
|
||||
Defaults to _DEPENDENCIES_FILE.
|
||||
node_count (int, optional): Number of nodes to use in the cluster. Defaults to _CLUSTER_MAX_NODES.
|
||||
process_count_per_node (int, optional): Number of processes to run on each node.
|
||||
Usually this should match the number of GPUs per node for GPU execution.
|
||||
Defaults to 4.
|
||||
wait_for_completion (bool, optional): Whether to block until experiment is done. Defaults to True.
|
||||
docker_args (tuple, optional): Docker arguments to pass. Defaults to ().
|
||||
|
||||
Returns:
|
||||
azureml.core.Run: AzureML Run object
|
||||
"""
|
||||
self._logger.debug(script_params)
|
||||
|
||||
transformed_params = self._complete_datastore(script_params)
|
||||
self._logger.debug("Transformed script params")
|
||||
self._logger.debug(transformed_params)
|
||||
|
||||
return self._submit(
|
||||
dependencies_file,
|
||||
project_folder,
|
||||
entry_script,
|
||||
self.cluster,
|
||||
transformed_params,
|
||||
node_count,
|
||||
process_count_per_node,
|
||||
docker_args,
|
||||
wait_for_completion,
|
||||
)
|
||||
|
||||
def _submit(
|
||||
self,
|
||||
dependencies_file,
|
||||
project_folder,
|
||||
entry_script,
|
||||
cluster,
|
||||
script_params,
|
||||
node_count,
|
||||
process_count_per_node,
|
||||
docker_args,
|
||||
wait_for_completion,
|
||||
):
|
||||
self._logger.debug(script_params)
|
||||
estimator = _create_estimator(
|
||||
TensorFlow,
|
||||
dependencies_file,
|
||||
project_folder,
|
||||
entry_script,
|
||||
cluster,
|
||||
script_params,
|
||||
node_count=node_count,
|
||||
process_count_per_node=process_count_per_node,
|
||||
docker_args=docker_args,
|
||||
)
|
||||
# TEMPORARY HACK: bugs with AML necessitate the code below; remove once fixed
|
||||
estimator.conda_dependencies.remove_pip_package("horovod==0.15.2")
|
||||
estimator.conda_dependencies.remove_pip_package(
|
||||
"tensorflow==" + TENSORFLOW_DEFAULT_VERSION
|
||||
)
|
||||
estimator.conda_dependencies.add_pip_package("tensorflow-gpu==1.12.0")
|
||||
estimator.conda_dependencies.add_pip_package("horovod==0.15.2")
|
||||
|
||||
self._logger.debug(estimator.conda_dependencies.__dict__)
|
||||
run = self._experiment.submit(estimator)
|
||||
if wait_for_completion:
|
||||
run.wait_for_completion(show_output=True)
|
||||
return run
|
||||
|
||||
def _complete_datastore(self, script_params):
|
||||
def _replace(value):
|
||||
if isinstance(value, str) and "{datastore}" in value:
|
||||
data_path = value.replace("{datastore}/", "")
|
||||
return self.datastore.path(data_path).as_mount()
|
||||
else:
|
||||
return value
|
||||
|
||||
return {key: _replace(value) for key, value in script_params.items()}
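# Illustrative example (hypothetical values): an entry such as
#   {"--training_data_path": "{datastore}/train"}
# is rewritten so its value becomes self.datastore.path("train").as_mount(),
# i.e. the blob folder is mounted on the compute target at run time.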
|
||||
|
||||
|
||||
def workspace_for_user(
|
||||
workspace_name=_WORKSPACE,
|
||||
resource_group=_RESOURCE_GROUP,
|
||||
subscription_id=_SUBSCRIPTION_ID,
|
||||
workspace_region=_REGION,
|
||||
config_path=_DEFAULT_AML_PATH,
|
||||
):
|
||||
""" Creates or gets amltoolz.Workspace instance which represents an AML Workspace.
|
||||
|
||||
Args:
|
||||
workspace_name (str): Name of workspace
|
||||
resource_group (str): Name of Azure Resource group
|
||||
subscription_id (str): Azure Subscription ID
|
||||
workspace_region (str): Azure region to create resources in
|
||||
config_path (str): Path to save AML config to
|
||||
|
||||
Returns:
|
||||
amltoolz.Workspace: Either a newly created workspace or an existing one identified by name, region and resource group
|
||||
"""
|
||||
return Workspace(
|
||||
workspace_name=workspace_name,
|
||||
resource_group=resource_group,
|
||||
subscription_id=subscription_id,
|
||||
workspace_region=workspace_region,
|
||||
config_path=config_path,
|
||||
)
|
||||
|
||||
|
||||
def tensorboard(runs):
|
||||
""" Returns Tensorboard object instantiated with one or more runs
|
||||
|
||||
You can start a Tensorboard session by calling start on the Tensorboard object.
|
||||
To stop it, simply call stop on the same object.
|
||||
Args:
|
||||
runs (azureml.core.script_run.ScriptRun or list): One or more runs to visualize
|
||||
|
||||
Returns:
|
||||
azureml.tensorboard.Tensorboard
|
||||
|
||||
Examples:
|
||||
>>> tb = tensorboard(runs)
|
||||
>>> tb.start() # Start Tensorboard
|
||||
>>> tb.stop() # Stop Tensorboard
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Starting tensorboard {pformat(runs)}")
|
||||
if isinstance(runs, list):
|
||||
return Tensorboard(runs)
|
||||
else:
|
||||
return Tensorboard([runs])
|
||||
|
||||
|
||||
def _start_and_wait(tb):
|
||||
logger = logging.getLogger(__name__)
|
||||
try:
|
||||
tb.start()
|
||||
while True:
|
||||
sleep(10)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Exiting Tensorboard")
|
||||
finally:
|
||||
tb.stop()
|
||||
|
||||
|
||||
def _select_runs(experiment, runs=None, status=("Running",)):
|
||||
logger = logging.getLogger(__name__)
|
||||
try:
|
||||
if runs:
|
||||
selected_runs = [experiment.runs[run].aml_run for run in runs]
|
||||
else:
|
||||
selected_runs = [
|
||||
run.aml_run for run in experiment.runs if run.aml_run.status in status
|
||||
]
|
||||
if len(selected_runs) == 0:
|
||||
logger.warn("No runs found")
|
||||
return selected_runs
|
||||
except KeyError as e:
|
||||
logger.warn(f"Did not find run!")
|
||||
raise e
|
||||
|
||||
|
||||
def tensorboard_cli(experiment, runs=None, status=("Running",)):
|
||||
logger = logging.getLogger(__name__)
|
||||
ws = workspace_for_user()
|
||||
ws.experiments.refresh()
|
||||
try:
|
||||
exp_obj = ws.experiments[experiment]
|
||||
exp_obj.runs.refresh()
|
||||
runs = _select_runs(exp_obj, runs=runs, status=status)
|
||||
logger.debug(pformat(runs))
|
||||
pipe(runs, tensorboard, _start_and_wait)
|
||||
|
||||
except KeyError:
|
||||
logger.warn(f"Did not find experiment {experiment}!")
|
||||
logger.warn("Your experiments are:")
|
||||
for exp in ws.experiments:
|
||||
logger.warn(f"{exp}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
""" Access workspace and run TensorFlow experiments
|
||||
"""
|
||||
fire.Fire(
|
||||
{
|
||||
"workspace": workspace_for_user,
|
||||
"tf-experiment": TFExperimentCLI,
|
||||
"tensorboard": tensorboard_cli,
|
||||
}
|
||||
)
|
||||
|