Remove name requirement for multi-instance tasks

- Update TensorFlow-Distributed GPU launcher script to autodetect GPUs
- Separate config files for TensorFlow-Distributed into CPU and GPU variants
Fred Park 2016-10-27 23:41:47 -07:00
Parent 5f160e8938
Commit 72f9c90baf
29 changed files with 138 additions and 65 deletions

View file

@@ -43,7 +43,10 @@ Please see usage doc for more information.
been replaced by the `ssh` property. `generate_tunnel_script` has been renamed
to `generate_docker_tunnel_script`. Please see the configuration doc for
more information.
- `streamfile` no longer has an arbitrary max streaming time; the action will
- The `name` property of a task json object in the jobs specification is no
longer required for multi-instance tasks. If not specified, `name` defaults
to `id` for all task types.
- `data stream` no longer has an arbitrary max streaming time; the action will
stream the file indefinitely until the task completes
- Validate container with `storage_entity_prefix` for length issues
- `delpool` action now cleans up and deletes some storage containers

View file

@@ -1215,6 +1215,32 @@ def list_task_files(batch_client, config):
logger.error('no tasks found for job {}'.format(job['id']))
def _generate_next_generic_task_id(batch_client, job_id, reserved=None):
# type: (azure.batch.batch_service_client.BatchServiceClient, str,
# str) -> str
"""Generate the next generic task id
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param str job_id: job id
:param str reserved: reserved task id
:rtype: str
:return: returns a generic docker task id
"""
# get filtered, sorted list of generic docker task ids
try:
tasklist = sorted(filter(
lambda x: x.id.startswith(_GENERIC_DOCKER_TASK_PREFIX),
(batch_client.task.list(job_id))), key=lambda y: y.id)
tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
except (batchmodels.batch_error.BatchErrorException, IndexError):
tasknum = 0
if reserved is not None:
tasknum_reserved = int(reserved.split('-')[-1])
while tasknum == tasknum_reserved:
tasknum += 1
return '{0}{1:03d}'.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum)
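As an aside, a minimal usage sketch of this helper, assuming (hypothetically) that `_GENERIC_DOCKER_TASK_PREFIX` is `'dockertask-'` and the job already holds tasks `dockertask-000` and `dockertask-001`:
# usage sketch only, not part of this diff; assumes the prefix above
# and an authenticated batch_client
next_id = _generate_next_generic_task_id(batch_client, 'myjob')
# -> 'dockertask-002'
next_id = _generate_next_generic_task_id(
    batch_client, 'myjob', reserved='dockertask-002')
# -> 'dockertask-003' (the reserved collision is skipped)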
def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
# type: (batch.BatchServiceClient, azureblob.BlockBlobService,
# dict, tuple, tuple) -> None
@@ -1261,7 +1287,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
mi_ac = True
job.uses_task_dependencies = False
multi_instance = False
docker_container_name = None
mi_docker_container_name = None
reserved_task_id = None
for task in jobspec['tasks']:
# do not break, check to ensure ids are set on each task if
# task dependencies are set
@@ -1277,20 +1304,26 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
'cannot specify more than one multi-instance task '
'per job with auto completion enabled')
multi_instance = True
docker_container_name = task['name']
try:
mi_docker_container_name = task['name']
if task['name'] is None or len(task['name']) == 0:
raise KeyError()
except KeyError:
if ('id' not in task or task['id'] is None or
len(task['id']) == 0):
reserved_task_id = _generate_next_generic_task_id(
batch_client, job.id)
task['id'] = reserved_task_id
task['name'] = task['id']
mi_docker_container_name = task['name']
# add multi-instance settings
set_terminate_on_all_tasks_complete = False
if multi_instance and mi_ac:
if (docker_container_name is None or
len(docker_container_name) == 0):
raise ValueError(
'multi-instance task must be invoked with a named '
'container')
set_terminate_on_all_tasks_complete = True
job.job_release_task = batchmodels.JobReleaseTask(
command_line=convoy.util.wrap_commands_in_shell(
['docker stop {}'.format(docker_container_name),
'docker rm -v {}'.format(docker_container_name)]),
['docker stop {}'.format(mi_docker_container_name),
'docker rm -v {}'.format(mi_docker_container_name)]),
run_elevated=True,
)
logger.info('Adding job: {}'.format(job.id))
@@ -1306,7 +1339,7 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
raise
del mi_ac
del multi_instance
del docker_container_name
del mi_docker_container_name
# add all tasks under job
for task in jobspec['tasks']:
# get image name
@@ -1317,19 +1350,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
if task_id is None or len(task_id) == 0:
raise KeyError()
except KeyError:
# get filtered, sorted list of generic docker task ids
try:
tasklist = sorted(
filter(lambda x: x.id.startswith(
_GENERIC_DOCKER_TASK_PREFIX), list(
batch_client.task.list(job.id))),
key=lambda y: y.id)
tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
except (batchmodels.batch_error.BatchErrorException,
IndexError):
tasknum = 0
task_id = '{0}{1:03d}'.format(
_GENERIC_DOCKER_TASK_PREFIX, tasknum)
task_id = _generate_next_generic_task_id(
batch_client, job.id, reserved_task_id)
# set run and exec commands
docker_run_cmd = 'docker run'
docker_exec_cmd = 'docker exec'
@@ -1346,13 +1368,14 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
else:
if rm_container and '--rm' not in run_opts:
run_opts.append('--rm')
# parse name option
# parse name option, if not specified use task id
try:
name = task['name']
if name is not None:
run_opts.append('--name {}'.format(name))
if name is None or len(name) == 0:
raise KeyError()
except KeyError:
name = None
name = task['id']
run_opts.append('--name {}'.format(name))
# parse labels option
try:
labels = task['labels']
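Stripped of the surrounding job-submission plumbing, the defaulting rule these hunks implement reduces to the following standalone sketch (`generate_id` stands in for the `_generate_next_generic_task_id` call; this is not the repo's literal code):
# sketch: resolve a task's id and container name per the new rules
def resolve_task_name(task, generate_id):
    if not task.get('id'):    # no usable id: generate the next generic one
        task['id'] = generate_id()
    if not task.get('name'):  # no usable name: default to the id
        task['name'] = task['id']
    return task['name']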

View file

@@ -735,8 +735,8 @@ transferred again. This object currently supports `azure_batch` and
invocation (task) depends on and must run to successful completion prior
to this task executing.
* (required) `image` is the Docker image to use for this task
* `name` is the name to assign to the container. This is required for
multi-instance tasks, optional if not.
* (optional) `name` is the name to assign to the container. If not
specified, the value of the `id` property will be used for `name` (see the
sketch after this list).
* (optional) `labels` is an array of labels to apply to the container.
* (optional) `environment_variables` are any additional task-specific
environment variables that should be applied to the container.
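As a concrete illustration of the new default, a hedged sketch expressed as a Python dict mirroring the task json (all values are placeholders, not from this repo):
# task json with `name` omitted; the container is named after the id
task = {
    'id': 'mytask-0',                  # placeholder id
    'image': 'myrepo/myimage:latest',  # placeholder image
    'command': 'echo hello',
}
# effective container name == task['id'] == 'mytask-0'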

View file

@@ -45,8 +45,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-cpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-cpu-openmpi-refdata` Docker

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/cntk:1.7.2-cpu-openmpi-refdata",
"name": "cntk",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -69,9 +69,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-gpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks. This is not required for running on a
single node.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-gpu-openmpi-refdata` Docker

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/cntk:1.7.2-gpu-openmpi-refdata",
"name": "cntk",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -49,8 +49,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:cpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 example across distributed nodes
in the `alfpark/mxnet:cpu` Docker image. The application `command` to run

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/mxnet:cpu",
"name": "mxnet",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -70,8 +70,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:gpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 Resnet example across distributed
nodes in the `alfpark/mxnet:gpu` Docker image. Note that for multinode jobs,

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/mxnet:gpu",
"name": "mxnet",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -31,8 +31,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-icc-mkl-intelmpi`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_namd.sh` script then `"/sw/run_namd.sh <benchmark> <steps> <ppn>"`
can be used to run the included benchmarks:

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/namd:2.11-icc-mkl-intelmpi",
"name": "namd",
"remove_container_after_exit": true,
"command": "/sw/run_namd.sh apoa1 1000",
"infiniband": true,

View file

@@ -27,8 +27,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-tcp`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample NAMD-TCP
image provided, `"/sw/run_namd.sh <benchmark> <steps> <ppn>"` can be used
to run the included benchmarks:

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/namd:2.11-tcp",
"name": "namd",
"remove_container_after_exit": true,
"command": "/sw/run_namd.sh apoa1 100",
"multi_instance": {

View file

@@ -37,8 +37,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this can be `alfpark/openfoam:4.0-icc-intelmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/openfoam:4.0-icc-intelmpi",
"name": "openfoam",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -32,8 +32,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this should be `alfpark/openfoam:4.0-gcc-openmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/openfoam:4.0-gcc-openmpi",
"name": "openfoam",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -52,13 +52,13 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/tensorflow:0.10.0-gpu` or `alfpark/tensorflow:0.10.0-cpu`
* `command` should contain the command to pass to the Docker run invocation.
To run the example MNIST replica example on GPUs, the `command` would look
like: `"/bin/bash /sw/launcher.sh --num_gpus=<number of total gpus>"` where
the total number of gpus would be specified for the `--num_gpus` parameter.
To run the MNIST replica example, the `command` would look
like: `"/bin/bash /sw/launcher.sh"`. The launcher will automatically detect
the number of GPUs and pass the correct number to the TensorFlow script.
Please see [launcher.sh](docker/gpu/launcher.sh) for the launcher source;
a rough sketch of the detection step follows this list.
`--num_gpus=` parameter must be omitted if run on CPUs.
* `gpu` must be set to `true` if run on GPUs. This enables invoking the
`nvidia-docker` wrapper.
`nvidia-docker` wrapper. This property should be omitted or set to `false`
if run on CPUs.
* `multi_instance` property must be defined
* `num_instances` should be set to `pool_specification_vm_count` or
`pool_current_dedicated`
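For reference, a rough Python equivalent of the launcher's GPU detection step (the real launcher does this in bash via `nvidia-smi -L`; this sketch assumes `nvidia-smi` is on the PATH):
import subprocess

# nvidia-smi -L prints one 'GPU N: ...' line per device,
# so the line count is the GPU count
out = subprocess.run(['nvidia-smi', '-L'], capture_output=True,
                     text=True, check=True)
ngpus = len(out.stdout.splitlines())
print('num gpus: {}'.format(ngpus))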

View file

@@ -0,0 +1,11 @@
{
"batch_shipyard": {
"storage_account_settings": "<storage account specified in credentials.json>",
"storage_entity_prefix": "shipyard"
},
"global_resources": {
"docker_images": [
"alfpark/tensorflow:0.10.0-cpu"
]
}
}

View file

@@ -0,0 +1,21 @@
{
"job_specifications": [
{
"id": "tensorflow",
"multi_instance_auto_complete": true,
"tasks": [
{
"image": "alfpark/tensorflow:0.10.0-cpu",
"remove_container_after_exit": true,
"command": "/bin/bash /sw/launcher.sh",
"multi_instance": {
"num_instances": "pool_specification_vm_count",
"coordination_command": null,
"resource_files": [
]
}
}
]
}
]
}

View file

@@ -0,0 +1,16 @@
{
"pool_specification": {
"id": "tensorflow-distributed",
"vm_size": "STANDARD_D4_V2",
"vm_count": 2,
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
"sku": "16.04.0-LTS",
"ssh": {
"username": "docker"
},
"reboot_on_start_task_failed": false,
"block_until_all_global_resources_loaded": true
}
}

View file

@@ -0,0 +1,16 @@
{
"credentials": {
"batch": {
"account": "<batch account name>",
"account_key": "<batch account key>",
"account_service_url": "<batch account service url>"
},
"storage": {
"mystorageaccount": {
"account": "<storage account name>",
"account_key": "<storage account key>",
"endpoint": "core.windows.net"
}
}
}
}

View file

@@ -6,9 +6,8 @@
"tasks": [
{
"image": "alfpark/tensorflow:0.10.0-gpu",
"name": "tensorflow",
"remove_container_after_exit": true,
"command": "/bin/bash /sw/launcher.sh --num_gpus 2",
"command": "/bin/bash /sw/launcher.sh",
"gpu": true,
"multi_instance": {
"num_instances": "pool_specification_vm_count",

View file

@@ -3,6 +3,15 @@
set -e
set -o pipefail
# get number of GPUs on machine
ngpus=`nvidia-smi -L | wc -l`
echo "num gpus: $ngpus"
if [ $ngpus -eq 0 ]; then
echo "No GPUs detected."
exit 1
fi
# get my ip address
ipaddress=`ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1`
@@ -40,7 +49,7 @@ if [ $AZ_BATCH_IS_CURRENT_NODE_MASTER == "true" ]; then
# master node
ti=${task_index[$master]}
echo "master node: $ipaddress task index: $ti"
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master $* > ps-$ti.log 2>&1 &
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master --num_gpus=$ngpus $* > ps-$ti.log 2>&1 &
masterpid=$!
fi
@@ -52,10 +61,10 @@ do
ti=${task_index[$node]}
echo "worker node: $node task index: $ti"
if [ $node == $master ]; then
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti $* > worker-$ti.log 2>&1 &
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti --num_gpus=$ngpus $* > worker-$ti.log 2>&1 &
waitpids=("${waitpids[@]}" "$!")
else
ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti --num_gpus=$ngpus $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
waitpids=("${waitpids[@]}" "$!")
fi
done