Remove name requirement for multi-instance tasks
- Update TensorFlow-Distributed GPU launcher script to auto-detect GPUs
- Separate config scripts for TensorFlow-Distributed into CPU and GPU
Parent: 5f160e8938
Commit: 72f9c90baf

@@ -43,7 +43,10 @@ Please see usage doc for more information.
been replaced by the `ssh` property. `generate_tunnel_script` has been renamed
to `generate_docker_tunnel_script`. Please see the configuration doc for
more information.
- `streamfile` no longer has an arbitrary max streaming time; the action will
- The `name` property of a task json object in the jobs specification is no
longer required for multi-instance tasks. If not specified, `name` defaults
to `id` for all task types.
- `data stream` no longer has an arbitrary max streaming time; the action will
stream the file indefinitely until the task completes
- Validate container with `storage_entity_prefix` for length issues
- `delpool` action now cleans up and deletes some storage containers

@@ -1215,6 +1215,32 @@ def list_task_files(batch_client, config):
        logger.error('no tasks found for job {}'.format(job['id']))


def _generate_next_generic_task_id(batch_client, job_id, reserved=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, str,
    #        str) -> str
    """Generate the next generic task id
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param str job_id: job id
    :param str reserved: reserved task id
    :rtype: str
    :return: returns a generic docker task id
    """
    # get filtered, sorted list of generic docker task ids
    try:
        tasklist = sorted(filter(
            lambda x: x.id.startswith(_GENERIC_DOCKER_TASK_PREFIX),
            (batch_client.task.list(job_id))), key=lambda y: y.id)
        tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
    except (batchmodels.batch_error.BatchErrorException, IndexError):
        tasknum = 0
    if reserved is not None:
        tasknum_reserved = int(reserved.split('-')[-1])
        while tasknum == tasknum_reserved:
            tasknum += 1
    return '{0}{1:03d}'.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum)
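# Illustrative usage sketch (not part of this change set): assuming
# _GENERIC_DOCKER_TASK_PREFIX is 'dockertask-' and the job's highest existing
# generic task id is 'dockertask-003', the first call below returns
# 'dockertask-004'; passing that value back as `reserved` makes a subsequent
# call skip it and return 'dockertask-005'.
#
#   next_id = _generate_next_generic_task_id(batch_client, job.id)
#   following_id = _generate_next_generic_task_id(
#       batch_client, job.id, reserved=next_id)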


def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
    # type: (batch.BatchServiceClient, azureblob.BlockBlobService,
    #        dict, tuple, tuple) -> None

@@ -1261,7 +1287,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
        mi_ac = True
        job.uses_task_dependencies = False
        multi_instance = False
        docker_container_name = None
        mi_docker_container_name = None
        reserved_task_id = None
        for task in jobspec['tasks']:
            # do not break, check to ensure ids are set on each task if
            # task dependencies are set

@@ -1277,20 +1304,26 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                        'cannot specify more than one multi-instance task '
                        'per job with auto completion enabled')
                multi_instance = True
                docker_container_name = task['name']
                try:
                    mi_docker_container_name = task['name']
                    if task['name'] is None or len(task['name']) == 0:
                        raise KeyError()
                except KeyError:
                    if ('id' not in task or task['id'] is None or
                            len(task['id']) == 0):
                        reserved_task_id = _generate_next_generic_task_id(
                            batch_client, job.id)
                        task['id'] = reserved_task_id
                    task['name'] = task['id']
                    mi_docker_container_name = task['name']
        # add multi-instance settings
        set_terminate_on_all_tasks_complete = False
        if multi_instance and mi_ac:
            if (docker_container_name is None or
                    len(docker_container_name) == 0):
                raise ValueError(
                    'multi-instance task must be invoked with a named '
                    'container')
            set_terminate_on_all_tasks_complete = True
            job.job_release_task = batchmodels.JobReleaseTask(
                command_line=convoy.util.wrap_commands_in_shell(
                    ['docker stop {}'.format(docker_container_name),
                     'docker rm -v {}'.format(docker_container_name)]),
                    ['docker stop {}'.format(mi_docker_container_name),
                     'docker rm -v {}'.format(mi_docker_container_name)]),
                run_elevated=True,
            )
        logger.info('Adding job: {}'.format(job.id))

@@ -1306,7 +1339,7 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                raise
        del mi_ac
        del multi_instance
        del docker_container_name
        del mi_docker_container_name
        # add all tasks under job
        for task in jobspec['tasks']:
            # get image name

@@ -1317,19 +1350,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                if task_id is None or len(task_id) == 0:
                    raise KeyError()
            except KeyError:
                # get filtered, sorted list of generic docker task ids
                try:
                    tasklist = sorted(
                        filter(lambda x: x.id.startswith(
                            _GENERIC_DOCKER_TASK_PREFIX), list(
                                batch_client.task.list(job.id))),
                        key=lambda y: y.id)
                    tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
                except (batchmodels.batch_error.BatchErrorException,
                        IndexError):
                    tasknum = 0
                task_id = '{0}{1:03d}'.format(
                    _GENERIC_DOCKER_TASK_PREFIX, tasknum)
                task_id = _generate_next_generic_task_id(
                    batch_client, job.id, reserved_task_id)
            # set run and exec commands
            docker_run_cmd = 'docker run'
            docker_exec_cmd = 'docker exec'

@@ -1346,13 +1368,14 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
            else:
                if rm_container and '--rm' not in run_opts:
                    run_opts.append('--rm')
            # parse name option
            # parse name option, if not specified use task id
            try:
                name = task['name']
                if name is not None:
                    run_opts.append('--name {}'.format(name))
                if name is None or len(name) == 0:
                    raise KeyError()
            except KeyError:
                name = None
                name = task['id']
            run_opts.append('--name {}'.format(name))
            # parse labels option
            try:
                labels = task['labels']

@@ -735,8 +735,8 @@ transferred again. This object currently supports `azure_batch` and
invocation (task) depends on and must run to successful completion prior
to this task executing.
* (required) `image` is the Docker image to use for this task
* `name` is the name to assign to the container. This is required for
multi-instance tasks, optional if not.
* (optional) `name` is the name to assign to the container. If not
specified, the value of the `id` property will be used for `name` (see the
sketch after this list).
* (optional) `labels` is an array of labels to apply to the container.
* (optional) `environment_variables` are any additional task-specific
environment variables that should be applied to the container.
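
As a rough Python sketch of the defaulting rule above (illustrative only; the
field values are hypothetical and not taken from this documentation), a task
entry that omits `name` ends up using its `id` as the container name:

    # hypothetical task entry from a jobs json; 'name' is omitted
    task = {
        'id': 'dockertask-000',
        'image': 'busybox',
    }
    # apply the documented default: fall back to the task id
    name = task.get('name') or task['id']
    assert name == 'dockertask-000'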

@@ -45,8 +45,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-cpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-cpu-openmpi-refdata` Docker

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/cntk:1.7.2-cpu-openmpi-refdata",
                    "name": "cntk",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -69,9 +69,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-gpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks. This is not required for running on a
single node.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-gpu-openmpi-refdata` Docker

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/cntk:1.7.2-gpu-openmpi-refdata",
                    "name": "cntk",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -49,8 +49,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:cpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 example across distributed nodes
in the `alfpark/mxnet:cpu` Docker image. The application `command` to run

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/mxnet:cpu",
                    "name": "mxnet",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -70,8 +70,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:gpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 Resnet example across distributed
nodes in the `alfpark/mxnet:gpu` Docker image. Note that for multinode jobs,

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/mxnet:gpu",
                    "name": "mxnet",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -31,8 +31,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-icc-mkl-intelmpi`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_namd.sh` script then `"/sw/run_namd.sh <benchmark> <steps> <ppn>"`
can be used to run the included benchmarks:

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/namd:2.11-icc-mkl-intelmpi",
                    "name": "namd",
                    "remove_container_after_exit": true,
                    "command": "/sw/run_namd.sh apoa1 1000",
                    "infiniband": true,

@@ -27,8 +27,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-tcp`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample NAMD-TCP
image provided, `"/sw/run_namd.sh <benchmark> <steps> <ppn>"` can be used
to run the included benchmarks:

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/namd:2.11-tcp",
                    "name": "namd",
                    "remove_container_after_exit": true,
                    "command": "/sw/run_namd.sh apoa1 100",
                    "multi_instance": {

@@ -37,8 +37,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this can be `alfpark/openfoam:4.0-icc-intelmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/openfoam:4.0-icc-intelmpi",
                    "name": "openfoam",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -32,8 +32,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this should be `alfpark/openfoam:4.0-gcc-openmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

@@ -6,7 +6,6 @@
            "tasks": [
                {
                    "image": "alfpark/openfoam:4.0-gcc-openmpi",
                    "name": "openfoam",
                    "remove_container_after_exit": true,
                    "shared_data_volumes": [
                        "glustervol"

@@ -52,13 +52,13 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/tensorflow:0.10.0-gpu` or `alfpark/tensorflow:0.10.0-cpu`
* `command` should contain the command to pass to the Docker run invocation.
To run the example MNIST replica example on GPUs, the `command` would look
like: `"/bin/bash /sw/launcher.sh --num_gpus=<number of total gpus>"` where
the total number of gpus would be specified for the `--num_gpus` parameter.
To run the MNIST replica example, the `command` would look
like: `"/bin/bash /sw/launcher.sh"`. The launcher will automatically detect
the number of GPUs and pass the correct number to the TensorFlow script.
Please see the [launcher.sh](docker/gpu/launcher.sh) for the launcher source.
`--num_gpus=` parameter must be omitted if run on CPUs.
* `gpu` must be set to `true` if run on GPUs. This enables invoking the
`nvidia-docker` wrapper.
`nvidia-docker` wrapper. This property should be omitted or set to `false`
if run on CPUs (see the sketch after this list).
* `multi_instance` property must be defined
  * `num_instances` should be set to `pool_specification_vm_count` or
`pool_current_dedicated`
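
Below is a minimal, illustrative Python sketch (not Batch Shipyard's actual
code; the helper name is invented for illustration) of the effect of the `gpu`
property described above: when it is `true` the container is launched through
the `nvidia-docker` wrapper, otherwise a plain `docker` invocation is used.

    # hypothetical helper; mirrors the documented behavior of the `gpu` property
    def docker_binary(task):
        # `gpu: true` selects the nvidia-docker wrapper; omitted or false
        # means a plain docker invocation
        return 'nvidia-docker' if task.get('gpu', False) else 'docker'

    assert docker_binary({'gpu': True}) == 'nvidia-docker'
    assert docker_binary({}) == 'docker'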

@@ -0,0 +1,11 @@
{
    "batch_shipyard": {
        "storage_account_settings": "<storage account specified in credentials.json>",
        "storage_entity_prefix": "shipyard"
    },
    "global_resources": {
        "docker_images": [
            "alfpark/tensorflow:0.10.0-cpu"
        ]
    }
}

@@ -0,0 +1,21 @@
{
    "job_specifications": [
        {
            "id": "tensorflow",
            "multi_instance_auto_complete": true,
            "tasks": [
                {
                    "image": "alfpark/tensorflow:0.10.0-cpu",
                    "remove_container_after_exit": true,
                    "command": "/bin/bash /sw/launcher.sh",
                    "multi_instance": {
                        "num_instances": "pool_specification_vm_count",
                        "coordination_command": null,
                        "resource_files": [
                        ]
                    }
                }
            ]
        }
    ]
}

@@ -0,0 +1,16 @@
{
    "pool_specification": {
        "id": "tensorflow-distributed",
        "vm_size": "STANDARD_D4_V2",
        "vm_count": 2,
        "inter_node_communication_enabled": true,
        "publisher": "Canonical",
        "offer": "UbuntuServer",
        "sku": "16.04.0-LTS",
        "ssh": {
            "username": "docker"
        },
        "reboot_on_start_task_failed": false,
        "block_until_all_global_resources_loaded": true
    }
}

@@ -0,0 +1,16 @@
{
    "credentials": {
        "batch": {
            "account": "<batch account name>",
            "account_key": "<batch account key>",
            "account_service_url": "<batch account service url>"
        },
        "storage": {
            "mystorageaccount": {
                "account": "<storage account name>",
                "account_key": "<storage account key>",
                "endpoint": "core.windows.net"
            }
        }
    }
}

@@ -6,9 +6,8 @@
            "tasks": [
                {
                    "image": "alfpark/tensorflow:0.10.0-gpu",
                    "name": "tensorflow",
                    "remove_container_after_exit": true,
                    "command": "/bin/bash /sw/launcher.sh --num_gpus 2",
                    "command": "/bin/bash /sw/launcher.sh",
                    "gpu": true,
                    "multi_instance": {
                        "num_instances": "pool_specification_vm_count",

@@ -3,6 +3,15 @@
set -e
set -o pipefail

# get number of GPUs on machine
ngpus=`nvidia-smi -L | wc -l`
echo "num gpus: $ngpus"

if [ $ngpus -eq 0 ]; then
    echo "No GPUs detected."
    exit 1
fi

# get my ip address
ipaddress=`ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1`

@@ -40,7 +49,7 @@ if [ $AZ_BATCH_IS_CURRENT_NODE_MASTER == "true" ]; then
    # master node
    ti=${task_index[$master]}
    echo "master node: $ipaddress task index: $ti"
    python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master $* > ps-$ti.log 2>&1 &
    python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master --num_gpus=$ngpus $* > ps-$ti.log 2>&1 &
    masterpid=$!
fi

@@ -52,10 +61,10 @@ do
    ti=${task_index[$node]}
    echo "worker node: $node task index: $ti"
    if [ $node == $master ]; then
        python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti $* > worker-$ti.log 2>&1 &
        python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti --num_gpus=$ngpus $* > worker-$ti.log 2>&1 &
        waitpids=("${waitpids[@]}" "$!")
    else
        ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
        ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti --num_gpus=$ngpus $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
        waitpids=("${waitpids[@]}" "$!")
    fi
done