From 72f9c90baf2f5c541e1c7dfcc543b35e9727aec5 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 27 Oct 2016 23:41:47 -0700 Subject: [PATCH] Remove name requirement for multi-instance tasks - Update TensorFlow-Distributed gpu launcher script to autodetect gpus - Separate config scripts for TensorFlow-Distributed into CPU and GPU --- CHANGELOG.md | 5 +- convoy/batch.py | 77 ++++++++++++------- docs/10-batch-shipyard-configuration.md | 4 +- recipes/CNTK-CPU-OpenMPI/README.md | 2 - .../config/multinode/jobs.json | 1 - recipes/CNTK-GPU-OpenMPI/README.md | 3 - .../config/multinode-multigpu/jobs.json | 1 - recipes/MXNet-CPU/README.md | 2 - recipes/MXNet-CPU/config/multinode/jobs.json | 1 - recipes/MXNet-GPU/README.md | 2 - recipes/MXNet-GPU/config/multinode/jobs.json | 1 - recipes/NAMD-Infiniband-IntelMPI/README.md | 2 - .../NAMD-Infiniband-IntelMPI/config/jobs.json | 1 - recipes/NAMD-TCP/README.md | 2 - recipes/NAMD-TCP/config/jobs.json | 1 - .../OpenFOAM-Infiniband-IntelMPI/README.md | 2 - .../config/jobs.json | 1 - recipes/OpenFOAM-TCP-OpenMPI/README.md | 2 - recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json | 1 - recipes/TensorFlow-Distributed/README.md | 10 +-- .../config/cpu/config.json | 11 +++ .../config/{ => cpu}/credentials.json | 0 .../config/cpu/jobs.json | 21 +++++ .../config/cpu/pool.json | 16 ++++ .../config/{ => gpu}/config.json | 0 .../config/gpu/credentials.json | 16 ++++ .../config/{ => gpu}/jobs.json | 3 +- .../config/{ => gpu}/pool.json | 0 .../docker/gpu/launcher.sh | 15 +++- 29 files changed, 138 insertions(+), 65 deletions(-) create mode 100644 recipes/TensorFlow-Distributed/config/cpu/config.json rename recipes/TensorFlow-Distributed/config/{ => cpu}/credentials.json (100%) create mode 100644 recipes/TensorFlow-Distributed/config/cpu/jobs.json create mode 100644 recipes/TensorFlow-Distributed/config/cpu/pool.json rename recipes/TensorFlow-Distributed/config/{ => gpu}/config.json (100%) create mode 100644 recipes/TensorFlow-Distributed/config/gpu/credentials.json rename recipes/TensorFlow-Distributed/config/{ => gpu}/jobs.json (84%) rename recipes/TensorFlow-Distributed/config/{ => gpu}/pool.json (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77ef665..128290e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,7 +43,10 @@ Please see usage doc for more information. been replaced by the `ssh` property. `generate_tunnel_script` has been renamed to `generate_docker_tunnel_script`. Please see the configuration doc for more information. -- `streamfile` no longer has an arbitrary max streaming time; the action will +- The `name` property of a task json object in the jobs specification is no +longer required for multi-instance tasks. If not specified, `name` defaults +to `id` for all task types. +- `data stream` no longer has an arbitrary max streaming time; the action will stream the file indefinitely until the task completes - Validate container with `storage_entity_prefix` for length issues - `delpool` action now cleans up and deletes some storage containers diff --git a/convoy/batch.py b/convoy/batch.py index 04e4a46..4635182 100644 --- a/convoy/batch.py +++ b/convoy/batch.py @@ -1215,6 +1215,32 @@ def list_task_files(batch_client, config): logger.error('no tasks found for job {}'.format(job['id'])) +def _generate_next_generic_task_id(batch_client, job_id, reserved=None): + # type: (azure.batch.batch_service_client.BatchServiceClient, str, + # str) -> str + """Generate the next generic task id + :param batch_client: The batch client to use. + :type batch_client: `azure.batch.batch_service_client.BatchServiceClient` + :param str job_id: job id + :param str reserved: reserved task id + :rtype: str + :return: returns a generic docker task id + """ + # get filtered, sorted list of generic docker task ids + try: + tasklist = sorted(filter( + lambda x: x.id.startswith(_GENERIC_DOCKER_TASK_PREFIX), + (batch_client.task.list(job_id))), key=lambda y: y.id) + tasknum = int(tasklist[-1].id.split('-')[-1]) + 1 + except (batchmodels.batch_error.BatchErrorException, IndexError): + tasknum = 0 + if reserved is not None: + tasknum_reserved = int(reserved.split('-')[-1]) + while tasknum == tasknum_reserved: + tasknum += 1 + return '{0}{1:03d}'.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum) + + def add_jobs(batch_client, blob_client, config, jpfile, bxfile): # type: (batch.BatchServiceClient, azureblob.BlockBlobService, # dict, tuple, tuple) -> None @@ -1261,7 +1287,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile): mi_ac = True job.uses_task_dependencies = False multi_instance = False - docker_container_name = None + mi_docker_container_name = None + reserved_task_id = None for task in jobspec['tasks']: # do not break, check to ensure ids are set on each task if # task dependencies are set @@ -1277,20 +1304,26 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile): 'cannot specify more than one multi-instance task ' 'per job with auto completion enabled') multi_instance = True - docker_container_name = task['name'] + try: + mi_docker_container_name = task['name'] + if task['name'] is None or len(task['name']) == 0: + raise KeyError() + except KeyError: + if ('id' not in task or task['id'] is None or + len(task['id']) == 0): + reserved_task_id = _generate_next_generic_task_id( + batch_client, job.id) + task['id'] = reserved_task_id + task['name'] = task['id'] + mi_docker_container_name = task['name'] # add multi-instance settings set_terminate_on_all_tasks_complete = False if multi_instance and mi_ac: - if (docker_container_name is None or - len(docker_container_name) == 0): - raise ValueError( - 'multi-instance task must be invoked with a named ' - 'container') set_terminate_on_all_tasks_complete = True job.job_release_task = batchmodels.JobReleaseTask( command_line=convoy.util.wrap_commands_in_shell( - ['docker stop {}'.format(docker_container_name), - 'docker rm -v {}'.format(docker_container_name)]), + ['docker stop {}'.format(mi_docker_container_name), + 'docker rm -v {}'.format(mi_docker_container_name)]), run_elevated=True, ) logger.info('Adding job: {}'.format(job.id)) @@ -1306,7 +1339,7 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile): raise del mi_ac del multi_instance - del docker_container_name + del mi_docker_container_name # add all tasks under job for task in jobspec['tasks']: # get image name @@ -1317,19 +1350,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile): if task_id is None or len(task_id) == 0: raise KeyError() except KeyError: - # get filtered, sorted list of generic docker task ids - try: - tasklist = sorted( - filter(lambda x: x.id.startswith( - _GENERIC_DOCKER_TASK_PREFIX), list( - batch_client.task.list(job.id))), - key=lambda y: y.id) - tasknum = int(tasklist[-1].id.split('-')[-1]) + 1 - except (batchmodels.batch_error.BatchErrorException, - IndexError): - tasknum = 0 - task_id = '{0}{1:03d}'.format( - _GENERIC_DOCKER_TASK_PREFIX, tasknum) + task_id = _generate_next_generic_task_id( + batch_client, job.id, reserved_task_id) # set run and exec commands docker_run_cmd = 'docker run' docker_exec_cmd = 'docker exec' @@ -1346,13 +1368,14 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile): else: if rm_container and '--rm' not in run_opts: run_opts.append('--rm') - # parse name option + # parse name option, if not specified use task id try: name = task['name'] - if name is not None: - run_opts.append('--name {}'.format(name)) + if name is None or len(name) == 0: + raise KeyError() except KeyError: - name = None + name = task['id'] + run_opts.append('--name {}'.format(name)) # parse labels option try: labels = task['labels'] diff --git a/docs/10-batch-shipyard-configuration.md b/docs/10-batch-shipyard-configuration.md index 1a7b408..d49b892 100644 --- a/docs/10-batch-shipyard-configuration.md +++ b/docs/10-batch-shipyard-configuration.md @@ -735,8 +735,8 @@ transferred again. This object currently supports `azure_batch` and invocation (task) depends on and must run to successful completion prior to this task executing. * (required) `image` is the Docker image to use for this task - * `name` is the name to assign to the container. This is required for - multi-instance tasks, optional if not. + * (optional) `name` is the name to assign to the container. If not + specified, the value of the `id` property will be used for `name`. * (optional) `labels` is an array of labels to apply to the container. * (optional) `environment_variables` are any additional task-specific environment variables that should be applied to the container. diff --git a/recipes/CNTK-CPU-OpenMPI/README.md b/recipes/CNTK-CPU-OpenMPI/README.md index ca32dc5..30266cc 100644 --- a/recipes/CNTK-CPU-OpenMPI/README.md +++ b/recipes/CNTK-CPU-OpenMPI/README.md @@ -45,8 +45,6 @@ array which should have a task definition containing: Since we are not using either the MNIST or CIFAR examples, this can simply be `alfpark/cntk:1.7.2-cpu-openmpi`. Please note that the `docker_images` in the Global Configuration should match this image name. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the command to pass to the Docker run invocation. For this example, we will run the ConvNet MNIST Example that has been modified to run in parallel in the `alfpark/cntk:1.7.2-cpu-openmpi-refdata` Docker diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json index 9277181..d98a3ae 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json +++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/cntk:1.7.2-cpu-openmpi-refdata", - "name": "cntk", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md index 0738241..d30679c 100644 --- a/recipes/CNTK-GPU-OpenMPI/README.md +++ b/recipes/CNTK-GPU-OpenMPI/README.md @@ -69,9 +69,6 @@ array which should have a task definition containing: Since we are not using either the MNIST or CIFAR examples, this can simply be `alfpark/cntk:1.7.2-gpu-openmpi`. Please note that the `docker_images` in the Global Configuration should match this image name. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. This is not required for running on a -single node. * `command` should contain the command to pass to the Docker run invocation. For this example, we will run the ConvNet MNIST Example that has been modified to run in parallel in the `alfpark/cntk:1.7.2-gpu-openmpi-refdata` Docker diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json index 011f5c7..b73524a 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json +++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/cntk:1.7.2-gpu-openmpi-refdata", - "name": "cntk", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/MXNet-CPU/README.md b/recipes/MXNet-CPU/README.md index f6d4f11..7491b00 100644 --- a/recipes/MXNet-CPU/README.md +++ b/recipes/MXNet-CPU/README.md @@ -49,8 +49,6 @@ array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation. This can be `alfpark/mxnet:cpu`. Please note that the `docker_images` in the Global Configuration should match this image name. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the command to pass to the Docker run invocation. For this example, we will run the CIFAR-10 example across distributed nodes in the `alfpark/mxnet:cpu` Docker image. The application `command` to run diff --git a/recipes/MXNet-CPU/config/multinode/jobs.json b/recipes/MXNet-CPU/config/multinode/jobs.json index d78a034..93988d4 100644 --- a/recipes/MXNet-CPU/config/multinode/jobs.json +++ b/recipes/MXNet-CPU/config/multinode/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/mxnet:cpu", - "name": "mxnet", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md index f773e60..a767dbd 100644 --- a/recipes/MXNet-GPU/README.md +++ b/recipes/MXNet-GPU/README.md @@ -70,8 +70,6 @@ array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation. This can be `alfpark/mxnet:gpu`. Please note that the `docker_images` in the Global Configuration should match this image name. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the command to pass to the Docker run invocation. For this example, we will run the CIFAR-10 Resnet example across distributed nodes in the `alfpark/mxnet:gpu` Docker image. Note that for multinode jobs, diff --git a/recipes/MXNet-GPU/config/multinode/jobs.json b/recipes/MXNet-GPU/config/multinode/jobs.json index 7e06deb..823a671 100644 --- a/recipes/MXNet-GPU/config/multinode/jobs.json +++ b/recipes/MXNet-GPU/config/multinode/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/mxnet:gpu", - "name": "mxnet", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/NAMD-Infiniband-IntelMPI/README.md b/recipes/NAMD-Infiniband-IntelMPI/README.md index c92a9e8..41f1924 100644 --- a/recipes/NAMD-Infiniband-IntelMPI/README.md +++ b/recipes/NAMD-Infiniband-IntelMPI/README.md @@ -31,8 +31,6 @@ The jobs configuration should set the following properties within the `tasks` array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation, e.g., `alfpark/namd:2.11-icc-mkl-intelmpi` -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the `mpirun` command. If using the sample `run_namd.sh` script then `"/sw/run_namd.sh "` can be used to run the included benchmarks: diff --git a/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json b/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json index 42064cb..a9ad111 100644 --- a/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json +++ b/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/namd:2.11-icc-mkl-intelmpi", - "name": "namd", "remove_container_after_exit": true, "command": "/sw/run_namd.sh apoa1 1000", "infiniband": true, diff --git a/recipes/NAMD-TCP/README.md b/recipes/NAMD-TCP/README.md index 94c6494..97a519b 100644 --- a/recipes/NAMD-TCP/README.md +++ b/recipes/NAMD-TCP/README.md @@ -27,8 +27,6 @@ The jobs configuration should set the following properties within the `tasks` array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation, e.g., `alfpark/namd:2.11-tcp` -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the `mpirun` command. If using the sample NAMD-TCP image provided, `"/sw/run_namd.sh "` can be used to run the included benchmarks: diff --git a/recipes/NAMD-TCP/config/jobs.json b/recipes/NAMD-TCP/config/jobs.json index 4c62849..5da56a0 100644 --- a/recipes/NAMD-TCP/config/jobs.json +++ b/recipes/NAMD-TCP/config/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/namd:2.11-tcp", - "name": "namd", "remove_container_after_exit": true, "command": "/sw/run_namd.sh apoa1 100", "multi_instance": { diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/README.md b/recipes/OpenFOAM-Infiniband-IntelMPI/README.md index 86a502d..8710b38 100644 --- a/recipes/OpenFOAM-Infiniband-IntelMPI/README.md +++ b/recipes/OpenFOAM-Infiniband-IntelMPI/README.md @@ -37,8 +37,6 @@ The jobs configuration should set the following properties within the `tasks` array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation. For this example, this can be `alfpark/openfoam:4.0-icc-intelmpi`. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the `mpirun` command. If using the sample `run_sample.sh` script then the command should be simply: `/opt/OpenFOAM/run_sample.sh` diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json b/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json index 25816ff..95bbe0d 100644 --- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json +++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/openfoam:4.0-icc-intelmpi", - "name": "openfoam", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/OpenFOAM-TCP-OpenMPI/README.md b/recipes/OpenFOAM-TCP-OpenMPI/README.md index 11d3d04..b7a41bd 100644 --- a/recipes/OpenFOAM-TCP-OpenMPI/README.md +++ b/recipes/OpenFOAM-TCP-OpenMPI/README.md @@ -32,8 +32,6 @@ The jobs configuration should set the following properties within the `tasks` array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation. For this example, this should be `alfpark/openfoam:4.0-gcc-openmpi`. -* `name` is a unique name given to the Docker container instance. This is -required for Multi-Instance tasks. * `command` should contain the `mpirun` command. If using the sample `run_sample.sh` script then the command should be simply: `/opt/OpenFOAM/run_sample.sh` diff --git a/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json b/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json index 348fbfd..35542dd 100644 --- a/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json +++ b/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json @@ -6,7 +6,6 @@ "tasks": [ { "image": "alfpark/openfoam:4.0-gcc-openmpi", - "name": "openfoam", "remove_container_after_exit": true, "shared_data_volumes": [ "glustervol" diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md index d9a76c0..42a6c1b 100644 --- a/recipes/TensorFlow-Distributed/README.md +++ b/recipes/TensorFlow-Distributed/README.md @@ -52,13 +52,13 @@ array which should have a task definition containing: * `image` should be the name of the Docker image for this container invocation, e.g., `alfpark/tensorflow/0.10.0-gpu` or `alfpark/tensorflow/0.10.0-cpu` * `command` should contain the command to pass to the Docker run invocation. -To run the example MNIST replica example on GPUs, the `command` would look -like: `"/bin/bash /sw/launcher.sh --num_gpus="` where -the total number of gpus would be specified for the `--num_gpus` parameter. +To run the example MNIST replica example, the `command` would look +like: `"/bin/bash /sw/launcher.sh"`. The launcher will automatically detect +the number of GPUs and pass the correct number to the TensorFlow script. Please see the [launcher.sh](docker/gpu/launcher.sh) for the launcher source. -`--num_gpus=` parameter must be omitted if run on CPUs. * `gpu` must be set to `true` if run on GPUs. This enables invoking the -`nvidia-docker` wrapper. +`nvidia-docker` wrapper. This property should be omitted or set to `false` +if run on CPUs. * `multi_instance` property must be defined * `num_instances` should be set to `pool_specification_vm_count` or `pool_current_dedicated` diff --git a/recipes/TensorFlow-Distributed/config/cpu/config.json b/recipes/TensorFlow-Distributed/config/cpu/config.json new file mode 100644 index 0000000..f737263 --- /dev/null +++ b/recipes/TensorFlow-Distributed/config/cpu/config.json @@ -0,0 +1,11 @@ +{ + "batch_shipyard": { + "storage_account_settings": "", + "storage_entity_prefix": "shipyard" + }, + "global_resources": { + "docker_images": [ + "alfpark/tensorflow:0.10.0-cpu" + ] + } +} diff --git a/recipes/TensorFlow-Distributed/config/credentials.json b/recipes/TensorFlow-Distributed/config/cpu/credentials.json similarity index 100% rename from recipes/TensorFlow-Distributed/config/credentials.json rename to recipes/TensorFlow-Distributed/config/cpu/credentials.json diff --git a/recipes/TensorFlow-Distributed/config/cpu/jobs.json b/recipes/TensorFlow-Distributed/config/cpu/jobs.json new file mode 100644 index 0000000..94dfa6f --- /dev/null +++ b/recipes/TensorFlow-Distributed/config/cpu/jobs.json @@ -0,0 +1,21 @@ +{ + "job_specifications": [ + { + "id": "tensorflow", + "multi_instance_auto_complete": true, + "tasks": [ + { + "image": "alfpark/tensorflow:0.10.0-cpu", + "remove_container_after_exit": true, + "command": "/bin/bash /sw/launcher.sh", + "multi_instance": { + "num_instances": "pool_specification_vm_count", + "coordination_command": null, + "resource_files": [ + ] + } + } + ] + } + ] +} diff --git a/recipes/TensorFlow-Distributed/config/cpu/pool.json b/recipes/TensorFlow-Distributed/config/cpu/pool.json new file mode 100644 index 0000000..4e3d80b --- /dev/null +++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json @@ -0,0 +1,16 @@ +{ + "pool_specification": { + "id": "tensorflow-distributed", + "vm_size": "STANDARD_D4_V2", + "vm_count": 2, + "inter_node_communication_enabled": true, + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04.0-LTS", + "ssh": { + "username": "docker" + }, + "reboot_on_start_task_failed": false, + "block_until_all_global_resources_loaded": true + } +} diff --git a/recipes/TensorFlow-Distributed/config/config.json b/recipes/TensorFlow-Distributed/config/gpu/config.json similarity index 100% rename from recipes/TensorFlow-Distributed/config/config.json rename to recipes/TensorFlow-Distributed/config/gpu/config.json diff --git a/recipes/TensorFlow-Distributed/config/gpu/credentials.json b/recipes/TensorFlow-Distributed/config/gpu/credentials.json new file mode 100644 index 0000000..451e167 --- /dev/null +++ b/recipes/TensorFlow-Distributed/config/gpu/credentials.json @@ -0,0 +1,16 @@ +{ + "credentials": { + "batch": { + "account": "", + "account_key": "", + "account_service_url": "" + }, + "storage": { + "mystorageaccount": { + "account": "", + "account_key": "", + "endpoint": "core.windows.net" + } + } + } +} diff --git a/recipes/TensorFlow-Distributed/config/jobs.json b/recipes/TensorFlow-Distributed/config/gpu/jobs.json similarity index 84% rename from recipes/TensorFlow-Distributed/config/jobs.json rename to recipes/TensorFlow-Distributed/config/gpu/jobs.json index bb10f89..22c4273 100644 --- a/recipes/TensorFlow-Distributed/config/jobs.json +++ b/recipes/TensorFlow-Distributed/config/gpu/jobs.json @@ -6,9 +6,8 @@ "tasks": [ { "image": "alfpark/tensorflow:0.10.0-gpu", - "name": "tensorflow", "remove_container_after_exit": true, - "command": "/bin/bash /sw/launcher.sh --num_gpus 2", + "command": "/bin/bash /sw/launcher.sh", "gpu": true, "multi_instance": { "num_instances": "pool_specification_vm_count", diff --git a/recipes/TensorFlow-Distributed/config/pool.json b/recipes/TensorFlow-Distributed/config/gpu/pool.json similarity index 100% rename from recipes/TensorFlow-Distributed/config/pool.json rename to recipes/TensorFlow-Distributed/config/gpu/pool.json diff --git a/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh b/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh index 76f8a70..d5b8ef8 100755 --- a/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh +++ b/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh @@ -3,6 +3,15 @@ set -e set -o pipefail +# get number of GPUs on machine +ngpus=`nvidia-smi -L | wc -l` +echo "num gpus: $ngpus" + +if [ $ngpus -eq 0 ]; then + echo "No GPUs detected." + exit 1 +fi + # get my ip address ipaddress=`ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1` @@ -40,7 +49,7 @@ if [ $AZ_BATCH_IS_CURRENT_NODE_MASTER == "true" ]; then # master node ti=${task_index[$master]} echo "master node: $ipaddress task index: $ti" - python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master $* > ps-$ti.log 2>&1 & + python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master --num_gpus=$ngpus $* > ps-$ti.log 2>&1 & masterpid=$! fi @@ -52,10 +61,10 @@ do ti=${task_index[$node]} echo "worker node: $node task index: $ti" if [ $node == $master ]; then - python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti $* > worker-$ti.log 2>&1 & + python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti --num_gpus=$ngpus $* > worker-$ti.log 2>&1 & waitpids=("${waitpids[@]}" "$!") else - ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" & + ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti --num_gpus=$ngpus $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" & waitpids=("${waitpids[@]}" "$!") fi done