From 72f9c90baf2f5c541e1c7dfcc543b35e9727aec5 Mon Sep 17 00:00:00 2001
From: Fred Park <fred.park@microsoft.com>
Date: Thu, 27 Oct 2016 23:41:47 -0700
Subject: [PATCH] Remove name requirement for multi-instance tasks

- Update TensorFlow-Distributed gpu launcher script to autodetect gpus
- Separate config scripts for TensorFlow-Distributed into CPU and GPU
---
 CHANGELOG.md                                  |  5 +-
 convoy/batch.py                               | 77 ++++++++++++-------
 docs/10-batch-shipyard-configuration.md       |  4 +-
 recipes/CNTK-CPU-OpenMPI/README.md            |  2 -
 .../config/multinode/jobs.json                |  1 -
 recipes/CNTK-GPU-OpenMPI/README.md            |  3 -
 .../config/multinode-multigpu/jobs.json       |  1 -
 recipes/MXNet-CPU/README.md                   |  2 -
 recipes/MXNet-CPU/config/multinode/jobs.json  |  1 -
 recipes/MXNet-GPU/README.md                   |  2 -
 recipes/MXNet-GPU/config/multinode/jobs.json  |  1 -
 recipes/NAMD-Infiniband-IntelMPI/README.md    |  2 -
 .../NAMD-Infiniband-IntelMPI/config/jobs.json |  1 -
 recipes/NAMD-TCP/README.md                    |  2 -
 recipes/NAMD-TCP/config/jobs.json             |  1 -
 .../OpenFOAM-Infiniband-IntelMPI/README.md    |  2 -
 .../config/jobs.json                          |  1 -
 recipes/OpenFOAM-TCP-OpenMPI/README.md        |  2 -
 recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json |  1 -
 recipes/TensorFlow-Distributed/README.md      | 10 +--
 .../config/cpu/config.json                    | 11 +++
 .../config/{ => cpu}/credentials.json         |  0
 .../config/cpu/jobs.json                      | 21 +++++
 .../config/cpu/pool.json                      | 16 ++++
 .../config/{ => gpu}/config.json              |  0
 .../config/gpu/credentials.json               | 16 ++++
 .../config/{ => gpu}/jobs.json                |  3 +-
 .../config/{ => gpu}/pool.json                |  0
 .../docker/gpu/launcher.sh                    | 15 +++-
 29 files changed, 138 insertions(+), 65 deletions(-)
 create mode 100644 recipes/TensorFlow-Distributed/config/cpu/config.json
 rename recipes/TensorFlow-Distributed/config/{ => cpu}/credentials.json (100%)
 create mode 100644 recipes/TensorFlow-Distributed/config/cpu/jobs.json
 create mode 100644 recipes/TensorFlow-Distributed/config/cpu/pool.json
 rename recipes/TensorFlow-Distributed/config/{ => gpu}/config.json (100%)
 create mode 100644 recipes/TensorFlow-Distributed/config/gpu/credentials.json
 rename recipes/TensorFlow-Distributed/config/{ => gpu}/jobs.json (84%)
 rename recipes/TensorFlow-Distributed/config/{ => gpu}/pool.json (100%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 77ef665..128290e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -43,7 +43,10 @@ Please see usage doc for more information.
 been replaced by the `ssh` property. `generate_tunnel_script` has been renamed
 to `generate_docker_tunnel_script`. Please see the configuration doc for
 more information.
-- `streamfile` no longer has an arbitrary max streaming time; the action will
+- The `name` property of a task json object in the jobs specification is no
+longer required for multi-instance tasks. If not specified, `name` defaults
+to `id` for all task types.
+- `data stream` no longer has an arbitrary max streaming time; the action will
 stream the file indefinitely until the task completes
 - Validate container with `storage_entity_prefix` for length issues
 - `delpool` action now cleans up and deletes some storage containers
diff --git a/convoy/batch.py b/convoy/batch.py
index 04e4a46..4635182 100644
--- a/convoy/batch.py
+++ b/convoy/batch.py
@@ -1215,6 +1215,32 @@ def list_task_files(batch_client, config):
             logger.error('no tasks found for job {}'.format(job['id']))
 
 
+def _generate_next_generic_task_id(batch_client, job_id, reserved=None):
+    # type: (azure.batch.batch_service_client.BatchServiceClient, str,
+    #        str) -> str
+    """Generate the next unused generic docker task id for a job
+    :param batch_client: The batch client to use.
+    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
+    :param str job_id: job id
+    :param str reserved: reserved task id that must not be returned
+    :rtype: str
+    :return: returns a generic docker task id
+    """
+    # highest numeric suffix wins; a string sort ranks '-1000' below '-999'
+    try:
+        tasknums = [
+            int(x.id.split('-')[-1]) for x in batch_client.task.list(job_id)
+            if x.id.startswith(_GENERIC_DOCKER_TASK_PREFIX)]
+        tasknum = max(tasknums) + 1 if tasknums else 0
+    except batchmodels.batch_error.BatchErrorException:
+        tasknum = 0
+    if reserved is not None:
+        tasknum_reserved = int(reserved.split('-')[-1])
+        while tasknum == tasknum_reserved:
+            tasknum += 1
+    return '{0}{1:03d}'.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum)
+
+
 def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
     # type: (batch.BatchServiceClient, azureblob.BlockBlobService,
     #        dict, tuple, tuple) -> None
@@ -1261,7 +1287,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
             mi_ac = True
         job.uses_task_dependencies = False
         multi_instance = False
-        docker_container_name = None
+        mi_docker_container_name = None
+        reserved_task_id = None
         for task in jobspec['tasks']:
             # do not break, check to ensure ids are set on each task if
             # task dependencies are set
@@ -1277,20 +1304,26 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                         'cannot specify more than one multi-instance task '
                         'per job with auto completion enabled')
                 multi_instance = True
-                docker_container_name = task['name']
+                try:
+                    mi_docker_container_name = task['name']
+                    if task['name'] is None or len(task['name']) == 0:
+                        raise KeyError()
+                except KeyError:
+                    if ('id' not in task or task['id'] is None or
+                            len(task['id']) == 0):
+                        reserved_task_id = _generate_next_generic_task_id(
+                            batch_client, job.id)
+                        task['id'] = reserved_task_id
+                    task['name'] = task['id']
+                    mi_docker_container_name = task['name']
         # add multi-instance settings
         set_terminate_on_all_tasks_complete = False
         if multi_instance and mi_ac:
-            if (docker_container_name is None or
-                    len(docker_container_name) == 0):
-                raise ValueError(
-                    'multi-instance task must be invoked with a named '
-                    'container')
             set_terminate_on_all_tasks_complete = True
             job.job_release_task = batchmodels.JobReleaseTask(
                 command_line=convoy.util.wrap_commands_in_shell(
-                    ['docker stop {}'.format(docker_container_name),
-                     'docker rm -v {}'.format(docker_container_name)]),
+                    ['docker stop {}'.format(mi_docker_container_name),
+                     'docker rm -v {}'.format(mi_docker_container_name)]),
                 run_elevated=True,
             )
         logger.info('Adding job: {}'.format(job.id))
@@ -1306,7 +1339,7 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                 raise
         del mi_ac
         del multi_instance
-        del docker_container_name
+        del mi_docker_container_name
         # add all tasks under job
         for task in jobspec['tasks']:
             # get image name
@@ -1317,19 +1350,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
                 if task_id is None or len(task_id) == 0:
                     raise KeyError()
             except KeyError:
-                # get filtered, sorted list of generic docker task ids
-                try:
-                    tasklist = sorted(
-                        filter(lambda x: x.id.startswith(
-                            _GENERIC_DOCKER_TASK_PREFIX), list(
-                                batch_client.task.list(job.id))),
-                        key=lambda y: y.id)
-                    tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
-                except (batchmodels.batch_error.BatchErrorException,
-                        IndexError):
-                    tasknum = 0
-                task_id = '{0}{1:03d}'.format(
-                    _GENERIC_DOCKER_TASK_PREFIX, tasknum)
+                task_id = _generate_next_generic_task_id(
+                    batch_client, job.id, reserved_task_id)
             # set run and exec commands
             docker_run_cmd = 'docker run'
             docker_exec_cmd = 'docker exec'
@@ -1346,13 +1368,14 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
             else:
                 if rm_container and '--rm' not in run_opts:
                     run_opts.append('--rm')
-            # parse name option
+            # parse name option, if not specified use task id
             try:
                 name = task['name']
-                if name is not None:
-                    run_opts.append('--name {}'.format(name))
+                if name is None or len(name) == 0:
+                    raise KeyError()
             except KeyError:
-                name = None
+                name = task['id']
+            run_opts.append('--name {}'.format(name))
             # parse labels option
             try:
                 labels = task['labels']
diff --git a/docs/10-batch-shipyard-configuration.md b/docs/10-batch-shipyard-configuration.md
index 1a7b408..d49b892 100644
--- a/docs/10-batch-shipyard-configuration.md
+++ b/docs/10-batch-shipyard-configuration.md
@@ -735,8 +735,8 @@ transferred again. This object currently supports `azure_batch` and
     invocation (task) depends on and must run to successful completion prior
     to this task executing.
   * (required) `image` is the Docker image to use for this task
-  * `name` is the name to assign to the container. This is required for
-    multi-instance tasks, optional if not.
+  * (optional) `name` is the name to assign to the container. If not
+    specified, the value of the `id` property will be used for `name`.
   * (optional) `labels` is an array of labels to apply to the container.
   * (optional) `environment_variables` are any additional task-specific
     environment variables that should be applied to the container.
diff --git a/recipes/CNTK-CPU-OpenMPI/README.md b/recipes/CNTK-CPU-OpenMPI/README.md
index ca32dc5..30266cc 100644
--- a/recipes/CNTK-CPU-OpenMPI/README.md
+++ b/recipes/CNTK-CPU-OpenMPI/README.md
@@ -45,8 +45,6 @@ array which should have a task definition containing:
 Since we are not using either the MNIST or CIFAR examples, this can simply
 be `alfpark/cntk:1.7.2-cpu-openmpi`. Please note that the `docker_images` in
 the Global Configuration should match this image name.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the command to pass to the Docker run invocation.
 For this example, we will run the ConvNet MNIST Example that has been modified
 to run in parallel in the `alfpark/cntk:1.7.2-cpu-openmpi-refdata` Docker
diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json
index 9277181..d98a3ae 100644
--- a/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/cntk:1.7.2-cpu-openmpi-refdata",
-                    "name": "cntk",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md
index 0738241..d30679c 100644
--- a/recipes/CNTK-GPU-OpenMPI/README.md
+++ b/recipes/CNTK-GPU-OpenMPI/README.md
@@ -69,9 +69,6 @@ array which should have a task definition containing:
 Since we are not using either the MNIST or CIFAR examples, this can simply
 be `alfpark/cntk:1.7.2-gpu-openmpi`. Please note that the `docker_images` in
 the Global Configuration should match this image name.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks. This is not required for running on a
-single node.
 * `command` should contain the command to pass to the Docker run invocation.
 For this example, we will run the ConvNet MNIST Example that has been modified
 to run in parallel in the `alfpark/cntk:1.7.2-gpu-openmpi-refdata` Docker
diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json
index 011f5c7..b73524a 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/cntk:1.7.2-gpu-openmpi-refdata",
-                    "name": "cntk",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/MXNet-CPU/README.md b/recipes/MXNet-CPU/README.md
index f6d4f11..7491b00 100644
--- a/recipes/MXNet-CPU/README.md
+++ b/recipes/MXNet-CPU/README.md
@@ -49,8 +49,6 @@ array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation.
 This can be `alfpark/mxnet:cpu`. Please note that the `docker_images` in
 the Global Configuration should match this image name.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the command to pass to the Docker run invocation.
 For this example, we will run the CIFAR-10 example across distributed nodes
 in the `alfpark/mxnet:cpu` Docker image. The application `command` to run
diff --git a/recipes/MXNet-CPU/config/multinode/jobs.json b/recipes/MXNet-CPU/config/multinode/jobs.json
index d78a034..93988d4 100644
--- a/recipes/MXNet-CPU/config/multinode/jobs.json
+++ b/recipes/MXNet-CPU/config/multinode/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/mxnet:cpu",
-                    "name": "mxnet",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md
index f773e60..a767dbd 100644
--- a/recipes/MXNet-GPU/README.md
+++ b/recipes/MXNet-GPU/README.md
@@ -70,8 +70,6 @@ array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation.
 This can be `alfpark/mxnet:gpu`. Please note that the `docker_images` in
 the Global Configuration should match this image name.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the command to pass to the Docker run invocation.
 For this example, we will run the CIFAR-10 Resnet example across distributed
 nodes in the `alfpark/mxnet:gpu` Docker image. Note that for multinode jobs,
diff --git a/recipes/MXNet-GPU/config/multinode/jobs.json b/recipes/MXNet-GPU/config/multinode/jobs.json
index 7e06deb..823a671 100644
--- a/recipes/MXNet-GPU/config/multinode/jobs.json
+++ b/recipes/MXNet-GPU/config/multinode/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/mxnet:gpu",
-                    "name": "mxnet",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/NAMD-Infiniband-IntelMPI/README.md b/recipes/NAMD-Infiniband-IntelMPI/README.md
index c92a9e8..41f1924 100644
--- a/recipes/NAMD-Infiniband-IntelMPI/README.md
+++ b/recipes/NAMD-Infiniband-IntelMPI/README.md
@@ -31,8 +31,6 @@ The jobs configuration should set the following properties within the `tasks`
 array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation,
 e.g., `alfpark/namd:2.11-icc-mkl-intelmpi`
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the `mpirun` command. If using the sample
 `run_namd.sh` script then `"/sw/run_namd.sh <benchmark> <steps> <ppn>"`
 can be used to run the included benchmarks:
diff --git a/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json b/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json
index 42064cb..a9ad111 100644
--- a/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json
+++ b/recipes/NAMD-Infiniband-IntelMPI/config/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/namd:2.11-icc-mkl-intelmpi",
-                    "name": "namd",
                     "remove_container_after_exit": true,
                     "command": "/sw/run_namd.sh apoa1 1000",
                     "infiniband": true,
diff --git a/recipes/NAMD-TCP/README.md b/recipes/NAMD-TCP/README.md
index 94c6494..97a519b 100644
--- a/recipes/NAMD-TCP/README.md
+++ b/recipes/NAMD-TCP/README.md
@@ -27,8 +27,6 @@ The jobs configuration should set the following properties within the `tasks`
 array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation,
 e.g., `alfpark/namd:2.11-tcp`
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the `mpirun` command. If using the sample NAMD-TCP
 image provided, `"/sw/run_namd.sh <benchmark> <steps> <ppn>"` can be used
 to run the included benchmarks:
diff --git a/recipes/NAMD-TCP/config/jobs.json b/recipes/NAMD-TCP/config/jobs.json
index 4c62849..5da56a0 100644
--- a/recipes/NAMD-TCP/config/jobs.json
+++ b/recipes/NAMD-TCP/config/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/namd:2.11-tcp",
-                    "name": "namd",
                     "remove_container_after_exit": true,
                     "command": "/sw/run_namd.sh apoa1 100",
                     "multi_instance": {
diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/README.md b/recipes/OpenFOAM-Infiniband-IntelMPI/README.md
index 86a502d..8710b38 100644
--- a/recipes/OpenFOAM-Infiniband-IntelMPI/README.md
+++ b/recipes/OpenFOAM-Infiniband-IntelMPI/README.md
@@ -37,8 +37,6 @@ The jobs configuration should set the following properties within the `tasks`
 array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation.
 For this example, this can be `alfpark/openfoam:4.0-icc-intelmpi`.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the `mpirun` command. If using the sample
 `run_sample.sh` script then the command should be simply:
 `/opt/OpenFOAM/run_sample.sh`
diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json b/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json
index 25816ff..95bbe0d 100644
--- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json
+++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/openfoam:4.0-icc-intelmpi",
-                    "name": "openfoam",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/OpenFOAM-TCP-OpenMPI/README.md b/recipes/OpenFOAM-TCP-OpenMPI/README.md
index 11d3d04..b7a41bd 100644
--- a/recipes/OpenFOAM-TCP-OpenMPI/README.md
+++ b/recipes/OpenFOAM-TCP-OpenMPI/README.md
@@ -32,8 +32,6 @@ The jobs configuration should set the following properties within the `tasks`
 array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation.
 For this example, this should be `alfpark/openfoam:4.0-gcc-openmpi`.
-* `name` is a unique name given to the Docker container instance. This is
-required for Multi-Instance tasks.
 * `command` should contain the `mpirun` command. If using the sample
 `run_sample.sh` script then the command should be simply:
 `/opt/OpenFOAM/run_sample.sh`
diff --git a/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json b/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json
index 348fbfd..35542dd 100644
--- a/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json
+++ b/recipes/OpenFOAM-TCP-OpenMPI/config/jobs.json
@@ -6,7 +6,6 @@
             "tasks": [
                 {
                     "image": "alfpark/openfoam:4.0-gcc-openmpi",
-                    "name": "openfoam",
                     "remove_container_after_exit": true,
                     "shared_data_volumes": [
                         "glustervol"
diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md
index d9a76c0..42a6c1b 100644
--- a/recipes/TensorFlow-Distributed/README.md
+++ b/recipes/TensorFlow-Distributed/README.md
@@ -52,13 +52,13 @@ array which should have a task definition containing:
 * `image` should be the name of the Docker image for this container invocation,
 e.g., `alfpark/tensorflow/0.10.0-gpu` or `alfpark/tensorflow/0.10.0-cpu`
 * `command` should contain the command to pass to the Docker run invocation.
-To run the example MNIST replica example on GPUs, the `command` would look
-like: `"/bin/bash /sw/launcher.sh --num_gpus=<number of total gpus>"` where
-the total number of gpus would be specified for the `--num_gpus` parameter.
+To run the MNIST replica example, the `command` would look
+like: `"/bin/bash /sw/launcher.sh"`. The launcher will automatically detect
+the number of GPUs and pass the correct number to the TensorFlow script.
 Please see the [launcher.sh](docker/gpu/launcher.sh) for the launcher source.
-`--num_gpus=` parameter must be omitted if run on CPUs.
 * `gpu` must be set to `true` if run on GPUs. This enables invoking the
-`nvidia-docker` wrapper.
+`nvidia-docker` wrapper. This property should be omitted or set to `false`
+if run on CPUs.
 * `multi_instance` property must be defined
   * `num_instances` should be set to `pool_specification_vm_count` or
     `pool_current_dedicated`
diff --git a/recipes/TensorFlow-Distributed/config/cpu/config.json b/recipes/TensorFlow-Distributed/config/cpu/config.json
new file mode 100644
index 0000000..f737263
--- /dev/null
+++ b/recipes/TensorFlow-Distributed/config/cpu/config.json
@@ -0,0 +1,11 @@
+{
+    "batch_shipyard": {
+        "storage_account_settings": "<storage account specified in credentials.json>",
+        "storage_entity_prefix": "shipyard"
+    },
+    "global_resources": {
+        "docker_images": [
+            "alfpark/tensorflow:0.10.0-cpu"
+        ]
+    }
+}
diff --git a/recipes/TensorFlow-Distributed/config/credentials.json b/recipes/TensorFlow-Distributed/config/cpu/credentials.json
similarity index 100%
rename from recipes/TensorFlow-Distributed/config/credentials.json
rename to recipes/TensorFlow-Distributed/config/cpu/credentials.json
diff --git a/recipes/TensorFlow-Distributed/config/cpu/jobs.json b/recipes/TensorFlow-Distributed/config/cpu/jobs.json
new file mode 100644
index 0000000..94dfa6f
--- /dev/null
+++ b/recipes/TensorFlow-Distributed/config/cpu/jobs.json
@@ -0,0 +1,21 @@
+{
+    "job_specifications": [
+        {
+            "id": "tensorflow",
+            "multi_instance_auto_complete": true,
+            "tasks": [
+                {
+                    "image": "alfpark/tensorflow:0.10.0-cpu",
+                    "remove_container_after_exit": true,
+                    "command": "/bin/bash /sw/launcher.sh",
+                    "multi_instance": {
+                        "num_instances": "pool_specification_vm_count",
+                        "coordination_command": null,
+                        "resource_files": [
+                        ]
+                    }
+                }
+            ]
+        }
+    ]
+}
diff --git a/recipes/TensorFlow-Distributed/config/cpu/pool.json b/recipes/TensorFlow-Distributed/config/cpu/pool.json
new file mode 100644
index 0000000..4e3d80b
--- /dev/null
+++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json
@@ -0,0 +1,16 @@
+{
+    "pool_specification": {
+        "id": "tensorflow-distributed",
+        "vm_size": "STANDARD_D4_V2",
+        "vm_count": 2,
+        "inter_node_communication_enabled": true,
+        "publisher": "Canonical",
+        "offer": "UbuntuServer",
+        "sku": "16.04.0-LTS",
+        "ssh": {
+            "username": "docker"
+        },
+        "reboot_on_start_task_failed": false,
+        "block_until_all_global_resources_loaded": true
+    }
+}
diff --git a/recipes/TensorFlow-Distributed/config/config.json b/recipes/TensorFlow-Distributed/config/gpu/config.json
similarity index 100%
rename from recipes/TensorFlow-Distributed/config/config.json
rename to recipes/TensorFlow-Distributed/config/gpu/config.json
diff --git a/recipes/TensorFlow-Distributed/config/gpu/credentials.json b/recipes/TensorFlow-Distributed/config/gpu/credentials.json
new file mode 100644
index 0000000..451e167
--- /dev/null
+++ b/recipes/TensorFlow-Distributed/config/gpu/credentials.json
@@ -0,0 +1,16 @@
+{
+    "credentials": {
+        "batch": {
+            "account": "<batch account name>",
+            "account_key": "<batch account key>",
+            "account_service_url": "<batch account service url>"
+        },
+        "storage": {
+            "mystorageaccount": {
+                "account": "<storage account name>",
+                "account_key": "<storage account key>",
+                "endpoint": "core.windows.net"
+            }
+        }
+    }
+}
diff --git a/recipes/TensorFlow-Distributed/config/jobs.json b/recipes/TensorFlow-Distributed/config/gpu/jobs.json
similarity index 84%
rename from recipes/TensorFlow-Distributed/config/jobs.json
rename to recipes/TensorFlow-Distributed/config/gpu/jobs.json
index bb10f89..22c4273 100644
--- a/recipes/TensorFlow-Distributed/config/jobs.json
+++ b/recipes/TensorFlow-Distributed/config/gpu/jobs.json
@@ -6,9 +6,8 @@
             "tasks": [
                 {
                     "image": "alfpark/tensorflow:0.10.0-gpu",
-                    "name": "tensorflow",
                     "remove_container_after_exit": true,
-                    "command": "/bin/bash /sw/launcher.sh --num_gpus 2",
+                    "command": "/bin/bash /sw/launcher.sh",
                     "gpu": true,
                     "multi_instance": {
                         "num_instances": "pool_specification_vm_count",
diff --git a/recipes/TensorFlow-Distributed/config/pool.json b/recipes/TensorFlow-Distributed/config/gpu/pool.json
similarity index 100%
rename from recipes/TensorFlow-Distributed/config/pool.json
rename to recipes/TensorFlow-Distributed/config/gpu/pool.json
diff --git a/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh b/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh
index 76f8a70..d5b8ef8 100755
--- a/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh
+++ b/recipes/TensorFlow-Distributed/docker/gpu/launcher.sh
@@ -3,6 +3,15 @@
 set -e
 set -o pipefail
 
+# get number of GPUs on machine
+ngpus=`nvidia-smi -L | wc -l`
+echo "num gpus: $ngpus"
+
+if [ $ngpus -eq 0 ]; then
+    echo "No GPUs detected."
+    exit 1
+fi
+
 # get my ip address
 ipaddress=`ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1`
 
@@ -40,7 +49,7 @@ if [ $AZ_BATCH_IS_CURRENT_NODE_MASTER == "true" ]; then
     # master node
     ti=${task_index[$master]}
     echo "master node: $ipaddress task index: $ti"
-    python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master $* > ps-$ti.log 2>&1 &
+    python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master --num_gpus=$ngpus $* > ps-$ti.log 2>&1 &
     masterpid=$!
 fi
 
@@ -52,10 +61,10 @@ do
     ti=${task_index[$node]}
     echo "worker node: $node task index: $ti"
     if [ $node == $master ]; then
-        python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti $* > worker-$ti.log 2>&1 &
+        python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti --num_gpus=$ngpus $* > worker-$ti.log 2>&1 &
         waitpids=("${waitpids[@]}" "$!")
     else
-        ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
+        ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti --num_gpus=$ngpus $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
         waitpids=("${waitpids[@]}" "$!")
     fi
 done