Remove name requirement for multi-instance tasks

- Update TensorFlow-Distributed GPU launcher script to autodetect GPUs
- Separate config files for TensorFlow-Distributed into CPU and GPU variants
Fred Park 2016-10-27 23:41:47 -07:00
Parent 5f160e8938
Commit 72f9c90baf
29 changed files with 138 additions and 65 deletions

View file

@@ -43,7 +43,10 @@ Please see usage doc for more information.
been replaced by the `ssh` property. `generate_tunnel_script` has been renamed
to `generate_docker_tunnel_script`. Please see the configuration doc for
more information.
- `streamfile` no longer has an arbitrary max streaming time; the action will
- The `name` property of a task json object in the jobs specification is no
longer required for multi-instance tasks. If not specified, `name` defaults
to `id` for all task types.
- `data stream` no longer has an arbitrary max streaming time; the action will
stream the file indefinitely until the task completes
- Validate container with `storage_entity_prefix` for length issues
- `delpool` action now cleans up and deletes some storage containers

View file

@@ -1215,6 +1215,32 @@ def list_task_files(batch_client, config):
logger.error('no tasks found for job {}'.format(job['id']))
def _generate_next_generic_task_id(batch_client, job_id, reserved=None):
# type: (azure.batch.batch_service_client.BatchServiceClient, str,
# str) -> str
"""Generate the next generic task id
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param str job_id: job id
:param str reserved: reserved task id
:rtype: str
:return: returns a generic docker task id
"""
# get filtered, sorted list of generic docker task ids
try:
tasklist = sorted(filter(
lambda x: x.id.startswith(_GENERIC_DOCKER_TASK_PREFIX),
(batch_client.task.list(job_id))), key=lambda y: y.id)
tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
except (batchmodels.batch_error.BatchErrorException, IndexError):
tasknum = 0
if reserved is not None:
tasknum_reserved = int(reserved.split('-')[-1])
while tasknum == tasknum_reserved:
tasknum += 1
return '{0}{1:03d}'.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum)
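As an aside, a minimal usage sketch of this helper, assuming (hypothetically) that `_GENERIC_DOCKER_TASK_PREFIX` is `'dockertask-'` and the job already holds tasks `dockertask-000` and `dockertask-001`:
# usage sketch only, not part of this diff; assumes the prefix above
# and an authenticated batch_client
next_id = _generate_next_generic_task_id(batch_client, 'myjob')
# -> 'dockertask-002'
next_id = _generate_next_generic_task_id(
    batch_client, 'myjob', reserved='dockertask-002')
# -> 'dockertask-003' (the reserved collision is skipped)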
def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
# type: (batch.BatchServiceClient, azureblob.BlockBlobService,
# dict, tuple, tuple) -> None
@@ -1261,7 +1287,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
mi_ac = True
job.uses_task_dependencies = False
multi_instance = False
docker_container_name = None
mi_docker_container_name = None
reserved_task_id = None
for task in jobspec['tasks']:
# do not break, check to ensure ids are set on each task if
# task dependencies are set
@@ -1277,20 +1304,26 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
'cannot specify more than one multi-instance task '
'per job with auto completion enabled')
multi_instance = True
docker_container_name = task['name']
try:
mi_docker_container_name = task['name']
if task['name'] is None or len(task['name']) == 0:
raise KeyError()
except KeyError:
if ('id' not in task or task['id'] is None or
len(task['id']) == 0):
reserved_task_id = _generate_next_generic_task_id(
batch_client, job.id)
task['id'] = reserved_task_id
task['name'] = task['id']
mi_docker_container_name = task['name']
# add multi-instance settings
set_terminate_on_all_tasks_complete = False
if multi_instance and mi_ac:
if (docker_container_name is None or
len(docker_container_name) == 0):
raise ValueError(
'multi-instance task must be invoked with a named '
'container')
set_terminate_on_all_tasks_complete = True
job.job_release_task = batchmodels.JobReleaseTask(
command_line=convoy.util.wrap_commands_in_shell(
['docker stop {}'.format(docker_container_name),
'docker rm -v {}'.format(docker_container_name)]),
['docker stop {}'.format(mi_docker_container_name),
'docker rm -v {}'.format(mi_docker_container_name)]),
run_elevated=True,
)
logger.info('Adding job: {}'.format(job.id))
@@ -1306,7 +1339,7 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
raise
del mi_ac
del multi_instance
del docker_container_name
del mi_docker_container_name
# add all tasks under job
for task in jobspec['tasks']:
# get image name
@@ -1317,19 +1350,8 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
if task_id is None or len(task_id) == 0:
raise KeyError()
except KeyError:
# get filtered, sorted list of generic docker task ids
try:
tasklist = sorted(
filter(lambda x: x.id.startswith(
_GENERIC_DOCKER_TASK_PREFIX), list(
batch_client.task.list(job.id))),
key=lambda y: y.id)
tasknum = int(tasklist[-1].id.split('-')[-1]) + 1
except (batchmodels.batch_error.BatchErrorException,
IndexError):
tasknum = 0
task_id = '{0}{1:03d}'.format(
_GENERIC_DOCKER_TASK_PREFIX, tasknum)
task_id = _generate_next_generic_task_id(
batch_client, job.id, reserved_task_id)
# set run and exec commands
docker_run_cmd = 'docker run'
docker_exec_cmd = 'docker exec'
@@ -1346,13 +1368,14 @@ def add_jobs(batch_client, blob_client, config, jpfile, bxfile):
else:
if rm_container and '--rm' not in run_opts:
run_opts.append('--rm')
# parse name option
# parse name option, if not specified use task id
try:
name = task['name']
if name is not None:
run_opts.append('--name {}'.format(name))
if name is None or len(name) == 0:
raise KeyError()
except KeyError:
name = None
name = task['id']
run_opts.append('--name {}'.format(name))
# parse labels option
try:
labels = task['labels']
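Stripped of the surrounding job-submission plumbing, the defaulting rule these hunks implement reduces to the following standalone sketch (`generate_id` stands in for the `_generate_next_generic_task_id` call; this is not the repo's literal code):
# sketch: resolve a task's id and container name per the new rules
def resolve_task_name(task, generate_id):
    if not task.get('id'):    # no usable id: generate the next generic one
        task['id'] = generate_id()
    if not task.get('name'):  # no usable name: default to the id
        task['name'] = task['id']
    return task['name']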

View file

@@ -735,8 +735,8 @@ transferred again. This object currently supports `azure_batch` and
invocation (task) depends on and must run to successful completion prior
to this task executing.
* (required) `image` is the Docker image to use for this task
* `name` is the name to assign to the container. This is required for
multi-instance tasks, optional if not.
* (optional) `name` is the name to assign to the container. If not
specified, the value of the `id` property will be used for `name` (see the
sketch after this list).
* (optional) `labels` is an array of labels to apply to the container.
* (optional) `environment_variables` are any additional task-specific
environment variables that should be applied to the container.
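As a concrete illustration of the new default, a hedged sketch expressed as a Python dict mirroring the task json (all values are placeholders, not from this repo):
# task json with `name` omitted; the container is named after the id
task = {
    'id': 'mytask-0',                  # placeholder id
    'image': 'myrepo/myimage:latest',  # placeholder image
    'command': 'echo hello',
}
# effective container name == task['id'] == 'mytask-0'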

View file

@@ -45,8 +45,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-cpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-cpu-openmpi-refdata` Docker

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/cntk:1.7.2-cpu-openmpi-refdata",
"name": "cntk",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -69,9 +69,6 @@ array which should have a task definition containing:
Since we are not using either the MNIST or CIFAR examples, this can simply
be `alfpark/cntk:1.7.2-gpu-openmpi`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks. This is not required for running on a
single node.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the ConvNet MNIST Example that has been modified
to run in parallel in the `alfpark/cntk:1.7.2-gpu-openmpi-refdata` Docker

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/cntk:1.7.2-gpu-openmpi-refdata",
"name": "cntk",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -49,8 +49,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:cpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 example across distributed nodes
in the `alfpark/mxnet:cpu` Docker image. The application `command` to run

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/mxnet:cpu",
"name": "mxnet",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -70,8 +70,6 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
This can be `alfpark/mxnet:gpu`. Please note that the `docker_images` in
the Global Configuration should match this image name.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the command to pass to the Docker run invocation.
For this example, we will run the CIFAR-10 Resnet example across distributed
nodes in the `alfpark/mxnet:gpu` Docker image. Note that for multinode jobs,

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/mxnet:gpu",
"name": "mxnet",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -31,8 +31,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-icc-mkl-intelmpi`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_namd.sh` script then `"/sw/run_namd.sh <benchmark> <steps> <ppn>"`
can be used to run the included benchmarks:

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/namd:2.11-icc-mkl-intelmpi",
"name": "namd",
"remove_container_after_exit": true,
"command": "/sw/run_namd.sh apoa1 1000",
"infiniband": true,

View file

@@ -27,8 +27,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/namd:2.11-tcp`
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample NAMD-TCP
image provided, `"/sw/run_namd.sh <benchmark> <steps> <ppn>"` can be used
to run the included benchmarks:

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/namd:2.11-tcp",
"name": "namd",
"remove_container_after_exit": true,
"command": "/sw/run_namd.sh apoa1 100",
"multi_instance": {

View file

@@ -37,8 +37,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this can be `alfpark/openfoam:4.0-icc-intelmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/openfoam:4.0-icc-intelmpi",
"name": "openfoam",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -32,8 +32,6 @@ The jobs configuration should set the following properties within the `tasks`
array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation.
For this example, this should be `alfpark/openfoam:4.0-gcc-openmpi`.
* `name` is a unique name given to the Docker container instance. This is
required for Multi-Instance tasks.
* `command` should contain the `mpirun` command. If using the sample
`run_sample.sh` script then the command should be simply:
`/opt/OpenFOAM/run_sample.sh`

View file

@@ -6,7 +6,6 @@
"tasks": [
{
"image": "alfpark/openfoam:4.0-gcc-openmpi",
"name": "openfoam",
"remove_container_after_exit": true,
"shared_data_volumes": [
"glustervol"

View file

@@ -52,13 +52,13 @@ array which should have a task definition containing:
* `image` should be the name of the Docker image for this container invocation,
e.g., `alfpark/tensorflow:0.10.0-gpu` or `alfpark/tensorflow:0.10.0-cpu`
* `command` should contain the command to pass to the Docker run invocation.
To run the example MNIST replica example on GPUs, the `command` would look
like: `"/bin/bash /sw/launcher.sh --num_gpus=<number of total gpus>"` where
the total number of gpus would be specified for the `--num_gpus` parameter.
To run the MNIST replica example, the `command` would look
like: `"/bin/bash /sw/launcher.sh"`. The launcher will automatically detect
the number of GPUs and pass the correct number to the TensorFlow script.
Please see [launcher.sh](docker/gpu/launcher.sh) for the launcher source;
a rough sketch of the detection step follows this list.
`--num_gpus=` parameter must be omitted if run on CPUs.
* `gpu` must be set to `true` if run on GPUs. This enables invoking the
`nvidia-docker` wrapper.
`nvidia-docker` wrapper. This property should be omitted or set to `false`
if run on CPUs.
* `multi_instance` property must be defined
* `num_instances` should be set to `pool_specification_vm_count` or
`pool_current_dedicated`
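For reference, a rough Python equivalent of the launcher's GPU detection step (the real launcher does this in bash via `nvidia-smi -L`; this sketch assumes `nvidia-smi` is on the PATH):
import subprocess

# nvidia-smi -L prints one 'GPU N: ...' line per device,
# so the line count is the GPU count
out = subprocess.run(['nvidia-smi', '-L'], capture_output=True,
                     text=True, check=True)
ngpus = len(out.stdout.splitlines())
print('num gpus: {}'.format(ngpus))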

View file

@@ -0,0 +1,11 @@
{
"batch_shipyard": {
"storage_account_settings": "<storage account specified in credentials.json>",
"storage_entity_prefix": "shipyard"
},
"global_resources": {
"docker_images": [
"alfpark/tensorflow:0.10.0-cpu"
]
}
}

View file

@@ -0,0 +1,21 @@
{
"job_specifications": [
{
"id": "tensorflow",
"multi_instance_auto_complete": true,
"tasks": [
{
"image": "alfpark/tensorflow:0.10.0-cpu",
"remove_container_after_exit": true,
"command": "/bin/bash /sw/launcher.sh",
"multi_instance": {
"num_instances": "pool_specification_vm_count",
"coordination_command": null,
"resource_files": [
]
}
}
]
}
]
}

View file

@@ -0,0 +1,16 @@
{
"pool_specification": {
"id": "tensorflow-distributed",
"vm_size": "STANDARD_D4_V2",
"vm_count": 2,
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
"sku": "16.04.0-LTS",
"ssh": {
"username": "docker"
},
"reboot_on_start_task_failed": false,
"block_until_all_global_resources_loaded": true
}
}

View file

@@ -0,0 +1,16 @@
{
"credentials": {
"batch": {
"account": "<batch account name>",
"account_key": "<batch account key>",
"account_service_url": "<batch account service url>"
},
"storage": {
"mystorageaccount": {
"account": "<storage account name>",
"account_key": "<storage account key>",
"endpoint": "core.windows.net"
}
}
}
}

View file

@@ -6,9 +6,8 @@
"tasks": [
{
"image": "alfpark/tensorflow:0.10.0-gpu",
"name": "tensorflow",
"remove_container_after_exit": true,
"command": "/bin/bash /sw/launcher.sh --num_gpus 2",
"command": "/bin/bash /sw/launcher.sh",
"gpu": true,
"multi_instance": {
"num_instances": "pool_specification_vm_count",

View file

@@ -3,6 +3,15 @@
set -e
set -o pipefail
# get number of GPUs on machine
ngpus=`nvidia-smi -L | wc -l`
echo "num gpus: $ngpus"
if [ $ngpus -eq 0 ]; then
echo "No GPUs detected."
exit 1
fi
# get my ip address
ipaddress=`ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1`
@@ -40,7 +49,7 @@ if [ $AZ_BATCH_IS_CURRENT_NODE_MASTER == "true" ]; then
# master node
ti=${task_index[$master]}
echo "master node: $ipaddress task index: $ti"
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master $* > ps-$ti.log 2>&1 &
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=ps --task_index=$ti --data_dir=./master --num_gpus=$ngpus $* > ps-$ti.log 2>&1 &
masterpid=$!
fi
@@ -52,10 +61,10 @@ do
ti=${task_index[$node]}
echo "worker node: $node task index: $ti"
if [ $node == $master ]; then
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti $* > worker-$ti.log 2>&1 &
python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=./worker-$ti --num_gpus=$ngpus $* > worker-$ti.log 2>&1 &
waitpids=("${waitpids[@]}" "$!")
else
ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
ssh $node "python /sw/mnist_replica.py --ps_hosts=$ps_hosts --worker_hosts=$worker_hosts --job_name=worker --task_index=$ti --data_dir=$AZ_BATCH_TASK_WORKING_DIR/worker-$ti --num_gpus=$ngpus $* > $AZ_BATCH_TASK_WORKING_DIR/worker-$ti.log 2>&1" &
waitpids=("${waitpids[@]}" "$!")
fi
done