Add custom image guide

- Update recipes for vm_configuration - Fix some issues with platform pools with new changes
2017-06-06 09:31:32 -07:00 · 2017-06-06 09:31:32 -07:00 · a41713c5ee
--- a/README.md
+++ b/README.md
@ -63,6 +63,7 @@ tunneling to Docker Hosts on compute nodes
 [Azure KeyVault](https://azure.microsoft.com/en-us/services/key-vault/)
 * Support for execution on an
 [Azure Function App environment](https://github.com/Azure/batch-shipyard/blob/master/docs/60-batch-shipyard-site-extension.md)
+* Support for [custom host images](https://github.com/Azure/batch-shipyard/blob/master/docs/63-batch-shipyard-custom-images.md)

 ## Installation
 Installation is typically an easy two-step process. The CLI is also available
--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@ -880,9 +880,11 @@ def _add_pool(
            for image_ref in sorted(
                    nas.verified_image_references, key=lambda item: item.sku)
            if image_ref.publisher.lower() ==
-            pool_settings.publisher.lower() and
-            image_ref.offer.lower() == pool_settings.offer.lower() and
-            image_ref.sku.lower() == pool_settings.sku.lower()
+            pool_settings.vm_configuration.publisher.lower() and
+            image_ref.offer.lower() ==
+            pool_settings.vm_configuration.offer.lower() and
+            image_ref.sku.lower() ==
+            pool_settings.vm_configuration.sku.lower()
        ]
        try:
            sku_to_use, image_ref_to_use = skus_to_use[-1]
@ -912,11 +914,11 @@ def _add_pool(
                    sc_args) else '',
                n=' -n' if settings.can_tune_tcp(
                    pool_settings.vm_size) else '',
-                o=' -o {}'.format(pool_settings.offer),
+                o=' -o {}'.format(pool_settings.vm_configuration.offer),
                p=' -p {}'.format(bs.storage_entity_prefix)
                if bs.storage_entity_prefix else '',
                r=' -r {}'.format(preg.container) if preg.container else '',
-                s=' -s {}'.format(pool_settings.sku),
+                s=' -s {}'.format(pool_settings.vm_configuration.sku),
                t=' -t {}'.format(torrentflags),
                v=' -v {}'.format(__version__),
                w=' -w' if pool_settings.ssh.hpn_server_swap else '',
@ -1548,13 +1550,15 @@ def _adjust_settings_for_pool_creation(config):
        raise ValueError(
            ('Unsupported Docker Host VM Config, publisher={} offer={} '
             'sku={} vm_size={}').format(publisher, offer, sku, pool.vm_size))
+    # compute total vm count
+    pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority
    # ensure enough vhds for custom image pools
    if util.is_not_empty(node_agent):
        vhds = len(pool.vm_configuration.image_uris)
        if node_agent == 'batch.node.windows amd64':
-            vhds_req = int(math.ceil(pool.vm_count / 20))
+            vhds_req = int(math.ceil(pool_total_vm_count / 20))
        else:
-            vhds_req = int(math.ceil(pool.vm_count / 40))
+            vhds_req = int(math.ceil(pool_total_vm_count / 40))
        if vhds_req > vhds:
            raise ValueError(
                ('insufficient number of VHDs ({}) supplied for the number '
@ -1569,7 +1573,6 @@ def _adjust_settings_for_pool_creation(config):
             'VM config, publisher={} offer={} sku={}').format(
                 publisher, offer, sku))
    # adjust inter node comm setting
-    pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority
    if pool_total_vm_count < 1:
        raise ValueError('invalid total vm_count: {}'.format(
            pool_total_vm_count))
--- a/convoy/settings.py
+++ b/convoy/settings.py
@ -2164,8 +2164,9 @@ def task_settings(cloud_pool, config, poolconf, jobspec, conf, missing_images):
        pool_id = cloud_pool.id
        vm_size = cloud_pool.vm_size.lower()
        inter_node_comm = cloud_pool.enable_inter_node_communication
-        is_custom_image = util.is_none_or_empty(
-            cloud_pool.virtual_machine_configuration.os_disk)
+        is_custom_image = (
+            cloud_pool.virtual_machine_configuration.os_disk is not None
+        )
        if is_custom_image:
            publisher = None
            offer = None
--- a/docs/63-batch-shipyard-custom-images.md
+++ b/docs/63-batch-shipyard-custom-images.md
@ -0,0 +1,115 @@
+# Custom Images with Batch Shipyard
+The focus of this article is to explain how to provision a custom image (VHD)
+and then deploy it with Batch Shipyard as the VM image to use for your
+compute node hosts.
+
+## Background: Azure Batch, Azure Storage and Custom Images
+Azure Batch allows provisioning compute nodes with custom images (VHDs) with
+User Subscription Batch accounts. This allows users to customize the
+compute node with software, settings, etc. that fit their use case. With
+containerization, this requirement is weakened but some users may still
+want to customize the host compute node environment with particular
+versions of software such as the Docker Host engine or even embed the GPU
+driver for potential faster provisioning times.
+
+Azure Storage is used to host these custom image VHDs. Currently, there are
+two sources for creating virtual machines in Azure: page blob VHDs and
+managed disks. Azure Batch does not currently support managed disks, so you
+will need to create page blobs with your VHD image.
+
+Due to Storage account throttling limits, you must limit the number of
+compute nodes served from a single storage account (and thus VHD). For
+maximum performance, you should provide one VHD for every 40 Linux VMs (or
+every 20 Windows VMs), and these VHDs should be on separate storage accounts
+within the same subscription and in the same region as your Batch account.
+You can use [blobxfer](https://github.com/Azure/blobxfer) or
+[AzCopy](https://azure.microsoft.com/en-us/documentation/articles/storage-use-azcopy/)
+to copy your VHD images.
+
+## Provisioning a Custom Image
+You will need to ensure that your custom image is sufficiently prepared
+before using it as a source VHD for Batch Shipyard. The following
+sub-section will detail the reasons and requisites.
+
+### Batch Shipyard Node Preparation and Custom Images
+For non-custom images (i.e., platform images or Marketplace images), Batch
+Shipyard takes care of preparing the compute node with the necessary
+software in order for tasks to run with Batch Shipyard.
+
+Because custom images can muddy assumptions about what is or is not
+available in the operating system, Batch Shipyard requires that the user
+prepare the custom image with the necessary software, and it only attempts
+to modify items that are needed for functionality. Required software is
+checked during compute node preparation.
+
+### Base Required Software
+#### Docker Host Engine
+The [Docker](https://docker.com) host engine must be installed and must
+be invocable as root with default path and permissions. The service must
+be running upon boot. The Docker socket (`/var/run/docker.sock`) must
+be available (it is available by default).
+
+#### SSH Server
+An SSH server should be installed and operational on port 22. You can
+limit inbound connections through the Batch service deployed NSG on the
+virtual network or network interface (and/or through the software firewall
+on the host).
+
+#### GPU-enabled Compute Nodes
+In order to utilize the GPUs available on compute nodes that have them
+(e.g., N-series VMs), the NVIDIA driver must be installed and loaded upon
+boot.
+
+Additionally, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+must be installed and the service must be running upon boot.
+
+#### Infiniband/RDMA-enabled Compute Nodes
+The host VM Infiniband/RDMA stack must be enabled with the proper drivers
+and the required user-land software for Infiniband installed. It is best to
+base a custom image off of the existing Azure platform images that support
+Infiniband/RDMA.
+
+#### Storage Cluster Auto-Linking and Mounting
+If mounting a storage cluster, the required NFSv4 or GlusterFS client tooling
+must be installed and invocable such that the auto-link mount functionality
+is operable. Both clients need not be installed unless you are mounting
+both types of storage clusters.
+
+#### GlusterFS On Compute
+If a GlusterFS on compute shared data volume is required, then GlusterFS
+server and client tooling must be installed and invocable so the shared
+data volume can be created amongst the compute nodes.
+
+### Installed/Configured Software
+#### Encryption Certificates and Credential Decryption
+If credential encryption is enabled, Batch Shipyard will exercise the
+necessary logic to decrypt any encrypted fields.
+Properties in the global configuration should be enabled as per requirements,
+just as if deploying a non-custom-image-based compute node.
+
+#### Batch Shipyard Docker Images
+Batch Shipyard Docker images required for functionality on the compute node
+will be automatically installed.
+
+#### Azure File Docker Volume Driver
+Batch Shipyard will install and configure the Azure File Docker Volume
+Driver for any Azure File shared data volumes that are specified.
+
+### Packer Samples
+The [contrib](../contrib) area of the repository contains example `packer`
+scripts to create a custom image from an existing Marketplace platform image.
+
+## Allocating a Pool with a Custom Image
+When allocating a compute pool with a custom image, you must ensure the
+following:
+
+0. You have a User Subscription Batch account
+1. Custom image VHD is in your storage account as a page blob
+2. The storage account is in the same subscription and region as your
+   *User Subscription* Batch account
+3. You have sufficiently replicated the custom image VHD across enough
+   storage accounts to support your compute pool
+4. You have URIs for all of these custom image VHDs. These URIs should not
+   include SAS information of any kind. They should be "bare" URLs.
+5. Your pool configuration file has the proper `vm_configuration` settings
+   for `custom_image`
--- a/docs/95-low-priority-considerations.md
+++ b/docs/95-low-priority-considerations.md
@ -1,8 +1,12 @@
 # Low Priority Compute Node Considerations
 Please read the following carefully concerning pools allocated with low-
-priority compute nodes.
+priority compute nodes. You may also want to read the
+[Azure Batch Low Priority Compute Node](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms)
+documentation.

 ### Pool Allocation and Resizing
+* Low priority compute nodes can only be allocated with non-User Subscription
+Batch accounts.
 * Pool and compute node allocation may take up to the full resize timeout
 and not reach full allocation with low priority if a low priority node is
 pre-empted and the target number of low priority nodes cannot be reached.
--- a/docs/97-faq.md
+++ b/docs/97-faq.md
@ -49,6 +49,8 @@ factors that Batch Shipyard has no control over.
    regarding your request. Pull requests are always welcome!
 * How do I contribute a recipe?
  * Please see this [guide](98-contributing-recipes.md).
+* Does Batch Shipyard support Linux custom images?
+  * Yes, please see [the guide](63-batch-shipyard-custom-images.md).
 * Does Batch Shipyard support Windows Server Containers?
  * Not at this time, we are tracking the issue
    [here](https://github.com/Azure/batch-shipyard/issues/7).
--- a/docs/README.md
+++ b/docs/README.md
@ -13,6 +13,7 @@ and effectively running your batch-style Docker workloads on Azure Batch.
   * [FS Configuration](15-batch-shipyard-configuration-fs.md)
 5. [Usage](20-batch-shipyard-usage.md)
 6. [Azure Functions and Batch Shipyard](60-batch-shipyard-site-extension.md)
+7. [Custom Image for Host Compute Nodes](63-batch-shipyard-custom-images.md)
 7. [Remote Filesystems](65-batch-shipyard-remote-fs.md)
 8. [Data Movement](70-batch-shipyard-data-movement.md)
 9. [Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md)
--- a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json
+++ b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-cntk-rdma",
-        "vm_size": "STANDARD_A9",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "OpenLogic",
+                "offer": "CentOS-HPC",
+                "sku": "7.1"
+            }
+        },
+        "vm_size": "STANDARD_H16R",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS-HPC",
-        "sku": "7.1",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "cntk-cpu-multinode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 3
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "cntk-cpu-singlenode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "cntk-multinode-multigpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC24",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "cntk-singlenode-multigpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC24",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "cntk-singlenode-singlegpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Caffe-CPU/config/pool.json
+++ b/recipes/Caffe-CPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "caffe-cpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Caffe-GPU/config/pool.json
+++ b/recipes/Caffe-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "caffe-gpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Chainer-CPU/config/pool.json
+++ b/recipes/Chainer-CPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "chainer-cpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Chainer-GPU/config/pool.json
+++ b/recipes/Chainer-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "chainer-gpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/FFmpeg-GPU/config/pool.json
+++ b/recipes/FFmpeg-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "dockerffmpeg",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NV6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/HPCG-Infiniband-IntelMPI/config/pool.json
+++ b/recipes/HPCG-Infiniband-IntelMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-hpcg",
-        "vm_size": "STANDARD_A9",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "OpenLogic",
+                "offer": "CentOS-HPC",
+                "sku": "7.1"
+            }
+        },
+        "vm_size": "STANDARD_H16R",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS-HPC",
-        "sku": "7.1",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json
+++ b/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-linpack",
-        "vm_size": "STANDARD_A9",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "OpenLogic",
+                "offer": "CentOS-HPC",
+                "sku": "7.1"
+            }
+        },
+        "vm_size": "STANDARD_H16R",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS-HPC",
-        "sku": "7.1",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Keras+Theano-CPU/config/Dockerfile
+++ b/recipes/Keras+Theano-CPU/config/Dockerfile
@ -1,37 +0,0 @@
-# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch
-
-FROM ubuntu:14.04
-MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        gfortran \
-        git \
-        wget \
-        curl \
-        ca-certificates \
-        libhdf5-dev \
-        liblapack-dev \
-        libopenblas-dev \
-        python-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# upgrade pip and install dependencies
-RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \
-    pip install --upgrade --no-cache-dir setuptools wheel six && \
-    pip install --upgrade --no-cache-dir pyyaml nose h5py && \
-    pip install --upgrade --no-cache-dir numpy && \
-    pip install --upgrade --no-cache-dir scipy
-
-# install theano and keras
-RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \
-    git clone https://github.com/fchollet/keras.git && \
-    cd keras && \
-    python setup.py install
-
-# set keras backend to theano
-ENV KERAS_BACKEND=theano
-
-# copy in default theanorc file
-COPY theanorc /root/.theanorc
--- a/recipes/Keras+Theano-CPU/config/pool.json
+++ b/recipes/Keras+Theano-CPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "keras-cpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Keras+Theano-CPU/docker/Dockerfile
+++ b/recipes/Keras+Theano-CPU/docker/Dockerfile
@ -1,6 +1,7 @@
 # Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch

 FROM ubuntu:14.04
+MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>

 RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
--- a/recipes/Keras+Theano-GPU/config/Dockerfile
+++ b/recipes/Keras+Theano-GPU/config/Dockerfile
@ -1,37 +0,0 @@
-# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch
-
-FROM ubuntu:14.04
-MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        gfortran \
-        git \
-        wget \
-        curl \
-        ca-certificates \
-        libhdf5-dev \
-        liblapack-dev \
-        libopenblas-dev \
-        python-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-# upgrade pip and install dependencies
-RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \
-    pip install --upgrade --no-cache-dir setuptools wheel six && \
-    pip install --upgrade --no-cache-dir pyyaml nose h5py && \
-    pip install --upgrade --no-cache-dir numpy && \
-    pip install --upgrade --no-cache-dir scipy
-
-# install theano and keras
-RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \
-    git clone https://github.com/fchollet/keras.git && \
-    cd keras && \
-    python setup.py install
-
-# set keras backend to theano
-ENV KERAS_BACKEND=theano
-
-# copy in default theanorc file
-COPY theanorc /root/.theanorc
--- a/recipes/Keras+Theano-GPU/config/pool.json
+++ b/recipes/Keras+Theano-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "keras-gpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/MXNet-CPU/config/multinode/pool.json
+++ b/recipes/MXNet-CPU/config/multinode/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "mxnet-cpu-multinode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D4_V2",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/MXNet-CPU/config/singlenode/pool.json
+++ b/recipes/MXNet-CPU/config/singlenode/pool.json
@ -1,14 +1,17 @@
 {
    "pool_specification": {
        "id": "mxnet-cpu-singlenode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/MXNet-GPU/config/multinode/pool.json
+++ b/recipes/MXNet-GPU/config/multinode/pool.json
@ -1,13 +1,18 @@
 {
    "pool_specification": {
        "id": "mxnet-multinode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC24",
        "vm_count": {
            "dedicated": 2
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
+        "inter_node_communication_enabled": true,
        "ssh": {
            "username": "docker"
        },
--- a/recipes/MXNet-GPU/config/singlenode/pool.json
+++ b/recipes/MXNet-GPU/config/singlenode/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "mxnet-singlenode",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC24",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/NAMD-GPU/config/pool.json
+++ b/recipes/NAMD-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "namd-multigpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC12",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/NAMD-Infiniband-IntelMPI/config/pool.json
+++ b/recipes/NAMD-Infiniband-IntelMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-namd-rdma",
-        "vm_size": "STANDARD_A9",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "OpenLogic",
+                "offer": "CentOS-HPC",
+                "sku": "7.1"
+            }
+        },
+        "vm_size": "STANDARD_H16R",
        "vm_count": {
            "dedicated": 4
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS-HPC",
-        "sku": "7.1",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/NAMD-TCP/config/pool.json
+++ b/recipes/NAMD-TCP/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "namd-tcp",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D3_V2",
        "vm_count": {
            "dedicated": 4
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS",
-        "sku": "7.3",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json
+++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-openfoam-rdma",
-        "vm_size": "STANDARD_A9",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "OpenLogic",
+                "offer": "CentOS-HPC",
+                "sku": "7.1"
+            }
+        },
+        "vm_size": "STANDARD_H16R",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS-HPC",
-        "sku": "7.1",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json
+++ b/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "docker-openfoam-tcp",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D2_V2",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "OpenLogic",
-        "offer": "CentOS",
-        "sku": "7.3",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json
+++ b/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "remotefs-batchpool",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D2_V2",
        "vm_count": {
            "dedicated": 4
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/TensorFlow-CPU/config/pool.json
+++ b/recipes/TensorFlow-CPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "tensorflow-cpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D1_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/TensorFlow-Distributed/config/cpu/pool.json
+++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "tensorflow-distributed",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D4_V2",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/TensorFlow-Distributed/config/gpu/pool.json
+++ b/recipes/TensorFlow-Distributed/config/gpu/pool.json
@ -1,14 +1,18 @@
 {
    "pool_specification": {
        "id": "tensorflow-distributed",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC12",
        "vm_count": {
            "dedicated": 2
        },
        "inter_node_communication_enabled": true,
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/TensorFlow-GPU/config/pool.json
+++ b/recipes/TensorFlow-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "tensorflow-gpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Torch-CPU/config/pool.json
+++ b/recipes/Torch-CPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "torch",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_D3_V2",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/recipes/Torch-GPU/config/pool.json
+++ b/recipes/Torch-GPU/config/pool.json
@ -1,13 +1,17 @@
 {
    "pool_specification": {
        "id": "torch-gpu",
+        "vm_configuration": {
+            "platform_image": {
+                "publisher": "Canonical",
+                "offer": "UbuntuServer",
+                "sku": "16.04-LTS"
+            }
+        },
        "vm_size": "STANDARD_NC6",
        "vm_count": {
            "dedicated": 1
        },
-        "publisher": "Canonical",
-        "offer": "UbuntuServer",
-        "sku": "16.04-LTS",
        "ssh": {
            "username": "docker"
        },
--- a/scripts/shipyard_nodeprep_customimage.sh
+++ b/scripts/shipyard_nodeprep_customimage.sh
@ -158,6 +158,20 @@ check_for_nvidia() {
    fi
 }

+check_docker_root_dir() {  # $1: distribution id (e.g. "ubuntu"); reports where docker's root dir lives, never exits
+    set +e  # docker/sed may fail; tolerate it so the empty-result branch below can report instead of aborting
+    rootdir=$(docker info | sed -n 's/^[[:space:]]*Docker Root Dir:[[:space:]]*//p')  # label-anchored: survives indented output and paths with spaces
+    set -e
+    echo "$rootdir"
+    if [ -z "$rootdir" ]; then
+        echo "ERROR: could not determine docker graph root"
+    elif [[ "$rootdir" == /mnt* && "$1" == "ubuntu" ]] || [[ "$rootdir" == /mnt/resource* && "$1" != "ubuntu" ]]; then  # /mnt on ubuntu, /mnt/resource elsewhere
+        echo "INFO: docker root is within ephemeral temp disk"
+    else
+        echo "WARNING: docker graph root is on the OS disk. Performance may be impacted."
+    fi
+}
+
 check_for_docker_host_engine() {
    set +e
    docker --version
@ -171,11 +185,14 @@ check_for_docker_host_engine() {
 check_for_glusterfs_on_compute() {
     set +e
     gluster
-    if [ $? -ne 0 ]; then
-        echo "ERROR: gluster server not installed"
+    rc0=$?  # exit status of the bare gluster invocation (presumably the server-side CLI -- confirm)
+    glusterfs -V  # second probe; presumably the client binary, going by the error text below
+    rc1=$?  # exit status of the glusterfs -V probe
+    set -e  # re-enable errexit before evaluating the captured statuses
+    if [ $rc0 -ne 0 ] || [ $rc1 -ne 0 ]; then  # either probe failing is fatal
+        echo "ERROR: gluster server and client not installed"
         exit 1
     fi
-    set -e
 }

 check_for_storage_cluster_software() {
@ -309,11 +326,15 @@ fi

 # one-time setup
 if [ ! -f $nodeprepfinished ] && [ $networkopt -eq 1 ]; then
+    # do not fail script if this function fails
+    set +e
    optimize_tcp_network_settings $DISTRIB_ID $DISTRIB_RELEASE
+    set -e
 fi

 # check for docker host engine
 check_for_docker_host_engine
+check_docker_root_dir $DISTRIB_ID

 # TODO warn if graph is on os disk