From a41713c5eec86d9e22a1cfd8b6952ad9daba344d Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 6 Jun 2017 09:31:32 -0700 Subject: [PATCH] Add custom image guide - Update recipes for vm_configuration - Fix some issues with platform pools with new changes --- README.md | 1 + convoy/fleet.py | 19 +-- convoy/settings.py | 5 +- docs/63-batch-shipyard-custom-images.md | 115 ++++++++++++++++++ docs/95-low-priority-considerations.md | 6 +- docs/97-faq.md | 2 + docs/README.md | 1 + .../config/pool.json | 12 +- .../config/multinode/pool.json | 10 +- .../config/singlenode/pool.json | 10 +- .../config/multinode-multigpu/pool.json | 10 +- .../config/singlenode-multigpu/pool.json | 10 +- .../config/singlenode-singlegpu/pool.json | 10 +- recipes/Caffe-CPU/config/pool.json | 10 +- recipes/Caffe-GPU/config/pool.json | 10 +- recipes/Chainer-CPU/config/pool.json | 10 +- recipes/Chainer-GPU/config/pool.json | 10 +- recipes/FFmpeg-GPU/config/pool.json | 10 +- .../HPCG-Infiniband-IntelMPI/config/pool.json | 12 +- .../config/pool.json | 12 +- recipes/Keras+Theano-CPU/config/Dockerfile | 37 ------ recipes/Keras+Theano-CPU/config/pool.json | 10 +- recipes/Keras+Theano-CPU/docker/Dockerfile | 1 + recipes/Keras+Theano-GPU/config/Dockerfile | 37 ------ recipes/Keras+Theano-GPU/config/pool.json | 10 +- recipes/MXNet-CPU/config/multinode/pool.json | 10 +- recipes/MXNet-CPU/config/singlenode/pool.json | 11 +- recipes/MXNet-GPU/config/multinode/pool.json | 11 +- recipes/MXNet-GPU/config/singlenode/pool.json | 10 +- recipes/NAMD-GPU/config/pool.json | 10 +- .../NAMD-Infiniband-IntelMPI/config/pool.json | 12 +- recipes/NAMD-TCP/config/pool.json | 10 +- .../config/pool.json | 12 +- recipes/OpenFOAM-TCP-OpenMPI/config/pool.json | 10 +- .../config/pool.json | 10 +- recipes/TensorFlow-CPU/config/pool.json | 10 +- .../config/cpu/pool.json | 10 +- .../config/gpu/pool.json | 10 +- recipes/TensorFlow-GPU/config/pool.json | 10 +- recipes/Torch-CPU/config/pool.json | 10 +- 
recipes/Torch-GPU/config/pool.json | 10 +- scripts/shipyard_nodeprep_customimage.sh | 27 +++- 42 files changed, 386 insertions(+), 187 deletions(-) create mode 100644 docs/63-batch-shipyard-custom-images.md delete mode 100644 recipes/Keras+Theano-CPU/config/Dockerfile delete mode 100644 recipes/Keras+Theano-GPU/config/Dockerfile diff --git a/README.md b/README.md index 987a22a..b52d542 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ tunneling to Docker Hosts on compute nodes [Azure KeyVault](https://azure.microsoft.com/en-us/services/key-vault/) * Support for execution on an [Azure Function App environment](https://github.com/Azure/batch-shipyard/blob/master/docs/60-batch-shipyard-site-extension.md) +* Support for [custom host images](https://github.com/Azure/batch-shipyard/blob/master/docs/63-batch-shipyard-custom-images.md) ## Installation Installation is typically an easy two-step process. The CLI is also available diff --git a/convoy/fleet.py b/convoy/fleet.py index 756737d..98c79f8 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -880,9 +880,11 @@ def _add_pool( for image_ref in sorted( nas.verified_image_references, key=lambda item: item.sku) if image_ref.publisher.lower() == - pool_settings.publisher.lower() and - image_ref.offer.lower() == pool_settings.offer.lower() and - image_ref.sku.lower() == pool_settings.sku.lower() + pool_settings.vm_configuration.publisher.lower() and + image_ref.offer.lower() == + pool_settings.vm_configuration.offer.lower() and + image_ref.sku.lower() == + pool_settings.vm_configuration.sku.lower() ] try: sku_to_use, image_ref_to_use = skus_to_use[-1] @@ -912,11 +914,11 @@ def _add_pool( sc_args) else '', n=' -n' if settings.can_tune_tcp( pool_settings.vm_size) else '', - o=' -o {}'.format(pool_settings.offer), + o=' -o {}'.format(pool_settings.vm_configuration.offer), p=' -p {}'.format(bs.storage_entity_prefix) if bs.storage_entity_prefix else '', r=' -r {}'.format(preg.container) if preg.container else '', - s=' 
-s {}'.format(pool_settings.sku), + s=' -s {}'.format(pool_settings.vm_configuration.sku), t=' -t {}'.format(torrentflags), v=' -v {}'.format(__version__), w=' -w' if pool_settings.ssh.hpn_server_swap else '', @@ -1548,13 +1550,15 @@ def _adjust_settings_for_pool_creation(config): raise ValueError( ('Unsupported Docker Host VM Config, publisher={} offer={} ' 'sku={} vm_size={}').format(publisher, offer, sku, pool.vm_size)) + # compute total vm count + pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority # ensure enough vhds for custom image pools if util.is_not_empty(node_agent): vhds = len(pool.vm_configuration.image_uris) if node_agent == 'batch.node.windows amd64': - vhds_req = int(math.ceil(pool.vm_count / 20)) + vhds_req = int(math.ceil(pool_total_vm_count / 20)) else: - vhds_req = int(math.ceil(pool.vm_count / 40)) + vhds_req = int(math.ceil(pool_total_vm_count / 40)) if vhds_req > vhds: raise ValueError( ('insufficient number of VHDs ({}) supplied for the number ' @@ -1569,7 +1573,6 @@ def _adjust_settings_for_pool_creation(config): 'VM config, publisher={} offer={} sku={}').format( publisher, offer, sku)) # adjust inter node comm setting - pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority if pool_total_vm_count < 1: raise ValueError('invalid total vm_count: {}'.format( pool_total_vm_count)) diff --git a/convoy/settings.py b/convoy/settings.py index 29d0b2f..60cc9b3 100644 --- a/convoy/settings.py +++ b/convoy/settings.py @@ -2164,8 +2164,9 @@ def task_settings(cloud_pool, config, poolconf, jobspec, conf, missing_images): pool_id = cloud_pool.id vm_size = cloud_pool.vm_size.lower() inter_node_comm = cloud_pool.enable_inter_node_communication - is_custom_image = util.is_none_or_empty( - cloud_pool.virtual_machine_configuration.os_disk) + is_custom_image = ( + cloud_pool.virtual_machine_configuration.os_disk is not None + ) if is_custom_image: publisher = None offer = None diff --git 
a/docs/63-batch-shipyard-custom-images.md b/docs/63-batch-shipyard-custom-images.md new file mode 100644 index 0000000..fc3798e --- /dev/null +++ b/docs/63-batch-shipyard-custom-images.md @@ -0,0 +1,115 @@ +# Custom Images with Batch Shipyard +The focus of this article is to explain how to provision a custom image (VHD) +and then deploy it with Batch Shipyard as the VM image to use for your +compute node hosts. + +## Background: Azure Batch, Azure Storage and Custom Images +Azure Batch allows provisioning compute nodes with custom images (VHDs) with +User Subscription Batch accounts. This allows users to customize the +compute node with software, settings, etc. that fit their use case. With +containerization, this requirement is weakened but some users may still +want to customize the host compute node environment with particular +versions of software such as the Docker Host engine or even embed the GPU +driver for potential faster provisioning times. + +Azure Storage is used to host these custom image VHDs. Currently, there are +two sources for creating virtual machines in Azure, which are page blob +VHDs and managed disks. Currently, Azure Batch does not support managed +disks, so you will need to create page blobs with your VHD image. + +Due to Storage account throttling limits, you must limit the number of +compute nodes served from a single storage account (and thus VHD). For +the maximum performance, you should limit one VHD for every 40 VMs for Linux +(or 20 VMs for Windows) and these VHDs should be on separate storage accounts +within the same subscription in the same region as your Batch account. +You can use [blobxfer](https://github.com/Azure/blobxfer) or +[AzCopy](https://azure.microsoft.com/en-us/documentation/articles/storage-use-azcopy/) +to copy your VHD images. + +## Provisioning a Custom Image +You will need to ensure that your custom image is sufficiently prepared +before using it as a source VHD for Batch Shipyard. 
The following +sub-section will detail the reasons and requisites. + +### Batch Shipyard Node Preparation and Custom Images +For non-custom images (i.e., platform images or Marketplace images), Batch +Shipyard takes care of preparing the compute node with the necessary +software in order for tasks to run with Batch Shipyard. + +Because custom images can muddy the assumptions with what is available or +not in the operating system, Batch Shipyard requires that the user prepare +the custom image with the necessary software and only attempts to modify +items that are needed for functionality. Software that is required is +checked during compute node preparation. + +### Base Required Software +#### Docker Host Engine +The [Docker](https://docker.com) host engine must be installed and must +be invocable as root with default path and permissions. The service must +be running upon boot. The Docker socket (`/var/run/docker.sock`) must +be available (it is available by default). + +#### SSH Server +An SSH server should be installed and operational on port 22. You can +limit inbound connections through the Batch service deployed NSG on the +virtual network or network interface (and/or through the software firewall +on the host). + +#### GPU-enabled Compute Nodes +In order to utilize the GPUs available on compute nodes that have them +(e.g., N-series VMs), the NVIDIA driver must be installed and loaded upon +boot. + +Additionally, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) +must be installed and the service must be running upon boot. + +#### Infiniband/RDMA-enabled Compute Nodes +The host VM Infiniband/RDMA stack must be enabled with the proper drivers +and the required user-land software for Infiniband installed. It is best to +base a custom image off of the existing Azure platform images that support +Infiniband/RDMA. 
+ +#### Storage Cluster Auto-Linking and Mounting +If mounting a storage cluster, the required NFSv4 or GlusterFS client tooling +must be installed and invocable such that the auto-link mount functionality +is operable. Both clients need not be installed unless you are mounting +both types of storage clusters. + +#### GlusterFS On Compute +If a GlusterFS on compute shared data volume is required, then GlusterFS +server and client tooling must be installed and invocable so the shared +data volume can be created amongst the compute nodes. + +### Installed/Configured Software +#### Encryption Certificates and Credential Decryption +If employing credential encryption, Batch Shipyard will exercise the necessary +logic to decrypt any encrypted field if credential encryption is enabled. +Properties in the global configuration should be enabled as per requirements +as if deploying a non-Custom Image-based compute node. + +#### Batch Shipyard Docker Images +Batch Shipyard Docker images required for functionality on the compute node +will be automatically installed. + +#### Azure File Docker Volume Driver +Batch Shipyard will install and configure the Azure File Docker Volume +Driver for any Azure File shared data volumes that are specified. + +### Packer Samples +The [contrib](../contrib) area of the repository contains example `packer` +scripts to create a custom image from an existing Marketplace platform image. + +## Allocating a Pool with a Custom Image +When allocating a compute pool with a custom image, you must ensure the +following: + +0. You have a User Subscription Batch account +1. Custom image VHD is in your storage account as a page blob +2. The storage account is in the same subscription and region as your + *User Subscription* Batch account +3. You have sufficiently replicated the custom image VHD across enough + storage accounts to support your compute pool +4. You have URIs for all of these custom image VHDs. 
These URIs should not + include SAS information of any kind. They should be "bare" URLs. +5. Your pool configuration file has the proper `vm_configuration` settings + for `custom_image` diff --git a/docs/95-low-priority-considerations.md b/docs/95-low-priority-considerations.md index e3fe6c8..fea54ec 100644 --- a/docs/95-low-priority-considerations.md +++ b/docs/95-low-priority-considerations.md @@ -1,8 +1,12 @@ # Low Priority Compute Node Considerations Please read the following carefully concerning pools allocated with low- -priority compute nodes. +priority compute nodes. You may also want to read the +[Azure Batch Low Priority Compute Node](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) +documentation. ### Pool Allocation and Resizing +* Low priority compute nodes can only be allocated with non-User Subscription +Batch accounts. * Pool and compute node allocation may take up to the full resize timeout and not reach full allocation with low priority if a low priority node is pre-empted and the target number of low priority nodes cannot be reached. diff --git a/docs/97-faq.md b/docs/97-faq.md index 9b016d2..f15fbad 100644 --- a/docs/97-faq.md +++ b/docs/97-faq.md @@ -49,6 +49,8 @@ factors that Batch Shipyard has no control over. regarding your request. Pull requests are always welcome! * How do I contribute a recipe? * Please see this [guide](98-contributing-recipes.md). +* Does Batch Shipyard support Linux custom images? + * Yes, please see [the guide](63-batch-shipyard-custom-images.md). * Does Batch Shipyard support Windows Server Containers? * Not at this time, we are tracking the issue [here](https://github.com/Azure/batch-shipyard/issues/7). diff --git a/docs/README.md b/docs/README.md index 32be245..0ff9343 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,6 +13,7 @@ and effectively running your batch-style Docker workloads on Azure Batch. * [FS Configuration](15-batch-shipyard-configuration-fs.md) 5. 
[Usage](20-batch-shipyard-usage.md) 6. [Azure Functions and Batch Shipyard](60-batch-shipyard-site-extension.md) +7. [Custom Image for Host Compute Nodes](63-batch-shipyard-custom-images.md) 7. [Remote Filesystems](65-batch-shipyard-remote-fs.md) 8. [Data Movement](70-batch-shipyard-data-movement.md) 9. [Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md) diff --git a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json index aa601c0..2c55d77 100644 --- a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json +++ b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-cntk-rdma", - "vm_size": "STANDARD_A9", + "vm_configuration": { + "platform_image": { + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + "sku": "7.1" + } + }, + "vm_size": "STANDARD_H16R", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json index 79af9cf..da04f13 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json +++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "cntk-cpu-multinode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 3 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json index 4f76bd4..710272a 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json +++ 
b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "cntk-cpu-singlenode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json index 037d2c2..cf66c74 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "cntk-multinode-multigpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC24", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json index 9cd8363..d2544dd 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "cntk-singlenode-multigpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC24", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json 
b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json index e19c29e..d5c5969 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "cntk-singlenode-singlegpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Caffe-CPU/config/pool.json b/recipes/Caffe-CPU/config/pool.json index 76e8ed4..4467131 100644 --- a/recipes/Caffe-CPU/config/pool.json +++ b/recipes/Caffe-CPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "caffe-cpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Caffe-GPU/config/pool.json b/recipes/Caffe-GPU/config/pool.json index e47f1f7..e73b1f0 100644 --- a/recipes/Caffe-GPU/config/pool.json +++ b/recipes/Caffe-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "caffe-gpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Chainer-CPU/config/pool.json b/recipes/Chainer-CPU/config/pool.json index 31901d6..664f896 100644 --- a/recipes/Chainer-CPU/config/pool.json +++ b/recipes/Chainer-CPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": 
"chainer-cpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Chainer-GPU/config/pool.json b/recipes/Chainer-GPU/config/pool.json index 46025c7..7ddeee4 100644 --- a/recipes/Chainer-GPU/config/pool.json +++ b/recipes/Chainer-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "chainer-gpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/FFmpeg-GPU/config/pool.json b/recipes/FFmpeg-GPU/config/pool.json index d4de96a..2ea3f29 100644 --- a/recipes/FFmpeg-GPU/config/pool.json +++ b/recipes/FFmpeg-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "dockerffmpeg", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NV6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/HPCG-Infiniband-IntelMPI/config/pool.json b/recipes/HPCG-Infiniband-IntelMPI/config/pool.json index 2cbcc81..e4efae4 100644 --- a/recipes/HPCG-Infiniband-IntelMPI/config/pool.json +++ b/recipes/HPCG-Infiniband-IntelMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-hpcg", - "vm_size": "STANDARD_A9", + "vm_configuration": { + "platform_image": { + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + "sku": "7.1" + } + }, + "vm_size": "STANDARD_H16R", "vm_count": { "dedicated": 2 }, 
"inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", "ssh": { "username": "docker" }, diff --git a/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json b/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json index 29c0568..a4b226e 100644 --- a/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json +++ b/recipes/HPLinpack-Infiniband-IntelMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-linpack", - "vm_size": "STANDARD_A9", + "vm_configuration": { + "platform_image": { + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + "sku": "7.1" + } + }, + "vm_size": "STANDARD_H16R", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", "ssh": { "username": "docker" }, diff --git a/recipes/Keras+Theano-CPU/config/Dockerfile b/recipes/Keras+Theano-CPU/config/Dockerfile deleted file mode 100644 index ffe18a2..0000000 --- a/recipes/Keras+Theano-CPU/config/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch - -FROM ubuntu:14.04 -MAINTAINER Fred Park - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - gfortran \ - git \ - wget \ - curl \ - ca-certificates \ - libhdf5-dev \ - liblapack-dev \ - libopenblas-dev \ - python-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# upgrade pip and install dependencies -RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \ - pip install --upgrade --no-cache-dir setuptools wheel six && \ - pip install --upgrade --no-cache-dir pyyaml nose h5py && \ - pip install --upgrade --no-cache-dir numpy && \ - pip install --upgrade --no-cache-dir scipy - -# install theano and keras -RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \ - git clone https://github.com/fchollet/keras.git && \ - cd 
keras && \ - python setup.py install - -# set keras backend to theano -ENV KERAS_BACKEND=theano - -# copy in default theanorc file -COPY theanorc /root/.theanorc diff --git a/recipes/Keras+Theano-CPU/config/pool.json b/recipes/Keras+Theano-CPU/config/pool.json index 53c3875..c6055d7 100644 --- a/recipes/Keras+Theano-CPU/config/pool.json +++ b/recipes/Keras+Theano-CPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "keras-cpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Keras+Theano-CPU/docker/Dockerfile b/recipes/Keras+Theano-CPU/docker/Dockerfile index c67e67d..ffe18a2 100644 --- a/recipes/Keras+Theano-CPU/docker/Dockerfile +++ b/recipes/Keras+Theano-CPU/docker/Dockerfile @@ -1,6 +1,7 @@ # Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch FROM ubuntu:14.04 +MAINTAINER Fred Park RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ diff --git a/recipes/Keras+Theano-GPU/config/Dockerfile b/recipes/Keras+Theano-GPU/config/Dockerfile deleted file mode 100644 index ffe18a2..0000000 --- a/recipes/Keras+Theano-GPU/config/Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch - -FROM ubuntu:14.04 -MAINTAINER Fred Park - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - gfortran \ - git \ - wget \ - curl \ - ca-certificates \ - libhdf5-dev \ - liblapack-dev \ - libopenblas-dev \ - python-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# upgrade pip and install dependencies -RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \ - pip install --upgrade --no-cache-dir 
setuptools wheel six && \ - pip install --upgrade --no-cache-dir pyyaml nose h5py && \ - pip install --upgrade --no-cache-dir numpy && \ - pip install --upgrade --no-cache-dir scipy - -# install theano and keras -RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \ - git clone https://github.com/fchollet/keras.git && \ - cd keras && \ - python setup.py install - -# set keras backend to theano -ENV KERAS_BACKEND=theano - -# copy in default theanorc file -COPY theanorc /root/.theanorc diff --git a/recipes/Keras+Theano-GPU/config/pool.json b/recipes/Keras+Theano-GPU/config/pool.json index 3bc95c8..0fe7fa5 100644 --- a/recipes/Keras+Theano-GPU/config/pool.json +++ b/recipes/Keras+Theano-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "keras-gpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-CPU/config/multinode/pool.json b/recipes/MXNet-CPU/config/multinode/pool.json index 6ba9177..95cb6d6 100644 --- a/recipes/MXNet-CPU/config/multinode/pool.json +++ b/recipes/MXNet-CPU/config/multinode/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "mxnet-cpu-multinode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D4_V2", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-CPU/config/singlenode/pool.json b/recipes/MXNet-CPU/config/singlenode/pool.json index 9365d1b..ae0d543 100644 --- a/recipes/MXNet-CPU/config/singlenode/pool.json +++ b/recipes/MXNet-CPU/config/singlenode/pool.json 
@@ -1,14 +1,17 @@ { "pool_specification": { "id": "mxnet-cpu-singlenode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, - "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-GPU/config/multinode/pool.json b/recipes/MXNet-GPU/config/multinode/pool.json index 278e4ec..7e55c7e 100644 --- a/recipes/MXNet-GPU/config/multinode/pool.json +++ b/recipes/MXNet-GPU/config/multinode/pool.json @@ -1,13 +1,18 @@ { "pool_specification": { "id": "mxnet-multinode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC24", "vm_count": { "dedicated": 2 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", + "inter_node_communication_enabled": true, "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-GPU/config/singlenode/pool.json b/recipes/MXNet-GPU/config/singlenode/pool.json index c2acdc4..5471752 100644 --- a/recipes/MXNet-GPU/config/singlenode/pool.json +++ b/recipes/MXNet-GPU/config/singlenode/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "mxnet-singlenode", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC24", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/NAMD-GPU/config/pool.json b/recipes/NAMD-GPU/config/pool.json index d067a9a..463237e 100644 --- a/recipes/NAMD-GPU/config/pool.json +++ b/recipes/NAMD-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "namd-multigpu", + "vm_configuration": { + "platform_image": { + 
"publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC12", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/NAMD-Infiniband-IntelMPI/config/pool.json b/recipes/NAMD-Infiniband-IntelMPI/config/pool.json index 714b47b..7075a45 100644 --- a/recipes/NAMD-Infiniband-IntelMPI/config/pool.json +++ b/recipes/NAMD-Infiniband-IntelMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-namd-rdma", - "vm_size": "STANDARD_A9", + "vm_configuration": { + "platform_image": { + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + "sku": "7.1" + } + }, + "vm_size": "STANDARD_H16R", "vm_count": { "dedicated": 4 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", "ssh": { "username": "docker" }, diff --git a/recipes/NAMD-TCP/config/pool.json b/recipes/NAMD-TCP/config/pool.json index f5047c8..77e42e1 100644 --- a/recipes/NAMD-TCP/config/pool.json +++ b/recipes/NAMD-TCP/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "namd-tcp", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D3_V2", "vm_count": { "dedicated": 4 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS", - "sku": "7.3", "ssh": { "username": "docker" }, diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json b/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json index 2198cae..f64c858 100644 --- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json +++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-openfoam-rdma", - "vm_size": "STANDARD_A9", + "vm_configuration": { + "platform_image": { + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + 
"sku": "7.1" + } + }, + "vm_size": "STANDARD_H16R", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", "ssh": { "username": "docker" }, diff --git a/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json b/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json index 8d671e0..7c69937 100644 --- a/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json +++ b/recipes/OpenFOAM-TCP-OpenMPI/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "docker-openfoam-tcp", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D2_V2", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS", - "sku": "7.3", "ssh": { "username": "docker" }, diff --git a/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json b/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json index bfe6a1c..bc17b50 100644 --- a/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json +++ b/recipes/RemoteFS-GlusterFS+BatchPool/config/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "remotefs-batchpool", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D2_V2", "vm_count": { "dedicated": 4 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-CPU/config/pool.json b/recipes/TensorFlow-CPU/config/pool.json index 9d87531..f32e990 100644 --- a/recipes/TensorFlow-CPU/config/pool.json +++ b/recipes/TensorFlow-CPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "tensorflow-cpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": 
"STANDARD_D1_V2", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-Distributed/config/cpu/pool.json b/recipes/TensorFlow-Distributed/config/cpu/pool.json index 35cdb68..90987f0 100644 --- a/recipes/TensorFlow-Distributed/config/cpu/pool.json +++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "tensorflow-distributed", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D4_V2", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-Distributed/config/gpu/pool.json b/recipes/TensorFlow-Distributed/config/gpu/pool.json index ca88f07..da19e4a 100644 --- a/recipes/TensorFlow-Distributed/config/gpu/pool.json +++ b/recipes/TensorFlow-Distributed/config/gpu/pool.json @@ -1,14 +1,18 @@ { "pool_specification": { "id": "tensorflow-distributed", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC12", "vm_count": { "dedicated": 2 }, "inter_node_communication_enabled": true, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-GPU/config/pool.json b/recipes/TensorFlow-GPU/config/pool.json index e4426bb..7f473b9 100644 --- a/recipes/TensorFlow-GPU/config/pool.json +++ b/recipes/TensorFlow-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "tensorflow-gpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 
1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Torch-CPU/config/pool.json b/recipes/Torch-CPU/config/pool.json index 57712d0..6956077 100644 --- a/recipes/Torch-CPU/config/pool.json +++ b/recipes/Torch-CPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "torch", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_D3_V2", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Torch-GPU/config/pool.json b/recipes/Torch-GPU/config/pool.json index d4e6afe..ccef359 100644 --- a/recipes/Torch-GPU/config/pool.json +++ b/recipes/Torch-GPU/config/pool.json @@ -1,13 +1,17 @@ { "pool_specification": { "id": "torch-gpu", + "vm_configuration": { + "platform_image": { + "publisher": "Canonical", + "offer": "UbuntuServer", + "sku": "16.04-LTS" + } + }, "vm_size": "STANDARD_NC6", "vm_count": { "dedicated": 1 }, - "publisher": "Canonical", - "offer": "UbuntuServer", - "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/scripts/shipyard_nodeprep_customimage.sh b/scripts/shipyard_nodeprep_customimage.sh index dcf2bd5..4316e64 100755 --- a/scripts/shipyard_nodeprep_customimage.sh +++ b/scripts/shipyard_nodeprep_customimage.sh @@ -158,6 +158,20 @@ check_for_nvidia() { fi } +check_docker_root_dir() { + set +e + rootdir=$(docker info | grep "Docker Root Dir" | cut -d' ' -f 4) + set -e + echo "$rootdir" + if [ -z "$rootdir" ]; then + echo "ERROR: could not determine docker graph root" + elif [[ "$rootdir" == /mnt* && "$1" == "ubuntu" ]] || [[ "$rootdir" == /mnt/resource* && "$1" != "ubuntu" ]]; then + echo "INFO: docker root is within ephemeral temp disk" + else + echo "WARNING: docker graph root is on the OS disk. Performance may be impacted." 
+ fi +} + check_for_docker_host_engine() { set +e docker --version @@ -171,11 +185,14 @@ check_for_docker_host_engine() { check_for_glusterfs_on_compute() { set +e gluster - if [ $? -ne 0 ]; then - echo "ERROR: gluster server not installed" + rc0=$? + glusterfs -V + rc1=$? + set -e + if [ $rc0 -ne 0 ] || [ $rc1 -ne 0 ]; then + echo "ERROR: gluster server and client not installed" exit 1 fi - set -e } check_for_storage_cluster_software() { @@ -309,11 +326,15 @@ fi # one-time setup if [ ! -f $nodeprepfinished ] && [ $networkopt -eq 1 ]; then + # do not fail script if this function fails + set +e optimize_tcp_network_settings $DISTRIB_ID $DISTRIB_RELEASE + set -e fi # check for docker host engine check_for_docker_host_engine +check_docker_root_dir $DISTRIB_ID # TODO warn if graph is on os disk