Add custom image guide
- Update recipes for vm_configuration - Fix some issues with platform pools with new changes
This commit is contained in:
Родитель
8397b411c5
Коммит
a41713c5ee
|
@ -63,6 +63,7 @@ tunneling to Docker Hosts on compute nodes
|
|||
[Azure KeyVault](https://azure.microsoft.com/en-us/services/key-vault/)
|
||||
* Support for execution on an
|
||||
[Azure Function App environment](https://github.com/Azure/batch-shipyard/blob/master/docs/60-batch-shipyard-site-extension.md)
|
||||
* Support for [custom host images](https://github.com/Azure/batch-shipyard/blob/master/docs/63-batch-shipyard-custom-images.md)
|
||||
|
||||
## Installation
|
||||
Installation is typically an easy two-step process. The CLI is also available
|
||||
|
|
|
@ -880,9 +880,11 @@ def _add_pool(
|
|||
for image_ref in sorted(
|
||||
nas.verified_image_references, key=lambda item: item.sku)
|
||||
if image_ref.publisher.lower() ==
|
||||
pool_settings.publisher.lower() and
|
||||
image_ref.offer.lower() == pool_settings.offer.lower() and
|
||||
image_ref.sku.lower() == pool_settings.sku.lower()
|
||||
pool_settings.vm_configuration.publisher.lower() and
|
||||
image_ref.offer.lower() ==
|
||||
pool_settings.vm_configuration.offer.lower() and
|
||||
image_ref.sku.lower() ==
|
||||
pool_settings.vm_configuration.sku.lower()
|
||||
]
|
||||
try:
|
||||
sku_to_use, image_ref_to_use = skus_to_use[-1]
|
||||
|
@ -912,11 +914,11 @@ def _add_pool(
|
|||
sc_args) else '',
|
||||
n=' -n' if settings.can_tune_tcp(
|
||||
pool_settings.vm_size) else '',
|
||||
o=' -o {}'.format(pool_settings.offer),
|
||||
o=' -o {}'.format(pool_settings.vm_configuration.offer),
|
||||
p=' -p {}'.format(bs.storage_entity_prefix)
|
||||
if bs.storage_entity_prefix else '',
|
||||
r=' -r {}'.format(preg.container) if preg.container else '',
|
||||
s=' -s {}'.format(pool_settings.sku),
|
||||
s=' -s {}'.format(pool_settings.vm_configuration.sku),
|
||||
t=' -t {}'.format(torrentflags),
|
||||
v=' -v {}'.format(__version__),
|
||||
w=' -w' if pool_settings.ssh.hpn_server_swap else '',
|
||||
|
@ -1548,13 +1550,15 @@ def _adjust_settings_for_pool_creation(config):
|
|||
raise ValueError(
|
||||
('Unsupported Docker Host VM Config, publisher={} offer={} '
|
||||
'sku={} vm_size={}').format(publisher, offer, sku, pool.vm_size))
|
||||
# compute total vm count
|
||||
pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority
|
||||
# ensure enough vhds for custom image pools
|
||||
if util.is_not_empty(node_agent):
|
||||
vhds = len(pool.vm_configuration.image_uris)
|
||||
if node_agent == 'batch.node.windows amd64':
|
||||
vhds_req = int(math.ceil(pool.vm_count / 20))
|
||||
vhds_req = int(math.ceil(pool_total_vm_count / 20))
|
||||
else:
|
||||
vhds_req = int(math.ceil(pool.vm_count / 40))
|
||||
vhds_req = int(math.ceil(pool_total_vm_count / 40))
|
||||
if vhds_req > vhds:
|
||||
raise ValueError(
|
||||
('insufficient number of VHDs ({}) supplied for the number '
|
||||
|
@ -1569,7 +1573,6 @@ def _adjust_settings_for_pool_creation(config):
|
|||
'VM config, publisher={} offer={} sku={}').format(
|
||||
publisher, offer, sku))
|
||||
# adjust inter node comm setting
|
||||
pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority
|
||||
if pool_total_vm_count < 1:
|
||||
raise ValueError('invalid total vm_count: {}'.format(
|
||||
pool_total_vm_count))
|
||||
|
|
|
@ -2164,8 +2164,9 @@ def task_settings(cloud_pool, config, poolconf, jobspec, conf, missing_images):
|
|||
pool_id = cloud_pool.id
|
||||
vm_size = cloud_pool.vm_size.lower()
|
||||
inter_node_comm = cloud_pool.enable_inter_node_communication
|
||||
is_custom_image = util.is_none_or_empty(
|
||||
cloud_pool.virtual_machine_configuration.os_disk)
|
||||
is_custom_image = (
|
||||
cloud_pool.virtual_machine_configuration.os_disk is not None
|
||||
)
|
||||
if is_custom_image:
|
||||
publisher = None
|
||||
offer = None
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
# Custom Images with Batch Shipyard
|
||||
The focus of this article is to explain how to provision a custom image (VHD)
|
||||
and then deploy it with Batch Shipyard as the VM image to use for your
|
||||
compute node hosts.
|
||||
|
||||
## Background: Azure Batch, Azure Storage and Custom Images
|
||||
Azure Batch allows provisioning compute nodes with custom images (VHDs) with
|
||||
User Subscription Batch accounts. This allows users to customize the
|
||||
compute node with software, settings, etc. that fit their use case. With
|
||||
containerization, this requirement is weakened but some users may still
|
||||
want to customize the host compute node environment with particular
|
||||
versions of software such as the Docker Host engine or even embed the GPU
|
||||
driver for potentially faster provisioning times.
|
||||
|
||||
Azure Storage is used to host these custom image VHDs. Currently, there are
|
||||
two sources for creating virtual machines in Azure: page blob
|
||||
VHDs and managed disks. Currently, Azure Batch does not support managed
|
||||
disks, so you will need to create page blobs with your VHD image.
|
||||
|
||||
Due to Storage account throttling limits, you must limit the number of
|
||||
compute nodes served from a single storage account (and thus VHD). For
|
||||
maximum performance, you should provide at least one VHD for every 40 VMs for Linux
|
||||
(or 20 VMs for Windows) and these VHDs should be on separate storage accounts
|
||||
within the same subscription in the same region as your Batch account.
|
||||
You can use [blobxfer](https://github.com/Azure/blobxfer) or
|
||||
[AzCopy](https://azure.microsoft.com/en-us/documentation/articles/storage-use-azcopy/)
|
||||
to copy your VHD images.
|
||||
|
||||
## Provisioning a Custom Image
|
||||
You will need to ensure that your custom image is sufficiently prepared
|
||||
before using it as a source VHD for Batch Shipyard. The following
|
||||
sub-section will detail the reasons and requisites.
|
||||
|
||||
### Batch Shipyard Node Preparation and Custom Images
|
||||
For non-custom images (i.e., platform images or Marketplace images), Batch
|
||||
Shipyard takes care of preparing the compute node with the necessary
|
||||
software in order for tasks to run with Batch Shipyard.
|
||||
|
||||
Because custom images can muddy the assumptions about what is available or
|
||||
not in the operating system, Batch Shipyard requires that the user prepare
|
||||
the custom image with the necessary software and only attempts to modify
|
||||
items that are needed for functionality. Software that is required is
|
||||
checked during compute node preparation.
|
||||
|
||||
### Base Required Software
|
||||
#### Docker Host Engine
|
||||
The [Docker](https://docker.com) host engine must be installed and must
|
||||
be invocable as root with default path and permissions. The service must
|
||||
be running upon boot. The Docker socket (`/var/run/docker.sock`) must
|
||||
be available (it is available by default).
|
||||
|
||||
#### SSH Server
|
||||
An SSH server should be installed and operational on port 22. You can
|
||||
limit inbound connections through the Batch service deployed NSG on the
|
||||
virtual network or network interface (and/or through the software firewall
|
||||
on the host).
|
||||
|
||||
#### GPU-enabled Compute Nodes
|
||||
In order to utilize the GPUs available on compute nodes that have them
|
||||
(e.g., N-series VMs), the NVIDIA driver must be installed and loaded upon
|
||||
boot.
|
||||
|
||||
Additionally, [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
|
||||
must be installed and the service must be running upon boot.
|
||||
|
||||
#### Infiniband/RDMA-enabled Compute Nodes
|
||||
The host VM Infiniband/RDMA stack must be enabled with the proper drivers
|
||||
and the required user-land software for Infiniband installed. It is best to
|
||||
base a custom image off of the existing Azure platform images that support
|
||||
Infiniband/RDMA.
|
||||
|
||||
#### Storage Cluster Auto-Linking and Mounting
|
||||
If mounting a storage cluster, the required NFSv4 or GlusterFS client tooling
|
||||
must be installed and invocable such that the auto-link mount functionality
|
||||
is operable. Both clients need not be installed unless you are mounting
|
||||
both types of storage clusters.
|
||||
|
||||
#### GlusterFS On Compute
|
||||
If a GlusterFS on compute shared data volume is required, then GlusterFS
|
||||
server and client tooling must be installed and invocable so the shared
|
||||
data volume can be created amongst the compute nodes.
|
||||
|
||||
### Installed/Configured Software
|
||||
#### Encryption Certificates and Credential Decryption
|
||||
If employing credential encryption, Batch Shipyard will exercise the necessary
|
||||
logic to decrypt any encrypted fields.
|
||||
Properties in the global configuration should be enabled as per requirements
|
||||
as if deploying a non-Custom Image-based compute node.
|
||||
|
||||
#### Batch Shipyard Docker Images
|
||||
Batch Shipyard Docker images required for functionality on the compute node
|
||||
will be automatically installed.
|
||||
|
||||
#### Azure File Docker Volume Driver
|
||||
Batch Shipyard will install and configure the Azure File Docker Volume
|
||||
Driver for any Azure File shared data volumes that are specified.
|
||||
|
||||
### Packer Samples
|
||||
The [contrib](../contrib) area of the repository contains example `packer`
|
||||
scripts to create a custom image from an existing Marketplace platform image.
|
||||
|
||||
## Allocating a Pool with a Custom Image
|
||||
When allocating a compute pool with a custom image, you must ensure the
|
||||
following:
|
||||
|
||||
0. You have a User Subscription Batch account
|
||||
1. Custom image VHD is in your storage account as a page blob
|
||||
2. The storage account is in the same subscription and region as your
|
||||
*User Subscription* Batch account
|
||||
3. You have sufficiently replicated the custom image VHD across enough
|
||||
storage accounts to support your compute pool
|
||||
4. You have URIs for all of these custom image VHDs. These URIs should not
|
||||
include SAS information of any kind. They should be "bare" URLs.
|
||||
5. Your pool configuration file has the proper `vm_configuration` settings
|
||||
for `custom_image`
|
|
@ -1,8 +1,12 @@
|
|||
# Low Priority Compute Node Considerations
|
||||
Please read the following carefully concerning pools allocated with low-
|
||||
priority compute nodes.
|
||||
priority compute nodes. You may also want to read the
|
||||
[Azure Batch Low Priority Compute Node](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms)
|
||||
documentation.
|
||||
|
||||
### Pool Allocation and Resizing
|
||||
* Low priority compute nodes can only be allocated with non-User Subscription
|
||||
Batch accounts.
|
||||
* Pool and compute node allocation may take up to the full resize timeout
|
||||
and not reach full allocation with low priority if a low priority node is
|
||||
pre-empted and the target number of low priority nodes cannot be reached.
|
||||
|
|
|
@ -49,6 +49,8 @@ factors that Batch Shipyard has no control over.
|
|||
regarding your request. Pull requests are always welcome!
|
||||
* How do I contribute a recipe?
|
||||
* Please see this [guide](98-contributing-recipes.md).
|
||||
* Does Batch Shipyard support Linux custom images?
|
||||
* Yes, please see [the guide](63-batch-shipyard-custom-images.md).
|
||||
* Does Batch Shipyard support Windows Server Containers?
|
||||
* Not at this time, we are tracking the issue
|
||||
[here](https://github.com/Azure/batch-shipyard/issues/7).
|
||||
|
|
|
@ -13,6 +13,7 @@ and effectively running your batch-style Docker workloads on Azure Batch.
|
|||
* [FS Configuration](15-batch-shipyard-configuration-fs.md)
|
||||
5. [Usage](20-batch-shipyard-usage.md)
|
||||
6. [Azure Functions and Batch Shipyard](60-batch-shipyard-site-extension.md)
|
||||
7. [Custom Image for Host Compute Nodes](63-batch-shipyard-custom-images.md)
|
||||
7. [Remote Filesystems](65-batch-shipyard-remote-fs.md)
|
||||
8. [Data Movement](70-batch-shipyard-data-movement.md)
|
||||
9. [Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md)
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-cntk-rdma",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_H16R",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "cntk-cpu-multinode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 3
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "cntk-cpu-singlenode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "cntk-multinode-multigpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC24",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "cntk-singlenode-multigpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC24",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "cntk-singlenode-singlegpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "caffe-cpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "caffe-gpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "chainer-cpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "chainer-gpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "dockerffmpeg",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NV6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-hpcg",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_H16R",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-linpack",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_H16R",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch
|
||||
|
||||
FROM ubuntu:14.04
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gfortran \
|
||||
git \
|
||||
wget \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libhdf5-dev \
|
||||
liblapack-dev \
|
||||
libopenblas-dev \
|
||||
python-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# upgrade pip and install dependencies
|
||||
RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \
|
||||
pip install --upgrade --no-cache-dir setuptools wheel six && \
|
||||
pip install --upgrade --no-cache-dir pyyaml nose h5py && \
|
||||
pip install --upgrade --no-cache-dir numpy && \
|
||||
pip install --upgrade --no-cache-dir scipy
|
||||
|
||||
# install theano and keras
|
||||
RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \
|
||||
git clone https://github.com/fchollet/keras.git && \
|
||||
cd keras && \
|
||||
python setup.py install
|
||||
|
||||
# set keras backend to theano
|
||||
ENV KERAS_BACKEND=theano
|
||||
|
||||
# copy in default theanorc file
|
||||
COPY theanorc /root/.theanorc
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "keras-cpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch
|
||||
|
||||
FROM ubuntu:14.04
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
# Dockerfile for Keras+Theano-CPU for use with Batch Shipyard on Azure Batch
|
||||
|
||||
FROM ubuntu:14.04
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gfortran \
|
||||
git \
|
||||
wget \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libhdf5-dev \
|
||||
liblapack-dev \
|
||||
libopenblas-dev \
|
||||
python-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# upgrade pip and install dependencies
|
||||
RUN curl --silent --show-error --retry 5 https://bootstrap.pypa.io/get-pip.py | python && \
|
||||
pip install --upgrade --no-cache-dir setuptools wheel six && \
|
||||
pip install --upgrade --no-cache-dir pyyaml nose h5py && \
|
||||
pip install --upgrade --no-cache-dir numpy && \
|
||||
pip install --upgrade --no-cache-dir scipy
|
||||
|
||||
# install theano and keras
|
||||
RUN pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git && \
|
||||
git clone https://github.com/fchollet/keras.git && \
|
||||
cd keras && \
|
||||
python setup.py install
|
||||
|
||||
# set keras backend to theano
|
||||
ENV KERAS_BACKEND=theano
|
||||
|
||||
# copy in default theanorc file
|
||||
COPY theanorc /root/.theanorc
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "keras-gpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "mxnet-cpu-multinode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D4_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "mxnet-cpu-singlenode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "mxnet-multinode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC24",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"inter_node_communication_enabled": true,
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "mxnet-singlenode",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC24",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "namd-multigpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC12",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-namd-rdma",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_H16R",
|
||||
"vm_count": {
|
||||
"dedicated": 4
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "namd-tcp",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D3_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 4
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS",
|
||||
"sku": "7.3",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-openfoam-rdma",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_H16R",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "docker-openfoam-tcp",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D2_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS",
|
||||
"sku": "7.3",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "remotefs-batchpool",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D2_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 4
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "tensorflow-cpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D1_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "tensorflow-distributed",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D4_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,14 +1,18 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "tensorflow-distributed",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC12",
|
||||
"vm_count": {
|
||||
"dedicated": 2
|
||||
},
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "tensorflow-gpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "torch",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_D3_V2",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
{
|
||||
"pool_specification": {
|
||||
"id": "torch-gpu",
|
||||
"vm_configuration": {
|
||||
"platform_image": {
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS"
|
||||
}
|
||||
},
|
||||
"vm_size": "STANDARD_NC6",
|
||||
"vm_count": {
|
||||
"dedicated": 1
|
||||
},
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -158,6 +158,20 @@ check_for_nvidia() {
|
|||
fi
|
||||
}
|
||||
|
||||
check_docker_root_dir() {
|
||||
set +e
|
||||
rootdir=$(docker info | grep "Docker Root Dir" | cut -d' ' -f 4)
|
||||
set -e
|
||||
echo "$rootdir"
|
||||
if [ -z "$rootdir" ]; then
|
||||
echo "ERROR: could not determine docker graph root"
|
||||
elif [[ "$rootdir" == /mnt* && "$1" == "ubuntu" ]] || [[ "$rootdir" == /mnt/resource* && "$1" != "ubuntu" ]]; then
|
||||
echo "INFO: docker root is within ephemeral temp disk"
|
||||
else
|
||||
echo "WARNING: docker graph root is on the OS disk. Performance may be impacted."
|
||||
fi
|
||||
}
|
||||
|
||||
check_for_docker_host_engine() {
|
||||
set +e
|
||||
docker --version
|
||||
|
@ -171,11 +185,14 @@ check_for_docker_host_engine() {
|
|||
check_for_glusterfs_on_compute() {
|
||||
set +e
|
||||
gluster
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "ERROR: gluster server not installed"
|
||||
rc0=$?
|
||||
glusterfs -V
|
||||
rc1=$?
|
||||
set -e
|
||||
if [ $rc0 -ne 0 ] || [ $rc1 -ne 0 ]; then
|
||||
echo "ERROR: gluster server and client not installed"
|
||||
exit 1
|
||||
fi
|
||||
set -e
|
||||
}
|
||||
|
||||
check_for_storage_cluster_software() {
|
||||
|
@ -309,11 +326,15 @@ fi
|
|||
|
||||
# one-time setup
|
||||
if [ ! -f $nodeprepfinished ] && [ $networkopt -eq 1 ]; then
|
||||
# do not fail script if this function fails
|
||||
set +e
|
||||
optimize_tcp_network_settings $DISTRIB_ID $DISTRIB_RELEASE
|
||||
set -e
|
||||
fi
|
||||
|
||||
# check for docker host engine
|
||||
check_for_docker_host_engine
|
||||
check_docker_root_dir $DISTRIB_ID
|
||||
|
||||
# TODO warn if graph is on os disk
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче