diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9343168..7e1b321 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,19 +2,34 @@
## [Unreleased]
### Added
+- Support for provisioning storage clusters via the `fs cluster` command
+- Support for provisioning managed disks via the `fs disks` command
- Support for UserSubscription Batch accounts
- Azure Active Directory authentication support for Batch accounts
+- `allow_run_on_missing_image` option for jobs that allows tasks to execute
+under jobs with Docker images that have not been pre-loaded via the
+`global_resources`:`docker_images` setting in config.json. Note that, if
+possible, you should still specify all Docker images that you intend to run
+in the `global_resources`:`docker_images` property of the global
+configuration to minimize the latency between task scheduling and task
+execution.
+- Support for Canonical/UbuntuServer/16.04-LTS. This sku should be used over
+the old 16.04.0-LTS sku due to
+[issue #31](https://github.com/Azure/batch-shipyard/issues/31).
### Changed
- **Breaking Change:** `glusterfs` `volume_driver` for `shared_data_volumes`
should now be named as `glusterfs_on_compute`. This is to distinguish
co-located glusterfs on compute nodes with possible standalone glusterfs
`storage_cluster` remote mounted in the future.
-- Batch account (name) is now an optional property in the credentials config
+- Pool existence is now checked prior to job submission; job submission can
+now proceed without an active pool after confirmation
+- Batch `account` (name) is now an optional property in the credentials config
+- Configuration doc broken up into multiple pages
+- Update all recipes using Canonical/UbuntuServer/16.04.0-LTS to use
+Canonical/UbuntuServer/16.04-LTS instead
+- Precompile python files for Docker images
- All dependencies updated to latest versions
- Update Batch API call compatibility for `azure-batch 2.0.0`
-- Precompile python files for Docker images
-- Configuration doc broken up into multiple pages
## [2.5.4] - 2017-03-08
### Changed
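
As a rough, self-contained illustration of the new `allow_run_on_missing_image` behavior described above, the per-task image check performed in `add_jobs` (see the convoy/batch.py hunk below) reduces to roughly the following sketch; the image list and job spec here are illustrative stand-ins, not the shipped implementation:

```python
# Sketch of the per-task image check behind allow_run_on_missing_image.
# The image list and job spec below are illustrative stand-ins.
global_docker_images = ['busybox', 'redis:3.2.3-alpine']

job_spec = {
    'id': 'dockerjob',
    'allow_run_on_missing_image': True,
    'tasks': [{'id': 'task-1', 'image': 'myorg/notpreloaded'}],
}

allow_missing = job_spec.get('allow_run_on_missing_image', False)
missing_images = []
for task in job_spec['tasks']:
    image = task['image']
    if image not in global_docker_images:
        if allow_missing:
            # the image will be pulled at task execution time instead,
            # which adds scheduling-to-execution latency
            print('warning: image {} not pre-loaded on pool'.format(image))
            missing_images.append(image)
        else:
            raise RuntimeError(
                'job {}: docker image {} is not pre-loaded on pool'.format(
                    job_spec['id'], image))
```
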
diff --git a/config_templates/credentials.json b/config_templates/credentials.json
index f61b95e..4155ede 100644
--- a/config_templates/credentials.json
+++ b/config_templates/credentials.json
@@ -37,10 +37,6 @@
},
"batch": {
"account_service_url": "",
- "account_key": "",
- "account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey",
- "user_subscription": false,
- "resource_group": "",
"aad": {
"endpoint": "https://batch.core.windows.net/",
"directory_id": "",
@@ -54,7 +50,10 @@
"enabled": true,
"filename": ""
}
- }
+ },
+ "resource_group": "",
+ "account_key": "",
+ "account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey"
},
"storage": {
"mystorageaccount": {
diff --git a/config_templates/fs.json b/config_templates/fs.json
index 1987233..4e3236a 100644
--- a/config_templates/fs.json
+++ b/config_templates/fs.json
@@ -18,6 +18,7 @@
"static_public_ip": false,
"virtual_network": {
"name": "",
+ "resource_group": "",
"existing_ok": false,
"address_space": "",
"subnet": {
@@ -28,7 +29,7 @@
"network_security": {
"nfs": ["1.2.3.0/24"],
"ssh": ["*"],
- "custom_inbound": {
+ "custom_inbound_rules": {
"myrule": {
"destination_port_range": "5000-5001",
"source_address_prefix": ["1.2.3.4", "5.6.7.0/24"],
diff --git a/config_templates/jobs.json b/config_templates/jobs.json
index b97e4f0..5fa8823 100644
--- a/config_templates/jobs.json
+++ b/config_templates/jobs.json
@@ -8,6 +8,7 @@
},
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
"max_task_retries": 1,
+ "allow_run_on_missing_image": false,
"input_data": {
"azure_batch": [
{
diff --git a/config_templates/pool.json b/config_templates/pool.json
index b741563..3f544d5 100644
--- a/config_templates/pool.json
+++ b/config_templates/pool.json
@@ -33,6 +33,7 @@
},
"virtual_network": {
"name": "",
+ "resource_group": "",
"create_nonexistant": false,
"address_space": "",
"subnet": {
diff --git a/convoy/batch.py b/convoy/batch.py
index c043516..05499ae 100644
--- a/convoy/batch.py
+++ b/convoy/batch.py
@@ -66,6 +66,27 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
)
+def get_batch_account(batch_mgmt_client, config):
+ # type: (azure.mgmt.batch.BatchManagementClient, dict) ->
+ # azure.mgmt.batch.models.BatchAccount
+ """Get Batch account properties from ARM
+ :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client:
+ batch management client
+ :param dict config: configuration dict
+ :rtype: azure.mgmt.batch.models.BatchAccount
+ :return: Batch account
+ """
+ if batch_mgmt_client is None:
+ raise RuntimeError(
+ 'Batch management client is invalid, please specify management '
+ 'aad credentials')
+ bc = settings.credentials_batch(config)
+ return batch_mgmt_client.batch_account.get(
+ resource_group_name=bc.resource_group,
+ account_name=bc.account,
+ )
+
+
def list_node_agent_skus(batch_client):
# type: (batch.BatchServiceClient) -> None
"""List all node agent skus
@@ -1708,15 +1729,81 @@ def add_jobs(
# get the pool inter-node comm setting
bs = settings.batch_shipyard_settings(config)
pool = settings.pool_settings(config)
- _pool = batch_client.pool.get(pool.id)
- global_resources = []
- for gr in settings.global_resources_docker_images(config):
- global_resources.append(gr)
+ try:
+ cloud_pool = batch_client.pool.get(pool.id)
+ except batchmodels.batch_error.BatchErrorException as ex:
+ if 'The specified pool does not exist.' in ex.message.value:
+ logger.error('{} pool does not exist'.format(pool.id))
+ if util.confirm_action(
+                    config, 'add jobs to nonexistent pool {}'.format(pool.id)):
+ cloud_pool = None
+ else:
+ logger.error(
+ 'not submitting jobs to nonexistant pool {}'.format(
+                    'not submitting jobs to nonexistent pool {}'.format(
+ return
+ else:
+ raise
+ global_resources = settings.global_resources_docker_images(config)
lastjob = None
lasttask = None
for jobspec in settings.job_specifications(config):
- jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format(
- jpfile[0], ' '.join(global_resources))]
+ job_id = settings.job_id(jobspec)
+ # perform checks:
+ # 1. check docker images in task against pre-loaded on pool
+ # 2. if tasks have dependencies, set it if so
+ # 3. if there are multi-instance tasks
+ mi_ac = settings.job_multi_instance_auto_complete(config)
+ multi_instance = False
+ mi_docker_container_name = None
+ reserved_task_id = None
+ uses_task_dependencies = False
+ missing_images = []
+ allow_run_on_missing = settings.job_allow_run_on_missing(jobspec)
+ for task in settings.job_tasks(jobspec):
+ # check if task docker image is set in config.json
+ di = settings.task_docker_image(task)
+ if di not in global_resources:
+ if allow_run_on_missing:
+ logger.warning(
+ ('docker image {} not pre-loaded on pool for a '
+ 'task specified in job {}').format(di, job_id))
+ missing_images.append(di)
+ else:
+ raise RuntimeError(
+                        ('not submitting job {}: docker image {} is not '
+                         'pre-loaded on pool {}').format(job_id, di, pool.id))
+ # do not break, check to ensure ids are set on each task if
+ # task dependencies are set
+ if settings.has_depends_on_task(task):
+ uses_task_dependencies = True
+ if settings.is_multi_instance_task(task):
+ if multi_instance and mi_ac:
+ raise ValueError(
+ 'cannot specify more than one multi-instance task '
+ 'per job with auto completion enabled')
+ multi_instance = True
+ mi_docker_container_name = settings.task_name(task)
+ if util.is_none_or_empty(mi_docker_container_name):
+ _id = settings.task_id(task)
+ if util.is_none_or_empty(_id):
+ reserved_task_id = _generate_next_generic_task_id(
+ batch_client, job_id)
+ settings.set_task_id(task, reserved_task_id)
+ _id = '{}-{}'.format(job_id, reserved_task_id)
+ settings.set_task_name(task, _id)
+ mi_docker_container_name = settings.task_name(task)
+ del _id
+ # construct job prep
+ if util.is_not_empty(global_resources):
+ if len(missing_images) > 0 and allow_run_on_missing:
+ gr = list(set(global_resources) - set(missing_images))
+ else:
+ gr = global_resources
+ jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format(
+ jpfile[0], ' '.join(gr))]
+ else:
+ jpcmd = []
# digest any input_data
addlcmds = data.process_input_data(config, bxfile, jobspec)
if addlcmds is not None:
@@ -1741,39 +1828,10 @@ def add_jobs(
user_identity=_RUN_ELEVATED,
rerun_on_node_reboot_after_success=False,
),
- uses_task_dependencies=False,
+ uses_task_dependencies=uses_task_dependencies,
constraints=job_constraints,
)
lastjob = job.id
- # perform checks:
- # 1. if tasks have dependencies, set it if so
- # 2. if there are multi-instance tasks
- mi_ac = settings.job_multi_instance_auto_complete(config)
- multi_instance = False
- mi_docker_container_name = None
- reserved_task_id = None
- for task in settings.job_tasks(jobspec):
- # do not break, check to ensure ids are set on each task if
- # task dependencies are set
- if settings.has_depends_on_task(task):
- job.uses_task_dependencies = True
- if settings.is_multi_instance_task(task):
- if multi_instance and mi_ac:
- raise ValueError(
- 'cannot specify more than one multi-instance task '
- 'per job with auto completion enabled')
- multi_instance = True
- mi_docker_container_name = settings.task_name(task)
- if util.is_none_or_empty(mi_docker_container_name):
- _id = settings.task_id(task)
- if util.is_none_or_empty(_id):
- reserved_task_id = _generate_next_generic_task_id(
- batch_client, job.id)
- settings.set_task_id(task, reserved_task_id)
- _id = '{}-{}'.format(job.id, reserved_task_id)
- settings.set_task_name(task, _id)
- mi_docker_container_name = settings.task_name(task)
- del _id
# add multi-instance settings
set_terminate_on_all_tasks_complete = False
if multi_instance and mi_ac:
@@ -1784,7 +1842,7 @@ def add_jobs(
'docker rm -v {}'.format(mi_docker_container_name)]),
user_identity=_RUN_ELEVATED,
)
- logger.info('Adding job: {}'.format(job.id))
+ logger.info('Adding job {} to pool {}'.format(job.id, pool.id))
try:
batch_client.job.add(job)
except batchmodels.batch_error.BatchErrorException as ex:
@@ -1810,6 +1868,7 @@ def add_jobs(
del mi_ac
del multi_instance
del mi_docker_container_name
+ del uses_task_dependencies
# get base env vars from job
job_env_vars = settings.job_environment_variables(jobspec)
_job_env_vars_secid = \
@@ -1830,7 +1889,8 @@ def add_jobs(
if util.is_none_or_empty(settings.task_name(_task)):
settings.set_task_name(_task, '{}-{}'.format(job.id, _task_id))
del _task_id
- task = settings.task_settings(_pool, config, _task)
+ task = settings.task_settings(
+ cloud_pool, config, pool, _task, missing_images)
# retrieve keyvault task env vars
if util.is_not_empty(
task.environment_variables_keyvault_secret_id):
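
The pool existence check added to `add_jobs` above follows a confirm-or-abort pattern. A simplified, dependency-free sketch of that flow is shown below; `PoolNotFound`, `get_pool` and `confirm_action` are stand-ins for the Batch SDK exception, `batch_client.pool.get` and `util.confirm_action`:

```python
# Simplified sketch of the confirm-or-abort flow used when the target
# pool does not exist. PoolNotFound, get_pool and confirm_action are
# stand-ins for the Batch SDK exception, batch_client.pool.get and
# util.confirm_action respectively.
class PoolNotFound(Exception):
    pass


def get_pool(pool_id):
    # stand-in: pretend the pool does not exist
    raise PoolNotFound('The specified pool does not exist.')


def confirm_action(prompt):
    return input('{} [y/n]: '.format(prompt)).strip().lower() == 'y'


def resolve_pool(pool_id):
    try:
        return get_pool(pool_id)
    except PoolNotFound:
        if confirm_action('add jobs to nonexistent pool {}'.format(pool_id)):
            # proceed without live pool metadata; task settings then fall
            # back to the pool configuration instead of the CloudPool object
            return None
        raise RuntimeError(
            'not submitting jobs to nonexistent pool {}'.format(pool_id))
```
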
diff --git a/convoy/clients.py b/convoy/clients.py
index c806a6c..76fb414 100644
--- a/convoy/clients.py
+++ b/convoy/clients.py
@@ -118,6 +118,31 @@ def create_network_client(ctx, credentials=None, subscription_id=None):
credentials, subscription_id)
+def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None):
+ # type: (CliContext, object, str) ->
+ # azure.mgmt.batch.BatchManagementClient
+ """Create batch management client
+ :param CliContext ctx: Cli Context
+ :param object credentials: credentials object
+ :param str subscription_id: subscription id
+ :rtype: azure.mgmt.batch.BatchManagementClient
+ :return: batch management client
+ """
+ mgmt_aad = None
+ if credentials is None:
+ mgmt_aad = settings.credentials_management(ctx.config).aad
+ credentials = aad.create_aad_credentials(ctx, mgmt_aad)
+ if util.is_none_or_empty(subscription_id):
+ if mgmt_aad is None:
+ mgmt_aad = settings.credentials_management(ctx.config).aad
+ subscription_id = ctx.subscription_id or mgmt_aad.subscription_id
+ batch_mgmt_client = azure.mgmt.batch.BatchManagementClient(
+ credentials, subscription_id)
+ batch_mgmt_client.config.add_user_agent(
+ 'batch-shipyard/{}'.format(__version__))
+ return batch_mgmt_client
+
+
def create_arm_clients(ctx, batch_clients=False):
# type: (CliContext, bool) ->
# Tuple[azure.mgmt.resource.resources.ResourceManagementClient,
@@ -148,10 +173,16 @@ def create_arm_clients(ctx, batch_clients=False):
network_client = create_network_client(
ctx, credentials=credentials, subscription_id=subscription_id)
if batch_clients:
- batch_mgmt_client, batch_client = create_batch_clients(ctx)
+ batch_client = create_batch_service_client(ctx)
+ try:
+ batch_mgmt_client = create_batch_mgmt_client(
+ ctx, credentials=credentials, subscription_id=subscription_id)
+ except Exception:
+ logger.warning('could not create batch management client')
+ batch_mgmt_client = None
else:
- batch_mgmt_client = None
batch_client = None
+ batch_mgmt_client = None
return (
resource_client, compute_client, network_client, batch_mgmt_client,
batch_client
@@ -171,60 +202,25 @@ def create_keyvault_client(ctx):
)
-def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None):
- # type: (CliContext, object, str) ->
- # azure.mgmt.batch.BatchManagementClient
- """Create batch management client
+def create_batch_service_client(ctx):
+ # type: (CliContext) -> azure.batch.batch_service_client.BatchServiceClient
+ """Create batch service client
:param CliContext ctx: Cli Context
- :param object credentials: credentials object
- :param str subscription_id: subscription id
- :rtype: azure.mgmt.batch.BatchManagementClient
- :return: batch management client
- """
- batch_aad = None
- if credentials is None:
- batch_aad = settings.credentials_batch(ctx.config).aad
- credentials = aad.create_aad_credentials(ctx, batch_aad)
- if util.is_none_or_empty(subscription_id):
- if batch_aad is None:
- batch_aad = settings.credentials_batch(ctx.config).aad
- subscription_id = ctx.subscription_id or batch_aad.subscription_id
- if util.is_none_or_empty(subscription_id):
- return None
- batch_mgmt_client = azure.mgmt.batch.BatchManagementClient(
- credentials, subscription_id)
- batch_mgmt_client.config.add_user_agent(
- 'batch-shipyard/{}'.format(__version__))
- return batch_mgmt_client
-
-
-def create_batch_clients(ctx):
- # type: (CliContext) ->
- # Tuple[azure.mgmt.batch.BatchManagementClient,
- # azure.batch.batch_service_client.BatchServiceClient]
- """Create batch client
- :param CliContext ctx: Cli Context
- :rtype: tuple
- :return: (
- azure.mgmt.batch.BatchManagementClient,
- azure.batch.batch_service_client.BatchServiceClient)
+ :rtype: azure.batch.batch_service_client.BatchServiceClient
+ :return: batch service client
"""
bc = settings.credentials_batch(ctx.config)
- use_aad = bc.user_subscription or util.is_none_or_empty(bc.account_key)
- batch_mgmt_client = None
- if use_aad:
- subscription_id = ctx.subscription_id or bc.subscription_id
+ if util.is_none_or_empty(bc.account_key):
+ logger.debug('batch account key not specified, using aad auth')
batch_aad = settings.credentials_batch(ctx.config).aad
credentials = aad.create_aad_credentials(ctx, batch_aad)
- batch_mgmt_client = create_batch_mgmt_client(
- ctx, credentials=credentials, subscription_id=subscription_id)
else:
credentials = batchauth.SharedKeyCredentials(
bc.account, bc.account_key)
batch_client = batchsc.BatchServiceClient(
credentials, base_url=bc.account_service_url)
batch_client.config.add_user_agent('batch-shipyard/{}'.format(__version__))
- return (batch_mgmt_client, batch_client)
+ return batch_client
def create_storage_clients():
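
The reworked `create_batch_service_client` above selects shared-key or AAD authentication based on whether `account_key` is configured. A minimal sketch of that selection, assuming the `azure-batch 2.0.0` package noted in the changelog and placeholder credential values, could look like this:

```python
# Sketch: choose AAD or shared-key authentication for the Batch service
# client based on whether an account key is configured. All argument
# values and the version string are placeholders.
import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batchsc


def make_batch_service_client(
        account, account_key, account_service_url, aad_credentials):
    if not account_key:
        # no shared key configured: use the supplied AAD credentials
        credentials = aad_credentials
    else:
        credentials = batchauth.SharedKeyCredentials(account, account_key)
    client = batchsc.BatchServiceClient(
        credentials, base_url=account_service_url)
    client.config.add_user_agent('batch-shipyard/x.y.z')
    return client
```
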
diff --git a/convoy/fleet.py b/convoy/fleet.py
index df41f09..632578a 100644
--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@@ -43,6 +43,7 @@ except ImportError:
import uuid
# non-stdlib imports
import azure.batch.models as batchmodels
+import azure.mgmt.batch.models as batchmgmtmodels
# local imports
from . import batch
from . import crypto
@@ -451,10 +452,15 @@ def _add_pool(
raise ValueError(
'Invalid subnet name on virtual network {}'.format(
pool_settings.virtual_network.name))
+ if util.is_not_empty(pool_settings.virtual_network.resource_group):
+ _vnet_rg = pool_settings.virtual_network.resource_group
+ else:
+ _vnet_rg = bc.resource_group
# create virtual network and subnet if specified
vnet, subnet = resource.create_virtual_network_and_subnet(
- network_client, bc.resource_group, bc.location,
+ network_client, _vnet_rg, bc.location,
pool_settings.virtual_network)
+ del _vnet_rg
# ensure address prefix for subnet is valid
tmp = subnet.address_prefix.split('/')
if len(tmp) <= 1:
@@ -490,7 +496,9 @@ def _add_pool(
sc_arg = None
if storage_cluster_mount:
# ensure usersubscription account
- if not bc.user_subscription:
+ ba = batch.get_batch_account(batch_mgmt_client, config)
+        if (ba.pool_allocation_mode !=
+                batchmgmtmodels.PoolAllocationMode.user_subscription):
raise RuntimeError(
'{} account is not a UserSubscription account'.format(
bc.account))
@@ -1135,21 +1143,6 @@ def _adjust_settings_for_pool_creation(config):
# adjust inter node comm setting
if pool.vm_count < 1:
raise ValueError('invalid vm_count: {}'.format(pool.vm_count))
- dr = settings.data_replication_settings(config)
- max_vms = 20 if publisher == 'microsoftwindowsserver' else 40
- if pool.vm_count > max_vms:
- if dr.peer_to_peer.enabled:
- logger.warning(
- ('disabling peer-to-peer transfer as pool size of {} exceeds '
- 'max limit of {} vms for inter-node communication').format(
- pool.vm_count, max_vms))
- settings.set_peer_to_peer_enabled(config, False)
- if pool.inter_node_communication_enabled:
- logger.warning(
- ('disabling inter-node communication as pool size of {} '
- 'exceeds max limit of {} vms for setting').format(
- pool.vm_count, max_vms))
- settings.set_inter_node_communication_enabled(config, False)
# re-read pool and data replication settings
pool = settings.pool_settings(config)
dr = settings.data_replication_settings(config)
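
The UserSubscription check in `_add_pool` now derives the account's pool allocation mode from ARM via `get_batch_account` instead of the removed `user_subscription` credentials flag. A hedged sketch of that lookup, with placeholder resource group and account names, follows:

```python
# Sketch: determine whether a Batch account uses user-subscription pool
# allocation via the management (ARM) API. The resource group and
# account names passed in are placeholders.
import azure.mgmt.batch.models as batchmgmtmodels


def is_user_subscription_account(batch_mgmt_client, resource_group, account):
    if batch_mgmt_client is None:
        raise RuntimeError(
            'Batch management client is invalid, please specify management '
            'aad credentials')
    ba = batch_mgmt_client.batch_account.get(
        resource_group_name=resource_group,
        account_name=account,
    )
    return (ba.pool_allocation_mode ==
            batchmgmtmodels.PoolAllocationMode.user_subscription)
```
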
diff --git a/convoy/remotefs.py b/convoy/remotefs.py
index 1d1d13c..6368568 100644
--- a/convoy/remotefs.py
+++ b/convoy/remotefs.py
@@ -632,9 +632,14 @@ def create_storage_cluster(
# upload scripts to blob storage for customscript
blob_urls = storage.upload_for_remotefs(blob_client, remotefs_files)
# create virtual network and subnet if specified
+ if util.is_not_empty(rfs.storage_cluster.virtual_network.resource_group):
+ _vnet_rg = rfs.storage_cluster.virtual_network.resource_group
+ else:
+ _vnet_rg = rfs.resource_group
vnet, subnet = resource.create_virtual_network_and_subnet(
- network_client, rfs.resource_group, rfs.location,
+ network_client, _vnet_rg, rfs.location,
rfs.storage_cluster.virtual_network)
+ del _vnet_rg
# TODO create slb
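
Both `_add_pool` and `create_storage_cluster` now resolve the virtual network's resource group by falling back to the primary resource group when the new `resource_group` property is unset. The pattern reduces to a small helper such as the following (illustrative only; not part of this diff):

```python
# Sketch: pick the virtual network resource group if configured,
# otherwise fall back to the default resource group, e.g.
# resolve_vnet_resource_group(pool.virtual_network.resource_group,
#                             bc.resource_group)
def resolve_vnet_resource_group(vnet_resource_group, default_resource_group):
    if vnet_resource_group:
        return vnet_resource_group
    return default_resource_group
```
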
diff --git a/convoy/settings.py b/convoy/settings.py
index cf0ff17..ae38e3c 100644
--- a/convoy/settings.py
+++ b/convoy/settings.py
@@ -98,7 +98,7 @@ ManagementCredentialsSettings = collections.namedtuple(
BatchCredentialsSettings = collections.namedtuple(
'BatchCredentialsSettings', [
'aad', 'account', 'account_key', 'account_service_url',
- 'user_subscription', 'resource_group', 'subscription_id', 'location',
+ 'resource_group', 'subscription_id', 'location',
]
)
StorageCredentialsSettings = collections.namedtuple(
@@ -176,8 +176,8 @@ ManagedDisksSettings = collections.namedtuple(
)
VirtualNetworkSettings = collections.namedtuple(
'VirtualNetworkSettings', [
- 'name', 'address_space', 'subnet_name', 'subnet_address_prefix',
- 'existing_ok', 'create_nonexistant',
+ 'name', 'resource_group', 'address_space', 'subnet_name',
+ 'subnet_address_prefix', 'existing_ok', 'create_nonexistant',
]
)
FileServerSettings = collections.namedtuple(
@@ -718,7 +718,6 @@ def credentials_batch(config):
account = _kv_read_checked(conf, 'account')
account_key = _kv_read_checked(conf, 'account_key')
account_service_url = conf['account_service_url']
- user_subscription = _kv_read(conf, 'user_subscription', False)
resource_group = _kv_read_checked(conf, 'resource_group')
# get subscription id from management section
try:
@@ -749,7 +748,6 @@ def credentials_batch(config):
account=account,
account_key=account_key,
account_service_url=conf['account_service_url'],
- user_subscription=user_subscription,
resource_group=resource_group,
location=location,
subscription_id=subscription_id,
@@ -1804,6 +1802,22 @@ def job_max_task_retries(conf):
return max_task_retries
+def job_allow_run_on_missing(conf):
+    # type: (dict) -> bool
+    """Get allow run on missing image setting
+ :param dict conf: job configuration object
+ :rtype: bool
+ :return: allow run on missing image
+ """
+ try:
+ allow = conf['allow_run_on_missing_image']
+ if allow is None:
+ raise KeyError()
+ except KeyError:
+ allow = False
+ return allow
+
+
def has_depends_on_task(conf):
# type: (dict) -> bool
"""Determines if task has task dependencies
@@ -1825,7 +1839,7 @@ def has_depends_on_task(conf):
def is_multi_instance_task(conf):
# type: (dict) -> bool
"""Determines if task is multi-isntance
- :param dict conf: job configuration object
+ :param dict conf: task configuration object
:rtype: bool
:return: task is multi-instance
"""
@@ -1835,7 +1849,7 @@ def is_multi_instance_task(conf):
def task_name(conf):
# type: (dict) -> str
"""Get task name
- :param dict conf: job configuration object
+ :param dict conf: task configuration object
:rtype: str
:return: task name
"""
@@ -1848,10 +1862,26 @@ def task_name(conf):
return name
+def task_docker_image(conf):
+ # type: (dict) -> str
+ """Get docker image used by task
+ :param dict conf: task configuration object
+ :rtype: str
+ :return: docker image used by task
+ """
+ try:
+ di = conf['image']
+ if util.is_none_or_empty(di):
+ raise KeyError()
+ except KeyError:
+ di = None
+ return di
+
+
def set_task_name(conf, name):
# type: (dict, str) -> None
"""Set task name
- :param dict conf: job configuration object
+ :param dict conf: task configuration object
:param str name: task name to set
"""
conf['name'] = name
@@ -1860,7 +1890,7 @@ def set_task_name(conf, name):
def task_id(conf):
# type: (dict) -> str
"""Get task id
- :param dict conf: job configuration object
+ :param dict conf: task configuration object
:rtype: str
:return: task id
"""
@@ -1876,18 +1906,21 @@ def task_id(conf):
def set_task_id(conf, id):
# type: (dict, str) -> None
"""Set task id
- :param dict conf: job configuration object
+ :param dict conf: task configuration object
:param str id: task id to set
"""
conf['id'] = id
-def task_settings(pool, config, conf):
- # type: (azure.batch.models.CloudPool, dict, dict) -> TaskSettings
+def task_settings(cloud_pool, config, poolconf, conf, missing_images):
+ # type: (azure.batch.models.CloudPool, dict, PoolSettings,
+ # dict, list) -> TaskSettings
"""Get task settings
- :param azure.batch.models.CloudPool pool: cloud pool object
+ :param azure.batch.models.CloudPool cloud_pool: cloud pool object
:param dict config: configuration dict
- :param dict conf: job configuration object
+ :param PoolSettings poolconf: pool settings
+ :param dict conf: task configuration object
+ :param list missing_images: list of missing docker images on pool
:rtype: TaskSettings
:return: task settings
"""
@@ -1898,11 +1931,36 @@ def task_settings(pool, config, conf):
image = conf['image']
if util.is_none_or_empty(image):
raise ValueError('image is invalid')
+ # check if image is in missing image list
+ if image in missing_images:
+ # get private registry settings
+ preg = docker_registry_private_settings(config)
+ if util.is_not_empty(preg.storage_account):
+ registry = 'localhost:5000/'
+ elif util.is_not_empty(preg.server):
+ registry = '{}/'.format(preg.server)
+ else:
+ registry = ''
+ del preg
+ image = '{}{}'.format(registry, image)
# get some pool props
- publisher = pool.virtual_machine_configuration.image_reference.\
- publisher.lower()
- offer = pool.virtual_machine_configuration.image_reference.offer.lower()
- sku = pool.virtual_machine_configuration.image_reference.sku.lower()
+ if cloud_pool is None:
+ pool_id = poolconf.id
+ publisher = poolconf.publisher.lower()
+ offer = poolconf.offer.lower()
+ sku = poolconf.sku.lower()
+ vm_size = poolconf.vm_size
+ inter_node_comm = poolconf.inter_node_communication_enabled
+ else:
+ pool_id = cloud_pool.id
+ publisher = cloud_pool.virtual_machine_configuration.image_reference.\
+ publisher.lower()
+ offer = cloud_pool.virtual_machine_configuration.image_reference.\
+ offer.lower()
+ sku = cloud_pool.virtual_machine_configuration.image_reference.sku.\
+ lower()
+ vm_size = cloud_pool.vm_size.lower()
+ inter_node_comm = cloud_pool.enable_inter_node_communication
# get depends on
try:
depends_on = conf['depends_on']
@@ -2088,10 +2146,10 @@ def task_settings(pool, config, conf):
gpu = False
# adjust for gpu settings
if gpu:
- if not is_gpu_pool(pool.vm_size):
+ if not is_gpu_pool(vm_size):
raise RuntimeError(
('cannot initialize a gpu task on nodes without '
- 'gpus, pool: {} vm_size: {}').format(pool.id, pool.vm_size))
+ 'gpus, pool: {} vm_size: {}').format(pool_id, vm_size))
# TODO other images as they become available with gpu support
if (publisher != 'canonical' and offer != 'ubuntuserver' and
sku < '16.04'):
@@ -2107,16 +2165,16 @@ def task_settings(pool, config, conf):
docker_exec_cmd = 'docker exec'
# adjust for infiniband
if infiniband:
- if not pool.enable_inter_node_communication:
+ if not inter_node_comm:
raise RuntimeError(
('cannot initialize an infiniband task on a '
'non-internode communication enabled '
- 'pool: {}').format(pool.id))
- if not is_rdma_pool(pool.vm_size):
+ 'pool: {}').format(pool_id))
+ if not is_rdma_pool(vm_size):
raise RuntimeError(
('cannot initialize an infiniband task on nodes '
'without RDMA, pool: {} vm_size: {}').format(
- pool.id, pool.vm_size))
+ pool_id, vm_size))
# only centos-hpc and sles-hpc:12-sp1 are supported
# for infiniband
if publisher == 'openlogic' and offer == 'centos-hpc':
@@ -2147,7 +2205,7 @@ def task_settings(pool, config, conf):
run_opts.append('--env-file {}'.format(envfile))
# populate mult-instance settings
if is_multi_instance_task(conf):
- if not pool.enable_inter_node_communication:
+ if not inter_node_comm:
raise RuntimeError(
('cannot run a multi-instance task on a '
'non-internode communication enabled '
@@ -2194,7 +2252,12 @@ def task_settings(pool, config, conf):
if num_instances == 'pool_specification_vm_count':
num_instances = pool_vm_count(config)
elif num_instances == 'pool_current_dedicated':
- num_instances = pool.current_dedicated
+ if cloud_pool is None:
+ raise RuntimeError(
+ ('Cannot retrieve current dedicated count for '
+                     'pool: {}. Ensure pool exists.'.format(pool_id)))
+ else:
+ num_instances = cloud_pool.current_dedicated
else:
raise ValueError(
('multi instance num instances setting '
@@ -2267,6 +2330,7 @@ def virtual_network_settings(
except KeyError:
conf = {}
name = _kv_read_checked(conf, 'name')
+ resource_group = _kv_read_checked(conf, 'resource_group')
address_space = _kv_read_checked(conf, 'address_space')
existing_ok = _kv_read(conf, 'existing_ok', default_existing_ok)
subnet_name = _kv_read_checked(conf['subnet'], 'name')
@@ -2275,6 +2339,7 @@ def virtual_network_settings(
conf, 'create_nonexistant', default_create_nonexistant)
return VirtualNetworkSettings(
name=name,
+ resource_group=resource_group,
address_space=address_space,
subnet_name=subnet_name,
subnet_address_prefix=subnet_address_prefix,
@@ -2331,9 +2396,9 @@ def remotefs_settings(config):
)
if not isinstance(sc_ns_inbound['nfs'].source_address_prefix, list):
raise ValueError('expected list for nfs network security rule')
- if 'custom_inbound' in ns_conf:
+ if 'custom_inbound_rules' in ns_conf:
_reserved = frozenset(['ssh', 'nfs', 'glusterfs'])
- for key in ns_conf['custom_inbound']:
+ for key in ns_conf['custom_inbound_rules']:
# ensure key is not reserved
if key.lower() in _reserved:
raise ValueError(
@@ -2341,11 +2406,13 @@ def remotefs_settings(config):
'reserved name {}').format(key, _reserved))
sc_ns_inbound[key] = InboundNetworkSecurityRule(
destination_port_range=_kv_read_checked(
- ns_conf['custom_inbound'][key], 'destination_port_range'),
+ ns_conf['custom_inbound_rules'][key],
+ 'destination_port_range'),
source_address_prefix=_kv_read_checked(
- ns_conf['custom_inbound'][key], 'source_address_prefix'),
+ ns_conf['custom_inbound_rules'][key],
+ 'source_address_prefix'),
protocol=_kv_read_checked(
- ns_conf['custom_inbound'][key], 'protocol'),
+ ns_conf['custom_inbound_rules'][key], 'protocol'),
)
if not isinstance(sc_ns_inbound[key].source_address_prefix, list):
raise ValueError(
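
`task_settings` now tolerates a missing live pool by sourcing pool properties from the pool configuration rather than the `CloudPool` object. A condensed, dependency-free sketch of that fallback is shown below; the `CloudPool` is any object exposing the attributes read here, and the `PoolSettings` fields mirror the ones used above:

```python
# Sketch: derive pool properties either from a live CloudPool object or,
# when the pool does not exist yet, from the pool configuration settings.
import collections

PoolSettings = collections.namedtuple(
    'PoolSettings',
    ['id', 'publisher', 'offer', 'sku', 'vm_size',
     'inter_node_communication_enabled'])


def pool_properties(cloud_pool, poolconf):
    if cloud_pool is None:
        return (poolconf.id, poolconf.publisher.lower(),
                poolconf.offer.lower(), poolconf.sku.lower(),
                poolconf.vm_size, poolconf.inter_node_communication_enabled)
    ir = cloud_pool.virtual_machine_configuration.image_reference
    return (cloud_pool.id, ir.publisher.lower(), ir.offer.lower(),
            ir.sku.lower(), cloud_pool.vm_size.lower(),
            cloud_pool.enable_inter_node_communication)
```
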
diff --git a/docs/10-batch-shipyard-configuration.md b/docs/10-batch-shipyard-configuration.md
index c982152..1ad1c9d 100644
--- a/docs/10-batch-shipyard-configuration.md
+++ b/docs/10-batch-shipyard-configuration.md
@@ -7,10 +7,12 @@ Batch Shipyard is driven by the following json configuration files:
1. [Credentials](11-batch-shipyard-configuration-credentials.md) -
credentials for Azure Batch, Storage, KeyVault, Management and Docker private
registries
-2. [Global config](#global) - Batch Shipyard and Docker-specific configuration
-settings
-3. [Pool](#pool) - Azure Batch pool configuration
-4. [Jobs](#jobs) - Azure Batch jobs and tasks configuration
+2. [Global config](12-batch-shipyard-configuration-global.md) -
+Batch Shipyard and Docker-specific configuration settings
+3. [Pool](13-batch-shipyard-configuration-pool.md) -
+Batch Shipyard pool configuration
+4. [Jobs](14-batch-shipyard-configuration-jobs.md) -
+Batch Shipyard jobs and tasks configuration
Note that all potential properties are described here and that specifying
all such properties may result in invalid configuration as some properties
@@ -26,940 +28,5 @@ may be invalid if specified as such. They must be modified for your execution
scenario. All [sample recipe](../recipes) also have a set of configuration
files that can be modified to fit your needs.
-### Global Config
-The global config schema is as follows:
-
-```json
-{
- "batch_shipyard": {
- "storage_account_settings": "mystorageaccount",
- "storage_entity_prefix": "shipyard",
- "generated_sas_expiry_days": 90,
- "encryption" : {
- "enabled": true,
- "pfx": {
- "filename": "encrypt.pfx",
- "passphrase": "mysupersecretpassword",
- "sha1_thumbprint": "123456789..."
- },
- "public_key_pem": "encrypt.pem"
- }
- },
- "docker_registry": {
- "private": {
- "allow_public_docker_hub_pull_on_missing": true,
- "server": "myserver-myorg.azurecr.io",
- "azure_storage": {
- "storage_account_settings": "mystorageaccount",
- "container": "mydockerregistry"
- }
- }
- },
- "data_replication": {
- "peer_to_peer": {
- "enabled": true,
- "compression": true,
- "concurrent_source_downloads": 10,
- "direct_download_seed_bias": null
- },
- "non_peer_to_peer_concurrent_downloading": true
- },
- "global_resources": {
- "docker_images": [
- "busybox",
- "redis:3.2.3-alpine",
- ],
- "files": [
- {
- "source": {
- "path": "/some/local/path/dir",
- "include": ["*.dat"],
- "exclude": ["*.bak"]
- },
- "destination": {
- "shared_data_volume": "glustervol",
- "relative_destination_path": "myfiles",
- "data_transfer": {
- "method": "multinode_scp",
- "ssh_private_key": "id_rsa_shipyard",
- "scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com",
- "rsync_extra_options": "",
- "split_files_megabytes": 500,
- "max_parallel_transfers_per_node": 2
- }
- }
- },
- {
- "source": {
- "path": "/some/local/path/bound/for/blob",
- "include": ["*.bin"]
- },
- "destination": {
- "storage_account_settings": "mystorageaccount",
- "data_transfer": {
- "container": "mycontainer",
- "blobxfer_extra_options": "--no-computefilemd5"
- }
- }
- },
- {
- "source": {
- "path": "/another/local/path/dir",
- "include": [],
- "exclude": []
- },
- "destination": {
- "relative_destination_path": "relpath/on/host",
- "data_transfer": {
- "method": "rsync+ssh",
- "ssh_private_key": "id_rsa_shipyard",
- "scp_ssh_extra_options": "-c aes256-gcm@openssh.com",
- "rsync_extra_options": "-v"
- }
- }
- }
- ],
- "docker_volumes": {
- "data_volumes": {
- "abcvol": {
- "host_path": null,
- "container_path": "/abc"
- },
- "hosttempvol": {
- "host_path": "/tmp",
- "container_path": "/hosttmp"
- }
- },
- "shared_data_volumes": {
- "shipyardvol": {
- "volume_driver": "azurefile",
- "storage_account_settings": "mystorageaccount",
- "azure_file_share_name": "shipyardshared",
- "container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile",
- "mount_options": [
- "filemode=0777",
- "dirmode=0777",
- "nolock=true"
- ]
- },
- "glustervol": {
- "volume_driver": "glusterfs_on_compute",
- "container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs",
- "volume_type": "replica",
- "volume_options": [
- "performance.cache-size 1 GB",
- "performance.cache-max-file-size 10 MB",
- "performance.cache-refresh-timeout 61",
- ]
- }
- }
- }
- }
-}
-```
-
-The `batch_shipyard` property is used to set settings for the tool.
-* (required) `storage_account_settings` is a link to the alias of the storage
-account specified, in this case, it is `mystorageaccount`. Batch shipyard
-requires a storage account for storing metadata in order to execute across a
-distributed environment.
-* (optional) `storage_entity_prefix` property is used as a generic qualifier
-to prefix storage containers (blob containers, tables, queues) with. If not
-specified, defaults to `shipyard`.
-* (optional) `generated_sas_expiry_days` property is used to set the number of
-days any generated SAS key by Batch Shipyard is valid for. The default is 30
-days. This is useful if you have long-lived pools and want to ensure that
-SAS keys are valid for longer periods of time.
-* (optional) `encryption` object is used to define credential encryption which
-contains the following members:
- * (required) `enabled` property enables or disables this feature.
- * (required) `pfx` object defines the PFX certificate
- * (required) `filename` property is the full path and name to the PFX
- certificate
- * (required) `passphrase` property is the passphrase for the PFX
- certificate. This cannot be empty.
- * (optional) `sha1_thumbprint` is the SHA1 thumbprint of the
- certificate. If the PFX file is created using the `cert create` command,
- then the SHA1 thumbprint is output. It is recommended to populate this
- property such that it does not have to be generated when needed for
- encryption.
- * (optional) `public_key_pem` property is the full path and name to the
- RSA public key in PEM format. If the PFX file is created using the
- `cert create` command, then this file is generated along with the PFX
- file. It is recommended to populate this property with the PEM file path
- such that it does not have to be generated when needed for encryption.
-
-The `docker_registry` property is used to configure Docker image distribution
-options from public/private Docker hub and private registries.
-* (optional) `private` property controls settings for interacting with private
-registries. There are three kinds of private registries that are supported:
-(1) private registries hosted on Docker Hub, (2) Internet accessible
-registries such as those hosted by the
-[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/)
-service and (3) [private registry instances backed to
-Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/)
-and are run on compute nodes. To use private registries hosted on Docker Hub,
-no additional properties need to be specified here, instead, specify your
-Docker Hub login information in the credentials json. To specify a private
-registry other than on Docker Hub, a json property named `server` should be
-defined. To use a private registry backed by Azure Blob Storage, define a
-json object named `azure_storage`. Note that a maximum of only one of these
-three types private registries may be specified at once. The following
-describes members of the non-Docker Hub private registries supported:
- * (optional) `server` object is a property that is the fully-qualified host
- name to a private registry server. A specific port other than 80 can be
- specified using a `:` separator, e.g.,
- `mydockerregistry.com:8080`. Port 80 is the default if no port is
- specified. The value of this property should have an associated login
- in the credentials json file.
- * (optional) `azure_storage` object is to define settings for connecting
- to a private registry backed by Azure Storage blobs and where the
- private registry instances are hosted on the compute nodes themselves.
- * (required) `storage_account_settings` is a link to the alias of the
- storage account specified that stores the private registry blobs.
- * (required) `container` property is the name of the Azure Blob
- container holding the private registry blobs.
- * (optional) `allow_public_docker_hub_pull_on_missing` property allows
- pass-through of Docker image retrieval to public Docker Hub if it is
- missing in the private registry. This defaults to `false` if not
- specified.
-
-The `data_replication` property is used to configure the internal image
-replication mechanism between compute nodes within a compute pool. The
-`non_peer_to_peer_concurrent_downloading` property specifies if it is ok
-to allow unfettered concurrent downloading from the source registry among
-all compute nodes. The following options apply to `peer_to_peer` data
-replication options:
-* (optional) `enabled` property enables or disables private peer-to-peer
-transfer. Note that for compute pools with a relatively small number of VMs,
-peer-to-peer transfer may not provide any benefit and is recommended to be
-disabled in these cases. Compute pools with large number of VMs and especially
-in the case of an Azure Storage-backed private registry can benefit from
-peer-to-peer image replication.
-* `compression` property enables or disables compression of image files. It
-is strongly recommended to keep this enabled.
-* `concurrent_source_downloads` property specifies the number of
-simultaneous downloads allowed to each image.
-* `direct_download_seed_bias` property sets the number of direct download
-seeds to prefer per image before switching to peer-to-peer transfer.
-
-The `global_resources` property contains information regarding required
-Docker images, volume configuration and data ingress information. This
-property is required.
-
-`docker_images` is an array of docker images that should be installed on
-every compute node when this configuration file is supplied while creating
-a compute pool. Image tags are supported. Image names should not include
-private registry server names, as these will be automatically prepended. For
-instance, if you have an image `abc/mytag` on your private registry
-`myregistry-myorg.azurecr.io`, your image should be named in the
-`docker_images` array as `abc/mytag` and not
-`myregistry-myorg.azurecr.io/abc/mytag`.
-
-`files` is an optional property that specifies data that should be ingressed
-from a location accessible by the local machine (i.e., machine invoking
-`shipyard.py` to a shared file system location accessible by compute nodes
-in the pool or Azure Blob or File Storage). `files` is a json list of objects,
-which allows for multiple sources to destinations to be ingressed during the
-same invocation. Note that no Azure Batch environment variables
-(i.e., `$AZ_BATCH_`-style environment variables) are available as path
-arguments since ingress actions performed within `files` are done locally
-on the machine invoking `shipyard.py`. Each object within the `files` list
-contains the following members:
-* (required) `source` property contains the following members:
- * (required) `path` is a local path. A single file or a directory
- can be specified. Filters below will be ignored if `path` is a file and
- not a directory.
- * (optional) `include` is an array of
- [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
- where only files matching a filter are included in the data transfer.
- Filters specified in `include` have precedence over `exclude` described
- next. `include` can only have a maximum of 1 filter for ingress to Azure
- Blob Storage. In this example, all files ending in `.dat` are ingressed.
- * (optional) `exclude` is an array of
- [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
- where files matching a filter are excluded from the data transfer. Filters
- specified in `include` have precedence over filters specified in
- `exclude`. `exclude` cannot be specified for ingress into Azure Blob
- Storage. In this example, all files ending in `.bak` are skipped for
- ingress.
-* (required) `destination` property contains the following members:
- * (required or optional) `shared_data_volume` or `storage_account_settings`
- for data ingress to a GlusterFS volume or Azure Blob or File Storage. If
- you are ingressing to a pool with only one compute node, you may omit
- `shared_data_volume`. Otherwise, you may specify one or the other, but
- not both in the same object. Please see below in the
- `shared_data_volumes` for information on how to set up a GlusterFS share.
- * (required or optional) `relative_destination_path` specifies a relative
- destination path to place the files, with respect to the target root.
- If transferring to a `shared_data_volume` then this is relative to the
- GlusterFS volume root. If transferring to a pool with one single node in
- it, thus, no `shared_data_volume` is specified in the prior property, then
- this is relative to
- [$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories).
- To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended),
- you can specify this property as empty string when not ingressing to
- a `shared_data_volume`. Note that if `scp` is selected while attempting
- to transfer directly to this aforementioned path, then `scp` will fail
- with exit code of 1 but the transfer will have succeeded (this is due
- to some of the permission options). If this property is not specified for
- a `shared_data_volume`, then files will be placed directly in the
- GlusterFS volume root. This property cannot be specified for a Azure
- Storage destination (i.e., `storage_account_settings`).
- * (required) `data_transfer` specifies how the transfer should take place.
- The following list contains members for GlusterFS ingress when a GlusterFS
- volume is provided for `shared_data_volume` (see below for ingressing to
- Azure Blob or File Storage):
- * (required) `method` specified which method should be used to ingress
- data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
- `multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
- directory (recursively) to the remote share path. `multinode_scp` will
- attempt to simultaneously transfer files to many compute nodes using
- `scp` at the same time to speed up data transfer. `rsync+ssh` will
- perform an rsync of files through SSH. `multinode_rsync+ssh` will
- attempt to simultaneously transfer files using `rsync` to many compute
- nodes at the same time to speed up data transfer with. Note that you may
- specify the `multinode_*` methods even with only 1 compute node in a
- pool which will allow you to take advantage of
- `max_parallel_transfers_per_node` below.
- * (optional) `ssh_private_key` location of the SSH private key for the
- username specified in the `pool_specification`:`ssh` section when
- connecting to compute nodes. The default is `id_rsa_shipyard`, if
- omitted, which is automatically generated if no SSH key is specified
- when an SSH user is added to a pool.
- * (optional) `scp_ssh_extra_options` are any extra options to pass to
- `scp` or `ssh` for `scp`/`multinode_scp` or
- `rsync+ssh`/`multinode_rsync+ssh` methods, respectively. In the example
- above, `-C` enables compression and `-c aes256-gcm@openssh.com`
- is passed to `scp`, which can potentially increase the transfer speed by
- selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel
- AES-NI.
- * (optional) `rsync_extra_options` are any extra options to pass to
- `rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This
- property is ignored for non-rsync transfer methods.
- * (optional) `split_files_megabytes` splits files into chunks with the
- specified size in MiB. This can potentially help with very large files.
- This option forces the transfer `method` to `multinode_scp`.
- Note that the destination file system must be able to accommodate
- up to 2x the size of files which are split. Additionally, transfers
- involving files which are split will incur reconstruction costs after
- the transfer is complete, which will increase the total end-to-end
- ingress time. However, in certain scenarios, by splitting files and
- transferring chunks in parallel along with reconstruction may end up
- being faster than transferring a large file without chunking.
- * (optional) `max_parallel_transfers_per_node` is the maximum number of
- parallel transfer to invoke per node with the
- `multinode_scp`/`multinode_rsync+ssh` methods. For example, if there
- are 3 compute nodes in the pool, and `2` is given for this option, then
- there will be up to 2 scp sessions in parallel per compute node for a
- maximum of 6 concurrent scp sessions to the pool. The default is 1 if
- not specified or omitted.
- * (required) `data_transfer` specifies how the transfer should take place.
- When Azure Blob or File Storage is selected as the destination for data
- ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The
- following list contains members for Azure Blob or File Storage ingress
- when a storage account link is provided for `storage_account_settings`:
- * (required) `container` or `file_share` is required when uploading to
- Azure Blob Storage or Azure File Storage, respectively. `container`
- specifies which container to upload to for Azure Blob Storage while
- `file_share` specifies which file share to upload to for Azure File
- Storage. Only one of these properties can be specified per
- `data_transfer` object. The container or file share need not be created
- beforehand.
- * (optional) `blobxfer_extra_options` are any extra options to pass to
- `blobxfer`. In the example above, `--no-computefilemd5` will force
- `blobxfer` to skip MD5 calculation on files ingressed.
-
-`docker_volumes` is an optional property that can consist of two
-different types of volumes: `data_volumes` and `shared_data_volumes`.
-`data_volumes` can be of two flavors depending upon if `host_path` is set to
-null or not. In the former, this is typically used with the `VOLUME` keyword
-in Dockerfiles to initialize a data volume with existing data inside the
-image. If `host_path` is set, then the path on the host is mounted in the
-container at the path specified with `container_path`.
-
-`shared_data_volumes` is an optional property for initializing persistent
-shared storage volumes. In the first shared volume, `shipyardvol` is the alias
-of this volume:
-* `volume_driver` property specifies the Docker Volume Driver to use.
-Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or
-`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker
-Volume Driver. For this volume (`shipyardvol`), as this is an Azure File
-shared volume, the `volume_driver` should be set as `azurefile`.
-* `storage_account_settings` is a link to the alias of the storage account
-specified that holds this Azure File Share.
-* `azure_file_share_name` is the name of the share name on Azure Files. Note
-that the Azure File share must be created beforehand, the toolkit does not
-create Azure File shares, it only mounts them to the compute nodes.
-* `container_path` is the path in the container to mount.
-* `mount_options` are the mount options to pass to the mount command. Supported
-options are documented
-[here](https://github.com/Azure/azurefile-dockervolumedriver). It is
-recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and
-`gid` cannot be reliably determined before the compute pool is allocated and
-this volume will be mounted as the root user.
-
-Note that when using `azurefile` for a shared data volume, the storage account
-that holds the file share must reside within the same Azure region as the
-Azure Batch compute pool. Attempting to mount an Azure File share that is
-cross-region will result in failure as current Linux Samba clients do not
-support share level encryption at this time.
-
-The second shared volue, `glustervol`, is a
-[GlusterFS](https://www.gluster.org/) network file system. Please note that
-`glusterfs_on_compute` are GlusterFS volumes co-located on the VM's temporary
-local disk space which is a shared resource. Sizes of the local temp disk for
-each VM size can be found
-[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/).
-If specifying a `glusterfs_on_compute` volume, you must enable internode
-communication in the pool configuration file. These volumes have the following
-properties:
-* (required) `volume_driver` property should be set as `glusterfs_on_compute`.
-* (required) `container_path` is the path in the container to mount.
-* (optional) `volume_type` property defines the GlusterFS volume type.
-Currently, `replica` is the only supported type.
-* (optional) `volume_options` property defines additional GlusterFS volume
-options to set.
-
-`glusterfs_on_compute` volumes are mounted on the host at
-`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically
-replace container path references in direct and storage-based data
-ingress/egress with their host path equivalents.
-
-Note that when resizing a pool with a `glusterfs_on_compute` shared file
-systems that you must resize with the `pool resize` command in `shipyard.py`
-and not with Azure Portal, Batch Explorer or any other tool.
-
-Finally, note that all `docker_volumes` can be omitted completely along with
-one or all of `data_volumes` and `shared_data_volumes` if you do not require
-this functionality.
-
-An example global config json template can be found
-[here](../config\_templates/config.json).
-
-### Pool
-The pool schema is as follows:
-
-```json
-{
- "pool_specification": {
- "id": "dockerpool",
- "vm_size": "STANDARD_A9",
- "vm_count": 10,
- "max_tasks_per_node": 1,
- "inter_node_communication_enabled": true,
- "publisher": "OpenLogic",
- "offer": "CentOS-HPC",
- "sku": "7.1",
- "reboot_on_start_task_failed": true,
- "block_until_all_global_resources_loaded": true,
- "transfer_files_on_pool_creation": false,
- "input_data": {
- "azure_batch": [
- {
- "job_id": "jobonanotherpool",
- "task_id": "mytask",
- "include": ["wd/*.dat"],
- "exclude": ["*.txt"],
- "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool"
- }
- ],
- "azure_storage": [
- {
- "storage_account_settings": "mystorageaccount",
- "container": "poolcontainer",
- "include": ["pooldata*.bin"],
- "destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata",
- "blobxfer_extra_options": null
- }
- ]
- },
- "ssh": {
- "username": "docker",
- "expiry_days": 7,
- "ssh_public_key": null,
- "generate_docker_tunnel_script": true,
- "generated_file_export_path": null,
- "hpn_server_swap": false
- },
- "gpu": {
- "nvidia_driver": {
- "source": "https://some.url"
- }
- },
- "additional_node_prep_commands": [
- ]
- }
-}
-```
-
-The `pool_specification` property has the following members:
-* (required) `id` is the compute pool ID.
-* (required) `vm_size` is the
-[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/).
-Please note that not all regions have every VM size available.
-* (required) `vm_count` is the number of compute nodes to allocate.
-* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks
-that can be running at any one time on a compute node. This defaults to a
-value of 1 if not specified.
-* (optional) `inter_node_communication_enabled` designates if this pool is set
-up for inter-node communication. This must be set to `true` for any containers
-that must communicate with each other such as MPI applications. This property
-will be force enabled if peer-to-peer replication is enabled.
-* (required) `publisher` is the publisher name of the Marketplace VM image.
-* (required) `offer` is the offer name of the Marketplace VM image.
-* (required) `sku` is the sku name of the Marketplace VM image.
-* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the
-compute node in case there is a transient failure in node preparation (e.g.,
-network timeout, resolution failure or download problem). This defaults to
-`false`.
-* (optional) `block_until_all_global_resources_loaded` will block the node
-from entering ready state until all Docker images are loaded. This defaults
-to `true`.
-* (optional) `transfer_files_on_pool_creation` will ingress all `files`
-specified in the `global_resources` section of the configuration json when
-the pool is created. If files are to be ingressed to Azure Blob or File
-Storage, then data movement operations are overlapped with the creation of the
-pool. If files are to be ingressed to a shared file system on the compute
-nodes, then the files are ingressed after the pool is created and the shared
-file system is ready. Files can be ingressed to both Azure Blob Storage and a
-shared file system during the same pool creation invocation. If this property
-is set to `true` then `block_until_all_global_resources_loaded` will be force
-disabled. If omitted, this property defaults to `false`.
-* (optional) `input_data` is an object containing data that should be
-ingressed to all compute nodes as part of node preparation. It is
-important to note that if you are combining this action with `files` and
-are ingressing data to Azure Blob or File storage as part of pool creation,
-that the blob containers or file shares defined here will be downloaded as
-soon as the compute node is ready to do so. This may result in the blob
-container/blobs or file share/files not being ready in time for the
-`input_data` transfer. It is up to you to ensure that these two operations do
-not overlap. If there is a possibility of overlap, then you should ingress
-data defined in `files` prior to pool creation and disable the option above
-`transfer_files_on_pool_creation`. This object currently supports
-`azure_batch` and `azure_storage` as members.
- * `azure_batch` contains the following members:
- * (required) `job_id` the job id of the task
- * (required) `task_id` the id of the task to fetch files from
- * (optional) `include` is an array of include filters
- * (optional) `exclude` is an array of exclude filters
- * (required) `destination` is the destination path to place the files
- * `azure_storage` contains the following members:
- * (required) `storage_account_settings` contains a storage account link
- as defined in the credentials json.
- * (required) `container` or `file_share` is required when downloading
- from Azure Blob Storage or Azure File Storage, respectively.
- `container` specifies which container to download from for Azure Blob
- Storage while `file_share` specifies which file share to download from
- for Azure File Storage. Only one of these properties can be specified
- per `data_transfer` object.
- * (optional) `include` property defines an optional include filter.
- Although this property is an array, it is only allowed to have 1
- maximum filter.
- * (required) `destination` property defines where to place the
- downloaded files on the host file system. Please note that you should
- not specify a destination that is on a shared file system. If you
- require ingressing to a shared file system location like a GlusterFS
- volume, then use the global configuration `files` property and the
- `data ingress` command.
- * (optional) `blobxfer_extra_options` are any extra options to pass to
- `blobxfer`.
-* (optional) `ssh` is the property for creating a user to accomodate SSH
-sessions to compute nodes. If this property is absent, then an SSH user is not
-created with pool creation.
- * (required) `username` is the user to create on the compute nodes.
- * (optional) `expiry_days` is the number of days from now for the account on
- the compute nodes to expire. The default is 30 days from invocation time.
- * (optional) `ssh_public_key` is the path to an existing SSH public key to
- use. If not specified, an RSA public/private keypair will be automatically
- generated only on Linux. If this is `null` or not specified on Windows,
- the SSH user is not created.
- * (optional) `generate_docker_tunnel_script` property directs script to
- generate an SSH tunnel script that can be used to connect to the remote
- Docker engine running on a compute node.
- * (optional) `generated_file_export_path` is the path to export the
- generated RSA keypair and docker tunnel script to. If omitted, the
- current directory is used.
- * (experimental) `hpn_server_swap` property enables an OpenSSH server with
- [HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh)
- to be swapped with the standard distribution OpenSSH server. This is not
- supported on all Linux distributions and may be force disabled.
-* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances)
-`gpu` property defines additional information for NVIDIA GPU-enabled VMs:
- * `nvidia_driver` property contains the following required members:
- * `source` is the source url to download the driver.
-* (optional) `additional_node_prep_commands` is an array of additional commands
-to execute on the compute node host as part of node preparation. This can
-be empty or omitted.
-
-An example pool json template can be found
-[here](../config\_templates/pool.json).
-
-### Jobs
-The jobs schema is as follows:
-
-```json
-{
- "job_specifications": [
- {
- "id": "dockerjob",
- "multi_instance_auto_complete": true,
- "environment_variables": {
- "abc": "xyz"
- },
- "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
- "max_task_retries": 3,
- "input_data": {
- "azure_batch": [
- {
- "job_id": "someotherjob",
- "task_id": "task-a",
- "include": ["wd/*.dat"],
- "exclude": ["*.txt"],
- "destination": null
- }
- ],
- "azure_storage": [
- {
- "storage_account_settings": "mystorageaccount",
- "container": "jobcontainer",
- "include": ["jobdata*.bin"],
- "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata",
- "blobxfer_extra_options": null
- }
- ]
- },
- "tasks": [
- {
- "id": null,
- "depends_on": [
- "taskid-a", "taskid-b", "taskid-c"
- ],
- "depends_on_range": [
- 1, 10
- ],
- "image": "busybox",
- "name": null,
- "labels": [],
- "environment_variables": {
- "def": "123"
- },
- "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv",
- "ports": [],
- "data_volumes": [
- "contdatavol",
- "hosttempvol"
- ],
- "shared_data_volumes": [
- "azurefilevol"
- ],
- "resource_files": [
- {
- "file_path": "",
- "blob_source": "",
- "file_mode": ""
- }
- ],
- "input_data": {
- "azure_batch": [
- {
- "job_id": "previousjob",
- "task_id": "mytask1",
- "include": ["wd/output/*.bin"],
- "exclude": ["*.txt"],
- "destination": null
- }
- ],
- "azure_storage": [
- {
- "storage_account_settings": "mystorageaccount",
- "container": "taskcontainer",
- "include": ["taskdata*.bin"],
- "destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata",
- "blobxfer_extra_options": null
- }
- ]
- },
- "output_data": {
- "azure_storage": [
- {
- "storage_account_settings": "mystorageaccount",
- "container": "output",
- "source": null,
- "include": ["**/out*.dat"],
- "blobxfer_extra_options": null
- }
- ]
- },
- "remove_container_after_exit": true,
- "shm_size": "256m",
- "additional_docker_run_options": [
- ],
- "infiniband": false,
- "gpu": false,
- "max_task_retries": 3,
- "retention_time": "1.12:00:00",
- "multi_instance": {
- "num_instances": "pool_current_dedicated",
- "coordination_command": null,
- "resource_files": [
- {
- "file_path": "",
- "blob_source": "",
- "file_mode": ""
- }
- ]
- },
- "entrypoint": null,
- "command": ""
- }
- ]
- }
- ]
-}
-```
-
-`job_specifications` array consists of jobs to create.
-* (required) `id` is the job id to create. If the job already exists, the
-specified `tasks` under the job will be added to the existing job.
-* (optional) `multi_instance_auto_complete` enables auto-completion of the job
-for which a multi-task instance is run. This allows automatic cleanup of the
-Docker container in multi-instance tasks. This is defaulted to `true` when
-multi-instance tasks are specified.
-* (optional) `environment_variables` under the job are environment variables
-which will be applied to all tasks operating under the job. Note that
-environment variables are not expanded and are passed as-is. You will need
-to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist`
-in a shell within the docker `command` or `entrypoint` if you want any
-environment variables to be expanded.
-* (optional) `environment_variables_keyvault_secret_id` under the job are
-environment variables stored in KeyVault that should be applied to all tasks
-operating under the job. The secret stored in KeyVault must be a valid json
-string, e.g., `{ "env_var_name": "env_var_value" }`.
-* (optional) `max_task_retries` sets the maximum number of times that
-Azure Batch should retry all tasks in this job for. By default, Azure Batch
-does not retry tasks that fail (i.e. `max_task_retries` is 0).
-* (optional) `input_data` is an object containing data that should be
-ingressed for the job. Any `input_data` defined at this level will be
-downloaded for this job which can be run on any number of compute nodes
-depending upon the number of constituent tasks and repeat invocations. However,
-`input_data` is only downloaded once per job invocation on a compute node.
-For example, if `job-1`:`task-1` is run on compute node A and then
-`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed
-to both compute node A and B. However, if `job-1`:`task-3` is then run on
-compute node A after `job-1`:`task-1`, then the `input_data` is not
-transferred again. This object currently supports `azure_batch` and
-`azure_storage` as members.
- * `azure_batch` contains the following members:
- * (required) `job_id` the job id of the task
- * (required) `task_id` the id of the task to fetch files from
- * (optional) `include` is an array of include filters
- * (optional) `exclude` is an array of exclude filters
- * (required) `destination` is the destination path to place the files
- * `azure_storage` contains the following members:
- * (required) `storage_account_settings` contains a storage account link
- as defined in the credentials json.
- * (required) `container` or `file_share` is required when downloading
- from Azure Blob Storage or Azure File Storage, respectively.
- `container` specifies which container to download from for Azure Blob
- Storage while `file_share` specifies which file share to download from
- for Azure File Storage. Only one of these properties can be specified
- per `data_transfer` object.
- * (optional) `include` property defines an optional include filter.
- Although this property is an array, it is only allowed to have 1
- maximum filter.
- * (required) `destination` property defines where to place the
- downloaded files on the host file system. Please note that you should
- not specify a destination that is on a shared file system. If you
- require ingressing to a shared file system location like a GlusterFS
- volume, then use the global configuration `files` property and the
- `data ingress` command.
- * (optional) `blobxfer_extra_options` are any extra options to pass to
- `blobxfer`.
-* (required) `tasks` is an array of tasks to add to the job.
- * (optional) `id` is the task id. Note that if the task `id` is null or
- empty then a generic task id will be assigned. The generic task id is
- formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is
- increased by 1 for each task added to the same job. If there are more
- than `99999` autonamed tasks in a job then the numbering is not
- padded for tasks exceeding 5 digits.
- * (optional) `depends_on` is an array of task ids for which this container
- invocation (task) depends on and must run to successful completion prior
- to this task executing.
- * (optional) `depends_on_range` is an array with exactly two integral
- elements containing a task `id` range for which this task is dependent
- upon, i.e., the start `id` and the end `id` for which this task depends
- on. Although task `id`s are always strings, the dependent task `id`s for
- ranges must be expressed by their integral representation for this
- property. This also implies that task `id`s for which this task depends
- on must be integral in nature. For example, if `depends_on_range` is set
- to `[1, 10]` (note the integral members), then there should be task
- `id`s of `"1"`, `"2"`, ... `"10"` within the job. Once these dependent
- tasks complete successfully, then this specified task will execute.
- * (required) `image` is the Docker image to use for this task
- * (optional) `name` is the name to assign to the container. If not
- specified, the value of the `id` property will be used for `name`.
- * (optional) `labels` is an array of labels to apply to the container.
- * (optional) `environment_variables` are any additional task-specific
- environment variables that should be applied to the container. Note that
- environment variables are not expanded and are passed as-is. You will
- need to source the environment file
- `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the
- docker `command` or `entrypoint` if you want any environment variables
- to be expanded.
- * (optional) `environment_variables_keyvault_secret_id` are any additional
- task-specific environment variables that should be applied to the
- container but are stored in KeyVault. The secret stored in KeyVault must
- be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`.
- * (optional) `ports` is an array of port specifications that should be
- exposed to the host.
- * (optional) `data_volumes` is an array of `data_volume` aliases as defined
- in the global configuration file. These volumes will be mounted in the
- container.
- * (optional) `shared_data_volumes` is an array of `shared_data_volume`
- aliases as defined in the global configuration file. These volumes will be
- mounted in the container.
- * (optional) `resource_files` is an array of resource files that should be
- downloaded as part of the task. Each array entry contains the following
- information:
- * `file_path` is the path within the task working directory to place the
- file on the compute node.
- * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure
- Blob Storage URL.
- * `file_mode` if the file mode to set for the file on the compute node.
- This is optional.
- * (optional) `input_data` is an object containing data that should be
- ingressed for this specific task. This object currently supports
- `azure_batch` and `azure_storage` as members. Note for multi-instance
- tasks, transfer of `input_data` is only applied to the task running the
- application command.
- * `azure_batch` contains the following members:
- * (required) `job_id` the job id of the task
- * (required) `task_id` the id of the task to fetch files from
- * (optional) `include` is an array of include filters
- * (optional) `exclude` is an array of exclude filters
- * (optional) `destination` is the destination path to place the files.
- If `destination` is not specified at this level, then files are
- defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`.
- * `azure_storage` contains the following members:
- * (required) `storage_account_settings` contains a storage account link
- as defined in the credentials json.
- * (required) `container` or `file_share` is required when downloading
- from Azure Blob Storage or Azure File Storage, respectively.
- `container` specifies which container to download from for Azure Blob
- Storage while `file_share` specifies which file share to download from
- for Azure File Storage. Only one of these properties can be specified
- per `data_transfer` object.
- * (optional) `include` property defines an optional include filter.
- Although this property is an array, it is only allowed to have 1
- maximum filter.
- * (optional) `destination` property defines where to place the
- downloaded files on the host file system. Unlike the job-level
- version of `input_data`, this `destination` property can be ommitted.
- If `destination` is not specified at this level, then files are
- defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note
- that you should not specify a destination that is on a shared file
- system. If you require ingressing to a shared file system location
- like a GlusterFS volume, then use the global configuration `files`
- property and the `data ingress` command.
- * (optional) `blobxfer_extra_options` are any extra options to pass to
- `blobxfer`.
- * (optional) `output_data` is an object containing data that should be
- egressed for this specific task if and only if the task completes
- successfully. This object currently only supports `azure_storage` as a
- member. Note for multi-instance tasks, transfer of `output_data` is only
- applied to the task running the application command.
- * `azure_storage` contains the following members:
- * (required) `storage_account_settings` contains a storage account link
- as defined in the credentials json.
- * (required) `container` or `file_share` is required when uploading to
- Azure Blob Storage or Azure File Storage, respectively. `container`
- specifies which container to upload to for Azure Blob Storage while
- `file_share` specifies which file share to upload to for Azure File
- Storage. Only one of these properties can be specified per
- `data_transfer` object.
- * (optional) `source` property defines which directory to upload to
- Azure storage. If `source` is not specified, then `source` is
- defaulted to `$AZ_BATCH_TASK_DIR`.
- * (optional) `include` property defines an optional include filter.
- Although this property is an array, it is only allowed to have 1
- maximum filter.
- * (optional) `blobxfer_extra_options` are any extra options to pass to
- `blobxfer`.
- * (optional) `remove_container_after_exit` property specifies if the
- container should be automatically removed/cleaned up after it exits. This
- defaults to `false`.
- * (optional) `shm_size` property specifies the size of `/dev/shm` in
- the container. The default is `64m`. The postfix unit can be designated
- as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). This
- value may need to be increased from the default of `64m` for certain
- Docker applications, including multi-instance tasks using Intel MPI
- (see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)).
- * (optional) `additional_docker_run_options` is an array of addition Docker
- run options that should be passed to the Docker daemon when starting this
- container.
- * (optional) `infiniband` designates if this container requires access to the
- Infiniband/RDMA devices on the host. Note that this will automatically
- force the container to use the host network stack. If this property is
- set to `true`, ensure that the `pool_specification` property
- `inter_node_communication_enabled` is set to `true`.
- * (optional) `gpu` designates if this container requires access to the GPU
- devices on the host. If this property is set to `true`, Docker containers
- are instantiated via `nvidia-docker`. This requires N-series VM instances.
- * (optional) `max_task_retries` sets the maximum number of times that
- Azure Batch should retry this task for. This overrides the job-level task
- retry count. By default, Azure Batch does not retry tasks that fail
- (i.e. `max_task_retries` is 0).
- * (optional) `retention_time` sets the timedelta to retain the task
- directory on the compute node where it ran after the task completes.
- The format for this property is a timedelta with a string representation
- of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node
- to clean up this task's directory 36 hours after the task completed. The
- default, if unspecified, is effectively infinite - i.e., task data is
- retained forever on the compute node that ran the task.
- * (optional) `multi_instance` is a property indicating that this task is a
- multi-instance task. This is required if the Docker image is an MPI
- program. Additional information about multi-instance tasks and Batch
- Shipyard can be found
- [here](80-batch-shipyard-multi-instance-tasks.md). Do not define this
- property for tasks that are not multi-instance. Additional members of this
- property are:
- * `num_instances` is a property setting the number of compute node
- instances are required for this multi-instance task. This can be any one
- of the following:
- 1. An integral number
- 2. `pool_current_dedicated` which is the instantaneous reading of the
- target pool's current dedicated count during this function invocation.
- 3. `pool_specification_vm_count` which is the `vm_count` specified in the
- pool configuration.
- * `coordination_command` is the coordination command this is run by each
- instance (compute node) of this multi-instance task prior to the
- application command. This command must not block and must exit
- successfully for the multi-instance task to proceed. This is the command
- passed to the container in `docker run` for multi-instance tasks. This
- docker container instance will automatically be daemonized. This is
- optional and may be null.
- * `resource_files` is an array of resource files that should be downloaded
- as part of the multi-instance task. Each array entry contains the
- following information:
- * `file_path` is the path within the task working directory to place
- the file on the compute node.
- * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an
- Azure Blob Storage URL.
- * `file_mode` if the file mode to set for the file on the compute node.
- This is optional.
- * (optional) `entrypoint` is the property that can override the Docker image
- defined `ENTRYPOINT`.
- * (optional) `command` is the command to execute in the Docker container
- context. If this task is a regular non-multi-instance task, then this is
- the command passed to the container context during `docker run`. If this
- task is a multi-instance task, then this `command` is the application
- command and is executed with `docker exec` in the running Docker container
- context from the `coordination_command` in the `multi_instance` property.
- This property may be null.
-
-An example jobs json template can be found
-[here](../config\_templates/jobs.json).
-
## Batch Shipyard Usage
Continue on to [Batch Shipyard Usage](20-batch-shipyard-usage.md).
diff --git a/docs/11-batch-shipyard-configuration-credentials.md b/docs/11-batch-shipyard-configuration-credentials.md
index 9db8fae..44e365f 100644
--- a/docs/11-batch-shipyard-configuration-credentials.md
+++ b/docs/11-batch-shipyard-configuration-credentials.md
@@ -36,7 +36,7 @@ The credentials schema is as follows:
"rsa_private_key_pem": "/path/to/privkey.pem",
"x509_cert_sha1_thumbprint": "01AB02CD...",
"user": "me@domain.com",
- "password": "password"
+ "password": "password",
"token_cache": {
"enabled": true,
"filename": ""
@@ -59,6 +59,7 @@ The credentials schema is as follows:
"filename": ""
}
},
+ "resource_group": "",
"account_key": "batchaccountkey",
"account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey"
},
@@ -149,9 +150,12 @@ under the `batch` property can be found in the
* (required) `account_service_url` is the Batch account service URL.
* (required for UserSubscription accounts, optional otherwise) `aad` AAD
authentication parameters for Azure Batch.
+ * (optional) `resource_group` is the resource group containing the Batch
+ account. This is only required if using a UserSubscription Batch account
+ with `aad` authentication.
* (required unless `aad` is specified) `account_key` is the shared
- key. This is required for non-AAD logins. This is ignored if the `aad`
- property is specified.
+  key. This is required for non-AAD logins. This option takes precedence
+ over the `aad` property if specified.
* (optional) `account_key_keyvault_secret_id` property can be used to
reference an Azure KeyVault secret id. Batch Shipyard will contact the
specified KeyVault and replace the `account_key` value as returned by
diff --git a/docs/12-batch-shipyard-configuration-global.md b/docs/12-batch-shipyard-configuration-global.md
new file mode 100644
index 0000000..7bc583c
--- /dev/null
+++ b/docs/12-batch-shipyard-configuration-global.md
@@ -0,0 +1,418 @@
+# Batch Shipyard Global Configuration
+This page contains in-depth details on how to configure the global
+json file for Batch Shipyard.
+
+## Schema
+The global config schema is as follows:
+
+```json
+{
+ "batch_shipyard": {
+ "storage_account_settings": "mystorageaccount",
+ "storage_entity_prefix": "shipyard",
+ "generated_sas_expiry_days": 90,
+ "encryption" : {
+ "enabled": true,
+ "pfx": {
+ "filename": "encrypt.pfx",
+ "passphrase": "mysupersecretpassword",
+ "sha1_thumbprint": "123456789..."
+ },
+ "public_key_pem": "encrypt.pem"
+ }
+ },
+ "docker_registry": {
+ "private": {
+ "allow_public_docker_hub_pull_on_missing": true,
+ "server": "myserver-myorg.azurecr.io",
+ "azure_storage": {
+ "storage_account_settings": "mystorageaccount",
+ "container": "mydockerregistry"
+ }
+ }
+ },
+ "data_replication": {
+ "peer_to_peer": {
+ "enabled": true,
+ "compression": true,
+ "concurrent_source_downloads": 10,
+ "direct_download_seed_bias": null
+ },
+ "non_peer_to_peer_concurrent_downloading": true
+ },
+ "global_resources": {
+ "docker_images": [
+ "busybox",
+ "redis:3.2.3-alpine",
+ ],
+ "files": [
+ {
+ "source": {
+ "path": "/some/local/path/dir",
+ "include": ["*.dat"],
+ "exclude": ["*.bak"]
+ },
+ "destination": {
+ "shared_data_volume": "glustervol",
+ "relative_destination_path": "myfiles",
+ "data_transfer": {
+ "method": "multinode_scp",
+ "ssh_private_key": "id_rsa_shipyard",
+ "scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com",
+ "rsync_extra_options": "",
+ "split_files_megabytes": 500,
+ "max_parallel_transfers_per_node": 2
+ }
+ }
+ },
+ {
+ "source": {
+ "path": "/some/local/path/bound/for/blob",
+ "include": ["*.bin"]
+ },
+ "destination": {
+ "storage_account_settings": "mystorageaccount",
+ "data_transfer": {
+ "container": "mycontainer",
+ "blobxfer_extra_options": "--no-computefilemd5"
+ }
+ }
+ },
+ {
+ "source": {
+ "path": "/another/local/path/dir",
+ "include": [],
+ "exclude": []
+ },
+ "destination": {
+ "relative_destination_path": "relpath/on/host",
+ "data_transfer": {
+ "method": "rsync+ssh",
+ "ssh_private_key": "id_rsa_shipyard",
+ "scp_ssh_extra_options": "-c aes256-gcm@openssh.com",
+ "rsync_extra_options": "-v"
+ }
+ }
+ }
+ ],
+ "docker_volumes": {
+ "data_volumes": {
+ "abcvol": {
+ "host_path": null,
+ "container_path": "/abc"
+ },
+ "hosttempvol": {
+ "host_path": "/tmp",
+ "container_path": "/hosttmp"
+ }
+ },
+ "shared_data_volumes": {
+ "shipyardvol": {
+ "volume_driver": "azurefile",
+ "storage_account_settings": "mystorageaccount",
+ "azure_file_share_name": "shipyardshared",
+ "container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile",
+ "mount_options": [
+ "filemode=0777",
+ "dirmode=0777",
+ "nolock=true"
+ ]
+ },
+ "glustervol": {
+ "volume_driver": "glusterfs_on_compute",
+ "container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs",
+ "volume_type": "replica",
+ "volume_options": [
+ "performance.cache-size 1 GB",
+ "performance.cache-max-file-size 10 MB",
+ "performance.cache-refresh-timeout 61",
+ ]
+ }
+ }
+ }
+ }
+}
+```
+
+The `batch_shipyard` property configures general settings for the tool.
+* (required) `storage_account_settings` is a link to the alias of the storage
+account specified, in this case `mystorageaccount`. Batch Shipyard requires
+a storage account for storing metadata in order to execute across a
+distributed environment.
+* (optional) `storage_entity_prefix` property is used as a generic qualifier
+to prefix storage containers (blob containers, tables, queues) with. If not
+specified, defaults to `shipyard`.
+* (optional) `generated_sas_expiry_days` property sets the number of days
+that any SAS key generated by Batch Shipyard is valid for. The default is 30
+days. This is useful if you have long-lived pools and want to ensure that
+SAS keys are valid for longer periods of time.
+* (optional) `encryption` object is used to define credential encryption which
+contains the following members:
+ * (required) `enabled` property enables or disables this feature.
+ * (required) `pfx` object defines the PFX certificate
+ * (required) `filename` property is the full path and name to the PFX
+ certificate
+ * (required) `passphrase` property is the passphrase for the PFX
+ certificate. This cannot be empty.
+ * (optional) `sha1_thumbprint` is the SHA1 thumbprint of the
+ certificate. If the PFX file is created using the `cert create` command,
+ then the SHA1 thumbprint is output. It is recommended to populate this
+ property such that it does not have to be generated when needed for
+ encryption.
+ * (optional) `public_key_pem` property is the full path and name to the
+ RSA public key in PEM format. If the PFX file is created using the
+ `cert create` command, then this file is generated along with the PFX
+ file. It is recommended to populate this property with the PEM file path
+ such that it does not have to be generated when needed for encryption.
+
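+Since only `storage_account_settings` is required in this section, a minimal
+`batch_shipyard` fragment can be reduced to the following sketch; the alias
+value is illustrative and must correspond to a storage account entry in the
+credentials json:
+
+```json
+{
+  "batch_shipyard": {
+    "storage_account_settings": "mystorageaccount"
+  }
+}
+```
+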
+The `docker_registry` property is used to configure Docker image distribution
+options from public/private Docker hub and private registries.
+* (optional) `private` property controls settings for interacting with private
+registries. There are three kinds of private registries that are supported:
+(1) private registries hosted on Docker Hub, (2) Internet accessible
+registries such as those hosted by the
+[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/)
+service and (3) [private registry instances backed to
+Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/)
+that are run on compute nodes. To use private registries hosted on Docker Hub,
+no additional properties need to be specified here; instead, specify your
+Docker Hub login information in the credentials json. To specify a private
+registry other than on Docker Hub, a json property named `server` should be
+defined. To use a private registry backed by Azure Blob Storage, define a
+json object named `azure_storage`. Note that only one of these three types
+of private registries may be specified at a time. The following
+describes members of the non-Docker Hub private registries supported:
+  * (optional) `server` property is the fully-qualified host
+    name of a private registry server. A specific port other than 80 can be
+ specified using a `:` separator, e.g.,
+ `mydockerregistry.com:8080`. Port 80 is the default if no port is
+ specified. The value of this property should have an associated login
+ in the credentials json file.
+ * (optional) `azure_storage` object is to define settings for connecting
+ to a private registry backed by Azure Storage blobs and where the
+ private registry instances are hosted on the compute nodes themselves.
+ * (required) `storage_account_settings` is a link to the alias of the
+ storage account specified that stores the private registry blobs.
+ * (required) `container` property is the name of the Azure Blob
+ container holding the private registry blobs.
+ * (optional) `allow_public_docker_hub_pull_on_missing` property allows
+ pass-through of Docker image retrieval to public Docker Hub if it is
+ missing in the private registry. This defaults to `false` if not
+ specified. Note that this setting does not apply to a missing Docker
+ image that is allowed to run via the job property
+ `allow_run_on_missing_image`.
+
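+For example, to use only an Internet-accessible private registry such as one
+hosted by the Azure Container Registry service, a sketch of this section can
+contain just the `server` property (with the corresponding login placed in
+the credentials json); the host name below is illustrative:
+
+```json
+{
+  "docker_registry": {
+    "private": {
+      "server": "myserver-myorg.azurecr.io",
+      "allow_public_docker_hub_pull_on_missing": false
+    }
+  }
+}
+```
+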
+The `data_replication` property is used to configure the internal image
+replication mechanism between compute nodes within a compute pool. The
+`non_peer_to_peer_concurrent_downloading` property specifies whether to allow
+unfettered concurrent downloading from the source registry among
+all compute nodes. The following options apply to `peer_to_peer` data
+replication options:
+* (optional) `enabled` property enables or disables private peer-to-peer
+transfer. Note that for compute pools with a relatively small number of VMs,
+peer-to-peer transfer may not provide any benefit and is recommended to be
+disabled in these cases. Compute pools with a large number of VMs, especially
+in the case of an Azure Storage-backed private registry, can benefit from
+peer-to-peer image replication.
+* `compression` property enables or disables compression of image files. It
+is strongly recommended to keep this enabled.
+* `concurrent_source_downloads` property specifies the number of
+simultaneous downloads allowed for each image.
+* `direct_download_seed_bias` property sets the number of direct download
+seeds to prefer per image before switching to peer-to-peer transfer.
+
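+For instance, following the guidance above for pools with a relatively small
+number of VMs, a sketch of this section with peer-to-peer transfer disabled
+could look like the following (values mirror the schema above and are
+illustrative):
+
+```json
+{
+  "data_replication": {
+    "peer_to_peer": {
+      "enabled": false,
+      "compression": true,
+      "concurrent_source_downloads": 10,
+      "direct_download_seed_bias": null
+    },
+    "non_peer_to_peer_concurrent_downloading": true
+  }
+}
+```
+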
+The `global_resources` property contains information regarding required
+Docker images, volume configuration and data ingress information. This
+property is required.
+
+`docker_images` is an array of docker images that should be installed on
+every compute node when this configuration file is supplied while creating
+a compute pool. Image tags are supported. Image names should not include
+private registry server names, as these will be automatically prepended. For
+instance, if you have an image `abc/mytag` on your private registry
+`myregistry-myorg.azurecr.io`, your image should be named in the
+`docker_images` array as `abc/mytag` and not
+`myregistry-myorg.azurecr.io/abc/mytag`.
+
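+To illustrate the naming rule above as a sketch, the image `abc/mytag` hosted
+on the private registry `myregistry-myorg.azurecr.io` (both names are taken
+from the example above) would be listed as:
+
+```json
+{
+  "global_resources": {
+    "docker_images": [
+      "abc/mytag"
+    ]
+  }
+}
+```
+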
+`files` is an optional property that specifies data that should be ingressed
+from a location accessible by the local machine (i.e., the machine invoking
+`shipyard.py`) to a shared file system location accessible by compute nodes
+in the pool, or to Azure Blob or File Storage. `files` is a json list of
+objects, which allows multiple source-to-destination pairs to be ingressed
+during the
+same invocation. Note that no Azure Batch environment variables
+(i.e., `$AZ_BATCH_`-style environment variables) are available as path
+arguments since ingress actions performed within `files` are done locally
+on the machine invoking `shipyard.py`. Each object within the `files` list
+contains the following members:
+* (required) `source` property contains the following members:
+ * (required) `path` is a local path. A single file or a directory
+ can be specified. Filters below will be ignored if `path` is a file and
+ not a directory.
+ * (optional) `include` is an array of
+ [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
+ where only files matching a filter are included in the data transfer.
+ Filters specified in `include` have precedence over `exclude` described
+ next. `include` can only have a maximum of 1 filter for ingress to Azure
+ Blob Storage. In this example, all files ending in `.dat` are ingressed.
+ * (optional) `exclude` is an array of
+ [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
+ where files matching a filter are excluded from the data transfer. Filters
+ specified in `include` have precedence over filters specified in
+ `exclude`. `exclude` cannot be specified for ingress into Azure Blob
+ Storage. In this example, all files ending in `.bak` are skipped for
+ ingress.
+* (required) `destination` property contains the following members:
+ * (required or optional) `shared_data_volume` or `storage_account_settings`
+ for data ingress to a GlusterFS volume or Azure Blob or File Storage. If
+ you are ingressing to a pool with only one compute node, you may omit
+ `shared_data_volume`. Otherwise, you may specify one or the other, but
+    not both in the same object. Please see the `shared_data_volumes`
+    section below for information on how to set up a GlusterFS share.
+ * (required or optional) `relative_destination_path` specifies a relative
+ destination path to place the files, with respect to the target root.
+ If transferring to a `shared_data_volume` then this is relative to the
+ GlusterFS volume root. If transferring to a pool with one single node in
+ it, thus, no `shared_data_volume` is specified in the prior property, then
+ this is relative to
+ [$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories).
+ To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended),
+ you can specify this property as empty string when not ingressing to
+ a `shared_data_volume`. Note that if `scp` is selected while attempting
+ to transfer directly to this aforementioned path, then `scp` will fail
+    with an exit code of 1, but the transfer will have succeeded (this is due
+ to some of the permission options). If this property is not specified for
+ a `shared_data_volume`, then files will be placed directly in the
+    GlusterFS volume root. This property cannot be specified for an Azure
+ Storage destination (i.e., `storage_account_settings`).
+ * (required) `data_transfer` specifies how the transfer should take place.
+ The following list contains members for GlusterFS ingress when a GlusterFS
+ volume is provided for `shared_data_volume` (see below for ingressing to
+ Azure Blob or File Storage):
+    * (required) `method` specifies which method should be used to ingress
+ data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
+ `multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
+ directory (recursively) to the remote share path. `multinode_scp` will
+ attempt to simultaneously transfer files to many compute nodes using
+ `scp` at the same time to speed up data transfer. `rsync+ssh` will
+ perform an rsync of files through SSH. `multinode_rsync+ssh` will
+ attempt to simultaneously transfer files using `rsync` to many compute
+      nodes at the same time to speed up data transfer. Note that you may
+ specify the `multinode_*` methods even with only 1 compute node in a
+ pool which will allow you to take advantage of
+ `max_parallel_transfers_per_node` below.
+ * (optional) `ssh_private_key` location of the SSH private key for the
+ username specified in the `pool_specification`:`ssh` section when
+ connecting to compute nodes. The default is `id_rsa_shipyard`, if
+ omitted, which is automatically generated if no SSH key is specified
+ when an SSH user is added to a pool.
+ * (optional) `scp_ssh_extra_options` are any extra options to pass to
+ `scp` or `ssh` for `scp`/`multinode_scp` or
+ `rsync+ssh`/`multinode_rsync+ssh` methods, respectively. In the example
+ above, `-C` enables compression and `-c aes256-gcm@openssh.com`
+ is passed to `scp`, which can potentially increase the transfer speed by
+ selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel
+ AES-NI.
+ * (optional) `rsync_extra_options` are any extra options to pass to
+ `rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This
+ property is ignored for non-rsync transfer methods.
+ * (optional) `split_files_megabytes` splits files into chunks with the
+ specified size in MiB. This can potentially help with very large files.
+ This option forces the transfer `method` to `multinode_scp`.
+ Note that the destination file system must be able to accommodate
+ up to 2x the size of files which are split. Additionally, transfers
+ involving files which are split will incur reconstruction costs after
+ the transfer is complete, which will increase the total end-to-end
+      ingress time. However, in certain scenarios, splitting files and
+      transferring chunks in parallel, even with reconstruction, may end up
+ being faster than transferring a large file without chunking.
+ * (optional) `max_parallel_transfers_per_node` is the maximum number of
+      parallel transfers to invoke per node with the
+ `multinode_scp`/`multinode_rsync+ssh` methods. For example, if there
+ are 3 compute nodes in the pool, and `2` is given for this option, then
+ there will be up to 2 scp sessions in parallel per compute node for a
+ maximum of 6 concurrent scp sessions to the pool. The default is 1 if
+ not specified or omitted.
+ * (required) `data_transfer` specifies how the transfer should take place.
+ When Azure Blob or File Storage is selected as the destination for data
+ ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The
+ following list contains members for Azure Blob or File Storage ingress
+ when a storage account link is provided for `storage_account_settings`:
+ * (required) `container` or `file_share` is required when uploading to
+ Azure Blob Storage or Azure File Storage, respectively. `container`
+ specifies which container to upload to for Azure Blob Storage while
+ `file_share` specifies which file share to upload to for Azure File
+ Storage. Only one of these properties can be specified per
+ `data_transfer` object. The container or file share need not be created
+ beforehand.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`. In the example above, `--no-computefilemd5` will force
+ `blobxfer` to skip MD5 calculation on files ingressed.
+
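+The schema above only shows ingress to an Azure Blob Storage container. As a
+sketch, an analogous `files` entry targeting an Azure File Storage share via
+`file_share` (the local path and share name are illustrative) could look
+like:
+
+```json
+{
+  "source": {
+    "path": "/some/local/path/bound/for/files",
+    "include": ["*.bin"]
+  },
+  "destination": {
+    "storage_account_settings": "mystorageaccount",
+    "data_transfer": {
+      "file_share": "myfileshare"
+    }
+  }
+}
+```
+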
+`docker_volumes` is an optional property that can consist of two
+different types of volumes: `data_volumes` and `shared_data_volumes`.
+`data_volumes` can be of two flavors, depending upon whether `host_path` is
+set to null. In the former, this is typically used with the `VOLUME` keyword
+in Dockerfiles to initialize a data volume with existing data inside the
+image. If `host_path` is set, then the path on the host is mounted in the
+container at the path specified with `container_path`.
+
+`shared_data_volumes` is an optional property for initializing persistent
+shared storage volumes. In the first shared volume, `shipyardvol` is the alias
+of this volume:
+* `volume_driver` property specifies the Docker Volume Driver to use.
+Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or
+`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker
+Volume Driver. For this volume (`shipyardvol`), as this is an Azure File
+shared volume, the `volume_driver` should be set as `azurefile`.
+* `storage_account_settings` is a link to the alias of the storage account
+specified that holds this Azure File Share.
+* `azure_file_share_name` is the name of the share on Azure Files. Note
+that the Azure File share must be created beforehand; the toolkit does not
+create Azure File shares, it only mounts them to the compute nodes.
+* `container_path` is the path in the container to mount.
+* `mount_options` are the mount options to pass to the mount command. Supported
+options are documented
+[here](https://github.com/Azure/azurefile-dockervolumedriver). It is
+recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and
+`gid` cannot be reliably determined before the compute pool is allocated and
+this volume will be mounted as the root user.
+
+Note that when using `azurefile` for a shared data volume, the storage account
+that holds the file share must reside within the same Azure region as the
+Azure Batch compute pool. Attempting to mount an Azure File share that is
+cross-region will result in failure as current Linux Samba clients do not
+support share level encryption at this time.
+
+The second shared volume, `glustervol`, is a
+[GlusterFS](https://www.gluster.org/) network file system. Please note that
+`glusterfs_on_compute` volumes are co-located on the VM's temporary
+local disk space which is a shared resource. Sizes of the local temp disk for
+each VM size can be found
+[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/).
+If specifying a `glusterfs_on_compute` volume, you must enable internode
+communication in the pool configuration file. These volumes have the following
+properties:
+* (required) `volume_driver` property should be set as `glusterfs_on_compute`.
+* (required) `container_path` is the path in the container to mount.
+* (optional) `volume_type` property defines the GlusterFS volume type.
+Currently, `replica` is the only supported type.
+* (optional) `volume_options` property defines additional GlusterFS volume
+options to set.
+
+`glusterfs_on_compute` volumes are mounted on the host at
+`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically
+replace container path references in direct and storage-based data
+ingress/egress with their host path equivalents.
+
+Note that when resizing a pool with a `glusterfs_on_compute` shared file
+system, you must resize with the `pool resize` command in `shipyard.py`
+and not with the Azure Portal, Batch Explorer or any other tool.
+
+Finally, note that all `docker_volumes` can be omitted completely along with
+one or all of `data_volumes` and `shared_data_volumes` if you do not require
+this functionality.
+
+## Full template
+A full template of the global configuration file can be found
+[here](../config\_templates/config.json). Note that this template cannot
+be used as-is and must be modified to fit your scenario.
diff --git a/docs/13-batch-shipyard-configuration-pool.md b/docs/13-batch-shipyard-configuration-pool.md
new file mode 100644
index 0000000..f902214
--- /dev/null
+++ b/docs/13-batch-shipyard-configuration-pool.md
@@ -0,0 +1,163 @@
+# Batch Shipyard Pool Configuration
+This page contains in-depth details on how to configure the pool
+json file for Batch Shipyard.
+
+## Schema
+The pool schema is as follows:
+
+```json
+{
+ "pool_specification": {
+ "id": "dockerpool",
+ "vm_size": "STANDARD_A9",
+ "vm_count": 10,
+ "max_tasks_per_node": 1,
+ "inter_node_communication_enabled": true,
+ "publisher": "OpenLogic",
+ "offer": "CentOS-HPC",
+ "sku": "7.1",
+ "reboot_on_start_task_failed": true,
+ "block_until_all_global_resources_loaded": true,
+ "transfer_files_on_pool_creation": false,
+ "input_data": {
+ "azure_batch": [
+ {
+ "job_id": "jobonanotherpool",
+ "task_id": "mytask",
+ "include": ["wd/*.dat"],
+ "exclude": ["*.txt"],
+ "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool"
+ }
+ ],
+ "azure_storage": [
+ {
+ "storage_account_settings": "mystorageaccount",
+ "container": "poolcontainer",
+ "include": ["pooldata*.bin"],
+ "destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata",
+ "blobxfer_extra_options": null
+ }
+ ]
+ },
+ "ssh": {
+ "username": "docker",
+ "expiry_days": 7,
+ "ssh_public_key": null,
+ "generate_docker_tunnel_script": true,
+ "generated_file_export_path": null,
+ "hpn_server_swap": false
+ },
+ "gpu": {
+ "nvidia_driver": {
+ "source": "https://some.url"
+ }
+ },
+ "additional_node_prep_commands": [
+ ]
+ }
+}
+```
+
+The `pool_specification` property has the following members (a minimal
+sketch follows this list):
+* (required) `id` is the compute pool ID.
+* (required) `vm_size` is the
+[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/).
+Please note that not all regions have every VM size available.
+* (required) `vm_count` is the number of compute nodes to allocate.
+* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks
+that can be running at any one time on a compute node. This defaults to a
+value of 1 if not specified.
+* (optional) `inter_node_communication_enabled` designates if this pool is set
+up for inter-node communication. This must be set to `true` for any containers
+that must communicate with each other such as MPI applications. This property
+will be force enabled if peer-to-peer replication is enabled.
+* (required) `publisher` is the publisher name of the Marketplace VM image.
+* (required) `offer` is the offer name of the Marketplace VM image.
+* (required) `sku` is the sku name of the Marketplace VM image.
+* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the
+compute node in case there is a transient failure in node preparation (e.g.,
+network timeout, resolution failure or download problem). This defaults to
+`false`.
+* (optional) `block_until_all_global_resources_loaded` will block the node
+from entering ready state until all Docker images are loaded. This defaults
+to `true`.
+* (optional) `transfer_files_on_pool_creation` will ingress all `files`
+specified in the `global_resources` section of the configuration json when
+the pool is created. If files are to be ingressed to Azure Blob or File
+Storage, then data movement operations are overlapped with the creation of the
+pool. If files are to be ingressed to a shared file system on the compute
+nodes, then the files are ingressed after the pool is created and the shared
+file system is ready. Files can be ingressed to both Azure Blob Storage and a
+shared file system during the same pool creation invocation. If this property
+is set to `true` then `block_until_all_global_resources_loaded` will be force
+disabled. If omitted, this property defaults to `false`.
+* (optional) `input_data` is an object containing data that should be
+ingressed to all compute nodes as part of node preparation. It is
+important to note that if you are combining this action with `files` and
+are ingressing data to Azure Blob or File Storage as part of pool creation,
+the blob containers or file shares defined here will be downloaded as
+soon as the compute node is ready to do so. This may result in the blob
+container/blobs or file share/files not being ready in time for the
+`input_data` transfer. It is up to you to ensure that these two operations do
+not overlap. If there is a possibility of overlap, then you should ingress
+data defined in `files` prior to pool creation and disable the
+`transfer_files_on_pool_creation` option above. This object currently supports
+`azure_batch` and `azure_storage` as members.
+ * `azure_batch` contains the following members:
+ * (required) `job_id` the job id of the task
+ * (required) `task_id` the id of the task to fetch files from
+ * (optional) `include` is an array of include filters
+ * (optional) `exclude` is an array of exclude filters
+ * (required) `destination` is the destination path to place the files
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when downloading
+ from Azure Blob Storage or Azure File Storage, respectively.
+ `container` specifies which container to download from for Azure Blob
+ Storage while `file_share` specifies which file share to download from
+ for Azure File Storage. Only one of these properties can be specified
+ per `data_transfer` object.
+ * (optional) `include` property defines an optional include filter.
+ Although this property is an array, it is only allowed to have 1
+ maximum filter.
+ * (required) `destination` property defines where to place the
+ downloaded files on the host file system. Please note that you should
+ not specify a destination that is on a shared file system. If you
+ require ingressing to a shared file system location like a GlusterFS
+ volume, then use the global configuration `files` property and the
+ `data ingress` command.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+* (optional) `ssh` is the property for creating a user to accommodate SSH
+sessions to compute nodes. If this property is absent, then an SSH user is not
+created with pool creation.
+ * (required) `username` is the user to create on the compute nodes.
+ * (optional) `expiry_days` is the number of days from now for the account on
+ the compute nodes to expire. The default is 30 days from invocation time.
+ * (optional) `ssh_public_key` is the path to an existing SSH public key to
+ use. If not specified, an RSA public/private keypair will be automatically
+ generated only on Linux. If this is `null` or not specified on Windows,
+ the SSH user is not created.
+  * (optional) `generate_docker_tunnel_script` property directs the script to
+ generate an SSH tunnel script that can be used to connect to the remote
+ Docker engine running on a compute node.
+ * (optional) `generated_file_export_path` is the path to export the
+ generated RSA keypair and docker tunnel script to. If omitted, the
+ current directory is used.
+ * (experimental) `hpn_server_swap` property enables an OpenSSH server with
+ [HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh)
+ to be swapped with the standard distribution OpenSSH server. This is not
+ supported on all Linux distributions and may be force disabled.
+* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances)
+`gpu` property defines additional information for NVIDIA GPU-enabled VMs:
+ * `nvidia_driver` property contains the following required members:
+ * `source` is the source url to download the driver.
+* (optional) `additional_node_prep_commands` is an array of additional commands
+to execute on the compute node host as part of node preparation. This can
+be empty or omitted.
+
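+As noted at the top of the property list, a minimal sketch of a
+`pool_specification` containing only the required members (values copied
+from the schema above) could look like:
+
+```json
+{
+  "pool_specification": {
+    "id": "dockerpool",
+    "vm_size": "STANDARD_A9",
+    "vm_count": 10,
+    "publisher": "OpenLogic",
+    "offer": "CentOS-HPC",
+    "sku": "7.1"
+  }
+}
+```
+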
+## Full template
+A full template of the pool configuration file can be found
+[here](../config\_templates/pool.json). Note that this template cannot
+be used as-is and must be modified to fit your scenario.
diff --git a/docs/14-batch-shipyard-configuration-jobs.md b/docs/14-batch-shipyard-configuration-jobs.md
new file mode 100644
index 0000000..68bb8dd
--- /dev/null
+++ b/docs/14-batch-shipyard-configuration-jobs.md
@@ -0,0 +1,378 @@
+# Batch Shipyard Jobs Configuration
+This page contains in-depth details on how to configure the jobs
+json file for Batch Shipyard.
+
+## Schema
+The jobs schema is as follows:
+
+```json
+{
+ "job_specifications": [
+ {
+ "id": "dockerjob",
+ "multi_instance_auto_complete": true,
+ "environment_variables": {
+ "abc": "xyz"
+ },
+ "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
+ "max_task_retries": 3,
+ "allow_run_on_missing_image": false,
+ "input_data": {
+ "azure_batch": [
+ {
+ "job_id": "someotherjob",
+ "task_id": "task-a",
+ "include": ["wd/*.dat"],
+ "exclude": ["*.txt"],
+ "destination": null
+ }
+ ],
+ "azure_storage": [
+ {
+ "storage_account_settings": "mystorageaccount",
+ "container": "jobcontainer",
+ "include": ["jobdata*.bin"],
+ "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata",
+ "blobxfer_extra_options": null
+ }
+ ]
+ },
+ "tasks": [
+ {
+ "id": null,
+ "depends_on": [
+ "taskid-a", "taskid-b", "taskid-c"
+ ],
+ "depends_on_range": [
+ 1, 10
+ ],
+ "image": "busybox",
+ "name": null,
+ "labels": [],
+ "environment_variables": {
+ "def": "123"
+ },
+ "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv",
+ "ports": [],
+ "data_volumes": [
+ "contdatavol",
+ "hosttempvol"
+ ],
+ "shared_data_volumes": [
+ "azurefilevol"
+ ],
+ "resource_files": [
+ {
+ "file_path": "",
+ "blob_source": "",
+ "file_mode": ""
+ }
+ ],
+ "input_data": {
+ "azure_batch": [
+ {
+ "job_id": "previousjob",
+ "task_id": "mytask1",
+ "include": ["wd/output/*.bin"],
+ "exclude": ["*.txt"],
+ "destination": null
+ }
+ ],
+ "azure_storage": [
+ {
+ "storage_account_settings": "mystorageaccount",
+ "container": "taskcontainer",
+ "include": ["taskdata*.bin"],
+ "destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata",
+ "blobxfer_extra_options": null
+ }
+ ]
+ },
+ "output_data": {
+ "azure_storage": [
+ {
+ "storage_account_settings": "mystorageaccount",
+ "container": "output",
+ "source": null,
+ "include": ["**/out*.dat"],
+ "blobxfer_extra_options": null
+ }
+ ]
+ },
+ "remove_container_after_exit": true,
+ "shm_size": "256m",
+ "additional_docker_run_options": [
+ ],
+ "infiniband": false,
+ "gpu": false,
+ "max_task_retries": 3,
+ "retention_time": "1.12:00:00",
+ "multi_instance": {
+ "num_instances": "pool_current_dedicated",
+ "coordination_command": null,
+ "resource_files": [
+ {
+ "file_path": "",
+ "blob_source": "",
+ "file_mode": ""
+ }
+ ]
+ },
+ "entrypoint": null,
+ "command": ""
+ }
+ ]
+ }
+ ]
+}
+```
+
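+The schema above shows every available property. As a sketch, a minimal jobs
+file with a single job and a single task (the command shown is purely
+illustrative) can be reduced to:
+
+```json
+{
+  "job_specifications": [
+    {
+      "id": "dockerjob",
+      "tasks": [
+        {
+          "image": "busybox",
+          "command": "/bin/sh -c \"echo hello world\""
+        }
+      ]
+    }
+  ]
+}
+```
+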
+`job_specifications` array consists of jobs to create.
+* (required) `id` is the job id to create. If the job already exists, the
+specified `tasks` under the job will be added to the existing job.
+* (optional) `multi_instance_auto_complete` enables auto-completion of the job
+for which a multi-instance task is run. This allows automatic cleanup of the
+Docker container in multi-instance tasks. This is defaulted to `true` when
+multi-instance tasks are specified.
+* (optional) `environment_variables` under the job are environment variables
+which will be applied to all tasks operating under the job. Note that
+environment variables are not expanded and are passed as-is. You will need
+to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist`
+in a shell within the docker `command` or `entrypoint` if you want any
+environment variables to be expanded.
+* (optional) `environment_variables_keyvault_secret_id` under the job are
+environment variables stored in KeyVault that should be applied to all tasks
+operating under the job. The secret stored in KeyVault must be a valid json
+string, e.g., `{ "env_var_name": "env_var_value" }`.
+* (optional) `max_task_retries` sets the maximum number of times that
+Azure Batch should retry all tasks in this job for. By default, Azure Batch
+does not retry tasks that fail (i.e. `max_task_retries` is 0).
+* (optional) `allow_run_on_missing_image` allows tasks with a Docker image
+reference that was not pre-loaded onto the compute node via
+`global_resources`:`docker_images` in the global configuration to be able to
+run. Note that you should attempt to specify all Docker images that you intend
+to run in the `global_resources`:`docker_images` property in the global
+configuration to minimize scheduling to task execution latency.
+* (optional) `input_data` is an object containing data that should be
+ingressed for the job. Any `input_data` defined at this level will be
+downloaded for this job which can be run on any number of compute nodes
+depending upon the number of constituent tasks and repeat invocations. However,
+`input_data` is only downloaded once per job invocation on a compute node.
+For example, if `job-1`:`task-1` is run on compute node A and then
+`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed
+to both compute node A and B. However, if `job-1`:`task-3` is then run on
+compute node A after `job-1`:`task-1`, then the `input_data` is not
+transferred again. This object currently supports `azure_batch` and
+`azure_storage` as members.
+ * `azure_batch` contains the following members:
+ * (required) `job_id` the job id of the task
+ * (required) `task_id` the id of the task to fetch files from
+ * (optional) `include` is an array of include filters
+ * (optional) `exclude` is an array of exclude filters
+ * (required) `destination` is the destination path to place the files
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when downloading
+ from Azure Blob Storage or Azure File Storage, respectively.
+ `container` specifies which container to download from for Azure Blob
+ Storage while `file_share` specifies which file share to download from
+ for Azure File Storage. Only one of these properties can be specified
+ per `data_transfer` object.
+ * (optional) `include` property defines an optional include filter.
+ Although this property is an array, it is only allowed to have 1
+ maximum filter.
+ * (required) `destination` property defines where to place the
+ downloaded files on the host file system. Please note that you should
+ not specify a destination that is on a shared file system. If you
+ require ingressing to a shared file system location like a GlusterFS
+ volume, then use the global configuration `files` property and the
+ `data ingress` command.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+* (required) `tasks` is an array of tasks to add to the job.
+ * (optional) `id` is the task id. Note that if the task `id` is null or
+ empty then a generic task id will be assigned. The generic task id is
+ formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is
+ increased by 1 for each task added to the same job. If there are more
+ than `99999` autonamed tasks in a job then the numbering is not
+ padded for tasks exceeding 5 digits.
+ * (optional) `depends_on` is an array of task ids for which this container
+ invocation (task) depends on and must run to successful completion prior
+ to this task executing.
+  * (optional) `depends_on_range` is an array with exactly two integral
+    elements specifying the inclusive start `id` and end `id` of the task
+    `id` range that this task depends on. Although task `id`s are always
+    strings, the dependent task `id`s for ranges must be expressed by their
+    integral representation for this property. This also implies that the
+    task `id`s this task depends on must be integral in nature. For example,
+    if `depends_on_range` is set to `[1, 10]` (note the integral members),
+    then there should be task `id`s of `"1"`, `"2"`, ... `"10"` within the
+    job. Once these dependent tasks complete successfully, then this
+    specified task will execute. See the first sketch after this list for
+    an example.
+ * (required) `image` is the Docker image to use for this task
+ * (optional) `name` is the name to assign to the container. If not
+ specified, the value of the `id` property will be used for `name`.
+ * (optional) `labels` is an array of labels to apply to the container.
+ * (optional) `environment_variables` are any additional task-specific
+ environment variables that should be applied to the container. Note that
+ environment variables are not expanded and are passed as-is. You will
+ need to source the environment file
+ `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the
+ docker `command` or `entrypoint` if you want any environment variables
+ to be expanded.
+ * (optional) `environment_variables_keyvault_secret_id` are any additional
+ task-specific environment variables that should be applied to the
+ container but are stored in KeyVault. The secret stored in KeyVault must
+ be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`.
+ * (optional) `ports` is an array of port specifications that should be
+ exposed to the host.
+ * (optional) `data_volumes` is an array of `data_volume` aliases as defined
+ in the global configuration file. These volumes will be mounted in the
+ container.
+ * (optional) `shared_data_volumes` is an array of `shared_data_volume`
+ aliases as defined in the global configuration file. These volumes will be
+ mounted in the container.
+ * (optional) `resource_files` is an array of resource files that should be
+ downloaded as part of the task. Each array entry contains the following
+ information:
+ * `file_path` is the path within the task working directory to place the
+ file on the compute node.
+ * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure
+ Blob Storage URL.
+    * `file_mode` is the file mode to set for the file on the compute node.
+      This is optional.
+ * (optional) `input_data` is an object containing data that should be
+ ingressed for this specific task. This object currently supports
+ `azure_batch` and `azure_storage` as members. Note for multi-instance
+ tasks, transfer of `input_data` is only applied to the task running the
+ application command.
+ * `azure_batch` contains the following members:
+      * (required) `job_id` is the job id of the task
+      * (required) `task_id` is the id of the task to fetch files from
+ * (optional) `include` is an array of include filters
+ * (optional) `exclude` is an array of exclude filters
+ * (optional) `destination` is the destination path to place the files.
+ If `destination` is not specified at this level, then files are
+ defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`.
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when downloading
+ from Azure Blob Storage or Azure File Storage, respectively.
+ `container` specifies which container to download from for Azure Blob
+ Storage while `file_share` specifies which file share to download from
+ for Azure File Storage. Only one of these properties can be specified
+ per `data_transfer` object.
+ * (optional) `include` property defines an optional include filter.
+        Although this property is an array, it may contain at most one
+        filter.
+ * (optional) `destination` property defines where to place the
+ downloaded files on the host file system. Unlike the job-level
+        version of `input_data`, this `destination` property can be omitted.
+ If `destination` is not specified at this level, then files are
+ defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note
+ that you should not specify a destination that is on a shared file
+ system. If you require ingressing to a shared file system location
+ like a GlusterFS volume, then use the global configuration `files`
+ property and the `data ingress` command.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+ * (optional) `output_data` is an object containing data that should be
+ egressed for this specific task if and only if the task completes
+ successfully. This object currently only supports `azure_storage` as a
+ member. Note for multi-instance tasks, transfer of `output_data` is only
+ applied to the task running the application command.
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when uploading to
+ Azure Blob Storage or Azure File Storage, respectively. `container`
+ specifies which container to upload to for Azure Blob Storage while
+ `file_share` specifies which file share to upload to for Azure File
+ Storage. Only one of these properties can be specified per
+ `data_transfer` object.
+ * (optional) `source` property defines which directory to upload to
+ Azure storage. If `source` is not specified, then `source` is
+ defaulted to `$AZ_BATCH_TASK_DIR`.
+ * (optional) `include` property defines an optional include filter.
+        Although this property is an array, it may contain at most one
+        filter.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+ * (optional) `remove_container_after_exit` property specifies if the
+ container should be automatically removed/cleaned up after it exits. This
+ defaults to `false`.
+ * (optional) `shm_size` property specifies the size of `/dev/shm` in
+ the container. The default is `64m`. The postfix unit can be designated
+ as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). This
+ value may need to be increased from the default of `64m` for certain
+ Docker applications, including multi-instance tasks using Intel MPI
+ (see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)).
+  * (optional) `additional_docker_run_options` is an array of additional
+    Docker run options that should be passed to the Docker daemon when
+    starting this container.
+ * (optional) `infiniband` designates if this container requires access to the
+ Infiniband/RDMA devices on the host. Note that this will automatically
+ force the container to use the host network stack. If this property is
+ set to `true`, ensure that the `pool_specification` property
+ `inter_node_communication_enabled` is set to `true`.
+ * (optional) `gpu` designates if this container requires access to the GPU
+ devices on the host. If this property is set to `true`, Docker containers
+ are instantiated via `nvidia-docker`. This requires N-series VM instances.
+ * (optional) `max_task_retries` sets the maximum number of times that
+ Azure Batch should retry this task for. This overrides the job-level task
+ retry count. By default, Azure Batch does not retry tasks that fail
+ (i.e. `max_task_retries` is 0).
+ * (optional) `retention_time` sets the timedelta to retain the task
+ directory on the compute node where it ran after the task completes.
+ The format for this property is a timedelta with a string representation
+ of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node
+ to clean up this task's directory 36 hours after the task completed. The
+ default, if unspecified, is effectively infinite - i.e., task data is
+ retained forever on the compute node that ran the task.
+  * (optional) `multi_instance` is a property indicating that this task is a
+    multi-instance task. This is required if the Docker image is an MPI
+    program. Additional information about multi-instance tasks and Batch
+    Shipyard can be found
+    [here](80-batch-shipyard-multi-instance-tasks.md). A minimal sketch is
+    also shown after this list. Do not define this property for tasks that
+    are not multi-instance. Additional members of this property are:
+    * `num_instances` is a property setting the number of compute node
+      instances that are required for this multi-instance task. This can be
+      any one of the following:
+ 1. An integral number
+ 2. `pool_current_dedicated` which is the instantaneous reading of the
+ target pool's current dedicated count during this function invocation.
+ 3. `pool_specification_vm_count` which is the `vm_count` specified in the
+ pool configuration.
+    * `coordination_command` is the coordination command that is run by each
+ instance (compute node) of this multi-instance task prior to the
+ application command. This command must not block and must exit
+ successfully for the multi-instance task to proceed. This is the command
+ passed to the container in `docker run` for multi-instance tasks. This
+ docker container instance will automatically be daemonized. This is
+ optional and may be null.
+ * `resource_files` is an array of resource files that should be downloaded
+ as part of the multi-instance task. Each array entry contains the
+ following information:
+ * `file_path` is the path within the task working directory to place
+ the file on the compute node.
+ * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an
+ Azure Blob Storage URL.
+      * `file_mode` is the file mode to set for the file on the compute node.
+        This is optional.
+ * (optional) `entrypoint` is the property that can override the Docker image
+ defined `ENTRYPOINT`.
+ * (optional) `command` is the command to execute in the Docker container
+ context. If this task is a regular non-multi-instance task, then this is
+ the command passed to the container context during `docker run`. If this
+ task is a multi-instance task, then this `command` is the application
+ command and is executed with `docker exec` in the running Docker container
+ context from the `coordination_command` in the `multi_instance` property.
+ This property may be null.
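+
+To tie several of these properties together, below is a minimal,
+hypothetical sketch of a job that ingresses data from Azure Blob Storage at
+the job level, runs two integrally-named tasks, and then runs a third task
+that depends on the first two via `depends_on_range`, sources the
+`.shipyard.envlist` file so its environment variables are expanded, and
+egresses its results on success. The storage account link, container names,
+Docker image, and scripts are placeholders only; consult the full template
+linked below for the complete set of properties and the enclosing structure.
+
+```json
+{
+    "job_specifications": [
+        {
+            "id": "myjob",
+            "input_data": {
+                "azure_storage": [
+                    {
+                        "storage_account_settings": "mystorageaccount",
+                        "container": "myinputdata",
+                        "destination": "$AZ_BATCH_NODE_SHARED_DIR/myjobdata"
+                    }
+                ]
+            },
+            "tasks": [
+                {
+                    "id": "1",
+                    "image": "myrepo/myimage:latest",
+                    "remove_container_after_exit": true,
+                    "command": "/bin/bash -c \"/opt/app/stage.sh part1\""
+                },
+                {
+                    "id": "2",
+                    "image": "myrepo/myimage:latest",
+                    "remove_container_after_exit": true,
+                    "command": "/bin/bash -c \"/opt/app/stage.sh part2\""
+                },
+                {
+                    "id": "3",
+                    "image": "myrepo/myimage:latest",
+                    "depends_on_range": [1, 2],
+                    "environment_variables": {
+                        "MERGE_MODE": "full"
+                    },
+                    "remove_container_after_exit": true,
+                    "output_data": {
+                        "azure_storage": [
+                            {
+                                "storage_account_settings": "mystorageaccount",
+                                "container": "myoutputdata",
+                                "source": "$AZ_BATCH_TASK_WORKING_DIR/results"
+                            }
+                        ]
+                    },
+                    "command": "/bin/bash -c \". $AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist && /opt/app/merge.sh $MERGE_MODE\""
+                }
+            ]
+        }
+    ]
+}
+```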
+
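+Similarly, a hedged sketch of a multi-instance (MPI-style) task is shown
+below. The image and application command are illustrative placeholders, and
+the target pool is assumed to have `inter_node_communication_enabled` set to
+`true` since `infiniband` is enabled. The `coordination_command` is left
+null and `num_instances` tracks the `vm_count` in the pool configuration.
+
+```json
+{
+    "job_specifications": [
+        {
+            "id": "mympijob",
+            "tasks": [
+                {
+                    "id": "mpi-run",
+                    "image": "myrepo/mympiapp:latest",
+                    "infiniband": true,
+                    "multi_instance": {
+                        "num_instances": "pool_specification_vm_count",
+                        "coordination_command": null
+                    },
+                    "command": "/opt/myapp/run-mpi.sh"
+                }
+            ]
+        }
+    ]
+}
+```
+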
+## Full template
+A full template of a jobs configuration file can be found
+[here](../config\_templates/jobs.json). Note that this template cannot
+be used as-is and must be modified to fit your scenario.
diff --git a/docs/99-current-limitations.md b/docs/99-current-limitations.md
index d4dc65b..c20d455 100644
--- a/docs/99-current-limitations.md
+++ b/docs/99-current-limitations.md
@@ -17,7 +17,9 @@ The following are general limitations or restrictions:
* Compute pool resize down (i.e., removing nodes from a pool) is not supported
when peer-to-peer transfer is enabled.
* The maximum number of compute nodes with peer-to-peer enabled is currently
-40 for Linux pools for non-UserSubscription Batch accounts.
+40 for Linux pools for non-UserSubscription Batch accounts. This check is
+no longer performed before a pool is created; exceeding this limit will
+instead result in a ResizeError on the pool if not all compute nodes can
+be allocated.
* Data movement between Batch tasks as defined by `input_data`:`azure_batch`
is restricted to Batch accounts with keys (non-AAD).
* Virtual network support in Batch pools can only be used with
diff --git a/docs/README.md b/docs/README.md
index f05efe8..dbc67ec 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,6 +6,10 @@ and effectively running your batch-style Docker workloads on Azure Batch.
2. [Installation](01-batch-shipyard-installation.md)
3. [Quick Start](02-batch-shipyard-quickstart.md)
4. [Configuration](10-batch-shipyard-configuration.md)
+ 1. [Credentials Configuration](11-batch-shipyard-configuration-credentials.md)
+ 2. [Global Configuration](12-batch-shipyard-configuration-global.md)
+ 3. [Pool Configuration](13-batch-shipyard-configuration-pool.md)
+ 4. [Jobs Configuration](14-batch-shipyard-configuration-jobs.md)
5. [Usage](20-batch-shipyard-usage.md)
6. [Data Movement](70-batch-shipyard-data-movement.md)
7. [Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md)
diff --git a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json
+++ b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json
index 9ec459b..110b57c 100644
--- a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json b/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json
index 23db972..fa90e76 100644
--- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json
+++ b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md
index 501d9b0..8d29d3a 100644
--- a/recipes/CNTK-GPU-OpenMPI/README.md
+++ b/recipes/CNTK-GPU-OpenMPI/README.md
@@ -20,7 +20,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
* `inter_node_communication_enabled` must be set to `true`
* `max_tasks_per_node` must be set to 1 or omitted
diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json
index 4ab8344..6c63cc0 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json
index 0cee0cf..66190c5 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json
index 38d50f4..481a6ee 100644
--- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json
+++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Caffe-CPU/config/credentials.json b/recipes/Caffe-CPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Caffe-CPU/config/credentials.json
+++ b/recipes/Caffe-CPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Caffe-CPU/config/pool.json b/recipes/Caffe-CPU/config/pool.json
index 341f476..6109e1a 100644
--- a/recipes/Caffe-CPU/config/pool.json
+++ b/recipes/Caffe-CPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Caffe-GPU/README.md b/recipes/Caffe-GPU/README.md
index bd3317c..707b6f4 100644
--- a/recipes/Caffe-GPU/README.md
+++ b/recipes/Caffe-GPU/README.md
@@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
### Global Configuration
diff --git a/recipes/Caffe-GPU/config/credentials.json b/recipes/Caffe-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Caffe-GPU/config/credentials.json
+++ b/recipes/Caffe-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Caffe-GPU/config/pool.json b/recipes/Caffe-GPU/config/pool.json
index 33a6f26..ca45ab6 100644
--- a/recipes/Caffe-GPU/config/pool.json
+++ b/recipes/Caffe-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Chainer-CPU/config/credentials.json b/recipes/Chainer-CPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Chainer-CPU/config/credentials.json
+++ b/recipes/Chainer-CPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Chainer-CPU/config/pool.json b/recipes/Chainer-CPU/config/pool.json
index ab98728..4ec8659 100644
--- a/recipes/Chainer-CPU/config/pool.json
+++ b/recipes/Chainer-CPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Chainer-GPU/README.md b/recipes/Chainer-GPU/README.md
index 25db219..236f9f6 100644
--- a/recipes/Chainer-GPU/README.md
+++ b/recipes/Chainer-GPU/README.md
@@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
### Global Configuration
diff --git a/recipes/Chainer-GPU/config/credentials.json b/recipes/Chainer-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Chainer-GPU/config/credentials.json
+++ b/recipes/Chainer-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Chainer-GPU/config/pool.json b/recipes/Chainer-GPU/config/pool.json
index 5e0832f..871a7c6 100644
--- a/recipes/Chainer-GPU/config/pool.json
+++ b/recipes/Chainer-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/FFmpeg-GPU/README.md b/recipes/FFmpeg-GPU/README.md
index eab56b4..cf87f08 100644
--- a/recipes/FFmpeg-GPU/README.md
+++ b/recipes/FFmpeg-GPU/README.md
@@ -18,7 +18,7 @@ audio/video, it is best to choose `NV` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
* `gpu` property should be specified with the following members:
* `nvidia_driver` property contains the following members:
diff --git a/recipes/FFmpeg-GPU/config/credentials.json b/recipes/FFmpeg-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/FFmpeg-GPU/config/credentials.json
+++ b/recipes/FFmpeg-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/FFmpeg-GPU/config/pool.json b/recipes/FFmpeg-GPU/config/pool.json
index d5eaec8..9fc2154 100644
--- a/recipes/FFmpeg-GPU/config/pool.json
+++ b/recipes/FFmpeg-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json b/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json
+++ b/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json b/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json
+++ b/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Keras+Theano-CPU/config/credentials.json b/recipes/Keras+Theano-CPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Keras+Theano-CPU/config/credentials.json
+++ b/recipes/Keras+Theano-CPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Keras+Theano-CPU/config/pool.json b/recipes/Keras+Theano-CPU/config/pool.json
index a5a0518..aec1ecf 100644
--- a/recipes/Keras+Theano-CPU/config/pool.json
+++ b/recipes/Keras+Theano-CPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Keras+Theano-GPU/README.md b/recipes/Keras+Theano-GPU/README.md
index 6d6b982..c8d4cf5 100644
--- a/recipes/Keras+Theano-GPU/README.md
+++ b/recipes/Keras+Theano-GPU/README.md
@@ -18,7 +18,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
### Global Configuration
diff --git a/recipes/Keras+Theano-GPU/config/credentials.json b/recipes/Keras+Theano-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Keras+Theano-GPU/config/credentials.json
+++ b/recipes/Keras+Theano-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Keras+Theano-GPU/config/pool.json b/recipes/Keras+Theano-GPU/config/pool.json
index 15ecd66..88b4c2c 100644
--- a/recipes/Keras+Theano-GPU/config/pool.json
+++ b/recipes/Keras+Theano-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/MXNet-CPU/config/multinode/credentials.json b/recipes/MXNet-CPU/config/multinode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/MXNet-CPU/config/multinode/credentials.json
+++ b/recipes/MXNet-CPU/config/multinode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/MXNet-CPU/config/multinode/pool.json b/recipes/MXNet-CPU/config/multinode/pool.json
index 4a823fa..2f36646 100644
--- a/recipes/MXNet-CPU/config/multinode/pool.json
+++ b/recipes/MXNet-CPU/config/multinode/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/MXNet-CPU/config/singlenode/credentials.json b/recipes/MXNet-CPU/config/singlenode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/MXNet-CPU/config/singlenode/credentials.json
+++ b/recipes/MXNet-CPU/config/singlenode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/MXNet-CPU/config/singlenode/pool.json b/recipes/MXNet-CPU/config/singlenode/pool.json
index de74395..e6134a1 100644
--- a/recipes/MXNet-CPU/config/singlenode/pool.json
+++ b/recipes/MXNet-CPU/config/singlenode/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md
index 3e7727f..913cfa0 100644
--- a/recipes/MXNet-GPU/README.md
+++ b/recipes/MXNet-GPU/README.md
@@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
* `inter_node_communication_enabled` must be set to `true`
* `max_tasks_per_node` must be set to 1 or omitted
diff --git a/recipes/MXNet-GPU/config/multinode/credentials.json b/recipes/MXNet-GPU/config/multinode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/MXNet-GPU/config/multinode/credentials.json
+++ b/recipes/MXNet-GPU/config/multinode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/MXNet-GPU/config/multinode/pool.json b/recipes/MXNet-GPU/config/multinode/pool.json
index 284477e..5999a4f 100644
--- a/recipes/MXNet-GPU/config/multinode/pool.json
+++ b/recipes/MXNet-GPU/config/multinode/pool.json
@@ -5,7 +5,7 @@
"vm_count": 2,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/MXNet-GPU/config/singlenode/credentials.json b/recipes/MXNet-GPU/config/singlenode/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/MXNet-GPU/config/singlenode/credentials.json
+++ b/recipes/MXNet-GPU/config/singlenode/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/MXNet-GPU/config/singlenode/pool.json b/recipes/MXNet-GPU/config/singlenode/pool.json
index c0a3ffc..7fae2b5 100644
--- a/recipes/MXNet-GPU/config/singlenode/pool.json
+++ b/recipes/MXNet-GPU/config/singlenode/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/NAMD-GPU/README.md b/recipes/NAMD-GPU/README.md
index 4280d7c..c4119f5 100644
--- a/recipes/NAMD-GPU/README.md
+++ b/recipes/NAMD-GPU/README.md
@@ -19,7 +19,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
* `max_tasks_per_node` must be set to 1 or omitted
diff --git a/recipes/NAMD-GPU/config/credentials.json b/recipes/NAMD-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/NAMD-GPU/config/credentials.json
+++ b/recipes/NAMD-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/NAMD-GPU/config/pool.json b/recipes/NAMD-GPU/config/pool.json
index 4b07db6..a9a9860 100644
--- a/recipes/NAMD-GPU/config/pool.json
+++ b/recipes/NAMD-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json b/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json
+++ b/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/NAMD-TCP/config/credentials.json b/recipes/NAMD-TCP/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/NAMD-TCP/config/credentials.json
+++ b/recipes/NAMD-TCP/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json b/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json
+++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json b/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json
+++ b/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/TensorFlow-CPU/config/credentials.json b/recipes/TensorFlow-CPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/TensorFlow-CPU/config/credentials.json
+++ b/recipes/TensorFlow-CPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/TensorFlow-CPU/config/pool.json b/recipes/TensorFlow-CPU/config/pool.json
index 9eddb8e..125be08 100644
--- a/recipes/TensorFlow-CPU/config/pool.json
+++ b/recipes/TensorFlow-CPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md
index c5a2fec..1982a98 100644
--- a/recipes/TensorFlow-Distributed/README.md
+++ b/recipes/TensorFlow-Distributed/README.md
@@ -20,7 +20,7 @@ If not using GPUs, another appropriate SKU can be selected.
supported once they are available for N-series VMs.
* `offer` should be `UbuntuServer` if using GPUs. Other offers will be
supported once they are available for N-series VMs.
-* `sku` should be `16.04.0-LTS` if using GPUs. Other skus will be supported
+* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported
once they are available for N-series VMs.
If on multiple CPUs:
diff --git a/recipes/TensorFlow-Distributed/config/cpu/credentials.json b/recipes/TensorFlow-Distributed/config/cpu/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/TensorFlow-Distributed/config/cpu/credentials.json
+++ b/recipes/TensorFlow-Distributed/config/cpu/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/TensorFlow-Distributed/config/cpu/pool.json b/recipes/TensorFlow-Distributed/config/cpu/pool.json
index 4e3d80b..e083c64 100644
--- a/recipes/TensorFlow-Distributed/config/cpu/pool.json
+++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/TensorFlow-Distributed/config/gpu/credentials.json b/recipes/TensorFlow-Distributed/config/gpu/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/TensorFlow-Distributed/config/gpu/credentials.json
+++ b/recipes/TensorFlow-Distributed/config/gpu/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/TensorFlow-Distributed/config/gpu/pool.json b/recipes/TensorFlow-Distributed/config/gpu/pool.json
index c17621f..c1e0d98 100644
--- a/recipes/TensorFlow-Distributed/config/gpu/pool.json
+++ b/recipes/TensorFlow-Distributed/config/gpu/pool.json
@@ -6,7 +6,7 @@
"inter_node_communication_enabled": true,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/TensorFlow-GPU/README.md b/recipes/TensorFlow-GPU/README.md
index a08e8d7..66941e3 100644
--- a/recipes/TensorFlow-GPU/README.md
+++ b/recipes/TensorFlow-GPU/README.md
@@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
### Global Configuration
diff --git a/recipes/TensorFlow-GPU/config/credentials.json b/recipes/TensorFlow-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/TensorFlow-GPU/config/credentials.json
+++ b/recipes/TensorFlow-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/TensorFlow-GPU/config/pool.json b/recipes/TensorFlow-GPU/config/pool.json
index 272f12c..9c68ff2 100644
--- a/recipes/TensorFlow-GPU/config/pool.json
+++ b/recipes/TensorFlow-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Torch-CPU/config/credentials.json b/recipes/Torch-CPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Torch-CPU/config/credentials.json
+++ b/recipes/Torch-CPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Torch-CPU/config/pool.json b/recipes/Torch-CPU/config/pool.json
index 49b0127..21e64fd 100644
--- a/recipes/Torch-CPU/config/pool.json
+++ b/recipes/Torch-CPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/recipes/Torch-GPU/README.md b/recipes/Torch-GPU/README.md
index 1c2ef08..e283dee 100644
--- a/recipes/Torch-GPU/README.md
+++ b/recipes/Torch-GPU/README.md
@@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
once they are available for N-series VMs.
* `offer` should be `UbuntuServer`. Other offers will be supported once they
are available for N-series VMs.
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
+* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
### Global Configuration
diff --git a/recipes/Torch-GPU/config/credentials.json b/recipes/Torch-GPU/config/credentials.json
index 451e167..e9ffd05 100644
--- a/recipes/Torch-GPU/config/credentials.json
+++ b/recipes/Torch-GPU/config/credentials.json
@@ -1,7 +1,6 @@
{
"credentials": {
"batch": {
- "account": "",
"account_key": "",
"account_service_url": ""
},
diff --git a/recipes/Torch-GPU/config/pool.json b/recipes/Torch-GPU/config/pool.json
index 35bcaa9..d62ed03 100644
--- a/recipes/Torch-GPU/config/pool.json
+++ b/recipes/Torch-GPU/config/pool.json
@@ -5,7 +5,7 @@
"vm_count": 1,
"publisher": "Canonical",
"offer": "UbuntuServer",
- "sku": "16.04.0-LTS",
+ "sku": "16.04-LTS",
"ssh": {
"username": "docker"
},
diff --git a/scripts/shipyard_remotefs_stat.sh b/scripts/shipyard_remotefs_stat.sh
index 845476d..4c0ef74 100755
--- a/scripts/shipyard_remotefs_stat.sh
+++ b/scripts/shipyard_remotefs_stat.sh
@@ -60,6 +60,9 @@ if [ $server_type == "nfs" ]; then
echo ""
echo "nfsstat:"
nfsstat -s -4
+ echo ""
+ echo "connected clients:"
+ netstat -tn | grep :2049
else
echo "$server_type not supported."
exit 1