Support missing image tasks, pool check

- Break out configs into separate pages
- Update all configs using 16.04.0-LTS to 16.04-LTS
- Remove Batch `account` from recipe credentials
Parent: e349a004cd
Commit: 33291504c2
21 CHANGELOG.md
@ -2,19 +2,34 @@

## [Unreleased]

### Added
- Support for provisioning storage clusters via the `fs cluster` command
- Support for provisioning managed disks via the `fs disks` command
- Support for UserSubscription Batch accounts
- Azure Active Directory authentication support for Batch accounts
- `allow_run_on_missing` option to jobs that allows tasks to execute under
  jobs with Docker images that have not been pre-loaded via the
  `global_resources`:`docker_images` setting in config.json. Note that, if
  possible, you should attempt to specify all Docker images that you intend
  to run in the `global_resources`:`docker_images` property in the global
  configuration to minimize scheduling-to-task-execution latency.
- Support for Canonical/UbuntuServer/16.04-LTS. This sku should be used over
  the old 16.04.0-LTS sku due to
  [issue #31](https://github.com/Azure/batch-shipyard/issues/31).

### Changed
- **Breaking Change:** `glusterfs` `volume_driver` for `shared_data_volumes`
  should now be named `glusterfs_on_compute`. This distinguishes glusterfs
  co-located on compute nodes from a possible standalone glusterfs
  `storage_cluster` remotely mounted in the future.
- Pool existence is now checked prior to job submission; job addition can
  now proceed without an active pool.
- Batch `account` (name) is now an optional property in the credentials config
- Configuration doc broken up into multiple pages
- Update all recipes using Canonical/UbuntuServer/16.04.0-LTS to use
  Canonical/UbuntuServer/16.04-LTS instead
- All dependencies updated to latest versions
- Update Batch API call compatibility for `azure-batch 2.0.0`
- Precompile python files for Docker images

## [2.5.4] - 2017-03-08
### Changed
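The new `allow_run_on_missing` option boils down to a per-task check against the pre-loaded image list. A condensed sketch of that decision, paraphrasing the guard added to `add_jobs` later in this commit (names are illustrative, not the exact implementation):

```python
def check_task_image(task_image, global_resources, allow_run_on_missing, job_id):
    """Return the list of images that must be handled at task run time.

    Mirrors the guard added to add_jobs: a task image absent from
    global_resources:docker_images either aborts submission or is queued
    as a 'missing' image that the job prep step will not pre-load.
    """
    missing = []
    if task_image not in global_resources:
        if allow_run_on_missing:
            # warn and defer the image pull to task execution time
            missing.append(task_image)
        else:
            raise RuntimeError(
                'not submitting job {} with missing docker image {}'.format(
                    job_id, task_image))
    return missing
```

Images collected this way are then subtracted from the job preparation pre-load list, which is the `set(global_resources) - set(missing_images)` step visible in the `add_jobs` hunk below.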
@ -37,10 +37,6 @@
},
"batch": {
"account_service_url": "",
"account_key": "",
"account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey",
"user_subscription": false,
"resource_group": "",
"aad": {
"endpoint": "https://batch.core.windows.net/",
"directory_id": "",

@ -54,7 +50,10 @@
"enabled": true,
"filename": ""
}
}
},
"resource_group": "",
"account_key": "",
"account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey"
},
"storage": {
"mystorageaccount": {
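Since `account_key` is now optional in the Batch credentials block, client construction has to choose between shared-key and AAD authentication. A minimal sketch of that choice, modeled on the `create_batch_service_client` change further down in this commit; `bc` mirrors `BatchCredentialsSettings`, and the AAD credential object is assumed to come from the repo's `aad.create_aad_credentials` helper:

```python
import azure.batch.batch_auth as batchauth
import azure.batch.batch_service_client as batchsc

def make_batch_client(bc, aad_credentials=None):
    """Pick shared-key or AAD auth based on the optional account_key."""
    if bc.account_key:
        credentials = batchauth.SharedKeyCredentials(bc.account, bc.account_key)
    else:
        # no shared key in the credentials config: fall back to AAD
        credentials = aad_credentials
    return batchsc.BatchServiceClient(
        credentials, base_url=bc.account_service_url)
```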
@ -18,6 +18,7 @@
"static_public_ip": false,
"virtual_network": {
"name": "",
"resource_group": "",
"existing_ok": false,
"address_space": "",
"subnet": {

@ -28,7 +29,7 @@
"network_security": {
"nfs": ["1.2.3.0/24"],
"ssh": ["*"],
"custom_inbound": {
"custom_inbound_rules": {
"myrule": {
"destination_port_range": "5000-5001",
"source_address_prefix": ["1.2.3.4", "5.6.7.0/24"],
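The rename to `custom_inbound_rules` is mirrored in the settings parser later in this commit. A small sketch of reading each rule into the `InboundNetworkSecurityRule` tuple (field names and the reserved-name check are taken from that hunk; `dict.get` stands in for the repo's `_kv_read_checked` helper):

```python
import collections

InboundNetworkSecurityRule = collections.namedtuple(
    'InboundNetworkSecurityRule',
    ['destination_port_range', 'source_address_prefix', 'protocol'])

def read_custom_inbound_rules(ns_conf):
    """Parse network_security:custom_inbound_rules into rule tuples."""
    reserved = frozenset(['ssh', 'nfs', 'glusterfs'])
    rules = {}
    for key, rule in ns_conf.get('custom_inbound_rules', {}).items():
        if key.lower() in reserved:
            raise ValueError('rule name {} conflicts with a reserved name'.format(key))
        rules[key] = InboundNetworkSecurityRule(
            destination_port_range=rule.get('destination_port_range'),
            source_address_prefix=rule.get('source_address_prefix'),
            protocol=rule.get('protocol'),
        )
    return rules
```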
@ -8,6 +8,7 @@
},
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
"max_task_retries": 1,
"allow_run_on_missing_image": false,
"input_data": {
"azure_batch": [
{
@ -33,6 +33,7 @@
},
"virtual_network": {
"name": "",
"resource_group": "",
"create_nonexistant": false,
"address_space": "",
"subnet": {
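The new `resource_group` key travels into the `VirtualNetworkSettings` namedtuple (see the settings hunk near the end of this commit). A rough reader for this block, under the assumption that the subnet prefix lives under `subnet`:`address_prefix`:

```python
import collections

VirtualNetworkSettings = collections.namedtuple(
    'VirtualNetworkSettings', [
        'name', 'resource_group', 'address_space', 'subnet_name',
        'subnet_address_prefix', 'existing_ok', 'create_nonexistant',
    ]
)

def read_virtual_network(conf):
    """Read a virtual_network config block into settings."""
    vnet = conf.get('virtual_network', {})
    subnet = vnet.get('subnet', {})
    return VirtualNetworkSettings(
        name=vnet.get('name'),
        resource_group=vnet.get('resource_group'),
        address_space=vnet.get('address_space'),
        subnet_name=subnet.get('name'),
        subnet_address_prefix=subnet.get('address_prefix'),
        existing_ok=vnet.get('existing_ok', False),
        create_nonexistant=vnet.get('create_nonexistant', False),
    )
```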
136 convoy/batch.py
@ -66,6 +66,27 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
)


def get_batch_account(batch_mgmt_client, config):
    # type: (azure.mgmt.batch.BatchManagementClient, dict) ->
    #        azure.mgmt.batch.models.BatchAccount
    """Get Batch account properties from ARM
    :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client:
        batch management client
    :param dict config: configuration dict
    :rtype: azure.mgmt.batch.models.BatchAccount
    :return: Batch account
    """
    if batch_mgmt_client is None:
        raise RuntimeError(
            'Batch management client is invalid, please specify management '
            'aad credentials')
    bc = settings.credentials_batch(config)
    return batch_mgmt_client.batch_account.get(
        resource_group_name=bc.resource_group,
        account_name=bc.account,
    )


def list_node_agent_skus(batch_client):
    # type: (batch.BatchServiceClient) -> None
    """List all node agent skus
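A short usage sketch for the new helper, mirroring how `_add_pool` later in this commit validates the pool allocation mode for UserSubscription accounts (assumes the merged `config` dict and a management client from `create_batch_mgmt_client`; the import path is assumed):

```python
import azure.mgmt.batch.models as batchmgmtmodels

from convoy.batch import get_batch_account  # assumed import path

def ensure_user_subscription_account(batch_mgmt_client, config):
    """Raise if the Batch account is not in UserSubscription allocation mode."""
    ba = get_batch_account(batch_mgmt_client, config)
    if ba.pool_allocation_mode != \
            batchmgmtmodels.PoolAllocationMode.user_subscription:
        raise RuntimeError(
            '{} account is not a UserSubscription account'.format(ba.name))
```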
@ -1708,15 +1729,81 @@ def add_jobs(
|
|||
# get the pool inter-node comm setting
|
||||
bs = settings.batch_shipyard_settings(config)
|
||||
pool = settings.pool_settings(config)
|
||||
_pool = batch_client.pool.get(pool.id)
|
||||
global_resources = []
|
||||
for gr in settings.global_resources_docker_images(config):
|
||||
global_resources.append(gr)
|
||||
try:
|
||||
cloud_pool = batch_client.pool.get(pool.id)
|
||||
except batchmodels.batch_error.BatchErrorException as ex:
|
||||
if 'The specified pool does not exist.' in ex.message.value:
|
||||
logger.error('{} pool does not exist'.format(pool.id))
|
||||
if util.confirm_action(
|
||||
config, 'add jobs to nonexistant pool {}'.format(pool.id)):
|
||||
cloud_pool = None
|
||||
else:
|
||||
logger.error(
|
||||
'not submitting jobs to nonexistant pool {}'.format(
|
||||
pool.id))
|
||||
return
|
||||
else:
|
||||
raise
|
||||
global_resources = settings.global_resources_docker_images(config)
|
||||
lastjob = None
|
||||
lasttask = None
|
||||
for jobspec in settings.job_specifications(config):
|
||||
jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format(
|
||||
jpfile[0], ' '.join(global_resources))]
|
||||
job_id = settings.job_id(jobspec)
|
||||
# perform checks:
|
||||
# 1. check docker images in task against pre-loaded on pool
|
||||
# 2. if tasks have dependencies, set it if so
|
||||
# 3. if there are multi-instance tasks
|
||||
mi_ac = settings.job_multi_instance_auto_complete(config)
|
||||
multi_instance = False
|
||||
mi_docker_container_name = None
|
||||
reserved_task_id = None
|
||||
uses_task_dependencies = False
|
||||
missing_images = []
|
||||
allow_run_on_missing = settings.job_allow_run_on_missing(jobspec)
|
||||
for task in settings.job_tasks(jobspec):
|
||||
# check if task docker image is set in config.json
|
||||
di = settings.task_docker_image(task)
|
||||
if di not in global_resources:
|
||||
if allow_run_on_missing:
|
||||
logger.warning(
|
||||
('docker image {} not pre-loaded on pool for a '
|
||||
'task specified in job {}').format(di, job_id))
|
||||
missing_images.append(di)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
('not submitting job {} with missing docker image {} '
|
||||
'pre-load on pool {}').format(job_id, di, pool.id))
|
||||
# do not break, check to ensure ids are set on each task if
|
||||
# task dependencies are set
|
||||
if settings.has_depends_on_task(task):
|
||||
uses_task_dependencies = True
|
||||
if settings.is_multi_instance_task(task):
|
||||
if multi_instance and mi_ac:
|
||||
raise ValueError(
|
||||
'cannot specify more than one multi-instance task '
|
||||
'per job with auto completion enabled')
|
||||
multi_instance = True
|
||||
mi_docker_container_name = settings.task_name(task)
|
||||
if util.is_none_or_empty(mi_docker_container_name):
|
||||
_id = settings.task_id(task)
|
||||
if util.is_none_or_empty(_id):
|
||||
reserved_task_id = _generate_next_generic_task_id(
|
||||
batch_client, job_id)
|
||||
settings.set_task_id(task, reserved_task_id)
|
||||
_id = '{}-{}'.format(job_id, reserved_task_id)
|
||||
settings.set_task_name(task, _id)
|
||||
mi_docker_container_name = settings.task_name(task)
|
||||
del _id
|
||||
# construct job prep
|
||||
if util.is_not_empty(global_resources):
|
||||
if len(missing_images) > 0 and allow_run_on_missing:
|
||||
gr = list(set(global_resources) - set(missing_images))
|
||||
else:
|
||||
gr = global_resources
|
||||
jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format(
|
||||
jpfile[0], ' '.join(gr))]
|
||||
else:
|
||||
jpcmd = []
|
||||
# digest any input_data
|
||||
addlcmds = data.process_input_data(config, bxfile, jobspec)
|
||||
if addlcmds is not None:
|
||||
|
@ -1741,39 +1828,10 @@ def add_jobs(
|
|||
user_identity=_RUN_ELEVATED,
|
||||
rerun_on_node_reboot_after_success=False,
|
||||
),
|
||||
uses_task_dependencies=False,
|
||||
uses_task_dependencies=uses_task_dependencies,
|
||||
constraints=job_constraints,
|
||||
)
|
||||
lastjob = job.id
|
||||
# perform checks:
|
||||
# 1. if tasks have dependencies, set it if so
|
||||
# 2. if there are multi-instance tasks
|
||||
mi_ac = settings.job_multi_instance_auto_complete(config)
|
||||
multi_instance = False
|
||||
mi_docker_container_name = None
|
||||
reserved_task_id = None
|
||||
for task in settings.job_tasks(jobspec):
|
||||
# do not break, check to ensure ids are set on each task if
|
||||
# task dependencies are set
|
||||
if settings.has_depends_on_task(task):
|
||||
job.uses_task_dependencies = True
|
||||
if settings.is_multi_instance_task(task):
|
||||
if multi_instance and mi_ac:
|
||||
raise ValueError(
|
||||
'cannot specify more than one multi-instance task '
|
||||
'per job with auto completion enabled')
|
||||
multi_instance = True
|
||||
mi_docker_container_name = settings.task_name(task)
|
||||
if util.is_none_or_empty(mi_docker_container_name):
|
||||
_id = settings.task_id(task)
|
||||
if util.is_none_or_empty(_id):
|
||||
reserved_task_id = _generate_next_generic_task_id(
|
||||
batch_client, job.id)
|
||||
settings.set_task_id(task, reserved_task_id)
|
||||
_id = '{}-{}'.format(job.id, reserved_task_id)
|
||||
settings.set_task_name(task, _id)
|
||||
mi_docker_container_name = settings.task_name(task)
|
||||
del _id
|
||||
# add multi-instance settings
|
||||
set_terminate_on_all_tasks_complete = False
|
||||
if multi_instance and mi_ac:
|
||||
|
@ -1784,7 +1842,7 @@ def add_jobs(
|
|||
'docker rm -v {}'.format(mi_docker_container_name)]),
|
||||
user_identity=_RUN_ELEVATED,
|
||||
)
|
||||
logger.info('Adding job: {}'.format(job.id))
|
||||
logger.info('Adding job {} to pool {}'.format(job.id, pool.id))
|
||||
try:
|
||||
batch_client.job.add(job)
|
||||
except batchmodels.batch_error.BatchErrorException as ex:
|
||||
|
@ -1810,6 +1868,7 @@ def add_jobs(
|
|||
del mi_ac
|
||||
del multi_instance
|
||||
del mi_docker_container_name
|
||||
del uses_task_dependencies
|
||||
# get base env vars from job
|
||||
job_env_vars = settings.job_environment_variables(jobspec)
|
||||
_job_env_vars_secid = \
|
||||
|
@ -1830,7 +1889,8 @@ def add_jobs(
|
|||
if util.is_none_or_empty(settings.task_name(_task)):
|
||||
settings.set_task_name(_task, '{}-{}'.format(job.id, _task_id))
|
||||
del _task_id
|
||||
task = settings.task_settings(_pool, config, _task)
|
||||
task = settings.task_settings(
|
||||
cloud_pool, config, pool, _task, missing_images)
|
||||
# retrieve keyvault task env vars
|
||||
if util.is_not_empty(
|
||||
task.environment_variables_keyvault_secret_id):
|
||||
|
|
|
@ -118,6 +118,31 @@ def create_network_client(ctx, credentials=None, subscription_id=None):
|
|||
credentials, subscription_id)
|
||||
|
||||
|
||||
def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None):
|
||||
# type: (CliContext, object, str) ->
|
||||
# azure.mgmt.batch.BatchManagementClient
|
||||
"""Create batch management client
|
||||
:param CliContext ctx: Cli Context
|
||||
:param object credentials: credentials object
|
||||
:param str subscription_id: subscription id
|
||||
:rtype: azure.mgmt.batch.BatchManagementClient
|
||||
:return: batch management client
|
||||
"""
|
||||
mgmt_aad = None
|
||||
if credentials is None:
|
||||
mgmt_aad = settings.credentials_management(ctx.config).aad
|
||||
credentials = aad.create_aad_credentials(ctx, mgmt_aad)
|
||||
if util.is_none_or_empty(subscription_id):
|
||||
if mgmt_aad is None:
|
||||
mgmt_aad = settings.credentials_management(ctx.config).aad
|
||||
subscription_id = ctx.subscription_id or mgmt_aad.subscription_id
|
||||
batch_mgmt_client = azure.mgmt.batch.BatchManagementClient(
|
||||
credentials, subscription_id)
|
||||
batch_mgmt_client.config.add_user_agent(
|
||||
'batch-shipyard/{}'.format(__version__))
|
||||
return batch_mgmt_client
|
||||
|
||||
|
||||
def create_arm_clients(ctx, batch_clients=False):
|
||||
# type: (CliContext, bool) ->
|
||||
# Tuple[azure.mgmt.resource.resources.ResourceManagementClient,
|
||||
|
@ -148,10 +173,16 @@ def create_arm_clients(ctx, batch_clients=False):
|
|||
network_client = create_network_client(
|
||||
ctx, credentials=credentials, subscription_id=subscription_id)
|
||||
if batch_clients:
|
||||
batch_mgmt_client, batch_client = create_batch_clients(ctx)
|
||||
batch_client = create_batch_service_client(ctx)
|
||||
try:
|
||||
batch_mgmt_client = create_batch_mgmt_client(
|
||||
ctx, credentials=credentials, subscription_id=subscription_id)
|
||||
except Exception:
|
||||
logger.warning('could not create batch management client')
|
||||
batch_mgmt_client = None
|
||||
else:
|
||||
batch_mgmt_client = None
|
||||
batch_client = None
|
||||
batch_mgmt_client = None
|
||||
return (
|
||||
resource_client, compute_client, network_client, batch_mgmt_client,
|
||||
batch_client
|
||||
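Callers of the reworked `create_arm_clients` receive a 5-tuple in which `batch_mgmt_client` may legitimately be `None` when no management AAD credentials are configured. A hedged consumption sketch; the unpacking order follows the return statement above:

```python
def init_clients(ctx, create_arm_clients, logger):
    """Unpack ARM and Batch clients, tolerating an absent management client."""
    (resource_client, compute_client, network_client, batch_mgmt_client,
     batch_client) = create_arm_clients(ctx, batch_clients=True)
    if batch_mgmt_client is None:
        # operations that need ARM (e.g., get_batch_account) will be unavailable
        logger.warning('batch management client not available')
    return (resource_client, compute_client, network_client,
            batch_mgmt_client, batch_client)
```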
|
@ -171,60 +202,25 @@ def create_keyvault_client(ctx):
|
|||
)
|
||||
|
||||
|
||||
def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None):
|
||||
# type: (CliContext, object, str) ->
|
||||
# azure.mgmt.batch.BatchManagementClient
|
||||
"""Create batch management client
|
||||
def create_batch_service_client(ctx):
|
||||
# type: (CliContext) -> azure.batch.batch_service_client.BatchServiceClient
|
||||
"""Create batch service client
|
||||
:param CliContext ctx: Cli Context
|
||||
:param object credentials: credentials object
|
||||
:param str subscription_id: subscription id
|
||||
:rtype: azure.mgmt.batch.BatchManagementClient
|
||||
:return: batch management client
|
||||
"""
|
||||
batch_aad = None
|
||||
if credentials is None:
|
||||
batch_aad = settings.credentials_batch(ctx.config).aad
|
||||
credentials = aad.create_aad_credentials(ctx, batch_aad)
|
||||
if util.is_none_or_empty(subscription_id):
|
||||
if batch_aad is None:
|
||||
batch_aad = settings.credentials_batch(ctx.config).aad
|
||||
subscription_id = ctx.subscription_id or batch_aad.subscription_id
|
||||
if util.is_none_or_empty(subscription_id):
|
||||
return None
|
||||
batch_mgmt_client = azure.mgmt.batch.BatchManagementClient(
|
||||
credentials, subscription_id)
|
||||
batch_mgmt_client.config.add_user_agent(
|
||||
'batch-shipyard/{}'.format(__version__))
|
||||
return batch_mgmt_client
|
||||
|
||||
|
||||
def create_batch_clients(ctx):
|
||||
# type: (CliContext) ->
|
||||
# Tuple[azure.mgmt.batch.BatchManagementClient,
|
||||
# azure.batch.batch_service_client.BatchServiceClient]
|
||||
"""Create batch client
|
||||
:param CliContext ctx: Cli Context
|
||||
:rtype: tuple
|
||||
:return: (
|
||||
azure.mgmt.batch.BatchManagementClient,
|
||||
azure.batch.batch_service_client.BatchServiceClient)
|
||||
:rtype: azure.batch.batch_service_client.BatchServiceClient
|
||||
:return: batch service client
|
||||
"""
|
||||
bc = settings.credentials_batch(ctx.config)
|
||||
use_aad = bc.user_subscription or util.is_none_or_empty(bc.account_key)
|
||||
batch_mgmt_client = None
|
||||
if use_aad:
|
||||
subscription_id = ctx.subscription_id or bc.subscription_id
|
||||
if util.is_none_or_empty(bc.account_key):
|
||||
logger.debug('batch account key not specified, using aad auth')
|
||||
batch_aad = settings.credentials_batch(ctx.config).aad
|
||||
credentials = aad.create_aad_credentials(ctx, batch_aad)
|
||||
batch_mgmt_client = create_batch_mgmt_client(
|
||||
ctx, credentials=credentials, subscription_id=subscription_id)
|
||||
else:
|
||||
credentials = batchauth.SharedKeyCredentials(
|
||||
bc.account, bc.account_key)
|
||||
batch_client = batchsc.BatchServiceClient(
|
||||
credentials, base_url=bc.account_service_url)
|
||||
batch_client.config.add_user_agent('batch-shipyard/{}'.format(__version__))
|
||||
return (batch_mgmt_client, batch_client)
|
||||
return batch_client
|
||||
|
||||
|
||||
def create_storage_clients():
|
||||
|
|
|
@ -43,6 +43,7 @@ except ImportError:
|
|||
import uuid
|
||||
# non-stdlib imports
|
||||
import azure.batch.models as batchmodels
|
||||
import azure.mgmt.batch.models as batchmgmtmodels
|
||||
# local imports
|
||||
from . import batch
|
||||
from . import crypto
|
||||
|
@ -451,10 +452,15 @@ def _add_pool(
|
|||
raise ValueError(
|
||||
'Invalid subnet name on virtual network {}'.format(
|
||||
pool_settings.virtual_network.name))
|
||||
if util.is_not_empty(pool_settings.virtual_network.resource_group):
|
||||
_vnet_rg = pool_settings.virtual_network.resource_group
|
||||
else:
|
||||
_vnet_rg = bc.resource_group
|
||||
# create virtual network and subnet if specified
|
||||
vnet, subnet = resource.create_virtual_network_and_subnet(
|
||||
network_client, bc.resource_group, bc.location,
|
||||
network_client, _vnet_rg, bc.location,
|
||||
pool_settings.virtual_network)
|
||||
del _vnet_rg
|
||||
# ensure address prefix for subnet is valid
|
||||
tmp = subnet.address_prefix.split('/')
|
||||
if len(tmp) <= 1:
|
||||
|
@ -490,7 +496,9 @@ def _add_pool(
|
|||
sc_arg = None
|
||||
if storage_cluster_mount:
|
||||
# ensure usersubscription account
|
||||
if not bc.user_subscription:
|
||||
ba = batch.get_batch_account(batch_mgmt_client, config)
|
||||
if (not ba.pool_allocation_mode ==
|
||||
batchmgmtmodels.PoolAllocationMode.user_subscription):
|
||||
raise RuntimeError(
|
||||
'{} account is not a UserSubscription account'.format(
|
||||
bc.account))
|
||||
|
@ -1135,21 +1143,6 @@ def _adjust_settings_for_pool_creation(config):
|
|||
# adjust inter node comm setting
|
||||
if pool.vm_count < 1:
|
||||
raise ValueError('invalid vm_count: {}'.format(pool.vm_count))
|
||||
dr = settings.data_replication_settings(config)
|
||||
max_vms = 20 if publisher == 'microsoftwindowsserver' else 40
|
||||
if pool.vm_count > max_vms:
|
||||
if dr.peer_to_peer.enabled:
|
||||
logger.warning(
|
||||
('disabling peer-to-peer transfer as pool size of {} exceeds '
|
||||
'max limit of {} vms for inter-node communication').format(
|
||||
pool.vm_count, max_vms))
|
||||
settings.set_peer_to_peer_enabled(config, False)
|
||||
if pool.inter_node_communication_enabled:
|
||||
logger.warning(
|
||||
('disabling inter-node communication as pool size of {} '
|
||||
'exceeds max limit of {} vms for setting').format(
|
||||
pool.vm_count, max_vms))
|
||||
settings.set_inter_node_communication_enabled(config, False)
|
||||
# re-read pool and data replication settings
|
||||
pool = settings.pool_settings(config)
|
||||
dr = settings.data_replication_settings(config)
|
||||
|
|
|
@ -632,9 +632,14 @@ def create_storage_cluster(
|
|||
# upload scripts to blob storage for customscript
|
||||
blob_urls = storage.upload_for_remotefs(blob_client, remotefs_files)
|
||||
# create virtual network and subnet if specified
|
||||
if util.is_not_empty(rfs.storage_cluster.virtual_network.resource_group):
|
||||
_vnet_rg = rfs.storage_cluster.virtual_network.resource_group
|
||||
else:
|
||||
_vnet_rg = rfs.resource_group
|
||||
vnet, subnet = resource.create_virtual_network_and_subnet(
|
||||
network_client, rfs.resource_group, rfs.location,
|
||||
network_client, _vnet_rg, rfs.location,
|
||||
rfs.storage_cluster.virtual_network)
|
||||
del _vnet_rg
|
||||
|
||||
# TODO create slb
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ ManagementCredentialsSettings = collections.namedtuple(
|
|||
BatchCredentialsSettings = collections.namedtuple(
|
||||
'BatchCredentialsSettings', [
|
||||
'aad', 'account', 'account_key', 'account_service_url',
|
||||
'user_subscription', 'resource_group', 'subscription_id', 'location',
|
||||
'resource_group', 'subscription_id', 'location',
|
||||
]
|
||||
)
|
||||
StorageCredentialsSettings = collections.namedtuple(
|
||||
|
@ -176,8 +176,8 @@ ManagedDisksSettings = collections.namedtuple(
|
|||
)
|
||||
VirtualNetworkSettings = collections.namedtuple(
|
||||
'VirtualNetworkSettings', [
|
||||
'name', 'address_space', 'subnet_name', 'subnet_address_prefix',
|
||||
'existing_ok', 'create_nonexistant',
|
||||
'name', 'resource_group', 'address_space', 'subnet_name',
|
||||
'subnet_address_prefix', 'existing_ok', 'create_nonexistant',
|
||||
]
|
||||
)
|
||||
FileServerSettings = collections.namedtuple(
|
||||
|
@ -718,7 +718,6 @@ def credentials_batch(config):
|
|||
account = _kv_read_checked(conf, 'account')
|
||||
account_key = _kv_read_checked(conf, 'account_key')
|
||||
account_service_url = conf['account_service_url']
|
||||
user_subscription = _kv_read(conf, 'user_subscription', False)
|
||||
resource_group = _kv_read_checked(conf, 'resource_group')
|
||||
# get subscription id from management section
|
||||
try:
|
||||
|
@ -749,7 +748,6 @@ def credentials_batch(config):
|
|||
account=account,
|
||||
account_key=account_key,
|
||||
account_service_url=conf['account_service_url'],
|
||||
user_subscription=user_subscription,
|
||||
resource_group=resource_group,
|
||||
location=location,
|
||||
subscription_id=subscription_id,
|
||||
|
@ -1804,6 +1802,22 @@ def job_max_task_retries(conf):
|
|||
return max_task_retries
|
||||
|
||||
|
||||
def job_allow_run_on_missing(conf):
|
||||
# type: (dict) -> bool
|
||||
"""Get allow task run on missing image
|
||||
:param dict conf: job configuration object
|
||||
:rtype: bool
|
||||
:return: allow run on missing image
|
||||
"""
|
||||
try:
|
||||
allow = conf['allow_run_on_missing_image']
|
||||
if allow is None:
|
||||
raise KeyError()
|
||||
except KeyError:
|
||||
allow = False
|
||||
return allow
|
||||
|
||||
|
||||
def has_depends_on_task(conf):
|
||||
# type: (dict) -> bool
|
||||
"""Determines if task has task dependencies
|
||||
|
@ -1825,7 +1839,7 @@ def has_depends_on_task(conf):
|
|||
def is_multi_instance_task(conf):
|
||||
# type: (dict) -> bool
|
||||
"""Determines if task is multi-isntance
|
||||
:param dict conf: job configuration object
|
||||
:param dict conf: task configuration object
|
||||
:rtype: bool
|
||||
:return: task is multi-instance
|
||||
"""
|
||||
|
@ -1835,7 +1849,7 @@ def is_multi_instance_task(conf):
|
|||
def task_name(conf):
|
||||
# type: (dict) -> str
|
||||
"""Get task name
|
||||
:param dict conf: job configuration object
|
||||
:param dict conf: task configuration object
|
||||
:rtype: str
|
||||
:return: task name
|
||||
"""
|
||||
|
@ -1848,10 +1862,26 @@ def task_name(conf):
|
|||
return name
|
||||
|
||||
|
||||
def task_docker_image(conf):
|
||||
# type: (dict) -> str
|
||||
"""Get docker image used by task
|
||||
:param dict conf: task configuration object
|
||||
:rtype: str
|
||||
:return: docker image used by task
|
||||
"""
|
||||
try:
|
||||
di = conf['image']
|
||||
if util.is_none_or_empty(di):
|
||||
raise KeyError()
|
||||
except KeyError:
|
||||
di = None
|
||||
return di
|
||||
|
||||
|
||||
def set_task_name(conf, name):
|
||||
# type: (dict, str) -> None
|
||||
"""Set task name
|
||||
:param dict conf: job configuration object
|
||||
:param dict conf: task configuration object
|
||||
:param str name: task name to set
|
||||
"""
|
||||
conf['name'] = name
|
||||
|
@ -1860,7 +1890,7 @@ def set_task_name(conf, name):
|
|||
def task_id(conf):
|
||||
# type: (dict) -> str
|
||||
"""Get task id
|
||||
:param dict conf: job configuration object
|
||||
:param dict conf: task configuration object
|
||||
:rtype: str
|
||||
:return: task id
|
||||
"""
|
||||
|
@ -1876,18 +1906,21 @@ def task_id(conf):
|
|||
def set_task_id(conf, id):
|
||||
# type: (dict, str) -> None
|
||||
"""Set task id
|
||||
:param dict conf: job configuration object
|
||||
:param dict conf: task configuration object
|
||||
:param str id: task id to set
|
||||
"""
|
||||
conf['id'] = id
|
||||
|
||||
|
||||
def task_settings(pool, config, conf):
|
||||
# type: (azure.batch.models.CloudPool, dict, dict) -> TaskSettings
|
||||
def task_settings(cloud_pool, config, poolconf, conf, missing_images):
|
||||
# type: (azure.batch.models.CloudPool, dict, PoolSettings,
|
||||
# dict, list) -> TaskSettings
|
||||
"""Get task settings
|
||||
:param azure.batch.models.CloudPool pool: cloud pool object
|
||||
:param azure.batch.models.CloudPool cloud_pool: cloud pool object
|
||||
:param dict config: configuration dict
|
||||
:param dict conf: job configuration object
|
||||
:param PoolSettings poolconf: pool settings
|
||||
:param dict conf: task configuration object
|
||||
:param list missing_images: list of missing docker images on pool
|
||||
:rtype: TaskSettings
|
||||
:return: task settings
|
||||
"""
|
||||
|
@ -1898,11 +1931,36 @@ def task_settings(pool, config, conf):
|
|||
image = conf['image']
|
||||
if util.is_none_or_empty(image):
|
||||
raise ValueError('image is invalid')
|
||||
# check if image is in missing image list
|
||||
if image in missing_images:
|
||||
# get private registry settings
|
||||
preg = docker_registry_private_settings(config)
|
||||
if util.is_not_empty(preg.storage_account):
|
||||
registry = 'localhost:5000/'
|
||||
elif util.is_not_empty(preg.server):
|
||||
registry = '{}/'.format(preg.server)
|
||||
else:
|
||||
registry = ''
|
||||
del preg
|
||||
image = '{}{}'.format(registry, image)
|
||||
# get some pool props
|
||||
publisher = pool.virtual_machine_configuration.image_reference.\
|
||||
publisher.lower()
|
||||
offer = pool.virtual_machine_configuration.image_reference.offer.lower()
|
||||
sku = pool.virtual_machine_configuration.image_reference.sku.lower()
|
||||
if cloud_pool is None:
|
||||
pool_id = poolconf.id
|
||||
publisher = poolconf.publisher.lower()
|
||||
offer = poolconf.offer.lower()
|
||||
sku = poolconf.sku.lower()
|
||||
vm_size = poolconf.vm_size
|
||||
inter_node_comm = poolconf.inter_node_communication_enabled
|
||||
else:
|
||||
pool_id = cloud_pool.id
|
||||
publisher = cloud_pool.virtual_machine_configuration.image_reference.\
|
||||
publisher.lower()
|
||||
offer = cloud_pool.virtual_machine_configuration.image_reference.\
|
||||
offer.lower()
|
||||
sku = cloud_pool.virtual_machine_configuration.image_reference.sku.\
|
||||
lower()
|
||||
vm_size = cloud_pool.vm_size.lower()
|
||||
inter_node_comm = cloud_pool.enable_inter_node_communication
|
||||
# get depends on
|
||||
try:
|
||||
depends_on = conf['depends_on']
|
||||
|
@ -2088,10 +2146,10 @@ def task_settings(pool, config, conf):
|
|||
gpu = False
|
||||
# adjust for gpu settings
|
||||
if gpu:
|
||||
if not is_gpu_pool(pool.vm_size):
|
||||
if not is_gpu_pool(vm_size):
|
||||
raise RuntimeError(
|
||||
('cannot initialize a gpu task on nodes without '
|
||||
'gpus, pool: {} vm_size: {}').format(pool.id, pool.vm_size))
|
||||
'gpus, pool: {} vm_size: {}').format(pool_id, vm_size))
|
||||
# TODO other images as they become available with gpu support
|
||||
if (publisher != 'canonical' and offer != 'ubuntuserver' and
|
||||
sku < '16.04'):
|
||||
|
@ -2107,16 +2165,16 @@ def task_settings(pool, config, conf):
|
|||
docker_exec_cmd = 'docker exec'
|
||||
# adjust for infiniband
|
||||
if infiniband:
|
||||
if not pool.enable_inter_node_communication:
|
||||
if not inter_node_comm:
|
||||
raise RuntimeError(
|
||||
('cannot initialize an infiniband task on a '
|
||||
'non-internode communication enabled '
|
||||
'pool: {}').format(pool.id))
|
||||
if not is_rdma_pool(pool.vm_size):
|
||||
'pool: {}').format(pool_id))
|
||||
if not is_rdma_pool(vm_size):
|
||||
raise RuntimeError(
|
||||
('cannot initialize an infiniband task on nodes '
|
||||
'without RDMA, pool: {} vm_size: {}').format(
|
||||
pool.id, pool.vm_size))
|
||||
pool_id, vm_size))
|
||||
# only centos-hpc and sles-hpc:12-sp1 are supported
|
||||
# for infiniband
|
||||
if publisher == 'openlogic' and offer == 'centos-hpc':
|
||||
|
@ -2147,7 +2205,7 @@ def task_settings(pool, config, conf):
|
|||
run_opts.append('--env-file {}'.format(envfile))
|
||||
# populate mult-instance settings
|
||||
if is_multi_instance_task(conf):
|
||||
if not pool.enable_inter_node_communication:
|
||||
if not inter_node_comm:
|
||||
raise RuntimeError(
|
||||
('cannot run a multi-instance task on a '
|
||||
'non-internode communication enabled '
|
||||
|
@ -2194,7 +2252,12 @@ def task_settings(pool, config, conf):
|
|||
if num_instances == 'pool_specification_vm_count':
|
||||
num_instances = pool_vm_count(config)
|
||||
elif num_instances == 'pool_current_dedicated':
|
||||
num_instances = pool.current_dedicated
|
||||
if cloud_pool is None:
|
||||
raise RuntimeError(
|
||||
('Cannot retrieve current dedicated count for '
|
||||
'pool: {}. Ensure pool exists.)'.format(pool_id)))
|
||||
else:
|
||||
num_instances = cloud_pool.current_dedicated
|
||||
else:
|
||||
raise ValueError(
|
||||
('multi instance num instances setting '
|
||||
|
@ -2267,6 +2330,7 @@ def virtual_network_settings(
|
|||
except KeyError:
|
||||
conf = {}
|
||||
name = _kv_read_checked(conf, 'name')
|
||||
resource_group = _kv_read_checked(conf, 'resource_group')
|
||||
address_space = _kv_read_checked(conf, 'address_space')
|
||||
existing_ok = _kv_read(conf, 'existing_ok', default_existing_ok)
|
||||
subnet_name = _kv_read_checked(conf['subnet'], 'name')
|
||||
|
@ -2275,6 +2339,7 @@ def virtual_network_settings(
|
|||
conf, 'create_nonexistant', default_create_nonexistant)
|
||||
return VirtualNetworkSettings(
|
||||
name=name,
|
||||
resource_group=resource_group,
|
||||
address_space=address_space,
|
||||
subnet_name=subnet_name,
|
||||
subnet_address_prefix=subnet_address_prefix,
|
||||
|
@ -2331,9 +2396,9 @@ def remotefs_settings(config):
|
|||
)
|
||||
if not isinstance(sc_ns_inbound['nfs'].source_address_prefix, list):
|
||||
raise ValueError('expected list for nfs network security rule')
|
||||
if 'custom_inbound' in ns_conf:
|
||||
if 'custom_inbound_rules' in ns_conf:
|
||||
_reserved = frozenset(['ssh', 'nfs', 'glusterfs'])
|
||||
for key in ns_conf['custom_inbound']:
|
||||
for key in ns_conf['custom_inbound_rules']:
|
||||
# ensure key is not reserved
|
||||
if key.lower() in _reserved:
|
||||
raise ValueError(
|
||||
|
@ -2341,11 +2406,13 @@ def remotefs_settings(config):
|
|||
'reserved name {}').format(key, _reserved))
|
||||
sc_ns_inbound[key] = InboundNetworkSecurityRule(
|
||||
destination_port_range=_kv_read_checked(
|
||||
ns_conf['custom_inbound'][key], 'destination_port_range'),
|
||||
ns_conf['custom_inbound_rules'][key],
|
||||
'destination_port_range'),
|
||||
source_address_prefix=_kv_read_checked(
|
||||
ns_conf['custom_inbound'][key], 'source_address_prefix'),
|
||||
ns_conf['custom_inbound_rules'][key],
|
||||
'source_address_prefix'),
|
||||
protocol=_kv_read_checked(
|
||||
ns_conf['custom_inbound'][key], 'protocol'),
|
||||
ns_conf['custom_inbound_rules'][key], 'protocol'),
|
||||
)
|
||||
if not isinstance(sc_ns_inbound[key].source_address_prefix, list):
|
||||
raise ValueError(
|
||||
|
|
|
@ -7,10 +7,12 @@ Batch Shipyard is driven by the following json configuration files:
1. [Credentials](11-batch-shipyard-configuration-credentials.md) -
credentials for Azure Batch, Storage, KeyVault, Management and Docker private
registries
2. [Global config](#global) - Batch Shipyard and Docker-specific configuration
settings
3. [Pool](#pool) - Azure Batch pool configuration
4. [Jobs](#jobs) - Azure Batch jobs and tasks configuration
2. [Global config](12-batch-shipyard-configuration-global.md) -
Batch Shipyard and Docker-specific configuration settings
3. [Pool](13-batch-shipyard-configuration-pool.md) -
Batch Shipyard pool configuration
4. [Jobs](14-batch-shipyard-configuration-jobs.md) -
Batch Shipyard jobs and tasks configuration

Note that all potential properties are described here and that specifying
all such properties may result in invalid configuration as some properties
@ -26,940 +28,5 @@ may be invalid if specified as such. They must be modified for your execution
|
|||
scenario. All [sample recipe](../recipes) also have a set of configuration
|
||||
files that can be modified to fit your needs.
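These four files are consumed together: the CLI merges them into a single configuration dict that the `convoy.settings` helpers used throughout this commit read from. A rough sketch of that flow, assuming plain `json.load` and a simple top-level merge (the actual merge in shipyard.py may differ):

```python
import json

def load_config(credentials_path, config_path, pool_path, jobs_path):
    """Merge the credentials, global, pool and jobs json into one dict."""
    merged = {}
    for path in (credentials_path, config_path, pool_path, jobs_path):
        with open(path, 'r') as f:
            merged.update(json.load(f))
    return merged

# config = load_config('credentials.json', 'config.json', 'pool.json', 'jobs.json')
# then: settings.credentials_batch(config), settings.pool_settings(config), ...
```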
|
||||
|
||||
### <a name="global"></a>Global Config
|
||||
The global config schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"batch_shipyard": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"storage_entity_prefix": "shipyard",
|
||||
"generated_sas_expiry_days": 90,
|
||||
"encryption" : {
|
||||
"enabled": true,
|
||||
"pfx": {
|
||||
"filename": "encrypt.pfx",
|
||||
"passphrase": "mysupersecretpassword",
|
||||
"sha1_thumbprint": "123456789..."
|
||||
},
|
||||
"public_key_pem": "encrypt.pem"
|
||||
}
|
||||
},
|
||||
"docker_registry": {
|
||||
"private": {
|
||||
"allow_public_docker_hub_pull_on_missing": true,
|
||||
"server": "myserver-myorg.azurecr.io",
|
||||
"azure_storage": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "mydockerregistry"
|
||||
}
|
||||
}
|
||||
},
|
||||
"data_replication": {
|
||||
"peer_to_peer": {
|
||||
"enabled": true,
|
||||
"compression": true,
|
||||
"concurrent_source_downloads": 10,
|
||||
"direct_download_seed_bias": null
|
||||
},
|
||||
"non_peer_to_peer_concurrent_downloading": true
|
||||
},
|
||||
"global_resources": {
|
||||
"docker_images": [
|
||||
"busybox",
|
||||
"redis:3.2.3-alpine",
|
||||
],
|
||||
"files": [
|
||||
{
|
||||
"source": {
|
||||
"path": "/some/local/path/dir",
|
||||
"include": ["*.dat"],
|
||||
"exclude": ["*.bak"]
|
||||
},
|
||||
"destination": {
|
||||
"shared_data_volume": "glustervol",
|
||||
"relative_destination_path": "myfiles",
|
||||
"data_transfer": {
|
||||
"method": "multinode_scp",
|
||||
"ssh_private_key": "id_rsa_shipyard",
|
||||
"scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com",
|
||||
"rsync_extra_options": "",
|
||||
"split_files_megabytes": 500,
|
||||
"max_parallel_transfers_per_node": 2
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"path": "/some/local/path/bound/for/blob",
|
||||
"include": ["*.bin"]
|
||||
},
|
||||
"destination": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"data_transfer": {
|
||||
"container": "mycontainer",
|
||||
"blobxfer_extra_options": "--no-computefilemd5"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"path": "/another/local/path/dir",
|
||||
"include": [],
|
||||
"exclude": []
|
||||
},
|
||||
"destination": {
|
||||
"relative_destination_path": "relpath/on/host",
|
||||
"data_transfer": {
|
||||
"method": "rsync+ssh",
|
||||
"ssh_private_key": "id_rsa_shipyard",
|
||||
"scp_ssh_extra_options": "-c aes256-gcm@openssh.com",
|
||||
"rsync_extra_options": "-v"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"docker_volumes": {
|
||||
"data_volumes": {
|
||||
"abcvol": {
|
||||
"host_path": null,
|
||||
"container_path": "/abc"
|
||||
},
|
||||
"hosttempvol": {
|
||||
"host_path": "/tmp",
|
||||
"container_path": "/hosttmp"
|
||||
}
|
||||
},
|
||||
"shared_data_volumes": {
|
||||
"shipyardvol": {
|
||||
"volume_driver": "azurefile",
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"azure_file_share_name": "shipyardshared",
|
||||
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile",
|
||||
"mount_options": [
|
||||
"filemode=0777",
|
||||
"dirmode=0777",
|
||||
"nolock=true"
|
||||
]
|
||||
},
|
||||
"glustervol": {
|
||||
"volume_driver": "glusterfs_on_compute",
|
||||
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs",
|
||||
"volume_type": "replica",
|
||||
"volume_options": [
|
||||
"performance.cache-size 1 GB",
|
||||
"performance.cache-max-file-size 10 MB",
|
||||
"performance.cache-refresh-timeout 61",
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `batch_shipyard` property is used to configure general settings for the tool.
|
||||
* (required) `storage_account_settings` is a link to the alias of the storage
|
||||
account specified, in this case, it is `mystorageaccount`. Batch shipyard
|
||||
requires a storage account for storing metadata in order to execute across a
|
||||
distributed environment.
|
||||
* (optional) `storage_entity_prefix` property is used as a generic qualifier
|
||||
to prefix storage containers (blob containers, tables, queues) with. If not
|
||||
specified, defaults to `shipyard`.
|
||||
* (optional) `generated_sas_expiry_days` property is used to set the number of
|
||||
days any generated SAS key by Batch Shipyard is valid for. The default is 30
|
||||
days. This is useful if you have long-lived pools and want to ensure that
|
||||
SAS keys are valid for longer periods of time.
|
||||
* (optional) `encryption` object is used to define credential encryption which
|
||||
contains the following members:
|
||||
* (required) `enabled` property enables or disables this feature.
|
||||
* (required) `pfx` object defines the PFX certificate
|
||||
* (required) `filename` property is the full path and name to the PFX
|
||||
certificate
|
||||
* (required) `passphrase` property is the passphrase for the PFX
|
||||
certificate. This cannot be empty.
|
||||
* (optional) `sha1_thumbprint` is the SHA1 thumbprint of the
|
||||
certificate. If the PFX file is created using the `cert create` command,
|
||||
then the SHA1 thumbprint is output. It is recommended to populate this
|
||||
property such that it does not have to be generated when needed for
|
||||
encryption.
|
||||
* (optional) `public_key_pem` property is the full path and name to the
|
||||
RSA public key in PEM format. If the PFX file is created using the
|
||||
`cert create` command, then this file is generated along with the PFX
|
||||
file. It is recommended to populate this property with the PEM file path
|
||||
such that it does not have to be generated when needed for encryption.
|
||||
|
||||
The `docker_registry` property is used to configure Docker image distribution
|
||||
options from public/private Docker hub and private registries.
|
||||
* (optional) `private` property controls settings for interacting with private
|
||||
registries. There are three kinds of private registries that are supported:
|
||||
(1) private registries hosted on Docker Hub, (2) Internet accessible
|
||||
registries such as those hosted by the
|
||||
[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/)
|
||||
service and (3) [private registry instances backed to
|
||||
Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/)
|
||||
and are run on compute nodes. To use private registries hosted on Docker Hub,
|
||||
no additional properties need to be specified here; instead, specify your
|
||||
Docker Hub login information in the credentials json. To specify a private
|
||||
registry other than on Docker Hub, a json property named `server` should be
|
||||
defined. To use a private registry backed by Azure Blob Storage, define a
|
||||
json object named `azure_storage`. Note that a maximum of only one of these
|
||||
three types of private registries may be specified at once. The following
|
||||
describes members of the non-Docker Hub private registries supported:
|
||||
* (optional) `server` object is a property that is the fully-qualified host
|
||||
name to a private registry server. A specific port other than 80 can be
|
||||
specified using a `:` separator, e.g.,
|
||||
`mydockerregistry.com:8080`. Port 80 is the default if no port is
|
||||
specified. The value of this property should have an associated login
|
||||
in the credentials json file.
|
||||
* (optional) `azure_storage` object is to define settings for connecting
|
||||
to a private registry backed by Azure Storage blobs and where the
|
||||
private registry instances are hosted on the compute nodes themselves.
|
||||
* (required) `storage_account_settings` is a link to the alias of the
|
||||
storage account specified that stores the private registry blobs.
|
||||
* (required) `container` property is the name of the Azure Blob
|
||||
container holding the private registry blobs.
|
||||
* (optional) `allow_public_docker_hub_pull_on_missing` property allows
|
||||
pass-through of Docker image retrieval to public Docker Hub if it is
|
||||
missing in the private registry. This defaults to `false` if not
|
||||
specified.
|
||||
|
||||
The `data_replication` property is used to configure the internal image
|
||||
replication mechanism between compute nodes within a compute pool. The
|
||||
`non_peer_to_peer_concurrent_downloading` property specifies if it is ok
|
||||
to allow unfettered concurrent downloading from the source registry among
|
||||
all compute nodes. The following options apply to `peer_to_peer` data
|
||||
replication options:
|
||||
* (optional) `enabled` property enables or disables private peer-to-peer
|
||||
transfer. Note that for compute pools with a relatively small number of VMs,
|
||||
peer-to-peer transfer may not provide any benefit and is recommended to be
|
||||
disabled in these cases. Compute pools with large number of VMs and especially
|
||||
in the case of an Azure Storage-backed private registry can benefit from
|
||||
peer-to-peer image replication.
|
||||
* `compression` property enables or disables compression of image files. It
|
||||
is strongly recommended to keep this enabled.
|
||||
* `concurrent_source_downloads` property specifies the number of
|
||||
simultaneous downloads allowed to each image.
|
||||
* `direct_download_seed_bias` property sets the number of direct download
|
||||
seeds to prefer per image before switching to peer-to-peer transfer.
|
||||
|
||||
The `global_resources` property contains information regarding required
|
||||
Docker images, volume configuration and data ingress information. This
|
||||
property is required.
|
||||
|
||||
`docker_images` is an array of docker images that should be installed on
|
||||
every compute node when this configuration file is supplied while creating
|
||||
a compute pool. Image tags are supported. Image names should not include
|
||||
private registry server names, as these will be automatically prepended. For
|
||||
instance, if you have an image `abc/mytag` on your private registry
|
||||
`myregistry-myorg.azurecr.io`, your image should be named in the
|
||||
`docker_images` array as `abc/mytag` and not
|
||||
`myregistry-myorg.azurecr.io/abc/mytag`.
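The automatic prepending mirrors the `task_settings` change earlier in this commit, where images not pre-loaded on the pool get the private registry location attached at task construction time. A minimal sketch; the two registry flavors follow the `azure_storage`-backed (localhost:5000) and `server` cases described above:

```python
def qualify_image(image, preg_storage_account, preg_server):
    """Prepend the private registry location to a bare image name."""
    if preg_storage_account:
        # registry instances backed by Azure Storage run on the compute nodes
        registry = 'localhost:5000/'
    elif preg_server:
        registry = '{}/'.format(preg_server)
    else:
        registry = ''
    return '{}{}'.format(registry, image)

# qualify_image('abc/mytag', None, 'myregistry-myorg.azurecr.io')
# -> 'myregistry-myorg.azurecr.io/abc/mytag'
```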
|
||||
|
||||
`files` is an optional property that specifies data that should be ingressed
|
||||
from a location accessible by the local machine (i.e., the machine invoking
`shipyard.py`) to a shared file system location accessible by compute nodes
in the pool, or to Azure Blob or File Storage. `files` is a json list of objects,
|
||||
which allows for multiple sources to destinations to be ingressed during the
|
||||
same invocation. Note that no Azure Batch environment variables
|
||||
(i.e., `$AZ_BATCH_`-style environment variables) are available as path
|
||||
arguments since ingress actions performed within `files` are done locally
|
||||
on the machine invoking `shipyard.py`. Each object within the `files` list
|
||||
contains the following members:
|
||||
* (required) `source` property contains the following members:
|
||||
* (required) `path` is a local path. A single file or a directory
|
||||
can be specified. Filters below will be ignored if `path` is a file and
|
||||
not a directory.
|
||||
* (optional) `include` is an array of
[Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
where only files matching a filter are included in the data transfer
(see the filter sketch after this list).
Filters specified in `include` have precedence over `exclude` described
next. `include` can only have a maximum of 1 filter for ingress to Azure
Blob Storage. In this example, all files ending in `.dat` are ingressed.
|
||||
* (optional) `exclude` is an array of
|
||||
[Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
|
||||
where files matching a filter are excluded from the data transfer. Filters
|
||||
specified in `include` have precedence over filters specified in
|
||||
`exclude`. `exclude` cannot be specified for ingress into Azure Blob
|
||||
Storage. In this example, all files ending in `.bak` are skipped for
|
||||
ingress.
|
||||
* (required) `destination` property contains the following members:
|
||||
* (required or optional) `shared_data_volume` or `storage_account_settings`
|
||||
for data ingress to a GlusterFS volume or Azure Blob or File Storage. If
|
||||
you are ingressing to a pool with only one compute node, you may omit
|
||||
`shared_data_volume`. Otherwise, you may specify one or the other, but
|
||||
not both in the same object. Please see below in the
|
||||
`shared_data_volumes` for information on how to set up a GlusterFS share.
|
||||
* (required or optional) `relative_destination_path` specifies a relative
|
||||
destination path to place the files, with respect to the target root.
|
||||
If transferring to a `shared_data_volume` then this is relative to the
|
||||
GlusterFS volume root. If transferring to a pool with one single node in
|
||||
it, thus, no `shared_data_volume` is specified in the prior property, then
|
||||
this is relative to
|
||||
[$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories).
|
||||
To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended),
|
||||
you can specify this property as empty string when not ingressing to
|
||||
a `shared_data_volume`. Note that if `scp` is selected while attempting
|
||||
to transfer directly to this aforementioned path, then `scp` will fail
|
||||
with exit code of 1 but the transfer will have succeeded (this is due
|
||||
to some of the permission options). If this property is not specified for
|
||||
a `shared_data_volume`, then files will be placed directly in the
|
||||
GlusterFS volume root. This property cannot be specified for a Azure
|
||||
Storage destination (i.e., `storage_account_settings`).
|
||||
* (required) `data_transfer` specifies how the transfer should take place.
|
||||
The following list contains members for GlusterFS ingress when a GlusterFS
|
||||
volume is provided for `shared_data_volume` (see below for ingressing to
|
||||
Azure Blob or File Storage):
|
||||
* (required) `method` specified which method should be used to ingress
|
||||
data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
|
||||
`multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
|
||||
directory (recursively) to the remote share path. `multinode_scp` will
|
||||
attempt to simultaneously transfer files to many compute nodes using
|
||||
`scp` at the same time to speed up data transfer. `rsync+ssh` will
|
||||
perform an rsync of files through SSH. `multinode_rsync+ssh` will
|
||||
attempt to simultaneously transfer files using `rsync` to many compute
|
||||
nodes at the same time to speed up data transfer. Note that you may
|
||||
specify the `multinode_*` methods even with only 1 compute node in a
|
||||
pool which will allow you to take advantage of
|
||||
`max_parallel_transfers_per_node` below.
|
||||
* (optional) `ssh_private_key` location of the SSH private key for the
|
||||
username specified in the `pool_specification`:`ssh` section when
|
||||
connecting to compute nodes. The default is `id_rsa_shipyard`, if
|
||||
omitted, which is automatically generated if no SSH key is specified
|
||||
when an SSH user is added to a pool.
|
||||
* (optional) `scp_ssh_extra_options` are any extra options to pass to
|
||||
`scp` or `ssh` for `scp`/`multinode_scp` or
|
||||
`rsync+ssh`/`multinode_rsync+ssh` methods, respectively. In the example
|
||||
above, `-C` enables compression and `-c aes256-gcm@openssh.com`
|
||||
is passed to `scp`, which can potentially increase the transfer speed by
|
||||
selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel
|
||||
AES-NI.
|
||||
* (optional) `rsync_extra_options` are any extra options to pass to
|
||||
`rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This
|
||||
property is ignored for non-rsync transfer methods.
|
||||
* (optional) `split_files_megabytes` splits files into chunks with the
|
||||
specified size in MiB. This can potentially help with very large files.
|
||||
This option forces the transfer `method` to `multinode_scp`.
|
||||
Note that the destination file system must be able to accommodate
|
||||
up to 2x the size of files which are split. Additionally, transfers
|
||||
involving files which are split will incur reconstruction costs after
|
||||
the transfer is complete, which will increase the total end-to-end
|
||||
ingress time. However, in certain scenarios, by splitting files and
|
||||
transferring chunks in parallel along with reconstruction may end up
|
||||
being faster than transferring a large file without chunking.
|
||||
* (optional) `max_parallel_transfers_per_node` is the maximum number of
|
||||
parallel transfers to invoke per node with the
|
||||
`multinode_scp`/`multinode_rsync+ssh` methods. For example, if there
|
||||
are 3 compute nodes in the pool, and `2` is given for this option, then
|
||||
there will be up to 2 scp sessions in parallel per compute node for a
|
||||
maximum of 6 concurrent scp sessions to the pool. The default is 1 if
|
||||
not specified or omitted.
|
||||
* (required) `data_transfer` specifies how the transfer should take place.
|
||||
When Azure Blob or File Storage is selected as the destination for data
|
||||
ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The
|
||||
following list contains members for Azure Blob or File Storage ingress
|
||||
when a storage account link is provided for `storage_account_settings`:
|
||||
* (required) `container` or `file_share` is required when uploading to
|
||||
Azure Blob Storage or Azure File Storage, respectively. `container`
|
||||
specifies which container to upload to for Azure Blob Storage while
|
||||
`file_share` specifies which file share to upload to for Azure File
|
||||
Storage. Only one of these properties can be specified per
|
||||
`data_transfer` object. The container or file share need not be created
|
||||
beforehand.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`. In the example above, `--no-computefilemd5` will force
|
||||
`blobxfer` to skip MD5 calculation on files ingressed.
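The include/exclude semantics above (see the bullet on `include`) map onto Python's `fnmatch` module that the links point to. One plausible reading of the precedence rule, as a simplification of the real transfer code:

```python
from fnmatch import fnmatch

def selected(filename, include, exclude):
    """Decide whether a file is transferred; include wins over exclude."""
    if any(fnmatch(filename, pat) for pat in include or []):
        return True
    if any(fnmatch(filename, pat) for pat in exclude or []):
        return False
    # no include list means everything not excluded is transferred;
    # with an include list present, unmatched files are skipped
    return not include

# selected('run1.dat', ['*.dat'], ['*.bak'])  -> True
# selected('notes.txt', ['*.dat'], None)      -> False
```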
|
||||
|
||||
`docker_volumes` is an optional property that can consist of two
|
||||
different types of volumes: `data_volumes` and `shared_data_volumes`.
|
||||
`data_volumes` can be of two flavors depending upon if `host_path` is set to
|
||||
null or not. In the former, this is typically used with the `VOLUME` keyword
|
||||
in Dockerfiles to initialize a data volume with existing data inside the
|
||||
image. If `host_path` is set, then the path on the host is mounted in the
|
||||
container at the path specified with `container_path`.
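In docker terms, the two `data_volumes` flavors translate into different `-v` arguments on the container run command. A hedged sketch of that mapping (Batch Shipyard's actual run-option assembly lives in `task_settings`; this is only illustrative):

```python
def data_volume_args(data_volumes):
    """Translate data_volumes config entries into docker run -v options."""
    args = []
    for vol in data_volumes.values():
        host_path = vol.get('host_path')
        container_path = vol['container_path']
        if host_path:
            # bind-mount the host directory into the container
            args.append('-v {}:{}'.format(host_path, container_path))
        else:
            # no host_path: let Docker initialize the volume at the container
            # path (VOLUME-keyword semantics); one possible mapping
            args.append('-v {}'.format(container_path))
    return args

# data_volume_args({'hosttempvol': {'host_path': '/tmp', 'container_path': '/hosttmp'}})
# -> ['-v /tmp:/hosttmp']
```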
|
||||
|
||||
`shared_data_volumes` is an optional property for initializing persistent
|
||||
shared storage volumes. In the first shared volume, `shipyardvol` is the alias
|
||||
of this volume:
|
||||
* `volume_driver` property specifies the Docker Volume Driver to use.
|
||||
Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or
|
||||
`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker
|
||||
Volume Driver. For this volume (`shipyardvol`), as this is an Azure File
|
||||
shared volume, the `volume_driver` should be set as `azurefile`.
|
||||
* `storage_account_settings` is a link to the alias of the storage account
|
||||
specified that holds this Azure File Share.
|
||||
* `azure_file_share_name` is the name of the file share on Azure Files. Note
|
||||
that the Azure File share must be created beforehand, the toolkit does not
|
||||
create Azure File shares, it only mounts them to the compute nodes.
|
||||
* `container_path` is the path in the container to mount.
|
||||
* `mount_options` are the mount options to pass to the mount command. Supported
|
||||
options are documented
|
||||
[here](https://github.com/Azure/azurefile-dockervolumedriver). It is
|
||||
recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and
|
||||
`gid` cannot be reliably determined before the compute pool is allocated and
|
||||
this volume will be mounted as the root user.
|
||||
|
||||
Note that when using `azurefile` for a shared data volume, the storage account
|
||||
that holds the file share must reside within the same Azure region as the
|
||||
Azure Batch compute pool. Attempting to mount an Azure File share that is
|
||||
cross-region will result in failure as current Linux Samba clients do not
|
||||
support share level encryption at this time.
|
||||
|
||||
The second shared volume, `glustervol`, is a
|
||||
[GlusterFS](https://www.gluster.org/) network file system. Please note that
|
||||
`glusterfs_on_compute` are GlusterFS volumes co-located on the VM's temporary
|
||||
local disk space which is a shared resource. Sizes of the local temp disk for
|
||||
each VM size can be found
|
||||
[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/).
|
||||
If specifying a `glusterfs_on_compute` volume, you must enable internode
|
||||
communication in the pool configuration file. These volumes have the following
|
||||
properties:
|
||||
* (required) `volume_driver` property should be set as `glusterfs_on_compute`.
|
||||
* (required) `container_path` is the path in the container to mount.
|
||||
* (optional) `volume_type` property defines the GlusterFS volume type.
|
||||
Currently, `replica` is the only supported type.
|
||||
* (optional) `volume_options` property defines additional GlusterFS volume
|
||||
options to set.
|
||||
|
||||
`glusterfs_on_compute` volumes are mounted on the host at
|
||||
`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically
|
||||
replace container path references in direct and storage-based data
|
||||
ingress/egress with their host path equivalents.
|
||||
|
||||
Note that when resizing a pool with a `glusterfs_on_compute` shared file
|
||||
systems that you must resize with the `pool resize` command in `shipyard.py`
|
||||
and not with Azure Portal, Batch Explorer or any other tool.
|
||||
|
||||
Finally, note that all `docker_volumes` can be omitted completely along with
|
||||
one or all of `data_volumes` and `shared_data_volumes` if you do not require
|
||||
this functionality.
|
||||
|
||||
An example global config json template can be found
|
||||
[here](../config\_templates/config.json).
|
||||
|
||||
### <a name="pool"></a>Pool
|
||||
The pool schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"pool_specification": {
|
||||
"id": "dockerpool",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_count": 10,
|
||||
"max_tasks_per_node": 1,
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"reboot_on_start_task_failed": true,
|
||||
"block_until_all_global_resources_loaded": true,
|
||||
"transfer_files_on_pool_creation": false,
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "jobonanotherpool",
|
||||
"task_id": "mytask",
|
||||
"include": ["wd/*.dat"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool"
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "poolcontainer",
|
||||
"include": ["pooldata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"ssh": {
|
||||
"username": "docker",
|
||||
"expiry_days": 7,
|
||||
"ssh_public_key": null,
|
||||
"generate_docker_tunnel_script": true,
|
||||
"generated_file_export_path": null,
|
||||
"hpn_server_swap": false
|
||||
},
|
||||
"gpu": {
|
||||
"nvidia_driver": {
|
||||
"source": "https://some.url"
|
||||
}
|
||||
},
|
||||
"additional_node_prep_commands": [
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `pool_specification` property has the following members:
|
||||
* (required) `id` is the compute pool ID.
|
||||
* (required) `vm_size` is the
|
||||
[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/).
|
||||
Please note that not all regions have every VM size available.
|
||||
* (required) `vm_count` is the number of compute nodes to allocate.
|
||||
* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks
|
||||
that can be running at any one time on a compute node. This defaults to a
|
||||
value of 1 if not specified.
|
||||
* (optional) `inter_node_communication_enabled` designates if this pool is set
|
||||
up for inter-node communication. This must be set to `true` for any containers
|
||||
that must communicate with each other such as MPI applications. This property
|
||||
will be force enabled if peer-to-peer replication is enabled.
|
||||
* (required) `publisher` is the publisher name of the Marketplace VM image.
|
||||
* (required) `offer` is the offer name of the Marketplace VM image.
|
||||
* (required) `sku` is the sku name of the Marketplace VM image.
|
||||
* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the
|
||||
compute node in case there is a transient failure in node preparation (e.g.,
|
||||
network timeout, resolution failure or download problem). This defaults to
|
||||
`false`.
|
||||
* (optional) `block_until_all_global_resources_loaded` will block the node
|
||||
from entering ready state until all Docker images are loaded. This defaults
|
||||
to `true`.
|
||||
* (optional) `transfer_files_on_pool_creation` will ingress all `files`
|
||||
specified in the `global_resources` section of the configuration json when
|
||||
the pool is created. If files are to be ingressed to Azure Blob or File
|
||||
Storage, then data movement operations are overlapped with the creation of the
|
||||
pool. If files are to be ingressed to a shared file system on the compute
|
||||
nodes, then the files are ingressed after the pool is created and the shared
|
||||
file system is ready. Files can be ingressed to both Azure Blob Storage and a
|
||||
shared file system during the same pool creation invocation. If this property
|
||||
is set to `true` then `block_until_all_global_resources_loaded` will be force
|
||||
disabled. If omitted, this property defaults to `false`.
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed to all compute nodes as part of node preparation. It is
|
||||
important to note that if you are combining this action with `files` and
|
||||
are ingressing data to Azure Blob or File storage as part of pool creation,
|
||||
that the blob containers or file shares defined here will be downloaded as
|
||||
soon as the compute node is ready to do so. This may result in the blob
|
||||
container/blobs or file share/files not being ready in time for the
|
||||
`input_data` transfer. It is up to you to ensure that these two operations do
|
||||
not overlap. If there is a possibility of overlap, then you should ingress
|
||||
data defined in `files` prior to pool creation and disable the option above
|
||||
`transfer_files_on_pool_creation`. This object currently supports
|
||||
`azure_batch` and `azure_storage` as members.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (required) `destination` is the destination path to place the files
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it is only allowed to have 1
|
||||
maximum filter.
|
||||
* (required) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Please note that you should
|
||||
not specify a destination that is on a shared file system. If you
|
||||
require ingressing to a shared file system location like a GlusterFS
|
||||
volume, then use the global configuration `files` property and the
|
||||
`data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `ssh` is the property for creating a user to accommodate SSH
|
||||
sessions to compute nodes. If this property is absent, then an SSH user is not
|
||||
created with pool creation.
|
||||
* (required) `username` is the user to create on the compute nodes.
|
||||
* (optional) `expiry_days` is the number of days from now for the account on
|
||||
the compute nodes to expire. The default is 30 days from invocation time.
|
||||
* (optional) `ssh_public_key` is the path to an existing SSH public key to
|
||||
use. If not specified, an RSA public/private keypair will be automatically
|
||||
generated only on Linux. If this is `null` or not specified on Windows,
|
||||
the SSH user is not created.
|
||||
* (optional) `generate_docker_tunnel_script` property directs script to
|
||||
generate an SSH tunnel script that can be used to connect to the remote
|
||||
Docker engine running on a compute node.
|
||||
* (optional) `generated_file_export_path` is the path to export the
|
||||
generated RSA keypair and docker tunnel script to. If omitted, the
|
||||
current directory is used.
|
||||
* (experimental) `hpn_server_swap` property enables an OpenSSH server with
|
||||
[HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh)
|
||||
to be swapped with the standard distribution OpenSSH server. This is not
|
||||
supported on all Linux distributions and may be force disabled.
|
||||
* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances)
|
||||
`gpu` property defines additional information for NVIDIA GPU-enabled VMs:
|
||||
* `nvidia_driver` property contains the following required members:
|
||||
* `source` is the source url to download the driver.
|
||||
* (optional) `additional_node_prep_commands` is an array of additional commands
|
||||
to execute on the compute node host as part of node preparation. This can
|
||||
be empty or omitted.
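For example, a minimal sketch of a pool specification for an N-series GPU
pool might look like the following. The VM size, image, and driver URL below
are illustrative assumptions only; substitute a `source` URL for an NVIDIA
driver package that you are entitled to download and an image that is
supported for N-series VMs.

```json
{
  "pool_specification": {
    "id": "gpupool",
    "vm_size": "STANDARD_NV6",
    "vm_count": 2,
    "publisher": "Canonical",
    "offer": "UbuntuServer",
    "sku": "16.04-LTS",
    "ssh": {
      "username": "docker"
    },
    "gpu": {
      "nvidia_driver": {
        "source": "https://example.com/nvidia-driver-installer.run"
      }
    }
  }
}
```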
|
||||
|
||||
An example pool json template can be found
|
||||
[here](../config_templates/pool.json).
|
||||
|
||||
### <a name="jobs"></a>Jobs
|
||||
The jobs schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"job_specifications": [
|
||||
{
|
||||
"id": "dockerjob",
|
||||
"multi_instance_auto_complete": true,
|
||||
"environment_variables": {
|
||||
"abc": "xyz"
|
||||
},
|
||||
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
|
||||
"max_task_retries": 3,
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "someotherjob",
|
||||
"task_id": "task-a",
|
||||
"include": ["wd/*.dat"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": null
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "jobcontainer",
|
||||
"include": ["jobdata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"tasks": [
|
||||
{
|
||||
"id": null,
|
||||
"depends_on": [
|
||||
"taskid-a", "taskid-b", "taskid-c"
|
||||
],
|
||||
"depends_on_range": [
|
||||
1, 10
|
||||
],
|
||||
"image": "busybox",
|
||||
"name": null,
|
||||
"labels": [],
|
||||
"environment_variables": {
|
||||
"def": "123"
|
||||
},
|
||||
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv",
|
||||
"ports": [],
|
||||
"data_volumes": [
|
||||
"contdatavol",
|
||||
"hosttempvol"
|
||||
],
|
||||
"shared_data_volumes": [
|
||||
"azurefilevol"
|
||||
],
|
||||
"resource_files": [
|
||||
{
|
||||
"file_path": "",
|
||||
"blob_source": "",
|
||||
"file_mode": ""
|
||||
}
|
||||
],
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "previousjob",
|
||||
"task_id": "mytask1",
|
||||
"include": ["wd/output/*.bin"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": null
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "taskcontainer",
|
||||
"include": ["taskdata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"output_data": {
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "output",
|
||||
"source": null,
|
||||
"include": ["**/out*.dat"],
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"remove_container_after_exit": true,
|
||||
"shm_size": "256m",
|
||||
"additional_docker_run_options": [
|
||||
],
|
||||
"infiniband": false,
|
||||
"gpu": false,
|
||||
"max_task_retries": 3,
|
||||
"retention_time": "1.12:00:00",
|
||||
"multi_instance": {
|
||||
"num_instances": "pool_current_dedicated",
|
||||
"coordination_command": null,
|
||||
"resource_files": [
|
||||
{
|
||||
"file_path": "",
|
||||
"blob_source": "",
|
||||
"file_mode": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"entrypoint": null,
|
||||
"command": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
`job_specifications` array consists of jobs to create.
|
||||
* (required) `id` is the job id to create. If the job already exists, the
|
||||
specified `tasks` under the job will be added to the existing job.
|
||||
* (optional) `multi_instance_auto_complete` enables auto-completion of the job
|
||||
for which a multi-instance task is run. This allows automatic cleanup of the
|
||||
Docker container in multi-instance tasks. This is defaulted to `true` when
|
||||
multi-instance tasks are specified.
|
||||
* (optional) `environment_variables` under the job are environment variables
|
||||
which will be applied to all tasks operating under the job. Note that
|
||||
environment variables are not expanded and are passed as-is. You will need
|
||||
to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist`
|
||||
in a shell within the docker `command` or `entrypoint` if you want any
|
||||
environment variables to be expanded.
|
||||
* (optional) `environment_variables_keyvault_secret_id` under the job are
|
||||
environment variables stored in KeyVault that should be applied to all tasks
|
||||
operating under the job. The secret stored in KeyVault must be a valid json
|
||||
string, e.g., `{ "env_var_name": "env_var_value" }`.
|
||||
* (optional) `max_task_retries` sets the maximum number of times that
|
||||
Azure Batch should retry all tasks in this job for. By default, Azure Batch
|
||||
does not retry tasks that fail (i.e. `max_task_retries` is 0).
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed for the job. Any `input_data` defined at this level will be
|
||||
downloaded for this job which can be run on any number of compute nodes
|
||||
depending upon the number of constituent tasks and repeat invocations. However,
|
||||
`input_data` is only downloaded once per job invocation on a compute node.
|
||||
For example, if `job-1`:`task-1` is run on compute node A and then
|
||||
`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed
|
||||
to both compute node A and B. However, if `job-1`:`task-3` is then run on
|
||||
compute node A after `job-1`:`task-1`, then the `input_data` is not
|
||||
transferred again. This object currently supports `azure_batch` and
|
||||
`azure_storage` as members.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (required) `destination` is the destination path to place the files
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it is only allowed to have 1
|
||||
maximum filter.
|
||||
* (required) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Please note that you should
|
||||
not specify a destination that is on a shared file system. If you
|
||||
require ingressing to a shared file system location like a GlusterFS
|
||||
volume, then use the global configuration `files` property and the
|
||||
`data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (required) `tasks` is an array of tasks to add to the job.
|
||||
* (optional) `id` is the task id. Note that if the task `id` is null or
|
||||
empty then a generic task id will be assigned. The generic task id is
|
||||
formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is
|
||||
increased by 1 for each task added to the same job. If there are more
|
||||
than `99999` autonamed tasks in a job then the numbering is not
|
||||
padded for tasks exceeding 5 digits.
|
||||
* (optional) `depends_on` is an array of task ids for which this container
|
||||
invocation (task) depends on and must run to successful completion prior
|
||||
to this task executing.
|
||||
* (optional) `depends_on_range` is an array with exactly two integral
|
||||
elements containing a task `id` range for which this task is dependent
|
||||
upon, i.e., the start `id` and the end `id` for which this task depends
|
||||
on. Although task `id`s are always strings, the dependent task `id`s for
|
||||
ranges must be expressed by their integral representation for this
|
||||
property. This also implies that task `id`s for which this task depends
|
||||
on must be integral in nature. For example, if `depends_on_range` is set
|
||||
to `[1, 10]` (note the integral members), then there should be task
|
||||
`id`s of `"1"`, `"2"`, ... `"10"` within the job. Once these dependent
|
||||
tasks complete successfully, then this specified task will execute.
|
||||
* (required) `image` is the Docker image to use for this task
|
||||
* (optional) `name` is the name to assign to the container. If not
|
||||
specified, the value of the `id` property will be used for `name`.
|
||||
* (optional) `labels` is an array of labels to apply to the container.
|
||||
* (optional) `environment_variables` are any additional task-specific
|
||||
environment variables that should be applied to the container. Note that
|
||||
environment variables are not expanded and are passed as-is. You will
|
||||
need to source the environment file
|
||||
`$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the
|
||||
docker `command` or `entrypoint` if you want any environment variables
|
||||
to be expanded.
|
||||
* (optional) `environment_variables_keyvault_secret_id` are any additional
|
||||
task-specific environment variables that should be applied to the
|
||||
container but are stored in KeyVault. The secret stored in KeyVault must
|
||||
be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`.
|
||||
* (optional) `ports` is an array of port specifications that should be
|
||||
exposed to the host.
|
||||
* (optional) `data_volumes` is an array of `data_volume` aliases as defined
|
||||
in the global configuration file. These volumes will be mounted in the
|
||||
container.
|
||||
* (optional) `shared_data_volumes` is an array of `shared_data_volume`
|
||||
aliases as defined in the global configuration file. These volumes will be
|
||||
mounted in the container.
|
||||
* (optional) `resource_files` is an array of resource files that should be
|
||||
downloaded as part of the task. Each array entry contains the following
|
||||
information:
|
||||
* `file_path` is the path within the task working directory to place the
|
||||
file on the compute node.
|
||||
* `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure
|
||||
Blob Storage URL.
|
||||
* `file_mode` is the file mode to set for the file on the compute node.
|
||||
This is optional.
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed for this specific task. This object currently supports
|
||||
`azure_batch` and `azure_storage` as members. Note for multi-instance
|
||||
tasks, transfer of `input_data` is only applied to the task running the
|
||||
application command.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (optional) `destination` is the destination path to place the files.
|
||||
If `destination` is not specified at this level, then files are
|
||||
defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`.
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it is only allowed to have 1
|
||||
maximum filter.
|
||||
* (optional) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Unlike the job-level
|
||||
version of `input_data`, this `destination` property can be omitted.
|
||||
If `destination` is not specified at this level, then files are
|
||||
defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note
|
||||
that you should not specify a destination that is on a shared file
|
||||
system. If you require ingressing to a shared file system location
|
||||
like a GlusterFS volume, then use the global configuration `files`
|
||||
property and the `data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `output_data` is an object containing data that should be
|
||||
egressed for this specific task if and only if the task completes
|
||||
successfully. This object currently only supports `azure_storage` as a
|
||||
member. Note for multi-instance tasks, transfer of `output_data` is only
|
||||
applied to the task running the application command.
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when uploading to
|
||||
Azure Blob Storage or Azure File Storage, respectively. `container`
|
||||
specifies which container to upload to for Azure Blob Storage while
|
||||
`file_share` specifies which file share to upload to for Azure File
|
||||
Storage. Only one of these properties can be specified per
|
||||
`data_transfer` object.
|
||||
* (optional) `source` property defines which directory to upload to
|
||||
Azure storage. If `source` is not specified, then `source` is
|
||||
defaulted to `$AZ_BATCH_TASK_DIR`.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it is only allowed to have 1
|
||||
maximum filter.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `remove_container_after_exit` property specifies if the
|
||||
container should be automatically removed/cleaned up after it exits. This
|
||||
defaults to `false`.
|
||||
* (optional) `shm_size` property specifies the size of `/dev/shm` in
|
||||
the container. The default is `64m`. The postfix unit can be designated
|
||||
as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). This
|
||||
value may need to be increased from the default of `64m` for certain
|
||||
Docker applications, including multi-instance tasks using Intel MPI
|
||||
(see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)).
|
||||
* (optional) `additional_docker_run_options` is an array of additional Docker
|
||||
run options that should be passed to the Docker daemon when starting this
|
||||
container.
|
||||
* (optional) `infiniband` designates if this container requires access to the
|
||||
Infiniband/RDMA devices on the host. Note that this will automatically
|
||||
force the container to use the host network stack. If this property is
|
||||
set to `true`, ensure that the `pool_specification` property
|
||||
`inter_node_communication_enabled` is set to `true`.
|
||||
* (optional) `gpu` designates if this container requires access to the GPU
|
||||
devices on the host. If this property is set to `true`, Docker containers
|
||||
are instantiated via `nvidia-docker`. This requires N-series VM instances.
|
||||
* (optional) `max_task_retries` sets the maximum number of times that
|
||||
Azure Batch should retry this task for. This overrides the job-level task
|
||||
retry count. By default, Azure Batch does not retry tasks that fail
|
||||
(i.e. `max_task_retries` is 0).
|
||||
* (optional) `retention_time` sets the timedelta to retain the task
|
||||
directory on the compute node where it ran after the task completes.
|
||||
The format for this property is a timedelta with a string representation
|
||||
of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node
|
||||
to clean up this task's directory 36 hours after the task completed. The
|
||||
default, if unspecified, is effectively infinite - i.e., task data is
|
||||
retained forever on the compute node that ran the task.
|
||||
* (optional) `multi_instance` is a property indicating that this task is a
|
||||
multi-instance task. This is required if the Docker image is an MPI
|
||||
program. Additional information about multi-instance tasks and Batch
|
||||
Shipyard can be found
|
||||
[here](80-batch-shipyard-multi-instance-tasks.md). Do not define this
|
||||
property for tasks that are not multi-instance. Additional members of this
|
||||
property are:
|
||||
* `num_instances` is a property setting the number of compute node
|
||||
instances required for this multi-instance task. This can be any one
|
||||
of the following:
|
||||
1. An integral number
|
||||
2. `pool_current_dedicated` which is the instantaneous reading of the
|
||||
target pool's current dedicated count during this function invocation.
|
||||
3. `pool_specification_vm_count` which is the `vm_count` specified in the
|
||||
pool configuration.
|
||||
* `coordination_command` is the coordination command that is run by each
|
||||
instance (compute node) of this multi-instance task prior to the
|
||||
application command. This command must not block and must exit
|
||||
successfully for the multi-instance task to proceed. This is the command
|
||||
passed to the container in `docker run` for multi-instance tasks. This
|
||||
docker container instance will automatically be daemonized. This is
|
||||
optional and may be null.
|
||||
* `resource_files` is an array of resource files that should be downloaded
|
||||
as part of the multi-instance task. Each array entry contains the
|
||||
following information:
|
||||
* `file_path` is the path within the task working directory to place
|
||||
the file on the compute node.
|
||||
* `blob_source` is an accessible HTTP/HTTPS URL. This need not be an
|
||||
Azure Blob Storage URL.
|
||||
* `file_mode` is the file mode to set for the file on the compute node.
|
||||
This is optional.
|
||||
* (optional) `entrypoint` is the property that can override the Docker image
|
||||
defined `ENTRYPOINT`.
|
||||
* (optional) `command` is the command to execute in the Docker container
|
||||
context. If this task is a regular non-multi-instance task, then this is
|
||||
the command passed to the container context during `docker run`. If this
|
||||
task is a multi-instance task, then this `command` is the application
|
||||
command and is executed with `docker exec` in the running Docker container
|
||||
context from the `coordination_command` in the `multi_instance` property.
|
||||
This property may be null.
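As a concrete sketch of task dependencies, the following job (the `busybox`
commands are placeholders) defines three tasks with integral `id`s and a
final task that only runs after tasks `1` through `3` complete successfully
via `depends_on_range`:

```json
{
  "job_specifications": [
    {
      "id": "depjob",
      "tasks": [
        { "id": "1", "image": "busybox", "command": "/bin/sh -c \"echo part1\"" },
        { "id": "2", "image": "busybox", "command": "/bin/sh -c \"echo part2\"" },
        { "id": "3", "image": "busybox", "command": "/bin/sh -c \"echo part3\"" },
        {
          "id": "merge",
          "depends_on_range": [1, 3],
          "image": "busybox",
          "command": "/bin/sh -c \"echo merge\""
        }
      ]
    }
  ]
}
```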
|
||||
|
||||
An example jobs json template can be found
|
||||
[here](../config_templates/jobs.json).
|
||||
|
||||
## Batch Shipyard Usage
|
||||
Continue on to [Batch Shipyard Usage](20-batch-shipyard-usage.md).
|
||||
|
|
|
@ -36,7 +36,7 @@ The credentials schema is as follows:
|
|||
"rsa_private_key_pem": "/path/to/privkey.pem",
|
||||
"x509_cert_sha1_thumbprint": "01AB02CD...",
|
||||
"user": "me@domain.com",
|
||||
"password": "password"
|
||||
"password": "password",
|
||||
"token_cache": {
|
||||
"enabled": true,
|
||||
"filename": ""
|
||||
|
@ -59,6 +59,7 @@ The credentials schema is as follows:
|
|||
"filename": ""
|
||||
}
|
||||
},
|
||||
"resource_group": "",
|
||||
"account_key": "batchaccountkey",
|
||||
"account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey"
|
||||
},
|
||||
|
@ -149,9 +150,12 @@ under the `batch` property can be found in the
|
|||
* (required) `account_service_url` is the Batch account service URL.
|
||||
* (required for UserSubscription accounts, optional otherwise) `aad` AAD
|
||||
authentication parameters for Azure Batch.
|
||||
* (optional) `resource_group` is the resource group containing the Batch
|
||||
account. This is only required if using a UserSubscription Batch account
|
||||
with `aad` authentication.
|
||||
* (required unless `aad` is specified) `account_key` is the shared
|
||||
key. This is required for non-AAD logins. This is ignored if the `aad`
|
||||
property is specified.
|
||||
key. This is required for non-AAD logins. This option takes precedence
|
||||
over the `aad` property if specified.
|
||||
* (optional) `account_key_keyvault_secret_id` property can be used to
|
||||
reference an Azure KeyVault secret id. Batch Shipyard will contact the
|
||||
specified KeyVault and replace the `account_key` value as returned by
|
||||
|
|
|
@ -0,0 +1,418 @@
|
|||
# Batch Shipyard Global Configuration
|
||||
This page contains in-depth details on how to configure the global
|
||||
json file for Batch Shipyard.
|
||||
|
||||
## Schema
|
||||
The global config schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"batch_shipyard": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"storage_entity_prefix": "shipyard",
|
||||
"generated_sas_expiry_days": 90,
|
||||
"encryption" : {
|
||||
"enabled": true,
|
||||
"pfx": {
|
||||
"filename": "encrypt.pfx",
|
||||
"passphrase": "mysupersecretpassword",
|
||||
"sha1_thumbprint": "123456789..."
|
||||
},
|
||||
"public_key_pem": "encrypt.pem"
|
||||
}
|
||||
},
|
||||
"docker_registry": {
|
||||
"private": {
|
||||
"allow_public_docker_hub_pull_on_missing": true,
|
||||
"server": "myserver-myorg.azurecr.io",
|
||||
"azure_storage": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "mydockerregistry"
|
||||
}
|
||||
}
|
||||
},
|
||||
"data_replication": {
|
||||
"peer_to_peer": {
|
||||
"enabled": true,
|
||||
"compression": true,
|
||||
"concurrent_source_downloads": 10,
|
||||
"direct_download_seed_bias": null
|
||||
},
|
||||
"non_peer_to_peer_concurrent_downloading": true
|
||||
},
|
||||
"global_resources": {
|
||||
"docker_images": [
|
||||
"busybox",
|
||||
"redis:3.2.3-alpine",
|
||||
],
|
||||
"files": [
|
||||
{
|
||||
"source": {
|
||||
"path": "/some/local/path/dir",
|
||||
"include": ["*.dat"],
|
||||
"exclude": ["*.bak"]
|
||||
},
|
||||
"destination": {
|
||||
"shared_data_volume": "glustervol",
|
||||
"relative_destination_path": "myfiles",
|
||||
"data_transfer": {
|
||||
"method": "multinode_scp",
|
||||
"ssh_private_key": "id_rsa_shipyard",
|
||||
"scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com",
|
||||
"rsync_extra_options": "",
|
||||
"split_files_megabytes": 500,
|
||||
"max_parallel_transfers_per_node": 2
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"path": "/some/local/path/bound/for/blob",
|
||||
"include": ["*.bin"]
|
||||
},
|
||||
"destination": {
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"data_transfer": {
|
||||
"container": "mycontainer",
|
||||
"blobxfer_extra_options": "--no-computefilemd5"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"path": "/another/local/path/dir",
|
||||
"include": [],
|
||||
"exclude": []
|
||||
},
|
||||
"destination": {
|
||||
"relative_destination_path": "relpath/on/host",
|
||||
"data_transfer": {
|
||||
"method": "rsync+ssh",
|
||||
"ssh_private_key": "id_rsa_shipyard",
|
||||
"scp_ssh_extra_options": "-c aes256-gcm@openssh.com",
|
||||
"rsync_extra_options": "-v"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"docker_volumes": {
|
||||
"data_volumes": {
|
||||
"abcvol": {
|
||||
"host_path": null,
|
||||
"container_path": "/abc"
|
||||
},
|
||||
"hosttempvol": {
|
||||
"host_path": "/tmp",
|
||||
"container_path": "/hosttmp"
|
||||
}
|
||||
},
|
||||
"shared_data_volumes": {
|
||||
"shipyardvol": {
|
||||
"volume_driver": "azurefile",
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"azure_file_share_name": "shipyardshared",
|
||||
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile",
|
||||
"mount_options": [
|
||||
"filemode=0777",
|
||||
"dirmode=0777",
|
||||
"nolock=true"
|
||||
]
|
||||
},
|
||||
"glustervol": {
|
||||
"volume_driver": "glusterfs_on_compute",
|
||||
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs",
|
||||
"volume_type": "replica",
|
||||
"volume_options": [
|
||||
"performance.cache-size 1 GB",
|
||||
"performance.cache-max-file-size 10 MB",
|
||||
"performance.cache-refresh-timeout 61",
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `batch_shipyard` property is used to set settings for the tool.
|
||||
* (required) `storage_account_settings` is a link to the alias of the storage
|
||||
account specified, in this case, it is `mystorageaccount`. Batch Shipyard
|
||||
requires a storage account for storing metadata in order to execute across a
|
||||
distributed environment.
|
||||
* (optional) `storage_entity_prefix` property is used as a generic qualifier
|
||||
to prefix storage containers (blob containers, tables, queues) with. If not
|
||||
specified, defaults to `shipyard`.
|
||||
* (optional) `generated_sas_expiry_days` property is used to set the number of
|
||||
days any SAS key generated by Batch Shipyard is valid for. The default is 30
|
||||
days. This is useful if you have long-lived pools and want to ensure that
|
||||
SAS keys are valid for longer periods of time.
|
||||
* (optional) `encryption` object is used to define credential encryption which
|
||||
contains the following members:
|
||||
* (required) `enabled` property enables or disables this feature.
|
||||
* (required) `pfx` object defines the PFX certificate
|
||||
* (required) `filename` property is the full path and name to the PFX
|
||||
certificate
|
||||
* (required) `passphrase` property is the passphrase for the PFX
|
||||
certificate. This cannot be empty.
|
||||
* (optional) `sha1_thumbprint` is the SHA1 thumbprint of the
|
||||
certificate. If the PFX file is created using the `cert create` command,
|
||||
then the SHA1 thumbprint is output. It is recommended to populate this
|
||||
property such that it does not have to be generated when needed for
|
||||
encryption.
|
||||
* (optional) `public_key_pem` property is the full path and name to the
|
||||
RSA public key in PEM format. If the PFX file is created using the
|
||||
`cert create` command, then this file is generated along with the PFX
|
||||
file. It is recommended to populate this property with the PEM file path
|
||||
such that it does not have to be generated when needed for encryption.
|
||||
|
||||
The `docker_registry` property is used to configure Docker image distribution
|
||||
options from public/private Docker hub and private registries.
|
||||
* (optional) `private` property controls settings for interacting with private
|
||||
registries. There are three kinds of private registries that are supported:
|
||||
(1) private registries hosted on Docker Hub, (2) Internet accessible
|
||||
registries such as those hosted by the
|
||||
[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/)
|
||||
service and (3) [private registry instances backed to
|
||||
Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/)
|
||||
and are run on compute nodes. To use private registries hosted on Docker Hub,
|
||||
no additional properties need to be specified here; instead, specify your
|
||||
Docker Hub login information in the credentials json. To specify a private
|
||||
registry other than on Docker Hub, a json property named `server` should be
|
||||
defined. To use a private registry backed by Azure Blob Storage, define a
|
||||
json object named `azure_storage`. Note that a maximum of only one of these
|
||||
three types of private registries may be specified at once. The following
|
||||
describes members of the non-Docker Hub private registries supported:
|
||||
* (optional) `server` object is a property that is the fully-qualified host
|
||||
name to a private registry server. A specific port other than 80 can be
|
||||
specified using a `:` separator, e.g.,
|
||||
`mydockerregistry.com:8080`. Port 80 is the default if no port is
|
||||
specified. The value of this property should have an associated login
|
||||
in the credentials json file.
|
||||
* (optional) `azure_storage` object is to define settings for connecting
|
||||
to a private registry backed by Azure Storage blobs and where the
|
||||
private registry instances are hosted on the compute nodes themselves.
|
||||
* (required) `storage_account_settings` is a link to the alias of the
|
||||
storage account specified that stores the private registry blobs.
|
||||
* (required) `container` property is the name of the Azure Blob
|
||||
container holding the private registry blobs.
|
||||
* (optional) `allow_public_docker_hub_pull_on_missing` property allows
|
||||
pass-through of Docker image retrieval to public Docker Hub if it is
|
||||
missing in the private registry. This defaults to `false` if not
|
||||
specified. Note that this setting does not apply to a missing Docker
|
||||
image that is allowed to run via the job property
|
||||
`allow_run_on_missing_image`.
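For instance, a sketch of a `docker_registry` section that points at a
single Internet-accessible private registry (the server name is illustrative
and must have a matching login entry in the credentials json) could look
like:

```json
{
  "docker_registry": {
    "private": {
      "server": "myserver-myorg.azurecr.io",
      "allow_public_docker_hub_pull_on_missing": false
    }
  }
}
```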
|
||||
|
||||
The `data_replication` property is used to configure the internal image
|
||||
replication mechanism between compute nodes within a compute pool. The
|
||||
`non_peer_to_peer_concurrent_downloading` property specifies if it is ok
|
||||
to allow unfettered concurrent downloading from the source registry among
|
||||
all compute nodes. The following options apply to `peer_to_peer` data
|
||||
replication options:
|
||||
* (optional) `enabled` property enables or disables private peer-to-peer
|
||||
transfer. Note that for compute pools with a relatively small number of VMs,
|
||||
peer-to-peer transfer may not provide any benefit and is recommended to be
|
||||
disabled in these cases. Compute pools with a large number of VMs and especially
|
||||
in the case of an Azure Storage-backed private registry can benefit from
|
||||
peer-to-peer image replication.
|
||||
* `compression` property enables or disables compression of image files. It
|
||||
is strongly recommended to keep this enabled.
|
||||
* `concurrent_source_downloads` property specifies the number of
|
||||
simultaneous downloads allowed to each image.
|
||||
* `direct_download_seed_bias` property sets the number of direct download
|
||||
seeds to prefer per image before switching to peer-to-peer transfer.
|
||||
|
||||
The `global_resources` property contains information regarding required
|
||||
Docker images, volume configuration and data ingress information. This
|
||||
property is required.
|
||||
|
||||
`docker_images` is an array of docker images that should be installed on
|
||||
every compute node when this configuration file is supplied while creating
|
||||
a compute pool. Image tags are supported. Image names should not include
|
||||
private registry server names, as these will be automatically prepended. For
|
||||
instance, if you have an image `abc/mytag` on your private registry
|
||||
`myregistry-myorg.azurecr.io`, your image should be named in the
|
||||
`docker_images` array as `abc/mytag` and not
|
||||
`myregistry-myorg.azurecr.io/abc/mytag`.
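For example, assuming the hypothetical image `abc/mytag` resides on the
private registry configured above, the array would be specified without the
registry server prefix:

```json
{
  "global_resources": {
    "docker_images": [
      "abc/mytag",
      "redis:3.2.3-alpine"
    ]
  }
}
```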
|
||||
|
||||
`files` is an optional property that specifies data that should be ingressed
|
||||
from a location accessible by the local machine (i.e., the machine invoking
|
||||
`shipyard.py`) to a shared file system location accessible by compute nodes
|
||||
in the pool, or to Azure Blob or File Storage. `files` is a json list of objects,
|
||||
which allows for multiple sources to destinations to be ingressed during the
|
||||
same invocation. Note that no Azure Batch environment variables
|
||||
(i.e., `$AZ_BATCH_`-style environment variables) are available as path
|
||||
arguments since ingress actions performed within `files` are done locally
|
||||
on the machine invoking `shipyard.py`. Each object within the `files` list
|
||||
contains the following members:
|
||||
* (required) `source` property contains the following members:
|
||||
* (required) `path` is a local path. A single file or a directory
|
||||
can be specified. Filters below will be ignored if `path` is a file and
|
||||
not a directory.
|
||||
* (optional) `include` is an array of
|
||||
[Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
|
||||
where only files matching a filter are included in the data transfer.
|
||||
Filters specified in `include` have precedence over `exclude` described
|
||||
next. `include` can only have a maximum of 1 filter for ingress to Azure
|
||||
Blob Storage. In this example, all files ending in `.dat` are ingressed.
|
||||
* (optional) `exclude` is an array of
|
||||
[Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
|
||||
where files matching a filter are excluded from the data transfer. Filters
|
||||
specified in `include` have precedence over filters specified in
|
||||
`exclude`. `exclude` cannot be specified for ingress into Azure Blob
|
||||
Storage. In this example, all files ending in `.bak` are skipped for
|
||||
ingress.
|
||||
* (required) `destination` property contains the following members:
|
||||
* (required or optional) `shared_data_volume` or `storage_account_settings`
|
||||
for data ingress to a GlusterFS volume or Azure Blob or File Storage. If
|
||||
you are ingressing to a pool with only one compute node, you may omit
|
||||
`shared_data_volume`. Otherwise, you may specify one or the other, but
|
||||
not both in the same object. Please see the
|
||||
`shared_data_volumes` section below for information on how to set up a GlusterFS share.
|
||||
* (required or optional) `relative_destination_path` specifies a relative
|
||||
destination path to place the files, with respect to the target root.
|
||||
If transferring to a `shared_data_volume` then this is relative to the
|
||||
GlusterFS volume root. If transferring to a pool with a single node in
|
||||
it (and thus no `shared_data_volume` is specified in the prior property), then
|
||||
this is relative to
|
||||
[$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories).
|
||||
To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended),
|
||||
you can specify this property as an empty string when not ingressing to
|
||||
a `shared_data_volume`. Note that if `scp` is selected while attempting
|
||||
to transfer directly to this aforementioned path, then `scp` will fail
|
||||
with exit code of 1 but the transfer will have succeeded (this is due
|
||||
to some of the permission options). If this property is not specified for
|
||||
a `shared_data_volume`, then files will be placed directly in the
|
||||
GlusterFS volume root. This property cannot be specified for an Azure
|
||||
Storage destination (i.e., `storage_account_settings`).
|
||||
* (required) `data_transfer` specifies how the transfer should take place.
|
||||
The following list contains members for GlusterFS ingress when a GlusterFS
|
||||
volume is provided for `shared_data_volume` (see below for ingressing to
|
||||
Azure Blob or File Storage):
|
||||
* (required) `method` specifies which method should be used to ingress
|
||||
data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
|
||||
`multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
|
||||
directory (recursively) to the remote share path. `multinode_scp` will
|
||||
attempt to simultaneously transfer files to many compute nodes using
|
||||
`scp` at the same time to speed up data transfer. `rsync+ssh` will
|
||||
perform an rsync of files through SSH. `multinode_rsync+ssh` will
|
||||
attempt to simultaneously transfer files using `rsync` to many compute
|
||||
nodes at the same time to speed up data transfer. Note that you may
|
||||
specify the `multinode_*` methods even with only 1 compute node in a
|
||||
pool which will allow you to take advantage of
|
||||
`max_parallel_transfers_per_node` below.
|
||||
* (optional) `ssh_private_key` location of the SSH private key for the
|
||||
username specified in the `pool_specification`:`ssh` section when
|
||||
connecting to compute nodes. The default is `id_rsa_shipyard`, if
|
||||
omitted, which is automatically generated if no SSH key is specified
|
||||
when an SSH user is added to a pool.
|
||||
* (optional) `scp_ssh_extra_options` are any extra options to pass to
|
||||
`scp` or `ssh` for `scp`/`multinode_scp` or
|
||||
`rsync+ssh`/`multinode_rsync+ssh` methods, respectively. In the example
|
||||
above, `-C` enables compression and `-c aes256-gcm@openssh.com`
|
||||
is passed to `scp`, which can potentially increase the transfer speed by
|
||||
selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel
|
||||
AES-NI.
|
||||
* (optional) `rsync_extra_options` are any extra options to pass to
|
||||
`rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This
|
||||
property is ignored for non-rsync transfer methods.
|
||||
* (optional) `split_files_megabytes` splits files into chunks with the
|
||||
specified size in MiB. This can potentially help with very large files.
|
||||
This option forces the transfer `method` to `multinode_scp`.
|
||||
Note that the destination file system must be able to accommodate
|
||||
up to 2x the size of files which are split. Additionally, transfers
|
||||
involving files which are split will incur reconstruction costs after
|
||||
the transfer is complete, which will increase the total end-to-end
|
||||
ingress time. However, in certain scenarios, splitting files and
|
||||
transferring chunks in parallel along with reconstruction may end up
|
||||
being faster than transferring a large file without chunking.
|
||||
* (optional) `max_parallel_transfers_per_node` is the maximum number of
|
||||
parallel transfers to invoke per node with the
|
||||
`multinode_scp`/`multinode_rsync+ssh` methods. For example, if there
|
||||
are 3 compute nodes in the pool, and `2` is given for this option, then
|
||||
there will be up to 2 scp sessions in parallel per compute node for a
|
||||
maximum of 6 concurrent scp sessions to the pool. The default is 1 if
|
||||
not specified or omitted.
|
||||
* (required) `data_transfer` specifies how the transfer should take place.
|
||||
When Azure Blob or File Storage is selected as the destination for data
|
||||
ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The
|
||||
following list contains members for Azure Blob or File Storage ingress
|
||||
when a storage account link is provided for `storage_account_settings`:
|
||||
* (required) `container` or `file_share` is required when uploading to
|
||||
Azure Blob Storage or Azure File Storage, respectively. `container`
|
||||
specifies which container to upload to for Azure Blob Storage while
|
||||
`file_share` specifies which file share to upload to for Azure File
|
||||
Storage. Only one of these properties can be specified per
|
||||
`data_transfer` object. The container or file share need not be created
|
||||
beforehand.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`. In the example above, `--no-computefilemd5` will force
|
||||
`blobxfer` to skip MD5 calculation on files ingressed.
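As a sketch of the single compute node case mentioned above (local path and
destination are illustrative), a `files` entry that ingresses a directory
directly to the node, without a `shared_data_volume` or a storage account,
could look like:

```json
{
  "source": {
    "path": "/some/local/path/dir"
  },
  "destination": {
    "relative_destination_path": "myingress",
    "data_transfer": {
      "method": "scp",
      "ssh_private_key": "id_rsa_shipyard"
    }
  }
}
```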
|
||||
|
||||
`docker_volumes` is an optional property that can consist of two
|
||||
different types of volumes: `data_volumes` and `shared_data_volumes`.
|
||||
`data_volumes` can be of two flavors depending upon if `host_path` is set to
|
||||
null or not. In the former, this is typically used with the `VOLUME` keyword
|
||||
in Dockerfiles to initialize a data volume with existing data inside the
|
||||
image. If `host_path` is set, then the path on the host is mounted in the
|
||||
container at the path specified with `container_path`.
|
||||
|
||||
`shared_data_volumes` is an optional property for initializing persistent
|
||||
shared storage volumes. In the first shared volume, `shipyardvol` is the alias
|
||||
of this volume:
|
||||
* `volume_driver` property specifies the Docker Volume Driver to use.
|
||||
Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or
|
||||
`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker
|
||||
Volume Driver. For this volume (`shipyardvol`), as this is an Azure File
|
||||
shared volume, the `volume_driver` should be set as `azurefile`.
|
||||
* `storage_account_settings` is a link to the alias of the storage account
|
||||
specified that holds this Azure File Share.
|
||||
* `azure_file_share_name` is the name of the file share on Azure Files. Note
|
||||
that the Azure File share must be created beforehand; the toolkit does not
|
||||
create Azure File shares, it only mounts them to the compute nodes.
|
||||
* `container_path` is the path in the container to mount.
|
||||
* `mount_options` are the mount options to pass to the mount command. Supported
|
||||
options are documented
|
||||
[here](https://github.com/Azure/azurefile-dockervolumedriver). It is
|
||||
recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and
|
||||
`gid` cannot be reliably determined before the compute pool is allocated and
|
||||
this volume will be mounted as the root user.
|
||||
|
||||
Note that when using `azurefile` for a shared data volume, the storage account
|
||||
that holds the file share must reside within the same Azure region as the
|
||||
Azure Batch compute pool. Attempting to mount an Azure File share that is
|
||||
cross-region will result in failure as current Linux Samba clients do not
|
||||
support share level encryption at this time.
|
||||
|
||||
The second shared volume, `glustervol`, is a
|
||||
[GlusterFS](https://www.gluster.org/) network file system. Please note that
|
||||
`glusterfs_on_compute` volumes are GlusterFS volumes co-located on the VM's temporary
|
||||
local disk space which is a shared resource. Sizes of the local temp disk for
|
||||
each VM size can be found
|
||||
[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/).
|
||||
If specifying a `glusterfs_on_compute` volume, you must enable internode
|
||||
communication in the pool configuration file. These volumes have the following
|
||||
properties:
|
||||
* (required) `volume_driver` property should be set as `glusterfs_on_compute`.
|
||||
* (required) `container_path` is the path in the container to mount.
|
||||
* (optional) `volume_type` property defines the GlusterFS volume type.
|
||||
Currently, `replica` is the only supported type.
|
||||
* (optional) `volume_options` property defines additional GlusterFS volume
|
||||
options to set.
|
||||
|
||||
`glusterfs_on_compute` volumes are mounted on the host at
|
||||
`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically
|
||||
replace container path references in direct and storage-based data
|
||||
ingress/egress with their host path equivalents.
|
||||
|
||||
Note that when resizing a pool with a `glusterfs_on_compute` shared file
|
||||
system, you must resize with the `pool resize` command in `shipyard.py`
|
||||
and not with Azure Portal, Batch Explorer or any other tool.
|
||||
|
||||
Finally, note that all `docker_volumes` can be omitted completely along with
|
||||
one or all of `data_volumes` and `shared_data_volumes` if you do not require
|
||||
this functionality.
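For example, a minimal sketch of `global_resources` that only pre-loads
Docker images and omits `docker_volumes` and `files` entirely is valid:

```json
{
  "global_resources": {
    "docker_images": [
      "busybox"
    ]
  }
}
```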
|
||||
|
||||
## Full template
|
||||
A full template of a global configuration file can be found
|
||||
[here](../config_templates/config.json). Note that this template cannot
|
||||
be used as-is and must be modified to fit your scenario.
|
|
@ -0,0 +1,163 @@
|
|||
# Batch Shipyard Pool Configuration
|
||||
This page contains in-depth details on how to configure the pool
|
||||
json file for Batch Shipyard.
|
||||
|
||||
## Schema
|
||||
The pool schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"pool_specification": {
|
||||
"id": "dockerpool",
|
||||
"vm_size": "STANDARD_A9",
|
||||
"vm_count": 10,
|
||||
"max_tasks_per_node": 1,
|
||||
"inter_node_communication_enabled": true,
|
||||
"publisher": "OpenLogic",
|
||||
"offer": "CentOS-HPC",
|
||||
"sku": "7.1",
|
||||
"reboot_on_start_task_failed": true,
|
||||
"block_until_all_global_resources_loaded": true,
|
||||
"transfer_files_on_pool_creation": false,
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "jobonanotherpool",
|
||||
"task_id": "mytask",
|
||||
"include": ["wd/*.dat"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool"
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "poolcontainer",
|
||||
"include": ["pooldata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"ssh": {
|
||||
"username": "docker",
|
||||
"expiry_days": 7,
|
||||
"ssh_public_key": null,
|
||||
"generate_docker_tunnel_script": true,
|
||||
"generated_file_export_path": null,
|
||||
"hpn_server_swap": false
|
||||
},
|
||||
"gpu": {
|
||||
"nvidia_driver": {
|
||||
"source": "https://some.url"
|
||||
}
|
||||
},
|
||||
"additional_node_prep_commands": [
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `pool_specification` property has the following members:
|
||||
* (required) `id` is the compute pool ID.
|
||||
* (required) `vm_size` is the
|
||||
[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/).
|
||||
Please note that not all regions have every VM size available.
|
||||
* (required) `vm_count` is the number of compute nodes to allocate.
|
||||
* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks
|
||||
that can be running at any one time on a compute node. This defaults to a
|
||||
value of 1 if not specified.
|
||||
* (optional) `inter_node_communication_enabled` designates if this pool is set
|
||||
up for inter-node communication. This must be set to `true` for any containers
|
||||
that must communicate with each other such as MPI applications. This property
|
||||
will be force enabled if peer-to-peer replication is enabled.
|
||||
* (required) `publisher` is the publisher name of the Marketplace VM image.
|
||||
* (required) `offer` is the offer name of the Marketplace VM image.
|
||||
* (required) `sku` is the sku name of the Marketplace VM image.
|
||||
* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the
|
||||
compute node in case there is a transient failure in node preparation (e.g.,
|
||||
network timeout, resolution failure or download problem). This defaults to
|
||||
`false`.
|
||||
* (optional) `block_until_all_global_resources_loaded` will block the node
|
||||
from entering ready state until all Docker images are loaded. This defaults
|
||||
to `true`.
|
||||
* (optional) `transfer_files_on_pool_creation` will ingress all `files`
|
||||
specified in the `global_resources` section of the configuration json when
|
||||
the pool is created. If files are to be ingressed to Azure Blob or File
|
||||
Storage, then data movement operations are overlapped with the creation of the
|
||||
pool. If files are to be ingressed to a shared file system on the compute
|
||||
nodes, then the files are ingressed after the pool is created and the shared
|
||||
file system is ready. Files can be ingressed to both Azure Blob Storage and a
|
||||
shared file system during the same pool creation invocation. If this property
|
||||
is set to `true` then `block_until_all_global_resources_loaded` will be force
|
||||
disabled. If omitted, this property defaults to `false`.
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed to all compute nodes as part of node preparation. It is
|
||||
important to note that if you are combining this action with `files` and
|
||||
are ingressing data to Azure Blob or File storage as part of pool creation,
|
||||
that the blob containers or file shares defined here will be downloaded as
|
||||
soon as the compute node is ready to do so. This may result in the blob
|
||||
container/blobs or file share/files not being ready in time for the
|
||||
`input_data` transfer. It is up to you to ensure that these two operations do
|
||||
not overlap. If there is a possibility of overlap, then you should ingress
|
||||
data defined in `files` prior to pool creation and disable the option above
|
||||
`transfer_files_on_pool_creation`. This object currently supports
|
||||
`azure_batch` and `azure_storage` as members.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (required) `destination` is the destination path to place the files
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it may contain at most
|
||||
one filter.
|
||||
* (required) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Please note that you should
|
||||
not specify a destination that is on a shared file system. If you
|
||||
require ingressing to a shared file system location like a GlusterFS
|
||||
volume, then use the global configuration `files` property and the
|
||||
`data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `ssh` is the property for creating a user to accommodate SSH
|
||||
sessions to compute nodes. If this property is absent, then an SSH user is not
|
||||
created with pool creation.
|
||||
* (required) `username` is the user to create on the compute nodes.
|
||||
* (optional) `expiry_days` is the number of days from now for the account on
|
||||
the compute nodes to expire. The default is 30 days from invocation time.
|
||||
* (optional) `ssh_public_key` is the path to an existing SSH public key to
|
||||
use. If not specified, an RSA public/private keypair will be automatically
|
||||
generated only on Linux. If this is `null` or not specified on Windows,
|
||||
the SSH user is not created.
|
||||
* (optional) `generate_docker_tunnel_script` property directs Batch Shipyard to
|
||||
generate an SSH tunnel script that can be used to connect to the remote
|
||||
Docker engine running on a compute node.
|
||||
* (optional) `generated_file_export_path` is the path to export the
|
||||
generated RSA keypair and docker tunnel script to. If omitted, the
|
||||
current directory is used.
|
||||
* (experimental) `hpn_server_swap` property enables an OpenSSH server with
|
||||
[HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh)
|
||||
to be swapped with the standard distribution OpenSSH server. This is not
|
||||
supported on all Linux distributions and may be force disabled.
|
||||
* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances)
|
||||
`gpu` property defines additional information for NVIDIA GPU-enabled VMs:
|
||||
* `nvidia_driver` property contains the following required members:
|
||||
* `source` is the source url to download the driver.
|
||||
* (optional) `additional_node_prep_commands` is an array of additional commands
|
||||
to execute on the compute node host as part of node preparation. This can
|
||||
be empty or omitted.
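
To tie several of the optional properties above together, below is a minimal,
illustrative sketch and not a complete pool configuration: the pool id, VM
size and count, storage account link, and container name are placeholder
assumptions, and other `pool_specification` properties are elided.

```json
{
  "pool_specification": {
    "id": "mypool",
    "vm_size": "STANDARD_D2_V2",
    "vm_count": 2,
    "publisher": "Canonical",
    "offer": "UbuntuServer",
    "sku": "16.04-LTS",
    "max_tasks_per_node": 1,
    "inter_node_communication_enabled": false,
    "reboot_on_start_task_failed": true,
    "block_until_all_global_resources_loaded": true,
    "transfer_files_on_pool_creation": false,
    "input_data": {
      "azure_storage": [
        {
          "storage_account_settings": "mystorageaccount",
          "container": "poolinput",
          "destination": "$AZ_BATCH_NODE_SHARED_DIR/poolinput",
          "blobxfer_extra_options": null
        }
      ]
    },
    "ssh": {
      "username": "docker",
      "expiry_days": 30,
      "generate_docker_tunnel_script": true
    },
    "additional_node_prep_commands": []
  }
}
```

If GPUs are required, the `gpu`:`nvidia_driver`:`source` property described
above would additionally be specified for N-series VM sizes.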
|
||||
|
||||
## Full template
|
||||
A full template of a pool configuration file can be found
|
||||
[here](../config\_templates/pool.json). Note that this template cannot
|
||||
be used as-is and must be modified to fit your scenario.
|
|
@ -0,0 +1,378 @@
|
|||
# Batch Shipyard Jobs Configuration
|
||||
This page contains in-depth details on how to configure the jobs
|
||||
json file for Batch Shipyard.
|
||||
|
||||
## Schema
|
||||
The jobs schema is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
"job_specifications": [
|
||||
{
|
||||
"id": "dockerjob",
|
||||
"multi_instance_auto_complete": true,
|
||||
"environment_variables": {
|
||||
"abc": "xyz"
|
||||
},
|
||||
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
|
||||
"max_task_retries": 3,
|
||||
"allow_run_on_missing_image": false,
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "someotherjob",
|
||||
"task_id": "task-a",
|
||||
"include": ["wd/*.dat"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": null
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "jobcontainer",
|
||||
"include": ["jobdata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"tasks": [
|
||||
{
|
||||
"id": null,
|
||||
"depends_on": [
|
||||
"taskid-a", "taskid-b", "taskid-c"
|
||||
],
|
||||
"depends_on_range": [
|
||||
1, 10
|
||||
],
|
||||
"image": "busybox",
|
||||
"name": null,
|
||||
"labels": [],
|
||||
"environment_variables": {
|
||||
"def": "123"
|
||||
},
|
||||
"environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv",
|
||||
"ports": [],
|
||||
"data_volumes": [
|
||||
"contdatavol",
|
||||
"hosttempvol"
|
||||
],
|
||||
"shared_data_volumes": [
|
||||
"azurefilevol"
|
||||
],
|
||||
"resource_files": [
|
||||
{
|
||||
"file_path": "",
|
||||
"blob_source": "",
|
||||
"file_mode": ""
|
||||
}
|
||||
],
|
||||
"input_data": {
|
||||
"azure_batch": [
|
||||
{
|
||||
"job_id": "previousjob",
|
||||
"task_id": "mytask1",
|
||||
"include": ["wd/output/*.bin"],
|
||||
"exclude": ["*.txt"],
|
||||
"destination": null
|
||||
}
|
||||
],
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "taskcontainer",
|
||||
"include": ["taskdata*.bin"],
|
||||
"destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata",
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"output_data": {
|
||||
"azure_storage": [
|
||||
{
|
||||
"storage_account_settings": "mystorageaccount",
|
||||
"container": "output",
|
||||
"source": null,
|
||||
"include": ["**/out*.dat"],
|
||||
"blobxfer_extra_options": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"remove_container_after_exit": true,
|
||||
"shm_size": "256m",
|
||||
"additional_docker_run_options": [
|
||||
],
|
||||
"infiniband": false,
|
||||
"gpu": false,
|
||||
"max_task_retries": 3,
|
||||
"retention_time": "1.12:00:00",
|
||||
"multi_instance": {
|
||||
"num_instances": "pool_current_dedicated",
|
||||
"coordination_command": null,
|
||||
"resource_files": [
|
||||
{
|
||||
"file_path": "",
|
||||
"blob_source": "",
|
||||
"file_mode": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"entrypoint": null,
|
||||
"command": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
`job_specifications` array consists of jobs to create.
|
||||
* (required) `id` is the job id to create. If the job already exists, the
|
||||
specified `tasks` under the job will be added to the existing job.
|
||||
* (optional) `multi_instance_auto_complete` enables auto-completion of the job
|
||||
for which a multi-instance task is run. This allows automatic cleanup of the
|
||||
Docker container in multi-instance tasks. This is defaulted to `true` when
|
||||
multi-instance tasks are specified.
|
||||
* (optional) `environment_variables` under the job are environment variables
|
||||
which will be applied to all tasks operating under the job. Note that
|
||||
environment variables are not expanded and are passed as-is. You will need
|
||||
to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist`
|
||||
in a shell within the docker `command` or `entrypoint` if you want any
|
||||
environment variables to be expanded.
|
||||
* (optional) `environment_variables_keyvault_secret_id` under the job are
|
||||
environment variables stored in KeyVault that should be applied to all tasks
|
||||
operating under the job. The secret stored in KeyVault must be a valid json
|
||||
string, e.g., `{ "env_var_name": "env_var_value" }`.
|
||||
* (optional) `max_task_retries` sets the maximum number of times that
|
||||
Azure Batch should retry all tasks in this job for. By default, Azure Batch
|
||||
does not retry tasks that fail (i.e. `max_task_retries` is 0).
|
||||
* (optional) `allow_run_on_missing_image` allows tasks with a Docker image reference
|
||||
that was not pre-loaded on to the compute node via
|
||||
`global_resources`:`docker_images` in the global configuration to be able to
|
||||
run. Note that you should attempt to specify all Docker images that you intend
|
||||
to run in the `global_resources`:`docker_images` property in the global
|
||||
configuration to minimize scheduling to task execution latency.
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed for the job. Any `input_data` defined at this level will be
|
||||
downloaded for this job which can be run on any number of compute nodes
|
||||
depending upon the number of constituent tasks and repeat invocations. However,
|
||||
`input_data` is only downloaded once per job invocation on a compute node.
|
||||
For example, if `job-1`:`task-1` is run on compute node A and then
|
||||
`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed
|
||||
to both compute node A and B. However, if `job-1`:`task-3` is then run on
|
||||
compute node A after `job-1`:`task-1`, then the `input_data` is not
|
||||
transferred again. This object currently supports `azure_batch` and
|
||||
`azure_storage` as members.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (required) `destination` is the destination path to place the files
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it may contain at most
|
||||
one filter.
|
||||
* (required) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Please note that you should
|
||||
not specify a destination that is on a shared file system. If you
|
||||
require ingressing to a shared file system location like a GlusterFS
|
||||
volume, then use the global configuration `files` property and the
|
||||
`data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (required) `tasks` is an array of tasks to add to the job.
|
||||
* (optional) `id` is the task id. Note that if the task `id` is null or
|
||||
empty then a generic task id will be assigned. The generic task id is
|
||||
formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is
|
||||
increased by 1 for each task added to the same job. If there are more
|
||||
than `99999` autonamed tasks in a job then the numbering is not
|
||||
padded for tasks exceeding 5 digits.
|
||||
* (optional) `depends_on` is an array of task ids that this container
|
||||
invocation (task) depends on; the listed tasks must run to successful
|
||||
completion prior to this task executing.
|
||||
* (optional) `depends_on_range` is an array with exactly two integral
|
||||
elements containing a task `id` range that this task is dependent
|
||||
upon, i.e., the start `id` and the end `id` of the range this task depends
|
||||
on. Although task `id`s are always strings, the dependent task `id`s for
|
||||
ranges must be expressed by their integral representation for this
|
||||
property. This also implies that task `id`s for which this task depends
|
||||
on must be integral in nature. For example, if `depends_on_range` is set
|
||||
to `[1, 10]` (note the integral members), then there should be task
|
||||
`id`s of `"1"`, `"2"`, ... `"10"` within the job. Once these dependent
|
||||
tasks complete successfully, then this specified task will execute.
|
||||
* (required) `image` is the Docker image to use for this task
|
||||
* (optional) `name` is the name to assign to the container. If not
|
||||
specified, the value of the `id` property will be used for `name`.
|
||||
* (optional) `labels` is an array of labels to apply to the container.
|
||||
* (optional) `environment_variables` are any additional task-specific
|
||||
environment variables that should be applied to the container. Note that
|
||||
environment variables are not expanded and are passed as-is. You will
|
||||
need to source the environment file
|
||||
`$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the
|
||||
docker `command` or `entrypoint` if you want any environment variables
|
||||
to be expanded.
|
||||
* (optional) `environment_variables_keyvault_secret_id` are any additional
|
||||
task-specific environment variables that should be applied to the
|
||||
container but are stored in KeyVault. The secret stored in KeyVault must
|
||||
be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`.
|
||||
* (optional) `ports` is an array of port specifications that should be
|
||||
exposed to the host.
|
||||
* (optional) `data_volumes` is an array of `data_volume` aliases as defined
|
||||
in the global configuration file. These volumes will be mounted in the
|
||||
container.
|
||||
* (optional) `shared_data_volumes` is an array of `shared_data_volume`
|
||||
aliases as defined in the global configuration file. These volumes will be
|
||||
mounted in the container.
|
||||
* (optional) `resource_files` is an array of resource files that should be
|
||||
downloaded as part of the task. Each array entry contains the following
|
||||
information:
|
||||
* `file_path` is the path within the task working directory to place the
|
||||
file on the compute node.
|
||||
* `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure
|
||||
Blob Storage URL.
|
||||
* `file_mode` is the file mode to set for the file on the compute node.
|
||||
This is optional.
|
||||
* (optional) `input_data` is an object containing data that should be
|
||||
ingressed for this specific task. This object currently supports
|
||||
`azure_batch` and `azure_storage` as members. Note for multi-instance
|
||||
tasks, transfer of `input_data` is only applied to the task running the
|
||||
application command.
|
||||
* `azure_batch` contains the following members:
|
||||
* (required) `job_id` the job id of the task
|
||||
* (required) `task_id` the id of the task to fetch files from
|
||||
* (optional) `include` is an array of include filters
|
||||
* (optional) `exclude` is an array of exclude filters
|
||||
* (optional) `destination` is the destination path to place the files.
|
||||
If `destination` is not specified at this level, then files are
|
||||
defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`.
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when downloading
|
||||
from Azure Blob Storage or Azure File Storage, respectively.
|
||||
`container` specifies which container to download from for Azure Blob
|
||||
Storage while `file_share` specifies which file share to download from
|
||||
for Azure File Storage. Only one of these properties can be specified
|
||||
per `data_transfer` object.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it may contain at most
|
||||
one filter.
|
||||
* (optional) `destination` property defines where to place the
|
||||
downloaded files on the host file system. Unlike the job-level
|
||||
version of `input_data`, this `destination` property can be omitted.
|
||||
If `destination` is not specified at this level, then files are
|
||||
defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note
|
||||
that you should not specify a destination that is on a shared file
|
||||
system. If you require ingressing to a shared file system location
|
||||
like a GlusterFS volume, then use the global configuration `files`
|
||||
property and the `data ingress` command.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `output_data` is an object containing data that should be
|
||||
egressed for this specific task if and only if the task completes
|
||||
successfully. This object currently only supports `azure_storage` as a
|
||||
member. Note for multi-instance tasks, transfer of `output_data` is only
|
||||
applied to the task running the application command.
|
||||
* `azure_storage` contains the following members:
|
||||
* (required) `storage_account_settings` contains a storage account link
|
||||
as defined in the credentials json.
|
||||
* (required) `container` or `file_share` is required when uploading to
|
||||
Azure Blob Storage or Azure File Storage, respectively. `container`
|
||||
specifies which container to upload to for Azure Blob Storage while
|
||||
`file_share` specifies which file share to upload to for Azure File
|
||||
Storage. Only one of these properties can be specified per
|
||||
`data_transfer` object.
|
||||
* (optional) `source` property defines which directory to upload to
|
||||
Azure storage. If `source` is not specified, then `source` is
|
||||
defaulted to `$AZ_BATCH_TASK_DIR`.
|
||||
* (optional) `include` property defines an optional include filter.
|
||||
Although this property is an array, it may contain at most
|
||||
one filter.
|
||||
* (optional) `blobxfer_extra_options` are any extra options to pass to
|
||||
`blobxfer`.
|
||||
* (optional) `remove_container_after_exit` property specifies if the
|
||||
container should be automatically removed/cleaned up after it exits. This
|
||||
defaults to `false`.
|
||||
* (optional) `shm_size` property specifies the size of `/dev/shm` in
|
||||
the container. The default is `64m`. The postfix unit can be designated
|
||||
as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). This
|
||||
value may need to be increased from the default of `64m` for certain
|
||||
Docker applications, including multi-instance tasks using Intel MPI
|
||||
(see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)).
|
||||
* (optional) `additional_docker_run_options` is an array of additional Docker
|
||||
run options that should be passed to the Docker daemon when starting this
|
||||
container.
|
||||
* (optional) `infiniband` designates if this container requires access to the
|
||||
Infiniband/RDMA devices on the host. Note that this will automatically
|
||||
force the container to use the host network stack. If this property is
|
||||
set to `true`, ensure that the `pool_specification` property
|
||||
`inter_node_communication_enabled` is set to `true`.
|
||||
* (optional) `gpu` designates if this container requires access to the GPU
|
||||
devices on the host. If this property is set to `true`, Docker containers
|
||||
are instantiated via `nvidia-docker`. This requires N-series VM instances.
|
||||
* (optional) `max_task_retries` sets the maximum number of times that
|
||||
Azure Batch should retry this task for. This overrides the job-level task
|
||||
retry count. By default, Azure Batch does not retry tasks that fail
|
||||
(i.e. `max_task_retries` is 0).
|
||||
* (optional) `retention_time` sets the timedelta to retain the task
|
||||
directory on the compute node where it ran after the task completes.
|
||||
The format for this property is a timedelta with a string representation
|
||||
of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node
|
||||
to clean up this task's directory 36 hours after the task completed. The
|
||||
default, if unspecified, is effectively infinite - i.e., task data is
|
||||
retained forever on the compute node that ran the task.
|
||||
* (optional) `multi_instance` is a property indicating that this task is a
|
||||
multi-instance task. This is required if the Docker image is an MPI
|
||||
program. Additional information about multi-instance tasks and Batch
|
||||
Shipyard can be found
|
||||
[here](80-batch-shipyard-multi-instance-tasks.md). Do not define this
|
||||
property for tasks that are not multi-instance. Additional members of this
|
||||
property are:
|
||||
* `num_instances` is a property setting the number of compute node
|
||||
instances that are required for this multi-instance task. This can be any one
|
||||
of the following:
|
||||
1. An integral number
|
||||
2. `pool_current_dedicated` which is the instantaneous reading of the
|
||||
target pool's current dedicated count during this function invocation.
|
||||
3. `pool_specification_vm_count` which is the `vm_count` specified in the
|
||||
pool configuration.
|
||||
* `coordination_command` is the coordination command that is run by each
|
||||
instance (compute node) of this multi-instance task prior to the
|
||||
application command. This command must not block and must exit
|
||||
successfully for the multi-instance task to proceed. This is the command
|
||||
passed to the container in `docker run` for multi-instance tasks. This
|
||||
docker container instance will automatically be daemonized. This is
|
||||
optional and may be null.
|
||||
* `resource_files` is an array of resource files that should be downloaded
|
||||
as part of the multi-instance task. Each array entry contains the
|
||||
following information:
|
||||
* `file_path` is the path within the task working directory to place
|
||||
the file on the compute node.
|
||||
* `blob_source` is an accessible HTTP/HTTPS URL. This need not be an
|
||||
Azure Blob Storage URL.
|
||||
* `file_mode` is the file mode to set for the file on the compute node.
|
||||
This is optional.
|
||||
* (optional) `entrypoint` is the property that can override the Docker image
|
||||
defined `ENTRYPOINT`.
|
||||
* (optional) `command` is the command to execute in the Docker container
|
||||
context. If this task is a regular non-multi-instance task, then this is
|
||||
the command passed to the container context during `docker run`. If this
|
||||
task is a multi-instance task, then this `command` is the application
|
||||
command and is executed with `docker exec` in the running Docker container
|
||||
context from the `coordination_command` in the `multi_instance` property.
|
||||
This property may be null.
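
As a concrete illustration of several of the task properties above (explicit
task ids, `depends_on_range`, sourcing the generated environment file within
the `command`, and `output_data`), the following is a minimal, hypothetical
jobs configuration; the job id, image, container names, and commands are
placeholder values.

```json
{
  "job_specifications": [
    {
      "id": "sweepjob",
      "environment_variables": {
        "MODE": "quick"
      },
      "tasks": [
        {
          "id": "1",
          "image": "busybox",
          "remove_container_after_exit": true,
          "command": "/bin/sh -c \". $AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist && echo task one mode $MODE\""
        },
        {
          "id": "2",
          "image": "busybox",
          "remove_container_after_exit": true,
          "command": "/bin/sh -c \". $AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist && echo task two mode $MODE\""
        },
        {
          "id": "merge",
          "depends_on_range": [1, 2],
          "image": "busybox",
          "remove_container_after_exit": true,
          "output_data": {
            "azure_storage": [
              {
                "storage_account_settings": "mystorageaccount",
                "container": "output",
                "include": ["merge.dat"],
                "blobxfer_extra_options": null
              }
            ]
          },
          "command": "/bin/sh -c \"echo merged > merge.dat\""
        }
      ]
    }
  ]
}
```

Because the dependent task ids referenced by `depends_on_range` must be
integral, the two upstream tasks use explicit ids of `"1"` and `"2"` rather
than relying on autogenerated `dockertask-NNNNN` ids.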
|
||||
|
||||
## Full template
|
||||
A full template of a jobs configuration file can be found
|
||||
[here](../config\_templates/jobs.json). Note that this template cannot
|
||||
be used as-is and must be modified to fit your scenario.
|
|
@ -17,7 +17,9 @@ The following are general limitations or restrictions:
|
|||
* Compute pool resize down (i.e., removing nodes from a pool) is not supported
|
||||
when peer-to-peer transfer is enabled.
|
||||
* The maximum number of compute nodes with peer-to-peer enabled is currently
|
||||
40 for Linux pools for non-UserSubscription Batch accounts.
|
||||
40 for Linux pools for non-UserSubscription Batch accounts. This check is
|
||||
no longer performed before a pool is created; exceeding the limit will
|
||||
instead surface as a ResizeError on the pool if not all compute nodes can be allocated.
|
||||
* Data movement between Batch tasks as defined by `input_data`:`azure_batch`
|
||||
is restricted to Batch accounts with keys (non-AAD).
|
||||
* Virtual network support in Batch pools can only be used with
|
||||
|
|
|
@ -6,6 +6,10 @@ and effectively running your batch-style Docker workloads on Azure Batch.
|
|||
2. [Installation](01-batch-shipyard-installation.md)
|
||||
3. [Quick Start](02-batch-shipyard-quickstart.md)
|
||||
4. [Configuration](10-batch-shipyard-configuration.md)
|
||||
1. [Credentials Configuration](11-batch-shipyard-configuration-credentials.md)
|
||||
2. [Global Configuration](12-batch-shipyard-configuration-global.md)
|
||||
3. [Pool Configuration](13-batch-shipyard-configuration-pool.md)
|
||||
4. [Jobs Configuration](14-batch-shipyard-configuration-jobs.md)
|
||||
5. [Usage](20-batch-shipyard-usage.md)
|
||||
6. [Data Movement](70-batch-shipyard-data-movement.md)
|
||||
7. [Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md)
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -20,7 +20,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
* `inter_node_communication_enabled` must be set to `true`
|
||||
* `max_tasks_per_node` must be set to 1 or omitted
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
|
||||
### Global Configuration
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
|
||||
### Global Configuration
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -18,7 +18,7 @@ audio/video, it is best to choose `NV` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
* `gpu` property should be specified with the following members:
|
||||
* `nvidia_driver` property contains the following members:
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -18,7 +18,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
|
||||
### Global Configuration
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
* `inter_node_communication_enabled` must be set to `true`
|
||||
* `max_tasks_per_node` must be set to 1 or omitted
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 2,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -19,7 +19,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
* `max_tasks_per_node` must be set to 1 or omitted
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -20,7 +20,7 @@ If not using GPUs, another appropriate SKU can be selected.
|
|||
supported once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer` if using GPUs. Other offers will be
|
||||
supported once they are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS` if using GPUs. Other skus will be supported
|
||||
* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported
|
||||
once they are available for N-series VMs.
|
||||
|
||||
If on multiple CPUs:
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
"inter_node_communication_enabled": true,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
|
||||
### Global Configuration
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances.
|
|||
once they are available for N-series VMs.
|
||||
* `offer` should be `UbuntuServer`. Other offers will be supported once they
|
||||
are available for N-series VMs.
|
||||
* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
|
||||
### Global Configuration
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
"credentials": {
|
||||
"batch": {
|
||||
"account": "<batch account name>",
|
||||
"account_key": "<batch account key>",
|
||||
"account_service_url": "<batch account service url>"
|
||||
},
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"vm_count": 1,
|
||||
"publisher": "Canonical",
|
||||
"offer": "UbuntuServer",
|
||||
"sku": "16.04.0-LTS",
|
||||
"sku": "16.04-LTS",
|
||||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
|
|
|
@ -60,6 +60,9 @@ if [ $server_type == "nfs" ]; then
|
|||
echo ""
|
||||
echo "nfsstat:"
|
||||
nfsstat -s -4
|
||||
echo ""
|
||||
echo "connected clients:"
|
||||
netstat -tn | grep :2049
|
||||
else
|
||||
echo "$server_type not supported."
|
||||
exit 1
|
||||
|
|