diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9343168..7e1b321 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,19 +2,34 @@
 
 ## [Unreleased]
 ### Added
+- Support for provisioning storage clusters via the `fs cluster` command
+- Support for provisioning managed disks via the `fs disks` command
 - Support for UserSubscription Batch accounts
 - Azure Active Directory authentication support for Batch accounts
+- `allow_run_on_missing_image` option for jobs which allows tasks to
+execute with Docker images that have not been pre-loaded via the
+`global_resources`:`docker_images` setting in config.json. Note that, if
+possible, you should specify all Docker images that you intend to run in
+the `global_resources`:`docker_images` property of the global configuration
+to minimize the latency between task scheduling and execution.
+- Support for Canonical/UbuntuServer/16.04-LTS. This sku should be used
+instead of the old 16.04.0-LTS sku due to
+[issue #31](https://github.com/Azure/batch-shipyard/issues/31).
 
 ### Changed
 - **Breaking Change:** `glusterfs` `volume_driver` for `shared_data_volumes`
 should now be named as `glusterfs_on_compute`. This is to distinguish
 co-located glusterfs on compute nodes with possible standalone glusterfs
 `storage_cluster` remote mounted in the future.
-- Batch account (name) is now an optional property in the credentials config
+- Pool existence is now checked prior to job submission; job additions can
+now proceed against a nonexistent pool after confirmation.
+- Batch `account` (name) is now an optional property in the credentials config
+- Configuration doc broken up into multiple pages
+- Update all recipes using Canonical/UbuntuServer/16.04.0-LTS to use
+Canonical/UbuntuServer/16.04-LTS instead
+- Precompile python files for Docker images
 - All dependencies updated to latest versions
 - Update Batch API call compatibility for `azure-batch 2.0.0`
-- Precompile python files for Docker images
-- Configuration doc broken up into multiple pages
 
 ## [2.5.4] - 2017-03-08
 ### Changed
diff --git a/config_templates/credentials.json b/config_templates/credentials.json
index f61b95e..4155ede 100644
--- a/config_templates/credentials.json
+++ b/config_templates/credentials.json
@@ -37,10 +37,6 @@
         },
         "batch": {
             "account_service_url": "",
-            "account_key": "",
-            "account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey",
-            "user_subscription": false,
-            "resource_group": "",
             "aad": {
                 "endpoint": "https://batch.core.windows.net/",
                 "directory_id": "",
@@ -54,7 +50,10 @@
                     "enabled": true,
                     "filename": ""
                 }
-            }
+            },
+            "resource_group": "",
+            "account_key": "",
+            "account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey"
         },
         "storage": {
             "mystorageaccount": {
diff --git a/config_templates/fs.json b/config_templates/fs.json
index 1987233..4e3236a 100644
--- a/config_templates/fs.json
+++ b/config_templates/fs.json
@@ -18,6 +18,7 @@
         "static_public_ip": false,
         "virtual_network": {
             "name": "",
+            "resource_group": "",
             "existing_ok": false,
             "address_space": "",
             "subnet": {
@@ -28,7 +29,7 @@
         "network_security": {
             "nfs": ["1.2.3.0/24"],
             "ssh": ["*"],
-            "custom_inbound": {
+            "custom_inbound_rules": {
                 "myrule": {
                     "destination_port_range": "5000-5001",
                     "source_address_prefix": ["1.2.3.4", "5.6.7.0/24"],
diff --git a/config_templates/jobs.json b/config_templates/jobs.json
index b97e4f0..5fa8823 100644
--- a/config_templates/jobs.json
+++ b/config_templates/jobs.json
@@ -8,6 +8,7 @@
         },
         "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv",
"max_task_retries": 1, + "allow_run_on_missing_image": false, "input_data": { "azure_batch": [ { diff --git a/config_templates/pool.json b/config_templates/pool.json index b741563..3f544d5 100644 --- a/config_templates/pool.json +++ b/config_templates/pool.json @@ -33,6 +33,7 @@ }, "virtual_network": { "name": "", + "resource_group": "", "create_nonexistant": false, "address_space": "", "subnet": { diff --git a/convoy/batch.py b/convoy/batch.py index c043516..05499ae 100644 --- a/convoy/batch.py +++ b/convoy/batch.py @@ -66,6 +66,27 @@ _RUN_ELEVATED = batchmodels.UserIdentity( ) +def get_batch_account(batch_mgmt_client, config): + # type: (azure.mgmt.batch.BatchManagementClient, dict) -> + # azure.mgmt.batch.models.BatchAccount + """Get Batch account properties from ARM + :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client: + batch management client + :param dict config: configuration dict + :rtype: azure.mgmt.batch.models.BatchAccount + :return: Batch account + """ + if batch_mgmt_client is None: + raise RuntimeError( + 'Batch management client is invalid, please specify management ' + 'aad credentials') + bc = settings.credentials_batch(config) + return batch_mgmt_client.batch_account.get( + resource_group_name=bc.resource_group, + account_name=bc.account, + ) + + def list_node_agent_skus(batch_client): # type: (batch.BatchServiceClient) -> None """List all node agent skus @@ -1708,15 +1729,81 @@ def add_jobs( # get the pool inter-node comm setting bs = settings.batch_shipyard_settings(config) pool = settings.pool_settings(config) - _pool = batch_client.pool.get(pool.id) - global_resources = [] - for gr in settings.global_resources_docker_images(config): - global_resources.append(gr) + try: + cloud_pool = batch_client.pool.get(pool.id) + except batchmodels.batch_error.BatchErrorException as ex: + if 'The specified pool does not exist.' in ex.message.value: + logger.error('{} pool does not exist'.format(pool.id)) + if util.confirm_action( + config, 'add jobs to nonexistant pool {}'.format(pool.id)): + cloud_pool = None + else: + logger.error( + 'not submitting jobs to nonexistant pool {}'.format( + pool.id)) + return + else: + raise + global_resources = settings.global_resources_docker_images(config) lastjob = None lasttask = None for jobspec in settings.job_specifications(config): - jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format( - jpfile[0], ' '.join(global_resources))] + job_id = settings.job_id(jobspec) + # perform checks: + # 1. check docker images in task against pre-loaded on pool + # 2. if tasks have dependencies, set it if so + # 3. 
if there are multi-instance tasks + mi_ac = settings.job_multi_instance_auto_complete(config) + multi_instance = False + mi_docker_container_name = None + reserved_task_id = None + uses_task_dependencies = False + missing_images = [] + allow_run_on_missing = settings.job_allow_run_on_missing(jobspec) + for task in settings.job_tasks(jobspec): + # check if task docker image is set in config.json + di = settings.task_docker_image(task) + if di not in global_resources: + if allow_run_on_missing: + logger.warning( + ('docker image {} not pre-loaded on pool for a ' + 'task specified in job {}').format(di, job_id)) + missing_images.append(di) + else: + raise RuntimeError( + ('not submitting job {} with missing docker image {} ' + 'pre-load on pool {}').format(job_id, di, pool.id)) + # do not break, check to ensure ids are set on each task if + # task dependencies are set + if settings.has_depends_on_task(task): + uses_task_dependencies = True + if settings.is_multi_instance_task(task): + if multi_instance and mi_ac: + raise ValueError( + 'cannot specify more than one multi-instance task ' + 'per job with auto completion enabled') + multi_instance = True + mi_docker_container_name = settings.task_name(task) + if util.is_none_or_empty(mi_docker_container_name): + _id = settings.task_id(task) + if util.is_none_or_empty(_id): + reserved_task_id = _generate_next_generic_task_id( + batch_client, job_id) + settings.set_task_id(task, reserved_task_id) + _id = '{}-{}'.format(job_id, reserved_task_id) + settings.set_task_name(task, _id) + mi_docker_container_name = settings.task_name(task) + del _id + # construct job prep + if util.is_not_empty(global_resources): + if len(missing_images) > 0 and allow_run_on_missing: + gr = list(set(global_resources) - set(missing_images)) + else: + gr = global_resources + jpcmd = ['$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format( + jpfile[0], ' '.join(gr))] + else: + jpcmd = [] # digest any input_data addlcmds = data.process_input_data(config, bxfile, jobspec) if addlcmds is not None: @@ -1741,39 +1828,10 @@ def add_jobs( user_identity=_RUN_ELEVATED, rerun_on_node_reboot_after_success=False, ), - uses_task_dependencies=False, + uses_task_dependencies=uses_task_dependencies, constraints=job_constraints, ) lastjob = job.id - # perform checks: - # 1. if tasks have dependencies, set it if so - # 2. 
if there are multi-instance tasks - mi_ac = settings.job_multi_instance_auto_complete(config) - multi_instance = False - mi_docker_container_name = None - reserved_task_id = None - for task in settings.job_tasks(jobspec): - # do not break, check to ensure ids are set on each task if - # task dependencies are set - if settings.has_depends_on_task(task): - job.uses_task_dependencies = True - if settings.is_multi_instance_task(task): - if multi_instance and mi_ac: - raise ValueError( - 'cannot specify more than one multi-instance task ' - 'per job with auto completion enabled') - multi_instance = True - mi_docker_container_name = settings.task_name(task) - if util.is_none_or_empty(mi_docker_container_name): - _id = settings.task_id(task) - if util.is_none_or_empty(_id): - reserved_task_id = _generate_next_generic_task_id( - batch_client, job.id) - settings.set_task_id(task, reserved_task_id) - _id = '{}-{}'.format(job.id, reserved_task_id) - settings.set_task_name(task, _id) - mi_docker_container_name = settings.task_name(task) - del _id # add multi-instance settings set_terminate_on_all_tasks_complete = False if multi_instance and mi_ac: @@ -1784,7 +1842,7 @@ def add_jobs( 'docker rm -v {}'.format(mi_docker_container_name)]), user_identity=_RUN_ELEVATED, ) - logger.info('Adding job: {}'.format(job.id)) + logger.info('Adding job {} to pool {}'.format(job.id, pool.id)) try: batch_client.job.add(job) except batchmodels.batch_error.BatchErrorException as ex: @@ -1810,6 +1868,7 @@ def add_jobs( del mi_ac del multi_instance del mi_docker_container_name + del uses_task_dependencies # get base env vars from job job_env_vars = settings.job_environment_variables(jobspec) _job_env_vars_secid = \ @@ -1830,7 +1889,8 @@ def add_jobs( if util.is_none_or_empty(settings.task_name(_task)): settings.set_task_name(_task, '{}-{}'.format(job.id, _task_id)) del _task_id - task = settings.task_settings(_pool, config, _task) + task = settings.task_settings( + cloud_pool, config, pool, _task, missing_images) # retrieve keyvault task env vars if util.is_not_empty( task.environment_variables_keyvault_secret_id): diff --git a/convoy/clients.py b/convoy/clients.py index c806a6c..76fb414 100644 --- a/convoy/clients.py +++ b/convoy/clients.py @@ -118,6 +118,31 @@ def create_network_client(ctx, credentials=None, subscription_id=None): credentials, subscription_id) +def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None): + # type: (CliContext, object, str) -> + # azure.mgmt.batch.BatchManagementClient + """Create batch management client + :param CliContext ctx: Cli Context + :param object credentials: credentials object + :param str subscription_id: subscription id + :rtype: azure.mgmt.batch.BatchManagementClient + :return: batch management client + """ + mgmt_aad = None + if credentials is None: + mgmt_aad = settings.credentials_management(ctx.config).aad + credentials = aad.create_aad_credentials(ctx, mgmt_aad) + if util.is_none_or_empty(subscription_id): + if mgmt_aad is None: + mgmt_aad = settings.credentials_management(ctx.config).aad + subscription_id = ctx.subscription_id or mgmt_aad.subscription_id + batch_mgmt_client = azure.mgmt.batch.BatchManagementClient( + credentials, subscription_id) + batch_mgmt_client.config.add_user_agent( + 'batch-shipyard/{}'.format(__version__)) + return batch_mgmt_client + + def create_arm_clients(ctx, batch_clients=False): # type: (CliContext, bool) -> # Tuple[azure.mgmt.resource.resources.ResourceManagementClient, @@ -148,10 +173,16 @@ def create_arm_clients(ctx, 
batch_clients=False): network_client = create_network_client( ctx, credentials=credentials, subscription_id=subscription_id) if batch_clients: - batch_mgmt_client, batch_client = create_batch_clients(ctx) + batch_client = create_batch_service_client(ctx) + try: + batch_mgmt_client = create_batch_mgmt_client( + ctx, credentials=credentials, subscription_id=subscription_id) + except Exception: + logger.warning('could not create batch management client') + batch_mgmt_client = None else: - batch_mgmt_client = None batch_client = None + batch_mgmt_client = None return ( resource_client, compute_client, network_client, batch_mgmt_client, batch_client @@ -171,60 +202,25 @@ def create_keyvault_client(ctx): ) -def create_batch_mgmt_client(ctx, credentials=None, subscription_id=None): - # type: (CliContext, object, str) -> - # azure.mgmt.batch.BatchManagementClient - """Create batch management client +def create_batch_service_client(ctx): + # type: (CliContext) -> azure.batch.batch_service_client.BatchServiceClient + """Create batch service client :param CliContext ctx: Cli Context - :param object credentials: credentials object - :param str subscription_id: subscription id - :rtype: azure.mgmt.batch.BatchManagementClient - :return: batch management client - """ - batch_aad = None - if credentials is None: - batch_aad = settings.credentials_batch(ctx.config).aad - credentials = aad.create_aad_credentials(ctx, batch_aad) - if util.is_none_or_empty(subscription_id): - if batch_aad is None: - batch_aad = settings.credentials_batch(ctx.config).aad - subscription_id = ctx.subscription_id or batch_aad.subscription_id - if util.is_none_or_empty(subscription_id): - return None - batch_mgmt_client = azure.mgmt.batch.BatchManagementClient( - credentials, subscription_id) - batch_mgmt_client.config.add_user_agent( - 'batch-shipyard/{}'.format(__version__)) - return batch_mgmt_client - - -def create_batch_clients(ctx): - # type: (CliContext) -> - # Tuple[azure.mgmt.batch.BatchManagementClient, - # azure.batch.batch_service_client.BatchServiceClient] - """Create batch client - :param CliContext ctx: Cli Context - :rtype: tuple - :return: ( - azure.mgmt.batch.BatchManagementClient, - azure.batch.batch_service_client.BatchServiceClient) + :rtype: azure.batch.batch_service_client.BatchServiceClient + :return: batch service client """ bc = settings.credentials_batch(ctx.config) - use_aad = bc.user_subscription or util.is_none_or_empty(bc.account_key) - batch_mgmt_client = None - if use_aad: - subscription_id = ctx.subscription_id or bc.subscription_id + if util.is_none_or_empty(bc.account_key): + logger.debug('batch account key not specified, using aad auth') batch_aad = settings.credentials_batch(ctx.config).aad credentials = aad.create_aad_credentials(ctx, batch_aad) - batch_mgmt_client = create_batch_mgmt_client( - ctx, credentials=credentials, subscription_id=subscription_id) else: credentials = batchauth.SharedKeyCredentials( bc.account, bc.account_key) batch_client = batchsc.BatchServiceClient( credentials, base_url=bc.account_service_url) batch_client.config.add_user_agent('batch-shipyard/{}'.format(__version__)) - return (batch_mgmt_client, batch_client) + return batch_client def create_storage_clients(): diff --git a/convoy/fleet.py b/convoy/fleet.py index df41f09..632578a 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -43,6 +43,7 @@ except ImportError: import uuid # non-stdlib imports import azure.batch.models as batchmodels +import azure.mgmt.batch.models as batchmgmtmodels # local imports from . 
import batch from . import crypto @@ -451,10 +452,15 @@ def _add_pool( raise ValueError( 'Invalid subnet name on virtual network {}'.format( pool_settings.virtual_network.name)) + if util.is_not_empty(pool_settings.virtual_network.resource_group): + _vnet_rg = pool_settings.virtual_network.resource_group + else: + _vnet_rg = bc.resource_group # create virtual network and subnet if specified vnet, subnet = resource.create_virtual_network_and_subnet( - network_client, bc.resource_group, bc.location, + network_client, _vnet_rg, bc.location, pool_settings.virtual_network) + del _vnet_rg # ensure address prefix for subnet is valid tmp = subnet.address_prefix.split('/') if len(tmp) <= 1: @@ -490,7 +496,9 @@ def _add_pool( sc_arg = None if storage_cluster_mount: # ensure usersubscription account - if not bc.user_subscription: + ba = batch.get_batch_account(batch_mgmt_client, config) + if (not ba.pool_allocation_mode == + batchmgmtmodels.PoolAllocationMode.user_subscription): raise RuntimeError( '{} account is not a UserSubscription account'.format( bc.account)) @@ -1135,21 +1143,6 @@ def _adjust_settings_for_pool_creation(config): # adjust inter node comm setting if pool.vm_count < 1: raise ValueError('invalid vm_count: {}'.format(pool.vm_count)) - dr = settings.data_replication_settings(config) - max_vms = 20 if publisher == 'microsoftwindowsserver' else 40 - if pool.vm_count > max_vms: - if dr.peer_to_peer.enabled: - logger.warning( - ('disabling peer-to-peer transfer as pool size of {} exceeds ' - 'max limit of {} vms for inter-node communication').format( - pool.vm_count, max_vms)) - settings.set_peer_to_peer_enabled(config, False) - if pool.inter_node_communication_enabled: - logger.warning( - ('disabling inter-node communication as pool size of {} ' - 'exceeds max limit of {} vms for setting').format( - pool.vm_count, max_vms)) - settings.set_inter_node_communication_enabled(config, False) # re-read pool and data replication settings pool = settings.pool_settings(config) dr = settings.data_replication_settings(config) diff --git a/convoy/remotefs.py b/convoy/remotefs.py index 1d1d13c..6368568 100644 --- a/convoy/remotefs.py +++ b/convoy/remotefs.py @@ -632,9 +632,14 @@ def create_storage_cluster( # upload scripts to blob storage for customscript blob_urls = storage.upload_for_remotefs(blob_client, remotefs_files) # create virtual network and subnet if specified + if util.is_not_empty(rfs.storage_cluster.virtual_network.resource_group): + _vnet_rg = rfs.storage_cluster.virtual_network.resource_group + else: + _vnet_rg = rfs.resource_group vnet, subnet = resource.create_virtual_network_and_subnet( - network_client, rfs.resource_group, rfs.location, + network_client, _vnet_rg, rfs.location, rfs.storage_cluster.virtual_network) + del _vnet_rg # TODO create slb diff --git a/convoy/settings.py b/convoy/settings.py index cf0ff17..ae38e3c 100644 --- a/convoy/settings.py +++ b/convoy/settings.py @@ -98,7 +98,7 @@ ManagementCredentialsSettings = collections.namedtuple( BatchCredentialsSettings = collections.namedtuple( 'BatchCredentialsSettings', [ 'aad', 'account', 'account_key', 'account_service_url', - 'user_subscription', 'resource_group', 'subscription_id', 'location', + 'resource_group', 'subscription_id', 'location', ] ) StorageCredentialsSettings = collections.namedtuple( @@ -176,8 +176,8 @@ ManagedDisksSettings = collections.namedtuple( ) VirtualNetworkSettings = collections.namedtuple( 'VirtualNetworkSettings', [ - 'name', 'address_space', 'subnet_name', 'subnet_address_prefix', - 
'existing_ok', 'create_nonexistant', + 'name', 'resource_group', 'address_space', 'subnet_name', + 'subnet_address_prefix', 'existing_ok', 'create_nonexistant', ] ) FileServerSettings = collections.namedtuple( @@ -718,7 +718,6 @@ def credentials_batch(config): account = _kv_read_checked(conf, 'account') account_key = _kv_read_checked(conf, 'account_key') account_service_url = conf['account_service_url'] - user_subscription = _kv_read(conf, 'user_subscription', False) resource_group = _kv_read_checked(conf, 'resource_group') # get subscription id from management section try: @@ -749,7 +748,6 @@ def credentials_batch(config): account=account, account_key=account_key, account_service_url=conf['account_service_url'], - user_subscription=user_subscription, resource_group=resource_group, location=location, subscription_id=subscription_id, @@ -1804,6 +1802,22 @@ def job_max_task_retries(conf): return max_task_retries +def job_allow_run_on_missing(conf): + # type: (dict) -> int + """Get allow task run on missing image + :param dict conf: job configuration object + :rtype: bool + :return: allow run on missing image + """ + try: + allow = conf['allow_run_on_missing_image'] + if allow is None: + raise KeyError() + except KeyError: + allow = False + return allow + + def has_depends_on_task(conf): # type: (dict) -> bool """Determines if task has task dependencies @@ -1825,7 +1839,7 @@ def has_depends_on_task(conf): def is_multi_instance_task(conf): # type: (dict) -> bool """Determines if task is multi-isntance - :param dict conf: job configuration object + :param dict conf: task configuration object :rtype: bool :return: task is multi-instance """ @@ -1835,7 +1849,7 @@ def is_multi_instance_task(conf): def task_name(conf): # type: (dict) -> str """Get task name - :param dict conf: job configuration object + :param dict conf: task configuration object :rtype: str :return: task name """ @@ -1848,10 +1862,26 @@ def task_name(conf): return name +def task_docker_image(conf): + # type: (dict) -> str + """Get docker image used by task + :param dict conf: task configuration object + :rtype: str + :return: docker image used by task + """ + try: + di = conf['image'] + if util.is_none_or_empty(di): + raise KeyError() + except KeyError: + di = None + return di + + def set_task_name(conf, name): # type: (dict, str) -> None """Set task name - :param dict conf: job configuration object + :param dict conf: task configuration object :param str name: task name to set """ conf['name'] = name @@ -1860,7 +1890,7 @@ def set_task_name(conf, name): def task_id(conf): # type: (dict) -> str """Get task id - :param dict conf: job configuration object + :param dict conf: task configuration object :rtype: str :return: task id """ @@ -1876,18 +1906,21 @@ def task_id(conf): def set_task_id(conf, id): # type: (dict, str) -> None """Set task id - :param dict conf: job configuration object + :param dict conf: task configuration object :param str id: task id to set """ conf['id'] = id -def task_settings(pool, config, conf): - # type: (azure.batch.models.CloudPool, dict, dict) -> TaskSettings +def task_settings(cloud_pool, config, poolconf, conf, missing_images): + # type: (azure.batch.models.CloudPool, dict, PoolSettings, + # dict, list) -> TaskSettings """Get task settings - :param azure.batch.models.CloudPool pool: cloud pool object + :param azure.batch.models.CloudPool cloud_pool: cloud pool object :param dict config: configuration dict - :param dict conf: job configuration object + :param PoolSettings poolconf: pool settings + 
:param dict conf: task configuration object + :param list missing_images: list of missing docker images on pool :rtype: TaskSettings :return: task settings """ @@ -1898,11 +1931,36 @@ def task_settings(pool, config, conf): image = conf['image'] if util.is_none_or_empty(image): raise ValueError('image is invalid') + # check if image is in missing image list + if image in missing_images: + # get private registry settings + preg = docker_registry_private_settings(config) + if util.is_not_empty(preg.storage_account): + registry = 'localhost:5000/' + elif util.is_not_empty(preg.server): + registry = '{}/'.format(preg.server) + else: + registry = '' + del preg + image = '{}{}'.format(registry, image) # get some pool props - publisher = pool.virtual_machine_configuration.image_reference.\ - publisher.lower() - offer = pool.virtual_machine_configuration.image_reference.offer.lower() - sku = pool.virtual_machine_configuration.image_reference.sku.lower() + if cloud_pool is None: + pool_id = poolconf.id + publisher = poolconf.publisher.lower() + offer = poolconf.offer.lower() + sku = poolconf.sku.lower() + vm_size = poolconf.vm_size + inter_node_comm = poolconf.inter_node_communication_enabled + else: + pool_id = cloud_pool.id + publisher = cloud_pool.virtual_machine_configuration.image_reference.\ + publisher.lower() + offer = cloud_pool.virtual_machine_configuration.image_reference.\ + offer.lower() + sku = cloud_pool.virtual_machine_configuration.image_reference.sku.\ + lower() + vm_size = cloud_pool.vm_size.lower() + inter_node_comm = cloud_pool.enable_inter_node_communication # get depends on try: depends_on = conf['depends_on'] @@ -2088,10 +2146,10 @@ def task_settings(pool, config, conf): gpu = False # adjust for gpu settings if gpu: - if not is_gpu_pool(pool.vm_size): + if not is_gpu_pool(vm_size): raise RuntimeError( ('cannot initialize a gpu task on nodes without ' - 'gpus, pool: {} vm_size: {}').format(pool.id, pool.vm_size)) + 'gpus, pool: {} vm_size: {}').format(pool_id, vm_size)) # TODO other images as they become available with gpu support if (publisher != 'canonical' and offer != 'ubuntuserver' and sku < '16.04'): @@ -2107,16 +2165,16 @@ def task_settings(pool, config, conf): docker_exec_cmd = 'docker exec' # adjust for infiniband if infiniband: - if not pool.enable_inter_node_communication: + if not inter_node_comm: raise RuntimeError( ('cannot initialize an infiniband task on a ' 'non-internode communication enabled ' - 'pool: {}').format(pool.id)) - if not is_rdma_pool(pool.vm_size): + 'pool: {}').format(pool_id)) + if not is_rdma_pool(vm_size): raise RuntimeError( ('cannot initialize an infiniband task on nodes ' 'without RDMA, pool: {} vm_size: {}').format( - pool.id, pool.vm_size)) + pool_id, vm_size)) # only centos-hpc and sles-hpc:12-sp1 are supported # for infiniband if publisher == 'openlogic' and offer == 'centos-hpc': @@ -2147,7 +2205,7 @@ def task_settings(pool, config, conf): run_opts.append('--env-file {}'.format(envfile)) # populate mult-instance settings if is_multi_instance_task(conf): - if not pool.enable_inter_node_communication: + if not inter_node_comm: raise RuntimeError( ('cannot run a multi-instance task on a ' 'non-internode communication enabled ' @@ -2194,7 +2252,12 @@ def task_settings(pool, config, conf): if num_instances == 'pool_specification_vm_count': num_instances = pool_vm_count(config) elif num_instances == 'pool_current_dedicated': - num_instances = pool.current_dedicated + if cloud_pool is None: + raise RuntimeError( + ('Cannot retrieve current 
dedicated count for ' + 'pool: {}. Ensure pool exists.)'.format(pool_id))) + else: + num_instances = cloud_pool.current_dedicated else: raise ValueError( ('multi instance num instances setting ' @@ -2267,6 +2330,7 @@ def virtual_network_settings( except KeyError: conf = {} name = _kv_read_checked(conf, 'name') + resource_group = _kv_read_checked(conf, 'resource_group') address_space = _kv_read_checked(conf, 'address_space') existing_ok = _kv_read(conf, 'existing_ok', default_existing_ok) subnet_name = _kv_read_checked(conf['subnet'], 'name') @@ -2275,6 +2339,7 @@ def virtual_network_settings( conf, 'create_nonexistant', default_create_nonexistant) return VirtualNetworkSettings( name=name, + resource_group=resource_group, address_space=address_space, subnet_name=subnet_name, subnet_address_prefix=subnet_address_prefix, @@ -2331,9 +2396,9 @@ def remotefs_settings(config): ) if not isinstance(sc_ns_inbound['nfs'].source_address_prefix, list): raise ValueError('expected list for nfs network security rule') - if 'custom_inbound' in ns_conf: + if 'custom_inbound_rules' in ns_conf: _reserved = frozenset(['ssh', 'nfs', 'glusterfs']) - for key in ns_conf['custom_inbound']: + for key in ns_conf['custom_inbound_rules']: # ensure key is not reserved if key.lower() in _reserved: raise ValueError( @@ -2341,11 +2406,13 @@ def remotefs_settings(config): 'reserved name {}').format(key, _reserved)) sc_ns_inbound[key] = InboundNetworkSecurityRule( destination_port_range=_kv_read_checked( - ns_conf['custom_inbound'][key], 'destination_port_range'), + ns_conf['custom_inbound_rules'][key], + 'destination_port_range'), source_address_prefix=_kv_read_checked( - ns_conf['custom_inbound'][key], 'source_address_prefix'), + ns_conf['custom_inbound_rules'][key], + 'source_address_prefix'), protocol=_kv_read_checked( - ns_conf['custom_inbound'][key], 'protocol'), + ns_conf['custom_inbound_rules'][key], 'protocol'), ) if not isinstance(sc_ns_inbound[key].source_address_prefix, list): raise ValueError( diff --git a/docs/10-batch-shipyard-configuration.md b/docs/10-batch-shipyard-configuration.md index c982152..1ad1c9d 100644 --- a/docs/10-batch-shipyard-configuration.md +++ b/docs/10-batch-shipyard-configuration.md @@ -7,10 +7,12 @@ Batch Shipyard is driven by the following json configuration files: 1. [Credentials](11-batch-shipyard-configuration-credentials.md) - credentials for Azure Batch, Storage, KeyVault, Management and Docker private registries -2. [Global config](#global) - Batch Shipyard and Docker-specific configuration -settings -3. [Pool](#pool) - Azure Batch pool configuration -4. [Jobs](#jobs) - Azure Batch jobs and tasks configuration +2. [Global config](12-batch-shipyard-configuration-global.md) - +Batch Shipyard and Docker-specific configuration settings +3. [Pool](13-batch-shipyard-configuration-pool.md) - +Batch Shipyard pool configuration +4. [Jobs](14-batch-shipyard-configuration-jobs.md) - +Batch Shipyard jobs and tasks configuration Note that all potential properties are described here and that specifying all such properties may result in invalid configuration as some properties @@ -26,940 +28,5 @@ may be invalid if specified as such. They must be modified for your execution scenario. All [sample recipe](../recipes) also have a set of configuration files that can be modified to fit your needs. 
-### Global Config -The global config schema is as follows: - -```json -{ - "batch_shipyard": { - "storage_account_settings": "mystorageaccount", - "storage_entity_prefix": "shipyard", - "generated_sas_expiry_days": 90, - "encryption" : { - "enabled": true, - "pfx": { - "filename": "encrypt.pfx", - "passphrase": "mysupersecretpassword", - "sha1_thumbprint": "123456789..." - }, - "public_key_pem": "encrypt.pem" - } - }, - "docker_registry": { - "private": { - "allow_public_docker_hub_pull_on_missing": true, - "server": "myserver-myorg.azurecr.io", - "azure_storage": { - "storage_account_settings": "mystorageaccount", - "container": "mydockerregistry" - } - } - }, - "data_replication": { - "peer_to_peer": { - "enabled": true, - "compression": true, - "concurrent_source_downloads": 10, - "direct_download_seed_bias": null - }, - "non_peer_to_peer_concurrent_downloading": true - }, - "global_resources": { - "docker_images": [ - "busybox", - "redis:3.2.3-alpine", - ], - "files": [ - { - "source": { - "path": "/some/local/path/dir", - "include": ["*.dat"], - "exclude": ["*.bak"] - }, - "destination": { - "shared_data_volume": "glustervol", - "relative_destination_path": "myfiles", - "data_transfer": { - "method": "multinode_scp", - "ssh_private_key": "id_rsa_shipyard", - "scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com", - "rsync_extra_options": "", - "split_files_megabytes": 500, - "max_parallel_transfers_per_node": 2 - } - } - }, - { - "source": { - "path": "/some/local/path/bound/for/blob", - "include": ["*.bin"] - }, - "destination": { - "storage_account_settings": "mystorageaccount", - "data_transfer": { - "container": "mycontainer", - "blobxfer_extra_options": "--no-computefilemd5" - } - } - }, - { - "source": { - "path": "/another/local/path/dir", - "include": [], - "exclude": [] - }, - "destination": { - "relative_destination_path": "relpath/on/host", - "data_transfer": { - "method": "rsync+ssh", - "ssh_private_key": "id_rsa_shipyard", - "scp_ssh_extra_options": "-c aes256-gcm@openssh.com", - "rsync_extra_options": "-v" - } - } - } - ], - "docker_volumes": { - "data_volumes": { - "abcvol": { - "host_path": null, - "container_path": "/abc" - }, - "hosttempvol": { - "host_path": "/tmp", - "container_path": "/hosttmp" - } - }, - "shared_data_volumes": { - "shipyardvol": { - "volume_driver": "azurefile", - "storage_account_settings": "mystorageaccount", - "azure_file_share_name": "shipyardshared", - "container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile", - "mount_options": [ - "filemode=0777", - "dirmode=0777", - "nolock=true" - ] - }, - "glustervol": { - "volume_driver": "glusterfs_on_compute", - "container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs", - "volume_type": "replica", - "volume_options": [ - "performance.cache-size 1 GB", - "performance.cache-max-file-size 10 MB", - "performance.cache-refresh-timeout 61", - ] - } - } - } - } -} -``` - -The `batch_shipyard` property is used to set settings for the tool. -* (required) `storage_account_settings` is a link to the alias of the storage -account specified, in this case, it is `mystorageaccount`. Batch shipyard -requires a storage account for storing metadata in order to execute across a -distributed environment. -* (optional) `storage_entity_prefix` property is used as a generic qualifier -to prefix storage containers (blob containers, tables, queues) with. If not -specified, defaults to `shipyard`. -* (optional) `generated_sas_expiry_days` property is used to set the number of -days any generated SAS key by Batch Shipyard is valid for. 
The default is 30 -days. This is useful if you have long-lived pools and want to ensure that -SAS keys are valid for longer periods of time. -* (optional) `encryption` object is used to define credential encryption which -contains the following members: - * (required) `enabled` property enables or disables this feature. - * (required) `pfx` object defines the PFX certificate - * (required) `filename` property is the full path and name to the PFX - certificate - * (required) `passphrase` property is the passphrase for the PFX - certificate. This cannot be empty. - * (optional) `sha1_thumbprint` is the SHA1 thumbprint of the - certificate. If the PFX file is created using the `cert create` command, - then the SHA1 thumbprint is output. It is recommended to populate this - property such that it does not have to be generated when needed for - encryption. - * (optional) `public_key_pem` property is the full path and name to the - RSA public key in PEM format. If the PFX file is created using the - `cert create` command, then this file is generated along with the PFX - file. It is recommended to populate this property with the PEM file path - such that it does not have to be generated when needed for encryption. - -The `docker_registry` property is used to configure Docker image distribution -options from public/private Docker hub and private registries. -* (optional) `private` property controls settings for interacting with private -registries. There are three kinds of private registries that are supported: -(1) private registries hosted on Docker Hub, (2) Internet accessible -registries such as those hosted by the -[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/) -service and (3) [private registry instances backed to -Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/) -and are run on compute nodes. To use private registries hosted on Docker Hub, -no additional properties need to be specified here, instead, specify your -Docker Hub login information in the credentials json. To specify a private -registry other than on Docker Hub, a json property named `server` should be -defined. To use a private registry backed by Azure Blob Storage, define a -json object named `azure_storage`. Note that a maximum of only one of these -three types private registries may be specified at once. The following -describes members of the non-Docker Hub private registries supported: - * (optional) `server` object is a property that is the fully-qualified host - name to a private registry server. A specific port other than 80 can be - specified using a `:` separator, e.g., - `mydockerregistry.com:8080`. Port 80 is the default if no port is - specified. The value of this property should have an associated login - in the credentials json file. - * (optional) `azure_storage` object is to define settings for connecting - to a private registry backed by Azure Storage blobs and where the - private registry instances are hosted on the compute nodes themselves. - * (required) `storage_account_settings` is a link to the alias of the - storage account specified that stores the private registry blobs. - * (required) `container` property is the name of the Azure Blob - container holding the private registry blobs. - * (optional) `allow_public_docker_hub_pull_on_missing` property allows - pass-through of Docker image retrieval to public Docker Hub if it is - missing in the private registry. 
This defaults to `false` if not - specified. - -The `data_replication` property is used to configure the internal image -replication mechanism between compute nodes within a compute pool. The -`non_peer_to_peer_concurrent_downloading` property specifies if it is ok -to allow unfettered concurrent downloading from the source registry among -all compute nodes. The following options apply to `peer_to_peer` data -replication options: -* (optional) `enabled` property enables or disables private peer-to-peer -transfer. Note that for compute pools with a relatively small number of VMs, -peer-to-peer transfer may not provide any benefit and is recommended to be -disabled in these cases. Compute pools with large number of VMs and especially -in the case of an Azure Storage-backed private registry can benefit from -peer-to-peer image replication. -* `compression` property enables or disables compression of image files. It -is strongly recommended to keep this enabled. -* `concurrent_source_downloads` property specifies the number of -simultaneous downloads allowed to each image. -* `direct_download_seed_bias` property sets the number of direct download -seeds to prefer per image before switching to peer-to-peer transfer. - -The `global_resources` property contains information regarding required -Docker images, volume configuration and data ingress information. This -property is required. - -`docker_images` is an array of docker images that should be installed on -every compute node when this configuration file is supplied while creating -a compute pool. Image tags are supported. Image names should not include -private registry server names, as these will be automatically prepended. For -instance, if you have an image `abc/mytag` on your private registry -`myregistry-myorg.azurecr.io`, your image should be named in the -`docker_images` array as `abc/mytag` and not -`myregistry-myorg.azurecr.io/abc/mytag`. - -`files` is an optional property that specifies data that should be ingressed -from a location accessible by the local machine (i.e., machine invoking -`shipyard.py` to a shared file system location accessible by compute nodes -in the pool or Azure Blob or File Storage). `files` is a json list of objects, -which allows for multiple sources to destinations to be ingressed during the -same invocation. Note that no Azure Batch environment variables -(i.e., `$AZ_BATCH_`-style environment variables) are available as path -arguments since ingress actions performed within `files` are done locally -on the machine invoking `shipyard.py`. Each object within the `files` list -contains the following members: -* (required) `source` property contains the following members: - * (required) `path` is a local path. A single file or a directory - can be specified. Filters below will be ignored if `path` is a file and - not a directory. - * (optional) `include` is an array of - [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html) - where only files matching a filter are included in the data transfer. - Filters specified in `include` have precedence over `exclude` described - next. `include` can only have a maximum of 1 filter for ingress to Azure - Blob Storage. In this example, all files ending in `.dat` are ingressed. - * (optional) `exclude` is an array of - [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html) - where files matching a filter are excluded from the data transfer. 
Filters - specified in `include` have precedence over filters specified in - `exclude`. `exclude` cannot be specified for ingress into Azure Blob - Storage. In this example, all files ending in `.bak` are skipped for - ingress. -* (required) `destination` property contains the following members: - * (required or optional) `shared_data_volume` or `storage_account_settings` - for data ingress to a GlusterFS volume or Azure Blob or File Storage. If - you are ingressing to a pool with only one compute node, you may omit - `shared_data_volume`. Otherwise, you may specify one or the other, but - not both in the same object. Please see below in the - `shared_data_volumes` for information on how to set up a GlusterFS share. - * (required or optional) `relative_destination_path` specifies a relative - destination path to place the files, with respect to the target root. - If transferring to a `shared_data_volume` then this is relative to the - GlusterFS volume root. If transferring to a pool with one single node in - it, thus, no `shared_data_volume` is specified in the prior property, then - this is relative to - [$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories). - To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended), - you can specify this property as empty string when not ingressing to - a `shared_data_volume`. Note that if `scp` is selected while attempting - to transfer directly to this aforementioned path, then `scp` will fail - with exit code of 1 but the transfer will have succeeded (this is due - to some of the permission options). If this property is not specified for - a `shared_data_volume`, then files will be placed directly in the - GlusterFS volume root. This property cannot be specified for a Azure - Storage destination (i.e., `storage_account_settings`). - * (required) `data_transfer` specifies how the transfer should take place. - The following list contains members for GlusterFS ingress when a GlusterFS - volume is provided for `shared_data_volume` (see below for ingressing to - Azure Blob or File Storage): - * (required) `method` specified which method should be used to ingress - data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or - `multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a - directory (recursively) to the remote share path. `multinode_scp` will - attempt to simultaneously transfer files to many compute nodes using - `scp` at the same time to speed up data transfer. `rsync+ssh` will - perform an rsync of files through SSH. `multinode_rsync+ssh` will - attempt to simultaneously transfer files using `rsync` to many compute - nodes at the same time to speed up data transfer with. Note that you may - specify the `multinode_*` methods even with only 1 compute node in a - pool which will allow you to take advantage of - `max_parallel_transfers_per_node` below. - * (optional) `ssh_private_key` location of the SSH private key for the - username specified in the `pool_specification`:`ssh` section when - connecting to compute nodes. The default is `id_rsa_shipyard`, if - omitted, which is automatically generated if no SSH key is specified - when an SSH user is added to a pool. - * (optional) `scp_ssh_extra_options` are any extra options to pass to - `scp` or `ssh` for `scp`/`multinode_scp` or - `rsync+ssh`/`multinode_rsync+ssh` methods, respectively. 
In the example - above, `-C` enables compression and `-c aes256-gcm@openssh.com` - is passed to `scp`, which can potentially increase the transfer speed by - selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel - AES-NI. - * (optional) `rsync_extra_options` are any extra options to pass to - `rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This - property is ignored for non-rsync transfer methods. - * (optional) `split_files_megabytes` splits files into chunks with the - specified size in MiB. This can potentially help with very large files. - This option forces the transfer `method` to `multinode_scp`. - Note that the destination file system must be able to accommodate - up to 2x the size of files which are split. Additionally, transfers - involving files which are split will incur reconstruction costs after - the transfer is complete, which will increase the total end-to-end - ingress time. However, in certain scenarios, by splitting files and - transferring chunks in parallel along with reconstruction may end up - being faster than transferring a large file without chunking. - * (optional) `max_parallel_transfers_per_node` is the maximum number of - parallel transfer to invoke per node with the - `multinode_scp`/`multinode_rsync+ssh` methods. For example, if there - are 3 compute nodes in the pool, and `2` is given for this option, then - there will be up to 2 scp sessions in parallel per compute node for a - maximum of 6 concurrent scp sessions to the pool. The default is 1 if - not specified or omitted. - * (required) `data_transfer` specifies how the transfer should take place. - When Azure Blob or File Storage is selected as the destination for data - ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The - following list contains members for Azure Blob or File Storage ingress - when a storage account link is provided for `storage_account_settings`: - * (required) `container` or `file_share` is required when uploading to - Azure Blob Storage or Azure File Storage, respectively. `container` - specifies which container to upload to for Azure Blob Storage while - `file_share` specifies which file share to upload to for Azure File - Storage. Only one of these properties can be specified per - `data_transfer` object. The container or file share need not be created - beforehand. - * (optional) `blobxfer_extra_options` are any extra options to pass to - `blobxfer`. In the example above, `--no-computefilemd5` will force - `blobxfer` to skip MD5 calculation on files ingressed. - -`docker_volumes` is an optional property that can consist of two -different types of volumes: `data_volumes` and `shared_data_volumes`. -`data_volumes` can be of two flavors depending upon if `host_path` is set to -null or not. In the former, this is typically used with the `VOLUME` keyword -in Dockerfiles to initialize a data volume with existing data inside the -image. If `host_path` is set, then the path on the host is mounted in the -container at the path specified with `container_path`. - -`shared_data_volumes` is an optional property for initializing persistent -shared storage volumes. In the first shared volume, `shipyardvol` is the alias -of this volume: -* `volume_driver` property specifies the Docker Volume Driver to use. -Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or -`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker -Volume Driver. 
For this volume (`shipyardvol`), as this is an Azure File -shared volume, the `volume_driver` should be set as `azurefile`. -* `storage_account_settings` is a link to the alias of the storage account -specified that holds this Azure File Share. -* `azure_file_share_name` is the name of the share name on Azure Files. Note -that the Azure File share must be created beforehand, the toolkit does not -create Azure File shares, it only mounts them to the compute nodes. -* `container_path` is the path in the container to mount. -* `mount_options` are the mount options to pass to the mount command. Supported -options are documented -[here](https://github.com/Azure/azurefile-dockervolumedriver). It is -recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and -`gid` cannot be reliably determined before the compute pool is allocated and -this volume will be mounted as the root user. - -Note that when using `azurefile` for a shared data volume, the storage account -that holds the file share must reside within the same Azure region as the -Azure Batch compute pool. Attempting to mount an Azure File share that is -cross-region will result in failure as current Linux Samba clients do not -support share level encryption at this time. - -The second shared volue, `glustervol`, is a -[GlusterFS](https://www.gluster.org/) network file system. Please note that -`glusterfs_on_compute` are GlusterFS volumes co-located on the VM's temporary -local disk space which is a shared resource. Sizes of the local temp disk for -each VM size can be found -[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/). -If specifying a `glusterfs_on_compute` volume, you must enable internode -communication in the pool configuration file. These volumes have the following -properties: -* (required) `volume_driver` property should be set as `glusterfs_on_compute`. -* (required) `container_path` is the path in the container to mount. -* (optional) `volume_type` property defines the GlusterFS volume type. -Currently, `replica` is the only supported type. -* (optional) `volume_options` property defines additional GlusterFS volume -options to set. - -`glusterfs_on_compute` volumes are mounted on the host at -`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically -replace container path references in direct and storage-based data -ingress/egress with their host path equivalents. - -Note that when resizing a pool with a `glusterfs_on_compute` shared file -systems that you must resize with the `pool resize` command in `shipyard.py` -and not with Azure Portal, Batch Explorer or any other tool. - -Finally, note that all `docker_volumes` can be omitted completely along with -one or all of `data_volumes` and `shared_data_volumes` if you do not require -this functionality. - -An example global config json template can be found -[here](../config\_templates/config.json). 
- -### Pool -The pool schema is as follows: - -```json -{ - "pool_specification": { - "id": "dockerpool", - "vm_size": "STANDARD_A9", - "vm_count": 10, - "max_tasks_per_node": 1, - "inter_node_communication_enabled": true, - "publisher": "OpenLogic", - "offer": "CentOS-HPC", - "sku": "7.1", - "reboot_on_start_task_failed": true, - "block_until_all_global_resources_loaded": true, - "transfer_files_on_pool_creation": false, - "input_data": { - "azure_batch": [ - { - "job_id": "jobonanotherpool", - "task_id": "mytask", - "include": ["wd/*.dat"], - "exclude": ["*.txt"], - "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool" - } - ], - "azure_storage": [ - { - "storage_account_settings": "mystorageaccount", - "container": "poolcontainer", - "include": ["pooldata*.bin"], - "destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata", - "blobxfer_extra_options": null - } - ] - }, - "ssh": { - "username": "docker", - "expiry_days": 7, - "ssh_public_key": null, - "generate_docker_tunnel_script": true, - "generated_file_export_path": null, - "hpn_server_swap": false - }, - "gpu": { - "nvidia_driver": { - "source": "https://some.url" - } - }, - "additional_node_prep_commands": [ - ] - } -} -``` - -The `pool_specification` property has the following members: -* (required) `id` is the compute pool ID. -* (required) `vm_size` is the -[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/). -Please note that not all regions have every VM size available. -* (required) `vm_count` is the number of compute nodes to allocate. -* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks -that can be running at any one time on a compute node. This defaults to a -value of 1 if not specified. -* (optional) `inter_node_communication_enabled` designates if this pool is set -up for inter-node communication. This must be set to `true` for any containers -that must communicate with each other such as MPI applications. This property -will be force enabled if peer-to-peer replication is enabled. -* (required) `publisher` is the publisher name of the Marketplace VM image. -* (required) `offer` is the offer name of the Marketplace VM image. -* (required) `sku` is the sku name of the Marketplace VM image. -* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the -compute node in case there is a transient failure in node preparation (e.g., -network timeout, resolution failure or download problem). This defaults to -`false`. -* (optional) `block_until_all_global_resources_loaded` will block the node -from entering ready state until all Docker images are loaded. This defaults -to `true`. -* (optional) `transfer_files_on_pool_creation` will ingress all `files` -specified in the `global_resources` section of the configuration json when -the pool is created. If files are to be ingressed to Azure Blob or File -Storage, then data movement operations are overlapped with the creation of the -pool. If files are to be ingressed to a shared file system on the compute -nodes, then the files are ingressed after the pool is created and the shared -file system is ready. Files can be ingressed to both Azure Blob Storage and a -shared file system during the same pool creation invocation. If this property -is set to `true` then `block_until_all_global_resources_loaded` will be force -disabled. If omitted, this property defaults to `false`. 
-* (optional) `input_data` is an object containing data that should be -ingressed to all compute nodes as part of node preparation. It is -important to note that if you are combining this action with `files` and -are ingressing data to Azure Blob or File storage as part of pool creation, -that the blob containers or file shares defined here will be downloaded as -soon as the compute node is ready to do so. This may result in the blob -container/blobs or file share/files not being ready in time for the -`input_data` transfer. It is up to you to ensure that these two operations do -not overlap. If there is a possibility of overlap, then you should ingress -data defined in `files` prior to pool creation and disable the option above -`transfer_files_on_pool_creation`. This object currently supports -`azure_batch` and `azure_storage` as members. - * `azure_batch` contains the following members: - * (required) `job_id` the job id of the task - * (required) `task_id` the id of the task to fetch files from - * (optional) `include` is an array of include filters - * (optional) `exclude` is an array of exclude filters - * (required) `destination` is the destination path to place the files - * `azure_storage` contains the following members: - * (required) `storage_account_settings` contains a storage account link - as defined in the credentials json. - * (required) `container` or `file_share` is required when downloading - from Azure Blob Storage or Azure File Storage, respectively. - `container` specifies which container to download from for Azure Blob - Storage while `file_share` specifies which file share to download from - for Azure File Storage. Only one of these properties can be specified - per `data_transfer` object. - * (optional) `include` property defines an optional include filter. - Although this property is an array, it is only allowed to have 1 - maximum filter. - * (required) `destination` property defines where to place the - downloaded files on the host file system. Please note that you should - not specify a destination that is on a shared file system. If you - require ingressing to a shared file system location like a GlusterFS - volume, then use the global configuration `files` property and the - `data ingress` command. - * (optional) `blobxfer_extra_options` are any extra options to pass to - `blobxfer`. -* (optional) `ssh` is the property for creating a user to accomodate SSH -sessions to compute nodes. If this property is absent, then an SSH user is not -created with pool creation. - * (required) `username` is the user to create on the compute nodes. - * (optional) `expiry_days` is the number of days from now for the account on - the compute nodes to expire. The default is 30 days from invocation time. - * (optional) `ssh_public_key` is the path to an existing SSH public key to - use. If not specified, an RSA public/private keypair will be automatically - generated only on Linux. If this is `null` or not specified on Windows, - the SSH user is not created. - * (optional) `generate_docker_tunnel_script` property directs script to - generate an SSH tunnel script that can be used to connect to the remote - Docker engine running on a compute node. - * (optional) `generated_file_export_path` is the path to export the - generated RSA keypair and docker tunnel script to. If omitted, the - current directory is used. 
- * (experimental) `hpn_server_swap` property enables an OpenSSH server with - [HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh) - to be swapped with the standard distribution OpenSSH server. This is not - supported on all Linux distributions and may be force disabled. -* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances) -`gpu` property defines additional information for NVIDIA GPU-enabled VMs: - * `nvidia_driver` property contains the following required members: - * `source` is the source url to download the driver. -* (optional) `additional_node_prep_commands` is an array of additional commands -to execute on the compute node host as part of node preparation. This can -be empty or omitted. - -An example pool json template can be found -[here](../config\_templates/pool.json). - -### Jobs -The jobs schema is as follows: - -```json -{ - "job_specifications": [ - { - "id": "dockerjob", - "multi_instance_auto_complete": true, - "environment_variables": { - "abc": "xyz" - }, - "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv", - "max_task_retries": 3, - "input_data": { - "azure_batch": [ - { - "job_id": "someotherjob", - "task_id": "task-a", - "include": ["wd/*.dat"], - "exclude": ["*.txt"], - "destination": null - } - ], - "azure_storage": [ - { - "storage_account_settings": "mystorageaccount", - "container": "jobcontainer", - "include": ["jobdata*.bin"], - "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata", - "blobxfer_extra_options": null - } - ] - }, - "tasks": [ - { - "id": null, - "depends_on": [ - "taskid-a", "taskid-b", "taskid-c" - ], - "depends_on_range": [ - 1, 10 - ], - "image": "busybox", - "name": null, - "labels": [], - "environment_variables": { - "def": "123" - }, - "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv", - "ports": [], - "data_volumes": [ - "contdatavol", - "hosttempvol" - ], - "shared_data_volumes": [ - "azurefilevol" - ], - "resource_files": [ - { - "file_path": "", - "blob_source": "", - "file_mode": "" - } - ], - "input_data": { - "azure_batch": [ - { - "job_id": "previousjob", - "task_id": "mytask1", - "include": ["wd/output/*.bin"], - "exclude": ["*.txt"], - "destination": null - } - ], - "azure_storage": [ - { - "storage_account_settings": "mystorageaccount", - "container": "taskcontainer", - "include": ["taskdata*.bin"], - "destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata", - "blobxfer_extra_options": null - } - ] - }, - "output_data": { - "azure_storage": [ - { - "storage_account_settings": "mystorageaccount", - "container": "output", - "source": null, - "include": ["**/out*.dat"], - "blobxfer_extra_options": null - } - ] - }, - "remove_container_after_exit": true, - "shm_size": "256m", - "additional_docker_run_options": [ - ], - "infiniband": false, - "gpu": false, - "max_task_retries": 3, - "retention_time": "1.12:00:00", - "multi_instance": { - "num_instances": "pool_current_dedicated", - "coordination_command": null, - "resource_files": [ - { - "file_path": "", - "blob_source": "", - "file_mode": "" - } - ] - }, - "entrypoint": null, - "command": "" - } - ] - } - ] -} -``` - -`job_specifications` array consists of jobs to create. -* (required) `id` is the job id to create. If the job already exists, the -specified `tasks` under the job will be added to the existing job. -* (optional) `multi_instance_auto_complete` enables auto-completion of the job -for which a multi-task instance is run. 
This allows automatic cleanup of the -Docker container in multi-instance tasks. This is defaulted to `true` when -multi-instance tasks are specified. -* (optional) `environment_variables` under the job are environment variables -which will be applied to all tasks operating under the job. Note that -environment variables are not expanded and are passed as-is. You will need -to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` -in a shell within the docker `command` or `entrypoint` if you want any -environment variables to be expanded. -* (optional) `environment_variables_keyvault_secret_id` under the job are -environment variables stored in KeyVault that should be applied to all tasks -operating under the job. The secret stored in KeyVault must be a valid json -string, e.g., `{ "env_var_name": "env_var_value" }`. -* (optional) `max_task_retries` sets the maximum number of times that -Azure Batch should retry all tasks in this job for. By default, Azure Batch -does not retry tasks that fail (i.e. `max_task_retries` is 0). -* (optional) `input_data` is an object containing data that should be -ingressed for the job. Any `input_data` defined at this level will be -downloaded for this job which can be run on any number of compute nodes -depending upon the number of constituent tasks and repeat invocations. However, -`input_data` is only downloaded once per job invocation on a compute node. -For example, if `job-1`:`task-1` is run on compute node A and then -`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed -to both compute node A and B. However, if `job-1`:`task-3` is then run on -compute node A after `job-1`:`task-1`, then the `input_data` is not -transferred again. This object currently supports `azure_batch` and -`azure_storage` as members. - * `azure_batch` contains the following members: - * (required) `job_id` the job id of the task - * (required) `task_id` the id of the task to fetch files from - * (optional) `include` is an array of include filters - * (optional) `exclude` is an array of exclude filters - * (required) `destination` is the destination path to place the files - * `azure_storage` contains the following members: - * (required) `storage_account_settings` contains a storage account link - as defined in the credentials json. - * (required) `container` or `file_share` is required when downloading - from Azure Blob Storage or Azure File Storage, respectively. - `container` specifies which container to download from for Azure Blob - Storage while `file_share` specifies which file share to download from - for Azure File Storage. Only one of these properties can be specified - per `data_transfer` object. - * (optional) `include` property defines an optional include filter. - Although this property is an array, it is only allowed to have 1 - maximum filter. - * (required) `destination` property defines where to place the - downloaded files on the host file system. Please note that you should - not specify a destination that is on a shared file system. If you - require ingressing to a shared file system location like a GlusterFS - volume, then use the global configuration `files` property and the - `data ingress` command. - * (optional) `blobxfer_extra_options` are any extra options to pass to - `blobxfer`. -* (required) `tasks` is an array of tasks to add to the job. - * (optional) `id` is the task id. Note that if the task `id` is null or - empty then a generic task id will be assigned. 
The generic task id is - formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is - increased by 1 for each task added to the same job. If there are more - than `99999` autonamed tasks in a job then the numbering is not - padded for tasks exceeding 5 digits. - * (optional) `depends_on` is an array of task ids for which this container - invocation (task) depends on and must run to successful completion prior - to this task executing. - * (optional) `depends_on_range` is an array with exactly two integral - elements containing a task `id` range for which this task is dependent - upon, i.e., the start `id` and the end `id` for which this task depends - on. Although task `id`s are always strings, the dependent task `id`s for - ranges must be expressed by their integral representation for this - property. This also implies that task `id`s for which this task depends - on must be integral in nature. For example, if `depends_on_range` is set - to `[1, 10]` (note the integral members), then there should be task - `id`s of `"1"`, `"2"`, ... `"10"` within the job. Once these dependent - tasks complete successfully, then this specified task will execute. - * (required) `image` is the Docker image to use for this task - * (optional) `name` is the name to assign to the container. If not - specified, the value of the `id` property will be used for `name`. - * (optional) `labels` is an array of labels to apply to the container. - * (optional) `environment_variables` are any additional task-specific - environment variables that should be applied to the container. Note that - environment variables are not expanded and are passed as-is. You will - need to source the environment file - `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the - docker `command` or `entrypoint` if you want any environment variables - to be expanded. - * (optional) `environment_variables_keyvault_secret_id` are any additional - task-specific environment variables that should be applied to the - container but are stored in KeyVault. The secret stored in KeyVault must - be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`. - * (optional) `ports` is an array of port specifications that should be - exposed to the host. - * (optional) `data_volumes` is an array of `data_volume` aliases as defined - in the global configuration file. These volumes will be mounted in the - container. - * (optional) `shared_data_volumes` is an array of `shared_data_volume` - aliases as defined in the global configuration file. These volumes will be - mounted in the container. - * (optional) `resource_files` is an array of resource files that should be - downloaded as part of the task. Each array entry contains the following - information: - * `file_path` is the path within the task working directory to place the - file on the compute node. - * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure - Blob Storage URL. - * `file_mode` if the file mode to set for the file on the compute node. - This is optional. - * (optional) `input_data` is an object containing data that should be - ingressed for this specific task. This object currently supports - `azure_batch` and `azure_storage` as members. Note for multi-instance - tasks, transfer of `input_data` is only applied to the task running the - application command. 
- * `azure_batch` contains the following members: - * (required) `job_id` the job id of the task - * (required) `task_id` the id of the task to fetch files from - * (optional) `include` is an array of include filters - * (optional) `exclude` is an array of exclude filters - * (optional) `destination` is the destination path to place the files. - If `destination` is not specified at this level, then files are - defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. - * `azure_storage` contains the following members: - * (required) `storage_account_settings` contains a storage account link - as defined in the credentials json. - * (required) `container` or `file_share` is required when downloading - from Azure Blob Storage or Azure File Storage, respectively. - `container` specifies which container to download from for Azure Blob - Storage while `file_share` specifies which file share to download from - for Azure File Storage. Only one of these properties can be specified - per `data_transfer` object. - * (optional) `include` property defines an optional include filter. - Although this property is an array, it is only allowed to have 1 - maximum filter. - * (optional) `destination` property defines where to place the - downloaded files on the host file system. Unlike the job-level - version of `input_data`, this `destination` property can be ommitted. - If `destination` is not specified at this level, then files are - defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note - that you should not specify a destination that is on a shared file - system. If you require ingressing to a shared file system location - like a GlusterFS volume, then use the global configuration `files` - property and the `data ingress` command. - * (optional) `blobxfer_extra_options` are any extra options to pass to - `blobxfer`. - * (optional) `output_data` is an object containing data that should be - egressed for this specific task if and only if the task completes - successfully. This object currently only supports `azure_storage` as a - member. Note for multi-instance tasks, transfer of `output_data` is only - applied to the task running the application command. - * `azure_storage` contains the following members: - * (required) `storage_account_settings` contains a storage account link - as defined in the credentials json. - * (required) `container` or `file_share` is required when uploading to - Azure Blob Storage or Azure File Storage, respectively. `container` - specifies which container to upload to for Azure Blob Storage while - `file_share` specifies which file share to upload to for Azure File - Storage. Only one of these properties can be specified per - `data_transfer` object. - * (optional) `source` property defines which directory to upload to - Azure storage. If `source` is not specified, then `source` is - defaulted to `$AZ_BATCH_TASK_DIR`. - * (optional) `include` property defines an optional include filter. - Although this property is an array, it is only allowed to have 1 - maximum filter. - * (optional) `blobxfer_extra_options` are any extra options to pass to - `blobxfer`. - * (optional) `remove_container_after_exit` property specifies if the - container should be automatically removed/cleaned up after it exits. This - defaults to `false`. - * (optional) `shm_size` property specifies the size of `/dev/shm` in - the container. The default is `64m`. The postfix unit can be designated - as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). 
This - value may need to be increased from the default of `64m` for certain - Docker applications, including multi-instance tasks using Intel MPI - (see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)). - * (optional) `additional_docker_run_options` is an array of addition Docker - run options that should be passed to the Docker daemon when starting this - container. - * (optional) `infiniband` designates if this container requires access to the - Infiniband/RDMA devices on the host. Note that this will automatically - force the container to use the host network stack. If this property is - set to `true`, ensure that the `pool_specification` property - `inter_node_communication_enabled` is set to `true`. - * (optional) `gpu` designates if this container requires access to the GPU - devices on the host. If this property is set to `true`, Docker containers - are instantiated via `nvidia-docker`. This requires N-series VM instances. - * (optional) `max_task_retries` sets the maximum number of times that - Azure Batch should retry this task for. This overrides the job-level task - retry count. By default, Azure Batch does not retry tasks that fail - (i.e. `max_task_retries` is 0). - * (optional) `retention_time` sets the timedelta to retain the task - directory on the compute node where it ran after the task completes. - The format for this property is a timedelta with a string representation - of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node - to clean up this task's directory 36 hours after the task completed. The - default, if unspecified, is effectively infinite - i.e., task data is - retained forever on the compute node that ran the task. - * (optional) `multi_instance` is a property indicating that this task is a - multi-instance task. This is required if the Docker image is an MPI - program. Additional information about multi-instance tasks and Batch - Shipyard can be found - [here](80-batch-shipyard-multi-instance-tasks.md). Do not define this - property for tasks that are not multi-instance. Additional members of this - property are: - * `num_instances` is a property setting the number of compute node - instances are required for this multi-instance task. This can be any one - of the following: - 1. An integral number - 2. `pool_current_dedicated` which is the instantaneous reading of the - target pool's current dedicated count during this function invocation. - 3. `pool_specification_vm_count` which is the `vm_count` specified in the - pool configuration. - * `coordination_command` is the coordination command this is run by each - instance (compute node) of this multi-instance task prior to the - application command. This command must not block and must exit - successfully for the multi-instance task to proceed. This is the command - passed to the container in `docker run` for multi-instance tasks. This - docker container instance will automatically be daemonized. This is - optional and may be null. - * `resource_files` is an array of resource files that should be downloaded - as part of the multi-instance task. Each array entry contains the - following information: - * `file_path` is the path within the task working directory to place - the file on the compute node. - * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an - Azure Blob Storage URL. - * `file_mode` if the file mode to set for the file on the compute node. - This is optional. 
- * (optional) `entrypoint` is the property that can override the Docker image - defined `ENTRYPOINT`. - * (optional) `command` is the command to execute in the Docker container - context. If this task is a regular non-multi-instance task, then this is - the command passed to the container context during `docker run`. If this - task is a multi-instance task, then this `command` is the application - command and is executed with `docker exec` in the running Docker container - context from the `coordination_command` in the `multi_instance` property. - This property may be null. - -An example jobs json template can be found -[here](../config\_templates/jobs.json). - ## Batch Shipyard Usage Continue on to [Batch Shipyard Usage](20-batch-shipyard-usage.md). diff --git a/docs/11-batch-shipyard-configuration-credentials.md b/docs/11-batch-shipyard-configuration-credentials.md index 9db8fae..44e365f 100644 --- a/docs/11-batch-shipyard-configuration-credentials.md +++ b/docs/11-batch-shipyard-configuration-credentials.md @@ -36,7 +36,7 @@ The credentials schema is as follows: "rsa_private_key_pem": "/path/to/privkey.pem", "x509_cert_sha1_thumbprint": "01AB02CD...", "user": "me@domain.com", - "password": "password" + "password": "password", "token_cache": { "enabled": true, "filename": "" @@ -59,6 +59,7 @@ The credentials schema is as follows: "filename": "" } }, + "resource_group": "", "account_key": "batchaccountkey", "account_key_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/batchkey" }, @@ -149,9 +150,12 @@ under the `batch` property can be found in the * (required) `account_service_url` is the Batch account service URL. * (required for UserSubscription accounts, optional otherwise) `aad` AAD authentication parameters for Azure Batch. + * (optional) `resource_group` is the resource group containing the Batch + account. This is only required if using a UserSubscription Batch account + with `aad` authentication. * (required unless `aad` is specified) `account_key` is the shared - key. This is required for non-AAD logins. This is ignored if the `aad` - property is specified. + key. This is required for non-AAD logins. This option takes precendence + over the `aad` property if specified. * (optional) `account_key_keyvault_secret_id` property can be used to reference an Azure KeyVault secret id. Batch Shipyard will contact the specified KeyVault and replace the `account_key` value as returned by diff --git a/docs/12-batch-shipyard-configuration-global.md b/docs/12-batch-shipyard-configuration-global.md new file mode 100644 index 0000000..7bc583c --- /dev/null +++ b/docs/12-batch-shipyard-configuration-global.md @@ -0,0 +1,418 @@ +# Batch Shipyard Global Configuration +This page contains in-depth details on how to configure the global +json file for Batch Shipyard. + +## Schema +The global config schema is as follows: + +```json +{ + "batch_shipyard": { + "storage_account_settings": "mystorageaccount", + "storage_entity_prefix": "shipyard", + "generated_sas_expiry_days": 90, + "encryption" : { + "enabled": true, + "pfx": { + "filename": "encrypt.pfx", + "passphrase": "mysupersecretpassword", + "sha1_thumbprint": "123456789..." 
+ }, + "public_key_pem": "encrypt.pem" + } + }, + "docker_registry": { + "private": { + "allow_public_docker_hub_pull_on_missing": true, + "server": "myserver-myorg.azurecr.io", + "azure_storage": { + "storage_account_settings": "mystorageaccount", + "container": "mydockerregistry" + } + } + }, + "data_replication": { + "peer_to_peer": { + "enabled": true, + "compression": true, + "concurrent_source_downloads": 10, + "direct_download_seed_bias": null + }, + "non_peer_to_peer_concurrent_downloading": true + }, + "global_resources": { + "docker_images": [ + "busybox", + "redis:3.2.3-alpine", + ], + "files": [ + { + "source": { + "path": "/some/local/path/dir", + "include": ["*.dat"], + "exclude": ["*.bak"] + }, + "destination": { + "shared_data_volume": "glustervol", + "relative_destination_path": "myfiles", + "data_transfer": { + "method": "multinode_scp", + "ssh_private_key": "id_rsa_shipyard", + "scp_ssh_extra_options": "-C -c aes256-gcm@openssh.com", + "rsync_extra_options": "", + "split_files_megabytes": 500, + "max_parallel_transfers_per_node": 2 + } + } + }, + { + "source": { + "path": "/some/local/path/bound/for/blob", + "include": ["*.bin"] + }, + "destination": { + "storage_account_settings": "mystorageaccount", + "data_transfer": { + "container": "mycontainer", + "blobxfer_extra_options": "--no-computefilemd5" + } + } + }, + { + "source": { + "path": "/another/local/path/dir", + "include": [], + "exclude": [] + }, + "destination": { + "relative_destination_path": "relpath/on/host", + "data_transfer": { + "method": "rsync+ssh", + "ssh_private_key": "id_rsa_shipyard", + "scp_ssh_extra_options": "-c aes256-gcm@openssh.com", + "rsync_extra_options": "-v" + } + } + } + ], + "docker_volumes": { + "data_volumes": { + "abcvol": { + "host_path": null, + "container_path": "/abc" + }, + "hosttempvol": { + "host_path": "/tmp", + "container_path": "/hosttmp" + } + }, + "shared_data_volumes": { + "shipyardvol": { + "volume_driver": "azurefile", + "storage_account_settings": "mystorageaccount", + "azure_file_share_name": "shipyardshared", + "container_path": "$AZ_BATCH_NODE_SHARED_DIR/azfile", + "mount_options": [ + "filemode=0777", + "dirmode=0777", + "nolock=true" + ] + }, + "glustervol": { + "volume_driver": "glusterfs_on_compute", + "container_path": "$AZ_BATCH_NODE_SHARED_DIR/gfs", + "volume_type": "replica", + "volume_options": [ + "performance.cache-size 1 GB", + "performance.cache-max-file-size 10 MB", + "performance.cache-refresh-timeout 61", + ] + } + } + } + } +} +``` + +The `batch_shipyard` property is used to set settings for the tool. +* (required) `storage_account_settings` is a link to the alias of the storage +account specified, in this case, it is `mystorageaccount`. Batch shipyard +requires a storage account for storing metadata in order to execute across a +distributed environment. +* (optional) `storage_entity_prefix` property is used as a generic qualifier +to prefix storage containers (blob containers, tables, queues) with. If not +specified, defaults to `shipyard`. +* (optional) `generated_sas_expiry_days` property is used to set the number of +days any generated SAS key by Batch Shipyard is valid for. The default is 30 +days. This is useful if you have long-lived pools and want to ensure that +SAS keys are valid for longer periods of time. +* (optional) `encryption` object is used to define credential encryption which +contains the following members: + * (required) `enabled` property enables or disables this feature. 
+  * (required) `pfx` object defines the PFX certificate
+    * (required) `filename` property is the full path and name to the PFX
+      certificate
+    * (required) `passphrase` property is the passphrase for the PFX
+      certificate. This cannot be empty.
+    * (optional) `sha1_thumbprint` is the SHA1 thumbprint of the
+      certificate. If the PFX file is created using the `cert create` command,
+      then the SHA1 thumbprint is output. It is recommended to populate this
+      property such that it does not have to be generated when needed for
+      encryption.
+  * (optional) `public_key_pem` property is the full path and name to the
+    RSA public key in PEM format. If the PFX file is created using the
+    `cert create` command, then this file is generated along with the PFX
+    file. It is recommended to populate this property with the PEM file path
+    such that it does not have to be generated when needed for encryption.
+
+The `docker_registry` property is used to configure Docker image distribution
+options from public/private Docker Hub and private registries.
+* (optional) `private` property controls settings for interacting with private
+registries. There are three kinds of private registries that are supported:
+(1) private registries hosted on Docker Hub, (2) Internet accessible
+registries such as those hosted by the
+[Azure Container Registry](https://azure.microsoft.com/en-us/services/container-registry/)
+service and (3) [private registry instances backed to
+Azure Blob Storage](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-linux-docker-registry-in-blob-storage/)
+and are run on compute nodes. To use private registries hosted on Docker Hub,
+no additional properties need to be specified here; instead, specify your
+Docker Hub login information in the credentials json. To specify a private
+registry other than on Docker Hub, a json property named `server` should be
+defined. To use a private registry backed by Azure Blob Storage, define a
+json object named `azure_storage`. Note that only one of these three types
+of private registries may be specified at once. The following describes
+members of the non-Docker Hub private registries supported:
+  * (optional) `server` property is the fully-qualified host name of a
+    private registry server. A specific port other than 80 can be
+    specified using a `:` separator, e.g.,
+    `mydockerregistry.com:8080`. Port 80 is the default if no port is
+    specified. The value of this property should have an associated login
+    in the credentials json file.
+  * (optional) `azure_storage` object defines settings for connecting
+    to a private registry backed by Azure Storage blobs and where the
+    private registry instances are hosted on the compute nodes themselves.
+    * (required) `storage_account_settings` is a link to the alias of the
+      storage account specified that stores the private registry blobs.
+    * (required) `container` property is the name of the Azure Blob
+      container holding the private registry blobs.
+  * (optional) `allow_public_docker_hub_pull_on_missing` property allows
+    pass-through of Docker image retrieval to public Docker Hub if it is
+    missing in the private registry. This defaults to `false` if not
+    specified. Note that this setting does not apply to a missing Docker
+    image that is allowed to run via the job property
+    `allow_run_on_missing_image`.
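+
+For example, a `docker_registry` configuration that uses only an Azure
+Container Registry instance might look like the following sketch (the
+registry name is illustrative; the corresponding registry login would be
+supplied in the credentials json):
+
+```json
+{
+    "docker_registry": {
+        "private": {
+            "server": "myregistry-myorg.azurecr.io",
+            "allow_public_docker_hub_pull_on_missing": false
+        }
+    }
+}
+```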
+
+The `data_replication` property is used to configure the internal image
+replication mechanism between compute nodes within a compute pool. The
+`non_peer_to_peer_concurrent_downloading` property specifies whether to allow
+unfettered concurrent downloading from the source registry among all compute
+nodes. The following options apply to `peer_to_peer` data replication:
+* (optional) `enabled` property enables or disables private peer-to-peer
+transfer. Note that for compute pools with a relatively small number of VMs,
+peer-to-peer transfer may not provide any benefit and is recommended to be
+disabled in these cases. Compute pools with a large number of VMs, especially
+in the case of an Azure Storage-backed private registry, can benefit from
+peer-to-peer image replication.
+* `compression` property enables or disables compression of image files. It
+is strongly recommended to keep this enabled.
+* `concurrent_source_downloads` property specifies the number of
+simultaneous downloads allowed for each image.
+* `direct_download_seed_bias` property sets the number of direct download
+seeds to prefer per image before switching to peer-to-peer transfer.
+
+The `global_resources` property contains information regarding required
+Docker images, volume configuration and data ingress. This property is
+required.
+
+`docker_images` is an array of Docker images that should be installed on
+every compute node when this configuration file is supplied while creating
+a compute pool. Image tags are supported. Image names should not include
+private registry server names, as these will be automatically prepended. For
+instance, if you have an image `abc/mytag` on your private registry
+`myregistry-myorg.azurecr.io`, your image should be named in the
+`docker_images` array as `abc/mytag` and not
+`myregistry-myorg.azurecr.io/abc/mytag`.
+
+`files` is an optional property that specifies data that should be ingressed
+from a location accessible by the local machine (i.e., the machine invoking
+`shipyard.py`) to a shared file system location accessible by compute nodes
+in the pool or to Azure Blob or File Storage. `files` is a json list of
+objects, which allows multiple sources and destinations to be ingressed
+during the same invocation. Note that no Azure Batch environment variables
+(i.e., `$AZ_BATCH_`-style environment variables) are available as path
+arguments since ingress actions performed within `files` are done locally
+on the machine invoking `shipyard.py`. Each object within the `files` list
+contains the following members:
+* (required) `source` property contains the following members:
+  * (required) `path` is a local path. A single file or a directory
+    can be specified. Filters below will be ignored if `path` is a file and
+    not a directory.
+  * (optional) `include` is an array of
+    [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
+    where only files matching a filter are included in the data transfer.
+    Filters specified in `include` have precedence over `exclude` described
+    next. `include` can only have a maximum of 1 filter for ingress to Azure
+    Blob Storage. In this example, all files ending in `.dat` are ingressed.
+  * (optional) `exclude` is an array of
+    [Unix shell-style wildcard filters](https://docs.python.org/3.5/library/fnmatch.html)
+    where files matching a filter are excluded from the data transfer. Filters
+    specified in `include` have precedence over filters specified in
+    `exclude`. `exclude` cannot be specified for ingress into Azure Blob
+    Storage. In this example, all files ending in `.bak` are skipped for
+    ingress.
+* (required) `destination` property contains the following members:
+  * (required or optional) `shared_data_volume` or `storage_account_settings`
+    for data ingress to a GlusterFS volume or Azure Blob or File Storage. If
+    you are ingressing to a pool with only one compute node, you may omit
+    `shared_data_volume`. Otherwise, you may specify one or the other, but
+    not both in the same object. Please see the `shared_data_volumes` section
+    below for information on how to set up a GlusterFS share.
+  * (required or optional) `relative_destination_path` specifies a relative
+    destination path to place the files, with respect to the target root.
+    If transferring to a `shared_data_volume` then this is relative to the
+    GlusterFS volume root. If transferring to a pool with a single node in
+    it, and thus no `shared_data_volume` is specified in the prior property,
+    then this is relative to
+    [$AZ_BATCH_NODE_ROOT_DIR](https://azure.microsoft.com/en-us/documentation/articles/batch-api-basics/#files-and-directories).
+    To place files directly in `$AZ_BATCH_NODE_ROOT_DIR` (not recommended),
+    you can specify this property as an empty string when not ingressing to
+    a `shared_data_volume`. Note that if `scp` is selected while attempting
+    to transfer directly to this aforementioned path, then `scp` will fail
+    with an exit code of 1 but the transfer will have succeeded (this is due
+    to some of the permission options). If this property is not specified for
+    a `shared_data_volume`, then files will be placed directly in the
+    GlusterFS volume root. This property cannot be specified for an Azure
+    Storage destination (i.e., `storage_account_settings`).
+  * (required) `data_transfer` specifies how the transfer should take place.
+    The following list contains members for GlusterFS ingress when a GlusterFS
+    volume is provided for `shared_data_volume` (see below for ingressing to
+    Azure Blob or File Storage):
+    * (required) `method` specifies which method should be used to ingress
+      data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
+      `multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
+      directory (recursively) to the remote share path. `multinode_scp` will
+      attempt to simultaneously transfer files to many compute nodes using
+      `scp` at the same time to speed up data transfer. `rsync+ssh` will
+      perform an rsync of files through SSH. `multinode_rsync+ssh` will
+      attempt to simultaneously transfer files using `rsync` to many compute
+      nodes at the same time to speed up data transfer. Note that you may
+      specify the `multinode_*` methods even with only 1 compute node in a
+      pool, which will allow you to take advantage of
+      `max_parallel_transfers_per_node` below.
+    * (optional) `ssh_private_key` is the location of the SSH private key for
+      the username specified in the `pool_specification`:`ssh` section when
+      connecting to compute nodes. The default, if omitted, is
+      `id_rsa_shipyard`, which is automatically generated if no SSH key is
+      specified when an SSH user is added to a pool.
+    * (optional) `scp_ssh_extra_options` are any extra options to pass to
+      `scp` or `ssh` for `scp`/`multinode_scp` or
+      `rsync+ssh`/`multinode_rsync+ssh` methods, respectively. In the example
+      above, `-C` enables compression and `-c aes256-gcm@openssh.com`
+      is passed to `scp`, which can potentially increase the transfer speed by
+      selecting the `aes256-gcm@openssh.com` cipher which can exploit Intel
+      AES-NI.
+ * (optional) `rsync_extra_options` are any extra options to pass to + `rsync` for the `rsync+ssh`/`multinode_rsync+ssh` transfer methods. This + property is ignored for non-rsync transfer methods. + * (optional) `split_files_megabytes` splits files into chunks with the + specified size in MiB. This can potentially help with very large files. + This option forces the transfer `method` to `multinode_scp`. + Note that the destination file system must be able to accommodate + up to 2x the size of files which are split. Additionally, transfers + involving files which are split will incur reconstruction costs after + the transfer is complete, which will increase the total end-to-end + ingress time. However, in certain scenarios, by splitting files and + transferring chunks in parallel along with reconstruction may end up + being faster than transferring a large file without chunking. + * (optional) `max_parallel_transfers_per_node` is the maximum number of + parallel transfer to invoke per node with the + `multinode_scp`/`multinode_rsync+ssh` methods. For example, if there + are 3 compute nodes in the pool, and `2` is given for this option, then + there will be up to 2 scp sessions in parallel per compute node for a + maximum of 6 concurrent scp sessions to the pool. The default is 1 if + not specified or omitted. + * (required) `data_transfer` specifies how the transfer should take place. + When Azure Blob or File Storage is selected as the destination for data + ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The + following list contains members for Azure Blob or File Storage ingress + when a storage account link is provided for `storage_account_settings`: + * (required) `container` or `file_share` is required when uploading to + Azure Blob Storage or Azure File Storage, respectively. `container` + specifies which container to upload to for Azure Blob Storage while + `file_share` specifies which file share to upload to for Azure File + Storage. Only one of these properties can be specified per + `data_transfer` object. The container or file share need not be created + beforehand. + * (optional) `blobxfer_extra_options` are any extra options to pass to + `blobxfer`. In the example above, `--no-computefilemd5` will force + `blobxfer` to skip MD5 calculation on files ingressed. + +`docker_volumes` is an optional property that can consist of two +different types of volumes: `data_volumes` and `shared_data_volumes`. +`data_volumes` can be of two flavors depending upon if `host_path` is set to +null or not. In the former, this is typically used with the `VOLUME` keyword +in Dockerfiles to initialize a data volume with existing data inside the +image. If `host_path` is set, then the path on the host is mounted in the +container at the path specified with `container_path`. + +`shared_data_volumes` is an optional property for initializing persistent +shared storage volumes. In the first shared volume, `shipyardvol` is the alias +of this volume: +* `volume_driver` property specifies the Docker Volume Driver to use. +Currently Batch Shipyard only supports the `volume_driver` as `azurefile` or +`glusterfs_on_compute`. Note that `glusterfs_on_compute` is not a true Docker +Volume Driver. For this volume (`shipyardvol`), as this is an Azure File +shared volume, the `volume_driver` should be set as `azurefile`. +* `storage_account_settings` is a link to the alias of the storage account +specified that holds this Azure File Share. 
+* `azure_file_share_name` is the name of the share on Azure Files. Note
+that the Azure File share must be created beforehand; the toolkit does not
+create Azure File shares, it only mounts them to the compute nodes.
+* `container_path` is the path in the container to mount.
+* `mount_options` are the mount options to pass to the mount command. Supported
+options are documented
+[here](https://github.com/Azure/azurefile-dockervolumedriver). It is
+recommended to use `0777` for both `filemode` and `dirmode` as the `uid` and
+`gid` cannot be reliably determined before the compute pool is allocated and
+this volume will be mounted as the root user.
+
+Note that when using `azurefile` for a shared data volume, the storage account
+that holds the file share must reside within the same Azure region as the
+Azure Batch compute pool. Attempting to mount an Azure File share that is
+cross-region will result in failure as current Linux Samba clients do not
+support share level encryption at this time.
+
+The second shared volume, `glustervol`, is a
+[GlusterFS](https://www.gluster.org/) network file system. Please note that
+`glusterfs_on_compute` volumes are GlusterFS volumes co-located on the VM's
+temporary local disk space, which is a shared resource. Sizes of the local
+temp disk for each VM size can be found
+[here](https://azure.microsoft.com/en-us/documentation/articles/virtual-machines-windows-sizes/).
+If specifying a `glusterfs_on_compute` volume, you must enable internode
+communication in the pool configuration file. These volumes have the following
+properties:
+* (required) `volume_driver` property should be set as `glusterfs_on_compute`.
+* (required) `container_path` is the path in the container to mount.
+* (optional) `volume_type` property defines the GlusterFS volume type.
+Currently, `replica` is the only supported type.
+* (optional) `volume_options` property defines additional GlusterFS volume
+options to set.
+
+`glusterfs_on_compute` volumes are mounted on the host at
+`$AZ_BATCH_NODE_SHARED_DIR/.gluster/gv0`. Batch Shipyard will automatically
+replace container path references in direct and storage-based data
+ingress/egress with their host path equivalents.
+
+Note that when resizing a pool with a `glusterfs_on_compute` shared file
+system, you must resize with the `pool resize` command in `shipyard.py`
+and not with Azure Portal, Batch Explorer or any other tool.
+
+Finally, note that all `docker_volumes` can be omitted completely along with
+one or all of `data_volumes` and `shared_data_volumes` if you do not require
+this functionality.
+
+## Full template
+A full template of the global configuration file can be found
+[here](../config\_templates/config.json). Note that this template cannot
+be used as-is and must be modified to fit your scenario.
diff --git a/docs/13-batch-shipyard-configuration-pool.md b/docs/13-batch-shipyard-configuration-pool.md
new file mode 100644
index 0000000..f902214
--- /dev/null
+++ b/docs/13-batch-shipyard-configuration-pool.md
@@ -0,0 +1,163 @@
+# Batch Shipyard Pool Configuration
+This page contains in-depth details on how to configure the pool
+json file for Batch Shipyard.
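+
+Only a handful of the properties are required. As a point of reference, a
+minimal pool specification might look like the following sketch (the pool id,
+VM size, node count and Marketplace image reference are illustrative); the
+full schema with all supported properties is shown below:
+
+```json
+{
+    "pool_specification": {
+        "id": "minimalpool",
+        "vm_size": "STANDARD_D2_V2",
+        "vm_count": 2,
+        "publisher": "Canonical",
+        "offer": "UbuntuServer",
+        "sku": "16.04-LTS"
+    }
+}
+```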
+ +## Schema +The pool schema is as follows: + +```json +{ + "pool_specification": { + "id": "dockerpool", + "vm_size": "STANDARD_A9", + "vm_count": 10, + "max_tasks_per_node": 1, + "inter_node_communication_enabled": true, + "publisher": "OpenLogic", + "offer": "CentOS-HPC", + "sku": "7.1", + "reboot_on_start_task_failed": true, + "block_until_all_global_resources_loaded": true, + "transfer_files_on_pool_creation": false, + "input_data": { + "azure_batch": [ + { + "job_id": "jobonanotherpool", + "task_id": "mytask", + "include": ["wd/*.dat"], + "exclude": ["*.txt"], + "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobonanotherpool" + } + ], + "azure_storage": [ + { + "storage_account_settings": "mystorageaccount", + "container": "poolcontainer", + "include": ["pooldata*.bin"], + "destination": "$AZ_BATCH_NODE_SHARED_DIR/pooldata", + "blobxfer_extra_options": null + } + ] + }, + "ssh": { + "username": "docker", + "expiry_days": 7, + "ssh_public_key": null, + "generate_docker_tunnel_script": true, + "generated_file_export_path": null, + "hpn_server_swap": false + }, + "gpu": { + "nvidia_driver": { + "source": "https://some.url" + } + }, + "additional_node_prep_commands": [ + ] + } +} +``` + +The `pool_specification` property has the following members: +* (required) `id` is the compute pool ID. +* (required) `vm_size` is the +[Azure Virtual Machine Instance Size](https://azure.microsoft.com/en-us/pricing/details/virtual-machines/). +Please note that not all regions have every VM size available. +* (required) `vm_count` is the number of compute nodes to allocate. +* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks +that can be running at any one time on a compute node. This defaults to a +value of 1 if not specified. +* (optional) `inter_node_communication_enabled` designates if this pool is set +up for inter-node communication. This must be set to `true` for any containers +that must communicate with each other such as MPI applications. This property +will be force enabled if peer-to-peer replication is enabled. +* (required) `publisher` is the publisher name of the Marketplace VM image. +* (required) `offer` is the offer name of the Marketplace VM image. +* (required) `sku` is the sku name of the Marketplace VM image. +* (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the +compute node in case there is a transient failure in node preparation (e.g., +network timeout, resolution failure or download problem). This defaults to +`false`. +* (optional) `block_until_all_global_resources_loaded` will block the node +from entering ready state until all Docker images are loaded. This defaults +to `true`. +* (optional) `transfer_files_on_pool_creation` will ingress all `files` +specified in the `global_resources` section of the configuration json when +the pool is created. If files are to be ingressed to Azure Blob or File +Storage, then data movement operations are overlapped with the creation of the +pool. If files are to be ingressed to a shared file system on the compute +nodes, then the files are ingressed after the pool is created and the shared +file system is ready. Files can be ingressed to both Azure Blob Storage and a +shared file system during the same pool creation invocation. If this property +is set to `true` then `block_until_all_global_resources_loaded` will be force +disabled. If omitted, this property defaults to `false`. 
+* (optional) `input_data` is an object containing data that should be +ingressed to all compute nodes as part of node preparation. It is +important to note that if you are combining this action with `files` and +are ingressing data to Azure Blob or File storage as part of pool creation, +that the blob containers or file shares defined here will be downloaded as +soon as the compute node is ready to do so. This may result in the blob +container/blobs or file share/files not being ready in time for the +`input_data` transfer. It is up to you to ensure that these two operations do +not overlap. If there is a possibility of overlap, then you should ingress +data defined in `files` prior to pool creation and disable the option above +`transfer_files_on_pool_creation`. This object currently supports +`azure_batch` and `azure_storage` as members. + * `azure_batch` contains the following members: + * (required) `job_id` the job id of the task + * (required) `task_id` the id of the task to fetch files from + * (optional) `include` is an array of include filters + * (optional) `exclude` is an array of exclude filters + * (required) `destination` is the destination path to place the files + * `azure_storage` contains the following members: + * (required) `storage_account_settings` contains a storage account link + as defined in the credentials json. + * (required) `container` or `file_share` is required when downloading + from Azure Blob Storage or Azure File Storage, respectively. + `container` specifies which container to download from for Azure Blob + Storage while `file_share` specifies which file share to download from + for Azure File Storage. Only one of these properties can be specified + per `data_transfer` object. + * (optional) `include` property defines an optional include filter. + Although this property is an array, it is only allowed to have 1 + maximum filter. + * (required) `destination` property defines where to place the + downloaded files on the host file system. Please note that you should + not specify a destination that is on a shared file system. If you + require ingressing to a shared file system location like a GlusterFS + volume, then use the global configuration `files` property and the + `data ingress` command. + * (optional) `blobxfer_extra_options` are any extra options to pass to + `blobxfer`. +* (optional) `ssh` is the property for creating a user to accomodate SSH +sessions to compute nodes. If this property is absent, then an SSH user is not +created with pool creation. + * (required) `username` is the user to create on the compute nodes. + * (optional) `expiry_days` is the number of days from now for the account on + the compute nodes to expire. The default is 30 days from invocation time. + * (optional) `ssh_public_key` is the path to an existing SSH public key to + use. If not specified, an RSA public/private keypair will be automatically + generated only on Linux. If this is `null` or not specified on Windows, + the SSH user is not created. + * (optional) `generate_docker_tunnel_script` property directs script to + generate an SSH tunnel script that can be used to connect to the remote + Docker engine running on a compute node. + * (optional) `generated_file_export_path` is the path to export the + generated RSA keypair and docker tunnel script to. If omitted, the + current directory is used. 
+ * (experimental) `hpn_server_swap` property enables an OpenSSH server with + [HPN patches](https://www.psc.edu/index.php/using-joomla/extensions/templates/atomic/636-hpn-ssh) + to be swapped with the standard distribution OpenSSH server. This is not + supported on all Linux distributions and may be force disabled. +* (required for `STANDARD_NV` instances, optional for `STANDARD_NC` instances) +`gpu` property defines additional information for NVIDIA GPU-enabled VMs: + * `nvidia_driver` property contains the following required members: + * `source` is the source url to download the driver. +* (optional) `additional_node_prep_commands` is an array of additional commands +to execute on the compute node host as part of node preparation. This can +be empty or omitted. + +## Full template +An full template of a credentials file can be found +[here](../config\_templates/pool.json). Note that this template cannot +be used as-is and must be modified to fit your scenario. diff --git a/docs/14-batch-shipyard-configuration-jobs.md b/docs/14-batch-shipyard-configuration-jobs.md new file mode 100644 index 0000000..68bb8dd --- /dev/null +++ b/docs/14-batch-shipyard-configuration-jobs.md @@ -0,0 +1,378 @@ +# Batch Shipyard Jobs Configuration +This page contains in-depth details on how to configure the jobs +json file for Batch Shipyard. + +## Schema +The jobs schema is as follows: + +```json +{ + "job_specifications": [ + { + "id": "dockerjob", + "multi_instance_auto_complete": true, + "environment_variables": { + "abc": "xyz" + }, + "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/myjobenv", + "max_task_retries": 3, + "allow_run_on_missing_image": false, + "input_data": { + "azure_batch": [ + { + "job_id": "someotherjob", + "task_id": "task-a", + "include": ["wd/*.dat"], + "exclude": ["*.txt"], + "destination": null + } + ], + "azure_storage": [ + { + "storage_account_settings": "mystorageaccount", + "container": "jobcontainer", + "include": ["jobdata*.bin"], + "destination": "$AZ_BATCH_NODE_SHARED_DIR/jobdata", + "blobxfer_extra_options": null + } + ] + }, + "tasks": [ + { + "id": null, + "depends_on": [ + "taskid-a", "taskid-b", "taskid-c" + ], + "depends_on_range": [ + 1, 10 + ], + "image": "busybox", + "name": null, + "labels": [], + "environment_variables": { + "def": "123" + }, + "environment_variables_keyvault_secret_id": "https://myvault.vault.azure.net/secrets/mytaskenv", + "ports": [], + "data_volumes": [ + "contdatavol", + "hosttempvol" + ], + "shared_data_volumes": [ + "azurefilevol" + ], + "resource_files": [ + { + "file_path": "", + "blob_source": "", + "file_mode": "" + } + ], + "input_data": { + "azure_batch": [ + { + "job_id": "previousjob", + "task_id": "mytask1", + "include": ["wd/output/*.bin"], + "exclude": ["*.txt"], + "destination": null + } + ], + "azure_storage": [ + { + "storage_account_settings": "mystorageaccount", + "container": "taskcontainer", + "include": ["taskdata*.bin"], + "destination": "$AZ_BATCH_NODE_SHARED_DIR/taskdata", + "blobxfer_extra_options": null + } + ] + }, + "output_data": { + "azure_storage": [ + { + "storage_account_settings": "mystorageaccount", + "container": "output", + "source": null, + "include": ["**/out*.dat"], + "blobxfer_extra_options": null + } + ] + }, + "remove_container_after_exit": true, + "shm_size": "256m", + "additional_docker_run_options": [ + ], + "infiniband": false, + "gpu": false, + "max_task_retries": 3, + "retention_time": "1.12:00:00", + "multi_instance": { + "num_instances": 
"pool_current_dedicated", + "coordination_command": null, + "resource_files": [ + { + "file_path": "", + "blob_source": "", + "file_mode": "" + } + ] + }, + "entrypoint": null, + "command": "" + } + ] + } + ] +} +``` + +`job_specifications` array consists of jobs to create. +* (required) `id` is the job id to create. If the job already exists, the +specified `tasks` under the job will be added to the existing job. +* (optional) `multi_instance_auto_complete` enables auto-completion of the job +for which a multi-task instance is run. This allows automatic cleanup of the +Docker container in multi-instance tasks. This is defaulted to `true` when +multi-instance tasks are specified. +* (optional) `environment_variables` under the job are environment variables +which will be applied to all tasks operating under the job. Note that +environment variables are not expanded and are passed as-is. You will need +to source the environment file `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` +in a shell within the docker `command` or `entrypoint` if you want any +environment variables to be expanded. +* (optional) `environment_variables_keyvault_secret_id` under the job are +environment variables stored in KeyVault that should be applied to all tasks +operating under the job. The secret stored in KeyVault must be a valid json +string, e.g., `{ "env_var_name": "env_var_value" }`. +* (optional) `max_task_retries` sets the maximum number of times that +Azure Batch should retry all tasks in this job for. By default, Azure Batch +does not retry tasks that fail (i.e. `max_task_retries` is 0). +* (optional) `allow_run_on_missing` allows tasks with a Docker image reference +that was not pre-loaded on to the compute node via +`global_resources`:`docker_images` in the global configuration to be able to +run. Note that you should attempt to specify all Docker images that you intend +to run in the `global_resources`:`docker_images` property in the global +configuration to minimize scheduling to task execution latency. +* (optional) `input_data` is an object containing data that should be +ingressed for the job. Any `input_data` defined at this level will be +downloaded for this job which can be run on any number of compute nodes +depending upon the number of constituent tasks and repeat invocations. However, +`input_data` is only downloaded once per job invocation on a compute node. +For example, if `job-1`:`task-1` is run on compute node A and then +`job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed +to both compute node A and B. However, if `job-1`:`task-3` is then run on +compute node A after `job-1`:`task-1`, then the `input_data` is not +transferred again. This object currently supports `azure_batch` and +`azure_storage` as members. + * `azure_batch` contains the following members: + * (required) `job_id` the job id of the task + * (required) `task_id` the id of the task to fetch files from + * (optional) `include` is an array of include filters + * (optional) `exclude` is an array of exclude filters + * (required) `destination` is the destination path to place the files + * `azure_storage` contains the following members: + * (required) `storage_account_settings` contains a storage account link + as defined in the credentials json. + * (required) `container` or `file_share` is required when downloading + from Azure Blob Storage or Azure File Storage, respectively. 
+ `container` specifies which container to download from for Azure Blob + Storage while `file_share` specifies which file share to download from + for Azure File Storage. Only one of these properties can be specified + per `data_transfer` object. + * (optional) `include` property defines an optional include filter. + Although this property is an array, it is only allowed to have 1 + maximum filter. + * (required) `destination` property defines where to place the + downloaded files on the host file system. Please note that you should + not specify a destination that is on a shared file system. If you + require ingressing to a shared file system location like a GlusterFS + volume, then use the global configuration `files` property and the + `data ingress` command. + * (optional) `blobxfer_extra_options` are any extra options to pass to + `blobxfer`. +* (required) `tasks` is an array of tasks to add to the job. + * (optional) `id` is the task id. Note that if the task `id` is null or + empty then a generic task id will be assigned. The generic task id is + formatted as `dockertask-NNNNN` where `NNNNN` starts from `00000` and is + increased by 1 for each task added to the same job. If there are more + than `99999` autonamed tasks in a job then the numbering is not + padded for tasks exceeding 5 digits. + * (optional) `depends_on` is an array of task ids for which this container + invocation (task) depends on and must run to successful completion prior + to this task executing. + * (optional) `depends_on_range` is an array with exactly two integral + elements containing a task `id` range for which this task is dependent + upon, i.e., the start `id` and the end `id` for which this task depends + on. Although task `id`s are always strings, the dependent task `id`s for + ranges must be expressed by their integral representation for this + property. This also implies that task `id`s for which this task depends + on must be integral in nature. For example, if `depends_on_range` is set + to `[1, 10]` (note the integral members), then there should be task + `id`s of `"1"`, `"2"`, ... `"10"` within the job. Once these dependent + tasks complete successfully, then this specified task will execute. + * (required) `image` is the Docker image to use for this task + * (optional) `name` is the name to assign to the container. If not + specified, the value of the `id` property will be used for `name`. + * (optional) `labels` is an array of labels to apply to the container. + * (optional) `environment_variables` are any additional task-specific + environment variables that should be applied to the container. Note that + environment variables are not expanded and are passed as-is. You will + need to source the environment file + `$AZ_BATCH_TASK_WORKING_DIR/.shipyard.envlist` in a shell within the + docker `command` or `entrypoint` if you want any environment variables + to be expanded. + * (optional) `environment_variables_keyvault_secret_id` are any additional + task-specific environment variables that should be applied to the + container but are stored in KeyVault. The secret stored in KeyVault must + be a valid json string, e.g., `{ "env_var_name": "env_var_value" }`. + * (optional) `ports` is an array of port specifications that should be + exposed to the host. + * (optional) `data_volumes` is an array of `data_volume` aliases as defined + in the global configuration file. These volumes will be mounted in the + container. 
+ * (optional) `shared_data_volumes` is an array of `shared_data_volume`
+ aliases as defined in the global configuration file. These volumes will be
+ mounted in the container.
+ * (optional) `resource_files` is an array of resource files that should be
+ downloaded as part of the task. Each array entry contains the following
+ information:
+ * `file_path` is the path within the task working directory to place the
+ file on the compute node.
+ * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an Azure
+ Blob Storage URL.
+ * `file_mode` is the file mode to set for the file on the compute node.
+ This is optional.
+ * (optional) `input_data` is an object containing data that should be
+ ingressed for this specific task. This object currently supports
+ `azure_batch` and `azure_storage` as members. Note for multi-instance
+ tasks, transfer of `input_data` is only applied to the task running the
+ application command.
+ * `azure_batch` contains the following members:
+ * (required) `job_id` the job id of the task
+ * (required) `task_id` the id of the task to fetch files from
+ * (optional) `include` is an array of include filters
+ * (optional) `exclude` is an array of exclude filters
+ * (optional) `destination` is the destination path to place the files.
+ If `destination` is not specified at this level, then files are
+ defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`.
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when downloading
+ from Azure Blob Storage or Azure File Storage, respectively.
+ `container` specifies which container to download from for Azure Blob
+ Storage while `file_share` specifies which file share to download from
+ for Azure File Storage. Only one of these properties can be specified
+ per `data_transfer` object.
+ * (optional) `include` property defines an optional include filter.
+ Although this property is an array, it is only allowed to have 1
+ maximum filter.
+ * (optional) `destination` property defines where to place the
+ downloaded files on the host file system. Unlike the job-level
+ version of `input_data`, this `destination` property can be omitted.
+ If `destination` is not specified at this level, then files are
+ defaulted to download into `$AZ_BATCH_TASK_WORKING_DIR`. Please note
+ that you should not specify a destination that is on a shared file
+ system. If you require ingressing to a shared file system location
+ like a GlusterFS volume, then use the global configuration `files`
+ property and the `data ingress` command.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+ * (optional) `output_data` is an object containing data that should be
+ egressed for this specific task if and only if the task completes
+ successfully. This object currently only supports `azure_storage` as a
+ member. Note for multi-instance tasks, transfer of `output_data` is only
+ applied to the task running the application command.
+ * `azure_storage` contains the following members:
+ * (required) `storage_account_settings` contains a storage account link
+ as defined in the credentials json.
+ * (required) `container` or `file_share` is required when uploading to
+ Azure Blob Storage or Azure File Storage, respectively. `container`
+ specifies which container to upload to for Azure Blob Storage while
+ `file_share` specifies which file share to upload to for Azure File
+ Storage. Only one of these properties can be specified per
+ `data_transfer` object.
+ * (optional) `source` property defines which directory to upload to
+ Azure storage. If `source` is not specified, then `source` is
+ defaulted to `$AZ_BATCH_TASK_DIR`.
+ * (optional) `include` property defines an optional include filter.
+ Although this property is an array, it is only allowed to have 1
+ maximum filter.
+ * (optional) `blobxfer_extra_options` are any extra options to pass to
+ `blobxfer`.
+ * (optional) `remove_container_after_exit` property specifies if the
+ container should be automatically removed/cleaned up after it exits. This
+ defaults to `false`.
+ * (optional) `shm_size` property specifies the size of `/dev/shm` in
+ the container. The default is `64m`. The postfix unit can be designated
+ as `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). This
+ value may need to be increased from the default of `64m` for certain
+ Docker applications, including multi-instance tasks using Intel MPI
+ (see [issue #8](https://github.com/Azure/batch-shipyard/issues/8)).
+ * (optional) `additional_docker_run_options` is an array of additional Docker
+ run options that should be passed to the Docker daemon when starting this
+ container.
+ * (optional) `infiniband` designates if this container requires access to the
+ Infiniband/RDMA devices on the host. Note that this will automatically
+ force the container to use the host network stack. If this property is
+ set to `true`, ensure that the `pool_specification` property
+ `inter_node_communication_enabled` is set to `true`.
+ * (optional) `gpu` designates if this container requires access to the GPU
+ devices on the host. If this property is set to `true`, Docker containers
+ are instantiated via `nvidia-docker`. This requires N-series VM instances.
+ * (optional) `max_task_retries` sets the maximum number of times that
+ Azure Batch should retry this task for. This overrides the job-level task
+ retry count. By default, Azure Batch does not retry tasks that fail
+ (i.e. `max_task_retries` is 0).
+ * (optional) `retention_time` sets the timedelta to retain the task
+ directory on the compute node where it ran after the task completes.
+ The format for this property is a timedelta with a string representation
+ of "d.HH:mm:ss". For example, "1.12:00:00" would allow the compute node
+ to clean up this task's directory 36 hours after the task completed. The
+ default, if unspecified, is effectively infinite - i.e., task data is
+ retained forever on the compute node that ran the task.
+ * (optional) `multi_instance` is a property indicating that this task is a
+ multi-instance task. This is required if the Docker image is an MPI
+ program. Additional information about multi-instance tasks and Batch
+ Shipyard can be found
+ [here](80-batch-shipyard-multi-instance-tasks.md). Do not define this
+ property for tasks that are not multi-instance. Additional members of this
+ property are:
+ * `num_instances` is a property setting the number of compute node
+ instances that are required for this multi-instance task. This can be
+ any one of the following:
+ 1. An integral number
+ 2. `pool_current_dedicated` which is the instantaneous reading of the
+ target pool's current dedicated count during this function invocation.
+ 3. `pool_specification_vm_count` which is the `vm_count` specified in the
+ pool configuration.
+ * `coordination_command` is the coordination command that is run by each
+ instance (compute node) of this multi-instance task prior to the
+ application command. This command must not block and must exit
+ successfully for the multi-instance task to proceed. This is the command
+ passed to the container in `docker run` for multi-instance tasks. This
+ docker container instance will automatically be daemonized. This is
+ optional and may be null.
+ * `resource_files` is an array of resource files that should be downloaded
+ as part of the multi-instance task. Each array entry contains the
+ following information:
+ * `file_path` is the path within the task working directory to place
+ the file on the compute node.
+ * `blob_source` is an accessible HTTP/HTTPS URL. This need not be an
+ Azure Blob Storage URL.
+ * `file_mode` is the file mode to set for the file on the compute node.
+ This is optional.
+ * (optional) `entrypoint` is the property that can override the Docker image
+ defined `ENTRYPOINT`.
+ * (optional) `command` is the command to execute in the Docker container
+ context. If this task is a regular non-multi-instance task, then this is
+ the command passed to the container context during `docker run`. If this
+ task is a multi-instance task, then this `command` is the application
+ command and is executed with `docker exec` in the running Docker container
+ context from the `coordination_command` in the `multi_instance` property.
+ This property may be null.
+
+## Full template
+A full template of a jobs configuration file can be found
+[here](../config\_templates/jobs.json). Note that this template cannot
+be used as-is and must be modified to fit your scenario.
diff --git a/docs/99-current-limitations.md b/docs/99-current-limitations.md
index d4dc65b..c20d455 100644
--- a/docs/99-current-limitations.md
+++ b/docs/99-current-limitations.md
@@ -17,7 +17,9 @@ The following are general limitations or restrictions:
 * Compute pool resize down (i.e., removing nodes from a pool) is not
 supported when peer-to-peer transfer is enabled.
 * The maximum number of compute nodes with peer-to-peer enabled is currently
-40 for Linux pools for non-UserSubscription Batch accounts.
+40 for Linux pools for non-UserSubscription Batch accounts. This check is
+no longer performed before a pool is created and will instead result in
+a ResizeError on the pool if not all compute nodes can be allocated.
 * Data movement between Batch tasks as defined by `input_data`:`azure_batch`
 is restricted to Batch accounts with keys (non-AAD).
 * Virtual network support in Batch pools can only be used with
diff --git a/docs/README.md b/docs/README.md
index f05efe8..dbc67ec 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,6 +6,10 @@ and effectively running your batch-style Docker workloads on Azure Batch.
 2. [Installation](01-batch-shipyard-installation.md)
 3. [Quick Start](02-batch-shipyard-quickstart.md)
 4. [Configuration](10-batch-shipyard-configuration.md)
+ 1. [Credentials Configuration](11-batch-shipyard-configuration-credentials.md)
+ 2. [Global Configuration](12-batch-shipyard-configuration-global.md)
+ 3. [Pool Configuration](13-batch-shipyard-configuration-pool.md)
+ 4. [Jobs Configuration](14-batch-shipyard-configuration-jobs.md)
 5. [Usage](20-batch-shipyard-usage.md)
 6. [Data Movement](70-batch-shipyard-data-movement.md)
 7. 
[Azure KeyVault for Credential Management](74-batch-shipyard-azure-keyvault.md) diff --git a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json +++ b/recipes/CNTK-CPU-Infiniband-IntelMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json +++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json index 9ec459b..110b57c 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json +++ b/recipes/CNTK-CPU-OpenMPI/config/multinode/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json b/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json +++ b/recipes/CNTK-CPU-OpenMPI/config/singlenode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json index 23db972..fa90e76 100644 --- a/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json +++ b/recipes/CNTK-CPU-OpenMPI/config/singlenode/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md index 501d9b0..8d29d3a 100644 --- a/recipes/CNTK-GPU-OpenMPI/README.md +++ b/recipes/CNTK-GPU-OpenMPI/README.md @@ -20,7 +20,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. 
* `inter_node_communication_enabled` must be set to `true` * `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json +++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json index 4ab8344..6c63cc0 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/multinode-multigpu/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json index 0cee0cf..66190c5 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-multigpu/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json index 38d50f4..481a6ee 100644 --- a/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json +++ b/recipes/CNTK-GPU-OpenMPI/config/singlenode-singlegpu/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Caffe-CPU/config/credentials.json b/recipes/Caffe-CPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Caffe-CPU/config/credentials.json +++ b/recipes/Caffe-CPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Caffe-CPU/config/pool.json b/recipes/Caffe-CPU/config/pool.json index 341f476..6109e1a 100644 --- a/recipes/Caffe-CPU/config/pool.json +++ b/recipes/Caffe-CPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": 
"docker" }, diff --git a/recipes/Caffe-GPU/README.md b/recipes/Caffe-GPU/README.md index bd3317c..707b6f4 100644 --- a/recipes/Caffe-GPU/README.md +++ b/recipes/Caffe-GPU/README.md @@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. ### Global Configuration diff --git a/recipes/Caffe-GPU/config/credentials.json b/recipes/Caffe-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Caffe-GPU/config/credentials.json +++ b/recipes/Caffe-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Caffe-GPU/config/pool.json b/recipes/Caffe-GPU/config/pool.json index 33a6f26..ca45ab6 100644 --- a/recipes/Caffe-GPU/config/pool.json +++ b/recipes/Caffe-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Chainer-CPU/config/credentials.json b/recipes/Chainer-CPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Chainer-CPU/config/credentials.json +++ b/recipes/Chainer-CPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Chainer-CPU/config/pool.json b/recipes/Chainer-CPU/config/pool.json index ab98728..4ec8659 100644 --- a/recipes/Chainer-CPU/config/pool.json +++ b/recipes/Chainer-CPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Chainer-GPU/README.md b/recipes/Chainer-GPU/README.md index 25db219..236f9f6 100644 --- a/recipes/Chainer-GPU/README.md +++ b/recipes/Chainer-GPU/README.md @@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. 
### Global Configuration diff --git a/recipes/Chainer-GPU/config/credentials.json b/recipes/Chainer-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Chainer-GPU/config/credentials.json +++ b/recipes/Chainer-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Chainer-GPU/config/pool.json b/recipes/Chainer-GPU/config/pool.json index 5e0832f..871a7c6 100644 --- a/recipes/Chainer-GPU/config/pool.json +++ b/recipes/Chainer-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/FFmpeg-GPU/README.md b/recipes/FFmpeg-GPU/README.md index eab56b4..cf87f08 100644 --- a/recipes/FFmpeg-GPU/README.md +++ b/recipes/FFmpeg-GPU/README.md @@ -18,7 +18,7 @@ audio/video, it is best to choose `NV` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. * `gpu` property should be specified with the following members: * `nvidia_driver` property contains the following members: diff --git a/recipes/FFmpeg-GPU/config/credentials.json b/recipes/FFmpeg-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/FFmpeg-GPU/config/credentials.json +++ b/recipes/FFmpeg-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/FFmpeg-GPU/config/pool.json b/recipes/FFmpeg-GPU/config/pool.json index d5eaec8..9fc2154 100644 --- a/recipes/FFmpeg-GPU/config/pool.json +++ b/recipes/FFmpeg-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json b/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json +++ b/recipes/HPCG-Infiniband-IntelMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json b/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json +++ b/recipes/HPLinpack-Infiniband-IntelMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Keras+Theano-CPU/config/credentials.json b/recipes/Keras+Theano-CPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Keras+Theano-CPU/config/credentials.json +++ b/recipes/Keras+Theano-CPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Keras+Theano-CPU/config/pool.json b/recipes/Keras+Theano-CPU/config/pool.json index a5a0518..aec1ecf 100644 --- a/recipes/Keras+Theano-CPU/config/pool.json +++ b/recipes/Keras+Theano-CPU/config/pool.json @@ -5,7 +5,7 @@ 
"vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Keras+Theano-GPU/README.md b/recipes/Keras+Theano-GPU/README.md index 6d6b982..c8d4cf5 100644 --- a/recipes/Keras+Theano-GPU/README.md +++ b/recipes/Keras+Theano-GPU/README.md @@ -18,7 +18,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. ### Global Configuration diff --git a/recipes/Keras+Theano-GPU/config/credentials.json b/recipes/Keras+Theano-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Keras+Theano-GPU/config/credentials.json +++ b/recipes/Keras+Theano-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Keras+Theano-GPU/config/pool.json b/recipes/Keras+Theano-GPU/config/pool.json index 15ecd66..88b4c2c 100644 --- a/recipes/Keras+Theano-GPU/config/pool.json +++ b/recipes/Keras+Theano-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-CPU/config/multinode/credentials.json b/recipes/MXNet-CPU/config/multinode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/MXNet-CPU/config/multinode/credentials.json +++ b/recipes/MXNet-CPU/config/multinode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/MXNet-CPU/config/multinode/pool.json b/recipes/MXNet-CPU/config/multinode/pool.json index 4a823fa..2f36646 100644 --- a/recipes/MXNet-CPU/config/multinode/pool.json +++ b/recipes/MXNet-CPU/config/multinode/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-CPU/config/singlenode/credentials.json b/recipes/MXNet-CPU/config/singlenode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/MXNet-CPU/config/singlenode/credentials.json +++ b/recipes/MXNet-CPU/config/singlenode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/MXNet-CPU/config/singlenode/pool.json b/recipes/MXNet-CPU/config/singlenode/pool.json index de74395..e6134a1 100644 --- a/recipes/MXNet-CPU/config/singlenode/pool.json +++ b/recipes/MXNet-CPU/config/singlenode/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md index 3e7727f..913cfa0 100644 --- a/recipes/MXNet-GPU/README.md +++ b/recipes/MXNet-GPU/README.md @@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. 
-* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. * `inter_node_communication_enabled` must be set to `true` * `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/MXNet-GPU/config/multinode/credentials.json b/recipes/MXNet-GPU/config/multinode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/MXNet-GPU/config/multinode/credentials.json +++ b/recipes/MXNet-GPU/config/multinode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/MXNet-GPU/config/multinode/pool.json b/recipes/MXNet-GPU/config/multinode/pool.json index 284477e..5999a4f 100644 --- a/recipes/MXNet-GPU/config/multinode/pool.json +++ b/recipes/MXNet-GPU/config/multinode/pool.json @@ -5,7 +5,7 @@ "vm_count": 2, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/MXNet-GPU/config/singlenode/credentials.json b/recipes/MXNet-GPU/config/singlenode/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/MXNet-GPU/config/singlenode/credentials.json +++ b/recipes/MXNet-GPU/config/singlenode/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/MXNet-GPU/config/singlenode/pool.json b/recipes/MXNet-GPU/config/singlenode/pool.json index c0a3ffc..7fae2b5 100644 --- a/recipes/MXNet-GPU/config/singlenode/pool.json +++ b/recipes/MXNet-GPU/config/singlenode/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/NAMD-GPU/README.md b/recipes/NAMD-GPU/README.md index 4280d7c..c4119f5 100644 --- a/recipes/NAMD-GPU/README.md +++ b/recipes/NAMD-GPU/README.md @@ -19,7 +19,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. 
* `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/NAMD-GPU/config/credentials.json b/recipes/NAMD-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/NAMD-GPU/config/credentials.json +++ b/recipes/NAMD-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/NAMD-GPU/config/pool.json b/recipes/NAMD-GPU/config/pool.json index 4b07db6..a9a9860 100644 --- a/recipes/NAMD-GPU/config/pool.json +++ b/recipes/NAMD-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json b/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json +++ b/recipes/NAMD-Infiniband-IntelMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/NAMD-TCP/config/credentials.json b/recipes/NAMD-TCP/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/NAMD-TCP/config/credentials.json +++ b/recipes/NAMD-TCP/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json b/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json +++ b/recipes/OpenFOAM-Infiniband-IntelMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json b/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json +++ b/recipes/OpenFOAM-TCP-OpenMPI/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/TensorFlow-CPU/config/credentials.json b/recipes/TensorFlow-CPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/TensorFlow-CPU/config/credentials.json +++ b/recipes/TensorFlow-CPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/TensorFlow-CPU/config/pool.json b/recipes/TensorFlow-CPU/config/pool.json index 9eddb8e..125be08 100644 --- a/recipes/TensorFlow-CPU/config/pool.json +++ b/recipes/TensorFlow-CPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md index c5a2fec..1982a98 100644 --- a/recipes/TensorFlow-Distributed/README.md +++ b/recipes/TensorFlow-Distributed/README.md @@ -20,7 +20,7 @@ If not using GPUs, another appropriate SKU can be selected. supported once they are available for N-series VMs. * `offer` should be `UbuntuServer` if using GPUs. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS` if using GPUs. 
Other skus will be supported +* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported once they are available for N-series VMs. If on multiple CPUs: diff --git a/recipes/TensorFlow-Distributed/config/cpu/credentials.json b/recipes/TensorFlow-Distributed/config/cpu/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/TensorFlow-Distributed/config/cpu/credentials.json +++ b/recipes/TensorFlow-Distributed/config/cpu/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/TensorFlow-Distributed/config/cpu/pool.json b/recipes/TensorFlow-Distributed/config/cpu/pool.json index 4e3d80b..e083c64 100644 --- a/recipes/TensorFlow-Distributed/config/cpu/pool.json +++ b/recipes/TensorFlow-Distributed/config/cpu/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-Distributed/config/gpu/credentials.json b/recipes/TensorFlow-Distributed/config/gpu/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/TensorFlow-Distributed/config/gpu/credentials.json +++ b/recipes/TensorFlow-Distributed/config/gpu/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/TensorFlow-Distributed/config/gpu/pool.json b/recipes/TensorFlow-Distributed/config/gpu/pool.json index c17621f..c1e0d98 100644 --- a/recipes/TensorFlow-Distributed/config/gpu/pool.json +++ b/recipes/TensorFlow-Distributed/config/gpu/pool.json @@ -6,7 +6,7 @@ "inter_node_communication_enabled": true, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/TensorFlow-GPU/README.md b/recipes/TensorFlow-GPU/README.md index a08e8d7..66941e3 100644 --- a/recipes/TensorFlow-GPU/README.md +++ b/recipes/TensorFlow-GPU/README.md @@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. 
### Global Configuration diff --git a/recipes/TensorFlow-GPU/config/credentials.json b/recipes/TensorFlow-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/TensorFlow-GPU/config/credentials.json +++ b/recipes/TensorFlow-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/TensorFlow-GPU/config/pool.json b/recipes/TensorFlow-GPU/config/pool.json index 272f12c..9c68ff2 100644 --- a/recipes/TensorFlow-GPU/config/pool.json +++ b/recipes/TensorFlow-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Torch-CPU/config/credentials.json b/recipes/Torch-CPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Torch-CPU/config/credentials.json +++ b/recipes/Torch-CPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Torch-CPU/config/pool.json b/recipes/Torch-CPU/config/pool.json index 49b0127..21e64fd 100644 --- a/recipes/Torch-CPU/config/pool.json +++ b/recipes/Torch-CPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/recipes/Torch-GPU/README.md b/recipes/Torch-GPU/README.md index 1c2ef08..e283dee 100644 --- a/recipes/Torch-GPU/README.md +++ b/recipes/Torch-GPU/README.md @@ -17,7 +17,7 @@ compute application, it is best to choose `NC` VM instances. once they are available for N-series VMs. * `offer` should be `UbuntuServer`. Other offers will be supported once they are available for N-series VMs. -* `sku` should be `16.04.0-LTS`. Other skus will be supported once they are +* `sku` should be `16.04-LTS`. Other skus will be supported once they are available for N-series VMs. ### Global Configuration diff --git a/recipes/Torch-GPU/config/credentials.json b/recipes/Torch-GPU/config/credentials.json index 451e167..e9ffd05 100644 --- a/recipes/Torch-GPU/config/credentials.json +++ b/recipes/Torch-GPU/config/credentials.json @@ -1,7 +1,6 @@ { "credentials": { "batch": { - "account": "", "account_key": "", "account_service_url": "" }, diff --git a/recipes/Torch-GPU/config/pool.json b/recipes/Torch-GPU/config/pool.json index 35bcaa9..d62ed03 100644 --- a/recipes/Torch-GPU/config/pool.json +++ b/recipes/Torch-GPU/config/pool.json @@ -5,7 +5,7 @@ "vm_count": 1, "publisher": "Canonical", "offer": "UbuntuServer", - "sku": "16.04.0-LTS", + "sku": "16.04-LTS", "ssh": { "username": "docker" }, diff --git a/scripts/shipyard_remotefs_stat.sh b/scripts/shipyard_remotefs_stat.sh index 845476d..4c0ef74 100755 --- a/scripts/shipyard_remotefs_stat.sh +++ b/scripts/shipyard_remotefs_stat.sh @@ -60,6 +60,9 @@ if [ $server_type == "nfs" ]; then echo "" echo "nfsstat:" nfsstat -s -4 + echo "" + echo "connected clients:" + netstat -tn | grep :2049 else echo "$server_type not supported." exit 1