# Copyright (c) Microsoft Corporation # # All rights reserved. # # MIT License # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. # compat imports from __future__ import ( absolute_import, division, print_function, unicode_literals ) from builtins import ( # noqa bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import logging import os try: import pathlib2 as pathlib except ImportError: import pathlib import requests import tempfile import time import uuid # non-stdlib imports import azure.batch.models as batchmodels # local imports from . import autoscale from . import batch from . import crypto from . import data from . import keyvault from . import misc from . import remotefs from . import resource from . import settings from . import storage from . 
import util from .version import __version__ # create logger logger = logging.getLogger(__name__) util.setup_logger(logger) # global defines _REQUEST_CHUNK_SIZE = 4194304 _ROOT_PATH = pathlib.Path(__file__).resolve().parent.parent _RESOURCES_PATH = None _NVIDIA_DRIVER = { 'compute': { 'url': ( 'http://us.download.nvidia.com/tesla/' '384.111/NVIDIA-Linux-x86_64-384.111.run' ), 'sha256': ( 'bd8af7654ccb224c37e74c8e81477a42f63fa9f2360b1b1ec6ae00b03ae21054' ), 'target': 'nvidia-driver.run' }, 'visualization': { 'url': 'https://go.microsoft.com/fwlink/?linkid=849941', 'sha256': ( 'ca3fd5f5e9156ad3d983b2032bde3c009dca73400f2753f9b475825f4670a854' ), 'target': 'nvidia-driver-grid.run' }, 'license': ( 'http://www.nvidia.com/content/DriverDownload-March2009' '/licence.php?lang=us' ), } _CASCADE_FILE = ( 'cascade.py', pathlib.Path(_ROOT_PATH, 'cascade/cascade.py') ) _PERF_FILE = ( 'perf.py', pathlib.Path(_ROOT_PATH, 'cascade/perf.py') ) _NODEPREP_FILE = ( 'shipyard_nodeprep.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep.sh') ) _NODEPREP_CUSTOMIMAGE_FILE = ( 'shipyard_nodeprep_customimage.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep_customimage.sh') ) _NODEPREP_NATIVEDOCKER_FILE = ( 'shipyard_nodeprep_nativedocker.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep_nativedocker.sh') ) _NODEPREP_WINDOWS_FILE = ( 'shipyard_nodeprep_nativedocker.ps1', pathlib.Path( _ROOT_PATH, 'scripts/windows/shipyard_nodeprep_nativedocker.ps1' ) ) _GLUSTERPREP_FILE = ( 'shipyard_glusterfs_on_compute.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_glusterfs_on_compute.sh') ) _GLUSTERRESIZE_FILE = ( 'shipyard_glusterfs_on_compute_resize.sh', pathlib.Path( _ROOT_PATH, 'scripts/shipyard_glusterfs_on_compute_resize.sh') ) _HPNSSH_FILE = ( 'shipyard_hpnssh.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_hpnssh.sh') ) _IMAGE_BLOCK_FILE = ( 'wait_for_images.sh', pathlib.Path(_ROOT_PATH, 'scripts/wait_for_images.sh') ) _REGISTRY_LOGIN_FILE = ( 'registry_login.sh', pathlib.Path(_ROOT_PATH, 'scripts/registry_login.sh') ) _REGISTRY_LOGIN_WINDOWS_FILE = ( 'registry_login.ps1', pathlib.Path(_ROOT_PATH, 'scripts/windows/registry_login.ps1') ) _BLOBXFER_FILE = ( 'shipyard_blobxfer.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_blobxfer.sh') ) _BLOBXFER_WINDOWS_FILE = ( 'shipyard_blobxfer.ps1', pathlib.Path(_ROOT_PATH, 'scripts/windows/shipyard_blobxfer.ps1') ) _REMOTEFSPREP_FILE = ( 'shipyard_remotefs_bootstrap.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_bootstrap.sh') ) _REMOTEFSADDBRICK_FILE = ( 'shipyard_remotefs_addbrick.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_addbrick.sh') ) _REMOTEFSSTAT_FILE = ( 'shipyard_remotefs_stat.sh', pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_stat.sh') ) _ALL_REMOTEFS_FILES = [ _REMOTEFSPREP_FILE, _REMOTEFSADDBRICK_FILE, _REMOTEFSSTAT_FILE, ] def initialize_globals(verbose): # type: (bool) -> None """Initialize any runtime globals :param bool verbose: verbose """ global _RESOURCES_PATH if _RESOURCES_PATH is None: _RESOURCES_PATH = _ROOT_PATH / 'resources' if not _RESOURCES_PATH.exists(): _RESOURCES_PATH = pathlib.Path( tempfile.gettempdir()) / 'batch-shipyard-{}-resources'.format( __version__) _RESOURCES_PATH.mkdir(parents=True, exist_ok=True) if verbose: logger.debug('initialized resources path to: {}'.format( _RESOURCES_PATH)) def populate_global_settings(config, fs_storage, pool_id=None): # type: (dict, bool) -> None """Populate global settings from config :param dict config: configuration dict :param bool fs_storage: adjust for fs 
context :param str pool_id: pool id override """ bs = settings.batch_shipyard_settings(config) sc = settings.credentials_storage(config, bs.storage_account_settings) if fs_storage: # set postfix to empty for now, it will be populated with the # storage cluster during the actual calls postfix = '' if util.is_not_empty(pool_id): raise ValueError('pool id specified for fs_storage') else: bc = settings.credentials_batch(config) if util.is_none_or_empty(pool_id): pool_id = settings.pool_id(config, lower=True) postfix = '-'.join((bc.account.lower(), pool_id)) storage.set_storage_configuration( bs.storage_entity_prefix, postfix, sc.account, sc.account_key, sc.endpoint, bs.generated_sas_expiry_days) def fetch_credentials_conf_from_keyvault( keyvault_client, keyvault_uri, keyvault_credentials_secret_id): # type: (azure.keyvault.KeyVaultClient, str, str) -> dict """Fetch a credentials conf from keyvault :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param str keyvault_uri: keyvault uri :param str keyvault_credentials_secret_id: keyvault cred secret id :rtype: dict :return: credentials conf """ if keyvault_uri is None: raise ValueError('credentials conf was not specified or is invalid') if keyvault_client is None: raise ValueError('no Azure KeyVault or AAD credentials specified') return keyvault.fetch_credentials_conf( keyvault_client, keyvault_uri, keyvault_credentials_secret_id) def fetch_secrets_from_keyvault(keyvault_client, config): # type: (azure.keyvault.KeyVaultClient, dict) -> None """Fetch secrets with secret ids in config from keyvault :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param dict config: configuration dict """ if keyvault_client is not None: keyvault.parse_secret_ids(keyvault_client, config) def _setup_nvidia_driver_package(blob_client, config, vm_size): # type: (azure.storage.blob.BlockBlobService, dict, str) -> pathlib.Path """Set up the nvidia driver package :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param str vm_size: vm size :rtype: pathlib.Path :return: package path """ gpu_type = settings.get_gpu_type_from_vm_size(vm_size) pkg = _RESOURCES_PATH / _NVIDIA_DRIVER[gpu_type]['target'] # check to see if package is downloaded if (not pkg.exists() or util.compute_sha256_for_file(pkg, False) != _NVIDIA_DRIVER[gpu_type]['sha256']): # display license link if not util.confirm_action( config, msg=('agreement with License for Customer Use of NVIDIA ' 'Software @ {}').format(_NVIDIA_DRIVER['license']), allow_auto=True): raise RuntimeError( 'Cannot proceed with deployment due to non-agreement with ' 'license for NVIDIA driver') else: logger.info('NVIDIA Software License accepted') # download driver logger.debug('downloading NVIDIA driver to {}'.format( _NVIDIA_DRIVER[gpu_type]['target'])) response = requests.get(_NVIDIA_DRIVER[gpu_type]['url'], stream=True) with pkg.open('wb') as f: for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE): if chunk: f.write(chunk) logger.debug('wrote {} bytes to {}'.format(pkg.stat().st_size, pkg)) # check sha256 if (util.compute_sha256_for_file(pkg, False) != _NVIDIA_DRIVER[gpu_type]['sha256']): raise RuntimeError('sha256 mismatch for {}'.format(pkg)) return pkg def _generate_azure_mount_script_name( batch_account_name, pool_id, is_file_share, is_windows): # type: (str, str, bool, bool) -> pathlib.Path """Generate an azure blob/file mount script name :param str batch_account_name: batch account name :param str pool_id: pool id 
:param bool is_file_share: is file share :param bool is_windows: is windows :rtype: pathlib.Path :return: path to azure mount script """ if is_file_share: prefix = 'azurefile' else: prefix = 'azureblob' return _RESOURCES_PATH / '{}-mount-{}-{}.{}'.format( prefix, batch_account_name.lower(), pool_id.lower(), 'cmd' if is_windows else 'sh') def _setup_azureblob_mounts(blob_client, config, bc): # type: (azure.storage.blob.BlockBlobService, dict, # settings.BatchCredentials) -> pathlib.Path """Set up the Azure Blob container via blobfuse :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param settings.BatchCredentials bc: batch creds :rtype: pathlib.Path :return: volume creation script path """ tmpmount = settings.temp_disk_mountpoint(config) # construct mount commands cmds = [] sdv = settings.global_resources_shared_data_volumes(config) for svkey in sdv: if settings.is_shared_data_volume_azure_blob(sdv, svkey): sa = settings.credentials_storage( config, settings.azure_storage_account_settings(sdv, svkey)) cont = settings.azure_blob_container_name(sdv, svkey) hmp = settings.azure_blob_host_mount_path(sa.account, cont) tmpmp = '{}/blobfuse-tmp/{}-{}'.format(tmpmount, sa.account, cont) cmds.append('mkdir -p {}'.format(hmp)) cmds.append('chmod 0770 {}'.format(hmp)) cmds.append('mkdir -p {}'.format(tmpmp)) cmds.append('chown _azbatch:_azbatchgrp {}'.format(tmpmp)) cmds.append('chmod 0770 {}'.format(tmpmp)) conn = 'azblob-{}-{}.cfg'.format(sa.account, cont) cmds.append('cat > {} << EOF'.format(conn)) cmds.append('accountName {}'.format(sa.account)) cmds.append('accountKey {}'.format(sa.account_key)) cmds.append('containerName {}'.format(cont)) cmds.append('EOF') cmd = ( 'blobfuse {hmp} --tmp-path={tmpmp} -o attr_timeout=240 ' '-o entry_timeout=240 -o negative_timeout=120 -o allow_other ' '--config-file={conn}' ).format(hmp=hmp, tmpmp=tmpmp, conn=conn) # add any additional mount options mo = settings.shared_data_volume_mount_options(sdv, svkey) if util.is_not_empty(mo): opts = [] for opt in mo: if opt.strip() == '-o allow_other': continue opts.append(opt) cmd = '{} {}'.format(cmd, ' '.join(opts)) cmds.append(cmd) # create blob container mount command script if util.is_none_or_empty(cmds): raise RuntimeError('Generated Azure blob mount commands are invalid') volcreate = _generate_azure_mount_script_name( bc.account, settings.pool_id(config), False, False) newline = '\n' with volcreate.open('w', newline=newline) as f: f.write('#!/usr/bin/env bash') f.write(newline) f.write('set -e') f.write(newline) f.write('set -o pipefail') f.write(newline) for cmd in cmds: f.write(cmd) f.write(newline) return volcreate def _setup_azurefile_mounts(blob_client, config, bc, is_windows): # type: (azure.storage.blob.BlockBlobService, dict, # settings.BatchCredentials, bool) -> pathlib.Path """Set up the Azure File shares :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param settings.BatchCredentials bc: batch creds :param bool is_windows: is windows pool :rtype: pathlib.Path :return: volume creation script path """ # construct mount commands cmds = [] sdv = settings.global_resources_shared_data_volumes(config) for svkey in sdv: if settings.is_shared_data_volume_azure_file(sdv, svkey): sa = settings.credentials_storage( config, settings.azure_storage_account_settings(sdv, svkey)) share = settings.azure_file_share_name(sdv, svkey)
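# NOTE: illustrative sketch only; the account name "mysa", share name
# "myshare", and the default "core.windows.net" endpoint below are assumed
# example values, not taken from the code. For such a configuration, the
# non-Windows branch that follows would generate a mount command of the form:
#   mount -t cifs //mysa.file.core.windows.net/myshare <host mount path> -o \
#       vers=3.0,username=mysa,password=<storage account key>,serverino
# with any additional user-specified mount options appended, comma-separated.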
hmp = settings.azure_file_host_mount_path( sa.account, share, is_windows) if is_windows: cmd = ( 'net use \\\\{sa}.file.{ep}\{share} {sakey} ' '/user:Azure\{sa}' ).format( sa=sa.account, ep=sa.endpoint, share=share, sakey=sa.account_key) cmds.append(cmd) cmd = 'mklink /d {hmp} \\\\{sa}.file.{ep}\{share}'.format( hmp=hmp, sa=sa.account, ep=sa.endpoint, share=share) else: cmd = ( 'mount -t cifs //{sa}.file.{ep}/{share} {hmp} -o ' 'vers=3.0,username={sa},password={sakey},' 'serverino' ).format( sa=sa.account, ep=sa.endpoint, share=share, hmp=hmp, sakey=sa.account_key) # add any additional mount options mo = settings.shared_data_volume_mount_options(sdv, svkey) if util.is_not_empty(mo): opts = [] # retain backward compatibility with filemode/dirmode # options from the old Azure File Docker volume driver for opt in mo: tmp = opt.split('=') if tmp[0] == 'filemode': opts.append('file_mode={}'.format(tmp[1])) elif tmp[0] == 'dirmode': opts.append('dir_mode={}'.format(tmp[1])) else: opts.append(opt) cmd = '{},{}'.format(cmd, ','.join(opts)) if not is_windows: cmds.append('mkdir -p {}'.format(hmp)) cmds.append(cmd) # create file share mount command script if util.is_none_or_empty(cmds): raise RuntimeError('Generated Azure file mount commands are invalid') volcreate = _generate_azure_mount_script_name( bc.account, settings.pool_id(config), True, is_windows) newline = '\r\n' if is_windows else '\n' with volcreate.open('w', newline=newline) as f: if is_windows: f.write('@echo off') f.write(newline) else: f.write('#!/usr/bin/env bash') f.write(newline) f.write('set -e') f.write(newline) f.write('set -o pipefail') f.write(newline) for cmd in cmds: f.write(cmd) f.write(newline) return volcreate def _create_storage_cluster_mount_args( compute_client, network_client, batch_mgmt_client, config, sc_id, bc, subnet_id): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, dict, str, # settings.BatchCredentials, str) -> Tuple[str, str] """Create storage cluster mount arguments :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param dict config: configuration dict :param str sc_id: storage cluster id :param settings.BatchCredentials bc: batch creds :param str subnet_id: subnet id :rtype: tuple :return: (fstab mount, storage cluster arg) """ fstab_mount = None sc_arg = None ba = batch.get_batch_account(batch_mgmt_client, config) # check for vnet/subnet presence if util.is_none_or_empty(subnet_id): raise RuntimeError( 'cannot mount a storage cluster without a valid virtual ' 'network or subnet') # get remotefs settings rfs = settings.remotefs_settings(config, sc_id) sc = rfs.storage_cluster # iterate through shared data volumes and find storage clusters sdv = settings.global_resources_shared_data_volumes(config) if (sc_id not in sdv or not settings.is_shared_data_volume_storage_cluster( sdv, sc_id)): raise RuntimeError( 'No storage cluster {} found in configuration'.format(sc_id)) vnet_subid, vnet_rg, _, vnet_name, subnet_name = _explode_arm_subnet_id( subnet_id) # check for same vnet name if vnet_name.lower() != sc.virtual_network.name.lower(): raise RuntimeError( 'cannot link storage cluster {} on virtual ' 'network {} with pool virtual network {}'.format( sc_id, sc.virtual_network.name, vnet_name)) # cross check vnet resource group if vnet_rg.lower()
!= sc.virtual_network.resource_group.lower(): raise RuntimeError( 'cannot link storage cluster {} virtual network in resource group ' '{} with pool virtual network in resource group {}'.format( sc_id, sc.virtual_network.resource_group, vnet_rg)) # cross check vnet subscription id _ba_tmp = ba.id.lower().split('/') if vnet_subid.lower() != _ba_tmp[2]: raise RuntimeError( 'cannot link storage cluster {} virtual network in subscription ' '{} with pool virtual network in subscription {}'.format( sc_id, vnet_subid, _ba_tmp[2])) del _ba_tmp # get vm count if sc.vm_count < 1: raise RuntimeError( 'storage cluster {} vm_count {} is invalid'.format( sc_id, sc.vm_count)) # get fileserver type if sc.file_server.type == 'nfs': # query first vm for info vm_name = settings.generate_virtual_machine_name(sc, 0) vm = compute_client.virtual_machines.get( resource_group_name=sc.resource_group, vm_name=vm_name, ) nic = resource.get_nic_from_virtual_machine( network_client, sc.resource_group, vm) # get private ip of vm remote_ip = nic.ip_configurations[0].private_ip_address # construct mount options mo = '_netdev,auto,nfsvers=4,intr' amo = settings.shared_data_volume_mount_options(sdv, sc_id) if util.is_not_empty(amo): if 'udp' in amo: raise RuntimeError( ('udp cannot be specified as a mount option for ' 'storage cluster {}').format(sc_id)) if any([x.startswith('nfsvers=') for x in amo]): raise RuntimeError( ('nfsvers cannot be specified as a mount option for ' 'storage cluster {}').format(sc_id)) if any([x.startswith('port=') for x in amo]): raise RuntimeError( ('port cannot be specified as a mount option for ' 'storage cluster {}').format(sc_id)) mo = ','.join((mo, ','.join(amo))) # construct mount string for fstab fstab_mount = ( '{remoteip}:{srcpath} {hmp}/{scid} ' '{fstype} {mo} 0 2').format( remoteip=remote_ip, srcpath=sc.file_server.mountpoint, hmp=settings.get_host_mounts_path(False), scid=sc_id, fstype=sc.file_server.type, mo=mo) elif sc.file_server.type == 'glusterfs': # walk vms and find non-overlapping ud/fds primary_ip = None primary_ud = None primary_fd = None backup_ip = None backup_ud = None backup_fd = None vms = {} # first pass, attempt to populate all ip, ud/fd for i in range(sc.vm_count): vm_name = settings.generate_virtual_machine_name(sc, i) vm = compute_client.virtual_machines.get( resource_group_name=sc.resource_group, vm_name=vm_name, expand=compute_client.virtual_machines.models.
InstanceViewTypes.instance_view, ) nic = resource.get_nic_from_virtual_machine( network_client, sc.resource_group, vm) vms[i] = (vm, nic) # get private ip and ud/fd of vm remote_ip = nic.ip_configurations[0].private_ip_address ud = vm.instance_view.platform_update_domain fd = vm.instance_view.platform_fault_domain if primary_ip is None: primary_ip = remote_ip primary_ud = ud primary_fd = fd if backup_ip is None: if (primary_ip == remote_ip or primary_ud == ud or primary_fd == fd): continue backup_ip = remote_ip backup_ud = ud backup_fd = fd # second pass, fill in with at least non-overlapping update domains if backup_ip is None: for i in range(sc.vm_count): vm, nic = vms[i] remote_ip = nic.ip_configurations[0].private_ip_address ud = vm.instance_view.platform_update_domain fd = vm.instance_view.platform_fault_domain if primary_ud != ud: backup_ip = remote_ip backup_ud = ud backup_fd = fd break if primary_ip is None or backup_ip is None: raise RuntimeError( 'Could not find either a primary ip {} or backup ip {} for ' 'glusterfs client mount'.format(primary_ip, backup_ip)) logger.debug('primary ip/ud/fd={} backup ip/ud/fd={}'.format( (primary_ip, primary_ud, primary_fd), (backup_ip, backup_ud, backup_fd))) # construct mount options mo = '_netdev,auto,transport=tcp,backupvolfile-server={}'.format( backup_ip) amo = settings.shared_data_volume_mount_options(sdv, sc_id) if util.is_not_empty(amo): if any([x.startswith('backupvolfile-server=') for x in amo]): raise RuntimeError( ('backupvolfile-server cannot be specified as a mount ' 'option for storage cluster {}').format(sc_id)) if any([x.startswith('transport=') for x in amo]): raise RuntimeError( ('transport cannot be specified as a mount option for ' 'storage cluster {}').format(sc_id)) mo = ','.join((mo, ','.join(amo))) # construct mount string for fstab, srcpath is the gluster volume fstab_mount = ( '{remoteip}:/{srcpath} {hmp}/{scid} ' '{fstype} {mo} 0 2').format( remoteip=primary_ip, srcpath=settings.get_file_server_glusterfs_volume_name(sc), hmp=settings.get_host_mounts_path(False), scid=sc_id, fstype=sc.file_server.type, mo=mo) else: raise NotImplementedError( ('cannot handle file_server type {} for storage ' 'cluster {}').format(sc.file_server.type, sc_id)) if util.is_none_or_empty(fstab_mount): raise RuntimeError( ('Could not construct an fstab mount entry for storage ' 'cluster {}').format(sc_id)) # construct sc_arg sc_arg = '{}:{}'.format(sc.file_server.type, sc_id) # log config if settings.verbose(config): logger.debug('storage cluster {} fstab mount: {}'.format( sc_id, fstab_mount)) return (fstab_mount, sc_arg) def _create_custom_linux_mount_args(config, mount_name): # type: (dict, str) -> str """Create a custom linux mount fstab entry :param dict config: configuration dict :param str mount_name: mount name :rtype: str :return: fstab entry """ sdv = settings.global_resources_shared_data_volumes(config) fstab = settings.custom_linux_mount_fstab_options(sdv, mount_name) fstab_mount = ( '{fs_spec} {hmp}/{name} {fs_vfstype} {fs_mntops} {fs_freq} ' '{fs_passno}').format( fs_spec=fstab.fs_spec, hmp=settings.get_host_mounts_path(False), name=mount_name, fs_vfstype=fstab.fs_vfstype, fs_mntops=fstab.fs_mntops, fs_freq=fstab.fs_freq, fs_passno=fstab.fs_passno) return fstab_mount def _pick_node_agent_for_vm(batch_client, pool_settings): # type: (azure.batch.batch_service_client.BatchServiceClient, # settings.PoolSettings) -> (str, str) """Pick a node agent id for the vm :param azure.batch.batch_service_client.BatchServiceClient batch_client:
batch client :param settings.PoolSettings pool_settings: pool settings :rtype: tuple :return: image reference to use, node agent id to use """ publisher = pool_settings.vm_configuration.publisher offer = pool_settings.vm_configuration.offer sku = pool_settings.vm_configuration.sku # TODO special exception for CentOS HPC 7.1 if publisher == 'openlogic' and offer == 'centos-hpc' and sku == '7.1': return ({ 'publisher': publisher, 'offer': offer, 'sku': sku, 'version': pool_settings.vm_configuration.version, }, 'batch.node.centos 7') # pick latest sku node_agent_skus = batch_client.account.list_node_agent_skus() skus_to_use = [ (nas, image_ref) for nas in node_agent_skus for image_ref in sorted( nas.verified_image_references, key=lambda item: item.sku ) if image_ref.publisher.lower() == publisher and image_ref.offer.lower() == offer and image_ref.sku.lower() == sku ] try: sku_to_use, image_ref_to_use = skus_to_use[-1] except IndexError: raise RuntimeError( ('Could not find an Azure Batch Node Agent Sku for this ' 'offer={} publisher={} sku={}. You can list the valid and ' 'available Marketplace images with the command: pool ' 'listskus').format( pool_settings.vm_configuration.offer, pool_settings.vm_configuration.publisher, pool_settings.vm_configuration.sku)) # set image version to use image_ref_to_use.version = pool_settings.vm_configuration.version logger.info('deploying vm config: {}'.format(image_ref_to_use)) return (image_ref_to_use, sku_to_use.id) def _explode_arm_subnet_id(arm_subnet_id): # type: (str) -> Tuple[str, str, str, str, str] """Parses components from ARM subnet id :param str arm_subnet_id: ARM subnet id :rtype: tuple :return: subid, rg, provider, vnet, subnet """ tmp = arm_subnet_id.split('/') subid = tmp[2] rg = tmp[4] provider = tmp[6] vnet = tmp[8] subnet = tmp[10] return subid, rg, provider, vnet, subnet def _pool_virtual_network_subnet_address_space_check( resource_client, network_client, config, pool_settings, bc): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, # settings.PoolSettings, settings.BatchCredentialsSettings) -> str """Pool Virtual Network and subnet address space check and create if specified :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: configuration dict :param settings.PoolSettings pool_settings: pool settings :param settings.BatchCredentialsSettings bc: batch cred settings :rtype: str :return: subnet id """ if (util.is_none_or_empty(pool_settings.virtual_network.arm_subnet_id) and util.is_none_or_empty(pool_settings.virtual_network.name)): logger.debug('no virtual network settings specified') return None # check if AAD is enabled if util.is_not_empty(bc.account_key): raise RuntimeError( 'cannot allocate a pool with a virtual network without AAD ' 'credentials') # get subnet object subnet_id = None if util.is_not_empty(pool_settings.virtual_network.arm_subnet_id): subnet_components = _explode_arm_subnet_id( pool_settings.virtual_network.arm_subnet_id) logger.debug( ('arm subnet id breakdown: subid={} rg={} provider={} vnet={} ' 'subnet={}').format( subnet_components[0], subnet_components[1], subnet_components[2], subnet_components[3], subnet_components[4])) subnet_id = pool_settings.virtual_network.arm_subnet_id if network_client is None: logger.info('using virtual network subnet id: {}'.format( subnet_id)) logger.warning( 'cannot 
perform IP space validation without a valid ' 'network_client, please specify management AAD credentials ' 'to allow pre-validation') return subnet_id # retrieve address prefix for subnet _subnet = network_client.subnets.get( subnet_components[1], subnet_components[3], subnet_components[4]) else: if util.is_not_empty(pool_settings.virtual_network.resource_group): _vnet_rg = pool_settings.virtual_network.resource_group else: _vnet_rg = bc.resource_group # create virtual network and subnet if specified _, _subnet = resource.create_virtual_network_and_subnet( resource_client, network_client, _vnet_rg, bc.location, pool_settings.virtual_network) del _vnet_rg subnet_id = _subnet.id # ensure address prefix for subnet is valid tmp = _subnet.address_prefix.split('/') if len(tmp) <= 1: raise RuntimeError( 'subnet address_prefix is invalid for Batch pools: {}'.format( _subnet.address_prefix)) mask = int(tmp[-1]) # subtract 5 for guideline and Azure numbering start allowable_addresses = (1 << (32 - mask)) - 5 logger.debug('subnet {} mask is {} and allows {} addresses'.format( _subnet.name, mask, allowable_addresses)) pool_total_vm_count = ( pool_settings.vm_count.dedicated + pool_settings.vm_count.low_priority ) if allowable_addresses < pool_total_vm_count: raise RuntimeError( ('subnet {} mask is {} and allows {} addresses but desired ' 'pool vm_count is {}').format( _subnet.name, mask, allowable_addresses, pool_total_vm_count)) elif int(allowable_addresses * 0.9) <= pool_total_vm_count: # if within 90% tolerance, warn user due to potential # address shortage if other compute resources are in this subnet if not util.confirm_action( config, msg=('subnet {} mask is {} and allows {} addresses ' 'but desired pool vm_count is {}, proceed?').format( _subnet.name, mask, allowable_addresses, pool_total_vm_count)): raise RuntimeError('Pool deployment rejected by user') logger.info('using virtual network subnet id: {}'.format(subnet_id)) return subnet_id def _construct_pool_object( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, # azure.batch.batch_service_client.BatchServiceClient, # azureblob.BlockBlobService, dict) -> None """Construct a pool add parameter object for create pool along with uploading resource files :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict """ # check shared data volume mounts before proceeding to allocate azureblob_vd = False azurefile_vd = False gluster_on_compute = False storage_cluster_mounts = [] custom_linux_mounts = [] try: sdv = settings.global_resources_shared_data_volumes(config) for sdvkey in sdv: if settings.is_shared_data_volume_azure_file(sdv, sdvkey): azurefile_vd = True elif settings.is_shared_data_volume_azure_blob(sdv, sdvkey): azureblob_vd = True elif settings.is_shared_data_volume_gluster_on_compute( sdv, sdvkey): if gluster_on_compute: raise ValueError( 
'only one glusterfs on compute can be created') gluster_on_compute = True elif settings.is_shared_data_volume_storage_cluster( sdv, sdvkey): storage_cluster_mounts.append(sdvkey) elif settings.is_shared_data_volume_custom_linux_mount( sdv, sdvkey): custom_linux_mounts.append(sdvkey) else: raise ValueError('Unknown shared data volume: {}'.format( settings.shared_data_volume_driver(sdv, sdvkey))) except KeyError: pass # retrieve settings pool_settings = settings.pool_settings(config) native = settings.is_native_docker_pool( config, vm_config=pool_settings.vm_configuration) is_windows = settings.is_windows_pool( config, vm_config=pool_settings.vm_configuration) # get autoscale settings if settings.is_pool_autoscale_enabled(config, pas=pool_settings.autoscale): asenable = True asformula = autoscale.get_formula(pool_settings) asei = pool_settings.autoscale.evaluation_interval if pool_settings.resize_timeout is not None: logger.warning( 'ignoring resize timeout for autoscale-enabled pool') else: asenable = False asformula = None asei = None logger.debug('autoscale enabled: {}'.format(asenable)) # task scheduling policy settings if util.is_not_empty(pool_settings.node_fill_type): task_scheduling_policy = batchmodels.TaskSchedulingPolicy( node_fill_type=batchmodels.ComputeNodeFillType( pool_settings.node_fill_type), ) else: task_scheduling_policy = None # custom image settings custom_image_na = settings.pool_custom_image_node_agent(config) # check for virtual network settings bc = settings.credentials_batch(config) subnet_id = _pool_virtual_network_subnet_address_space_check( resource_client, network_client, config, pool_settings, bc) # construct fstab mounts for storage clusters sc_fstab_mounts = [] sc_args = [] if util.is_not_empty(storage_cluster_mounts): for sc_id in storage_cluster_mounts: fm, sca = _create_storage_cluster_mount_args( compute_client, network_client, batch_mgmt_client, config, sc_id, bc, subnet_id) sc_fstab_mounts.append(fm) sc_args.append(sca) if settings.verbose(config): logger.debug('storage cluster args: {}'.format(sc_args)) del storage_cluster_mounts # construct fstab mounts for custom mounts custom_linux_fstab_mounts = [] if util.is_not_empty(custom_linux_mounts): for id in custom_linux_mounts: custom_linux_fstab_mounts.append( _create_custom_linux_mount_args(config, id)) del custom_linux_mounts # add encryption cert to account if specified encrypt = settings.batch_shipyard_encryption_enabled(config) if encrypt: pfx = crypto.get_encryption_pfx_settings(config) batch.add_certificate_to_account(batch_client, config) # construct block list block_for_gr = None if pool_settings.block_until_all_global_resources_loaded: block_for_gr_docker = '' block_for_gr_singularity = '' docker_images = settings.global_resources_docker_images(config) if len(docker_images) > 0: block_for_gr_docker = ','.join([x for x in docker_images]) singularity_images = settings.global_resources_singularity_images( config) if len(singularity_images) > 0: block_for_gr_singularity = ','.join( [util.singularity_image_name_on_disk(x) for x in singularity_images]) if (util.is_none_or_empty(block_for_gr_docker) and util.is_none_or_empty(block_for_gr_singularity)): logger.warning( 'no Docker and Singularity images specified in global ' 'resources') if native: # native pools will auto preload block_for_gr_docker = '' block_for_gr = '{}#{}'.format( block_for_gr_docker, block_for_gr_singularity) # shipyard settings bs = settings.batch_shipyard_settings(config) # data replication and peer-to-peer settings dr =
settings.data_replication_settings(config) # create torrent flags torrentflags = '{}:{}:{}:{}'.format( dr.peer_to_peer.enabled, dr.concurrent_source_downloads, dr.peer_to_peer.direct_download_seed_bias, dr.peer_to_peer.compression) # create resource files list if is_windows: _rflist = [_REGISTRY_LOGIN_WINDOWS_FILE, _BLOBXFER_WINDOWS_FILE] else: _rflist = [_REGISTRY_LOGIN_FILE, _BLOBXFER_FILE] if not native and not is_windows: _rflist.append(_IMAGE_BLOCK_FILE) if not bs.use_shipyard_docker_image: _rflist.append(_CASCADE_FILE) if bs.store_timing_metrics: _rflist.append(_PERF_FILE) if pool_settings.ssh.hpn_server_swap: _rflist.append(_HPNSSH_FILE) # handle azure mounts if azureblob_vd: abms = _setup_azureblob_mounts(blob_client, config, bc) _rflist.append(('azureblob-mount.sh', abms)) if azurefile_vd: afms = _setup_azurefile_mounts(blob_client, config, bc, is_windows) _rflist.append( ('azurefile-mount.{}'.format('cmd' if is_windows else 'sh'), afms) ) # gpu settings if (not native and settings.is_gpu_pool(pool_settings.vm_size) and util.is_none_or_empty(custom_image_na)): if pool_settings.gpu_driver is None: gpu_driver = _setup_nvidia_driver_package( blob_client, config, pool_settings.vm_size) _rflist.append((gpu_driver.name, gpu_driver)) else: gpu_type = settings.get_gpu_type_from_vm_size( pool_settings.vm_size) gpu_driver = pathlib.Path(_NVIDIA_DRIVER[gpu_type]['target']) gpu_env = '{}:{}'.format( settings.is_gpu_visualization_pool(pool_settings.vm_size), gpu_driver.name) else: gpu_env = None # get container registries docker_registries = settings.docker_registries(config) # set additional start task commands (pre version) start_task = pool_settings.additional_node_prep_commands_pre # set vm configuration if native: if util.is_not_empty(custom_image_na): # check if AAD is enabled if util.is_not_empty(bc.account_key): raise RuntimeError( 'cannot allocate a pool with a custom image without AAD ' 'credentials') vmconfig = batchmodels.VirtualMachineConfiguration( image_reference=batchmodels.ImageReference( virtual_machine_image_id=pool_settings. 
vm_configuration.arm_image_id, ), node_agent_sku_id=pool_settings.vm_configuration.node_agent, ) logger.debug( ('deploying custom image to pool in native mode: {} ' 'node agent: {}').format( vmconfig.image_reference.virtual_machine_image_id, vmconfig.node_agent_sku_id)) else: image_ref, na_ref = _pick_node_agent_for_vm( batch_client, pool_settings) vmconfig = batchmodels.VirtualMachineConfiguration( image_reference=image_ref, node_agent_sku_id=na_ref, ) logger.debug('deploying pool in native mode') # attach container config vmconfig.container_configuration = batchmodels.ContainerConfiguration( container_image_names=settings.global_resources_docker_images( config), container_registries=docker_registries, ) if is_windows: if util.is_not_empty(custom_image_na): raise RuntimeError( 'Native mode and Windows custom images is not supported') _rflist.append(_NODEPREP_WINDOWS_FILE) start_task.append( ('powershell -ExecutionPolicy Unrestricted -command ' '{npf}{a}{e}{v}{x}').format( npf=_NODEPREP_WINDOWS_FILE[0], a=' -a' if azurefile_vd else '', e=' -e {}'.format(pfx.sha1) if encrypt else '', v=' -v {}'.format(__version__), x=' -x {}'.format(data._BLOBXFER_VERSION)) ) else: _rflist.append(_NODEPREP_NATIVEDOCKER_FILE) start_task.append( '{npf}{a}{c}{e}{f}{m}{n}{v}{x}'.format( npf=_NODEPREP_NATIVEDOCKER_FILE[0], a=' -a' if azurefile_vd else '', c=' -c' if azureblob_vd else '', e=' -e {}'.format(pfx.sha1) if encrypt else '', f=' -f' if gluster_on_compute else '', m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty( sc_args) else '', n=' -n' if settings.can_tune_tcp( pool_settings.vm_size) else '', v=' -v {}'.format(__version__), x=' -x {}'.format(data._BLOBXFER_VERSION), ) ) elif util.is_not_empty(custom_image_na): # check if AAD is enabled if util.is_not_empty(bc.account_key): raise RuntimeError( 'cannot allocate a pool with a custom image without AAD ' 'credentials') _rflist.append(_NODEPREP_CUSTOMIMAGE_FILE) vmconfig = batchmodels.VirtualMachineConfiguration( image_reference=batchmodels.ImageReference( virtual_machine_image_id=pool_settings. 
vm_configuration.arm_image_id, ), node_agent_sku_id=pool_settings.vm_configuration.node_agent, ) logger.debug('deploying custom image: {} node agent: {}'.format( vmconfig.image_reference.virtual_machine_image_id, vmconfig.node_agent_sku_id)) start_task.append( '{npf}{a}{b}{c}{e}{f}{m}{n}{p}{t}{v}{x}'.format( npf=_NODEPREP_CUSTOMIMAGE_FILE[0], a=' -a' if azurefile_vd else '', b=' -b' if util.is_not_empty(block_for_gr) else '', c=' -c' if azureblob_vd else '', e=' -e {}'.format(pfx.sha1) if encrypt else '', f=' -f' if gluster_on_compute else '', m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty( sc_args) else '', n=' -n' if settings.can_tune_tcp( pool_settings.vm_size) else '', p=' -p {}'.format(bs.storage_entity_prefix) if bs.storage_entity_prefix else '', t=' -t {}'.format(torrentflags), v=' -v {}'.format(__version__), x=' -x {}'.format(data._BLOBXFER_VERSION), ) ) else: _rflist.append(_NODEPREP_FILE) image_ref, na_ref = _pick_node_agent_for_vm( batch_client, pool_settings) vmconfig = batchmodels.VirtualMachineConfiguration( image_reference=image_ref, node_agent_sku_id=na_ref, ) # create start task commandline start_task.append( '{npf}{a}{b}{c}{d}{e}{f}{g}{m}{n}{o}{p}{s}{t}{v}{w}{x}'.format( npf=_NODEPREP_FILE[0], a=' -a' if azurefile_vd else '', b=' -b' if util.is_not_empty(block_for_gr) else '', c=' -c' if azureblob_vd else '', d=' -d' if bs.use_shipyard_docker_image else '', e=' -e {}'.format(pfx.sha1) if encrypt else '', f=' -f' if gluster_on_compute else '', g=' -g {}'.format(gpu_env) if gpu_env is not None else '', m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty( sc_args) else '', n=' -n' if settings.can_tune_tcp( pool_settings.vm_size) else '', o=' -o {}'.format(pool_settings.vm_configuration.offer), p=' -p {}'.format(bs.storage_entity_prefix) if bs.storage_entity_prefix else '', s=' -s {}'.format(pool_settings.vm_configuration.sku), t=' -t {}'.format(torrentflags), v=' -v {}'.format(__version__), w=' -w' if pool_settings.ssh.hpn_server_swap else '', x=' -x {}'.format(data._BLOBXFER_VERSION), ) ) # upload resource files sas_urls = storage.upload_resource_files(blob_client, config, _rflist) del _rflist # remove temporary az mount files created if azureblob_vd: try: abms.unlink() pass except OSError: pass if azurefile_vd: try: afms.unlink() except OSError: pass # digest any input data addlcmds = data.process_input_data( config, _BLOBXFER_WINDOWS_FILE if is_windows else _BLOBXFER_FILE, settings.pool_specification(config)) if addlcmds is not None: start_task.append(addlcmds) del addlcmds # add additional start task commands (post version) start_task.extend(pool_settings.additional_node_prep_commands_post) # create pool param pool = batchmodels.PoolAddParameter( id=pool_settings.id, virtual_machine_configuration=vmconfig, vm_size=pool_settings.vm_size, target_dedicated_nodes=( pool_settings.vm_count.dedicated if not asenable else None ), target_low_priority_nodes=( pool_settings.vm_count.low_priority if not asenable else None ), resize_timeout=pool_settings.resize_timeout if not asenable else None, max_tasks_per_node=pool_settings.max_tasks_per_node, enable_inter_node_communication=pool_settings. 
inter_node_communication_enabled, start_task=batchmodels.StartTask( command_line=util.wrap_commands_in_shell( start_task, windows=is_windows, wait=False), user_identity=batch._RUN_ELEVATED, wait_for_success=True, environment_settings=[ batchmodels.EnvironmentSetting('LC_ALL', 'en_US.UTF-8'), ], resource_files=[], ), enable_auto_scale=asenable, auto_scale_formula=asformula, auto_scale_evaluation_interval=asei, metadata=[ batchmodels.MetadataItem( name=settings.get_metadata_version_name(), value=__version__, ), ], task_scheduling_policy=task_scheduling_policy, ) if encrypt: if is_windows: pool.certificate_references = [ batchmodels.CertificateReference( pfx.sha1, 'sha1', visibility=[ batchmodels.CertificateVisibility.start_task, batchmodels.CertificateVisibility.task, ] ) ] else: pool.certificate_references = [ batchmodels.CertificateReference( pfx.sha1, 'sha1', visibility=[batchmodels.CertificateVisibility.start_task] ) ] for rf in sas_urls: pool.start_task.resource_files.append( batchmodels.ResourceFile( file_path=rf, blob_source=sas_urls[rf]) ) if not native: pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SHIPYARD_STORAGE_ENV', crypto.encrypt_string( encrypt, '{}:{}:{}'.format( storage.get_storageaccount(), storage.get_storageaccount_endpoint(), storage.get_storageaccount_key()), config) ) ) if pool_settings.gpu_driver and util.is_none_or_empty(custom_image_na): pool.start_task.resource_files.append( batchmodels.ResourceFile( file_path=gpu_driver.name, blob_source=pool_settings.gpu_driver, file_mode='0755') ) # add any additional specified resource files if util.is_not_empty(pool_settings.resource_files): for rf in pool_settings.resource_files: pool.start_task.resource_files.append( batchmodels.ResourceFile( file_path=rf.file_path, blob_source=rf.blob_source, file_mode=rf.file_mode, ) ) # virtual network settings if subnet_id is not None: pool.network_configuration = batchmodels.NetworkConfiguration( subnet_id=subnet_id, ) # storage cluster settings if util.is_not_empty(sc_fstab_mounts): pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SHIPYARD_STORAGE_CLUSTER_FSTAB', '#'.join(sc_fstab_mounts) ) ) del sc_args del sc_fstab_mounts # custom linux mount settings if util.is_not_empty(custom_linux_fstab_mounts): pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SHIPYARD_CUSTOM_MOUNTS_FSTAB', '#'.join(custom_linux_fstab_mounts) ) ) del custom_linux_fstab_mounts # add optional environment variables if not native and bs.store_timing_metrics: pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting('SHIPYARD_TIMING', '1') ) # add docker login settings pool.start_task.environment_settings.extend( batch.generate_docker_login_settings(config)[0]) # image preload setting if util.is_not_empty(block_for_gr): pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SHIPYARD_CONTAINER_IMAGES_PRELOAD', block_for_gr, ) ) # singularity env vars if not is_windows: pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SINGULARITY_TMPDIR', settings.get_singularity_tmpdir(config) ) ) pool.start_task.environment_settings.append( batchmodels.EnvironmentSetting( 'SINGULARITY_CACHEDIR', settings.get_singularity_cachedir(config) ) ) return (pool_settings, gluster_on_compute, pool) def _construct_auto_pool_specification( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config): # type: 
(azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, # azure.batch.batch_service_client.BatchServiceClient, # azureblob.BlockBlobService, dict) -> None """Construct an auto pool specification :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict """ # upload resource files and construct pool add parameter object pool_settings, gluster_on_compute, pool = _construct_pool_object( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config) # convert pool add parameter object to a pool specification object poolspec = batchmodels.PoolSpecification( vm_size=pool.vm_size, virtual_machine_configuration=pool.virtual_machine_configuration, max_tasks_per_node=pool.max_tasks_per_node, task_scheduling_policy=pool.task_scheduling_policy, resize_timeout=pool.resize_timeout, target_dedicated_nodes=pool.target_dedicated_nodes, target_low_priority_nodes=pool.target_low_priority_nodes, enable_auto_scale=pool.enable_auto_scale, auto_scale_formula=pool.auto_scale_formula, auto_scale_evaluation_interval=pool.auto_scale_evaluation_interval, enable_inter_node_communication=pool.enable_inter_node_communication, network_configuration=pool.network_configuration, start_task=pool.start_task, certificate_references=pool.certificate_references, metadata=pool.metadata, ) # add auto pool env var for cascade poolspec.start_task.environment_settings.append( batchmodels.EnvironmentSetting('SHIPYARD_AUTOPOOL', 1) ) return poolspec def _add_pool( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, # azure.batch.batch_service_client.BatchServiceClient, # azureblob.BlockBlobService, dict) -> None """Add a Batch pool to account :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict """ # upload resource files and construct pool add parameter object pool_settings, gluster_on_compute, pool = _construct_pool_object( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config) # ingress data to Azure Blob Storage if specified storage_threads = [] if pool_settings.transfer_files_on_pool_creation: storage_threads = data.ingress_data( batch_client, compute_client, network_client, config, rls=None, kind='storage') # create pool nodes = batch.create_pool(batch_client, 
config, pool) _pool = batch_client.pool.get(pool.id) pool_current_vm_count = ( _pool.current_dedicated_nodes + _pool.current_low_priority_nodes ) pool_target_vm_count = ( _pool.target_dedicated_nodes + _pool.target_low_priority_nodes ) if util.is_none_or_empty(nodes) and pool_target_vm_count > 0: raise RuntimeError( ('No nodes could be allocated for pool: {}. If the pool is ' 'comprised entirely of low priority nodes, then there may not ' 'have been enough available capacity in the region to satisfy ' 'your request. Please inspect the pool for resize errors and ' 'issue pool resize to try again.').format(pool.id)) # set up gluster on compute if specified if gluster_on_compute and pool_current_vm_count > 0: _setup_glusterfs( batch_client, blob_client, config, nodes, _GLUSTERPREP_FILE, cmdline=None) # create admin user on each node if requested if pool_current_vm_count > 0: try: batch.add_rdp_user(batch_client, config, nodes) except Exception as e: logger.exception(e) try: batch.add_ssh_user(batch_client, config, nodes) except Exception as e: logger.exception(e) logger.error( 'Could not add SSH users to nodes. Please ensure ssh-keygen ' 'is available in your PATH or cwd. Skipping data ingress if ' 'specified.') else: rls = None # ingress data to shared fs if specified if pool_settings.transfer_files_on_pool_creation: if rls is None: rls = batch.get_remote_login_settings( batch_client, config, nodes) data.ingress_data( batch_client, compute_client, network_client, config, rls=rls, kind='shared', total_vm_count=pool_current_vm_count) # log remote login settings if rls is None: if pool_current_vm_count <= 16: batch.get_remote_login_settings( batch_client, config, nodes) else: logger.info( 'Not listing remote login settings due to VM count. ' 'If you need a list of remote login settings for all ' 'nodes in the pool, issue the "pool nodes grls" ' 'command.') # wait for storage ingress processes data.wait_for_storage_threads(storage_threads) def _setup_glusterfs( batch_client, blob_client, config, nodes, shell_script, cmdline=None): # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService, dict, # List[batchmodels.ComputeNode], str, str) -> None """Setup glusterfs via multi-instance task :param batch_client: The batch client to use. 
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param list nodes: list of nodes :param str shell_script: glusterfs setup script to use :param str cmdline: coordination cmdline """ # get volume type/options voltype = None volopts = None sdv = settings.global_resources_shared_data_volumes(config) for sdvkey in sdv: try: if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey): voltype = settings.gluster_volume_type(sdv, sdvkey) volopts = settings.gluster_volume_options(sdv, sdvkey) break except KeyError: pass if voltype is None: raise RuntimeError('glusterfs volume not defined') pool_id = settings.pool_id(config) job_id = 'shipyard-glusterfs-{}'.format(uuid.uuid4()) job = batchmodels.JobAddParameter( id=job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id), ) # create coordination command line if cmdline is None: tempdisk = settings.temp_disk_mountpoint(config) cmdline = util.wrap_commands_in_shell([ '$AZ_BATCH_TASK_DIR/{} {} {}'.format( shell_script[0], voltype.lower(), tempdisk)]) # create application command line appcmd = [ '[[ -f $AZ_BATCH_TASK_WORKING_DIR/.glusterfs_success ]] || exit 1', ] if volopts is not None: for vo in volopts: appcmd.append('gluster volume set {} {}'.format( settings.get_gluster_default_volume_name(), vo)) # upload script sas_urls = storage.upload_resource_files( blob_client, config, [shell_script]) # get pool current dedicated pool = batch_client.pool.get(pool_id) batchtask = batchmodels.TaskAddParameter( id='gluster-setup', multi_instance_settings=batchmodels.MultiInstanceSettings( number_of_instances=pool.current_dedicated_nodes, coordination_command_line=cmdline, common_resource_files=[ batchmodels.ResourceFile( file_path=shell_script[0], blob_source=sas_urls[shell_script[0]], file_mode='0755'), ], ), command_line=util.wrap_commands_in_shell(appcmd), user_identity=batch._RUN_ELEVATED, ) # add job and task batch_client.job.add(job) batch_client.task.add(job_id=job_id, task=batchtask) logger.debug( 'waiting for glusterfs setup task {} in job {} to complete'.format( batchtask.id, job_id)) # wait for gluster fs setup task to complete while True: batchtask = batch_client.task.get(job_id, batchtask.id) if batchtask.state == batchmodels.TaskState.completed: break time.sleep(1) # ensure all nodes have glusterfs success file if nodes is None: nodes = batch_client.compute_node.list(pool_id) success = True for node in nodes: try: batch_client.file.get_properties_from_compute_node( pool_id, node.id, ('workitems/{}/job-1/gluster-setup/wd/' '.glusterfs_success').format(job_id)) except batchmodels.BatchErrorException: logger.error('gluster success file absent on node {}'.format( node.id)) success = False break # delete job batch_client.job.delete(job_id) if not success: raise RuntimeError('glusterfs setup failed') logger.info( 'glusterfs setup task {} in job {} completed'.format( batchtask.id, job_id)) def _update_container_images_over_ssh(batch_client, config, pool, cmd): # type: (batchsc.BatchServiceClient, dict, batchmodels.CloudPool, # list) -> None """Update docker images in pool over ssh :param batch_client: The batch client to use. 
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict :param batchmodels.CloudPool pool: cloud pool :param list cmd: command """ _pool = settings.pool_settings(config) # get ssh settings username = _pool.ssh.username if util.is_none_or_empty(username): raise ValueError( 'cannot update container images without an SSH username') ssh_private_key = _pool.ssh.ssh_private_key if ssh_private_key is None: ssh_private_key = pathlib.Path( _pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix()) if not ssh_private_key.exists(): raise RuntimeError('SSH private key file not found at: {}'.format( ssh_private_key)) command = ['sudo', '/bin/bash -c "{}"'.format(' && '.join(cmd))] if settings.verbose(config): logger.debug('executing command: {}'.format(command)) # iterate through all nodes nodes = batch_client.compute_node.list(pool.id) procs = [] failures = False for node in nodes: rls = batch_client.compute_node.get_remote_login_settings( pool.id, node.id) procs.append(crypto.connect_or_exec_ssh_command( rls.remote_login_ip_address, rls.remote_login_port, ssh_private_key, username, sync=False, command=command)) if len(procs) >= 40: logger.debug('waiting for {} update processes to complete'.format( len(procs))) rcs = util.subprocess_wait_all(procs, poll=False) if any([x != 0 for x in rcs]): failures = True procs = [] del rcs if len(procs) > 0: logger.debug('waiting for {} update processes to complete'.format( len(procs))) rcs = util.subprocess_wait_all(procs, poll=False) if any([x != 0 for x in rcs]): failures = True procs = [] del rcs if failures: raise RuntimeError( 'failures detected updating container image on pool: {}'.format( pool.id)) else: logger.info('container image update completed for pool: {}'.format( pool.id)) def _update_container_images( batch_client, config, docker_image=None, docker_image_digest=None, singularity_image=None, force_ssh=False): # type: (batchsc.BatchServiceClient, dict, str, str, str, bool) -> None """Update container images in pool :param batch_client: The batch client to use. 
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict :param str docker_image: docker image to update :param str docker_image_digest: digest to update to :param str singularity_image: singularity image to update :param bool force_ssh: force update over SSH """ # first check that peer-to-peer is disabled for pool pool_id = settings.pool_id(config) try: if settings.data_replication_settings(config).peer_to_peer.enabled: raise RuntimeError( 'cannot update container images for a pool with peer-to-peer ' 'image distribution') except KeyError: pass native = settings.is_native_docker_pool(config) if native and not force_ssh: logger.debug('forcing update via SSH due to native mode') force_ssh = True # if image is not specified use images from global config singularity_images = None if util.is_none_or_empty(docker_image): docker_images = settings.global_resources_docker_images(config) else: # log warning if it doesn't exist in global resources if docker_image not in settings.global_resources_docker_images(config): logger.warning( ('docker image {} is not specified as a global resource ' 'for pool {}').format(docker_image, pool_id)) if docker_image_digest is None: docker_images = [docker_image] else: docker_images = ['{}@{}'.format(docker_image, docker_image_digest)] if util.is_none_or_empty(singularity_image): singularity_images = settings.global_resources_singularity_images( config) else: # log warning if it doesn't exist in global resources if (singularity_image not in settings.global_resources_singularity_images(config)): logger.warning( ('singularity image {} is not specified as a global resource ' 'for pool {}').format(singularity_image, pool_id)) singularity_images = [singularity_image] if (util.is_none_or_empty(docker_images) and util.is_none_or_empty(singularity_images)): logger.error('no images detected or specified to update') return # get pool current dedicated pool = batch_client.pool.get(pool_id) # check pool current vms is > 0. There is no reason to run updateimages # if pool has no nodes in it. When the pool is resized up, the nodes # will always fetch either :latest if untagged or the latest :tag if # updated in the upstream registry if (pool.current_dedicated_nodes == 0 and pool.current_low_priority_nodes == 0): logger.warning( ('not executing updateimages command as the current number of ' 'compute nodes is zero for pool {}').format(pool_id)) return # force ssh on some paths if not force_ssh: if pool.current_low_priority_nodes > 0: logger.debug('forcing update via SSH due to low priority nodes') force_ssh = True if (pool.current_dedicated_nodes > 1 and not pool.enable_inter_node_communication): logger.debug( 'forcing update via SSH due to non-internode communication ' 'enabled pool') force_ssh = True # check pool metadata version if util.is_none_or_empty(pool.metadata): logger.warning('pool version metadata not present') else: for md in pool.metadata: if (md.name == settings.get_metadata_version_name() and md.value != __version__): logger.warning( 'pool version metadata mismatch: pool={} cli={}'.format( md.value, __version__)) break # perform windows compat checks is_windows = settings.is_windows_pool(config) if is_windows: if force_ssh: raise RuntimeError('cannot update images via SSH on windows') if util.is_not_empty(singularity_images): raise RuntimeError( 'invalid configuration: windows pool with singularity images') # create coordination command line # 1. log in again in case of cred expiry # 2.
pull images with respect to registry # 3. tag images that are in a private registry # 4. prune docker images with no tag taskenv, coordcmd = batch.generate_docker_login_settings(config, force_ssh) if util.is_not_empty(docker_images): coordcmd.extend(['docker pull {}'.format(x) for x in docker_images]) coordcmd.append( 'docker images --filter dangling=true -q --no-trunc | ' 'xargs --no-run-if-empty docker rmi') if util.is_not_empty(singularity_images): coordcmd.extend([ 'export SINGULARITY_TMPDIR={}'.format( settings.get_singularity_tmpdir(config)), 'export SINGULARITY_CACHEDIR={}'.format( settings.get_singularity_cachedir(config)), ]) coordcmd.extend( ['singularity pull -F {}'.format(x) for x in singularity_images] ) coordcmd.append('chown -R _azbatch:_azbatchgrp {}'.format( settings.get_singularity_cachedir(config))) if force_ssh: _update_container_images_over_ssh(batch_client, config, pool, coordcmd) return if is_windows: coordcmd.append('copy /y nul .update_images_success') else: coordcmd.append('touch .update_images_success') # update taskenv for Singularity taskenv.append( batchmodels.EnvironmentSetting( 'SINGULARITY_TMPDIR', settings.get_singularity_tmpdir(config) ) ) taskenv.append( batchmodels.EnvironmentSetting( 'SINGULARITY_CACHEDIR', settings.get_singularity_cachedir(config) ) ) coordcmd = util.wrap_commands_in_shell(coordcmd, windows=is_windows) # create job for update job_id = 'shipyard-updateimages-{}'.format(uuid.uuid4()) job = batchmodels.JobAddParameter( id=job_id, pool_info=batchmodels.PoolInformation(pool_id=pool_id), ) # create task batchtask = batchmodels.TaskAddParameter( id='update-container-images', command_line=coordcmd, environment_settings=taskenv, user_identity=batch._RUN_ELEVATED, ) # create multi-instance task for pools with more than 1 node if pool.current_dedicated_nodes > 1: batchtask.multi_instance_settings = batchmodels.MultiInstanceSettings( number_of_instances=pool.current_dedicated_nodes, coordination_command_line=coordcmd, ) # create application command line if is_windows: appcmd = util.wrap_commands_in_shell([ 'if not exist %AZ_BATCH_TASK_WORKING_DIR%\\' '.update_images_success exit 1' ], windows=is_windows) else: appcmd = util.wrap_commands_in_shell([ '[[ -f $AZ_BATCH_TASK_WORKING_DIR/.update_images_success ]] ' '|| exit 1' ], windows=is_windows) batchtask.command_line = appcmd # add job and task batch_client.job.add(job) batch_client.task.add(job_id=job_id, task=batchtask) logger.debug( ('waiting for update container images task {} in job {} ' 'to complete').format(batchtask.id, job_id)) # wait for task to complete while True: batchtask = batch_client.task.get(job_id, batchtask.id) if batchtask.state == batchmodels.TaskState.completed: break time.sleep(1) # ensure all nodes have success file if multi-instance success = True if pool.current_dedicated_nodes > 1: if is_windows: sep = '\\' else: sep = '/' uis_file = sep.join( ('workitems', job_id, 'job-1', batchtask.id, 'wd', '.update_images_success') ) nodes = batch_client.compute_node.list(pool_id) for node in nodes: try: batch_client.file.get_properties_from_compute_node( pool_id, node.id, uis_file) except batchmodels.BatchErrorException: logger.error( 'update images success file absent on node {}'.format( node.id)) success = False break else: task = batch_client.task.get(job_id, batchtask.id) if task.execution_info is None or task.execution_info.exit_code != 0: success = False # stream stderr to console batch.stream_file_and_wait_for_task( batch_client, config, 
'{},{},stderr.txt'.format(job_id, batchtask.id)) # delete job batch_client.job.delete(job_id) if not success: raise RuntimeError('update container images job failed') logger.info( 'update container images task {} in job {} completed'.format( batchtask.id, job_id)) def _list_docker_images(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """List Docker images in pool over ssh :param batch_client: The batch client to use. :type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict """ _pool = settings.pool_settings(config) pool = batch_client.pool.get(_pool.id) if (pool.current_dedicated_nodes == 0 and pool.current_low_priority_nodes == 0): logger.warning('pool {} has no compute nodes'.format(pool.id)) return is_windows = settings.is_windows_pool(config) # TODO temporarily disable listimages with windows pools if is_windows: raise RuntimeError( 'listing images is currently not supported for windows pools') # get ssh settings username = _pool.ssh.username if util.is_none_or_empty(username): raise ValueError('cannot list docker images without an SSH username') ssh_private_key = _pool.ssh.ssh_private_key if ssh_private_key is None: ssh_private_key = pathlib.Path( _pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix()) if not ssh_private_key.exists(): raise RuntimeError('SSH private key file not found at: {}'.format( ssh_private_key)) # iterate through all nodes nodes = batch_client.compute_node.list(pool.id) procs = {} stdout = {} failures = False for node in nodes: rls = batch_client.compute_node.get_remote_login_settings( pool.id, node.id) procs[node.id] = crypto.connect_or_exec_ssh_command( rls.remote_login_ip_address, rls.remote_login_port, ssh_private_key, username, sync=False, command=[ 'sudo', 'docker', 'images', '--format', '"{{.ID}} {{.Repository}}:{{.Tag}}"' ]) if len(procs) >= 40: logger.debug('waiting for {} processes to complete'.format( len(procs))) for key in procs: stdout[key] = procs[key].communicate()[0].decode( 'utf8').split('\n') rcs = util.subprocess_wait_all(list(procs.values())) if any([x != 0 for x in rcs]): failures = True procs.clear() del rcs if len(procs) > 0: logger.debug('waiting for {} processes to complete'.format( len(procs))) for key in procs: stdout[key] = procs[key].communicate()[0].decode( 'utf8').split('\n') rcs = util.subprocess_wait_all(list(procs.values())) if any([x != 0 for x in rcs]): failures = True procs.clear() del rcs if failures: raise RuntimeError( 'failures retrieving docker images on pool: {}'.format( pool.id)) # process stdout node_images = {} all_images = {} for key in stdout: node_images[key] = set() for out in stdout[key]: if util.is_not_empty(out): dec = out.split() if (not dec[1].startswith('alfpark/batch-shipyard') and not dec[1].startswith('alfpark/blobxfer')): node_images[key].add(dec[0]) if dec[0] not in all_images: all_images[dec[0]] = dec[1] # find set intersection among all nodes intersecting_images = set.intersection(*list(node_images.values())) logger.info('Common Docker images across all nodes in pool {}:{}{}'.format( pool.id, os.linesep, os.linesep.join( ['{} {}'.format(key, all_images[key]) for key in intersecting_images]) )) # find mismatched images on nodes for node in node_images: images = set(node_images[node]) diff = images.difference(intersecting_images) if len(diff) > 0: logger.warning('Docker images present only on node {}:{}{}'.format( node, os.linesep, os.linesep.join( ['{} 
{}'.format(key, all_images[key]) for key in diff]) )) def _adjust_settings_for_pool_creation(config): # type: (dict) -> None """Adjust settings for pool creation :param dict config: configuration dict """ # get settings pool = settings.pool_settings(config) publisher = settings.pool_publisher(config, lower=True) offer = settings.pool_offer(config, lower=True) sku = settings.pool_sku(config, lower=True) node_agent = settings.pool_custom_image_node_agent(config) if util.is_not_empty(node_agent) and util.is_not_empty(sku): raise ValueError( 'cannot specify both a platform_image and a custom_image in the ' 'pool specification') is_windows = settings.is_windows_pool(config) bs = settings.batch_shipyard_settings(config) # enforce publisher/offer/sku restrictions allowed = False shipyard_container_required = True # oracle linux is not supported due to UEKR4 requirement if publisher == 'canonical': if offer == 'ubuntuserver': if sku.startswith('14.04'): allowed = True elif sku.startswith('16.04'): allowed = True shipyard_container_required = False elif publisher == 'credativ': if offer == 'debian': if sku >= '8': allowed = True elif publisher == 'openlogic': if offer.startswith('centos'): if sku >= '7': allowed = True elif publisher == 'redhat': if offer == 'rhel': if sku >= '7': allowed = True elif publisher == 'suse': if offer.startswith('sles'): if sku >= '12-sp1': allowed = True elif offer == 'opensuse-leap': if sku >= '42': allowed = True elif publisher == 'microsoftwindowsserver': if offer == 'windowsserver': if sku == '2016-datacenter-with-containers': allowed = True # check if allowed for gpu (if gpu vm size) if allowed: allowed = settings.gpu_configuration_check( config, vm_size=pool.vm_size) if not allowed and util.is_none_or_empty(node_agent): raise ValueError( ('unsupported Docker (and/or GPU) Host VM Config, publisher={} ' 'offer={} sku={} vm_size={}').format( publisher, offer, sku, pool.vm_size)) # ensure HPC offers are matched with RDMA sizes if (not is_windows and ( (offer == 'centos-hpc' or offer == 'sles-hpc') and not settings.is_rdma_pool(pool.vm_size))): raise ValueError( ('cannot allocate an HPC VM config of publisher={} offer={} ' 'sku={} with a non-RDMA vm_size={}').format( publisher, offer, sku, pool.vm_size)) # compute total vm count pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority # adjust for shipyard container requirement if (not bs.use_shipyard_docker_image and (shipyard_container_required or util.is_not_empty(node_agent))): settings.set_use_shipyard_docker_image(config, True) logger.debug( ('forcing shipyard docker image to be used due to ' 'VM config, publisher={} offer={} sku={}').format( publisher, offer, sku)) # re-read pool and data replication settings pool = settings.pool_settings(config) dr = settings.data_replication_settings(config) native = settings.is_native_docker_pool( config, vm_config=pool.vm_configuration) # ensure singularity images are not specified for native pools if native: images = settings.global_resources_singularity_images(config) if util.is_not_empty(images): raise ValueError( 'cannot specify a native container pool with Singularity ' 'images as global resources') # ensure settings p2p/as/internode settings are compatible if dr.peer_to_peer.enabled: if native: raise ValueError( 'cannot enable peer-to-peer and native container pools') if settings.is_pool_autoscale_enabled(config, pas=pool.autoscale): raise ValueError('cannot enable peer-to-peer and autoscale') if pool.inter_node_communication_enabled: logger.warning( 
'force enabling inter-node communication due to peer-to-peer ' 'transfer') settings.set_inter_node_communication_enabled(config, True) # hpn-ssh can only be used for Ubuntu currently try: if (pool.ssh.hpn_server_swap and ((publisher != 'canonical' and offer != 'ubuntuserver') or util.is_not_empty(node_agent))): logger.warning('cannot enable HPN SSH swap on {} {} {}'.format( publisher, offer, sku)) settings.set_hpn_server_swap(config, False) except KeyError: pass # force disable block for global resources if ingressing data if (pool.transfer_files_on_pool_creation and pool.block_until_all_global_resources_loaded): logger.warning( 'disabling block until all global resources loaded with ' 'transfer files on pool creation enabled') settings.set_block_until_all_global_resources_loaded(config, False) # re-read pool settings pool = settings.pool_settings(config) # ensure internode is not enabled for mix node pools if (pool.inter_node_communication_enabled and pool.vm_count.dedicated > 0 and pool.vm_count.low_priority > 0): raise ValueError( 'inter node communication cannot be enabled with both ' 'dedicated and low priority nodes') # check shared data volume settings try: num_gluster = 0 sdv = settings.global_resources_shared_data_volumes(config) for sdvkey in sdv: if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey): if is_windows: raise ValueError( 'glusterfs on compute is not supported on windows') if settings.is_pool_autoscale_enabled( config, pas=pool.autoscale): raise ValueError( 'glusterfs on compute cannot be installed on an ' 'autoscale-enabled pool') if not pool.inter_node_communication_enabled: # do not modify value and proceed since this interplays # with p2p settings, simply raise exception and force # user to reconfigure raise ValueError( 'inter node communication in pool configuration ' 'must be enabled for glusterfs on compute') if pool.vm_count.low_priority > 0: raise ValueError( 'glusterfs on compute cannot be installed on pools ' 'with low priority nodes') if pool.vm_count.dedicated <= 1: raise ValueError( 'vm_count dedicated should exceed 1 for glusterfs ' 'on compute') if pool.max_tasks_per_node > 1: raise ValueError( 'max_tasks_per_node cannot exceed 1 for glusterfs ' 'on compute') num_gluster += 1 try: if settings.gluster_volume_type(sdv, sdvkey) != 'replica': raise ValueError( 'only replicated GlusterFS volumes are ' 'currently supported') except KeyError: pass elif settings.is_shared_data_volume_storage_cluster(sdv, sdvkey): if is_windows: raise ValueError( 'storage cluster mounting is not supported on windows') elif settings.is_shared_data_volume_azure_blob(sdv, sdvkey): if is_windows: raise ValueError( 'azure blob mounting is not supported on windows') if native: raise ValueError( 'azure blob mounting is not supported on native ' 'container pools') if offer == 'ubuntuserver': if sku < '16.04-lts': raise ValueError( ('azure blob mounting is not supported ' 'on publisher={} offer={} sku={}').format( publisher, offer, sku)) elif not offer.startswith('centos'): raise ValueError( ('azure blob mounting is not supported ' 'on publisher={} offer={} sku={}').format( publisher, offer, sku)) elif settings.is_shared_data_volume_custom_linux_mount( sdv, sdvkey): if is_windows: raise ValueError( 'custom linux mounting is not supported on windows') if num_gluster > 1: raise ValueError( 'cannot create more than one GlusterFS on compute volume ' 'per pool') except KeyError: pass # check data ingress on pool creation on windows if is_windows and 
pool.transfer_files_on_pool_creation: raise ValueError( 'cannot transfer files on pool creation to windows compute nodes') # check singularity images are not present for windows if (is_windows and util.is_not_empty( settings.global_resources_singularity_images(config))): raise ValueError('cannot deploy Singularity images on windows pools') # check pool count of 0 and remote login if pool_total_vm_count == 0: if is_windows: # TODO RDP check pass else: if util.is_not_empty(pool.ssh.username): logger.warning('cannot add SSH user with zero target nodes') # ensure unusable recovery is not enabled for custom image if (pool.attempt_recovery_on_unusable and not settings.is_platform_image( config, vm_config=pool.vm_configuration)): logger.warning( 'override attempt recovery on unusable due to custom image') settings.set_attempt_recovery_on_unusable(config, False) # TODO temporarily disable credential encryption with windows if is_windows and settings.batch_shipyard_encryption_enabled(config): raise ValueError( 'cannot enable credential encryption with windows pools') def _check_settings_for_auto_pool(config): # type: (dict) -> None """Check settings for autopool :param dict config: configuration dict """ # check glusterfs on compute try: sdv = settings.global_resources_shared_data_volumes(config) for sdvkey in sdv: if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey): raise ValueError( 'GlusterFS on compute is not possible with autopool') break except KeyError: pass # get settings pool = settings.pool_settings(config) # check local data movement to pool if pool.transfer_files_on_pool_creation: raise ValueError('Cannot ingress data on pool creation with autopool') # check ssh if util.is_not_empty(pool.ssh.username): logger.warning('cannot add SSH user with autopool') def _check_resource_client(resource_client): # type: (azure.mgmt.resource.resources.ResourceManagementClient) -> None """Check resource client validity""" if resource_client is None: raise RuntimeError( 'resource management client is invalid, ensure you have ' 'specified proper "management" credentials') def _check_compute_client(compute_client): # type: (azure.mgmt.resource.compute.ComputeManagementClient) -> None """Check compute client validity""" if compute_client is None: raise RuntimeError( 'compute management client is invalid, ensure you have ' 'specified proper "management" credentials') def _check_network_client(network_client): # type: (azure.mgmt.resource.network.NetworkManagementClient) -> None """Check network client validity""" if network_client is None: raise RuntimeError( 'network management client is invalid, ensure you have ' 'specified proper "management" credentials') def _check_keyvault_client(keyvault_client): # type: (azure.keyvault.KeyVaultClient) -> None """Check keyvault client validity""" if keyvault_client is None: raise RuntimeError( 'keyvault client is invalid, ensure you have specified ' 'proper "keyvault" credentials') def _check_batch_client(batch_client): # type: (batchsc.BatchServiceClient) -> None """Check batch client validity""" if batch_client is None: raise RuntimeError( 'batch client is invalid, ensure you have specified ' 'proper "batch" credentials') def action_fs_disks_add(resource_client, compute_client, config): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, dict) -> None """Action: Fs Disks Add :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param 
azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param dict config: configuration dict """ _check_resource_client(resource_client) _check_compute_client(compute_client) remotefs.create_managed_disks(resource_client, compute_client, config) def action_fs_disks_del( compute_client, config, name, resource_group, all, wait): # type: (azure.mgmt.compute.ComputeManagementClient, dict, str, # str, bool, bool) -> None """Action: Fs Disks Del :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param dict config: configuration dict :param str name: disk name :param str resource_group: resource group :param bool all: delete all in resource group :param bool wait: wait for operation to complete """ _check_compute_client(compute_client) remotefs.delete_managed_disks( compute_client, config, name, resource_group, all, wait, confirm_override=False) def action_fs_disks_list( compute_client, config, resource_group, restrict_scope): # type: (azure.mgmt.compute.ComputeManagementClient, dict, str, # bool) -> None """Action: Fs Disks List :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param dict config: configuration dict :param str resource_group: resource group :param bool restrict_scope: restrict scope to config """ _check_compute_client(compute_client) remotefs.list_disks(compute_client, config, resource_group, restrict_scope) def action_fs_cluster_add( resource_client, compute_client, network_client, blob_client, config, storage_cluster_id): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.storage.blob.BlockBlobService, dict, str) -> None """Action: Fs Cluster Add :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id """ _check_resource_client(resource_client) _check_compute_client(compute_client) _check_network_client(network_client) storage.set_storage_remotefs_container(storage_cluster_id) remotefs.create_storage_cluster( resource_client, compute_client, network_client, blob_client, config, storage_cluster_id, _REMOTEFSPREP_FILE[0], _ALL_REMOTEFS_FILES) def action_fs_cluster_resize( compute_client, network_client, blob_client, config, storage_cluster_id): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.storage.blob.BlockBlobService, dict, str) -> None """Action: Fs Cluster Resize :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id """ _check_compute_client(compute_client) _check_network_client(network_client) remotefs.resize_storage_cluster( compute_client, network_client, blob_client, config, storage_cluster_id, _REMOTEFSPREP_FILE[0], _REMOTEFSADDBRICK_FILE[0], _ALL_REMOTEFS_FILES) def action_fs_cluster_del( resource_client, compute_client, network_client, blob_client, config, storage_cluster_id, delete_all_resources, 
delete_data_disks, delete_virtual_network, generate_from_prefix, wait): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.storage.blob.BlockBlobService, dict, str, bool, bool, # bool, bool, bool) -> None """Action: Fs Cluster Del :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id :param bool delete_all_resources: delete all resources :param bool delete_data_disks: delete data disks :param bool delete_virtual_network: delete virtual network :param bool generate_from_prefix: generate resources from hostname prefix :param bool wait: wait for deletion to complete """ _check_resource_client(resource_client) _check_compute_client(compute_client) _check_network_client(network_client) if (generate_from_prefix and (delete_all_resources or delete_data_disks or delete_virtual_network)): raise ValueError( 'Cannot specify generate_from_prefix and a delete_* option') storage.set_storage_remotefs_container(storage_cluster_id) remotefs.delete_storage_cluster( resource_client, compute_client, network_client, blob_client, config, storage_cluster_id, delete_data_disks=delete_data_disks, delete_virtual_network=delete_virtual_network, delete_resource_group=delete_all_resources, generate_from_prefix=generate_from_prefix, wait=wait) def action_fs_cluster_expand( compute_client, network_client, config, storage_cluster_id, rebalance): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, str, # bool) -> None """Action: Fs Cluster Expand :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id :param bool rebalance: rebalance filesystem """ _check_compute_client(compute_client) _check_network_client(network_client) if remotefs.expand_storage_cluster( compute_client, network_client, config, storage_cluster_id, _REMOTEFSPREP_FILE[0], rebalance): action_fs_cluster_status( compute_client, network_client, config, storage_cluster_id, detail=True, hosts=False) def action_fs_cluster_suspend( compute_client, config, storage_cluster_id, wait): # type: (azure.mgmt.compute.ComputeManagementClient, dict, str, # bool) -> None """Action: Fs Cluster Suspend :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id :param bool wait: wait for suspension to complete """ _check_compute_client(compute_client) remotefs.suspend_storage_cluster( compute_client, config, storage_cluster_id, wait) def action_fs_cluster_start( compute_client, network_client, config, storage_cluster_id, wait): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, str, # bool) -> None """Action: Fs Cluster Start :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: 
configuration dict :param str storage_cluster_id: storage cluster id :param bool wait: wait for restart to complete """ _check_compute_client(compute_client) _check_network_client(network_client) remotefs.start_storage_cluster( compute_client, config, storage_cluster_id, wait) if wait: action_fs_cluster_status( compute_client, network_client, config, storage_cluster_id, detail=True, hosts=False) def action_fs_cluster_status( compute_client, network_client, config, storage_cluster_id, detail, hosts): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, str, bool, # bool) -> None """Action: Fs Cluster Status :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id :param bool detail: detailed status :param bool hosts: dump info for /etc/hosts """ _check_compute_client(compute_client) _check_network_client(network_client) remotefs.stat_storage_cluster( compute_client, network_client, config, storage_cluster_id, _REMOTEFSSTAT_FILE[0], detail, hosts) def action_fs_cluster_ssh( compute_client, network_client, config, storage_cluster_id, cardinal, hostname, tty, command): # type: (azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, str, int, # str, bool, tuple) -> None """Action: Fs Cluster Ssh :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: configuration dict :param str storage_cluster_id: storage cluster id :param int cardinal: cardinal number :param str hostname: hostname :param bool tty: allocate pseudo-tty :param tuple command: command """ _check_compute_client(compute_client) _check_network_client(network_client) if cardinal is not None and hostname is not None: raise ValueError('cannot specify both cardinal and hostname options') if cardinal is None and hostname is None: logger.warning( 'assuming node cardinal of 0 as no cardinal or hostname option ' 'was specified') cardinal = 0 if cardinal is not None and cardinal < 0: raise ValueError('invalid cardinal option value') remotefs.ssh_storage_cluster( compute_client, network_client, config, storage_cluster_id, cardinal, hostname, tty, command) def action_keyvault_add(keyvault_client, config, keyvault_uri, name): # type: (azure.keyvault.KeyVaultClient, dict, str, str) -> None """Action: Keyvault Add :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param dict config: configuration dict :param str keyvault_uri: keyvault uri :param str name: secret name """ _check_keyvault_client(keyvault_client) keyvault.store_credentials_conf( keyvault_client, config, keyvault_uri, name) def action_keyvault_del(keyvault_client, keyvault_uri, name): # type: (azure.keyvault.KeyVaultClient, str, str) -> None """Action: Keyvault Del :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param str keyvault_uri: keyvault uri :param str name: secret name """ _check_keyvault_client(keyvault_client) keyvault.delete_secret(keyvault_client, keyvault_uri, name) def action_keyvault_list(keyvault_client, keyvault_uri): # type: (azure.keyvault.KeyVaultClient, str) -> None """Action: Keyvault List :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param str keyvault_uri: keyvault uri """ 
_check_keyvault_client(keyvault_client) keyvault.list_secrets(keyvault_client, keyvault_uri) def action_cert_create(config): # type: (dict) -> None """Action: Cert Create :param dict config: configuration dict """ sha1tp = crypto.generate_pem_pfx_certificates(config) logger.info('SHA1 Thumbprint: {}'.format(sha1tp)) def action_cert_add(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Cert Add :param azure.batch.batch_service_client.BatchServiceClient: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.add_certificate_to_account(batch_client, config, False) def action_cert_list(batch_client): # type: (batchsc.BatchServiceClient) -> None """Action: Cert List :param azure.batch.batch_service_client.BatchServiceClient: batch client """ _check_batch_client(batch_client) batch.list_certificates_in_account(batch_client) def action_cert_del(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Cert Del :param azure.batch.batch_service_client.BatchServiceClient: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.del_certificate_from_account(batch_client, config) def action_pool_listskus(batch_client): # type: (batchsc.BatchServiceClient) -> None """Action: Pool Listskus :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client """ _check_batch_client(batch_client) batch.list_node_agent_skus(batch_client) def action_pool_add( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, table_client, config): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, # azure.batch.batch_service_client.BatchServiceClient, # azureblob.BlockBlobService, azuretable.TableService, # dict) -> None """Action: Pool Add :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param dict config: configuration dict """ _check_batch_client(batch_client) # first check if pool exists to prevent accidental metadata clear if batch_client.pool.exists(settings.pool_id(config)): raise RuntimeError( 'attempting to create a pool that already exists: {}'.format( settings.pool_id(config))) _adjust_settings_for_pool_creation(config) storage.create_storage_containers(blob_client, table_client, config) storage.clear_storage_containers(blob_client, table_client, config) if not settings.is_native_docker_pool(config): storage.populate_global_resource_blobs( blob_client, table_client, config) _add_pool( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config ) def action_pool_list(batch_client): # type: (batchsc.BatchServiceClient) -> None """Action: Pool List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client """ _check_batch_client(batch_client) batch.list_pools(batch_client) def action_pool_delete( batch_client, 
blob_client, table_client, config, pool_id=None, wait=False): # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService, # azuretable.TableService, dict, str, bool) -> None """Action: Pool Delete :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param dict config: configuration dict :param str pool_id: poolid to delete :param bool wait: wait for pool to delete """ _check_batch_client(batch_client) deleted = False try: deleted = batch.del_pool(batch_client, config, pool_id=pool_id) except batchmodels.BatchErrorException as ex: if ('The specified pool does not exist' in ex.message.value or 'The specified pool has been marked for deletion' in ex.message.value): deleted = True else: logger.exception(ex) if deleted: # reset storage settings to target poolid if required if util.is_not_empty(pool_id): populate_global_settings(config, False, pool_id=pool_id) else: pool_id = settings.pool_id(config) storage.cleanup_with_del_pool( blob_client, table_client, config, pool_id=pool_id) if wait: logger.debug('waiting for pool {} to delete'.format(pool_id)) while batch_client.pool.exists(pool_id): time.sleep(3) def action_pool_resize(batch_client, blob_client, config, wait): # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService, # dict, bool) -> None """Resize pool that may contain glusterfs :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict :param bool wait: wait for operation to complete """ _check_batch_client(batch_client) pool = settings.pool_settings(config) # check direction of resize _pool = batch_client.pool.get(pool.id) if (pool.vm_count.dedicated == _pool.current_dedicated_nodes == _pool.target_dedicated_nodes and pool.vm_count.low_priority == _pool.current_low_priority_nodes == _pool.target_low_priority_nodes): logger.error( 'pool {} is already at {} nodes'.format(pool.id, pool.vm_count)) return resize_up_d = False resize_up_lp = False if pool.vm_count.dedicated > _pool.current_dedicated_nodes: resize_up_d = True if pool.vm_count.low_priority > _pool.current_low_priority_nodes: resize_up_lp = True del _pool create_ssh_user = False # try to get handle on public key, avoid generating another set # of keys if resize_up_d or resize_up_lp: if pool.ssh.username is None: logger.info('not creating ssh user on new nodes of pool {}'.format( pool.id)) else: if pool.ssh.ssh_public_key is None: sfp = pathlib.Path(crypto.get_ssh_key_prefix() + '.pub') if sfp.exists(): logger.debug( 'setting public key for ssh user to: {}'.format(sfp)) settings.set_ssh_public_key(config, str(sfp)) create_ssh_user = True else: logger.warning( ('not creating ssh user for new nodes of pool {} as ' 'an existing ssh public key cannot be found').format( pool.id)) create_ssh_user = False # check if this is a glusterfs-enabled pool gluster_present = False voltype = None try: sdv = settings.global_resources_shared_data_volumes(config) for sdvkey in sdv: if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey): gluster_present = True try: voltype = settings.gluster_volume_type(sdv, sdvkey) except KeyError: pass break except KeyError: pass logger.debug('glusterfs shared volume present: {}'.format( gluster_present)) if gluster_present: if resize_up_lp: raise RuntimeError( 'cannot resize up a pool with 
glusterfs_on_compute and ' 'low priority nodes') logger.debug('forcing wait to True due to glusterfs') wait = True # cache old nodes old_nodes = {} if gluster_present or create_ssh_user: for node in batch_client.compute_node.list(pool.id): old_nodes[node.id] = node.ip_address # resize pool nodes = batch.resize_pool(batch_client, config, wait) # add ssh user to new nodes if present if create_ssh_user and (resize_up_d or resize_up_lp): if wait: # get list of new nodes only new_nodes = [node for node in nodes if node.id not in old_nodes] # create admin user on each new node if requested batch.add_ssh_user(batch_client, config, nodes=new_nodes) # log remote login settings for new nodes batch.get_remote_login_settings( batch_client, config, nodes=new_nodes) del new_nodes else: logger.warning('ssh user was not added as --wait was not given') # add brick for new nodes if gluster_present and resize_up_d: # get pool current dedicated _pool = batch_client.pool.get(pool.id) # ensure current dedicated is the target if pool.vm_count.dedicated != _pool.current_dedicated_nodes: raise RuntimeError( ('cannot perform glusterfs setup on new nodes, unexpected ' 'current dedicated {} to vm_count {}').format( _pool.current_dedicated_nodes, pool.vm_count.dedicated)) del _pool # get internal ip addresses of new nodes new_nodes = [ node.ip_address for node in nodes if node.id not in old_nodes ] masterip = next(iter(old_nodes.values())) # get tempdisk mountpoint tempdisk = settings.temp_disk_mountpoint(config) # construct cmdline cmdline = util.wrap_commands_in_shell([ '$AZ_BATCH_TASK_DIR/{} {} {} {} {} {}'.format( _GLUSTERRESIZE_FILE[0], voltype.lower(), tempdisk, pool.vm_count.dedicated, masterip, ' '.join(new_nodes))]) # setup gluster _setup_glusterfs( batch_client, blob_client, config, nodes, _GLUSTERRESIZE_FILE, cmdline=cmdline) def action_pool_nodes_grls(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Pool Nodes Grls :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.get_remote_login_settings(batch_client, config) batch.generate_ssh_tunnel_script( batch_client, settings.pool_settings(config), None, None) def action_pool_nodes_list(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Pool Nodes List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.list_nodes(batch_client, config) def action_pool_user_add(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Pool User Add :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) if settings.is_windows_pool(config): batch.add_rdp_user(batch_client, config) else: batch.add_ssh_user(batch_client, config) def action_pool_user_del(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Pool User Del :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) if settings.is_windows_pool(config): batch.del_rdp_user(batch_client, config) else: batch.del_ssh_user(batch_client, config) def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command): # type: (batchsc.BatchServiceClient, dict, int, str, 
bool, tuple) -> None """Action: Pool Ssh :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param int cardinal: cardinal node num :param str nodeid: node id :param bool tty: allocate pseudo-tty :param tuple command: command to execute """ _check_batch_client(batch_client) if cardinal is not None and nodeid is not None: raise ValueError('cannot specify both cardinal and nodeid options') if cardinal is None and nodeid is None: logger.warning( 'assuming node cardinal of 0 as no cardinal or nodeid option ' 'was specified') cardinal = 0 if cardinal is not None and cardinal < 0: raise ValueError('invalid cardinal option value') pool = settings.pool_settings(config) ssh_private_key = pool.ssh.ssh_private_key if ssh_private_key is None: ssh_private_key = pathlib.Path( pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix()) ip, port = batch.get_remote_login_setting_for_node( batch_client, config, cardinal, nodeid) crypto.connect_or_exec_ssh_command( ip, port, ssh_private_key, pool.ssh.username, tty=tty, command=command) def action_pool_nodes_del( batch_client, config, all_start_task_failed, all_starting, all_unusable, nodeid): # type: (batchsc.BatchServiceClient, dict, bool, bool, bool, str) -> None """Action: Pool Nodes Del :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool all_start_task_failed: delete all start task failed nodes :param bool all_starting: delete all starting nodes :param bool all_unusable: delete all unusable nodes :param str nodeid: nodeid to delete """ _check_batch_client(batch_client) if ((all_start_task_failed or all_starting or all_unusable) and nodeid is not None): raise ValueError( 'cannot specify all start task failed nodes or unusable with ' 'a specific node id') batch.del_node( batch_client, config, all_start_task_failed, all_starting, all_unusable, nodeid) def action_pool_nodes_reboot( batch_client, config, all_start_task_failed, nodeid): # type: (batchsc.BatchServiceClient, dict, bool, str) -> None """Action: Pool Nodes Reboot :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool all_start_task_failed: reboot all start task failed nodes :param str nodeid: nodeid to reboot """ _check_batch_client(batch_client) if all_start_task_failed and nodeid is not None: raise ValueError( 'cannot specify all start task failed nodes with a specific ' 'node id') batch.reboot_nodes(batch_client, config, all_start_task_failed, nodeid) def action_pool_images_update( batch_client, config, docker_image, docker_image_digest, singularity_image, ssh): # type: (batchsc.BatchServiceClient, dict, str, str, str, bool) -> None """Action: Pool Images Update :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str docker_image: docker image to update :param str docker_image_digest: docker image digest to update to :param str singularity_image: singularity image to update :param bool ssh: use direct SSH update mode """ _check_batch_client(batch_client) if docker_image_digest is not None and docker_image is None: raise ValueError( 'cannot specify a digest to update to without the image') _update_container_images( batch_client, config, docker_image, docker_image_digest, singularity_image, force_ssh=ssh) def action_pool_images_list(batch_client, config): # type: 
(batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Pool Images List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) _list_docker_images(batch_client, config) def action_pool_stats(batch_client, config, pool_id): # type: (batchsc.BatchServiceClient, dict, str) -> None """Action: Pool Stats :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str pool_id: pool id """ _check_batch_client(batch_client) batch.pool_stats(batch_client, config, pool_id=pool_id) def action_pool_autoscale_disable(batch_client, config): # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Pool Autoscale Disable :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.pool_autoscale_disable(batch_client, config) def action_pool_autoscale_enable(batch_client, config): # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Pool Autoscale Enable :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.pool_autoscale_enable(batch_client, config) def action_pool_autoscale_evaluate(batch_client, config): # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Pool Autoscale Evaluate :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.pool_autoscale_evaluate(batch_client, config) def action_pool_autoscale_lastexec(batch_client, config): # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Pool Autoscale Lastexec :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.pool_autoscale_lastexec(batch_client, config) def action_jobs_add( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, table_client, keyvault_client, config, recreate, tail): # type: (azure.mgmt.resource.resources.ResourceManagementClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, # azure.mgmt.batch.BatchManagementClient, # azure.batch.batch_service_client.BatchServiceClient, # azureblob.BlockBlobService, azuretable.TableService, # azure.keyvault.KeyVaultClient, dict, bool, str) -> None """Action: Jobs Add :param azure.mgmt.resource.resources.ResourceManagementClient resource_client: resource client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client :param dict config: configuration dict :param bool recreate: recreate jobs if completed :param str tail: file to tail or last job and task added """ _check_batch_client(batch_client) # check for job autopools autopool = 
batch.check_jobs_for_auto_pool(config) if autopool: # check to ensure pool id is within 20 chars pool_id = settings.pool_id(config) if len(pool_id) > 20: raise ValueError( 'pool id must be less than 21 characters: {}'.format(pool_id)) # check if a pool id with existing pool id exists try: batch_client.pool.get(pool_id) except batchmodels.BatchErrorException as ex: if 'The specified pool does not exist' in ex.message.value: pass else: raise RuntimeError( 'pool with id of {} already exists'.format(pool_id)) _adjust_settings_for_pool_creation(config) # create storage containers and clear storage.create_storage_containers(blob_client, table_client, config) storage.clear_storage_containers(blob_client, table_client, config) if not settings.is_native_docker_pool(config): storage.populate_global_resource_blobs( blob_client, table_client, config) # create autopool specification object autopool = _construct_auto_pool_specification( resource_client, compute_client, network_client, batch_mgmt_client, batch_client, blob_client, config ) # check settings and warn _check_settings_for_auto_pool(config) else: autopool = None # add jobs is_windows = settings.is_windows_pool(config) batch.add_jobs( batch_client, blob_client, keyvault_client, config, autopool, _IMAGE_BLOCK_FILE, _BLOBXFER_WINDOWS_FILE if is_windows else _BLOBXFER_FILE, recreate, tail) def action_jobs_list(batch_client, config): # type: (batchsc.BatchServiceClient, dict) -> None """Action: Jobs List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict """ _check_batch_client(batch_client) batch.list_jobs(batch_client, config) def action_jobs_tasks_list( batch_client, config, all, jobid, poll_until_tasks_complete): # type: (batchsc.BatchServiceClient, dict, bool, str, bool) -> None """Action: Jobs Tasks List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool all: all jobs :param str jobid: job id :param bool poll_until_tasks_complete: poll until tasks complete """ _check_batch_client(batch_client) if all and jobid is not None: raise ValueError('cannot specify both --all and --jobid') while True: all_complete = batch.list_tasks( batch_client, config, all=all, jobid=jobid) if not poll_until_tasks_complete or all_complete: break time.sleep(5) def action_jobs_tasks_term(batch_client, config, jobid, taskid, wait, force): # type: (batchsc.BatchServiceClient, dict, str, str, bool, bool) -> None """Action: Jobs Tasks Term :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id :param str taskid: task id :param bool wait: wait for action to complete :param bool force: force docker kill even if completed """ _check_batch_client(batch_client) if taskid is not None and jobid is None: raise ValueError( 'cannot specify a task to terminate without the corresponding ' 'job id') if force and (taskid is None or jobid is None): raise ValueError('cannot force docker kill without task id/job id') batch.terminate_tasks( batch_client, config, jobid=jobid, taskid=taskid, wait=wait, force=force) def action_jobs_tasks_del(batch_client, config, jobid, taskid, wait): # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None """Action: Jobs Tasks Del :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id :param str taskid: 
task id :param bool wait: wait for action to complete """ _check_batch_client(batch_client) if taskid is not None and jobid is None: raise ValueError( 'cannot specify a task to delete without the corresponding ' 'job id') batch.del_tasks( batch_client, config, jobid=jobid, taskid=taskid, wait=wait) def action_jobs_del_or_term( batch_client, blob_client, table_client, config, delete, all_jobs, all_jobschedules, jobid, jobscheduleid, termtasks, wait): # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService, # azuretable.TableService, dict, bool, bool, str, str, # bool, bool) -> None """Action: Jobs Del or Term :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param dict config: configuration dict :param bool all_jobs: all jobs :param bool all_jobschedules: all job schedules :param str jobid: job id :param str jobscheduleid: job schedule id :param bool termtasks: terminate tasks prior :param bool wait: wait for action to complete """ _check_batch_client(batch_client) if jobid is not None and jobscheduleid is not None: raise ValueError('cannot specify both --jobid and --jobscheduleid') if all_jobs: if jobid is not None: raise ValueError('cannot specify both --all-jobs and --jobid') batch.delete_or_terminate_all_jobs( batch_client, config, delete, termtasks=termtasks, wait=wait) elif all_jobschedules: if jobscheduleid is not None: raise ValueError( 'cannot specify both --all-jobschedules and --jobscheduleid') if termtasks: raise ValueError( 'Cannot specify --termtasks with --all-jobschedules. ' 'Please terminate tasks with each individual job first.') batch.delete_or_terminate_all_job_schedules( batch_client, config, delete, wait=wait) else: # check for autopool if util.is_none_or_empty(jobid): autopool = batch.check_jobs_for_auto_pool(config) if autopool: # check if a pool id with existing pool id exists try: batch_client.pool.get(settings.pool_id(config)) except batchmodels.BatchErrorException as ex: if 'The specified pool does not exist' in ex.message.value: pass else: autopool = False else: autopool = False # terminate the jobs batch.delete_or_terminate_jobs( batch_client, config, delete, jobid=jobid, jobscheduleid=jobscheduleid, termtasks=termtasks, wait=wait) # if autopool, delete the storage if autopool: storage.cleanup_with_del_pool(blob_client, table_client, config) def action_jobs_cmi(batch_client, config, delete): # type: (batchsc.BatchServiceClient, dict, bool) -> None """Action: Jobs Cmi :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool delete: delete all cmi jobs """ _check_batch_client(batch_client) if delete: batch.del_clean_mi_jobs(batch_client, config) else: batch.clean_mi_jobs(batch_client, config) batch.del_clean_mi_jobs(batch_client, config) def action_jobs_migrate( batch_client, config, jobid, jobscheduleid, poolid, requeue, terminate, wait): # type: (batchsc.BatchServiceClient, dict, str, str, str, bool, bool, # bool) -> None """Action: Jobs Migrate :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id to migrate to in lieu of config :param str jobscheduleid: job schedule id to migrate to in lieu of config :param str poolid: pool id to migrate to in lieu of config :param bool requeue: requeue action :param bool 
terminate: terminate action :param bool wait: wait action """ _check_batch_client(batch_client) if jobid is not None: if jobscheduleid is not None: raise ValueError('cannot specify both --jobid and --jobscheduleid') if [requeue, terminate, wait].count(True) != 1: raise ValueError( 'must specify only one option of --requeue, --terminate, ' '--wait') if requeue: action = 'requeue' elif terminate: action = 'terminate' elif wait: action = 'wait' else: action = None # check jobs to see if targetted pool id is the same batch.check_pool_for_job_migration( batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid, poolid=poolid) if not util.confirm_action( config, msg='migration of jobs or job schedules'): return logger.warning( 'ensure that the new target pool has the proper Docker images ' 'loaded, or you have enabled allow_run_on_missing_image') # disable job and wait for disabled state batch.disable_jobs( batch_client, config, action, jobid=jobid, jobscheduleid=jobscheduleid, suppress_confirm=True) # patch job batch.update_job_with_pool( batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid, poolid=poolid) # enable job batch.enable_jobs( batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid) def action_jobs_disable( batch_client, config, jobid, jobscheduleid, requeue, terminate, wait): # type: (batchsc.BatchServiceClient, dict, str, str, bool, bool, # bool) -> None """Action: Jobs Disable :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id to disable to in lieu of config :param str jobscheduleid: job schedule id to disable to in lieu of config :param bool requeue: requeue action :param bool terminate: terminate action :param bool wait: wait action """ _check_batch_client(batch_client) if jobid is not None: if jobscheduleid is not None: raise ValueError('cannot specify both --jobid and --jobscheduleid') if [requeue, terminate, wait].count(True) != 1: raise ValueError( 'must specify only one option of --requeue, --terminate, ' '--wait') if requeue: action = 'requeue' elif terminate: action = 'terminate' elif wait: action = 'wait' else: action = None batch.disable_jobs( batch_client, config, action, jobid=jobid, jobscheduleid=jobscheduleid, disabling_state_ok=True) def action_jobs_enable(batch_client, config, jobid, jobscheduleid): # type: (batchsc.BatchServiceClient, dict, str, str) -> None """Action: Jobs Enable :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id to enable to in lieu of config :param str jobscheduleid: job schedule id to enable to in lieu of config """ _check_batch_client(batch_client) batch.enable_jobs( batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid) def action_jobs_stats(batch_client, config, job_id): # type: (batchsc.BatchServiceClient, dict, str) -> None """Action: Jobs Stats :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str job_id: job id """ _check_batch_client(batch_client) batch.job_stats(batch_client, config, jobid=job_id) def action_storage_del( blob_client, table_client, config, clear_tables, poolid): # type: (azureblob.BlockBlobService, azuretable.TableService, # dict, bool, str) -> None """Action: Storage Del :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param 
dict config: configuration dict :param bool clear_tables: clear tables instead of deleting :param str poolid: pool id to target """ # reset storage settings to target poolid if util.is_not_empty(poolid): populate_global_settings(config, False, pool_id=poolid) if clear_tables: storage.clear_storage_containers( blob_client, table_client, config, tables_only=True, pool_id=poolid) storage.delete_storage_containers( blob_client, table_client, config, skip_tables=clear_tables) def action_storage_clear(blob_client, table_client, config, poolid): # type: (azureblob.BlockBlobService, azuretable.TableService, dict, # str) -> None """Action: Storage Clear :param azure.storage.blob.BlockBlobService blob_client: blob client :param azure.cosmosdb.table.TableService table_client: table client :param dict config: configuration dict :param str poolid: pool id to target """ # reset storage settings to target poolid if util.is_not_empty(poolid): populate_global_settings(config, False, pool_id=poolid) storage.clear_storage_containers( blob_client, table_client, config, pool_id=poolid) def action_data_files_stream(batch_client, config, filespec, disk): # type: (batchsc.BatchServiceClient, dict, str, bool) -> None """Action: Data Files Stream :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str filespec: filespec of file to retrieve :param bool disk: write streamed data to disk instead """ _check_batch_client(batch_client) batch.stream_file_and_wait_for_task(batch_client, config, filespec, disk) def action_data_files_list(batch_client, config, jobid, taskid): # type: (batchsc.BatchServiceClient, dict, str, str) -> None """Action: Data Files List :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id to list :param str taskid: task id to list """ _check_batch_client(batch_client) if taskid is not None and jobid is None: raise ValueError( 'cannot specify a task to list files without the corresponding ' 'job id') batch.list_task_files(batch_client, config, jobid, taskid) def action_data_files_task(batch_client, config, all, filespec): # type: (batchsc.BatchServiceClient, dict, bool, str) -> None """Action: Data Files Task :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool all: retrieve all files :param str filespec: filespec of file to retrieve """ _check_batch_client(batch_client) if all: batch.get_all_files_via_task(batch_client, config, filespec) else: batch.get_file_via_task(batch_client, config, filespec) def action_data_files_node(batch_client, config, all, nodeid): # type: (batchsc.BatchServiceClient, dict, bool, str) -> None """Action: Data Files Node :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param bool all: retrieve all files :param str nodeid: node id to retrieve file from """ _check_batch_client(batch_client) if all: batch.get_all_files_via_node(batch_client, config, nodeid) else: batch.get_file_via_node(batch_client, config, nodeid) def action_data_ingress( batch_client, compute_client, network_client, config, to_fs): # type: (batchsc.BatchServiceClient, # azure.mgmt.compute.ComputeManagementClient, # azure.mgmt.network.NetworkManagementClient, dict, str) -> None """Action: Data Ingress :param azure.batch.batch_service_client.BatchServiceClient batch_client: 
batch client :param azure.mgmt.compute.ComputeManagementClient compute_client: compute client :param azure.mgmt.network.NetworkManagementClient network_client: network client :param dict config: configuration dict :param str to_fs: ingress to remote filesystem """ pool_total_vm_count = None if util.is_none_or_empty(to_fs): try: # get pool current dedicated pool = batch_client.pool.get(settings.pool_id(config)) pool_total_vm_count = ( pool.current_dedicated_nodes + pool.current_low_priority_nodes ) del pool # ensure there are remote login settings rls = batch.get_remote_login_settings( batch_client, config, nodes=None) # ensure nodes are at least idle/running for shared ingress kind = 'all' if not batch.check_pool_nodes_runnable(batch_client, config): kind = 'storage' except batchmodels.BatchErrorException as ex: if 'The specified pool does not exist' in ex.message.value: rls = None kind = 'storage' else: raise else: rls = None kind = 'remotefs' if compute_client is None or network_client is None: raise RuntimeError( 'required ARM clients are invalid, please provide management ' 'AAD credentials') storage_threads = data.ingress_data( batch_client, compute_client, network_client, config, rls=rls, kind=kind, total_vm_count=pool_total_vm_count, to_fs=to_fs) data.wait_for_storage_threads(storage_threads) def action_misc_tensorboard( batch_client, config, jobid, taskid, logdir, image): # type: (batchsc.BatchServiceClient, dict, str, str, str, str) -> None """Action: Misc Tensorboard :param azure.batch.batch_service_client.BatchServiceClient batch_client: batch client :param dict config: configuration dict :param str jobid: job id to list :param str taskid: task id to list :param str logdir: log dir :param str image: tensorflow image to use """ _check_batch_client(batch_client) if util.is_none_or_empty(jobid): jobspecs = settings.job_specifications(config) if len(jobspecs) != 1: raise ValueError( 'The number of jobs in the specified jobs config is not ' 'one. Please specify which job with --jobid.') if util.is_not_empty(taskid): raise ValueError( 'cannot specify a task to tunnel Tensorboard to without the ' 'corresponding job id') misc.tunnel_tensorboard(batch_client, config, jobid, taskid, logdir, image)
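# ----------------------------------------------------------------------------
# Illustrative usage sketch (a minimal, non-authoritative example; not part of
# the module API). It shows how a front-end might drive the pool image update
# action defined above. It assumes the caller has already constructed an
# authenticated BatchServiceClient and merged the configuration files into a
# single dict (both are normally handled by the CLI layer), and it reuses only
# functions defined in this module (populate_global_settings,
# action_pool_images_update). The function name below is hypothetical.
# ----------------------------------------------------------------------------
def _example_update_pool_images(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Example driver: pull the latest tags of all global-resource Docker
    images on every node of the pool referenced by config
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    # bind storage settings to the pool referenced by the configuration
    populate_global_settings(config, False)
    # force the SSH update path (ssh=True); otherwise a job-based update is
    # used, subject to the force-SSH checks in _update_container_images above
    action_pool_images_update(
        batch_client, config, docker_image=None, docker_image_digest=None,
        singularity_image=None, ssh=True)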