# Copyright (c) Microsoft Corporation
|
|
#
|
|
# All rights reserved.
|
|
#
|
|
# MIT License
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
# copy of this software and associated documentation files (the "Software"),
|
|
# to deal in the Software without restriction, including without limitation
|
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
# and/or sell copies of the Software, and to permit persons to whom the
|
|
# Software is furnished to do so, subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be included in
|
|
# all copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
# DEALINGS IN THE SOFTWARE.
|
|
|
|
# compat imports
|
|
from __future__ import (
|
|
absolute_import, division, print_function, unicode_literals
|
|
)
|
|
from builtins import ( # noqa
|
|
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
|
|
next, oct, open, pow, round, super, filter, map, zip)
|
|
# stdlib imports
|
|
import logging
|
|
import os
|
|
try:
|
|
import pathlib2 as pathlib
|
|
except ImportError:
|
|
import pathlib
|
|
import requests
|
|
import tempfile
|
|
import time
|
|
import uuid
|
|
# non-stdlib imports
|
|
import azure.batch.models as batchmodels
|
|
# local imports
|
|
from . import autoscale
|
|
from . import batch
|
|
from . import crypto
|
|
from . import data
|
|
from . import keyvault
|
|
from . import misc
|
|
from . import remotefs
|
|
from . import resource
|
|
from . import settings
|
|
from . import storage
|
|
from . import util
|
|
from .version import __version__
|
|
|
|
# create logger
|
|
# create logger
logger = logging.getLogger(__name__)
util.setup_logger(logger)

# global defines
# chunk size, in bytes, for streamed HTTP downloads (4 MiB)
_REQUEST_CHUNK_SIZE = 4194304
# repository root: this module lives one package directory below it
_ROOT_PATH = pathlib.Path(__file__).resolve().parent.parent
# runtime resources directory; populated lazily by initialize_globals()
_RESOURCES_PATH = None
# NVIDIA driver download metadata keyed by gpu type ('compute' for Tesla,
# 'visualization' for GRID), plus the license URL shown to the user
_NVIDIA_DRIVER = {
    'compute': {
        'url': (
            'http://us.download.nvidia.com/tesla/'
            '384.111/NVIDIA-Linux-x86_64-384.111.run'
        ),
        'sha256': (
            'bd8af7654ccb224c37e74c8e81477a42f63fa9f2360b1b1ec6ae00b03ae21054'
        ),
        'target': 'nvidia-driver.run'
    },
    'visualization': {
        'url': 'https://go.microsoft.com/fwlink/?linkid=849941',
        'sha256': (
            'ca3fd5f5e9156ad3d983b2032bde3c009dca73400f2753f9b475825f4670a854'
        ),
        'target': 'nvidia-driver-grid.run'
    },
    'license': (
        'http://www.nvidia.com/content/DriverDownload-March2009'
        '/licence.php?lang=us'
    ),
}
# each *_FILE constant below is a (blob name, local path) tuple for a
# helper script or program shipped with the tool
_CASCADE_FILE = (
    'cascade.py',
    pathlib.Path(_ROOT_PATH, 'cascade/cascade.py')
)
_PERF_FILE = (
    'perf.py',
    pathlib.Path(_ROOT_PATH, 'cascade/perf.py')
)
_NODEPREP_FILE = (
    'shipyard_nodeprep.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep.sh')
)
_NODEPREP_CUSTOMIMAGE_FILE = (
    'shipyard_nodeprep_customimage.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep_customimage.sh')
)
_NODEPREP_NATIVEDOCKER_FILE = (
    'shipyard_nodeprep_nativedocker.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_nodeprep_nativedocker.sh')
)
_NODEPREP_WINDOWS_FILE = (
    'shipyard_nodeprep_nativedocker.ps1',
    pathlib.Path(
        _ROOT_PATH,
        'scripts/windows/shipyard_nodeprep_nativedocker.ps1'
    )
)
_GLUSTERPREP_FILE = (
    'shipyard_glusterfs_on_compute.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_glusterfs_on_compute.sh')
)
_GLUSTERRESIZE_FILE = (
    'shipyard_glusterfs_on_compute_resize.sh',
    pathlib.Path(
        _ROOT_PATH, 'scripts/shipyard_glusterfs_on_compute_resize.sh')
)
_HPNSSH_FILE = (
    'shipyard_hpnssh.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_hpnssh.sh')
)
_IMAGE_BLOCK_FILE = (
    'wait_for_images.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/wait_for_images.sh')
)
_REGISTRY_LOGIN_FILE = (
    'registry_login.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/registry_login.sh')
)
_REGISTRY_LOGIN_WINDOWS_FILE = (
    'registry_login.ps1',
    pathlib.Path(_ROOT_PATH, 'scripts/windows/registry_login.ps1')
)
_BLOBXFER_FILE = (
    'shipyard_blobxfer.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_blobxfer.sh')
)
_BLOBXFER_WINDOWS_FILE = (
    'shipyard_blobxfer.ps1',
    pathlib.Path(_ROOT_PATH, 'scripts/windows/shipyard_blobxfer.ps1')
)
_REMOTEFSPREP_FILE = (
    'shipyard_remotefs_bootstrap.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_bootstrap.sh')
)
_REMOTEFSADDBRICK_FILE = (
    'shipyard_remotefs_addbrick.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_addbrick.sh')
)
_REMOTEFSSTAT_FILE = (
    'shipyard_remotefs_stat.sh',
    pathlib.Path(_ROOT_PATH, 'scripts/shipyard_remotefs_stat.sh')
)
# convenience aggregate of all remote fs scripts
_ALL_REMOTEFS_FILES = [
    _REMOTEFSPREP_FILE, _REMOTEFSADDBRICK_FILE, _REMOTEFSSTAT_FILE,
]
|
|
|
|
|
|
def initialize_globals(verbose):
    # type: (bool) -> None
    """Initialize any runtime globals
    :param bool verbose: verbose
    """
    global _RESOURCES_PATH
    if _RESOURCES_PATH is not None:
        # already initialized
        return
    _RESOURCES_PATH = _ROOT_PATH / 'resources'
    if not _RESOURCES_PATH.exists():
        # fall back to a version-scoped directory under the system temp
        # dir when the in-repo resources directory is absent (e.g. when
        # installed from a package)
        _RESOURCES_PATH = pathlib.Path(
            tempfile.gettempdir()) / 'batch-shipyard-{}-resources'.format(
                __version__)
        _RESOURCES_PATH.mkdir(parents=True, exist_ok=True)
    if verbose:
        logger.debug('initialized resources path to: {}'.format(
            _RESOURCES_PATH))
|
|
|
|
|
|
def populate_global_settings(config, fs_storage, pool_id=None):
    # type: (dict, bool, str) -> None
    """Populate global settings from config
    :param dict config: configuration dict
    :param bool fs_storage: adjust for fs context
    :param str pool_id: pool id override
    """
    bs = settings.batch_shipyard_settings(config)
    sc = settings.credentials_storage(config, bs.storage_account_settings)
    if fs_storage:
        # set postfix to empty for now, it will be populated with the
        # storage cluster during the actual calls
        postfix = ''
        if util.is_not_empty(pool_id):
            # a pool id makes no sense in a remote fs storage context
            raise ValueError('pool id specified for fs_storage')
    else:
        bc = settings.credentials_batch(config)
        if util.is_none_or_empty(pool_id):
            pool_id = settings.pool_id(config, lower=True)
        # scope storage entities to this batch account/pool pair
        postfix = '-'.join((bc.account.lower(), pool_id))
    storage.set_storage_configuration(
        bs.storage_entity_prefix,
        postfix,
        sc.account,
        sc.account_key,
        sc.endpoint,
        bs.generated_sas_expiry_days)
|
|
|
|
|
|
def fetch_credentials_conf_from_keyvault(
        keyvault_client, keyvault_uri, keyvault_credentials_secret_id):
    # type: (azure.keyvault.KeyVaultClient, str, str) -> dict
    """Fetch a credentials conf from keyvault
    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param str keyvault_uri: keyvault uri
    :param str keyvault_credentials_secret_id: keyvault cred secret id
    :rtype: dict
    :return: credentials conf
    """
    # validate that both the keyvault uri and the client are present
    # before attempting the fetch
    for value, message in (
            (keyvault_uri,
             'credentials conf was not specified or is invalid'),
            (keyvault_client,
             'no Azure KeyVault or AAD credentials specified'),
    ):
        if value is None:
            raise ValueError(message)
    return keyvault.fetch_credentials_conf(
        keyvault_client, keyvault_uri, keyvault_credentials_secret_id)
|
|
|
|
|
|
def fetch_secrets_from_keyvault(keyvault_client, config):
    # type: (azure.keyvault.KeyVaultClient, dict) -> None
    """Fetch secrets with secret ids in config from keyvault
    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param dict config: configuration dict
    """
    # no-op when no keyvault client is available
    if keyvault_client is None:
        return
    keyvault.parse_secret_ids(keyvault_client, config)
|
|
|
|
|
|
def _setup_nvidia_driver_package(blob_client, config, vm_size):
    # type: (azure.storage.blob.BlockBlobService, dict, str) -> pathlib.Path
    """Set up the nvidia driver package: download it to the resources path
    if not already present with a matching sha256, after the user accepts
    the NVIDIA license.
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param str vm_size: vm size
    :rtype: pathlib.Path
    :return: package path
    """
    gpu_type = settings.get_gpu_type_from_vm_size(vm_size)
    pkg = _RESOURCES_PATH / _NVIDIA_DRIVER[gpu_type]['target']
    # re-download if package is missing or its checksum does not match
    if (not pkg.exists() or
            util.compute_sha256_for_file(pkg, False) !=
            _NVIDIA_DRIVER[gpu_type]['sha256']):
        # display license link and require acceptance before download
        if not util.confirm_action(
                config,
                msg=('agreement with License for Customer Use of NVIDIA '
                     'Software @ {}').format(_NVIDIA_DRIVER['license']),
                allow_auto=True):
            raise RuntimeError(
                'Cannot proceed with deployment due to non-agreement with '
                'license for NVIDIA driver')
        else:
            logger.info('NVIDIA Software License accepted')
        # download driver, streaming to disk in chunks
        logger.debug('downloading NVIDIA driver to {}'.format(
            _NVIDIA_DRIVER[gpu_type]['target']))
        response = requests.get(_NVIDIA_DRIVER[gpu_type]['url'], stream=True)
        # fail fast on an HTTP error; otherwise an error page would be
        # written to disk and only surface later as a sha256 mismatch
        response.raise_for_status()
        with pkg.open('wb') as f:
            for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE):
                # filter out keep-alive chunks
                if chunk:
                    f.write(chunk)
        logger.debug('wrote {} bytes to {}'.format(pkg.stat().st_size, pkg))
        # verify sha256 of the downloaded package
        if (util.compute_sha256_for_file(pkg, False) !=
                _NVIDIA_DRIVER[gpu_type]['sha256']):
            raise RuntimeError('sha256 mismatch for {}'.format(pkg))
    return pkg
|
|
|
|
|
|
def _generate_azure_mount_script_name(
        batch_account_name, pool_id, is_file_share, is_windows):
    # type: (str, str, bool, bool) -> pathlib.Path
    """Generate an azure blob/file mount script name
    :param str batch_account_name: batch account name
    :param str pool_id: pool id
    :param bool is_file_share: is file share
    :param bool is_windows: is windows
    :rtype: pathlib.Path
    :return: path to azure mount script
    """
    prefix = 'azurefile' if is_file_share else 'azureblob'
    ext = 'cmd' if is_windows else 'sh'
    script_name = '{}-mount-{}-{}.{}'.format(
        prefix, batch_account_name.lower(), pool_id.lower(), ext)
    return _RESOURCES_PATH / script_name
|
|
|
|
|
|
def _setup_azureblob_mounts(blob_client, config, bc):
    # type: (azure.storage.blob.BlockBlobService, dict,
    #        settings.BatchCredentials) -> pathlib.Path
    """Set up the Azure Blob container via blobfuse: generate a shell
    script of mount commands for every azure blob shared data volume
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param settings.BatchCredentials bc: batch creds
    :rtype: pathlib.Path
    :return: path to the generated volume creation script
    """
    tmpmount = settings.temp_disk_mountpoint(config)
    # construct mount commands
    cmds = []
    sdv = settings.global_resources_shared_data_volumes(config)
    for svkey in sdv:
        if settings.is_shared_data_volume_azure_blob(sdv, svkey):
            sa = settings.credentials_storage(
                config,
                settings.azure_storage_account_settings(sdv, svkey))
            cont = settings.azure_blob_container_name(sdv, svkey)
            hmp = settings.azure_blob_host_mount_path(sa.account, cont)
            # blobfuse needs a local cache dir; place it on the temp disk
            tmpmp = '{}/blobfuse-tmp/{}-{}'.format(tmpmount, sa.account, cont)
            cmds.append('mkdir -p {}'.format(hmp))
            cmds.append('chmod 0770 {}'.format(hmp))
            cmds.append('mkdir -p {}'.format(tmpmp))
            cmds.append('chown _azbatch:_azbatchgrp {}'.format(tmpmp))
            cmds.append('chmod 0770 {}'.format(tmpmp))
            # emit a blobfuse connection config file via heredoc
            # NOTE(review): this config file carries the storage account
            # key but is created with default permissions - consider a
            # chmod 0600 on it
            conn = 'azblob-{}-{}.cfg'.format(sa.account, cont)
            cmds.append('cat > {} << EOF'.format(conn))
            cmds.append('accountName {}'.format(sa.account))
            cmds.append('accountKey {}'.format(sa.account_key))
            cmds.append('containerName {}'.format(cont))
            cmds.append('EOF')
            cmd = (
                'blobfuse {hmp} --tmp-path={tmpmp} -o attr_timeout=240 '
                '-o entry_timeout=240 -o negative_timeout=120 -o allow_other '
                '--config-file={conn}'
            ).format(hmp=hmp, tmpmp=tmpmp, conn=conn)
            # add any additional mount options
            mo = settings.shared_data_volume_mount_options(sdv, svkey)
            if util.is_not_empty(mo):
                opts = []
                for opt in mo:
                    # allow_other is already part of the base command
                    if opt.strip() == '-o allow_other':
                        continue
                    opts.append(opt)
                cmd = '{} {}'.format(cmd, ' '.join(opts))
            cmds.append(cmd)
    # create file share mount command script
    if util.is_none_or_empty(cmds):
        raise RuntimeError('Generated Azure blob mount commands are invalid')
    volcreate = _generate_azure_mount_script_name(
        bc.account, settings.pool_id(config), False, False)
    newline = '\n'
    with volcreate.open('w', newline=newline) as f:
        f.write('#!/usr/bin/env bash')
        f.write(newline)
        f.write('set -e')
        f.write(newline)
        f.write('set -o pipefail')
        f.write(newline)
        for cmd in cmds:
            f.write(cmd)
            f.write(newline)
    return volcreate
|
|
|
|
|
|
def _setup_azurefile_mounts(blob_client, config, bc, is_windows):
    # type: (azure.storage.blob.BlockBlobService, dict,
    #        settings.BatchCredentials, bool) -> pathlib.Path
    """Set up the Azure File shares: generate a platform-appropriate script
    of mount commands for every azure file shared data volume
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param settings.BatchCredentials bc: batch creds
    :param bool is_windows: is windows pool
    :rtype: pathlib.Path
    :return: path to the generated volume creation script
    """
    # construct mount commands
    cmds = []
    sdv = settings.global_resources_shared_data_volumes(config)
    for svkey in sdv:
        if settings.is_shared_data_volume_azure_file(sdv, svkey):
            sa = settings.credentials_storage(
                config,
                settings.azure_storage_account_settings(sdv, svkey))
            share = settings.azure_file_share_name(sdv, svkey)
            hmp = settings.azure_file_host_mount_path(
                sa.account, share, is_windows)
            if is_windows:
                # map the share over SMB, then symlink the host mount path
                # to the UNC path
                cmd = (
                    'net use \\\\{sa}.file.{ep}\{share} {sakey} '
                    '/user:Azure\{sa}'
                ).format(
                    sa=sa.account, ep=sa.endpoint, share=share,
                    sakey=sa.account_key)
                cmds.append(cmd)
                cmd = 'mklink /d {hmp} \\\\{sa}.file.{ep}\{share}'.format(
                    hmp=hmp, sa=sa.account, ep=sa.endpoint, share=share)
            else:
                # mount via CIFS with SMB 3.0
                cmd = (
                    'mount -t cifs //{sa}.file.{ep}/{share} {hmp} -o '
                    'vers=3.0,username={sa},password={sakey},'
                    'serverino'
                ).format(
                    sa=sa.account, ep=sa.endpoint, share=share, hmp=hmp,
                    sakey=sa.account_key)
                # add any additional mount options
                mo = settings.shared_data_volume_mount_options(sdv, svkey)
                if util.is_not_empty(mo):
                    opts = []
                    # retain backward compatibility with filemode/dirmode
                    # options from the old Azure File Docker volume driver
                    for opt in mo:
                        tmp = opt.split('=')
                        if tmp[0] == 'filemode':
                            opts.append('file_mode={}'.format(tmp[1]))
                        elif tmp[0] == 'dirmode':
                            opts.append('dir_mode={}'.format(tmp[1]))
                        else:
                            opts.append(opt)
                    cmd = '{},{}'.format(cmd, ','.join(opts))
            if not is_windows:
                # the mount point must exist prior to mounting on linux
                cmds.append('mkdir -p {}'.format(hmp))
            cmds.append(cmd)
    # create file share mount command script
    if util.is_none_or_empty(cmds):
        raise RuntimeError('Generated Azure file mount commands are invalid')
    volcreate = _generate_azure_mount_script_name(
        bc.account, settings.pool_id(config), True, is_windows)
    # use CRLF line endings for windows cmd scripts
    newline = '\r\n' if is_windows else '\n'
    with volcreate.open('w', newline=newline) as f:
        if is_windows:
            f.write('@echo off')
            f.write(newline)
        else:
            f.write('#!/usr/bin/env bash')
            f.write(newline)
            f.write('set -e')
            f.write(newline)
            f.write('set -o pipefail')
            f.write(newline)
        for cmd in cmds:
            f.write(cmd)
            f.write(newline)
    return volcreate
|
|
|
|
|
|
def _create_storage_cluster_mount_args(
        compute_client, network_client, batch_mgmt_client, config, sc_id,
        bc, subnet_id):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient, dict, str,
    #        settings.BatchCredentials, str) -> Tuple[str, str]
    """Create storage cluster mount arguments
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client
    :param dict config: configuration dict
    :param str sc_id: storage cluster id
    :param settings.BatchCredentials bc: batch creds
    :param str subnet_id: subnet id
    :rtype: tuple
    :return: (fstab mount, storage cluster arg)
    """
    fstab_mount = None
    sc_arg = None
    ba = batch.get_batch_account(batch_mgmt_client, config)
    # check for vnet/subnet presence; mounting requires network visibility
    if util.is_none_or_empty(subnet_id):
        raise RuntimeError(
            'cannot mount a storage cluster without a valid virtual '
            'network or subnet')
    # get remotefs settings
    rfs = settings.remotefs_settings(config, sc_id)
    sc = rfs.storage_cluster
    # iterate through shared data volumes and find storage clusters
    sdv = settings.global_resources_shared_data_volumes(config)
    if (sc_id not in sdv or
            not settings.is_shared_data_volume_storage_cluster(
                sdv, sc_id)):
        raise RuntimeError(
            'No storage cluster {} found in configuration'.format(sc_id))
    vnet_subid, vnet_rg, _, vnet_name, subnet_name = _explode_arm_subnet_id(
        subnet_id)
    # check for same vnet name: the pool and the storage cluster must live
    # on the same virtual network for a private mount to work
    if vnet_name.lower() != sc.virtual_network.name.lower():
        raise RuntimeError(
            'cannot link storage cluster {} on virtual '
            'network {} with pool virtual network {}'.format(
                sc_id, sc.virtual_network.name, vnet_name))
    # cross check vnet resource group
    if vnet_rg.lower() != sc.virtual_network.resource_group.lower():
        raise RuntimeError(
            'cannot link storage cluster {} virtual network in resource group '
            '{} with pool virtual network in resource group {}'.format(
                sc_id, sc.virtual_network.resource_group, vnet_rg))
    # cross check vnet subscription id against the batch account's
    # subscription (ARM id component index 2)
    _ba_tmp = ba.id.lower().split('/')
    if vnet_subid.lower() != _ba_tmp[2]:
        raise RuntimeError(
            'cannot link storage cluster {} virtual network in subscription '
            '{} with pool virtual network in subscription {}'.format(
                sc_id, vnet_subid, _ba_tmp[2]))
    del _ba_tmp
    # get vm count
    if sc.vm_count < 1:
        raise RuntimeError(
            'storage cluster {} vm_count {} is invalid'.format(
                sc_id, sc.vm_count))
    # get fileserver type
    if sc.file_server.type == 'nfs':
        # query first vm for info; nfs clusters are single-server
        vm_name = settings.generate_virtual_machine_name(sc, 0)
        vm = compute_client.virtual_machines.get(
            resource_group_name=sc.resource_group,
            vm_name=vm_name,
        )
        nic = resource.get_nic_from_virtual_machine(
            network_client, sc.resource_group, vm)
        # get private ip of vm
        remote_ip = nic.ip_configurations[0].private_ip_address
        # construct mount options
        mo = '_netdev,auto,nfsvers=4,intr'
        amo = settings.shared_data_volume_mount_options(sdv, sc_id)
        if util.is_not_empty(amo):
            # NOTE(review): this tests the default option string 'mo',
            # which never contains 'udp'; the intent appears to be to
            # reject a user-supplied 'udp' option, i.e. "'udp' in amo"
            # - confirm
            if 'udp' in mo:
                raise RuntimeError(
                    ('udp cannot be specified as a mount option for '
                     'storage cluster {}').format(sc_id))
            if any([x.startswith('nfsvers=') for x in amo]):
                raise RuntimeError(
                    ('nfsvers cannot be specified as a mount option for '
                     'storage cluster {}').format(sc_id))
            if any([x.startswith('port=') for x in amo]):
                raise RuntimeError(
                    ('port cannot be specified as a mount option for '
                     'storage cluster {}').format(sc_id))
            mo = ','.join((mo, ','.join(amo)))
        # construct mount string for fstab
        fstab_mount = (
            '{remoteip}:{srcpath} {hmp}/{scid} '
            '{fstype} {mo} 0 2').format(
                remoteip=remote_ip,
                srcpath=sc.file_server.mountpoint,
                hmp=settings.get_host_mounts_path(False),
                scid=sc_id,
                fstype=sc.file_server.type,
                mo=mo)
    elif sc.file_server.type == 'glusterfs':
        # walk vms and find non-overlapping ud/fds so the backup volfile
        # server is unlikely to fail together with the primary
        primary_ip = None
        primary_ud = None
        primary_fd = None
        backup_ip = None
        backup_ud = None
        backup_fd = None
        vms = {}
        # first pass, attempt to populate all ip, ud/fd
        for i in range(sc.vm_count):
            vm_name = settings.generate_virtual_machine_name(sc, i)
            vm = compute_client.virtual_machines.get(
                resource_group_name=sc.resource_group,
                vm_name=vm_name,
                expand=compute_client.virtual_machines.models.
                InstanceViewTypes.instance_view,
            )
            nic = resource.get_nic_from_virtual_machine(
                network_client, sc.resource_group, vm)
            vms[i] = (vm, nic)
            # get private ip and ud/fd of vm
            remote_ip = nic.ip_configurations[0].private_ip_address
            ud = vm.instance_view.platform_update_domain
            fd = vm.instance_view.platform_fault_domain
            if primary_ip is None:
                primary_ip = remote_ip
                primary_ud = ud
                primary_fd = fd
            if backup_ip is None:
                # NOTE(review): backup_ip is known to be None here, so
                # 'primary_ip == backup_ip' is always False; it looks
                # like 'primary_ip == remote_ip' was intended - confirm
                if (primary_ip == backup_ip or primary_ud == ud or
                        primary_fd == fd):
                    continue
                backup_ip = remote_ip
                backup_ud = ud
                backup_fd = fd
        # second pass, fill in with at least non-overlapping update domains
        if backup_ip is None:
            for i in range(sc.vm_count):
                vm, nic = vms[i]
                remote_ip = nic.ip_configurations[0].private_ip_address
                ud = vm.instance_view.platform_update_domain
                fd = vm.instance_view.platform_fault_domain
                if primary_ud != ud:
                    backup_ip = remote_ip
                    backup_ud = ud
                    backup_fd = fd
                    break
        if primary_ip is None or backup_ip is None:
            raise RuntimeError(
                'Could not find either a primary ip {} or backup ip {} for '
                'glusterfs client mount'.format(primary_ip, backup_ip))
        logger.debug('primary ip/ud/fd={} backup ip/ud/fd={}'.format(
            (primary_ip, primary_ud, primary_fd),
            (backup_ip, backup_ud, backup_fd)))
        # construct mount options
        mo = '_netdev,auto,transport=tcp,backupvolfile-server={}'.format(
            backup_ip)
        amo = settings.shared_data_volume_mount_options(sdv, sc_id)
        if util.is_not_empty(amo):
            # reject user options that would conflict with the defaults
            if any([x.startswith('backupvolfile-server=') for x in amo]):
                raise RuntimeError(
                    ('backupvolfile-server cannot be specified as a mount '
                     'option for storage cluster {}').format(sc_id))
            if any([x.startswith('transport=') for x in amo]):
                raise RuntimeError(
                    ('transport cannot be specified as a mount option for '
                     'storage cluster {}').format(sc_id))
            mo = ','.join((mo, ','.join(amo)))
        # construct mount string for fstab, srcpath is the gluster volume
        fstab_mount = (
            '{remoteip}:/{srcpath} {hmp}/{scid} '
            '{fstype} {mo} 0 2').format(
                remoteip=primary_ip,
                srcpath=settings.get_file_server_glusterfs_volume_name(sc),
                hmp=settings.get_host_mounts_path(False),
                scid=sc_id,
                fstype=sc.file_server.type,
                mo=mo)
    else:
        raise NotImplementedError(
            ('cannot handle file_server type {} for storage '
             'cluster {}').format(sc.file_server.type, sc_id))
    if util.is_none_or_empty(fstab_mount):
        raise RuntimeError(
            ('Could not construct an fstab mount entry for storage '
             'cluster {}').format(sc_id))
    # construct sc_arg
    sc_arg = '{}:{}'.format(sc.file_server.type, sc_id)
    # log config
    if settings.verbose(config):
        logger.debug('storage cluster {} fstab mount: {}'.format(
            sc_id, fstab_mount))
    return (fstab_mount, sc_arg)
|
|
|
|
|
|
def _create_custom_linux_mount_args(config, mount_name):
    # type: (dict, str) -> str
    """Create a custom linux mount fstab entry
    :param dict config: configuration dict
    :param str mount_name: mount name
    :rtype: str
    :return: fstab entry
    """
    sdv = settings.global_resources_shared_data_volumes(config)
    fstab = settings.custom_linux_mount_fstab_options(sdv, mount_name)
    # fstab(5) fields: fs_spec, fs_file (mount point under the host
    # mounts path), fs_vfstype, fs_mntops, fs_freq, fs_passno
    mount_point = '{}/{}'.format(
        settings.get_host_mounts_path(False), mount_name)
    fields = (
        fstab.fs_spec,
        mount_point,
        fstab.fs_vfstype,
        fstab.fs_mntops,
        fstab.fs_freq,
        fstab.fs_passno,
    )
    return ' '.join('{}'.format(field) for field in fields)
|
|
|
|
|
|
def _pick_node_agent_for_vm(batch_client, pool_settings):
    # type: (azure.batch.batch_service_client.BatchServiceClient,
    #        settings.PoolSettings) -> (str, str)
    """Pick a node agent id for the vm
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param settings.PoolSettings pool_settings: pool settings
    :rtype: tuple
    :return: image reference to use, node agent id to use
    """
    publisher = pool_settings.vm_configuration.publisher
    offer = pool_settings.vm_configuration.offer
    sku = pool_settings.vm_configuration.sku
    # TODO special exception for CentOS HPC 7.1: it is not present in the
    # verified image reference list, so hardcode its node agent
    if publisher == 'openlogic' and offer == 'centos-hpc' and sku == '7.1':
        return ({
            'publisher': publisher,
            'offer': offer,
            'sku': sku,
            'version': pool_settings.vm_configuration.version,
        }, 'batch.node.centos 7')
    # pick latest sku: collect all verified image references matching the
    # configured publisher/offer/sku, sorted ascending by sku
    # NOTE(review): the comparisons assume publisher/offer/sku from
    # settings are already lowercased - confirm against settings module
    node_agent_skus = batch_client.account.list_node_agent_skus()
    skus_to_use = [
        (nas, image_ref) for nas in node_agent_skus
        for image_ref in sorted(
            nas.verified_image_references,
            key=lambda item: item.sku
        )
        if image_ref.publisher.lower() == publisher and
        image_ref.offer.lower() == offer and
        image_ref.sku.lower() == sku
    ]
    try:
        # last element is the latest matching sku
        sku_to_use, image_ref_to_use = skus_to_use[-1]
    except IndexError:
        raise RuntimeError(
            ('Could not find an Azure Batch Node Agent Sku for this '
             'offer={} publisher={} sku={}. You can list the valid and '
             'available Marketplace images with the command: pool '
             'listskus').format(
                pool_settings.vm_configuration.offer,
                pool_settings.vm_configuration.publisher,
                pool_settings.vm_configuration.sku))
    # set image version to use
    image_ref_to_use.version = pool_settings.vm_configuration.version
    logger.info('deploying vm config: {}'.format(image_ref_to_use))
    return (image_ref_to_use, sku_to_use.id)
|
|
|
|
|
|
def _explode_arm_subnet_id(arm_subnet_id):
|
|
# type: (str) -> Tuple[str, str, str, str, str]
|
|
"""Parses components from ARM subnet id
|
|
:param str arm_subnet_id: ARM subnet id
|
|
:rtype: tuple
|
|
:return: subid, rg, provider, vnet, subnet
|
|
"""
|
|
tmp = arm_subnet_id.split('/')
|
|
subid = tmp[2]
|
|
rg = tmp[4]
|
|
provider = tmp[6]
|
|
vnet = tmp[8]
|
|
subnet = tmp[10]
|
|
return subid, rg, provider, vnet, subnet
|
|
|
|
|
|
def _pool_virtual_network_subnet_address_space_check(
        resource_client, network_client, config, pool_settings, bc):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict,
    #        settings.PoolSettings, settings.BatchCredentialsSettings) -> str
    """Pool Virtual Network and subnet address space check and create if
    specified
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param settings.PoolSettings pool_settings: pool settings
    :param settings.BatchCredentialsSettings bc: batch cred settings
    :rtype: str
    :return: subnet id
    """
    # no-op when no virtual network is configured at all
    if (util.is_none_or_empty(pool_settings.virtual_network.arm_subnet_id) and
            util.is_none_or_empty(pool_settings.virtual_network.name)):
        logger.debug('no virtual network settings specified')
        return None
    # check if AAD is enabled: vnet pools require AAD (presence of a
    # shared key implies non-AAD batch credentials)
    if util.is_not_empty(bc.account_key):
        raise RuntimeError(
            'cannot allocate a pool with a virtual network without AAD '
            'credentials')
    # get subnet object
    subnet_id = None
    if util.is_not_empty(pool_settings.virtual_network.arm_subnet_id):
        subnet_components = _explode_arm_subnet_id(
            pool_settings.virtual_network.arm_subnet_id)
        logger.debug(
            ('arm subnet id breakdown: subid={} rg={} provider={} vnet={} '
             'subnet={}').format(
                subnet_components[0], subnet_components[1],
                subnet_components[2], subnet_components[3],
                subnet_components[4]))
        subnet_id = pool_settings.virtual_network.arm_subnet_id
        if network_client is None:
            # without a network client, skip IP space validation entirely
            logger.info('using virtual network subnet id: {}'.format(
                subnet_id))
            logger.warning(
                'cannot perform IP space validation without a valid '
                'network_client, please specify management AAD credentials '
                'to allow pre-validation')
            return subnet_id
        # retrieve address prefix for subnet (rg, vnet, subnet names)
        _subnet = network_client.subnets.get(
            subnet_components[1], subnet_components[3], subnet_components[4])
    else:
        if util.is_not_empty(pool_settings.virtual_network.resource_group):
            _vnet_rg = pool_settings.virtual_network.resource_group
        else:
            # default to the batch account's resource group
            _vnet_rg = bc.resource_group
        # create virtual network and subnet if specified
        _, _subnet = resource.create_virtual_network_and_subnet(
            resource_client, network_client, _vnet_rg, bc.location,
            pool_settings.virtual_network)
        del _vnet_rg
        subnet_id = _subnet.id
    # ensure address prefix for subnet is valid (expects CIDR notation)
    tmp = _subnet.address_prefix.split('/')
    if len(tmp) <= 1:
        raise RuntimeError(
            'subnet address_prefix is invalid for Batch pools: {}'.format(
                _subnet.address_prefix))
    mask = int(tmp[-1])
    # subtract 5 for guideline and Azure numbering start
    allowable_addresses = (1 << (32 - mask)) - 5
    logger.debug('subnet {} mask is {} and allows {} addresses'.format(
        _subnet.name, mask, allowable_addresses))
    pool_total_vm_count = (
        pool_settings.vm_count.dedicated +
        pool_settings.vm_count.low_priority
    )
    if allowable_addresses < pool_total_vm_count:
        raise RuntimeError(
            ('subnet {} mask is {} and allows {} addresses but desired '
             'pool vm_count is {}').format(
                _subnet.name, mask, allowable_addresses, pool_total_vm_count))
    elif int(allowable_addresses * 0.9) <= pool_total_vm_count:
        # if within 90% tolerance, warn user due to potential
        # address shortage if other compute resources are in this subnet
        if not util.confirm_action(
                config,
                msg=('subnet {} mask is {} and allows {} addresses '
                     'but desired pool vm_count is {}, proceed?').format(
                    _subnet.name, mask, allowable_addresses,
                    pool_total_vm_count)):
            raise RuntimeError('Pool deployment rejected by user')
    logger.info('using virtual network subnet id: {}'.format(subnet_id))
    return subnet_id
|
|
|
|
|
|
def _construct_pool_object(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient,
    #        azure.batch.batch_service_client.BatchServiceClient,
    #        azureblob.BlockBlobService, dict) -> tuple
    """Construct a pool add parameter object for create pool along with
    uploading resource files
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :rtype: tuple
    :return: (pool settings, gluster on compute flag,
        batchmodels.PoolAddParameter)
    """
    # check shared data volume mounts before proceeding to allocate;
    # classify each declared shared data volume into one of the supported
    # mount kinds so the node prep command line can be built accordingly
    azureblob_vd = False
    azurefile_vd = False
    gluster_on_compute = False
    storage_cluster_mounts = []
    custom_linux_mounts = []
    try:
        sdv = settings.global_resources_shared_data_volumes(config)
        for sdvkey in sdv:
            if settings.is_shared_data_volume_azure_file(sdv, sdvkey):
                azurefile_vd = True
            elif settings.is_shared_data_volume_azure_blob(sdv, sdvkey):
                azureblob_vd = True
            elif settings.is_shared_data_volume_gluster_on_compute(
                    sdv, sdvkey):
                # only a single on-compute glusterfs volume is supported
                if gluster_on_compute:
                    raise ValueError(
                        'only one glusterfs on compute can be created')
                gluster_on_compute = True
            elif settings.is_shared_data_volume_storage_cluster(
                    sdv, sdvkey):
                storage_cluster_mounts.append(sdvkey)
            elif settings.is_shared_data_volume_custom_linux_mount(
                    sdv, sdvkey):
                custom_linux_mounts.append(sdvkey)
            else:
                raise ValueError('Unknown shared data volume: {}'.format(
                    settings.shared_data_volume_driver(sdv, sdvkey)))
    except KeyError:
        # no shared data volumes configured
        pass
    # retrieve settings
    pool_settings = settings.pool_settings(config)
    native = settings.is_native_docker_pool(
        config, vm_config=pool_settings.vm_configuration)
    is_windows = settings.is_windows_pool(
        config, vm_config=pool_settings.vm_configuration)
    # get autoscale settings; autoscale and an explicit resize timeout are
    # mutually exclusive (the service manages resizes under autoscale)
    if settings.is_pool_autoscale_enabled(config, pas=pool_settings.autoscale):
        asenable = True
        asformula = autoscale.get_formula(pool_settings)
        asei = pool_settings.autoscale.evaluation_interval
        if pool_settings.resize_timeout is not None:
            logger.warning(
                'ignoring resize timeout for autoscale-enabled pool')
    else:
        asenable = False
        asformula = None
        asei = None
    logger.debug('autoscale enabled: {}'.format(asenable))
    # task scheduling policy settings
    if util.is_not_empty(pool_settings.node_fill_type):
        task_scheduling_policy = batchmodels.TaskSchedulingPolicy(
            node_fill_type=batchmodels.ComputeNodeFillType(
                pool_settings.node_fill_type),
        )
    else:
        task_scheduling_policy = None
    # custom image settings
    custom_image_na = settings.pool_custom_image_node_agent(config)
    # check for virtual network settings; subnet_id is None when no
    # virtual network was specified in the pool configuration
    bc = settings.credentials_batch(config)
    subnet_id = _pool_virtual_network_subnet_address_space_check(
        resource_client, network_client, config, pool_settings, bc)
    # construct fstab mounts for storage clusters
    sc_fstab_mounts = []
    sc_args = []
    if util.is_not_empty(storage_cluster_mounts):
        for sc_id in storage_cluster_mounts:
            fm, sca = _create_storage_cluster_mount_args(
                compute_client, network_client, batch_mgmt_client, config,
                sc_id, bc, subnet_id)
            sc_fstab_mounts.append(fm)
            sc_args.append(sca)
        if settings.verbose(config):
            logger.debug('storage cluster args: {}'.format(sc_args))
    del storage_cluster_mounts
    # construct fstab mounts for custom mounts
    custom_linux_fstab_mounts = []
    if util.is_not_empty(custom_linux_mounts):
        # NOTE(review): loop variable shadows the builtin `id`
        for id in custom_linux_mounts:
            custom_linux_fstab_mounts.append(
                _create_custom_linux_mount_args(config, id))
    del custom_linux_mounts
    # add encryption cert to account if specified
    encrypt = settings.batch_shipyard_encryption_enabled(config)
    if encrypt:
        pfx = crypto.get_encryption_pfx_settings(config)
        batch.add_certificate_to_account(batch_client, config)
    # construct block list: a '#'-delimited pair of comma-separated docker
    # and singularity image lists nodes must preload before start task
    # success; None when blocking is disabled
    block_for_gr = None
    if pool_settings.block_until_all_global_resources_loaded:
        block_for_gr_docker = ''
        block_for_gr_singularity = ''
        docker_images = settings.global_resources_docker_images(config)
        if len(docker_images) > 0:
            block_for_gr_docker = ','.join([x for x in docker_images])
        singularity_images = settings.global_resources_singularity_images(
            config)
        if len(singularity_images) > 0:
            block_for_gr_singularity = ','.join(
                [util.singularity_image_name_on_disk(x)
                 for x in singularity_images])
        if (util.is_none_or_empty(block_for_gr_docker) and
                util.is_none_or_empty(block_for_gr_singularity)):
            logger.warning(
                'no Docker and Singularity images specified in global '
                'resources')
        if native:
            # native pools will auto preload
            block_for_gr_docker = ''
        block_for_gr = '{}#{}'.format(
            block_for_gr_docker, block_for_gr_singularity)
    # shipyard settings
    bs = settings.batch_shipyard_settings(config)
    # data replication and peer-to-peer settings
    dr = settings.data_replication_settings(config)
    # create torrent flags (colon-delimited, parsed by node prep scripts)
    torrentflags = '{}:{}:{}:{}'.format(
        dr.peer_to_peer.enabled, dr.concurrent_source_downloads,
        dr.peer_to_peer.direct_download_seed_bias,
        dr.peer_to_peer.compression)
    # create resource files list to upload for the start task
    if is_windows:
        _rflist = [_REGISTRY_LOGIN_WINDOWS_FILE, _BLOBXFER_WINDOWS_FILE]
    else:
        _rflist = [_REGISTRY_LOGIN_FILE, _BLOBXFER_FILE]
    if not native and not is_windows:
        _rflist.append(_IMAGE_BLOCK_FILE)
        if not bs.use_shipyard_docker_image:
            _rflist.append(_CASCADE_FILE)
        if bs.store_timing_metrics:
            _rflist.append(_PERF_FILE)
    if pool_settings.ssh.hpn_server_swap:
        _rflist.append(_HPNSSH_FILE)
    # handle azure mounts: generate temporary mount scripts and stage them
    # as resource files (removed from local disk after upload below)
    if azureblob_vd:
        abms = _setup_azureblob_mounts(blob_client, config, bc)
        _rflist.append(('azureblob-mount.sh', abms))
    if azurefile_vd:
        afms = _setup_azurefile_mounts(blob_client, config, bc, is_windows)
        _rflist.append(
            ('azurefile-mount.{}'.format('cmd' if is_windows else 'sh'), afms)
        )
    # gpu settings: gpu_env encodes "<is visualization sku>:<driver file>"
    # for the node prep script's -g option
    if (not native and settings.is_gpu_pool(pool_settings.vm_size) and
            util.is_none_or_empty(custom_image_na)):
        if pool_settings.gpu_driver is None:
            # no user-supplied driver url: use packaged driver
            gpu_driver = _setup_nvidia_driver_package(
                blob_client, config, pool_settings.vm_size)
            _rflist.append((gpu_driver.name, gpu_driver))
        else:
            # user-supplied driver url; added as a resource file further below
            gpu_type = settings.get_gpu_type_from_vm_size(
                pool_settings.vm_size)
            gpu_driver = pathlib.Path(_NVIDIA_DRIVER[gpu_type]['target'])
        gpu_env = '{}:{}'.format(
            settings.is_gpu_visualization_pool(pool_settings.vm_size),
            gpu_driver.name)
    else:
        gpu_env = None
    # get container registries
    docker_registries = settings.docker_registries(config)
    # set additional start task commands (pre version)
    start_task = pool_settings.additional_node_prep_commands_pre
    # set vm configuration and append the node prep invocation matching the
    # deployment mode: native (windows/linux), custom image, or platform image
    if native:
        if util.is_not_empty(custom_image_na):
            # check if AAD is enabled; custom images require AAD auth
            if util.is_not_empty(bc.account_key):
                raise RuntimeError(
                    'cannot allocate a pool with a custom image without AAD '
                    'credentials')
            vmconfig = batchmodels.VirtualMachineConfiguration(
                image_reference=batchmodels.ImageReference(
                    virtual_machine_image_id=pool_settings.
                    vm_configuration.arm_image_id,
                ),
                node_agent_sku_id=pool_settings.vm_configuration.node_agent,
            )
            logger.debug(
                ('deploying custom image to pool in native mode: {} '
                 'node agent: {}').format(
                     vmconfig.image_reference.virtual_machine_image_id,
                     vmconfig.node_agent_sku_id))
        else:
            image_ref, na_ref = _pick_node_agent_for_vm(
                batch_client, pool_settings)
            vmconfig = batchmodels.VirtualMachineConfiguration(
                image_reference=image_ref,
                node_agent_sku_id=na_ref,
            )
            logger.debug('deploying pool in native mode')
        # attach container config so the service preloads docker images
        vmconfig.container_configuration = batchmodels.ContainerConfiguration(
            container_image_names=settings.global_resources_docker_images(
                config),
            container_registries=docker_registries,
        )
        if is_windows:
            if util.is_not_empty(custom_image_na):
                raise RuntimeError(
                    'Native mode and Windows custom images is not supported')
            _rflist.append(_NODEPREP_WINDOWS_FILE)
            start_task.append(
                ('powershell -ExecutionPolicy Unrestricted -command '
                 '{npf}{a}{e}{v}{x}').format(
                    npf=_NODEPREP_WINDOWS_FILE[0],
                    a=' -a' if azurefile_vd else '',
                    e=' -e {}'.format(pfx.sha1) if encrypt else '',
                    v=' -v {}'.format(__version__),
                    x=' -x {}'.format(data._BLOBXFER_VERSION))
            )
        else:
            _rflist.append(_NODEPREP_NATIVEDOCKER_FILE)
            start_task.append(
                '{npf}{a}{c}{e}{f}{m}{n}{v}{x}'.format(
                    npf=_NODEPREP_NATIVEDOCKER_FILE[0],
                    a=' -a' if azurefile_vd else '',
                    c=' -c' if azureblob_vd else '',
                    e=' -e {}'.format(pfx.sha1) if encrypt else '',
                    f=' -f' if gluster_on_compute else '',
                    m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty(
                        sc_args) else '',
                    n=' -n' if settings.can_tune_tcp(
                        pool_settings.vm_size) else '',
                    v=' -v {}'.format(__version__),
                    x=' -x {}'.format(data._BLOBXFER_VERSION),
                )
            )
    elif util.is_not_empty(custom_image_na):
        # check if AAD is enabled; custom images require AAD auth
        if util.is_not_empty(bc.account_key):
            raise RuntimeError(
                'cannot allocate a pool with a custom image without AAD '
                'credentials')
        _rflist.append(_NODEPREP_CUSTOMIMAGE_FILE)
        vmconfig = batchmodels.VirtualMachineConfiguration(
            image_reference=batchmodels.ImageReference(
                virtual_machine_image_id=pool_settings.
                vm_configuration.arm_image_id,
            ),
            node_agent_sku_id=pool_settings.vm_configuration.node_agent,
        )
        logger.debug('deploying custom image: {} node agent: {}'.format(
            vmconfig.image_reference.virtual_machine_image_id,
            vmconfig.node_agent_sku_id))
        start_task.append(
            '{npf}{a}{b}{c}{e}{f}{m}{n}{p}{t}{v}{x}'.format(
                npf=_NODEPREP_CUSTOMIMAGE_FILE[0],
                a=' -a' if azurefile_vd else '',
                b=' -b' if util.is_not_empty(block_for_gr) else '',
                c=' -c' if azureblob_vd else '',
                e=' -e {}'.format(pfx.sha1) if encrypt else '',
                f=' -f' if gluster_on_compute else '',
                m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty(
                    sc_args) else '',
                n=' -n' if settings.can_tune_tcp(
                    pool_settings.vm_size) else '',
                p=' -p {}'.format(bs.storage_entity_prefix)
                if bs.storage_entity_prefix else '',
                t=' -t {}'.format(torrentflags),
                v=' -v {}'.format(__version__),
                x=' -x {}'.format(data._BLOBXFER_VERSION),
            )
        )
    else:
        # platform image (non-native) deployment
        _rflist.append(_NODEPREP_FILE)
        image_ref, na_ref = _pick_node_agent_for_vm(
            batch_client, pool_settings)
        vmconfig = batchmodels.VirtualMachineConfiguration(
            image_reference=image_ref,
            node_agent_sku_id=na_ref,
        )
        # create start task commandline
        start_task.append(
            '{npf}{a}{b}{c}{d}{e}{f}{g}{m}{n}{o}{p}{s}{t}{v}{w}{x}'.format(
                npf=_NODEPREP_FILE[0],
                a=' -a' if azurefile_vd else '',
                b=' -b' if util.is_not_empty(block_for_gr) else '',
                c=' -c' if azureblob_vd else '',
                d=' -d' if bs.use_shipyard_docker_image else '',
                e=' -e {}'.format(pfx.sha1) if encrypt else '',
                f=' -f' if gluster_on_compute else '',
                g=' -g {}'.format(gpu_env) if gpu_env is not None else '',
                m=' -m {}'.format(','.join(sc_args)) if util.is_not_empty(
                    sc_args) else '',
                n=' -n' if settings.can_tune_tcp(
                    pool_settings.vm_size) else '',
                o=' -o {}'.format(pool_settings.vm_configuration.offer),
                p=' -p {}'.format(bs.storage_entity_prefix)
                if bs.storage_entity_prefix else '',
                s=' -s {}'.format(pool_settings.vm_configuration.sku),
                t=' -t {}'.format(torrentflags),
                v=' -v {}'.format(__version__),
                w=' -w' if pool_settings.ssh.hpn_server_swap else '',
                x=' -x {}'.format(data._BLOBXFER_VERSION),
            )
        )
    # upload resource files
    sas_urls = storage.upload_resource_files(blob_client, config, _rflist)
    del _rflist
    # remove temporary az mount files created (best effort)
    if azureblob_vd:
        try:
            abms.unlink()
            # NOTE(review): stray no-op 'pass' left behind; harmless
            pass
        except OSError:
            pass
    if azurefile_vd:
        try:
            afms.unlink()
        except OSError:
            pass
    # digest any input data
    addlcmds = data.process_input_data(
        config, _BLOBXFER_WINDOWS_FILE if is_windows else _BLOBXFER_FILE,
        settings.pool_specification(config))
    if addlcmds is not None:
        start_task.append(addlcmds)
    del addlcmds
    # add additional start task commands (post version)
    start_task.extend(pool_settings.additional_node_prep_commands_post)
    # create pool param; target node counts and resize timeout are left to
    # the service (None) when autoscale is enabled
    pool = batchmodels.PoolAddParameter(
        id=pool_settings.id,
        virtual_machine_configuration=vmconfig,
        vm_size=pool_settings.vm_size,
        target_dedicated_nodes=(
            pool_settings.vm_count.dedicated if not asenable else None
        ),
        target_low_priority_nodes=(
            pool_settings.vm_count.low_priority if not asenable else None
        ),
        resize_timeout=pool_settings.resize_timeout if not asenable else None,
        max_tasks_per_node=pool_settings.max_tasks_per_node,
        enable_inter_node_communication=pool_settings.
        inter_node_communication_enabled,
        start_task=batchmodels.StartTask(
            command_line=util.wrap_commands_in_shell(
                start_task, windows=is_windows, wait=False),
            user_identity=batch._RUN_ELEVATED,
            wait_for_success=True,
            environment_settings=[
                batchmodels.EnvironmentSetting('LC_ALL', 'en_US.UTF-8'),
            ],
            resource_files=[],
        ),
        enable_auto_scale=asenable,
        auto_scale_formula=asformula,
        auto_scale_evaluation_interval=asei,
        metadata=[
            batchmodels.MetadataItem(
                name=settings.get_metadata_version_name(),
                value=__version__,
            ),
        ],
        task_scheduling_policy=task_scheduling_policy,
    )
    # attach the encryption cert reference; windows additionally exposes
    # the cert to tasks, not just the start task
    if encrypt:
        if is_windows:
            pool.certificate_references = [
                batchmodels.CertificateReference(
                    pfx.sha1, 'sha1',
                    visibility=[
                        batchmodels.CertificateVisibility.start_task,
                        batchmodels.CertificateVisibility.task,
                    ]
                )
            ]
        else:
            pool.certificate_references = [
                batchmodels.CertificateReference(
                    pfx.sha1, 'sha1',
                    visibility=[batchmodels.CertificateVisibility.start_task]
                )
            ]
    # attach uploaded resource files via their SAS urls
    for rf in sas_urls:
        pool.start_task.resource_files.append(
            batchmodels.ResourceFile(
                file_path=rf,
                blob_source=sas_urls[rf])
        )
    if not native:
        # pass storage credentials to the node, optionally encrypted
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SHIPYARD_STORAGE_ENV',
                crypto.encrypt_string(
                    encrypt, '{}:{}:{}'.format(
                        storage.get_storageaccount(),
                        storage.get_storageaccount_endpoint(),
                        storage.get_storageaccount_key()),
                    config)
            )
        )
    # user-supplied gpu driver url (only when not using a custom image)
    if pool_settings.gpu_driver and util.is_none_or_empty(custom_image_na):
        pool.start_task.resource_files.append(
            batchmodels.ResourceFile(
                file_path=gpu_driver.name,
                blob_source=pool_settings.gpu_driver,
                file_mode='0755')
        )
    # add any additional specified resource files
    if util.is_not_empty(pool_settings.resource_files):
        for rf in pool_settings.resource_files:
            pool.start_task.resource_files.append(
                batchmodels.ResourceFile(
                    file_path=rf.file_path,
                    blob_source=rf.blob_source,
                    file_mode=rf.file_mode,
                )
            )
    # virtual network settings
    if subnet_id is not None:
        pool.network_configuration = batchmodels.NetworkConfiguration(
            subnet_id=subnet_id,
        )
    # storage cluster settings ('#'-delimited fstab entries)
    if util.is_not_empty(sc_fstab_mounts):
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SHIPYARD_STORAGE_CLUSTER_FSTAB',
                '#'.join(sc_fstab_mounts)
            )
        )
    del sc_args
    del sc_fstab_mounts
    # custom linux mount settings ('#'-delimited fstab entries)
    if util.is_not_empty(custom_linux_fstab_mounts):
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SHIPYARD_CUSTOM_MOUNTS_FSTAB',
                '#'.join(custom_linux_fstab_mounts)
            )
        )
    del custom_linux_fstab_mounts
    # add optional environment variables
    if not native and bs.store_timing_metrics:
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting('SHIPYARD_TIMING', '1')
        )
    # add docker login settings
    pool.start_task.environment_settings.extend(
        batch.generate_docker_login_settings(config)[0])
    # image preload setting
    if util.is_not_empty(block_for_gr):
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SHIPYARD_CONTAINER_IMAGES_PRELOAD',
                block_for_gr,
            )
        )
    # singularity env vars
    if not is_windows:
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SINGULARITY_TMPDIR',
                settings.get_singularity_tmpdir(config)
            )
        )
        pool.start_task.environment_settings.append(
            batchmodels.EnvironmentSetting(
                'SINGULARITY_CACHEDIR',
                settings.get_singularity_cachedir(config)
            )
        )
    return (pool_settings, gluster_on_compute, pool)
|
|
|
|
|
|
def _construct_auto_pool_specification(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient,
    #        azure.batch.batch_service_client.BatchServiceClient,
    #        azureblob.BlockBlobService, dict) -> batchmodels.PoolSpecification
    """Construct an auto pool specification
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :rtype: batchmodels.PoolSpecification
    :return: auto pool specification mirroring the pool add parameter
    """
    # upload resource files and construct pool add parameter object
    pool_settings, gluster_on_compute, pool = _construct_pool_object(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config)
    # convert pool add parameter object to a pool specification object;
    # note that a PoolSpecification has no id (the service names auto pools)
    poolspec = batchmodels.PoolSpecification(
        vm_size=pool.vm_size,
        virtual_machine_configuration=pool.virtual_machine_configuration,
        max_tasks_per_node=pool.max_tasks_per_node,
        task_scheduling_policy=pool.task_scheduling_policy,
        resize_timeout=pool.resize_timeout,
        target_dedicated_nodes=pool.target_dedicated_nodes,
        target_low_priority_nodes=pool.target_low_priority_nodes,
        enable_auto_scale=pool.enable_auto_scale,
        auto_scale_formula=pool.auto_scale_formula,
        auto_scale_evaluation_interval=pool.auto_scale_evaluation_interval,
        enable_inter_node_communication=pool.enable_inter_node_communication,
        network_configuration=pool.network_configuration,
        start_task=pool.start_task,
        certificate_references=pool.certificate_references,
        metadata=pool.metadata,
    )
    # add auto pool env var for cascade
    # NOTE: EnvironmentSetting values are typed str by the service model;
    # pass '1' (not int 1) for consistency with other settings such as
    # SHIPYARD_TIMING and to avoid client-side serialization issues
    poolspec.start_task.environment_settings.append(
        batchmodels.EnvironmentSetting('SHIPYARD_AUTOPOOL', '1')
    )
    return poolspec
|
|
|
|
|
|
def _add_pool(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient,
    #        azure.batch.batch_service_client.BatchServiceClient,
    #        azureblob.BlockBlobService, dict) -> None
    """Add a Batch pool to account
    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient: batch_mgmt_client
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    """
    # upload resource files and construct pool add parameter object
    pool_settings, gluster_on_compute, pool = _construct_pool_object(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config)
    # ingress data to Azure Blob Storage if specified; this runs in
    # background threads concurrently with pool allocation and is joined
    # at the end of this function
    storage_threads = []
    if pool_settings.transfer_files_on_pool_creation:
        storage_threads = data.ingress_data(
            batch_client, compute_client, network_client, config, rls=None,
            kind='storage')
    # create pool
    nodes = batch.create_pool(batch_client, config, pool)
    # re-fetch the pool to read actual current/target node counts
    _pool = batch_client.pool.get(pool.id)
    pool_current_vm_count = (
        _pool.current_dedicated_nodes + _pool.current_low_priority_nodes
    )
    pool_target_vm_count = (
        _pool.target_dedicated_nodes + _pool.target_low_priority_nodes
    )
    if util.is_none_or_empty(nodes) and pool_target_vm_count > 0:
        raise RuntimeError(
            ('No nodes could be allocated for pool: {}. If the pool is '
             'comprised entirely of low priority nodes, then there may not '
             'have been enough available capacity in the region to satisfy '
             'your request. Please inspect the pool for resize errors and '
             'issue pool resize to try again.').format(pool.id))
    # set up gluster on compute if specified
    if gluster_on_compute and pool_current_vm_count > 0:
        _setup_glusterfs(
            batch_client, blob_client, config, nodes, _GLUSTERPREP_FILE,
            cmdline=None)
    # create admin user on each node if requested; user creation failures
    # are logged and tolerated rather than failing the pool add
    if pool_current_vm_count > 0:
        # NOTE(review): rdp user add is attempted unconditionally here —
        # presumably a no-op/skip for non-Windows pools; verify in batch
        # module
        try:
            batch.add_rdp_user(batch_client, config, nodes)
        except Exception as e:
            logger.exception(e)
        try:
            batch.add_ssh_user(batch_client, config, nodes)
        except Exception as e:
            logger.exception(e)
            logger.error(
                'Could not add SSH users to nodes. Please ensure ssh-keygen '
                'is available in your PATH or cwd. Skipping data ingress if '
                'specified.')
        else:
            # only reached when ssh user creation succeeded: shared-fs data
            # ingress (which needs ssh) and remote login settings listing
            rls = None
            # ingress data to shared fs if specified
            if pool_settings.transfer_files_on_pool_creation:
                if rls is None:
                    rls = batch.get_remote_login_settings(
                        batch_client, config, nodes)
                data.ingress_data(
                    batch_client, compute_client, network_client, config,
                    rls=rls, kind='shared',
                    total_vm_count=pool_current_vm_count)
            # log remote login settings, but only for small pools to avoid
            # excessive output
            if rls is None:
                if pool_current_vm_count <= 16:
                    batch.get_remote_login_settings(
                        batch_client, config, nodes)
                else:
                    logger.info(
                        'Not listing remote login settings due to VM count. '
                        'If you need a list of remote login settings for all '
                        'nodes in the pool, issue the "pool nodes grls" '
                        'command.')
    # wait for storage ingress processes
    data.wait_for_storage_threads(storage_threads)
|
|
|
|
|
|
def _setup_glusterfs(
        batch_client, blob_client, config, nodes, shell_script, cmdline=None):
    # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService, dict,
    #        List[batchmodels.ComputeNode], str, str) -> None
    """Setup glusterfs via multi-instance task
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param list nodes: list of nodes
    :param str shell_script: glusterfs setup script to use
    :param str cmdline: coordination cmdline
    :raises RuntimeError: if no glusterfs volume is configured or setup
        does not complete successfully on every node
    """
    # get volume type/options from the first gluster-on-compute shared
    # data volume found in the configuration
    voltype = None
    volopts = None
    sdv = settings.global_resources_shared_data_volumes(config)
    for sdvkey in sdv:
        try:
            if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey):
                voltype = settings.gluster_volume_type(sdv, sdvkey)
                volopts = settings.gluster_volume_options(sdv, sdvkey)
                break
        except KeyError:
            pass
    if voltype is None:
        raise RuntimeError('glusterfs volume not defined')
    pool_id = settings.pool_id(config)
    # unique throwaway job hosting the multi-instance setup task
    job_id = 'shipyard-glusterfs-{}'.format(uuid.uuid4())
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id),
    )
    # create coordination command line: runs the setup script on every
    # instance of the multi-instance task
    if cmdline is None:
        tempdisk = settings.temp_disk_mountpoint(config)
        cmdline = util.wrap_commands_in_shell([
            '$AZ_BATCH_TASK_DIR/{} {} {}'.format(
                shell_script[0], voltype.lower(), tempdisk)])
    # create application command line: verifies the success sentinel file
    # and then applies any configured volume options on the primary
    appcmd = [
        '[[ -f $AZ_BATCH_TASK_WORKING_DIR/.glusterfs_success ]] || exit 1',
    ]
    if volopts is not None:
        for vo in volopts:
            appcmd.append('gluster volume set {} {}'.format(
                settings.get_gluster_default_volume_name(), vo))
    # upload script
    sas_urls = storage.upload_resource_files(
        blob_client, config, [shell_script])
    # get pool current dedicated; gluster spans only dedicated nodes
    pool = batch_client.pool.get(pool_id)
    batchtask = batchmodels.TaskAddParameter(
        id='gluster-setup',
        multi_instance_settings=batchmodels.MultiInstanceSettings(
            number_of_instances=pool.current_dedicated_nodes,
            coordination_command_line=cmdline,
            common_resource_files=[
                batchmodels.ResourceFile(
                    file_path=shell_script[0],
                    blob_source=sas_urls[shell_script[0]],
                    file_mode='0755'),
            ],
        ),
        command_line=util.wrap_commands_in_shell(appcmd),
        user_identity=batch._RUN_ELEVATED,
    )
    # add job and task
    batch_client.job.add(job)
    batch_client.task.add(job_id=job_id, task=batchtask)
    logger.debug(
        'waiting for glusterfs setup task {} in job {} to complete'.format(
            batchtask.id, job_id))
    # wait for gluster fs setup task to complete
    # NOTE(review): polls with no timeout; blocks forever if the task
    # never reaches completed state
    while True:
        batchtask = batch_client.task.get(job_id, batchtask.id)
        if batchtask.state == batchmodels.TaskState.completed:
            break
        time.sleep(1)
    # ensure all nodes have glusterfs success file
    if nodes is None:
        nodes = batch_client.compute_node.list(pool_id)
    success = True
    for node in nodes:
        try:
            # probe the sentinel file on the node filesystem; absence
            # raises a batch error
            batch_client.file.get_properties_from_compute_node(
                pool_id, node.id,
                ('workitems/{}/job-1/gluster-setup/wd/'
                 '.glusterfs_success').format(job_id))
        except batchmodels.BatchErrorException:
            logger.error('gluster success file absent on node {}'.format(
                node.id))
            success = False
            break
    # delete job (cleanup happens regardless of success)
    batch_client.job.delete(job_id)
    if not success:
        raise RuntimeError('glusterfs setup failed')
    logger.info(
        'glusterfs setup task {} in job {} completed'.format(
            batchtask.id, job_id))
|
|
|
|
|
|
def _update_container_images_over_ssh(batch_client, config, pool, cmd):
    # type: (batchsc.BatchServiceClient, dict, batchmodels.CloudPool,
    #        list) -> None
    """Update docker images in pool over ssh
    Fans out the given command to every compute node via ssh, draining in
    batches of 40 concurrent processes, and raises if any invocation
    returns a nonzero exit code.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param batchmodels.CloudPool pool: cloud pool
    :param list cmd: command
    :raises ValueError: if no SSH username is configured
    :raises RuntimeError: if the SSH key is missing or any node update fails
    """
    _pool = settings.pool_settings(config)
    # resolve ssh credentials; fall back to the generated key pair
    username = _pool.ssh.username
    if util.is_none_or_empty(username):
        raise ValueError(
            'cannot update container images without an SSH username')
    ssh_private_key = _pool.ssh.ssh_private_key
    if ssh_private_key is None:
        ssh_private_key = pathlib.Path(
            _pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix())
    if not ssh_private_key.exists():
        raise RuntimeError('SSH private key file not found at: {}'.format(
            ssh_private_key))
    # fold the update steps into a single elevated shell invocation
    command = ['sudo', '/bin/bash -c "{}"'.format(' && '.join(cmd))]
    if settings.verbose(config):
        logger.debug('executing command: {}'.format(command))

    def _drain(batch):
        # wait for a batch of ssh processes; True if any exited nonzero
        logger.debug('waiting for {} update processes to complete'.format(
            len(batch)))
        exit_codes = util.subprocess_wait_all(batch, poll=False)
        return any([rc != 0 for rc in exit_codes])

    # launch one async ssh process per node, draining every 40
    had_failure = False
    pending = []
    for node in batch_client.compute_node.list(pool.id):
        rls = batch_client.compute_node.get_remote_login_settings(
            pool.id, node.id)
        pending.append(crypto.connect_or_exec_ssh_command(
            rls.remote_login_ip_address, rls.remote_login_port,
            ssh_private_key, username, sync=False, command=command))
        if len(pending) >= 40:
            had_failure = _drain(pending) or had_failure
            pending = []
    # drain any remaining processes
    if len(pending) > 0:
        had_failure = _drain(pending) or had_failure
        pending = []
    if had_failure:
        raise RuntimeError(
            'failures detected updating container image on pool: {}'.format(
                pool.id))
    logger.info('container image update completed for pool: {}'.format(
        pool.id))
|
|
|
|
|
|
def _update_container_images(
        batch_client, config, docker_image=None, docker_image_digest=None,
        singularity_image=None, force_ssh=False):
    # type: (batchsc.BatchServiceClient, dict, str, str, str, bool) -> None
    """Update container images in pool

    Pulls fresh Docker and/or Singularity images on every compute node,
    either via an elevated Batch job/task or over SSH (forced for native
    mode pools, low-priority nodes, windows-incompatible paths, or when
    requested via ``force_ssh``).

    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str docker_image: docker image to update; if unset, all docker
        images in the global resources configuration are updated
    :param str docker_image_digest: digest to update to
    :param str singularity_image: singularity image to update; if unset, all
        singularity images in the global resources configuration are updated
    :param bool force_ssh: force update over SSH
    :raises RuntimeError: if the pool uses peer-to-peer image distribution,
        for invalid windows combinations, or if the update job fails
    """
    # first check that peer-to-peer is disabled for pool
    pool_id = settings.pool_id(config)
    try:
        if settings.data_replication_settings(config).peer_to_peer.enabled:
            raise RuntimeError(
                'cannot update container images for a pool with peer-to-peer '
                'image distribution')
    except KeyError:
        # no data replication settings configured
        pass
    native = settings.is_native_docker_pool(config)
    if native and not force_ssh:
        logger.debug('forcing update via SSH due to native mode')
        force_ssh = True
    # if image is not specified use images from global config
    singularity_images = None
    if util.is_none_or_empty(docker_image):
        docker_images = settings.global_resources_docker_images(config)
    else:
        # log warning if it doesn't exist in global resources
        if docker_image not in settings.global_resources_docker_images(config):
            logger.warning(
                ('docker image {} is not specified as a global resource '
                 'for pool {}').format(docker_image, pool_id))
        if docker_image_digest is None:
            docker_images = [docker_image]
        else:
            # pin the pull to a specific digest
            docker_images = ['{}@{}'.format(docker_image, docker_image_digest)]
    if util.is_none_or_empty(singularity_image):
        singularity_images = settings.global_resources_singularity_images(
            config)
    else:
        # log warning if it doesn't exist in global resources
        if (singularity_image not in
                settings.global_resources_singularity_images(config)):
            logger.warning(
                ('singularity image {} is not specified as a global resource '
                 'for pool {}').format(singularity_image, pool_id))
        singularity_images = [singularity_image]
    if (util.is_none_or_empty(docker_images) and
            util.is_none_or_empty(singularity_images)):
        logger.error('no images detected or specified to update')
        return
    # get pool current dedicated
    pool = batch_client.pool.get(pool_id)
    # check pool current vms is > 0. There is no reason to run updateimages
    # if pool has no nodes in it. When the pool is resized up, the nodes
    # will always fetch either :latest if untagged or the latest :tag if
    # updated in the upstream registry
    if (pool.current_dedicated_nodes == 0 and
            pool.current_low_priority_nodes == 0):
        logger.warning(
            ('not executing updateimages command as the current number of '
             'compute nodes is zero for pool {}').format(pool_id))
        return
    # force ssh on some paths
    if not force_ssh:
        if pool.current_low_priority_nodes > 0:
            logger.debug('forcing update via SSH due to low priority nodes')
            force_ssh = True
        if (pool.current_dedicated_nodes > 1 and
                not pool.enable_inter_node_communication):
            # multi-instance tasks require inter-node communication, so a
            # Batch task cannot coordinate the update across these nodes
            logger.debug(
                'forcing update via SSH due to non-internode communicaton '
                'enabled pool')
            force_ssh = True
    # check pool metadata version
    if util.is_none_or_empty(pool.metadata):
        logger.warning('pool version metadata not present')
    else:
        for md in pool.metadata:
            if (md.name == settings.get_metadata_version_name() and
                    md.value != __version__):
                logger.warning(
                    'pool version metadata mismatch: pool={} cli={}'.format(
                        md.value, __version__))
                break
    # perform windows compat checks
    is_windows = settings.is_windows_pool(config)
    if is_windows:
        if force_ssh:
            raise RuntimeError('cannot update images via SSH on windows')
        if util.is_not_empty(singularity_images):
            raise RuntimeError(
                'invalid configuration: windows pool with singularity images')
    # create coordination command line
    # 1. log in again in case of cred expiry
    # 2. pull images with respect to registry
    # 3. tag images that are in a private registry
    # 4. prune docker images with no tag
    taskenv, coordcmd = batch.generate_docker_login_settings(config, force_ssh)
    if util.is_not_empty(docker_images):
        coordcmd.extend(['docker pull {}'.format(x) for x in docker_images])
        # remove dangling (untagged) images left behind by the pull
        coordcmd.append(
            'docker images --filter dangling=true -q --no-trunc | '
            'xargs --no-run-if-empty docker rmi')
    if util.is_not_empty(singularity_images):
        coordcmd.extend([
            'export SINGULARITY_TMPDIR={}'.format(
                settings.get_singularity_tmpdir(config)),
            'export SINGULARITY_CACHEDIR={}'.format(
                settings.get_singularity_cachedir(config)),
        ])
        coordcmd.extend(
            ['singularity pull -F {}'.format(x) for x in singularity_images]
        )
        # cache dir is written as root; hand it back to the batch user
        coordcmd.append('chown -R _azbatch:_azbatchgrp {}'.format(
            settings.get_singularity_cachedir(config)))
    if force_ssh:
        _update_container_images_over_ssh(batch_client, config, pool, coordcmd)
        return
    # sentinel file marks per-node success for later verification
    if is_windows:
        coordcmd.append('copy /y nul .update_images_success')
    else:
        coordcmd.append('touch .update_images_success')
    # update taskenv for Singularity
    taskenv.append(
        batchmodels.EnvironmentSetting(
            'SINGULARITY_TMPDIR',
            settings.get_singularity_tmpdir(config)
        )
    )
    taskenv.append(
        batchmodels.EnvironmentSetting(
            'SINGULARITY_CACHEDIR',
            settings.get_singularity_cachedir(config)
        )
    )
    coordcmd = util.wrap_commands_in_shell(coordcmd, windows=is_windows)
    # create job for update
    job_id = 'shipyard-updateimages-{}'.format(uuid.uuid4())
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id),
    )
    # create task
    batchtask = batchmodels.TaskAddParameter(
        id='update-container-images',
        command_line=coordcmd,
        environment_settings=taskenv,
        user_identity=batch._RUN_ELEVATED,
    )
    # create multi-instance task for pools with more than 1 node
    if pool.current_dedicated_nodes > 1:
        batchtask.multi_instance_settings = batchmodels.MultiInstanceSettings(
            number_of_instances=pool.current_dedicated_nodes,
            coordination_command_line=coordcmd,
        )
    # create application command line: for multi-instance tasks the real
    # work runs in the coordination command, so the app command only
    # verifies the success sentinel exists
    if is_windows:
        appcmd = util.wrap_commands_in_shell([
            'if not exist %AZ_BATCH_TASK_WORKING_DIR%\\'
            '.update_images_success exit 1'
        ], windows=is_windows)
    else:
        appcmd = util.wrap_commands_in_shell([
            '[[ -f $AZ_BATCH_TASK_WORKING_DIR/.update_images_success ]] '
            '|| exit 1'
        ], windows=is_windows)
    batchtask.command_line = appcmd
    # add job and task
    batch_client.job.add(job)
    batch_client.task.add(job_id=job_id, task=batchtask)
    logger.debug(
        ('waiting for update container images task {} in job {} '
         'to complete').format(batchtask.id, job_id))
    # wait for task to complete
    # NOTE(review): polls forever with no timeout; a task stuck in a
    # non-completed state blocks this call indefinitely
    while True:
        batchtask = batch_client.task.get(job_id, batchtask.id)
        if batchtask.state == batchmodels.TaskState.completed:
            break
        time.sleep(1)
    # ensure all nodes have success file if multi-instance
    success = True
    if pool.current_dedicated_nodes > 1:
        if is_windows:
            sep = '\\'
        else:
            sep = '/'
        # path of the sentinel within the node's Batch directory tree
        uis_file = sep.join(
            ('workitems', job_id, 'job-1', batchtask.id, 'wd',
             '.update_images_success')
        )
        nodes = batch_client.compute_node.list(pool_id)
        for node in nodes:
            try:
                batch_client.file.get_properties_from_compute_node(
                    pool_id, node.id, uis_file)
            except batchmodels.BatchErrorException:
                logger.error(
                    'update images success file absent on node {}'.format(
                        node.id))
                success = False
                break
    else:
        task = batch_client.task.get(job_id, batchtask.id)
        if task.execution_info is None or task.execution_info.exit_code != 0:
            success = False
            # stream stderr to console
            batch.stream_file_and_wait_for_task(
                batch_client, config,
                '{},{},stderr.txt'.format(batchtask.id, job_id))
    # delete job
    batch_client.job.delete(job_id)
    if not success:
        raise RuntimeError('update container images job failed')
    logger.info(
        'update container images task {} in job {} completed'.format(
            batchtask.id, job_id))
|
|
|
|
|
|
def _list_docker_images(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """List Docker images in pool over ssh

    Runs ``docker images`` on every compute node via SSH (in batches of at
    most 40 concurrent connections), then logs the set of images common to
    all nodes and warns about images present on only some nodes.

    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :raises RuntimeError: for windows pools, a missing SSH private key, or
        a non-zero exit from any remote command
    :raises ValueError: if no SSH username is configured
    """
    _pool = settings.pool_settings(config)
    pool = batch_client.pool.get(_pool.id)
    if (pool.current_dedicated_nodes == 0 and
            pool.current_low_priority_nodes == 0):
        logger.warning('pool {} has no compute nodes'.format(pool.id))
        return
    is_windows = settings.is_windows_pool(config)
    # TODO temporarily disable listimages with windows pools
    if is_windows:
        raise RuntimeError(
            'listing images is currently not supported for windows pools')
    # get ssh settings
    username = _pool.ssh.username
    if util.is_none_or_empty(username):
        raise ValueError('cannot list docker images without an SSH username')
    ssh_private_key = _pool.ssh.ssh_private_key
    if ssh_private_key is None:
        # fall back to the key generated alongside the pool
        ssh_private_key = pathlib.Path(
            _pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix())
    if not ssh_private_key.exists():
        raise RuntimeError('SSH private key file not found at: {}'.format(
            ssh_private_key))
    # iterate through all nodes
    nodes = batch_client.compute_node.list(pool.id)
    procs = {}  # node id -> in-flight ssh subprocess
    stdout = {}  # node id -> captured docker images output lines
    failures = False
    for node in nodes:
        rls = batch_client.compute_node.get_remote_login_settings(
            pool.id, node.id)
        procs[node.id] = crypto.connect_or_exec_ssh_command(
            rls.remote_login_ip_address, rls.remote_login_port,
            ssh_private_key, username, sync=False,
            command=[
                'sudo', 'docker', 'images', '--format',
                '"{{.ID}} {{.Repository}}:{{.Tag}}"'
            ])
        # cap concurrent ssh sessions; drain the batch before continuing
        if len(procs) >= 40:
            logger.debug('waiting for {} processes to complete'.format(
                len(procs)))
            for key in procs:
                stdout[key] = procs[key].communicate()[0].decode(
                    'utf8').split('\n')
            rcs = util.subprocess_wait_all(list(procs.values()))
            if any([x != 0 for x in rcs]):
                failures = True
            procs.clear()
            del rcs
    # drain any remaining in-flight sessions
    if len(procs) > 0:
        logger.debug('waiting for {} processes to complete'.format(
            len(procs)))
        for key in procs:
            stdout[key] = procs[key].communicate()[0].decode(
                'utf8').split('\n')
        rcs = util.subprocess_wait_all(list(procs.values()))
        if any([x != 0 for x in rcs]):
            failures = True
        procs.clear()
        del rcs
    if failures:
        raise RuntimeError(
            'failures retrieving docker images on pool: {}'.format(
                pool.id))
    # process stdout
    node_images = {}  # node id -> set of image ids on that node
    all_images = {}  # image id -> first seen repository:tag
    for key in stdout:
        node_images[key] = set()
        for out in stdout[key]:
            if util.is_not_empty(out):
                # each line is "<image id> <repository>:<tag>"
                dec = out.split()
                # exclude batch shipyard system images from the listing
                if (not dec[1].startswith('alfpark/batch-shipyard') and
                        not dec[1].startswith('alfpark/blobxfer')):
                    node_images[key].add(dec[0])
                    if dec[0] not in all_images:
                        all_images[dec[0]] = dec[1]
    # find set intersection among all nodes
    intersecting_images = set.intersection(*list(node_images.values()))
    logger.info('Common Docker images across all nodes in pool {}:{}{}'.format(
        pool.id,
        os.linesep,
        os.linesep.join(
            ['{} {}'.format(key, all_images[key])
             for key in intersecting_images])
    ))
    # find mismatched images on nodes
    for node in node_images:
        images = set(node_images[node])
        diff = images.difference(intersecting_images)
        if len(diff) > 0:
            logger.warning('Docker images present only on node {}:{}{}'.format(
                node, os.linesep,
                os.linesep.join(
                    ['{} {}'.format(key, all_images[key])
                     for key in diff])
            ))
|
|
|
|
|
|
def _adjust_settings_for_pool_creation(config):
    # type: (dict) -> None
    """Adjust settings for pool creation

    Validates the pool configuration against supported platform images,
    GPU/RDMA requirements, shared data volume constraints and windows
    limitations, and mutates the configuration in place where settings
    can be auto-corrected (e.g. forcing the shipyard docker image or
    inter-node communication).

    :param dict config: configuration dict
    :raises ValueError: on any unsupported or inconsistent combination of
        pool settings
    """
    # get settings
    pool = settings.pool_settings(config)
    publisher = settings.pool_publisher(config, lower=True)
    offer = settings.pool_offer(config, lower=True)
    sku = settings.pool_sku(config, lower=True)
    node_agent = settings.pool_custom_image_node_agent(config)
    if util.is_not_empty(node_agent) and util.is_not_empty(sku):
        raise ValueError(
            'cannot specify both a platform_image and a custom_image in the '
            'pool specification')
    is_windows = settings.is_windows_pool(config)
    bs = settings.batch_shipyard_settings(config)
    # enforce publisher/offer/sku restrictions
    allowed = False
    shipyard_container_required = True
    # oracle linux is not supported due to UEKR4 requirement
    # NOTE(review): the sku comparisons below are lexicographic string
    # comparisons (e.g. '10' < '8'); they only behave as version checks
    # for the single-digit major versions currently published - confirm
    # before new skus appear
    if publisher == 'canonical':
        if offer == 'ubuntuserver':
            if sku.startswith('14.04'):
                allowed = True
            elif sku.startswith('16.04'):
                allowed = True
                shipyard_container_required = False
    elif publisher == 'credativ':
        if offer == 'debian':
            if sku >= '8':
                allowed = True
    elif publisher == 'openlogic':
        if offer.startswith('centos'):
            if sku >= '7':
                allowed = True
    elif publisher == 'redhat':
        if offer == 'rhel':
            if sku >= '7':
                allowed = True
    elif publisher == 'suse':
        if offer.startswith('sles'):
            if sku >= '12-sp1':
                allowed = True
        elif offer == 'opensuse-leap':
            if sku >= '42':
                allowed = True
    elif publisher == 'microsoftwindowsserver':
        if offer == 'windowsserver':
            if sku == '2016-datacenter-with-containers':
                allowed = True
    # check if allowed for gpu (if gpu vm size)
    if allowed:
        allowed = settings.gpu_configuration_check(
            config, vm_size=pool.vm_size)
    if not allowed and util.is_none_or_empty(node_agent):
        raise ValueError(
            ('unsupported Docker (and/or GPU) Host VM Config, publisher={} '
             'offer={} sku={} vm_size={}').format(
                 publisher, offer, sku, pool.vm_size))
    # ensure HPC offers are matched with RDMA sizes
    if (not is_windows and (
            (offer == 'centos-hpc' or offer == 'sles-hpc') and
            not settings.is_rdma_pool(pool.vm_size))):
        raise ValueError(
            ('cannot allocate an HPC VM config of publisher={} offer={} '
             'sku={} with a non-RDMA vm_size={}').format(
                 publisher, offer, sku, pool.vm_size))
    # compute total vm count
    pool_total_vm_count = pool.vm_count.dedicated + pool.vm_count.low_priority
    # adjust for shipyard container requirement
    if (not bs.use_shipyard_docker_image and
            (shipyard_container_required or util.is_not_empty(node_agent))):
        settings.set_use_shipyard_docker_image(config, True)
        logger.debug(
            ('forcing shipyard docker image to be used due to '
             'VM config, publisher={} offer={} sku={}').format(
                 publisher, offer, sku))
    # re-read pool and data replication settings
    pool = settings.pool_settings(config)
    dr = settings.data_replication_settings(config)
    native = settings.is_native_docker_pool(
        config, vm_config=pool.vm_configuration)
    # ensure singularity images are not specified for native pools
    if native:
        images = settings.global_resources_singularity_images(config)
        if util.is_not_empty(images):
            raise ValueError(
                'cannot specify a native container pool with Singularity '
                'images as global resources')
    # ensure settings p2p/as/internode settings are compatible
    if dr.peer_to_peer.enabled:
        if native:
            raise ValueError(
                'cannot enable peer-to-peer and native container pools')
        if settings.is_pool_autoscale_enabled(config, pas=pool.autoscale):
            raise ValueError('cannot enable peer-to-peer and autoscale')
        # NOTE(review): this guard fires when inter-node communication is
        # ALREADY enabled, yet the log message says "force enabling" -
        # likely intended as `not pool.inter_node_communication_enabled`;
        # confirm against upstream before changing
        if pool.inter_node_communication_enabled:
            logger.warning(
                'force enabling inter-node communication due to peer-to-peer '
                'transfer')
            settings.set_inter_node_communication_enabled(config, True)
    # hpn-ssh can only be used for Ubuntu currently
    try:
        if (pool.ssh.hpn_server_swap and
                ((publisher != 'canonical' and offer != 'ubuntuserver') or
                 util.is_not_empty(node_agent))):
            logger.warning('cannot enable HPN SSH swap on {} {} {}'.format(
                publisher, offer, sku))
            settings.set_hpn_server_swap(config, False)
    except KeyError:
        # no ssh settings configured
        pass
    # force disable block for global resources if ingressing data
    if (pool.transfer_files_on_pool_creation and
            pool.block_until_all_global_resources_loaded):
        logger.warning(
            'disabling block until all global resources loaded with '
            'transfer files on pool creation enabled')
        settings.set_block_until_all_global_resources_loaded(config, False)
    # re-read pool settings
    pool = settings.pool_settings(config)
    # ensure internode is not enabled for mix node pools
    if (pool.inter_node_communication_enabled and
            pool.vm_count.dedicated > 0 and pool.vm_count.low_priority > 0):
        raise ValueError(
            'inter node communication cannot be enabled with both '
            'dedicated and low priority nodes')
    # check shared data volume settings
    try:
        num_gluster = 0
        sdv = settings.global_resources_shared_data_volumes(config)
        for sdvkey in sdv:
            if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey):
                if is_windows:
                    raise ValueError(
                        'glusterfs on compute is not supported on windows')
                if settings.is_pool_autoscale_enabled(
                        config, pas=pool.autoscale):
                    raise ValueError(
                        'glusterfs on compute cannot be installed on an '
                        'autoscale-enabled pool')
                if not pool.inter_node_communication_enabled:
                    # do not modify value and proceed since this interplays
                    # with p2p settings, simply raise exception and force
                    # user to reconfigure
                    raise ValueError(
                        'inter node communication in pool configuration '
                        'must be enabled for glusterfs on compute')
                if pool.vm_count.low_priority > 0:
                    raise ValueError(
                        'glusterfs on compute cannot be installed on pools '
                        'with low priority nodes')
                if pool.vm_count.dedicated <= 1:
                    raise ValueError(
                        'vm_count dedicated should exceed 1 for glusterfs '
                        'on compute')
                if pool.max_tasks_per_node > 1:
                    raise ValueError(
                        'max_tasks_per_node cannot exceed 1 for glusterfs '
                        'on compute')
                num_gluster += 1
                try:
                    if settings.gluster_volume_type(sdv, sdvkey) != 'replica':
                        raise ValueError(
                            'only replicated GlusterFS volumes are '
                            'currently supported')
                except KeyError:
                    # volume type not specified; default accepted
                    pass
            elif settings.is_shared_data_volume_storage_cluster(sdv, sdvkey):
                if is_windows:
                    raise ValueError(
                        'storage cluster mounting is not supported on windows')
            elif settings.is_shared_data_volume_azure_blob(sdv, sdvkey):
                if is_windows:
                    raise ValueError(
                        'azure blob mounting is not supported on windows')
                if native:
                    raise ValueError(
                        'azure blob mounting is not supported on native '
                        'container pools')
                if offer == 'ubuntuserver':
                    # lexicographic compare; see sku comparison note above
                    if sku < '16.04-lts':
                        raise ValueError(
                            ('azure blob mounting is not supported '
                             'on publisher={} offer={} sku={}').format(
                                 publisher, offer, sku))
                elif not offer.startswith('centos'):
                    raise ValueError(
                        ('azure blob mounting is not supported '
                         'on publisher={} offer={} sku={}').format(
                             publisher, offer, sku))
            elif settings.is_shared_data_volume_custom_linux_mount(
                    sdv, sdvkey):
                if is_windows:
                    raise ValueError(
                        'custom linux mounting is not supported on windows')
        if num_gluster > 1:
            raise ValueError(
                'cannot create more than one GlusterFS on compute volume '
                'per pool')
    except KeyError:
        # no shared data volumes configured
        pass
    # check data ingress on pool creation on windows
    if is_windows and pool.transfer_files_on_pool_creation:
        raise ValueError(
            'cannot transfer files on pool creation to windows compute nodes')
    # check singularity images are not present for windows
    if (is_windows and util.is_not_empty(
            settings.global_resources_singularity_images(config))):
        raise ValueError('cannot deploy Singularity images on windows pools')
    # check pool count of 0 and remote login
    if pool_total_vm_count == 0:
        if is_windows:
            # TODO RDP check
            pass
        else:
            if util.is_not_empty(pool.ssh.username):
                logger.warning('cannot add SSH user with zero target nodes')
    # ensure unusable recovery is not enabled for custom image
    if (pool.attempt_recovery_on_unusable and
            not settings.is_platform_image(
                config, vm_config=pool.vm_configuration)):
        logger.warning(
            'override attempt recovery on unusable due to custom image')
        settings.set_attempt_recovery_on_unusable(config, False)
    # TODO temporarily disable credential encryption with windows
    if is_windows and settings.batch_shipyard_encryption_enabled(config):
        raise ValueError(
            'cannot enable credential encryption with windows pools')
|
|
|
|
|
|
def _check_settings_for_auto_pool(config):
    # type: (dict) -> None
    """Check settings for autopool

    Validates that the configuration is compatible with auto-pool mode
    and warns about settings that cannot take effect.

    :param dict config: configuration dict
    :raises ValueError: if GlusterFS on compute is configured or if data
        ingress on pool creation is enabled
    """
    # check glusterfs on compute
    try:
        sdv = settings.global_resources_shared_data_volumes(config)
        for sdvkey in sdv:
            if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey):
                raise ValueError(
                    'GlusterFS on compute is not possible with autopool')
            # NOTE(review): this break means only the first shared data
            # volume is examined - TODO confirm this is intentional
            break
    except KeyError:
        # no shared data volumes configured
        pass
    # get settings
    pool = settings.pool_settings(config)
    # check local data movement to pool
    if pool.transfer_files_on_pool_creation:
        raise ValueError('Cannot ingress data on pool creation with autopool')
    # check ssh
    if util.is_not_empty(pool.ssh.username):
        logger.warning('cannot add SSH user with autopool')
|
|
|
|
|
|
def _check_resource_client(resource_client):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient) -> None
    """Ensure a usable resource management client was supplied.

    :param resource_client: resource management client to validate
    :raises RuntimeError: if the client was never constructed (is None)
    """
    if resource_client is not None:
        return
    raise RuntimeError(
        'resource management client is invalid, ensure you have '
        'specified proper "management" credentials')
|
|
|
|
|
|
def _check_compute_client(compute_client):
    # type: (azure.mgmt.resource.compute.ComputeManagementClient) -> None
    """Ensure a usable compute management client was supplied.

    :param compute_client: compute management client to validate
    :raises RuntimeError: if the client was never constructed (is None)
    """
    if compute_client is not None:
        return
    raise RuntimeError(
        'compute management client is invalid, ensure you have '
        'specified proper "management" credentials')
|
|
|
|
|
|
def _check_network_client(network_client):
    # type: (azure.mgmt.resource.network.NetworkManagementClient) -> None
    """Ensure a usable network management client was supplied.

    :param network_client: network management client to validate
    :raises RuntimeError: if the client was never constructed (is None)
    """
    if network_client is not None:
        return
    raise RuntimeError(
        'network management client is invalid, ensure you have '
        'specified proper "management" credentials')
|
|
|
|
|
|
def _check_keyvault_client(keyvault_client):
    # type: (azure.keyvault.KeyVaultClient) -> None
    """Ensure a usable keyvault client was supplied.

    :param keyvault_client: keyvault client to validate
    :raises RuntimeError: if the client was never constructed (is None)
    """
    if keyvault_client is not None:
        return
    raise RuntimeError(
        'keyvault client is invalid, ensure you have specified '
        'proper "keyvault" credentials')
|
|
|
|
|
|
def _check_batch_client(batch_client):
    # type: (batchsc.BatchServiceClient) -> None
    """Ensure a usable batch service client was supplied.

    :param batch_client: batch service client to validate
    :raises RuntimeError: if the client was never constructed (is None)
    """
    if batch_client is not None:
        return
    raise RuntimeError(
        'batch client is invalid, ensure you have specified '
        'proper "batch" credentials')
|
|
|
|
|
|
def action_fs_disks_add(resource_client, compute_client, config):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient, dict) -> None
    """Action: Fs Disks Add.

    Validates the management clients and provisions the managed disks
    described by the configuration.

    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param dict config: configuration dict
    """
    _check_resource_client(resource_client)
    _check_compute_client(compute_client)
    remotefs.create_managed_disks(resource_client, compute_client, config)
|
|
|
|
|
|
def action_fs_disks_del(
        compute_client, config, name, resource_group, all, wait):
    # type: (azure.mgmt.compute.ComputeManagementClient, dict, str,
    #        str, bool, bool) -> None
    """Action: Fs Disks Del.

    Validates the compute client and deletes managed disks, optionally
    every disk in the resource group.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param dict config: configuration dict
    :param str name: disk name
    :param str resource_group: resource group
    :param bool all: delete all disks in the resource group
    :param bool wait: block until the delete operations complete
    """
    _check_compute_client(compute_client)
    remotefs.delete_managed_disks(
        compute_client, config, name, resource_group, all, wait,
        confirm_override=False)
|
|
|
|
|
|
def action_fs_disks_list(
        compute_client, config, resource_group, restrict_scope):
    # type: (azure.mgmt.compute.ComputeManagementClient, dict, str,
    #        bool) -> None
    """Action: Fs Disks List.

    Validates the compute client and lists managed disks.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param dict config: configuration dict
    :param str resource_group: resource group to list disks from
    :param bool restrict_scope: restrict the listing scope to the
        configuration
    """
    _check_compute_client(compute_client)
    remotefs.list_disks(compute_client, config, resource_group, restrict_scope)
|
|
|
|
|
|
def action_fs_cluster_add(
        resource_client, compute_client, network_client, blob_client,
        config, storage_cluster_id):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.storage.blob.BlockBlobService, dict, str) -> None
    """Action: Fs Cluster Add.

    Validates all required management clients, binds the remotefs storage
    container to the cluster id, and creates the storage cluster.

    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    """
    _check_resource_client(resource_client)
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    storage.set_storage_remotefs_container(storage_cluster_id)
    remotefs.create_storage_cluster(
        resource_client, compute_client, network_client, blob_client, config,
        storage_cluster_id, _REMOTEFSPREP_FILE[0], _ALL_REMOTEFS_FILES)
|
|
|
|
|
|
def action_fs_cluster_resize(
        compute_client, network_client, blob_client, config,
        storage_cluster_id):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.storage.blob.BlockBlobService, dict, str) -> None
    """Action: Fs Cluster Resize.

    Validates the management clients and resizes the storage cluster.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    """
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    remotefs.resize_storage_cluster(
        compute_client, network_client, blob_client, config,
        storage_cluster_id, _REMOTEFSPREP_FILE[0], _REMOTEFSADDBRICK_FILE[0],
        _ALL_REMOTEFS_FILES)
|
|
|
|
|
|
def action_fs_cluster_del(
        resource_client, compute_client, network_client, blob_client, config,
        storage_cluster_id, delete_all_resources, delete_data_disks,
        delete_virtual_network, generate_from_prefix, wait):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.storage.blob.BlockBlobService, dict, str, bool, bool,
    #        bool, bool, bool) -> None
    """Action: Fs Cluster Del

    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param bool delete_all_resources: delete all resources
    :param bool delete_data_disks: delete data disks
    :param bool delete_virtual_network: delete virtual network
    :param bool generate_from_prefix: generate resources from hostname prefix
    :param bool wait: wait for deletion to complete
    :raises ValueError: if generate_from_prefix is combined with any
        delete_* option
    """
    _check_resource_client(resource_client)
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    # generate_from_prefix is incompatible with the targeted delete options
    if (generate_from_prefix and
            (delete_all_resources or delete_data_disks or
             delete_virtual_network)):
        raise ValueError(
            'Cannot specify generate_from_prefix and a delete_* option')
    storage.set_storage_remotefs_container(storage_cluster_id)
    remotefs.delete_storage_cluster(
        resource_client, compute_client, network_client, blob_client, config,
        storage_cluster_id, delete_data_disks=delete_data_disks,
        delete_virtual_network=delete_virtual_network,
        delete_resource_group=delete_all_resources,
        generate_from_prefix=generate_from_prefix, wait=wait)
|
|
|
|
|
|
def action_fs_cluster_expand(
        compute_client, network_client, config, storage_cluster_id, rebalance):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict, str,
    #        bool) -> None
    """Action: Fs Cluster Expand.

    Validates the management clients, expands the storage cluster, and on
    success reports its detailed status.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param bool rebalance: rebalance the filesystem after expansion
    """
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    expanded = remotefs.expand_storage_cluster(
        compute_client, network_client, config, storage_cluster_id,
        _REMOTEFSPREP_FILE[0], rebalance)
    if expanded:
        action_fs_cluster_status(
            compute_client, network_client, config, storage_cluster_id,
            detail=True, hosts=False)
|
|
|
|
|
|
def action_fs_cluster_suspend(
        compute_client, config, storage_cluster_id, wait):
    # type: (azure.mgmt.compute.ComputeManagementClient, dict, str,
    #        bool) -> None
    """Action: Fs Cluster Suspend.

    Validates the compute client and suspends the storage cluster.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param bool wait: block until the suspension completes
    """
    _check_compute_client(compute_client)
    remotefs.suspend_storage_cluster(
        compute_client, config, storage_cluster_id, wait)
|
|
|
|
|
|
def action_fs_cluster_start(
        compute_client, network_client, config, storage_cluster_id, wait):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict, str,
    #        bool) -> None
    """Action: Fs Cluster Start.

    Validates the management clients and starts the storage cluster; when
    waiting for completion, also reports its detailed status.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param bool wait: block until the restart completes
    """
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    remotefs.start_storage_cluster(
        compute_client, config, storage_cluster_id, wait)
    if wait:
        action_fs_cluster_status(
            compute_client, network_client, config, storage_cluster_id,
            detail=True, hosts=False)
|
|
|
|
|
|
def action_fs_cluster_status(
        compute_client, network_client, config, storage_cluster_id,
        detail, hosts):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict, str, bool,
    #        bool) -> None
    """Action: Fs Cluster Status

    Queries and displays the status of a storage cluster, optionally
    with per-host detail and /etc/hosts information.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param bool detail: detailed status
    :param bool hosts: dump info for /etc/hosts
    """
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    remotefs.stat_storage_cluster(
        compute_client, network_client, config, storage_cluster_id,
        _REMOTEFSSTAT_FILE[0], detail, hosts)
|
|
|
|
|
|
def action_fs_cluster_ssh(
        compute_client, network_client, config, storage_cluster_id,
        cardinal, hostname, tty, command):
    # type: (azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict, str, int,
    #        str, bool, tuple) -> None
    """Action: Fs Cluster Ssh

    Interactively connects (or executes a command) via SSH on a storage
    cluster node selected either by cardinal number or by hostname.

    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param str storage_cluster_id: storage cluster id
    :param int cardinal: cardinal number
    :param str hostname: hostname
    :param bool tty: allocate pseudo-tty
    :param tuple command: command
    """
    _check_compute_client(compute_client)
    _check_network_client(network_client)
    have_cardinal = cardinal is not None
    have_hostname = hostname is not None
    # cardinal and hostname are mutually exclusive selectors
    if have_cardinal and have_hostname:
        raise ValueError('cannot specify both cardinal and hostname options')
    if not have_cardinal and not have_hostname:
        # default to the first node when no selector is given
        logger.warning(
            'assuming node cardinal of 0 as no cardinal or hostname option '
            'was specified')
        cardinal = 0
        have_cardinal = True
    if have_cardinal and cardinal < 0:
        raise ValueError('invalid cardinal option value')
    remotefs.ssh_storage_cluster(
        compute_client, network_client, config, storage_cluster_id,
        cardinal, hostname, tty, command)
|
|
|
|
|
|
def action_keyvault_add(keyvault_client, config, keyvault_uri, name):
    # type: (azure.keyvault.KeyVaultClient, dict, str, str) -> None
    """Action: Keyvault Add

    Stores the credentials configuration as a secret in KeyVault.

    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param dict config: configuration dict
    :param str keyvault_uri: keyvault uri
    :param str name: secret name
    """
    _check_keyvault_client(keyvault_client)
    keyvault.store_credentials_conf(
        keyvault_client, config, keyvault_uri, name)
|
|
|
|
|
|
def action_keyvault_del(keyvault_client, keyvault_uri, name):
    # type: (azure.keyvault.KeyVaultClient, str, str) -> None
    """Action: Keyvault Del

    Deletes the named secret from KeyVault.

    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param str keyvault_uri: keyvault uri
    :param str name: secret name
    """
    _check_keyvault_client(keyvault_client)
    keyvault.delete_secret(keyvault_client, keyvault_uri, name)
|
|
|
|
|
|
def action_keyvault_list(keyvault_client, keyvault_uri):
    # type: (azure.keyvault.KeyVaultClient, str) -> None
    """Action: Keyvault List

    Lists all secrets stored in the given KeyVault.

    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param str keyvault_uri: keyvault uri
    """
    _check_keyvault_client(keyvault_client)
    keyvault.list_secrets(keyvault_client, keyvault_uri)
|
|
|
|
|
|
def action_cert_create(config):
    # type: (dict) -> None
    """Action: Cert Create

    Generates PEM/PFX certificates and logs the resulting SHA1
    thumbprint for later reference.

    :param dict config: configuration dict
    """
    sha1tp = crypto.generate_pem_pfx_certificates(config)
    logger.info('SHA1 Thumbprint: {}'.format(sha1tp))
|
|
|
|
|
|
def action_cert_add(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Cert Add

    Adds the configured certificate to the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.add_certificate_to_account(batch_client, config, False)
|
|
|
|
|
|
def action_cert_list(batch_client):
    # type: (batchsc.BatchServiceClient) -> None
    """Action: Cert List

    Lists the certificates registered with the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    """
    _check_batch_client(batch_client)
    batch.list_certificates_in_account(batch_client)
|
|
|
|
|
|
def action_cert_del(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Cert Del

    Deletes the configured certificate from the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.del_certificate_from_account(batch_client, config)
|
|
|
|
|
|
def action_pool_listskus(batch_client):
    # type: (batchsc.BatchServiceClient) -> None
    """Action: Pool Listskus

    Lists the node agent SKUs available to the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    """
    _check_batch_client(batch_client)
    batch.list_node_agent_skus(batch_client)
|
|
|
|
|
|
def action_pool_add(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, table_client, config):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient,
    #        azure.batch.batch_service_client.BatchServiceClient,
    #        azureblob.BlockBlobService, azuretable.TableService,
    #        dict) -> None
    """Action: Pool Add

    Creates a new pool: ensures the pool does not already exist,
    prepares/clears the associated storage containers, uploads global
    resources (non-native pools only), and then creates the pool.

    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client:
        batch management client
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    # first check if pool exists to prevent accidental metadata clear
    if batch_client.pool.exists(settings.pool_id(config)):
        raise RuntimeError(
            'attempting to create a pool that already exists: {}'.format(
                settings.pool_id(config)))
    _adjust_settings_for_pool_creation(config)
    # storage must be created before it can be cleared (idempotent)
    storage.create_storage_containers(blob_client, table_client, config)
    storage.clear_storage_containers(blob_client, table_client, config)
    # native docker pools do not need global resource blobs staged
    if not settings.is_native_docker_pool(config):
        storage.populate_global_resource_blobs(
            blob_client, table_client, config)
    _add_pool(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, config
    )
|
|
|
|
|
|
def action_pool_list(batch_client):
    # type: (batchsc.BatchServiceClient) -> None
    """Action: Pool List

    Lists all pools under the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    """
    _check_batch_client(batch_client)
    batch.list_pools(batch_client)
|
|
|
|
|
|
def action_pool_delete(
        batch_client, blob_client, table_client, config, pool_id=None,
        wait=False):
    # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService,
    #        azuretable.TableService, dict, str, bool) -> None
    """Action: Pool Delete

    Deletes a pool and, on success, cleans up the pool's associated
    storage artifacts. Optionally blocks until the pool is fully gone.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    :param str pool_id: poolid to delete
    :param bool wait: wait for pool to delete
    """
    _check_batch_client(batch_client)
    deleted = False
    try:
        deleted = batch.del_pool(batch_client, config, pool_id=pool_id)
    except batchmodels.BatchErrorException as ex:
        # a missing or already-deleting pool still warrants storage cleanup
        if ('The specified pool does not exist' in ex.message.value or
                'The specified pool has been marked for deletion' in
                ex.message.value):
            deleted = True
        else:
            logger.exception(ex)
    if deleted:
        # reset storage settings to target poolid if required
        if util.is_not_empty(pool_id):
            populate_global_settings(config, False, pool_id=pool_id)
        else:
            pool_id = settings.pool_id(config)
        storage.cleanup_with_del_pool(
            blob_client, table_client, config, pool_id=pool_id)
        if wait:
            # poll until the service no longer reports the pool
            logger.debug('waiting for pool {} to delete'.format(pool_id))
            while batch_client.pool.exists(pool_id):
                time.sleep(3)
|
|
|
|
|
|
def action_pool_resize(batch_client, blob_client, config, wait):
    # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService,
    #        dict, bool) -> None
    """Resize pool that may contain glusterfs

    Resizes the pool to the configured vm counts. If resizing up, an
    ssh user may be added to new nodes (when a public key is available)
    and, for glusterfs-on-compute pools, the gluster volume is expanded
    with bricks on the new dedicated nodes.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param dict config: configuration dict
    :param bool wait: wait for operation to complete
    """
    _check_batch_client(batch_client)
    pool = settings.pool_settings(config)
    # check direction of resize
    _pool = batch_client.pool.get(pool.id)
    # no-op if current and target counts already match the config
    if (pool.vm_count.dedicated == _pool.current_dedicated_nodes ==
            _pool.target_dedicated_nodes and
            pool.vm_count.low_priority == _pool.current_low_priority_nodes ==
            _pool.target_low_priority_nodes):
        logger.error(
            'pool {} is already at {} nodes'.format(pool.id, pool.vm_count))
        return
    resize_up_d = False
    resize_up_lp = False
    if pool.vm_count.dedicated > _pool.current_dedicated_nodes:
        resize_up_d = True
    if pool.vm_count.low_priority > _pool.current_low_priority_nodes:
        resize_up_lp = True
    del _pool
    create_ssh_user = False
    # try to get handle on public key, avoid generating another set
    # of keys
    if resize_up_d or resize_up_lp:
        if pool.ssh.username is None:
            logger.info('not creating ssh user on new nodes of pool {}'.format(
                pool.id))
        else:
            if pool.ssh.ssh_public_key is None:
                sfp = pathlib.Path(crypto.get_ssh_key_prefix() + '.pub')
                if sfp.exists():
                    # reuse the previously generated public key
                    logger.debug(
                        'setting public key for ssh user to: {}'.format(sfp))
                    settings.set_ssh_public_key(config, str(sfp))
                    create_ssh_user = True
                else:
                    logger.warning(
                        ('not creating ssh user for new nodes of pool {} as '
                         'an existing ssh public key cannot be found').format(
                             pool.id))
                    create_ssh_user = False
    # check if this is a glusterfs-enabled pool
    gluster_present = False
    voltype = None
    try:
        sdv = settings.global_resources_shared_data_volumes(config)
        for sdvkey in sdv:
            if settings.is_shared_data_volume_gluster_on_compute(sdv, sdvkey):
                gluster_present = True
                try:
                    voltype = settings.gluster_volume_type(sdv, sdvkey)
                except KeyError:
                    pass
                break
    except KeyError:
        # no shared data volumes configured
        pass
    logger.debug('glusterfs shared volume present: {}'.format(
        gluster_present))
    if gluster_present:
        if resize_up_lp:
            raise RuntimeError(
                'cannot resize up a pool with glusterfs_on_compute and '
                'low priority nodes')
        # gluster brick setup requires the resize to have completed
        logger.debug('forcing wait to True due to glusterfs')
        wait = True
    # cache old nodes so new ones can be identified after the resize
    old_nodes = {}
    if gluster_present or create_ssh_user:
        for node in batch_client.compute_node.list(pool.id):
            old_nodes[node.id] = node.ip_address
    # resize pool
    nodes = batch.resize_pool(batch_client, config, wait)
    # add ssh user to new nodes if present
    if create_ssh_user and (resize_up_d or resize_up_lp):
        if wait:
            # get list of new nodes only
            new_nodes = [node for node in nodes if node.id not in old_nodes]
            # create admin user on each new node if requested
            batch.add_ssh_user(batch_client, config, nodes=new_nodes)
            # log remote login settings for new nodes
            batch.get_remote_login_settings(
                batch_client, config, nodes=new_nodes)
            del new_nodes
        else:
            logger.warning('ssh user was not added as --wait was not given')
    # add brick for new nodes
    if gluster_present and resize_up_d:
        # get pool current dedicated
        _pool = batch_client.pool.get(pool.id)
        # ensure current dedicated is the target
        if pool.vm_count.dedicated != _pool.current_dedicated_nodes:
            raise RuntimeError(
                ('cannot perform glusterfs setup on new nodes, unexpected '
                 'current dedicated {} to vm_count {}').format(
                     _pool.current_dedicated_nodes, pool.vm_count.dedicated))
        del _pool
        # get internal ip addresses of new nodes
        new_nodes = [
            node.ip_address for node in nodes if node.id not in old_nodes
        ]
        # any pre-existing node can act as the gluster probe master
        masterip = next(iter(old_nodes.values()))
        # get tempdisk mountpoint
        tempdisk = settings.temp_disk_mountpoint(config)
        # construct cmdline
        cmdline = util.wrap_commands_in_shell([
            '$AZ_BATCH_TASK_DIR/{} {} {} {} {} {}'.format(
                _GLUSTERRESIZE_FILE[0], voltype.lower(), tempdisk,
                pool.vm_count.dedicated, masterip, ' '.join(new_nodes))])
        # setup gluster
        _setup_glusterfs(
            batch_client, blob_client, config, nodes, _GLUSTERRESIZE_FILE,
            cmdline=cmdline)
|
|
|
|
|
|
def action_pool_nodes_grls(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Nodes Grls

    Retrieves remote login settings for all pool nodes and generates
    the ssh tunnel helper script.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.get_remote_login_settings(batch_client, config)
    batch.generate_ssh_tunnel_script(
        batch_client, settings.pool_settings(config), None, None)
|
|
|
|
|
|
def action_pool_nodes_list(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Nodes List

    Lists the compute nodes of the configured pool.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.list_nodes(batch_client, config)
|
|
|
|
|
|
def action_pool_user_add(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool User Add

    Adds a remote access user to all pool nodes: an RDP user for
    Windows pools, an SSH user otherwise.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    # pick the transport-appropriate user creation routine
    add_user = (
        batch.add_rdp_user if settings.is_windows_pool(config)
        else batch.add_ssh_user
    )
    add_user(batch_client, config)
|
|
|
|
|
|
def action_pool_user_del(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool User Del

    Deletes the remote access user from all pool nodes: the RDP user
    for Windows pools, the SSH user otherwise.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    if settings.is_windows_pool(config):
        batch.del_rdp_user(batch_client, config)
    else:
        batch.del_ssh_user(batch_client, config)
|
|
|
|
|
|
def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
    # type: (batchsc.BatchServiceClient, dict, int, str, bool, tuple) -> None
    """Action: Pool Ssh

    Connects (or executes a command) via SSH on a pool node selected
    either by cardinal number or by node id.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param int cardinal: cardinal node num
    :param str nodeid: node id
    :param bool tty: allocate pseudo-tty
    :param tuple command: command to execute
    """
    _check_batch_client(batch_client)
    # cardinal and nodeid are mutually exclusive selectors
    if cardinal is not None:
        if nodeid is not None:
            raise ValueError('cannot specify both cardinal and nodeid options')
    elif nodeid is None:
        # neither given: default to the first node
        logger.warning(
            'assuming node cardinal of 0 as no cardinal or nodeid option '
            'was specified')
        cardinal = 0
    if cardinal is not None and cardinal < 0:
        raise ValueError('invalid cardinal option value')
    pool = settings.pool_settings(config)
    # fall back to the generated key when none is configured
    priv_key = pool.ssh.ssh_private_key
    if priv_key is None:
        priv_key = pathlib.Path(
            pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix())
    ip, port = batch.get_remote_login_setting_for_node(
        batch_client, config, cardinal, nodeid)
    crypto.connect_or_exec_ssh_command(
        ip, port, priv_key, pool.ssh.username, tty=tty,
        command=command)
|
|
|
|
|
|
def action_pool_nodes_del(
        batch_client, config, all_start_task_failed, all_starting,
        all_unusable, nodeid):
    # type: (batchsc.BatchServiceClient, dict, bool, bool, bool, str) -> None
    """Action: Pool Nodes Del

    Deletes compute nodes from the pool, either a specific node id or
    all nodes in a particular failure/transition state.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes
    :param bool all_starting: delete all starting nodes
    :param bool all_unusable: delete all unusable nodes
    :param str nodeid: nodeid to delete
    """
    _check_batch_client(batch_client)
    # bulk state-based deletion and a single node id are exclusive
    if ((all_start_task_failed or all_starting or all_unusable) and
            nodeid is not None):
        raise ValueError(
            'cannot specify all start task failed nodes or unusable with '
            'a specific node id')
    batch.del_node(
        batch_client, config, all_start_task_failed, all_starting,
        all_unusable, nodeid)
|
|
|
|
|
|
def action_pool_nodes_reboot(
        batch_client, config, all_start_task_failed, nodeid):
    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Pool Nodes Reboot

    Reboots either a specific node or all nodes whose start task failed.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all_start_task_failed: reboot all start task failed nodes
    :param str nodeid: nodeid to reboot
    """
    _check_batch_client(batch_client)
    # bulk reboot and a single node id are mutually exclusive
    if all_start_task_failed and nodeid is not None:
        raise ValueError(
            'cannot specify all start task failed nodes with a specific '
            'node id')
    batch.reboot_nodes(batch_client, config, all_start_task_failed, nodeid)
|
|
|
|
|
|
def action_pool_images_update(
        batch_client, config, docker_image, docker_image_digest,
        singularity_image, ssh):
    # type: (batchsc.BatchServiceClient, dict, str, str, str, bool) -> None
    """Action: Pool Images Update

    Updates container images on all pool nodes, optionally pinned to a
    specific Docker image digest and optionally via direct SSH.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str docker_image: docker image to update
    :param str docker_image_digest: docker image digest to update to
    :param str singularity_image: singularity image to update
    :param bool ssh: use direct SSH update mode
    """
    _check_batch_client(batch_client)
    # a digest is meaningless without the image it pins
    if docker_image_digest is not None and docker_image is None:
        raise ValueError(
            'cannot specify a digest to update to without the image')
    _update_container_images(
        batch_client, config, docker_image, docker_image_digest,
        singularity_image, force_ssh=ssh)
|
|
|
|
|
|
def action_pool_images_list(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Images List

    Lists the Docker images present on the pool nodes.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    _list_docker_images(batch_client, config)
|
|
|
|
|
|
def action_pool_stats(batch_client, config, pool_id):
    # type: (batchsc.BatchServiceClient, dict, str) -> None
    """Action: Pool Stats

    Displays usage statistics for a pool.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str pool_id: pool id (overrides the configured pool if given)
    """
    _check_batch_client(batch_client)
    batch.pool_stats(batch_client, config, pool_id=pool_id)
|
|
|
|
|
|
def action_pool_autoscale_disable(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Autoscale Disable

    Disables autoscale on the configured pool.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.pool_autoscale_disable(batch_client, config)
|
|
|
|
|
|
def action_pool_autoscale_enable(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Autoscale Enable

    Enables autoscale on the configured pool.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.pool_autoscale_enable(batch_client, config)
|
|
|
|
|
|
def action_pool_autoscale_evaluate(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Autoscale Evaluate

    Evaluates the autoscale formula against the configured pool.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.pool_autoscale_evaluate(batch_client, config)
|
|
|
|
|
|
def action_pool_autoscale_lastexec(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Pool Autoscale Lastexec

    Displays the result of the last autoscale formula execution.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.pool_autoscale_lastexec(batch_client, config)
|
|
|
|
|
|
def action_jobs_add(
        resource_client, compute_client, network_client, batch_mgmt_client,
        batch_client, blob_client, table_client, keyvault_client, config,
        recreate, tail):
    # type: (azure.mgmt.resource.resources.ResourceManagementClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient,
    #        azure.mgmt.batch.BatchManagementClient,
    #        azure.batch.batch_service_client.BatchServiceClient,
    #        azureblob.BlockBlobService, azuretable.TableService,
    #        azure.keyvault.KeyVaultClient, dict, bool, str) -> None
    """Action: Jobs Add

    Adds jobs to the service. If the job configuration requests an
    auto pool, an auto pool specification is constructed (and the
    backing storage prepared) before jobs are submitted.

    :param azure.mgmt.resource.resources.ResourceManagementClient
        resource_client: resource client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client:
        batch management client
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param dict config: configuration dict
    :param bool recreate: recreate jobs if completed
    :param str tail: file to tail or last job and task added
    """
    _check_batch_client(batch_client)
    # check for job autopools
    autopool = batch.check_jobs_for_auto_pool(config)
    if autopool:
        # check to ensure pool id is within 20 chars
        pool_id = settings.pool_id(config)
        if len(pool_id) > 20:
            raise ValueError(
                'pool id must be less than 21 characters: {}'.format(pool_id))
        # check if a pool id with existing pool id exists
        try:
            batch_client.pool.get(pool_id)
        except batchmodels.BatchErrorException as ex:
            # a missing pool is the expected (good) outcome here
            if 'The specified pool does not exist' in ex.message.value:
                pass
        else:
            raise RuntimeError(
                'pool with id of {} already exists'.format(pool_id))
        _adjust_settings_for_pool_creation(config)
        # create storage containers and clear
        storage.create_storage_containers(blob_client, table_client, config)
        storage.clear_storage_containers(blob_client, table_client, config)
        if not settings.is_native_docker_pool(config):
            storage.populate_global_resource_blobs(
                blob_client, table_client, config)
        # create autopool specification object
        autopool = _construct_auto_pool_specification(
            resource_client, compute_client, network_client, batch_mgmt_client,
            batch_client, blob_client, config
        )
        # check settings and warn
        _check_settings_for_auto_pool(config)
    else:
        autopool = None
    # add jobs
    is_windows = settings.is_windows_pool(config)
    batch.add_jobs(
        batch_client, blob_client, keyvault_client, config, autopool,
        _IMAGE_BLOCK_FILE,
        _BLOBXFER_WINDOWS_FILE if is_windows else _BLOBXFER_FILE,
        recreate, tail)
|
|
|
|
|
|
def action_jobs_list(batch_client, config):
    # type: (batchsc.BatchServiceClient, dict) -> None
    """Action: Jobs List

    Lists the jobs under the Batch account.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    """
    _check_batch_client(batch_client)
    batch.list_jobs(batch_client, config)
|
|
|
|
|
|
def action_jobs_tasks_list(
        batch_client, config, all, jobid, poll_until_tasks_complete):
    # type: (batchsc.BatchServiceClient, dict, bool, str, bool) -> None
    """Action: Jobs Tasks List

    Lists tasks for the configured jobs (or all jobs, or one job),
    optionally polling until every task has completed.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all: all jobs
    :param str jobid: job id
    :param bool poll_until_tasks_complete: poll until tasks complete
    """
    _check_batch_client(batch_client)
    if all and jobid is not None:
        raise ValueError('cannot specify both --all and --jobid')
    done = False
    while not done:
        tasks_complete = batch.list_tasks(
            batch_client, config, all=all, jobid=jobid)
        # stop after one pass unless polling was requested and tasks remain
        done = (not poll_until_tasks_complete) or tasks_complete
        if not done:
            time.sleep(5)
|
|
|
|
|
|
def action_jobs_tasks_term(batch_client, config, jobid, taskid, wait, force):
    # type: (batchsc.BatchServiceClient, dict, str, str, bool, bool) -> None
    """Action: Jobs Tasks Term

    Terminates tasks, optionally forcing a docker kill on the node
    even if the task is already completed.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id
    :param str taskid: task id
    :param bool wait: wait for action to complete
    :param bool force: force docker kill even if completed
    """
    _check_batch_client(batch_client)
    # a task id is only meaningful within a specific job
    if taskid is not None and jobid is None:
        raise ValueError(
            'cannot specify a task to terminate without the corresponding '
            'job id')
    # force requires a fully-qualified job id/task id pair
    if force and (taskid is None or jobid is None):
        raise ValueError('cannot force docker kill without task id/job id')
    batch.terminate_tasks(
        batch_client, config, jobid=jobid, taskid=taskid, wait=wait,
        force=force)
|
|
|
|
|
|
def action_jobs_tasks_del(batch_client, config, jobid, taskid, wait):
    # type: (batchsc.BatchServiceClient, dict, str, str, bool) -> None
    """Action: Jobs Tasks Del

    Deletes tasks, optionally waiting for the deletion to complete.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id
    :param str taskid: task id
    :param bool wait: wait for action to complete
    """
    _check_batch_client(batch_client)
    # a task id is only meaningful within a specific job
    if taskid is not None and jobid is None:
        raise ValueError(
            'cannot specify a task to delete without the corresponding '
            'job id')
    batch.del_tasks(
        batch_client, config, jobid=jobid, taskid=taskid, wait=wait)
|
|
|
|
|
|
def action_jobs_del_or_term(
        batch_client, blob_client, table_client, config, delete, all_jobs,
        all_jobschedules, jobid, jobscheduleid, termtasks, wait):
    # type: (batchsc.BatchServiceClient, azureblob.BlockBlobService,
    #        azuretable.TableService, dict, bool, bool, bool, str, str,
    #        bool, bool) -> None
    """Action: Jobs Del or Term

    Deletes or terminates jobs and/or job schedules; for auto-pool
    jobs, also cleans up the auto pool's backing storage.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    :param bool delete: delete instead of terminate
    :param bool all_jobs: all jobs
    :param bool all_jobschedules: all job schedules
    :param str jobid: job id
    :param str jobscheduleid: job schedule id
    :param bool termtasks: terminate tasks prior
    :param bool wait: wait for action to complete
    """
    _check_batch_client(batch_client)
    if jobid is not None and jobscheduleid is not None:
        raise ValueError('cannot specify both --jobid and --jobscheduleid')
    if all_jobs:
        if jobid is not None:
            raise ValueError('cannot specify both --all-jobs and --jobid')
        batch.delete_or_terminate_all_jobs(
            batch_client, config, delete, termtasks=termtasks, wait=wait)
    elif all_jobschedules:
        if jobscheduleid is not None:
            raise ValueError(
                'cannot specify both --all-jobschedules and --jobscheduleid')
        if termtasks:
            raise ValueError(
                'Cannot specify --termtasks with --all-jobschedules. '
                'Please terminate tasks with each individual job first.')
        batch.delete_or_terminate_all_job_schedules(
            batch_client, config, delete, wait=wait)
    else:
        # check for autopool
        if util.is_none_or_empty(jobid):
            autopool = batch.check_jobs_for_auto_pool(config)
            if autopool:
                # check if a pool id with existing pool id exists
                try:
                    batch_client.pool.get(settings.pool_id(config))
                except batchmodels.BatchErrorException as ex:
                    if 'The specified pool does not exist' in ex.message.value:
                        pass
                else:
                    # a real (non-auto) pool exists; skip storage cleanup
                    autopool = False
        else:
            autopool = False
        # terminate the jobs
        batch.delete_or_terminate_jobs(
            batch_client, config, delete, jobid=jobid,
            jobscheduleid=jobscheduleid, termtasks=termtasks, wait=wait)
        # if autopool, delete the storage
        if autopool:
            storage.cleanup_with_del_pool(blob_client, table_client, config)
|
|
|
|
|
|
def action_jobs_cmi(batch_client, config, delete):
    # type: (batchsc.BatchServiceClient, dict, bool) -> None
    """Action: Jobs Cmi

    Runs or deletes the clean-multi-instance (cmi) jobs: unless delete
    was requested, the cleanup jobs are executed first, and the cmi
    jobs are always removed afterwards.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool delete: delete all cmi jobs
    """
    _check_batch_client(batch_client)
    if not delete:
        # execute the cleanup pass before removing the cmi jobs
        batch.clean_mi_jobs(batch_client, config)
    batch.del_clean_mi_jobs(batch_client, config)
|
|
|
|
|
|
def action_jobs_migrate(
        batch_client, config, jobid, jobscheduleid, poolid, requeue,
        terminate, wait):
    # type: (batchsc.BatchServiceClient, dict, str, str, str, bool, bool,
    #        bool) -> None
    """Action: Jobs Migrate

    Migrates jobs or job schedules to another pool via the sequence:
    disable (handling active tasks per the chosen action), patch the
    pool reference, then re-enable.

    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id to migrate to in lieu of config
    :param str jobscheduleid: job schedule id to migrate to in lieu of config
    :param str poolid: pool id to migrate to in lieu of config
    :param bool requeue: requeue action
    :param bool terminate: terminate action
    :param bool wait: wait action
    """
    _check_batch_client(batch_client)
    # NOTE: requeue/terminate/wait exclusivity is only validated when
    # --jobid is given; config-driven migration bypasses this check
    if jobid is not None:
        if jobscheduleid is not None:
            raise ValueError('cannot specify both --jobid and --jobscheduleid')
        if [requeue, terminate, wait].count(True) != 1:
            raise ValueError(
                'must specify only one option of --requeue, --terminate, '
                '--wait')
    # map the selected flag to a disable action
    if requeue:
        action = 'requeue'
    elif terminate:
        action = 'terminate'
    elif wait:
        action = 'wait'
    else:
        action = None
    # check jobs to see if targetted pool id is the same
    batch.check_pool_for_job_migration(
        batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid,
        poolid=poolid)
    if not util.confirm_action(
            config, msg='migration of jobs or job schedules'):
        return
    logger.warning(
        'ensure that the new target pool has the proper Docker images '
        'loaded, or you have enabled allow_run_on_missing_image')
    # disable job and wait for disabled state
    batch.disable_jobs(
        batch_client, config, action, jobid=jobid, jobscheduleid=jobscheduleid,
        suppress_confirm=True)
    # patch job
    batch.update_job_with_pool(
        batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid,
        poolid=poolid)
    # enable job
    batch.enable_jobs(
        batch_client, config, jobid=jobid, jobscheduleid=jobscheduleid)
|
|
|
|
|
|
def action_jobs_disable(
        batch_client, config, jobid, jobscheduleid, requeue, terminate, wait):
    # type: (batchsc.BatchServiceClient, dict, str, str, bool, bool,
    #        bool) -> None
    """Action: Jobs Disable
    Disables a job or job schedule so no new tasks are scheduled; for a
    specific job, exactly one disposition for its active tasks must be chosen.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id to disable to in lieu of config
    :param str jobscheduleid: job schedule id to disable to in lieu of config
    :param bool requeue: requeue action
    :param bool terminate: terminate action
    :param bool wait: wait action
    :raises ValueError: if both ids are given, or if not exactly one of the
        requeue/terminate/wait flags is set for a job id
    """
    _check_batch_client(batch_client)
    if jobid is not None:
        if jobscheduleid is not None:
            raise ValueError('cannot specify both --jobid and --jobscheduleid')
        # NOTE(review): this option-count check is reconstructed as nested
        # under the jobid branch (options apply to jobs, not job schedules)
        # -- confirm against the original indentation
        if [requeue, terminate, wait].count(True) != 1:
            raise ValueError(
                'must specify only one option of --requeue, --terminate, '
                '--wait')
    # map the selected flag onto the task-disposition action string;
    # action stays None on the non-jobid (config/job schedule) path
    if requeue:
        action = 'requeue'
    elif terminate:
        action = 'terminate'
    elif wait:
        action = 'wait'
    else:
        action = None
    # disabling_state_ok tolerates jobs already transitioning to disabled
    batch.disable_jobs(
        batch_client, config, action, jobid=jobid,
        jobscheduleid=jobscheduleid, disabling_state_ok=True)
|
|
|
|
|
|
def action_jobs_enable(batch_client, config, jobid, jobscheduleid):
    # type: (batchsc.BatchServiceClient, dict, str, str) -> None
    """Action: Jobs Enable
    Re-enables a previously disabled job or job schedule so that task
    scheduling may resume.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id to enable to in lieu of config
    :param str jobscheduleid: job schedule id to enable to in lieu of config
    """
    _check_batch_client(batch_client)
    # delegate to the batch helper; either id overrides the config targets
    batch.enable_jobs(
        batch_client, config, jobscheduleid=jobscheduleid, jobid=jobid)
|
|
|
|
|
|
def action_jobs_stats(batch_client, config, job_id):
    # type: (batchsc.BatchServiceClient, dict, str) -> None
    """Action: Jobs Stats
    Displays aggregate job and task statistics for the given job.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str job_id: job id
    """
    _check_batch_client(batch_client)
    # job_id may be None, in which case the helper derives it from config
    target = job_id
    batch.job_stats(batch_client, config, jobid=target)
|
|
|
|
|
|
def action_storage_del(
        blob_client, table_client, config, clear_tables, poolid):
    # type: (azureblob.BlockBlobService, azuretable.TableService,
    #        dict, bool, str) -> None
    """Action: Storage Del
    Deletes backing storage containers; with clear_tables, table entities
    are wiped instead of the tables being deleted.
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    :param bool clear_tables: clear tables instead of deleting
    :param str poolid: pool id to target
    """
    # re-point storage settings at the requested pool, if one was given
    if util.is_not_empty(poolid):
        populate_global_settings(config, False, pool_id=poolid)
    if clear_tables:
        # empty table entities but keep the tables themselves
        storage.clear_storage_containers(
            blob_client, table_client, config, tables_only=True,
            pool_id=poolid)
    # delete blob containers; skip tables when they were only cleared above
    storage.delete_storage_containers(
        blob_client, table_client, config, skip_tables=clear_tables)
|
|
|
|
|
|
def action_storage_clear(blob_client, table_client, config, poolid):
    # type: (azureblob.BlockBlobService, azuretable.TableService, dict,
    #        str) -> None
    """Action: Storage Clear
    Empties all storage containers associated with the target pool without
    deleting the containers themselves.
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.cosmosdb.table.TableService table_client: table client
    :param dict config: configuration dict
    :param str poolid: pool id to target
    """
    # re-point storage settings at the requested pool, if one was given
    if util.is_not_empty(poolid):
        populate_global_settings(config, False, pool_id=poolid)
    storage.clear_storage_containers(
        blob_client, table_client, config, pool_id=poolid)
|
|
|
|
|
|
def action_data_files_stream(batch_client, config, filespec, disk):
    # type: (batchsc.BatchServiceClient, dict, str, bool) -> None
    """Action: Data Files Stream
    Streams a task file as it is produced, to the console or to disk.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str filespec: filespec of file to retrieve
    :param bool disk: write streamed data to disk instead
    """
    _check_batch_client(batch_client)
    # helper blocks until the task exists and then tails the file
    batch.stream_file_and_wait_for_task(batch_client, config, filespec, disk)
|
|
|
|
|
|
def action_data_files_list(batch_client, config, jobid, taskid):
    # type: (batchsc.BatchServiceClient, dict, str, str) -> None
    """Action: Data Files List
    Lists files produced by tasks; a task id is only meaningful together
    with its job id.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id to list
    :param str taskid: task id to list
    :raises ValueError: if a task id is supplied without a job id
    """
    _check_batch_client(batch_client)
    if jobid is None and taskid is not None:
        raise ValueError(
            'cannot specify a task to list files without the corresponding '
            'job id')
    batch.list_task_files(batch_client, config, jobid, taskid)
|
|
|
|
|
|
def action_data_files_task(batch_client, config, all, filespec):
    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Data Files Task
    Retrieves one file, or every file, from a task via the Batch API.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all: retrieve all files
    :param str filespec: filespec of file to retrieve
    """
    _check_batch_client(batch_client)
    # select the single-file or all-files retrieval helper up front
    retrieve = (
        batch.get_all_files_via_task if all else batch.get_file_via_task)
    retrieve(batch_client, config, filespec)
|
|
|
|
|
|
def action_data_files_node(batch_client, config, all, nodeid):
    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Data Files Node
    Retrieves one file, or every file, directly from a compute node.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all: retrieve all files
    :param str nodeid: node id to retrieve file from
    """
    _check_batch_client(batch_client)
    # select the single-file or all-files retrieval helper up front
    retrieve = (
        batch.get_all_files_via_node if all else batch.get_file_via_node)
    retrieve(batch_client, config, nodeid)
|
|
|
|
|
|
def action_data_ingress(
        batch_client, compute_client, network_client, config, to_fs):
    # type: (batchsc.BatchServiceClient,
    #        azure.mgmt.compute.ComputeManagementClient,
    #        azure.mgmt.network.NetworkManagementClient, dict, str) -> None
    """Action: Data Ingress
    Ingresses data either to a Batch pool (default) or to a remote
    filesystem when to_fs is specified, then waits on any storage threads.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param azure.mgmt.compute.ComputeManagementClient compute_client:
        compute client
    :param azure.mgmt.network.NetworkManagementClient network_client:
        network client
    :param dict config: configuration dict
    :param str to_fs: ingress to remote filesystem
    :raises RuntimeError: on the remote filesystem path when the required
        ARM clients are missing
    """
    pool_total_vm_count = None
    if util.is_none_or_empty(to_fs):
        # pool-targeted ingress path
        try:
            # get pool current dedicated
            pool = batch_client.pool.get(settings.pool_id(config))
            # total node count = dedicated + low priority
            pool_total_vm_count = (
                pool.current_dedicated_nodes + pool.current_low_priority_nodes
            )
            del pool
            # ensure there are remote login settings
            rls = batch.get_remote_login_settings(
                batch_client, config, nodes=None)
            # ensure nodes are at least idle/running for shared ingress;
            # otherwise only storage-based ingress is possible
            kind = 'all'
            if not batch.check_pool_nodes_runnable(batch_client, config):
                kind = 'storage'
        except batchmodels.BatchErrorException as ex:
            if 'The specified pool does not exist' in ex.message.value:
                # pool absent: degrade gracefully to storage-only ingress
                rls = None
                kind = 'storage'
            else:
                raise
    else:
        # remote filesystem ingress path: needs ARM clients, no pool login
        rls = None
        kind = 'remotefs'
        if compute_client is None or network_client is None:
            raise RuntimeError(
                'required ARM clients are invalid, please provide management '
                'AAD credentials')
    # kick off the ingress and block until storage upload threads finish
    storage_threads = data.ingress_data(
        batch_client, compute_client, network_client, config, rls=rls,
        kind=kind, total_vm_count=pool_total_vm_count, to_fs=to_fs)
    data.wait_for_storage_threads(storage_threads)
|
|
|
|
|
|
def action_misc_tensorboard(
        batch_client, config, jobid, taskid, logdir, image):
    # type: (batchsc.BatchServiceClient, dict, str, str, str, str) -> None
    """Action: Misc Tensorboard
    Opens a Tensorboard tunnel against a task's log directory.
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param str jobid: job id to list
    :param str taskid: task id to list
    :param str logdir: log dir
    :param str image: tensorflow image to use
    :raises ValueError: if the job cannot be determined unambiguously, or
        if a task id is given without a job id
    """
    _check_batch_client(batch_client)
    if util.is_none_or_empty(jobid):
        # without an explicit job id, the jobs config must name exactly one
        njobs = len(settings.job_specifications(config))
        if njobs != 1:
            raise ValueError(
                'The number of jobs in the specified jobs config is not '
                'one. Please specify which job with --jobid.')
        # a bare task id is ambiguous without its owning job
        if util.is_not_empty(taskid):
            raise ValueError(
                'cannot specify a task to tunnel Tensorboard to without the '
                'corresponding job id')
    misc.tunnel_tensorboard(batch_client, config, jobid, taskid, logdir, image)
|