batch-shipyard/convoy/batch.py

2831 строка
115 KiB
Python
Исходник Обычный вид История

# Copyright (c) Microsoft Corporation
#
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# compat imports
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
from builtins import ( # noqa
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
next, oct, open, pow, round, super, filter, map, zip)
# stdlib imports
import collections
import datetime
import fnmatch
2016-10-20 07:10:40 +03:00
import getpass
import logging
try:
import pathlib2 as pathlib
except ImportError:
import pathlib
import os
import ssl
import tempfile
import time
# non-stdlib imports
import azure.batch.models as batchmodels
# local imports
from . import autoscale
2016-11-01 06:49:24 +03:00
from . import crypto
from . import data
from . import keyvault
2016-11-12 02:11:13 +03:00
from . import settings
2016-11-01 06:49:24 +03:00
from . import storage
from . import util
from .version import __version__
# create logger
logger = logging.getLogger(__name__)
util.setup_logger(logger)
# global defines
# threshold checked against the per-node reboot counter kept by
# _block_for_nodes_ready before it gives up on a failed node
_MAX_REBOOT_RETRIES = 5
# file name of the script written by generate_ssh_tunnel_script
_SSH_TUNNEL_SCRIPT = 'ssh_docker_tunnel_shipyard.sh'
_GENERIC_DOCKER_TASK_PREFIX = 'dockertask-'
# user identity using a pool-scoped auto user with admin elevation
_RUN_ELEVATED = batchmodels.UserIdentity(
    auto_user=batchmodels.AutoUserSpecification(
        scope=batchmodels.AutoUserScope.pool,
        elevation_level=batchmodels.ElevationLevel.admin,
    )
)
# per-state node tallies; one field per compute node state, filled in by
# _node_state_counts
NodeStateCountCollection = collections.namedtuple(
    'NodeStateCountCollection', [
        'creating',
        'idle',
        'leaving_pool',
        'offline',
        'preempted',
        'rebooting',
        'reimaging',
        'running',
        'start_task_failed',
        'starting',
        'unknown',
        'unusable',
        'waiting_for_start_task',
    ]
)
def get_batch_account(batch_mgmt_client, config):
    # type: (azure.mgmt.batch.BatchManagementClient, dict) ->
    #        azure.mgmt.batch.models.BatchAccount
    """Look up the Batch account through the management (ARM) client
    :param azure.mgmt.batch.BatchManagementClient batch_mgmt_client:
        batch management client
    :param dict config: configuration dict
    :rtype: azure.mgmt.batch.models.BatchAccount
    :return: Batch account
    :raises RuntimeError: if no management client was supplied
    """
    if batch_mgmt_client is not None:
        # resource group and account name come from the batch credentials
        creds = settings.credentials_batch(config)
        return batch_mgmt_client.batch_account.get(
            resource_group_name=creds.resource_group,
            account_name=creds.account,
        )
    raise RuntimeError(
        'Batch management client is invalid, please specify management '
        'aad credentials')
def list_node_agent_skus(batch_client):
    # type: (batch.BatchServiceClient) -> None
    """Log every verified image reference of every node agent sku
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    """
    template = 'os_type={} publisher={} offer={} sku={} node_agent={}'
    for agent_sku in batch_client.account.list_node_agent_skus():
        # one log line per (sku, verified image) pair
        for image in agent_sku.verified_image_references:
            logger.info(template.format(
                agent_sku.os_type, image.publisher, image.offer, image.sku,
                agent_sku.id))
2016-10-20 07:10:40 +03:00
def add_certificate_to_account(batch_client, config, rm_pfxfile=False):
    # type: (batch.BatchServiceClient, dict, bool) -> None
    """Adds a certificate to a Batch account
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool rm_pfxfile: remove PFX file from local disk
    """
    pfx = crypto.get_encryption_pfx_settings(config)
    # first check if this cert exists
    for cert in batch_client.certificate.list():
        if cert.thumbprint.lower() == pfx.sha1:
            logger.error(
                'cert with thumbprint {} already exists for account'.format(
                    pfx.sha1))
            # remove pfxfile
            if rm_pfxfile:
                os.unlink(pfx.filename)
            return
    # add cert to account
    # keep the (possibly prompted) passphrase in a local instead of
    # assigning it back onto the settings object
    passphrase = pfx.passphrase
    if passphrase is None:
        passphrase = getpass.getpass('Enter password for PFX: ')
    logger.debug('adding pfx cert with thumbprint {} to account'.format(
        pfx.sha1))
    # read the PFX through a context manager so the handle is closed; the
    # local is not named "data" to avoid shadowing the imported data module
    with open(pfx.filename, 'rb') as fd:
        pfx_data = util.base64_encode_string(fd.read())
    batch_client.certificate.add(
        certificate=batchmodels.CertificateAddParameter(
            pfx.sha1, 'sha1', pfx_data,
            certificate_format=batchmodels.CertificateFormat.pfx,
            password=passphrase)
    )
    # remove pfxfile
    if rm_pfxfile:
        os.unlink(pfx.filename)
2016-10-20 07:10:40 +03:00
def list_certificates_in_account(batch_client):
    # type: (batch.BatchServiceClient) -> None
    """List all certificates in a Batch account
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    """
    count = 0
    for cert in batch_client.certificate.list():
        if cert.delete_certificate_error is not None:
            # leading space keeps this field separated from the preceding
            # state=... field in the log line
            ce = ' delete_error=(code={} msg={})'.format(
                cert.delete_certificate_error.code,
                cert.delete_certificate_error.message)
        else:
            ce = ''
        logger.info('{}={} [state={}{}]'.format(
            cert.thumbprint_algorithm, cert.thumbprint, cert.state, ce))
        count += 1
    if count == 0:
        logger.error('no certificates found')
2016-10-20 07:10:40 +03:00
def del_certificate_from_account(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Remove the configured encryption certificate from a Batch account
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    # the certificate is addressed by its sha1 thumbprint
    thumbprint = crypto.get_encryption_pfx_settings(config).sha1
    batch_client.certificate.delete('sha1', thumbprint)
2016-10-20 07:10:40 +03:00
def _reboot_node(batch_client, pool_id, node_id, wait):
    # type: (batch.BatchServiceClient, str, str, bool) -> None
    """Issue a reboot for a single node of a pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param str pool_id: pool id of node
    :param str node_id: node id to reboot
    :param bool wait: wait for node to enter rebooting state
    """
    logger.info('Rebooting node {} in pool {}'.format(node_id, pool_id))
    batch_client.compute_node.reboot(pool_id=pool_id, node_id=node_id)
    if not wait:
        return
    logger.debug('waiting for node {} to enter rebooting state'.format(
        node_id))
    # poll once per second until the service reports the rebooting state
    while (batch_client.compute_node.get(pool_id, node_id).state !=
            batchmodels.ComputeNodeState.rebooting):
        time.sleep(1)
def _retrieve_outputs_from_failed_nodes(batch_client, config, nodeid=None):
    # type: (batch.BatchServiceClient, dict, str) -> None
    """Download start task output files from nodes whose start task failed
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str nodeid: restrict to this node id; all pool nodes if None
    """
    pool_id = settings.pool_id(config)
    candidates = (
        batch_client.compute_node.list(pool_id) if nodeid is None
        else [batch_client.compute_node.get(pool_id, nodeid)]
    )
    failed_state = batchmodels.ComputeNodeState.start_task_failed
    for node in candidates:
        if node.state != failed_state:
            continue
        # turn on auto confirm before fetching files
        settings.set_auto_confirm(config, True)
        get_all_files_via_node(
            batch_client, config,
            filespec='{},{}'.format(node.id, 'startup/std*.txt'))
        # the cascade log is fetched best-effort; batch errors are ignored
        try:
            get_all_files_via_node(
                batch_client, config,
                filespec='{},{}'.format(node.id, 'startup/wd/cascade.log'))
        except batchmodels.BatchErrorException:
            pass
def _block_for_nodes_ready(
        batch_client, config, stopping_states, end_states, pool_id,
        reboot_on_failed):
    # type: (batch.BatchServiceClient, dict,
    #        List[batchmodels.ComputeNodeState],
    #        List[batchmodels.ComputeNodeState], str,
    #        bool) -> List[batchmodels.ComputeNode]
    """Wait for pool to enter steady state and all nodes to enter stopping
    states
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param list stopping_states: list of node states to stop polling
    :param list end_states: list of acceptable end states
    :param str pool_id: pool id
    :param bool reboot_on_failed: reboot node on failed start state
    :rtype: list
    :return: list of nodes
    :raises RuntimeError: on a fatal resize error, on a start task failure
        (immediately when reboot_on_failed is False, otherwise once the
        per-node reboot counter exceeds _MAX_REBOOT_RETRIES), or when all
        nodes stopped but some are not in an acceptable end state
    """
    logger.info(
        'waiting for all nodes in pool {} to reach one of: {!r}'.format(
            pool_id, stopping_states))
    # i counts polling iterations to throttle the debug output below;
    # reboot_map tracks the number of reboots issued per node id
    i = 0
    reboot_map = {}
    while True:
        # refresh pool to ensure that there is no dedicated resize error
        pool = batch_client.pool.get(pool_id)
        if util.is_not_empty(pool.resize_errors):
            fatal_resize_error = False
            errors = []
            for err in pool.resize_errors:
                errors.append('code={} msg={}'.format(err.code, err.message))
                # the low priority quota error is only treated as fatal
                # when no dedicated nodes are targeted
                if (err.code == 'AccountCoreQuotaReached' or
                        (err.code == 'AccountLowPriorityCoreQuotaReached' and
                         pool.target_dedicated_nodes == 0)):
                    fatal_resize_error = True
            if fatal_resize_error:
                list_nodes(batch_client, config)
                raise RuntimeError(
                    'Fatal resize errors encountered for pool {}: {}'.format(
                        pool.id, os.linesep.join(errors)))
            else:
                logger.error(
                    'Resize errors encountered for pool {}: {}'.format(
                        pool.id, os.linesep.join(errors)))
        # check pool allocation state
        if pool.allocation_state == batchmodels.AllocationState.resizing:
            # nodes are not listed while the pool is still resizing
            nodes = []
        else:
            try:
                nodes = list(batch_client.compute_node.list(pool.id))
            except ssl.SSLError:
                # SSL error happens sometimes on paging... this is probably
                # a bug in the underlying msrest/msrestazure library that
                # is reusing the SSL connection improperly
                nodes = []
        # check if any nodes are in start task failed state
        if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
                for node in nodes)):
            # attempt reboot if enabled for potentially transient errors
            if reboot_on_failed:
                for node in nodes:
                    if (node.state !=
                            batchmodels.ComputeNodeState.start_task_failed):
                        continue
                    # diagnostics are retrieved only the first time a node
                    # is seen in the failed state
                    if node.id not in reboot_map:
                        reboot_map[node.id] = 0
                        logger.error(
                            ('Detected start task failure, attempting to '
                             'retrieve stdout/stderr for error diagnosis '
                             'from node {}').format(node.id))
                        _retrieve_outputs_from_failed_nodes(
                            batch_client, config, nodeid=node.id)
                    if reboot_map[node.id] > _MAX_REBOOT_RETRIES:
                        list_nodes(batch_client, config)
                        raise RuntimeError(
                            ('Ran out of reboot retries for recovery. '
                             'Please inspect both the node status above and '
                             'stdout.txt/stderr.txt files within the '
                             '{}/{}/startup directory in the current working '
                             'directory if available. If this error '
                             'appears non-transient, please submit an '
                             'issue on GitHub').format(
                                 pool.id, node.id))
                    _reboot_node(batch_client, pool.id, node.id, True)
                    reboot_map[node.id] += 1
                # refresh node list to reflect rebooting states
                try:
                    nodes = list(batch_client.compute_node.list(pool.id))
                except ssl.SSLError:
                    nodes = []
            else:
                # fast path check for start task failures in non-reboot mode
                logger.error(
                    'Detected start task failure, attempting to retrieve '
                    'stdout/stderr for error diagnosis from nodes')
                _retrieve_outputs_from_failed_nodes(batch_client, config)
                list_nodes(batch_client, config)
                raise RuntimeError(
                    ('Please inspect both the node status above and '
                     'stdout.txt/stderr.txt files within the '
                     '{}/<nodes>/startup directory in the current working '
                     'directory if available. If this error appears '
                     'non-transient, please submit an issue on '
                     'GitHub.').format(pool.id))
        # done polling once the node count matches the pool targets and
        # every node is in one of the stopping states
        if (len(nodes) ==
                (pool.target_dedicated_nodes +
                 pool.target_low_priority_nodes) and
                all(node.state in stopping_states for node in nodes)):
            if any(node.state not in end_states for node in nodes):
                list_nodes(batch_client, config)
                raise RuntimeError(
                    ('Node(s) of pool {} not in {} state. Please inspect the '
                     'state of nodes in the pool above. If this appears to '
                     'be a transient error, please retry pool creation by '
                     'deleting and recreating the pool.').format(
                         pool.id, end_states))
            else:
                return nodes
        # emit progress output on every third polling iteration only
        i += 1
        if i % 3 == 0:
            i = 0
            logger.debug(
                ('waiting for {} dedicated nodes and {} low priority nodes '
                 'of size {} to reach desired state in pool {} '
                 '[resize_timeout={} allocation_state={} '
                 'allocation_state_transition_time={}]').format(
                     pool.target_dedicated_nodes,
                     pool.target_low_priority_nodes,
                     pool.vm_size,
                     pool.id,
                     pool.resize_timeout,
                     pool.allocation_state,
                     pool.allocation_state_transition_time,
                 ))
            # per-node detail for small pools, aggregate counts otherwise
            if len(nodes) <= 5:
                for node in nodes:
                    logger.debug('{}: {}'.format(node.id, node.state))
            else:
                logger.debug(_node_state_counts(nodes))
        time.sleep(10)
def _node_state_counts(nodes):
    # type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
    """Tally how many nodes are in each compute node state
    :param list nodes: list of nodes
    :rtype: NodeStateCountCollection
    :return: node state count collection
    """
    observed = [node.state for node in nodes]
    # every field of the collection is named after the ComputeNodeState
    # member it counts, so the member can be looked up by field name
    tallies = {
        field: observed.count(getattr(batchmodels.ComputeNodeState, field))
        for field in NodeStateCountCollection._fields
    }
    return NodeStateCountCollection(**tallies)
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
    # type: (batch.BatchServiceClient, dict, str,
    #        List[batchmodels.ComputeNodeState]) ->
    #        List[batchmodels.ComputeNode]
    """Block until the pool is steady and all nodes are in an end state,
    then list the nodes
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str pool_id: pool id
    :param list addl_end_states: additional end states
    :rtype: list
    :return: list of nodes
    """
    node_state = batchmodels.ComputeNodeState
    # acceptable end states; polling additionally stops on failure states
    acceptable = [node_state.preempted, node_state.idle]
    halting = [
        node_state.start_task_failed,
        node_state.unusable,
    ] + acceptable
    # caller-supplied states count both as stopping and as end states
    if addl_end_states is not None and len(addl_end_states) > 0:
        halting.extend(addl_end_states)
        acceptable.extend(addl_end_states)
    ready_nodes = _block_for_nodes_ready(
        batch_client, config, frozenset(halting), frozenset(acceptable),
        pool_id, settings.pool_settings(config).reboot_on_start_task_failed)
    list_nodes(batch_client, config, nodes=ready_nodes)
    return ready_nodes
def check_pool_nodes_runnable(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> bool
    """Determine whether every pool node is idle or running
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :rtype: bool
    :return: all pool nodes are runnable
    """
    pool_id = settings.pool_id(config)
    runnable_states = frozenset((
        batchmodels.ComputeNodeState.idle,
        batchmodels.ComputeNodeState.running,
    ))
    pool = batch_client.pool.get(pool_id)
    nodes = list(batch_client.compute_node.list(pool_id))
    # at least the targeted number of nodes must be present
    if len(nodes) < (
            pool.target_dedicated_nodes + pool.target_low_priority_nodes):
        return False
    return all(node.state in runnable_states for node in nodes)
def create_pool(batch_client, config, pool):
    # type: (batch.BatchServiceClient, dict, batchmodels.PoolAddParameter) ->
    #        List[batchmodels.ComputeNode]
    """Add the pool unless it already exists, then wait for it to be ready
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param batchmodels.PoolAddParameter pool: pool addparameter object
    :rtype: list
    :return: list of nodes
    """
    # create pool if not exists
    try:
        logger.info('Attempting to create pool: {}'.format(pool.id))
        if settings.verbose(config):
            logger.debug('node prep commandline: {}'.format(
                pool.start_task.command_line))
        batch_client.pool.add(pool)
        logger.info('Created pool: {}'.format(pool.id))
    except batchmodels.BatchErrorException as ex:
        # an already existing pool is tolerated, anything else propagates
        if ex.error.code == 'PoolExists':
            logger.error('Pool {!r} already exists'.format(pool.id))
        else:
            raise
    # wait for pool idle
    ready_nodes = wait_for_pool_ready(batch_client, config, pool.id)
    return ready_nodes
def _add_admin_user_to_compute_node(
        batch_client, config, node, username, ssh_public_key_data):
    # type: (batch.BatchServiceClient, dict, batchmodels.ComputeNode, str,
    #        str) -> None
    """Add an administrative SSH user to a compute node. The account expiry
    is the current UTC time plus the pool ssh expiry_days setting.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param node: The compute node.
    :type node: `azure.batch.batch_service_client.models.ComputeNode`
    :param str username: user name
    :param str ssh_public_key_data: ssh rsa public key data
    """
    pool = settings.pool_settings(config)
    lifetime = datetime.timedelta(days=pool.ssh.expiry_days)
    expiry = datetime.datetime.utcnow() + lifetime
    logger.info('adding user {} to node {} in pool {}, expiry={}'.format(
        username, node.id, pool.id, expiry))
    admin_user = batchmodels.ComputeNodeUser(
        username,
        is_admin=True,
        expiry_time=expiry,
        password=None,
        ssh_public_key=ssh_public_key_data,
    )
    try:
        batch_client.compute_node.add_user(pool.id, node.id, admin_user)
    except batchmodels.batch_error.BatchErrorException as ex:
        if 'The specified node user already exists' not in ex.message.value:
            # log as error instead of raising the exception in case
            # of low-priority removal
            logger.error(ex.message.value)
        else:
            logger.warning('user {} already exists on node {}'.format(
                username, node.id))
def add_ssh_user(batch_client, config, nodes=None):
    # type: (batch.BatchServiceClient, dict,
    #        List[batchmodels.ComputeNode]) -> None
    """Create the configured SSH user on every node of the pool and hand
    off to SSH tunnel script generation
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param list nodes: list of nodes
    """
    pool = settings.pool_settings(config)
    ssh = pool.ssh
    if util.is_none_or_empty(ssh.username):
        logger.info('not creating ssh user on pool {}'.format(pool.id))
        return
    if util.is_not_empty(ssh.ssh_public_key_data):
        # public key data given directly in settings
        private_key = ssh.ssh_private_key
        public_key_data = ssh.ssh_public_key_data
    else:
        if ssh.ssh_public_key is not None:
            public_key_file = ssh.ssh_public_key
            private_key = ssh.ssh_private_key
        else:
            # no key configured, generate a new key pair
            private_key, public_key_file = crypto.generate_ssh_keypair(
                ssh.generated_file_export_path)
        # read public key data
        with public_key_file.open('rb') as fd:
            public_key_data = fd.read().decode('utf8')
    # get node list if not provided
    if nodes is None:
        nodes = batch_client.compute_node.list(pool.id)
    for node in nodes:
        _add_admin_user_to_compute_node(
            batch_client, config, node, ssh.username, public_key_data)
    # generate tunnel script if requested
    generate_ssh_tunnel_script(batch_client, pool, private_key, nodes)
def generate_ssh_tunnel_script(batch_client, pool, ssh_priv_key, nodes):
    # type: (batch.BatchServiceClient, PoolSettings, pathlib.Path,
    #        List[batchmodels.ComputeNode]) -> None
    """Generate SSH tunneling script
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param PoolSettings pool: pool settings
    :param pathlib.Path ssh_priv_key: path to ssh private key; if None the
        default key file in the generated file export path is used
    :param list nodes: list of nodes; re-queried if None or if the count
        does not match the configured pool vm count
    """
    if not pool.ssh.generate_docker_tunnel_script:
        return
    # materialize once so a one-shot iterable is not exhausted by the
    # length check before the script is written
    if nodes is not None:
        nodes = list(nodes)
    # vm_count carries separate dedicated and low priority counts, so the
    # node total has to be compared against their sum
    expected_nodes = pool.vm_count.dedicated + pool.vm_count.low_priority
    if nodes is None or len(nodes) != expected_nodes:
        nodes = list(batch_client.compute_node.list(pool.id))
    if ssh_priv_key is None:
        ssh_priv_key = pathlib.Path(
            pool.ssh.generated_file_export_path,
            crypto.get_ssh_key_prefix())
    if not ssh_priv_key.exists():
        logger.warning(
            ('cannot generate tunnel script with non-existant RSA '
             'private key: {}').format(ssh_priv_key))
        return
    # $port and $ip are shell variables resolved by the generated script
    ssh_args = [
        'ssh', '-o', 'StrictHostKeyChecking=no',
        '-o', 'UserKnownHostsFile={}'.format(os.devnull),
        '-i', str(ssh_priv_key), '-p', '$port', '-N',
        '-L', '2375:localhost:2375', '-L', '3476:localhost:3476',
        '{}@$ip'.format(pool.ssh.username)
    ]
    tunnelscript = pathlib.Path(
        pool.ssh.generated_file_export_path, _SSH_TUNNEL_SCRIPT)
    with tunnelscript.open('w') as fd:
        fd.write('#!/usr/bin/env bash\n')
        fd.write('set -e\n')
        # populate node arrays
        fd.write('declare -A nodes\n')
        fd.write('declare -A ips\n')
        fd.write('declare -A ports\n')
        for i, node in enumerate(nodes):
            rls = batch_client.compute_node.get_remote_login_settings(
                pool.id, node.id)
            fd.write('nodes[{}]={}\n'.format(i, node.id))
            fd.write('ips[{}]={}\n'.format(i, rls.remote_login_ip_address))
            fd.write('ports[{}]={}\n'.format(i, rls.remote_login_port))
        fd.write(
            'if [ -z $1 ]; then echo must specify node cardinal; exit 1; '
            'fi\n')
        fd.write('node=${nodes[$1]}\n')
        fd.write('ip=${ips[$1]}\n')
        fd.write('port=${ports[$1]}\n')
        fd.write(
            'echo tunneling to docker daemon on $node at '
            '$ip:$port\n')
        # run the tunnel in the background and report its pid
        fd.write(' '.join(ssh_args))
        fd.write(' >{} 2>&1 &\n'.format(os.devnull))
        fd.write('pid=$!\n')
        fd.write('echo ssh tunnel pid is $pid\n')
        fd.write(
            'echo execute docker commands with DOCKER_HOST=: or with '
            'option: -H :\n')
    # make the script executable
    os.chmod(str(tunnelscript), 0o755)
    logger.info('ssh tunnel script generated: {}'.format(tunnelscript))
def del_ssh_user(batch_client, config, nodes=None):
    # type: (batch.BatchServiceClient, dict,
    #        List[batchmodels.ComputeNode]) -> None
    """Remove the configured SSH user from every node of a pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param list nodes: list of nodes
    """
    pool = settings.pool_settings(config)
    username = pool.ssh.username
    if util.is_none_or_empty(username):
        logger.error('not deleting unspecified ssh user on pool {}'.format(
            pool.id))
        return
    prompt = 'delete user {} from pool {}'.format(pool.ssh.username, pool.id)
    if not util.confirm_action(config, prompt):
        return
    # get node list if not provided
    target_nodes = nodes
    if target_nodes is None:
        target_nodes = batch_client.compute_node.list(pool.id)
    for node in target_nodes:
        try:
            batch_client.compute_node.delete_user(
                pool.id, node.id, pool.ssh.username)
            logger.debug('deleted user {} from node {}'.format(
                pool.ssh.username, node.id))
        except batchmodels.batch_error.BatchErrorException as ex:
            # a user that is already absent on the node is not an error
            if 'The node user does not exist' in ex.message.value:
                continue
            raise
def list_pools(batch_client):
    # type: (azure.batch.batch_service_client.BatchServiceClient) -> None
    """List pools
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    """
    count = 0
    for pool in batch_client.pool.list():
        if util.is_not_empty(pool.resize_errors):
            errors = ' resize_error=({})'.format(' '.join(
                'code={} msg={}'.format(err.code, err.message)
                for err in pool.resize_errors))
        else:
            errors = ''
        # guard against pools that carry no virtual machine configuration
        # so that a single such pool does not abort the whole listing
        vm_config = pool.virtual_machine_configuration
        node_agent = (
            vm_config.node_agent_sku_id if vm_config is not None else None
        )
        logger.info(
            ('pool_id={} [state={} allocation_state={}{} vm_size={} '
             'node_agent={} vm_dedicated_count={} '
             'target_vm_dedicated_count={} vm_low_priority_count={} '
             'target_vm_low_priority_count={}]').format(
                 pool.id, pool.state, pool.allocation_state, errors,
                 pool.vm_size,
                 node_agent,
                 pool.current_dedicated_nodes,
                 pool.target_dedicated_nodes, pool.current_low_priority_nodes,
                 pool.target_low_priority_nodes))
        count += 1
    if count == 0:
        logger.error('no pools found')
def _version_sort_key(version):
    # type: (str) -> tuple
    """Convert a dotted version string into a comparable tuple
    :param str version: version string, e.g., '2.8.0' or '2.8.0rc1'
    :rtype: tuple
    :return: (major, minor, numeric patch, patch suffix); missing minor or
        patch components are treated as 0
    """
    parts = version.split('.')
    major = int(parts[0])
    minor = int(parts[1]) if len(parts) > 1 else 0
    patch = parts[2] if len(parts) > 2 else '0'
    # split the patch component into its leading digits and any trailing
    # qualifier so the numeric part compares as a number, not as text
    suffix = patch.lstrip('0123456789')
    digits = patch[:len(patch) - len(suffix)]
    return (major, minor, int(digits) if digits else 0, suffix)


def _check_metadata_mismatch(mdtype, metadata, req_ge=None):
    # type: (str, List[batchmodels.MetadataItem], str) -> None
    """Check for metadata mismatch
    :param str mdtype: metadata type (e.g., pool, job)
    :param list metadata: list of metadata items
    :param str req_ge: required greater than or equal to
    :raises RuntimeError: if req_ge is given and the metadata is absent or
        the version metadata item is lower than req_ge
    """
    if util.is_none_or_empty(metadata):
        if req_ge is not None:
            raise RuntimeError(
                ('{} version metadata not present but version {} is '
                 'required').format(mdtype, req_ge))
        else:
            logger.warning('{} version metadata not present'.format(mdtype))
        return
    for md in metadata:
        if md.name != settings.get_metadata_version_name():
            continue
        if md.value != __version__:
            logger.warning(
                '{} version metadata mismatch: {}={} cli={}'.format(
                    mdtype, mdtype, md.value, __version__))
        if req_ge is not None:
            # compare parsed versions so that, e.g., 2.8.10 is not ordered
            # before 2.8.9
            if _version_sort_key(md.value) < _version_sort_key(req_ge):
                raise RuntimeError(
                    ('{} version of {} does not meet the version '
                     'requirement of at least {}').format(
                         mdtype, md.value, req_ge))
        # only the first version metadata item is considered
        break
def resize_pool(batch_client, config, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        bool) -> list
    """Resize a pool to the vm counts given in the pool settings
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool wait: wait for operation to complete
    :rtype: list or None
    :return: list of nodes if wait or None
    """
    pool = settings.pool_settings(config)
    _pool = batch_client.pool.get(pool.id)
    # check pool metadata version
    _check_metadata_mismatch('pool', _pool.metadata)
    logger.info(
        ('Resizing pool {} to {} compute nodes [current_dedicated_nodes={} '
         'current_low_priority_nodes={}]').format(
             pool.id, pool.vm_count, _pool.current_dedicated_nodes,
             _pool.current_low_priority_nodes))
    # node total before the resize request is issued
    current_total = (
        _pool.current_dedicated_nodes + _pool.current_low_priority_nodes
    )
    resize_param = batchmodels.PoolResizeParameter(
        target_dedicated_nodes=pool.vm_count.dedicated,
        target_low_priority_nodes=pool.vm_count.low_priority,
        resize_timeout=pool.resize_timeout,
    )
    batch_client.pool.resize(
        pool_id=pool.id,
        pool_resize_parameter=resize_param,
    )
    if not wait:
        return None
    # wait until at least one node has entered leaving_pool state first
    # if this pool is being resized down
    delta = (
        pool.vm_count.dedicated + pool.vm_count.low_priority -
        current_total
    )
    if delta < 0:
        logger.debug(
            'waiting for resize to start on pool: {}'.format(pool.id))
        while True:
            nodes = list(batch_client.compute_node.list(pool.id))
            resize_started = (
                len(nodes) != current_total or
                any(node.state == batchmodels.ComputeNodeState.leaving_pool
                    for node in nodes)
            )
            if resize_started:
                break
            time.sleep(1)
    return wait_for_pool_ready(
        batch_client, config, pool.id,
        addl_end_states=[batchmodels.ComputeNodeState.running])
def del_pool(batch_client, config, pool_id=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> bool
    """Delete a pool after confirmation
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str pool_id: pool id; taken from config if not given
    :rtype: bool
    :return: if pool was deleted
    """
    target = pool_id
    if util.is_none_or_empty(target):
        target = settings.pool_id(config)
    confirmed = util.confirm_action(config, 'delete {} pool'.format(target))
    if confirmed:
        logger.info('Deleting pool: {}'.format(target))
        batch_client.pool.delete(target)
        return True
    return False
def pool_autoscale_disable(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Disable autoscale on the configured pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    target_pool = settings.pool_id(config)
    batch_client.pool.disable_auto_scale(pool_id=target_pool)
    message = 'autoscale disabled for pool {}'.format(target_pool)
    logger.info(message)
def pool_autoscale_enable(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Enable or update the autoscale formula of the configured pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    pool = settings.pool_settings(config)
    _pool = batch_client.pool.get(pool.id)
    # check pool metadata
    # TODO fix req version to current release version until 2.9.0
    _check_metadata_mismatch('pool', _pool.metadata, req_ge='2.8.0')
    # a dummy formula is only needed when autoscale is currently off on
    # the pool and the config does not provide a formula
    use_dummy = (
        not _pool.enable_auto_scale and
        not settings.is_pool_autoscale_enabled(config, pas=pool.autoscale)
    )
    if use_dummy:
        question = (
            'enable dummy formula for pool {} as no autoscale '
            'formula exists').format(pool.id)
        if not util.confirm_action(config, question):
            logger.error('not enabling autoscale for pool {}'.format(
                pool.id))
            return
        # dummy formula pins the targets to the pool's current targets
        formula = (
            '$TargetDedicatedNodes = {}; '
            '$TargetLowPriorityNodes = {};'
        ).format(
            _pool.target_dedicated_nodes, _pool.target_low_priority_nodes)
        interval = None
    else:
        formula = autoscale.get_formula(pool)
        interval = pool.autoscale.evaluation_interval
    # enable autoscale
    batch_client.pool.enable_auto_scale(
        pool_id=pool.id,
        auto_scale_formula=formula,
        auto_scale_evaluation_interval=interval,
    )
    logger.info('autoscale enabled/updated for pool {}'.format(pool.id))
def _output_autoscale_result(result):
    # type: (batchmodels.AutoScaleRun) -> None
    """Log the outcome of an autoscale evaluation or last execution
    :param batchmodels.AutoScaleRun result: result
    """
    if result is None:
        logger.error(
            'autoscale result is invalid, ensure autoscale is enabled')
        return
    run_error = result.error
    if run_error is None:
        # successful run: report the results and when they were produced
        logger.info('autoscale result: {}'.format(result.results))
        logger.info('last autoscale evaluation: {}'.format(result.timestamp))
    else:
        logger.error('autoscale evaluate error: code={} message={}'.format(
            result.error.code, result.error.message))
def pool_autoscale_evaluate(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Evaluate the configured autoscale formula against the pool and log
    the outcome
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    pool = settings.pool_settings(config)
    if settings.is_pool_autoscale_enabled(config, pas=pool.autoscale):
        evaluation = batch_client.pool.evaluate_auto_scale(
            pool_id=pool.id,
            auto_scale_formula=autoscale.get_formula(pool),
        )
        _output_autoscale_result(evaluation)
    else:
        logger.error(
            ('cannot evaluate autoscale for pool {}, not enabled or '
             'no formula').format(pool.id))
def pool_autoscale_lastexec(batch_client, config):
    # type: (batch.BatchServiceClient, dict) -> None
    """Log the most recent autoscale run recorded on the pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    pool_id = settings.pool_id(config)
    cloud_pool = batch_client.pool.get(pool_id)
    if cloud_pool.enable_auto_scale:
        _output_autoscale_result(cloud_pool.auto_scale_run)
    else:
        # nothing to report when autoscale is off on the pool
        logger.error(
            ('last execution information not available for autoscale '
             'disabled pool {}').format(pool_id))
def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
    # type: (batch.BatchServiceClient, dict, bool, str) -> None
    """Reboot one node, or every start task failed node, in the pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool all_start_task_failed: reboot all start task failed nodes
    :param str node_id: node id to reboot (used only when
        all_start_task_failed is false)
    """
    pool_id = settings.pool_id(config)
    if not all_start_task_failed:
        _reboot_node(batch_client, pool_id, node_id, False)
        return
    list_options = batchmodels.ComputeNodeListOptions(
        filter='state eq \'starttaskfailed\'',
    )
    # materialize the listing before prompting for each node
    failed_nodes = list(
        batch_client.compute_node.list(
            pool_id=pool_id,
            compute_node_list_options=list_options,
        ))
    for failed in failed_nodes:
        prompt = 'reboot node {} from {} pool'.format(failed.id, pool_id)
        if util.confirm_action(config, prompt):
            _reboot_node(batch_client, pool_id, failed.id, False)
def del_node(batch_client, config, all_start_task_failed, node_id):
    # type: (batch.BatchServiceClient, dict, bool, str) -> None
    """Remove one node, or every start task failed node, from the pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes
    :param str node_id: node id to delete (used only when
        all_start_task_failed is false)
    :raises ValueError: if a single node is requested with an empty node id
    """
    pool_id = settings.pool_id(config)
    if all_start_task_failed:
        list_options = batchmodels.ComputeNodeListOptions(
            filter='state eq \'starttaskfailed\'',
        )
        candidates = [
            node.id for node in batch_client.compute_node.list(
                pool_id=pool_id,
                compute_node_list_options=list_options,
            )
        ]
    else:
        if util.is_none_or_empty(node_id):
            raise ValueError('node id is invalid')
        candidates = [node_id]
    # keep only the nodes the user confirms
    node_ids = []
    for candidate in candidates:
        prompt = 'delete node {} from {} pool'.format(candidate, pool_id)
        if util.confirm_action(config, prompt):
            node_ids.append(candidate)
    if util.is_none_or_empty(node_ids):
        logger.warning('no nodes to delete from pool: {}'.format(pool_id))
        return
    logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
    removal = batchmodels.NodeRemoveParameter(node_list=node_ids)
    batch_client.pool.remove_nodes(
        pool_id=pool_id,
        node_remove_parameter=removal,
    )
def check_pool_for_job_migration(
        batch_client, config, jobid=None, poolid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str) -> None
    """Verify that each job may be migrated to the target pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to migrate; all configured jobs if None
    :param str poolid: pool id to update to; configured pool if None
    :raises RuntimeError: if a job is completed/deleting/terminating, uses
        an autopool specification, or already targets the pool
    """
    target_pool = settings.pool_id(config) if poolid is None else poolid
    if jobid is not None:
        jobspecs = [{'id': jobid}]
    else:
        jobspecs = settings.job_specifications(config)
    # job states from which a migration is not permitted
    blocked_states = (
        batchmodels.JobState.completed,
        batchmodels.JobState.deleting,
        batchmodels.JobState.terminating,
    )
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        cloud_job = batch_client.job.get(job_id=job_id)
        if cloud_job.state in blocked_states:
            raise RuntimeError(
                'cannot migrate job {} in state {}'.format(
                    job_id, cloud_job.state))
        pool_info = cloud_job.pool_info
        if pool_info.auto_pool_specification is not None:
            raise RuntimeError(
                'cannot migrate job {} with an autopool specification'.format(
                    job_id))
        if pool_info.pool_id == target_pool:
            raise RuntimeError(
                'cannot migrate job {} to the same pool {}'.format(
                    job_id, target_pool))
def update_job_with_pool(batch_client, config, jobid=None, poolid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str) -> None
    """Patch each job so that it targets a different pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to update; all configured jobs if None
    :param str poolid: pool id to update to; configured pool if None
    """
    target_pool = settings.pool_id(config) if poolid is None else poolid
    if jobid is not None:
        jobspecs = [{'id': jobid}]
    else:
        jobspecs = settings.job_specifications(config)
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        patch = batchmodels.JobPatchParameter(
            pool_info=batchmodels.PoolInformation(pool_id=target_pool),
        )
        batch_client.job.patch(job_id=job_id, job_patch_parameter=patch)
        logger.info(
            'updated job {} to target pool {}'.format(job_id, target_pool))
def _terminate_tasks_of_job(batch_client, config, job_id):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> None
    """Terminate all tasks of a job and wait for them; exists so that
    disable_jobs can reach the module-level terminate_tasks function, which
    is shadowed inside disable_jobs by its parameter of the same name
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str job_id: job id whose tasks are terminated
    """
    terminate_tasks(batch_client, config, jobid=job_id, wait=True)


def disable_jobs(
        batch_client, config, disable_tasks_action, jobid=None,
        disabling_state_ok=False, terminate_tasks=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str, bool, bool) -> None
    """Disable jobs and wait for each to leave the active state
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str disable_tasks_action: disable tasks action
    :param str jobid: job id to disable; all configured jobs if None
    :param bool disabling_state_ok: disabling state is ok to proceed
    :param bool terminate_tasks: terminate tasks after disable
    """
    if jobid is None:
        jobs = settings.job_specifications(config)
    else:
        jobs = [{'id': jobid}]
    for job in jobs:
        job_id = settings.job_id(job)
        try:
            batch_client.job.disable(
                job_id=job_id,
                disable_tasks=batchmodels.DisableJobOption(
                    disable_tasks_action),
            )
        except batchmodels.batch_error.BatchErrorException as ex:
            # an already completed job needs no disabling; any other
            # failure is reported instead of being dropped silently
            if ('The specified job is already in a completed state' not in
                    ex.message.value):
                logger.error('failed to disable job {}: {}'.format(
                    job_id, ex.message.value))
            continue
        # wait for job to enter disabled/completed/deleting state
        while True:
            _job = batch_client.job.get(
                job_id=job_id,
                job_get_options=batchmodels.JobGetOptions(
                    select='id,state')
            )
            if ((disabling_state_ok and
                 _job.state == batchmodels.JobState.disabling) or
                    _job.state == batchmodels.JobState.disabled or
                    _job.state == batchmodels.JobState.completed or
                    _job.state == batchmodels.JobState.deleting):
                break
            time.sleep(1)
        logger.info('job {} disabled'.format(job_id))
        if terminate_tasks:
            # the flag shadows the terminate_tasks function in this scope,
            # calling it directly would raise TypeError on a bool
            _terminate_tasks_of_job(batch_client, config, job_id)
def enable_jobs(batch_client, config, jobid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> None
    """Enable jobs
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to enable; all configured jobs if None
    """
    if jobid is None:
        jobs = settings.job_specifications(config)
    else:
        jobs = [{'id': jobid}]
    for job in jobs:
        job_id = settings.job_id(job)
        try:
            batch_client.job.enable(job_id=job_id)
        except batchmodels.batch_error.BatchErrorException as ex:
            # an already completed job cannot be enabled and is skipped
            # quietly; any other failure is reported rather than being
            # dropped without a trace
            if ('The specified job is already in a completed state' not in
                    ex.message.value):
                logger.error('failed to enable job {}: {}'.format(
                    job_id, ex.message.value))
        else:
            logger.info('job {} enabled'.format(job_id))
def del_jobs(batch_client, config, jobid=None, termtasks=False, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, bool, bool) -> None
    """Delete jobs, optionally disabling them and terminating their tasks
    beforehand
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to delete; all configured jobs if None
    :param bool termtasks: terminate tasks manually prior
    :param bool wait: wait for jobs to delete
    """
    if jobid is not None:
        jobspecs = [{'id': jobid}]
    else:
        jobspecs = settings.job_specifications(config)
    # jobs that were declined or not found are not waited upon
    skipped = set()
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        if not util.confirm_action(config, 'delete {} job'.format(job_id)):
            skipped.add(job_id)
            continue
        logger.info('Deleting job: {}'.format(job_id))
        try:
            if termtasks:
                # disable job first to prevent active tasks from
                # getting processed
                logger.debug(
                    'disabling job {} first due to task termination'.format(
                        job_id))
                disable_jobs(
                    batch_client, config, 'wait', jobid=job_id,
                    disabling_state_ok=True, terminate_tasks=True)
            # delete job
            batch_client.job.delete(job_id)
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
            logger.error('{} job does not exist'.format(job_id))
            skipped.add(job_id)
    if not wait:
        return
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        if job_id in skipped:
            continue
        try:
            logger.debug('waiting for job {} to delete'.format(job_id))
            # poll until the service reports the job as missing
            while True:
                batch_client.job.get(
                    job_id,
                    job_get_options=batchmodels.JobGetOptions(select='id')
                )
                time.sleep(1)
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
            logger.info('job {} does not exist'.format(job_id))
def del_all_jobs(batch_client, config, termtasks=False, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        bool, bool) -> None
    """Delete every job returned by the Batch job listing
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool termtasks: terminate tasks prior
    :param bool wait: wait for jobs to delete
    """
    deleted = set()
    logger.debug('Getting list of all jobs...')
    for cloud_job in batch_client.job.list():
        prompt = 'delete {} job'.format(cloud_job.id)
        if not util.confirm_action(config, prompt):
            continue
        if termtasks:
            terminate_tasks(
                batch_client, config, jobid=cloud_job.id, wait=True)
        logger.info('Deleting job: {}'.format(cloud_job.id))
        batch_client.job.delete(cloud_job.id)
        deleted.add(cloud_job.id)
    if not wait:
        return
    for job_id in deleted:
        try:
            logger.debug('waiting for job {} to delete'.format(job_id))
            # poll until the lookup fails because the job is gone
            while True:
                batch_client.job.get(job_id)
                time.sleep(1)
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
def del_tasks(batch_client, config, jobid=None, taskid=None, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str, bool) -> None
    """Terminate and then delete tasks
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id of task to delete; all configured jobs if None
    :param str taskid: task id to delete; all tasks of the job if None
    :param bool wait: wait for task to be deleted
    """
    def _task_ids_for(job_id):
        # explicit task id wins, otherwise list every task id of the job
        if taskid is not None:
            return [taskid]
        listing = batch_client.task.list(
            job_id,
            task_list_options=batchmodels.TaskListOptions(select='id')
        )
        return [entry.id for entry in listing]

    # first terminate tasks, force wait for completion
    terminate_tasks(
        batch_client, config, jobid=jobid, taskid=taskid, wait=True)
    # proceed with deletion
    if jobid is not None:
        jobspecs = [{'id': jobid}]
    else:
        jobspecs = settings.job_specifications(config)
    # per-job set of task ids the user declined to delete
    declined = {}
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        declined[job_id] = set()
        for task in _task_ids_for(job_id):
            prompt = 'delete {} task in job {}'.format(task, job_id)
            if not util.confirm_action(config, prompt):
                declined[job_id].add(task)
                continue
            logger.info('Deleting task: {}'.format(task))
            batch_client.task.delete(job_id, task)
    if not wait:
        return
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        for task in _task_ids_for(job_id):
            if task in declined.get(job_id, ()):
                continue
            try:
                logger.debug(
                    'waiting for task {} in job {} to terminate'.format(
                        task, job_id))
                # poll until the lookup fails because the task is gone
                while True:
                    batch_client.task.get(
                        job_id, task,
                        task_get_options=batchmodels.TaskGetOptions(
                            select='id')
                    )
                    time.sleep(1)
            except batchmodels.batch_error.BatchErrorException as ex:
                if 'The specified task does not exist' not in ex.message.value:
                    raise
                logger.info('task {} in job {} does not exist'.format(
                    task, job_id))
def clean_mi_jobs(batch_client, config):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None
    """Run docker stop/rm clean up tasks for completed multi-instance tasks
    of each configured job
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    for spec in settings.job_specifications(config):
        job_id = settings.job_id(spec)
        if not util.confirm_action(config, 'cleanup {} job'.format(job_id)):
            continue
        cleanup_job_id = 'shipyardcleanup-' + job_id
        cleanup_job = batchmodels.JobAddParameter(
            id=cleanup_job_id,
            pool_info=batchmodels.PoolInformation(
                pool_id=settings.pool_id(config)),
        )
        try:
            batch_client.job.add(cleanup_job)
            logger.info('Added cleanup job: {}'.format(cleanup_job.id))
        except batchmodels.batch_error.BatchErrorException as ex:
            # a pre-existing cleanup job is reused
            if 'The specified job already exists' not in ex.message.value:
                raise
        # ids of clean up tasks that were already submitted
        already_cleaned = [
            entry.id for entry in batch_client.task.list(
                cleanup_job_id,
                task_list_options=batchmodels.TaskListOptions(select='id')
            )
        ]
        for task in batch_client.task.list(job_id):
            mi_settings = task.multi_instance_settings
            if task.id in already_cleaned or mi_settings is None:
                continue
            # only completed tasks are cleaned up
            if task.state != batchmodels.TaskState.completed:
                continue
            # container name is the token following --name in the
            # coordination command line
            after_flag = mi_settings.coordination_command_line.split(
                '--name')[-1]
            name = after_flag.split()[0]
            coordination_cmd = util.wrap_commands_in_shell([
                'docker stop {}'.format(name),
                'docker rm -v {}'.format(name),
                'exit 0',
            ], wait=False)
            cleanup_task = batchmodels.TaskAddParameter(
                id=task.id,
                multi_instance_settings=batchmodels.MultiInstanceSettings(
                    number_of_instances=mi_settings.number_of_instances,
                    coordination_command_line=coordination_cmd,
                ),
                command_line='/bin/sh -c "exit 0"',
                user_identity=_RUN_ELEVATED,
            )
            batch_client.task.add(job_id=cleanup_job_id, task=cleanup_task)
            logger.debug(
                ('Waiting for docker multi-instance clean up task {} '
                 'for job {} to complete').format(cleanup_task.id, job_id))
            # wait for cleanup task to complete before adding another
            while True:
                cleanup_task = batch_client.task.get(
                    cleanup_job_id, cleanup_task.id,
                    task_get_options=batchmodels.TaskGetOptions(
                        select='id,state')
                )
                if cleanup_task.state == batchmodels.TaskState.completed:
                    break
                time.sleep(1)
            logger.info(
                ('Docker multi-instance clean up task {} for job {} '
                 'completed').format(cleanup_task.id, job_id))
def del_clean_mi_jobs(batch_client, config):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None
    """Delete the multi-instance clean up job of each configured job
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    cleanup_job_ids = (
        'shipyardcleanup-' + settings.job_id(spec)
        for spec in settings.job_specifications(config)
    )
    for cleanup_job_id in cleanup_job_ids:
        logger.info('deleting job: {}'.format(cleanup_job_id))
        try:
            batch_client.job.delete(cleanup_job_id)
        except batchmodels.batch_error.BatchErrorException:
            # best effort: Batch errors on delete are ignored
            pass
def terminate_jobs(
        batch_client, config, jobid=None, termtasks=False, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, bool, bool) -> None
    """Terminate jobs, optionally terminating their tasks beforehand
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to terminate; all configured jobs if None
    :param bool termtasks: terminate tasks manually prior
    :param bool wait: wait for job to terminate
    """
    if termtasks:
        terminate_tasks(batch_client, config, jobid=jobid, wait=True)
    if jobid is not None:
        jobspecs = [{'id': jobid}]
    else:
        jobspecs = settings.job_specifications(config)
    # declined jobs are not waited upon
    declined = set()
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        if util.confirm_action(config, 'terminate {} job'.format(job_id)):
            logger.info('Terminating job: {}'.format(job_id))
            batch_client.job.terminate(job_id)
        else:
            declined.add(job_id)
    if not wait:
        return
    for spec in jobspecs:
        job_id = settings.job_id(spec)
        if job_id in declined:
            continue
        try:
            logger.debug('waiting for job {} to terminate'.format(job_id))
            # poll until the job reports the completed state
            while True:
                polled = batch_client.job.get(job_id)
                if polled.state == batchmodels.JobState.completed:
                    break
                time.sleep(1)
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
def terminate_all_jobs(batch_client, config, termtasks=False, wait=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        bool, bool) -> None
    """Terminate every job returned by the Batch job listing
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool termtasks: terminate tasks prior
    :param bool wait: wait for jobs to terminate
    """
    terminated = set()
    logger.debug('Getting list of all jobs...')
    for cloud_job in batch_client.job.list():
        prompt = 'terminate {} job'.format(cloud_job.id)
        if not util.confirm_action(config, prompt):
            continue
        if termtasks:
            terminate_tasks(
                batch_client, config, jobid=cloud_job.id, wait=True)
        logger.info('Terminating job: {}'.format(cloud_job.id))
        batch_client.job.terminate(cloud_job.id)
        terminated.add(cloud_job.id)
    if not wait:
        return
    for job_id in terminated:
        try:
            logger.debug('waiting for job {} to terminate'.format(job_id))
            # poll until the job reports the completed state
            while True:
                polled = batch_client.job.get(job_id)
                if polled.state == batchmodels.JobState.completed:
                    break
                time.sleep(1)
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
def _send_docker_kill_signal(
        batch_client, config, username, ssh_private_key, pool_id, node_id,
        job_id, task_id, task_is_mi):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict, str,
    #        pathlib.Path, str, str, str, str, bool) -> None
    """Issue docker kill and container removal over SSH on every node that
    hosts the task
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str username: SSH username
    :param pathlib.Path ssh_private_key: SSH private key
    :param str pool_id: pool_id of node
    :param str node_id: node_id of node
    :param str job_id: job id of task id to kill
    :param str task_id: task id to kill
    :param bool task_is_mi: task is multi-instance
    """
    targets = [(pool_id, node_id)]
    task_name = None
    # if this task is multi-instance, get all subtasks
    if task_is_mi:
        subtasks = batch_client.task.list_subtasks(job_id, task_id)
        targets.extend(
            (sub.node_info.pool_id, sub.node_info.node_id)
            for sub in subtasks.value
        )
        # fetch container name from the first task of the matching job
        # specification
        try:
            for spec in settings.job_specifications(config):
                if job_id == settings.job_id(spec):
                    task_name = settings.task_name(
                        settings.job_tasks(spec)[0])
                    break
        except KeyError:
            pass
    # TODO get task names for non-mi tasks?
    if task_name is None:
        task_name = '{}-{}'.format(job_id, task_id)
    # the remote command is identical for every target node
    remote_cmd = (
        'sudo /bin/bash -c "docker kill {tn}; '
        'docker ps -qa -f name={tn} | '
        'xargs --no-run-if-empty docker rm -v"').format(tn=task_name)
    # for each task node target, issue docker kill
    for target_pool, target_node in targets:
        rls = batch_client.compute_node.get_remote_login_settings(
            target_pool, target_node)
        ssh_args = [
            'ssh', '-o', 'StrictHostKeyChecking=no',
            '-o', 'UserKnownHostsFile={}'.format(os.devnull),
            '-i', str(ssh_private_key), '-p', str(rls.remote_login_port),
            '-t', '{}@{}'.format(username, rls.remote_login_ip_address),
            remote_cmd,
        ]
        rc = util.subprocess_with_output(ssh_args, shell=False)
        if rc != 0:
            logger.error('docker kill failed with return code: {}'.format(rc))
def terminate_tasks(
        batch_client, config, jobid=None, taskid=None, wait=False,
        force=False):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str, bool, bool) -> None
    """Terminate tasks; running (or forced) tasks have their docker
    container killed over SSH, other tasks are terminated through the
    Batch service
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id of task to terminate; all configured jobs
        if None
    :param str taskid: task id to terminate; all tasks of the job if None
    :param bool wait: wait for task to terminate
    :param bool force: force task docker kill signal regardless of state
    :raises ValueError: if no SSH username is configured
    :raises RuntimeError: if the SSH private key file does not exist
    """
    # get ssh login settings
    pool = settings.pool_settings(config)
    if util.is_none_or_empty(pool.ssh.username):
        raise ValueError(
            'cannot terminate docker container without an SSH username')
    ssh_private_key = pool.ssh.ssh_private_key
    if ssh_private_key is None:
        ssh_private_key = pathlib.Path(
            pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix())
    if not ssh_private_key.exists():
        raise RuntimeError('SSH private key file not found at: {}'.format(
            ssh_private_key))
    if jobid is None:
        jobs = settings.job_specifications(config)
    else:
        jobs = [{'id': jobid}]
    # per-job set of task ids that must not be waited upon
    nocheck = {}
    for job in jobs:
        job_id = settings.job_id(job)
        nocheck[job_id] = set()
        if taskid is None:
            tasks = [
                x.id for x in batch_client.task.list(
                    job_id,
                    task_list_options=batchmodels.TaskListOptions(select='id')
                )
            ]
        else:
            tasks = [taskid]
        for task in tasks:
            _task = batch_client.task.get(job_id, task)
            # if completed, skip
            if _task.state == batchmodels.TaskState.completed and not force:
                logger.debug(
                    'Skipping termination of completed task {} on '
                    'job {}'.format(task, job_id))
                nocheck[job_id].add(task)
                continue
            if not util.confirm_action(
                    config, 'terminate {} task in job {}'.format(
                        task, job_id)):
                nocheck[job_id].add(task)
                continue
            logger.info('Terminating task: {}'.format(task))
            wants_kill = (
                _task.state == batchmodels.TaskState.running or force)
            node_info = _task.node_info
            if wants_kill and node_info is not None:
                # directly send docker kill signal to the hosting node(s)
                mi_settings = _task.multi_instance_settings
                task_is_mi = (
                    mi_settings is not None and
                    mi_settings.number_of_instances > 1)
                _send_docker_kill_signal(
                    batch_client, config, pool.ssh.username, ssh_private_key,
                    node_info.pool_id, node_info.node_id,
                    job_id, task, task_is_mi)
            elif wants_kill:
                # forced kill of a task without node information: there is
                # no node to SSH into, so fall back to the service-side
                # terminate unless the task has already completed
                logger.debug(
                    'task {} in job {} has no node information, skipping '
                    'docker kill'.format(task, job_id))
                if _task.state != batchmodels.TaskState.completed:
                    batch_client.task.terminate(job_id, task)
            else:
                batch_client.task.terminate(job_id, task)
    if wait:
        for job in jobs:
            job_id = settings.job_id(job)
            if taskid is None:
                tasks = [
                    x.id for x in batch_client.task.list(
                        job_id,
                        task_list_options=batchmodels.TaskListOptions(
                            select='id'
                        )
                    )
                ]
            else:
                tasks = [taskid]
            for task in tasks:
                if task in nocheck.get(job_id, ()):
                    continue
                try:
                    logger.debug(
                        'waiting for task {} in job {} to terminate'.format(
                            task, job_id))
                    # poll until the task reports the completed state
                    while True:
                        _task = batch_client.task.get(
                            job_id, task,
                            task_get_options=batchmodels.TaskGetOptions(
                                select='state')
                        )
                        if _task.state == batchmodels.TaskState.completed:
                            break
                        time.sleep(1)
                except batchmodels.batch_error.BatchErrorException as ex:
                    # a task deleted while waiting counts as terminated
                    if ('The specified task does not exist'
                            not in ex.message.value):
                        raise
def list_nodes(batch_client, config, nodes=None):
    # type: (batch.BatchServiceClient, dict, list) -> None
    """Log a summary line for each node in the pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param list nodes: list of nodes; the pool is listed if None
    """
    pool_id = settings.pool_id(config)
    logger.debug('listing nodes for pool {}'.format(pool_id))
    if nodes is None:
        nodes = batch_client.compute_node.list(pool_id)
    for node in nodes:
        info = ''
        if node.errors is not None:
            # errors is a collection of compute node errors, not a single
            # error object, so report every entry
            info = ''.join(
                ' error=(code={} message={})'.format(err.code, err.message)
                for err in node.errors
            )
        start_task_info = node.start_task_info
        if start_task_info is not None:
            failure = start_task_info.failure_info
            if failure is not None:
                info += (' start_task_failure_info=(category={} code={} '
                         'message={})').format(
                             failure.category, failure.code, failure.message)
            else:
                info += ' start_task_exit_code={}'.format(
                    start_task_info.exit_code)
        logger.info(
            ('node_id={} [state={}{} scheduling_state={} ip_address={} '
             'vm_size={} dedicated={} total_tasks_run={} '
             'running_tasks_count={} total_tasks_succeeded={}]').format(
                 node.id, node.state, info, node.scheduling_state,
                 node.ip_address, node.vm_size, node.is_dedicated,
                 node.total_tasks_run, node.running_tasks_count,
                 node.total_tasks_succeeded))
def get_remote_login_settings(batch_client, config, nodes=None):
    # type: (batch.BatchServiceClient, dict, List[str]) -> dict
    """Collect and log the remote login settings of each node
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param list nodes: list of nodes; the pool is listed if None
    :rtype: dict
    :return: dict of node id -> remote login settings
    """
    pool_id = settings.pool_id(config)
    if nodes is None:
        nodes = batch_client.compute_node.list(pool_id)
    login_by_node = {}
    for node in nodes:
        login = batch_client.compute_node.get_remote_login_settings(
            pool_id, node.id)
        logger.info('node {}: ip {} port {}'.format(
            node.id, login.remote_login_ip_address,
            login.remote_login_port))
        login_by_node[node.id] = login
    return login_by_node
def get_remote_login_setting_for_node(batch_client, config, cardinal, node_id):
    # type: (batch.BatchServiceClient, dict, int, str) -> tuple
    """Look up the remote login address of a node chosen by id or by its
    position in the pool listing
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param int cardinal: node cardinal number (used only when node_id
        is None)
    :param str node_id: node id
    :rtype: tuple
    :return: ip, port
    :raises ValueError: if neither node_id nor cardinal is given, or the
        cardinal is not less than the number of nodes
    """
    pool_id = settings.pool_id(config)
    target_node = node_id
    if target_node is None:
        if cardinal is None:
            raise ValueError('cardinal is invalid with no node_id specified')
        pool_nodes = list(batch_client.compute_node.list(pool_id))
        node_count = len(pool_nodes)
        if cardinal >= node_count:
            raise ValueError(
                ('cardinal value {} invalid for number of nodes {} in '
                 'pool {}').format(cardinal, node_count, pool_id))
        target_node = pool_nodes[cardinal].id
    login = batch_client.compute_node.get_remote_login_settings(
        pool_id, target_node)
    return login.remote_login_ip_address, login.remote_login_port
def stream_file_and_wait_for_task(
        batch_client, config, filespec=None, disk=False):
    # type: (batch.BatchServiceClient, dict, str, bool) -> None
    """Stream a task file to stdout or disk until the task completes
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str filespec: filespec (jobid,taskid,filename); missing parts
        are prompted for
    :param bool disk: write to disk instead
    """
    # function-scope import: needed only for incremental decoding below
    import codecs
    if filespec is None:
        job_id = None
        task_id = None
        file = None
    else:
        job_id, task_id, file = filespec.split(',')
    if job_id is None:
        job_id = util.get_input('Enter job id: ')
    if task_id is None:
        task_id = util.get_input('Enter task id: ')
    if file is None:
        file = util.get_input(
            'Enter task-relative file path to stream [stdout.txt]: ')
    if file == '' or file is None:
        file = 'stdout.txt'
    # get first running task if specified
    if task_id == '@FIRSTRUNNING':
        logger.debug('attempting to get first running task in job {}'.format(
            job_id))
        while True:
            tasks = batch_client.task.list(
                job_id,
                task_list_options=batchmodels.TaskListOptions(
                    filter='state eq \'running\'',
                    select='id,state',
                ),
            )
            for task in tasks:
                task_id = task.id
                break
            if task_id == '@FIRSTRUNNING':
                time.sleep(1)
            else:
                break
    logger.debug('attempting to stream file {} from job={} task={}'.format(
        file, job_id, task_id))
    # number of bytes already emitted
    curr = 0
    completed = False
    notfound = 0
    # stateful decoder so that a multibyte character split across two
    # chunks or two range requests is decoded correctly
    decoder = codecs.getincrementaldecoder('utf8')()
    fd = None
    try:
        if disk:
            fp = pathlib.Path(job_id, task_id, file)
            if (fp.exists() and not util.confirm_action(
                    config, 'overwrite {}'.format(fp))):
                return
            fp.parent.mkdir(mode=0o750, parents=True, exist_ok=True)
            logger.info('writing streamed data to disk: {}'.format(fp))
            fd = fp.open('wb', buffering=0)
        while True:
            # get task file properties
            try:
                tfp = batch_client.file.get_properties_from_task(
                    job_id, task_id, file, raw=True)
            except batchmodels.BatchErrorException as ex:
                # NOTE(review): ex.message is matched directly here while
                # other handlers in this file use ex.message.value; kept
                # as-is, confirm the message type for this call
                if ('The specified operation is not valid for the current '
                        'state of the resource.' in ex.message):
                    time.sleep(1)
                    continue
                elif 'The specified file does not exist.' in ex.message:
                    # tolerate the file not existing yet for a few polls
                    notfound += 1
                    if notfound > 10:
                        raise
                    time.sleep(1)
                    continue
                else:
                    raise
            size = int(tfp.response.headers['Content-Length'])
            if curr < size:
                # the range end is inclusive: request exactly the bytes
                # [curr, size - 1] so that data appended after the
                # properties call is not fetched now and again next pass
                frag = batch_client.file.get_from_task(
                    job_id, task_id, file,
                    batchmodels.FileGetFromTaskOptions(
                        ocp_range='bytes={}-{}'.format(curr, size - 1))
                )
                for f in frag:
                    if fd is not None:
                        fd.write(f)
                    else:
                        print(decoder.decode(f), end='')
                curr = size
            elif completed:
                if not disk:
                    print()
                break
            if not completed and curr == size:
                # caught up with the file: check whether the task is done;
                # one more pass is made afterwards to drain trailing data
                task = batch_client.task.get(
                    job_id, task_id,
                    task_get_options=batchmodels.TaskGetOptions(
                        select='state')
                )
                if task.state == batchmodels.TaskState.completed:
                    completed = True
            time.sleep(1)
    finally:
        if fd is not None:
            fd.close()
def get_file_via_task(batch_client, config, filespec=None):
    # type: (batch.BatchServiceClient, dict, str) -> None
    """Download a single task file into the current directory
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str filespec: filespec (jobid,taskid,filename); all parts are
        prompted for if None
    :raises RuntimeError: if the local file exists and overwrite is declined
    """
    if filespec is None:
        job_id = util.get_input('Enter job id: ')
        task_id = util.get_input('Enter task id: ')
        file = util.get_input(
            'Enter task-relative file path to retrieve [stdout.txt]: ')
    else:
        job_id, task_id, file = filespec.split(',')
    if file is None or file == '':
        file = 'stdout.txt'
    # get first running task if specified
    if task_id == '@FIRSTRUNNING':
        logger.debug('attempting to get first running task in job {}'.format(
            job_id))
        running_only = batchmodels.TaskListOptions(
            filter='state eq \'running\'',
            select='id,state',
        )
        while task_id == '@FIRSTRUNNING':
            for running in batch_client.task.list(
                    job_id, task_list_options=running_only):
                task_id = running.id
                break
            if task_id == '@FIRSTRUNNING':
                time.sleep(1)
    # check if file exists on disk; a possible race condition here is
    # understood
    local_path = pathlib.Path(pathlib.Path(file).name)
    if local_path.exists():
        overwrite_ok = util.confirm_action(
            config, 'file overwrite of {}'.format(file))
        if not overwrite_ok:
            raise RuntimeError('file already exists: {}'.format(file))
    logger.debug('attempting to retrieve file {} from job={} task={}'.format(
        file, job_id, task_id))
    stream = batch_client.file.get_from_task(job_id, task_id, file)
    with local_path.open('wb') as out:
        for chunk in stream:
            out.write(chunk)
    logger.debug('file {} retrieved from job={} task={} bytes={}'.format(
        file, job_id, task_id, local_path.stat().st_size))
def get_all_files_via_task(batch_client, config, filespec=None):
    # type: (batch.BatchServiceClient, dict, str) -> None
    """Download every file of a task, optionally filtered by a pattern,
    into a jobid/taskid directory tree
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str filespec: filespec (jobid,taskid,include_pattern); all parts
        are prompted for if None
    """
    if filespec is None:
        job_id = util.get_input('Enter job id: ')
        task_id = util.get_input('Enter task id: ')
        incl = util.get_input('Enter filter: ')
    else:
        job_id, task_id, incl = filespec.split(',')
    # an empty include pattern means no filtering
    if incl is not None and len(incl) == 0:
        incl = None
    # get first running task if specified
    if task_id == '@FIRSTRUNNING':
        logger.debug('attempting to get first running task in job {}'.format(
            job_id))
        running_only = batchmodels.TaskListOptions(
            filter='state eq \'running\'',
            select='id,state',
        )
        while task_id == '@FIRSTRUNNING':
            for running in batch_client.task.list(
                    job_id, task_list_options=running_only):
                task_id = running.id
                break
            if task_id == '@FIRSTRUNNING':
                time.sleep(1)
    # iterate through all files in task and download them
    logger.debug('downloading files to {}/{}'.format(job_id, task_id))
    remote_files = batch_client.file.list_from_task(
        job_id, task_id, recursive=True)
    downloaded = 0
    # directories already created, to avoid repeated mkdir calls
    dirs_created = {'.'}
    for remote in remote_files:
        if remote.is_directory:
            continue
        if incl is not None and not fnmatch.fnmatch(remote.name, incl):
            continue
        local_path = pathlib.Path(job_id, task_id, remote.name)
        parent_dir = str(local_path.parent)
        if parent_dir not in dirs_created:
            local_path.parent.mkdir(mode=0o750, parents=True, exist_ok=True)
            dirs_created.add(parent_dir)
        stream = batch_client.file.get_from_task(
            job_id, task_id, remote.name)
        with local_path.open('wb') as out:
            for chunk in stream:
                out.write(chunk)
        downloaded += 1
    incl_shown = '' if incl is None else incl
    if downloaded == 0:
        logger.error('no files found for task {} job {} include={}'.format(
            task_id, job_id, incl_shown))
    else:
        logger.info(
            'all task files retrieved from job={} task={} include={}'.format(
                job_id, task_id, incl_shown))
def get_all_files_via_node(batch_client, config, filespec=None):
    # type: (batch.BatchServiceClient, dict, str) -> None
    """Download every file on a compute node, optionally filtered
    by an include pattern. Files are written relative to the current
    working directory under ``<pool id>/<node id>/<file name>``.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str filespec: filespec (nodeid,include_pattern); when omitted
        both values are requested interactively
    :raises ValueError: if the node id is empty
    """
    if filespec is not None:
        node_id, incl = filespec.split(',')
    else:
        # nothing supplied on the command line, ask for each part in turn
        node_id = util.get_input('Enter node id: ')
        incl = util.get_input('Enter filter: ')
    if not node_id:
        raise ValueError('node id is invalid')
    # an empty include pattern means "no filtering"
    if not incl:
        incl = None
    pool_id = settings.pool_id(config)
    logger.debug('downloading files to {}/{}'.format(pool_id, node_id))
    # parent directories already known to exist locally
    known_dirs = {'.'}
    downloaded = 0
    remote_files = batch_client.file.list_from_compute_node(
        pool_id, node_id, recursive=True)
    for entry in remote_files:
        if entry.is_directory:
            continue
        if incl is not None and not fnmatch.fnmatch(entry.name, incl):
            continue
        dest = pathlib.Path(pool_id, node_id, entry.name)
        parent = str(dest.parent)
        if parent not in known_dirs:
            dest.parent.mkdir(mode=0o750, parents=True, exist_ok=True)
            known_dirs.add(parent)
        # request the content before creating the local file
        stream = batch_client.file.get_from_compute_node(
            pool_id, node_id, entry.name)
        with dest.open('wb') as f:
            for fdata in stream:
                f.write(fdata)
        downloaded += 1
    shown_filter = '' if incl is None else incl
    if downloaded == 0:
        logger.error('no files found for pool {} node {} include={}'.format(
            pool_id, node_id, shown_filter))
    else:
        logger.info(
            'all files retrieved from pool={} node={} include={}'.format(
                pool_id, node_id, shown_filter))
def get_file_via_node(batch_client, config, filespec=None):
    # type: (batch.BatchServiceClient, dict, str) -> None
    """Download a single file from a compute node into the current
    working directory, using only the base name of the remote path
    as the local file name.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str filespec: filespec (nodeid,filename); when omitted both
        values are requested interactively
    :raises ValueError: if the node id is empty
    :raises RuntimeError: if no file was specified, or the local file
        exists and the overwrite was not confirmed
    """
    if filespec is not None:
        node_id, remote_path = filespec.split(',')
    else:
        node_id = util.get_input('Enter node id: ')
        remote_path = util.get_input(
            'Enter node-relative file path to retrieve: ')
    if not node_id:
        raise ValueError('node id is invalid')
    if remote_path is None or remote_path == '':
        raise RuntimeError('specified invalid file to retrieve')
    pool_id = settings.pool_id(config)
    # check if file exists on disk; a possible race condition here is
    # understood
    local = pathlib.Path(pathlib.Path(remote_path).name)
    if local.exists():
        overwrite = util.confirm_action(
            config, 'file overwrite of {}'.format(remote_path))
        if not overwrite:
            raise RuntimeError(
                'file already exists: {}'.format(remote_path))
    logger.debug('attempting to retrieve file {} from pool={} node={}'.format(
        remote_path, pool_id, node_id))
    stream = batch_client.file.get_from_compute_node(
        pool_id, node_id, remote_path)
    with local.open('wb') as f:
        for fdata in stream:
            f.write(fdata)
    nbytes = local.stat().st_size
    logger.debug('file {} retrieved from pool={} node={} bytes={}'.format(
        remote_path, pool_id, node_id, nbytes))
def list_jobs(batch_client, config):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None
    """Log the id, state and pool of every job in the Batch account
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    """
    any_found = False
    for job in batch_client.job.list():
        any_found = True
        logger.info('job_id={} [state={} pool_id={}]'.format(
            job.id, job.state, job.pool_info.pool_id))
    if not any_found:
        logger.error('no jobs found')
def list_tasks(batch_client, config, jobid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str) -> bool
    """Log the tasks of the specified job, or of every job in the
    configuration when no job id is given
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to list tasks from
    :rtype: bool
    :return: if all tasks have completed under job(s)
    """
    if util.is_none_or_empty(jobid):
        jobspecs = settings.job_specifications(config)
    else:
        jobspecs = [{'id': jobid}]
    all_complete = True
    for jobspec in jobspecs:
        jobid = settings.job_id(jobspec)
        seen = 0
        try:
            for task in batch_client.task.list(jobid):
                # build the execution info suffix of the log line
                if task.execution_info is None:
                    ei = ''
                elif task.execution_info.failure_info is not None:
                    ei = (' failure_info=(category={} code={} '
                          'message={})').format(
                              task.execution_info.failure_info.category,
                              task.execution_info.failure_info.code,
                              task.execution_info.failure_info.message)
                else:
                    started = task.execution_info.start_time
                    ended = task.execution_info.end_time
                    # duration is only computable once both ends are known
                    if ended is None or started is None:
                        duration = 'n/a'
                    else:
                        duration = ended - started
                    ei = (' start_time={} end_time={} duration={} '
                          'exit_code={}').format(
                              started, ended, duration,
                              task.execution_info.exit_code)
                # node placement is unknown until the task is scheduled
                if task.node_info is None:
                    pool_id = 'none'
                    node_id = 'none'
                else:
                    pool_id = task.node_info.pool_id
                    node_id = task.node_info.node_id
                logger.info(
                    'job_id={} task_id={} [state={} max_retries={} '
                    'retention_time={} pool_id={} node_id={}{}]'.format(
                        jobid, task.id, task.state,
                        task.constraints.max_task_retry_count,
                        task.constraints.retention_time,
                        pool_id, node_id, ei))
                if task.state != batchmodels.TaskState.completed:
                    all_complete = False
                seen += 1
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' not in ex.message.value:
                raise
            logger.error('{} job does not exist'.format(jobid))
            continue
        if seen == 0:
            logger.error('no tasks found for job {}'.format(jobid))
    return all_complete
def list_task_files(batch_client, config, jobid=None, taskid=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, dict,
    #        str, str) -> None
    """List task files for specified jobs. Directories are skipped;
    for each file the name, last modified time and size are logged.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param str jobid: job id to list; if not specified, all jobs in the
        configuration are listed
    :param str taskid: task id to list; if not specified, files of all
        tasks in each job are listed
    """
    if util.is_none_or_empty(jobid):
        jobs = settings.job_specifications(config)
    else:
        jobs = [{'id': jobid}]
    for job in jobs:
        jobid = settings.job_id(job)
        i = 0
        try:
            tasks = batch_client.task.list(
                jobid,
                task_list_options=batchmodels.TaskListOptions(select='id'))
            for task in tasks:
                # apply the optional task id filter
                if util.is_not_empty(taskid) and taskid != task.id:
                    continue
                j = 0
                # always query with the id of the task being iterated so
                # that listing also works when no taskid filter is given
                files = batch_client.file.list_from_task(
                    jobid, task.id, recursive=True)
                for file in files:
                    if file.is_directory:
                        continue
                    logger.info(
                        'task_id={} file={} [job_id={} lmt={} '
                        'bytes={}]'.format(
                            task.id, file.name, jobid,
                            file.properties.last_modified,
                            file.properties.content_length))
                    j += 1
                if j == 0:
                    logger.error('no files found for task {} job {}'.format(
                        task.id, jobid))
                i += 1
        except batchmodels.batch_error.BatchErrorException as ex:
            if 'The specified job does not exist' in ex.message.value:
                logger.error('{} job does not exist'.format(jobid))
                continue
            else:
                raise
        if i == 0:
            logger.error('no tasks found for job {}'.format(jobid))
def generate_docker_login_settings(config, for_ssh=False):
    # type: (dict, bool) -> tuple
    """Generate docker login environment variables and command line
    for login/re-login. Private registry credentials are used when a
    private registry server is configured, otherwise the docker hub
    login (if any) is used.
    :param dict config: configuration object
    :param bool for_ssh: for direct SSH use; if set and a login is
        required, all variables are substituted into one command string
    :rtype: tuple
    :return: (env vars, login cmds); (None, [single command]) when
        for_ssh is set and a login is required
    """
    preg = settings.docker_registry_private_settings(config)
    encrypt = settings.batch_shipyard_encryption_enabled(config)
    # select the credential source
    if preg.server:
        server = preg.server
        username = preg.user
        password = preg.password
    else:
        server = None
        username, password = settings.docker_registry_login(config, 'hub')
    # populate command and env vars
    env = []
    cmd = []
    if server or username:
        if server:
            env.append(
                batchmodels.EnvironmentSetting('DOCKER_LOGIN_SERVER', server))
        env.append(
            batchmodels.EnvironmentSetting('DOCKER_LOGIN_USERNAME', username))
        env.append(
            batchmodels.EnvironmentSetting(
                'DOCKER_LOGIN_PASSWORD',
                crypto.encrypt_string(encrypt, password, config)))
        if encrypt:
            # decrypt the password on the node prior to logging in
            cmd.append(
                'DOCKER_LOGIN_PASSWORD='
                '`echo $DOCKER_LOGIN_PASSWORD | base64 -d | '
                'openssl rsautl -decrypt -inkey '
                '$AZ_BATCH_NODE_STARTUP_DIR/certs/key.pem`')
        login = (
            'docker login -u $DOCKER_LOGIN_USERNAME '
            '-p $DOCKER_LOGIN_PASSWORD')
        if server:
            login = '{} $DOCKER_LOGIN_SERVER'.format(login)
        cmd.append(login)
    if not for_ssh or len(cmd) == 0:
        return env, cmd
    # transform env and cmd into single command for ssh
    values = {ev.name: ev.value for ev in env}
    pw_key = '$DOCKER_LOGIN_PASSWORD'
    pw = values['DOCKER_LOGIN_PASSWORD']
    if encrypt:
        # drop the leading variable assignment, keeping only the
        # backtick decryption expression, then embed the encrypted value
        prefix_len = len('DOCKER_LOGIN_PASSWORD=')
        pw = cmd[0][prefix_len:].replace(pw_key, pw)
        sshcmd = cmd[1]
    else:
        sshcmd = cmd[0]
    sshcmd = sshcmd.replace(pw_key, pw)
    sshcmd = sshcmd.replace(
        '$DOCKER_LOGIN_USERNAME', values['DOCKER_LOGIN_USERNAME'])
    srv = values.get('DOCKER_LOGIN_SERVER')
    if util.is_not_empty(srv):
        sshcmd = sshcmd.replace('$DOCKER_LOGIN_SERVER', srv)
    if encrypt:
        sshcmd = sshcmd.replace('openssl', 'sudo openssl')
        startup_key = '$AZ_BATCH_NODE_STARTUP_DIR'
        if startup_key in sshcmd:
            start_mnt = '/'.join((
                settings.temp_disk_mountpoint(config), 'batch', 'tasks',
                'startup',
            ))
            sshcmd = sshcmd.replace(startup_key, start_mnt)
    return None, [sshcmd]
def _format_generic_task_id(tasknum):
    # type: (int) -> str
    """Build a generic task id by appending the task number to the
    generic docker task prefix; numbers up to 99999 are zero padded
    to five digits
    :param int tasknum: task number
    :rtype: str
    :return: generic task id
    """
    template = '{}{}' if tasknum > 99999 else '{0}{1:05d}'
    return template.format(_GENERIC_DOCKER_TASK_PREFIX, tasknum)
def _generate_next_generic_task_id(
        batch_client, job_id, tasklist=None, reserved=None, task_map=None,
        last_task_id=None):
    # type: (azure.batch.batch_service_client.BatchServiceClient, str,
    #        list, str, dict, str) -> Tuple[list, str]
    """Compute the next unused generic task id for a job
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param str job_id: job id
    :param list tasklist: list of current (committed) tasks in job; queried
        from the service when none or empty
    :param str reserved: reserved task id
    :param dict task_map: map of pending tasks to add to the job
    :param str last_task_id: last task id
    :rtype: tuple
    :return: (list of committed task ids for job, next generic docker task id)
    """
    # start one past the highest committed generic task number
    try:
        if util.is_none_or_empty(tasklist):
            list_opts = batchmodels.TaskListOptions(
                filter='startswith(id, \'{}\')'.format(
                    _GENERIC_DOCKER_TASK_PREFIX),
                select='id')
            tasklist = batch_client.task.list(
                job_id, task_list_options=list_opts)
            tasklist = list(tasklist)
        committed = [int(t.id.split('-')[-1]) for t in tasklist]
        tasknum = max(committed) + 1 if committed else 0
    except (batchmodels.batch_error.BatchErrorException, IndexError):
        tasknum = 0
    # never hand out the reserved id
    if reserved is not None:
        if tasknum == int(reserved.split('-')[-1]):
            tasknum += 1
    next_id = _format_generic_task_id(tasknum)
    if task_map is None:
        return tasklist, next_id
    # advance past ids already pending in the task map
    while next_id in task_map:
        try:
            # on the first collision, jump ahead to the last generic task
            # id handed out rather than probing one number at a time
            if (last_task_id is not None and
                    last_task_id.startswith(_GENERIC_DOCKER_TASK_PREFIX)):
                tasknum = int(last_task_id.split('-')[-1])
                last_task_id = None
        except Exception:
            last_task_id = None
        tasknum += 1
        next_id = _format_generic_task_id(tasknum)
    return tasklist, next_id
def _add_task_collection(batch_client, job_id, task_map):
    # type: (batch.BatchServiceClient, str, dict) -> None
    """Add a collection of tasks to a job. Tasks are submitted in chunks
    of at most 100. If the service rejects a chunk as too large, the
    chunk is halved and retried. Tasks that fail with a server error are
    resubmitted; tasks that fail with a client error are logged and
    skipped.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param str job_id: job to add to
    :param dict task_map: task collection map to add
    :raises batchmodels.BatchErrorException: on any service error other
        than a too large request body, or if a single task is too large
    """
    all_tasks = list(task_map.values())
    total = len(all_tasks)
    max_chunk_size = 100  # can only submit up to 100 tasks at a time
    chunk_size = max_chunk_size
    start = 0
    # nothing is sent to the service if there are no tasks to add
    while start < total:
        end = min(start + chunk_size, total)
        chunk = all_tasks[start:end]
        logger.debug('submitting {} tasks ({} -> {}) to job {}'.format(
            len(chunk), start, end - 1, job_id))
        try:
            results = batch_client.task.add_collection(job_id, chunk)
        except batchmodels.BatchErrorException as e:
            # only a too large request body is recoverable here; any other
            # error must propagate instead of silently skipping the chunk
            if e.error.code != 'RequestBodyTooLarge' or len(chunk) == 1:
                raise
            # collection contents are too large, reduce and retry
            chunk_size = max(len(chunk) >> 1, 1)
            logger.error(
                ('task collection slice was too big, retrying with '
                 'slice={}').format(chunk_size))
            continue
        # go through result and retry just failed tasks
        while True:
            retry = []
            for result in results.value:
                if result.status == batchmodels.TaskAddStatus.client_error:
                    logger.error(
                        ('skipping retry of adding task {} as it '
                         'returned a client error (code={} message={}) '
                         'for job {}').format(
                             result.task_id, result.error.code,
                             result.error.message, job_id))
                elif (result.status ==
                        batchmodels.TaskAddStatus.server_error):
                    retry.append(task_map[result.task_id])
            if len(retry) == 0:
                break
            logger.debug('retrying adding {} tasks to job {}'.format(
                len(retry), job_id))
            results = batch_client.task.add_collection(job_id, retry)
        # advance past exactly what was submitted and restore chunk size
        start = end
        chunk_size = max_chunk_size
    logger.info('submitted all {} tasks to job {}'.format(
        len(task_map), job_id))
def add_jobs(
        batch_client, blob_client, keyvault_client, config, jpfile, bxfile,
        recreate=False, tail=None):
    # type: (batch.BatchServiceClient, azureblob.BlockBlobService,
    #        azure.keyvault.KeyVaultClient, dict, tuple, tuple, bool,
    #        str) -> None
    """Add jobs and their tasks as specified in the configuration.
    For each job specification this validates the task docker images
    against the pool's global resources, creates the job (with job
    preparation and, if needed, job release tasks), builds every task,
    submits the tasks as a collection and optionally enables job
    auto-completion. If tail is specified, the given file of the last
    task added is streamed afterwards.
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param azure.storage.blob.BlockBlobService blob_client: blob client
    :param azure.keyvault.KeyVaultClient keyvault_client: keyvault client
    :param dict config: configuration dict
    :param tuple jpfile: jobprep file
    :param tuple bxfile: blobxfer file
    :param bool recreate: recreate job if completed
    :param str tail: tail specified file of last job/task added
    :raises RuntimeError: if a task docker image is not pre-loaded and
        running on missing images is not allowed, if the env var file
        upload returns an unexpected number of sas urls, or if a
        duplicate task id is detected
    :raises ValueError: if more than one multi-instance task is specified
        in a job with auto completion enabled
    """
    # get batch shipyard and pool settings, then look up the live pool
    bs = settings.batch_shipyard_settings(config)
    pool = settings.pool_settings(config)
    try:
        cloud_pool = batch_client.pool.get(pool.id)
    except batchmodels.batch_error.BatchErrorException as ex:
        if 'The specified pool does not exist.' in ex.message.value:
            logger.error('{} pool does not exist'.format(pool.id))
            # jobs can still be added to a pool that does not exist yet,
            # but only if the user confirms
            if util.confirm_action(
                    config, 'add jobs to nonexistant pool {}'.format(pool.id)):
                cloud_pool = None
            else:
                logger.error(
                    'not submitting jobs to nonexistant pool {}'.format(
                        pool.id))
                return
        else:
            raise
    preg = settings.docker_registry_private_settings(config)
    global_resources = settings.global_resources_docker_images(config)
    # track the last job and task added for the optional tail at the end
    lastjob = None
    lasttaskid = None
    for jobspec in settings.job_specifications(config):
        job_id = settings.job_id(jobspec)
        # perform checks:
        # 1. check docker images in task against pre-loaded on pool
        # 2. if tasks have dependencies, set it if so
        # 3. if there are multi-instance tasks
        auto_complete = settings.job_auto_complete(jobspec)
        multi_instance = False
        mi_docker_container_name = None
        reserved_task_id = None
        uses_task_dependencies = False
        missing_images = []
        allow_run_on_missing = settings.job_allow_run_on_missing(jobspec)
        existing_tasklist = None
        # check for public pull on missing setting
        if (allow_run_on_missing and
                preg.allow_public_docker_hub_pull_on_missing):
            logger.warning(
                'allow run on missing image and allow public docker hub '
                'pull on missing are both enabled. Note that allow public '
                'pull on missing will not work in this situation.')
        for task in settings.job_tasks(jobspec):
            # check if task docker image is set in config.json
            di = settings.task_docker_image(task)
            if di not in global_resources:
                if allow_run_on_missing:
                    logger.warning(
                        ('docker image {} not pre-loaded on pool for a '
                         'task specified in job {}').format(di, job_id))
                    missing_images.append(di)
                else:
                    raise RuntimeError(
                        ('not submitting job {} with missing docker image {} '
                         'pre-load on pool {}').format(job_id, di, pool.id))
            # do not break, check to ensure ids are set on each task if
            # task dependencies are set
            if settings.has_depends_on_task(task):
                uses_task_dependencies = True
            if settings.is_multi_instance_task(task):
                if multi_instance and auto_complete:
                    raise ValueError(
                        'cannot specify more than one multi-instance task '
                        'per job with auto completion enabled')
                multi_instance = True
                mi_docker_container_name = settings.task_name(task)
                # the container name is needed for the job release task
                # below; derive one from the task id if no name is set,
                # reserving a generic task id if the task has no id either
                if util.is_none_or_empty(mi_docker_container_name):
                    _id = settings.task_id(task)
                    if util.is_none_or_empty(_id):
                        existing_tasklist, reserved_task_id = \
                            _generate_next_generic_task_id(
                                batch_client, job_id,
                                tasklist=existing_tasklist)
                        settings.set_task_id(task, reserved_task_id)
                        _id = '{}-{}'.format(job_id, reserved_task_id)
                    settings.set_task_name(task, _id)
                    mi_docker_container_name = settings.task_name(task)
                    del _id
        # define job constraints (max task retries and/or max wall time)
        # if either is set
        job_constraints = None
        max_task_retries = settings.job_max_task_retries(jobspec)
        max_wall_time = settings.job_max_wall_time(jobspec)
        if max_task_retries is not None or max_wall_time is not None:
            job_constraints = batchmodels.JobConstraints(
                max_task_retry_count=max_task_retries,
                max_wall_clock_time=max_wall_time,
            )
        # construct job prep
        jpcmd = []
        if len(missing_images) > 0 and allow_run_on_missing:
            # we don't want symmetric difference as we just want to
            # block on pre-loaded images only
            gr = list(set(global_resources) - set(missing_images))
        else:
            gr = global_resources
        if len(gr) > 0:
            jpcmd.append('$AZ_BATCH_NODE_STARTUP_DIR/wd/{} {}'.format(
                jpfile[0], ' '.join(gr)))
        del gr
        # job prep: digest any input_data
        addlcmds = data.process_input_data(config, bxfile, jobspec)
        if addlcmds is not None:
            jpcmd.append(addlcmds)
        del addlcmds
        jptask = None
        if len(jpcmd) > 0:
            jptask = batchmodels.JobPreparationTask(
                command_line=util.wrap_commands_in_shell(jpcmd),
                wait_for_success=True,
                user_identity=_RUN_ELEVATED,
                rerun_on_node_reboot_after_success=False,
            )
        del jpcmd
        # construct job release for multi-instance auto-complete
        jrtask = None
        if multi_instance and auto_complete:
            # the release task removes the multi-instance container
            jrtask = batchmodels.JobReleaseTask(
                command_line=util.wrap_commands_in_shell(
                    ['docker kill {}'.format(mi_docker_container_name),
                     'docker rm -v {}'.format(mi_docker_container_name)]),
                user_identity=_RUN_ELEVATED,
            )
            # job prep task must exist
            if jptask is None:
                jptask = batchmodels.JobPreparationTask(
                    command_line='echo',
                    wait_for_success=False,
                    user_identity=_RUN_ELEVATED,
                    rerun_on_node_reboot_after_success=False,
                )
        # create job
        job = batchmodels.JobAddParameter(
            id=settings.job_id(jobspec),
            pool_info=batchmodels.PoolInformation(pool_id=pool.id),
            constraints=job_constraints,
            uses_task_dependencies=uses_task_dependencies,
            job_preparation_task=jptask,
            job_release_task=jrtask,
            metadata=[
                batchmodels.MetadataItem(
                    name=settings.get_metadata_version_name(),
                    value=__version__,
                ),
            ],
            priority=settings.job_priority(jobspec),
        )
        lastjob = job.id
        logger.info('Adding job {} to pool {}'.format(job.id, pool.id))
        try:
            batch_client.job.add(job)
            if settings.verbose(config) and jptask is not None:
                logger.debug('Job prep command: {}'.format(
                    jptask.command_line))
        except batchmodels.batch_error.BatchErrorException as ex:
            if ('The specified job is already in a completed state.' in
                    ex.message.value):
                if recreate:
                    # get job state
                    _job = batch_client.job.get(job.id)
                    if _job.state == batchmodels.JobState.completed:
                        # delete the completed job, wait for the delete
                        # to finish and then add the job again
                        del_jobs(
                            batch_client, config, jobid=job.id, wait=True)
                        time.sleep(1)
                        batch_client.job.add(job)
                else:
                    raise
            elif 'The specified job already exists' in ex.message.value:
                # cannot re-use an existing job if multi-instance due to
                # job release requirement
                if multi_instance and auto_complete:
                    raise
                else:
                    # retrieve job and check for version consistency
                    _job = batch_client.job.get(job.id)
                    _check_metadata_mismatch('job', _job.metadata)
            else:
                raise
        del multi_instance
        del mi_docker_container_name
        del uses_task_dependencies
        # get base env vars from job
        job_env_vars = settings.job_environment_variables(jobspec)
        _job_env_vars_secid = \
            settings.job_environment_variables_keyvault_secret_id(jobspec)
        if util.is_not_empty(_job_env_vars_secid):
            jevs = keyvault.get_secret(
                keyvault_client, _job_env_vars_secid, value_is_json=True)
            job_env_vars = util.merge_dict(job_env_vars, jevs or {})
            del jevs
        del _job_env_vars_secid
        # add all tasks under job
        task_map = {}
        for _task in settings.job_tasks(jobspec):
            # ensure every task has an id and a name before its settings
            # are generated
            _task_id = settings.task_id(_task)
            if util.is_none_or_empty(_task_id):
                existing_tasklist, _task_id = _generate_next_generic_task_id(
                    batch_client, job.id, tasklist=existing_tasklist,
                    reserved=reserved_task_id, task_map=task_map,
                    last_task_id=lasttaskid)
                settings.set_task_id(_task, _task_id)
            if util.is_none_or_empty(settings.task_name(_task)):
                settings.set_task_name(_task, '{}-{}'.format(job.id, _task_id))
            del _task_id
            task = settings.task_settings(
                cloud_pool, config, pool, jobspec, _task, missing_images)
            # retrieve keyvault task env vars
            if util.is_not_empty(
                    task.environment_variables_keyvault_secret_id):
                task_env_vars = keyvault.get_secret(
                    keyvault_client,
                    task.environment_variables_keyvault_secret_id,
                    value_is_json=True)
                task_env_vars = util.merge_dict(
                    task.environment_variables, task_env_vars or {})
            else:
                task_env_vars = task.environment_variables
            # merge job and task env vars
            env_vars = util.merge_dict(job_env_vars, task_env_vars)
            del task_env_vars
            # get and create env var file
            sas_urls = None
            if util.is_not_empty(env_vars) or task.infiniband or task.gpu:
                envfileloc = '{}taskrf-{}/{}{}'.format(
                    bs.storage_entity_prefix, job.id, task.id, task.envfile)
                # the env var file is written to a local temporary file,
                # uploaded, and the local copy is always removed
                f = tempfile.NamedTemporaryFile(mode='wb', delete=False)
                fname = f.name
                try:
                    if util.is_not_empty(env_vars):
                        for key in env_vars:
                            f.write('{}={}\n'.format(
                                key, env_vars[key]).encode('utf8'))
                    if task.infiniband:
                        f.write(b'I_MPI_FABRICS=shm:dapl\n')
                        f.write(b'I_MPI_DAPL_PROVIDER=ofa-v2-ib0\n')
                        f.write(b'I_MPI_DYNAMIC_CONNECTION=0\n')
                        # create a manpath entry for potentially buggy
                        # intel mpivars.sh
                        f.write(b'MANPATH=/usr/share/man:/usr/local/man\n')
                    if task.gpu:
                        f.write(b'CUDA_CACHE_DISABLE=0\n')
                        f.write(b'CUDA_CACHE_MAXSIZE=1073741824\n')
                        # use absolute path due to non-expansion
                        f.write(
                            ('CUDA_CACHE_PATH={}/batch/tasks/'
                             '.nv/ComputeCache\n').format(
                                settings.temp_disk_mountpoint(
                                    config)).encode('utf8'))
                    # close and upload env var file
                    f.close()
                    sas_urls = storage.upload_resource_files(
                        blob_client, config, [(envfileloc, fname)])
                finally:
                    os.unlink(fname)
                    del f
                    del fname
                if len(sas_urls) != 1:
                    raise RuntimeError('unexpected number of sas urls')
            # check if this is a multi-instance task
            mis = None
            if settings.is_multi_instance_task(_task):
                mis = batchmodels.MultiInstanceSettings(
                    number_of_instances=task.multi_instance.num_instances,
                    coordination_command_line=util.wrap_commands_in_shell(
                        task.multi_instance.coordination_command, wait=False),
                    common_resource_files=[],
                )
                # add common resource files for multi-instance
                if util.is_not_empty(task.multi_instance.resource_files):
                    for rf in task.multi_instance.resource_files:
                        mis.common_resource_files.append(
                            batchmodels.ResourceFile(
                                file_path=rf.file_path,
                                blob_source=rf.blob_source,
                                file_mode=rf.file_mode,
                            )
                        )
                # set application command
                task_commands = [
                    '{} {} {}'.format(
                        task.docker_exec_cmd, task.name, task.command)
                ]
            else:
                # append the AZ_BATCH_ environment variables of the task
                # to the env file prior to invoking the docker run command
                task_commands = [
                    'env | grep AZ_BATCH_ >> {}'.format(task.envfile),
                    '{} {} {}{}'.format(
                        task.docker_run_cmd,
                        ' '.join(task.docker_run_options),
                        task.image,
                        '{}'.format(
                            ' ' + task.command) if task.command else '')
                ]
            # get docker login if missing images
            if len(missing_images) > 0 and allow_run_on_missing:
                taskenv, logincmd = generate_docker_login_settings(config)
                # login commands must run before the task commands
                logincmd.extend(task_commands)
                task_commands = logincmd
            else:
                taskenv = None
            # digest any input_data
            addlcmds = data.process_input_data(
                config, bxfile, _task, on_task=True)
            if addlcmds is not None:
                task_commands.insert(0, addlcmds)
            # digest any output data
            addlcmds = data.process_output_data(
                config, bxfile, _task)
            if addlcmds is not None:
                task_commands.append(addlcmds)
            del addlcmds
            # set task constraints
            task_constraints = batchmodels.TaskConstraints(
                retention_time=task.retention_time,
                max_task_retry_count=task.max_task_retries,
                max_wall_clock_time=task.max_wall_time,
            )
            # create task
            batchtask = batchmodels.TaskAddParameter(
                id=task.id,
                command_line=util.wrap_commands_in_shell(task_commands),
                user_identity=_RUN_ELEVATED,
                resource_files=[],
                multi_instance_settings=mis,
                constraints=task_constraints,
                environment_settings=taskenv,
            )
            # add envfile
            if sas_urls is not None:
                batchtask.resource_files.append(
                    batchmodels.ResourceFile(
                        file_path=str(task.envfile),
                        blob_source=next(iter(sas_urls.values())),
                        file_mode='0640',
                    )
                )
                sas_urls = None
            # add additional resource files
            if util.is_not_empty(task.resource_files):
                for rf in task.resource_files:
                    batchtask.resource_files.append(
                        batchmodels.ResourceFile(
                            file_path=rf.file_path,
                            blob_source=rf.blob_source,
                            file_mode=rf.file_mode,
                        )
                    )
            # add task dependencies
            if (util.is_not_empty(task.depends_on) or
                    util.is_not_empty(task.depends_on_range)):
                if util.is_not_empty(task.depends_on_range):
                    task_id_ranges = [batchmodels.TaskIdRange(
                        task.depends_on_range[0], task.depends_on_range[1])]
                else:
                    task_id_ranges = None
                batchtask.depends_on = batchmodels.TaskDependencies(
                    task_ids=task.depends_on,
                    task_id_ranges=task_id_ranges,
                )
            # log the task commands if verbose and record the task in the
            # pending task map; submission happens after all tasks are built
            if settings.verbose(config):
                if mis is not None:
                    logger.debug(
                        'multi-instance task coordination command: {}'.format(
                            mis.coordination_command_line))
                logger.debug('task: {} command: {}'.format(
                    task.id, batchtask.command_line))
            if task.id in task_map:
                raise RuntimeError(
                    'duplicate task id detected: {} for job {}'.format(
                        task.id, job.id))
            task_map[task.id] = batchtask
            lasttaskid = task.id
        # add task collection to job
        _add_task_collection(batch_client, job.id, task_map)
        # patch job if job autocompletion is needed
        if auto_complete:
            batch_client.job.patch(
                job_id=job.id,
                job_patch_parameter=batchmodels.JobPatchParameter(
                    pool_info=batchmodels.PoolInformation(pool_id=pool.id),
                    on_all_tasks_complete=batchmodels.
                    OnAllTasksComplete.terminate_job))
    # tail file if specified
    if tail:
        stream_file_and_wait_for_task(
            batch_client, config, filespec='{},{},{}'.format(
                lastjob, lasttaskid, tail), disk=False)