More improvements for scale out robustness

- Add --all-start-task-failed to delnode
- Reduce per-node output during pool allocation wait when the number of nodes > 10
Fred Park 2017-06-30 23:50:21 -07:00
Parent 06188c1944
Commit 2a48885da1
12 changed files with 270 additions and 87 deletions

.gitignore vendored
View file

@@ -90,6 +90,7 @@ celerybeat-schedule
# project specific ignores
shipyard
shipyard.cmd
ssh_docker_tunnel_shipyard.sh
id_rsa_shipyard*
resources/azurefile-dockervolume-create.sh
@@ -97,4 +98,4 @@ resources/azurefile-dockervolumedriver
resources/azurefile-dockervolumedriver.env
resources/docker-registry-v2.tar.gz
resources/nvidia-docker.deb
resources/nvidia-driver.run
resources/nvidia-driver*.run

View file

@@ -2,6 +2,17 @@
## [Unreleased]
### Added
- `--all-start-task-failed` parameter for `pool delnode`
### Changed
- Improve robustness of docker image pulls within node prep scripts
- Restrict node list queries until pool allocation state emerges from resizing
### Fixed
- Remove nvidia gpu driver property from FFmpeg recipe
- Further improve retry logic for docker image pulls in cascade
## [2.8.0rc2] - 2017-06-30
### Added
- Support Mac OS X and Windows Subsystem for Linux installations via
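
The "Restrict node list queries" entry above maps to a guard added later in this commit in convoy/batch.py. A minimal sketch of the pattern, assuming a batch_client from the azure-batch SDK (the helper name safe_list_nodes is illustrative, not part of the commit):

    import ssl

    import azure.batch.models as batchmodels

    def safe_list_nodes(batch_client, pool):
        # defer node listing while the pool is still resizing; results
        # churn during allocation and are not worth querying yet
        if pool.allocation_state == batchmodels.AllocationState.resizing:
            return []
        try:
            return list(batch_client.compute_node.list(pool.id))
        except ssl.SSLError:
            # paging can intermittently fail at the SSL layer; return an
            # empty list and let the caller's wait loop retry later
            return []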

View file

@@ -395,6 +395,18 @@ class DockerSaveThread(threading.Thread):
_DIRECTDL_DOWNLOADING.remove(self.resource)
_DIRECTDL.remove(self.resource)
def _check_pull_output_overload(self, stdout: str, stderr: str) -> bool:
"""Check output for registry overload errors
:param str stdout: stdout
:param str stderr: stderr
:rtype: bool
:return: if error appears to be overload from registry
"""
if ('toomanyrequests' in stdout or 'toomanyrequests' in stderr or
'connection reset by peer' in stderr):
return True
return False
def _pull(self, image: str) -> tuple:
"""Docker image pull with registry normalization
:param str image: image to pull
@@ -414,7 +426,9 @@ class DockerSaveThread(threading.Thread):
shell=True,
universal_newlines=True)
stdout, stderr = proc.communicate()
if proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE:
if (proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE and
not _pub and
not self._check_pull_output_overload(stdout, stderr)):
logger.warning(
'could not pull from private registry, attempting '
'Docker Public Hub instead')
@@ -445,31 +459,19 @@ class DockerSaveThread(threading.Thread):
_record_perf('pull-start', 'img={}'.format(image))
start = datetime.datetime.now()
logger.info('pulling image {} from {}'.format(image, _REGISTRY))
npa_errors = 0
while True:
rc, stdout, stderr = self._pull(image)
if rc != 0:
fail = True
if 'toomanyrequests' in stdout or 'toomanyrequests' in stderr:
logger.error(
'Too many requests issued to registry server, '
'retrying...')
fail = False
time.sleep(random.randint(5, 30))
elif 'no pull access' in stdout or 'no pull access' in stderr:
npa_errors += 1
if npa_errors < 3:
fail = False
logger.error(
'No pull access to registry server, retrying in '
'case of temporary overload...')
time.sleep(random.randint(1, 10))
if fail:
raise RuntimeError(
'docker pull failed: stdout={} stderr={}'.format(
stdout, stderr))
else:
if rc == 0:
break
elif self._check_pull_output_overload(stdout, stderr):
logger.error(
'Too many requests issued to registry server, '
'retrying...')
time.sleep(random.randint(5, 30))
else:
raise RuntimeError(
'docker pull failed: stdout={} stderr={}'.format(
stdout, stderr))
diff = (datetime.datetime.now() - start).total_seconds()
logger.debug('took {} sec to pull docker image {} from {}'.format(
diff, image, _REGISTRY))
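
Taken together, the new method and loop above implement detect-overload-then-backoff. A standalone sketch of the same pattern, assuming only the Docker CLI on PATH (function names here are illustrative, not the module's):

    import random
    import subprocess
    import time

    _OVERLOAD_MARKERS = ('toomanyrequests', 'connection reset by peer')

    def _pull_once(image):
        # run `docker pull` and capture output for inspection
        proc = subprocess.Popen(
            'docker pull {}'.format(image),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        stdout, stderr = proc.communicate()
        return proc.returncode, stdout, stderr

    def pull_with_retry(image):
        while True:
            rc, stdout, stderr = _pull_once(image)
            if rc == 0:
                return
            if any(m in stdout or m in stderr for m in _OVERLOAD_MARKERS):
                # registry overload: back off with jitter, then retry
                time.sleep(random.randint(5, 30))
            else:
                raise RuntimeError(
                    'docker pull failed: stdout={} stderr={}'.format(
                        stdout, stderr))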

View file

@@ -30,6 +30,7 @@ from builtins import ( # noqa
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
next, oct, open, pow, round, super, filter, map, zip)
# stdlib imports
import collections
import datetime
import fnmatch
import getpass
@@ -39,6 +40,7 @@ try:
except ImportError:
import pathlib
import os
import ssl
import tempfile
import time
# non-stdlib imports
@@ -65,6 +67,23 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
elevation_level=batchmodels.ElevationLevel.admin,
)
)
NodeStateCountCollection = collections.namedtuple(
'NodeStateCountCollection', [
'creating',
'idle',
'leaving_pool',
'offline',
'preempted',
'rebooting',
'reimaging',
'running',
'start_task_failed',
'starting',
'unknown',
'unusable',
'waiting_for_start_task',
]
)
def get_batch_account(batch_mgmt_client, config):
@@ -262,6 +281,7 @@ def _block_for_nodes_ready(
pool.target_dedicated_nodes == 0)):
fatal_resize_error = True
if fatal_resize_error:
list_nodes(batch_client, config)
raise RuntimeError(
'Fatal resize errors encountered for pool {}: {}'.format(
pool.id, os.linesep.join(errors)))
@@ -269,7 +289,17 @@ def _block_for_nodes_ready(
logger.error(
'Resize errors encountered for pool {}: {}'.format(
pool.id, os.linesep.join(errors)))
nodes = list(batch_client.compute_node.list(pool.id))
# check pool allocation state
if pool.allocation_state == batchmodels.AllocationState.resizing:
nodes = []
else:
try:
nodes = list(batch_client.compute_node.list(pool.id))
except ssl.SSLError:
# SSL error happens sometimes on paging... this is probably
# a bug in the underlying msrest/msrestazure library that
# is reusing the SSL connection improperly
nodes = []
# check if any nodes are in start task failed state
if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
for node in nodes)):
@@ -301,7 +331,10 @@ def _block_for_nodes_ready(
_reboot_node(batch_client, pool.id, node.id, True)
reboot_map[node.id] += 1
# refresh node list to reflect rebooting states
nodes = list(batch_client.compute_node.list(pool.id))
try:
nodes = list(batch_client.compute_node.list(pool.id))
except ssl.SSLError:
nodes = []
else:
# fast path check for start task failures in non-reboot mode
logger.error(
@@ -321,7 +354,6 @@ def _block_for_nodes_ready(
pool.target_low_priority_nodes) and
all(node.state in stopping_states for node in nodes)):
if any(node.state not in end_states for node in nodes):
# list nodes of pool
list_nodes(batch_client, config)
raise RuntimeError(
('Node(s) of pool {} not in {} state. Please inspect the '
@@ -336,14 +368,48 @@ def _block_for_nodes_ready(
i = 0
logger.debug(
('waiting for {} dedicated nodes and {} low priority nodes '
'to reach desired state').format(
'to reach desired state in pool {} with '
'allocation_state={}').format(
pool.target_dedicated_nodes,
pool.target_low_priority_nodes))
for node in nodes:
logger.debug('{}: {}'.format(node.id, node.state))
pool.target_low_priority_nodes,
pool.id,
pool.allocation_state))
if len(nodes) < 10:
for node in nodes:
logger.debug('{}: {}'.format(node.id, node.state))
else:
logger.debug(_node_state_counts(nodes))
time.sleep(10)
def _node_state_counts(nodes):
# type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
"""Collate counts of various nodes
:param list nodes: list of nodes
:rtype: NodeStateCountCollection
:return: node state count collection
"""
node_states = [node.state for node in nodes]
return NodeStateCountCollection(
creating=node_states.count(batchmodels.ComputeNodeState.creating),
idle=node_states.count(batchmodels.ComputeNodeState.idle),
leaving_pool=node_states.count(
batchmodels.ComputeNodeState.leaving_pool),
offline=node_states.count(batchmodels.ComputeNodeState.offline),
preempted=node_states.count(batchmodels.ComputeNodeState.preempted),
rebooting=node_states.count(batchmodels.ComputeNodeState.rebooting),
reimaging=node_states.count(batchmodels.ComputeNodeState.reimaging),
running=node_states.count(batchmodels.ComputeNodeState.running),
start_task_failed=node_states.count(
batchmodels.ComputeNodeState.start_task_failed),
starting=node_states.count(batchmodels.ComputeNodeState.starting),
unknown=node_states.count(batchmodels.ComputeNodeState.unknown),
unusable=node_states.count(batchmodels.ComputeNodeState.unusable),
waiting_for_start_task=node_states.count(
batchmodels.ComputeNodeState.waiting_for_start_task),
)
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
# type: (batch.BatchServiceClient, dict, str,
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
@@ -751,25 +817,45 @@ def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
_reboot_node(batch_client, pool_id, node_id, False)
def del_node(batch_client, config, node_id):
# type: (batch.BatchServiceClient, dict, str) -> None
def del_node(batch_client, config, all_start_task_failed, node_id):
# type: (batch.BatchServiceClient, dict, bool, str) -> None
"""Delete a node in a pool
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict
:param bool all_start_task_failed: delete all start task failed nodes
:param str node_id: node id to delete
"""
if util.is_none_or_empty(node_id):
raise ValueError('node id is invalid')
node_ids = []
pool_id = settings.pool_id(config)
if not util.confirm_action(
config, 'delete node {} from {} pool'.format(node_id, pool_id)):
if all_start_task_failed:
nodes = list(
batch_client.compute_node.list(
pool_id=pool_id,
compute_node_list_options=batchmodels.ComputeNodeListOptions(
filter='state eq \'starttaskfailed\'',
),
))
for node in nodes:
if util.confirm_action(
config, 'delete node {} from {} pool'.format(
node.id, pool_id)):
node_ids.append(node.id)
else:
if util.is_none_or_empty(node_id):
raise ValueError('node id is invalid')
if util.confirm_action(
config, 'delete node {} from {} pool'.format(
node_id, pool_id)):
node_ids.append(node_id)
if util.is_none_or_empty(node_ids):
logger.warning('no nodes to delete from pool: {}'.format(pool_id))
return
logger.info('Deleting node {} from pool {}'.format(node_id, pool_id))
logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
batch_client.pool.remove_nodes(
pool_id=pool_id,
node_remove_parameter=batchmodels.NodeRemoveParameter(
node_list=[node_id],
node_list=node_ids,
)
)
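
The other change in this file condenses the wait-loop logging: below ten nodes each node still gets its own line, otherwise a single state-count line is emitted. A toy sketch of that decision, using collections.Counter in place of the namedtuple above (the Node class and summarize_node_states are illustrative):

    import collections

    class Node(object):
        # toy stand-in for batchmodels.ComputeNode
        def __init__(self, node_id, state):
            self.id = node_id
            self.state = state

    def summarize_node_states(nodes):
        # tally nodes by state into one compact log line
        counts = collections.Counter(node.state for node in nodes)
        return ' '.join(
            '{}={}'.format(state, n) for state, n in sorted(counts.items()))

    nodes = [Node('node{}'.format(i), 'idle') for i in range(12)]
    nodes.append(Node('nodex', 'starting'))
    if len(nodes) < 10:
        for node in nodes:
            print('{}: {}'.format(node.id, node.state))
    else:
        print(summarize_node_states(nodes))  # idle=12 starting=1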

View file

@@ -2280,15 +2280,16 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
util.subprocess_with_output(ssh_cmd)
def action_pool_delnode(batch_client, config, nodeid):
# type: (batchsc.BatchServiceClient, dict, str) -> None
def action_pool_delnode(batch_client, config, all_start_task_failed, nodeid):
# type: (batchsc.BatchServiceClient, dict, bool, str) -> None
"""Action: Pool Delnode
:param azure.batch.batch_service_client.BatchServiceClient batch_client:
batch client
:param dict config: configuration dict
:param bool all_start_task_failed: delete all start task failed nodes
:param str nodeid: nodeid to delete
"""
batch.del_node(batch_client, config, nodeid)
batch.del_node(batch_client, config, all_start_task_failed, nodeid)
def action_pool_rebootnode(

View file

@@ -129,8 +129,8 @@ It is recommended to follow the steps outlined on
[this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
to install Batch Shipyard on a Python3 installation rather than the default
Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
the system defaulted Python 2.7, the installation will work that environment
as well.
the system default Python 2.7, the installation will work with that
environment as well.
The `install.sh` script supports isolated installation through a virtual
environment so that other system-wide or user python dependencies are left

View file

@@ -456,6 +456,9 @@ Azure Storage.
pool configuration file
* `--wait` will wait for deletion to complete
* `delnode` will delete the specified node from the pool
* `--all-start-task-failed` will delete all nodes in the start task
failed state
* `--nodeid` is the node id to delete
* `dsu` will delete the SSH user defined in the pool configuration file
from all nodes in the specified pool
* `grls` will retrieve all of the remote login settings for every node

View file

@@ -20,9 +20,6 @@ once they are available for N-series VMs.
are available for N-series VMs.
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs.
* `gpu` property should be specified with the following members:
* `nvidia_driver` property contains the following members:
* `source` is a URL for the driver installer .run file
### Global Configuration
The global configuration should set the following properties:

View file

@@ -15,11 +15,6 @@
"ssh": {
"username": "docker"
},
"gpu": {
"nvidia_driver": {
"source": "<URL for nvidia driver for STANDARD_NV VMs>"
}
},
"reboot_on_start_task_failed": false,
"block_until_all_global_resources_loaded": true
}

View file

@@ -119,6 +119,16 @@ if [ -z $version ]; then
exit 1
fi
contains() {
string="$1"
substring="$2"
if test "${string#*$substring}" != "$string"; then
return 0
else
return 1
fi
}
check_for_buggy_ntfs_mount() {
# Check to ensure sdb1 mount is not mounted as ntfs
set +e
@@ -187,7 +197,7 @@ refresh_package_index() {
fi
let retries=retries-1
if [ $retries -eq 0 ]; then
echo "Could not update package index"
echo "ERROR: Could not update package index"
exit 1
fi
sleep 1
@@ -213,7 +223,7 @@ install_packages() {
fi
let retries=retries-1
if [ $retries -eq 0 ]; then
echo "Could not install packages: $*"
echo "ERROR: Could not install packages: $*"
exit 1
fi
sleep 1
@@ -221,6 +231,35 @@ install_packages() {
set -e
}
docker_pull_image() {
image=$1
set +e
retries=60
while [ $retries -gt 0 ]; do
pull_out=$(docker pull $image 2>&1)
rc=$?
if [ $rc -eq 0 ]; then
echo "$pull_out"
break
fi
# non-zero exit code: check if pull output has toomanyrequests or
# connection resets
if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
echo -e "WARNING: will retry:\n$pull_out"
else
echo -e "ERROR:\n$pull_out"
exit $rc
fi
let retries=retries-1
if [ $retries -le 0 ]; then
echo "ERROR: Could not pull docker image: $image"
exit $rc
fi
sleep $[($RANDOM % 5) + 1]s
done
set -e
}
# check sdb1 mount
check_for_buggy_ntfs_mount
@@ -269,10 +308,10 @@ fi
# check if we're coming up from a reboot
if [ -f $cascadefailed ]; then
echo "$cascadefailed file exists, assuming cascade failure during node prep"
echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
exit 1
elif [ -f $nodeprepfinished ]; then
echo "$nodeprepfinished file exists, assuming successful completion of node prep"
echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
exit 0
fi
@@ -343,11 +382,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
repo=https://download.docker.com/linux/debian
dockerversion=${dockerversion}debian
else
echo "unsupported sku: $sku for offer: $offer"
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
echo "gpu unsupported on this sku: $sku for offer $offer"
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
# reload network settings
@@ -375,7 +414,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
fi
let retries=retries-1
if [ $retries -eq 0 ]; then
echo "Could not add key for docker repo"
echo "ERROR: Could not add key for docker repo"
exit 1
fi
sleep 1
@@ -470,7 +509,7 @@ EOF
set +e
while :
do
echo "Attempting to create nvidia-docker volume with version $nvdriverver"
echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
if [ $? -eq 0 ]; then
break
@@ -479,7 +518,7 @@ EOF
NV_DIFF=$((($NV_NOW-$NV_START)/60))
# fail after 5 minutes of attempts
if [ $NV_DIFF -ge 5 ]; then
echo "could not create nvidia-docker volume"
echo "ERROR: could not create nvidia-docker volume"
exit 1
fi
sleep 1
@@ -507,7 +546,7 @@ EOF
elif [ $server_type == "glusterfs" ]; then
install_packages $offer glusterfs-client acl
else
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
@@ -525,12 +564,12 @@ EOF
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
# ensure container only support
if [ $cascadecontainer -eq 0 ]; then
echo "only supported through shipyard container"
echo "ERROR: only supported through shipyard container"
exit 1
fi
# gpu is not supported on these offers
if [ ! -z $gpu ]; then
echo "gpu unsupported on this sku: $sku for offer $offer"
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
if [[ $sku == 7.* ]]; then
@ -542,7 +581,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
gfsenable="systemctl enable glusterd"
rpcbindenable="systemctl enable rpcbind"
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4
echo "oracle linux is not supported at this time"
echo "ERROR: oracle linux is not supported at this time"
exit 1
else
srvenable="chkconfig docker on"
@@ -552,7 +591,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
rpcbindenable="chkconfig rpcbind on"
fi
else
echo "unsupported sku: $sku for offer: $offer"
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
# reload network settings
@@ -605,7 +644,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
else
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
@@ -613,12 +652,12 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
# ensure container only support
if [ $cascadecontainer -eq 0 ]; then
echo "only supported through shipyard container"
echo "ERROR: only supported through shipyard container"
exit 1
fi
# gpu is not supported on these offers
if [ ! -z $gpu ]; then
echo "gpu unsupported on this sku: $sku for offer $offer"
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
# reload network settings
@ -648,7 +687,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
SUSEConnect -p sle-module-containers/12/x86_64 -r ''
fi
if [ -z $repodir ]; then
echo "unsupported sku: $sku for offer: $offer"
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
# update index
@ -694,7 +733,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
zypper -n --gpg-auto-import-keys ref
install_packages $offer glusterfs acl
else
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
@@ -702,7 +741,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
# if hpc sku, set up intel mpi
if [[ $offer == sles-hpc* ]]; then
if [ $sku != "12-sp1" ]; then
echo "unsupported sku for intel mpi setup on SLES"
echo "ERROR: unsupported sku for intel mpi setup on SLES"
exit 1
fi
install_packages $offer lsb
@@ -712,13 +751,13 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
fi
fi
else
echo "unsupported offer: $offer (sku: $sku)"
echo "ERROR: unsupported offer: $offer (sku: $sku)"
exit 1
fi
# retrieve docker images related to data movement
docker pull alfpark/blobxfer:$blobxferversion
docker pull alfpark/batch-shipyard:tfm-$version
docker_pull_image alfpark/blobxfer:$blobxferversion
docker_pull_image alfpark/batch-shipyard:tfm-$version
# login to registry server
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@@ -734,15 +773,15 @@ if [ ! -z $sc_args ]; then
for sc_arg in ${sc_args[@]}; do
IFS=':' read -ra sc <<< "$sc_arg"
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
echo "Creating host directory for storage cluster $sc_arg at $mountpoint"
echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
mkdir -p $mountpoint
chmod 777 $mountpoint
echo "Adding $mountpoint to fstab"
echo "INFO: Adding $mountpoint to fstab"
# eval fstab var to expand vars (this is ok since it is set by shipyard)
fstab_entry="${fstabs[$i]}"
echo $fstab_entry >> /etc/fstab
tail -n1 /etc/fstab
echo "Mounting $mountpoint"
echo "INFO: Mounting $mountpoint"
START=$(date -u +"%s")
set +e
while :
@@ -755,14 +794,14 @@ if [ ! -z $sc_args ]; then
DIFF=$((($NOW-$START)/60))
# fail after 5 minutes of attempts
if [ $DIFF -ge 5 ]; then
echo "Could not mount storage cluster $sc_arg on: $mountpoint"
echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
exit 1
fi
sleep 1
fi
done
set -e
echo "$mountpoint mounted."
echo "INFO: $mountpoint mounted."
i=$(($i + 1))
done
fi
@@ -805,6 +844,8 @@ p2p=$p2p
`env | grep DOCKER_LOGIN_`
EOF
chmod 600 $envfile
# pull image
docker_pull_image alfpark/batch-shipyard:cascade-$version
# launch container
docker run $detached --net=host --env-file $envfile \
-v /var/run/docker.sock:/var/run/docker.sock \
@@ -847,7 +888,7 @@ if [ $p2penabled -eq 0 ]; then
wait $cascadepid
rc=$?
if [ $rc -ne 0 ]; then
echo "cascade exited with non-zero exit code: $rc"
echo "ERROR: cascade exited with non-zero exit code: $rc"
rm -f $nodeprepfinished
exit $rc
fi
@@ -859,7 +900,7 @@ rm -f $cascadefailed
# block until images ready if specified
if [ ! -z $block ]; then
echo "blocking until images ready: $block"
echo "INFO: blocking until images ready: $block"
IFS=',' read -ra RES <<< "$block"
declare -a missing
while :
@@ -870,7 +911,7 @@ if [ ! -z $block ]; then
fi
done
if [ ${#missing[@]} -eq 0 ]; then
echo "all docker images present"
echo "INFO: all docker images present"
break
else
unset missing

View file

@@ -81,6 +81,16 @@ done
shift $((OPTIND-1))
[ "$1" = "--" ] && shift
contains() {
string="$1"
substring="$2"
if test "${string#*$substring}" != "$string"; then
return 0
else
return 1
fi
}
check_for_buggy_ntfs_mount() {
# Check to ensure sdb1 mount is not mounted as ntfs
set +e
@@ -249,6 +259,35 @@ install_azurefile_docker_volume_driver() {
./azurefile-dockervolume-create.sh
}
docker_pull_image() {
image=$1
set +e
retries=60
while [ $retries -gt 0 ]; do
pull_out=$(docker pull $image 2>&1)
rc=$?
if [ $rc -eq 0 ]; then
echo "$pull_out"
break
fi
# non-zero exit code: check if pull output has toomanyrequests or
# connection resets
if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
echo -e "WARNING: will retry:\n$pull_out"
else
echo -e "ERROR:\n$pull_out"
exit $rc
fi
let retries=retries-1
if [ $retries -le 0 ]; then
echo "ERROR: Could not pull docker image: $image"
exit $rc
fi
sleep $[($RANDOM % 5) + 1]s
done
set -e
}
# try to get /etc/lsb-release
if [ -e /etc/lsb-release ]; then
. /etc/lsb-release
@@ -397,8 +436,8 @@ if [ ! -z $sc_args ]; then
fi
# retrieve docker images related to data movement
docker pull alfpark/blobxfer:$blobxferversion
docker pull alfpark/batch-shipyard:tfm-$version
docker_pull_image alfpark/blobxfer:$blobxferversion
docker_pull_image alfpark/batch-shipyard:tfm-$version
# login to registry server
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@@ -442,6 +481,8 @@ p2p=$p2p
`env | grep DOCKER_LOGIN_`
EOF
chmod 600 $envfile
# pull image
docker_pull_image alfpark/batch-shipyard:cascade-$version
# launch container
docker run $detached --net=host --env-file $envfile \
-v /var/run/docker.sock:/var/run/docker.sock \

View file

@@ -1181,6 +1181,10 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
@pool.command('delnode')
@click.option(
'--all-start-task-failed',
is_flag=True,
help='Delete all nodes with start task failed state')
@click.option(
'--nodeid', help='NodeId of compute node in pool to delete')
@common_options
@@ -1188,10 +1192,11 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
@keyvault_options
@aad_options
@pass_cli_context
def pool_delnode(ctx, nodeid):
def pool_delnode(ctx, all_start_task_failed, nodeid):
"""Delete a node from a pool"""
ctx.initialize_for_batch()
convoy.fleet.action_pool_delnode(ctx.batch_client, ctx.config, nodeid)
convoy.fleet.action_pool_delnode(
ctx.batch_client, ctx.config, all_start_task_failed, nodeid)
@pool.command('rebootnode')
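
For reference, a self-contained toy command showing the click is_flag pattern the new option uses; this mirrors the shape of pool delnode but is not the project's actual wiring:

    import click

    @click.command('delnode')
    @click.option(
        '--all-start-task-failed', is_flag=True,
        help='Delete all nodes with start task failed state')
    @click.option('--nodeid', help='NodeId of compute node in pool to delete')
    def delnode(all_start_task_failed, nodeid):
        # the flag takes precedence; when set, nodeid is ignored
        if all_start_task_failed:
            click.echo('deleting all start task failed nodes')
        else:
            click.echo('deleting node {}'.format(nodeid))

    if __name__ == '__main__':
        delnode()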