More improvements for scale out robustness
- Add --all-start-task-failed to delnode - Reduce node output on pool allocation wait with number of nodes > 10
This commit is contained in:
Родитель
06188c1944
Коммит
2a48885da1
|
@ -90,6 +90,7 @@ celerybeat-schedule
|
|||
|
||||
# project specific ignores
|
||||
shipyard
|
||||
shipyard.cmd
|
||||
ssh_docker_tunnel_shipyard.sh
|
||||
id_rsa_shipyard*
|
||||
resources/azurefile-dockervolume-create.sh
|
||||
|
@ -97,4 +98,4 @@ resources/azurefile-dockervolumedriver
|
|||
resources/azurefile-dockervolumedriver.env
|
||||
resources/docker-registry-v2.tar.gz
|
||||
resources/nvidia-docker.deb
|
||||
resources/nvidia-driver.run
|
||||
resources/nvidia-driver*.run
|
||||
|
|
11
CHANGELOG.md
11
CHANGELOG.md
|
@ -2,6 +2,17 @@
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- `--all-start-task-failed` parameter for `pool delnode`
|
||||
|
||||
### Changed
|
||||
- Improve robustness of docker image pulls within node prep scripts
|
||||
- Restrict node list queries until pool allocation state emerges from resizing
|
||||
|
||||
### Fixed
|
||||
- Remove nvidia gpu driver property from FFmpeg recipe
|
||||
- Further improve retry logic for docker image pulls in cascade
|
||||
|
||||
## [2.8.0rc2] - 2017-06-30
|
||||
### Added
|
||||
- Support Mac OS X and Windows Subsystem for Linux installations via
|
||||
|
|
|
@ -395,6 +395,18 @@ class DockerSaveThread(threading.Thread):
|
|||
_DIRECTDL_DOWNLOADING.remove(self.resource)
|
||||
_DIRECTDL.remove(self.resource)
|
||||
|
||||
def _check_pull_output_overload(self, stdout: str, stderr: str) -> bool:
|
||||
"""Check output for registry overload errors
|
||||
:param str stdout: stdout
|
||||
:param str stderr: stderr
|
||||
:rtype: bool
|
||||
:return: if error appears to be overload from registry
|
||||
"""
|
||||
if ('toomanyrequests' in stdout or 'toomanyrequests' in stderr or
|
||||
'connection reset by peer' in stderr):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _pull(self, image: str) -> tuple:
|
||||
"""Docker image pull with registry normalization
|
||||
:param str image: image to pull
|
||||
|
@ -414,7 +426,9 @@ class DockerSaveThread(threading.Thread):
|
|||
shell=True,
|
||||
universal_newlines=True)
|
||||
stdout, stderr = proc.communicate()
|
||||
if proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE:
|
||||
if (proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE and
|
||||
not _pub and
|
||||
not self._check_pull_output_overload(stdout, stderr)):
|
||||
logger.warning(
|
||||
'could not pull from private registry, attempting '
|
||||
'Docker Public Hub instead')
|
||||
|
@ -445,31 +459,19 @@ class DockerSaveThread(threading.Thread):
|
|||
_record_perf('pull-start', 'img={}'.format(image))
|
||||
start = datetime.datetime.now()
|
||||
logger.info('pulling image {} from {}'.format(image, _REGISTRY))
|
||||
npa_errors = 0
|
||||
while True:
|
||||
rc, stdout, stderr = self._pull(image)
|
||||
if rc != 0:
|
||||
fail = True
|
||||
if 'toomanyrequests' in stdout or 'toomanyrequests' in stderr:
|
||||
logger.error(
|
||||
'Too many requests issued to registry server, '
|
||||
'retrying...')
|
||||
fail = False
|
||||
time.sleep(random.randint(5, 30))
|
||||
elif 'no pull access' in stdout or 'no pull access' in stderr:
|
||||
npa_errors += 1
|
||||
if npa_errors < 3:
|
||||
fail = False
|
||||
logger.error(
|
||||
'No pull access to registry server, retrying in '
|
||||
'case of temporary overload...')
|
||||
time.sleep(random.randint(1, 10))
|
||||
if fail:
|
||||
raise RuntimeError(
|
||||
'docker pull failed: stdout={} stderr={}'.format(
|
||||
stdout, stderr))
|
||||
else:
|
||||
if rc == 0:
|
||||
break
|
||||
elif self._check_pull_output_overload(stdout, stderr):
|
||||
logger.error(
|
||||
'Too many requests issued to registry server, '
|
||||
'retrying...')
|
||||
time.sleep(random.randint(5, 30))
|
||||
else:
|
||||
raise RuntimeError(
|
||||
'docker pull failed: stdout={} stderr={}'.format(
|
||||
stdout, stderr))
|
||||
diff = (datetime.datetime.now() - start).total_seconds()
|
||||
logger.debug('took {} sec to pull docker image {} from {}'.format(
|
||||
diff, image, _REGISTRY))
|
||||
|
|
116
convoy/batch.py
116
convoy/batch.py
|
@ -30,6 +30,7 @@ from builtins import ( # noqa
|
|||
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
|
||||
next, oct, open, pow, round, super, filter, map, zip)
|
||||
# stdlib imports
|
||||
import collections
|
||||
import datetime
|
||||
import fnmatch
|
||||
import getpass
|
||||
|
@ -39,6 +40,7 @@ try:
|
|||
except ImportError:
|
||||
import pathlib
|
||||
import os
|
||||
import ssl
|
||||
import tempfile
|
||||
import time
|
||||
# non-stdlib imports
|
||||
|
@ -65,6 +67,23 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
|
|||
elevation_level=batchmodels.ElevationLevel.admin,
|
||||
)
|
||||
)
|
||||
# Tally of compute nodes by state: one field per node state name, in the
# order given below.
NodeStateCountCollection = collections.namedtuple(
    'NodeStateCountCollection',
    (
        'creating idle leaving_pool offline preempted rebooting reimaging '
        'running start_task_failed starting unknown unusable '
        'waiting_for_start_task'
    ),
)
|
||||
|
||||
|
||||
def get_batch_account(batch_mgmt_client, config):
|
||||
|
@ -262,6 +281,7 @@ def _block_for_nodes_ready(
|
|||
pool.target_dedicated_nodes == 0)):
|
||||
fatal_resize_error = True
|
||||
if fatal_resize_error:
|
||||
list_nodes(batch_client, config)
|
||||
raise RuntimeError(
|
||||
'Fatal resize errors encountered for pool {}: {}'.format(
|
||||
pool.id, os.linesep.join(errors)))
|
||||
|
@ -269,7 +289,17 @@ def _block_for_nodes_ready(
|
|||
logger.error(
|
||||
'Resize errors encountered for pool {}: {}'.format(
|
||||
pool.id, os.linesep.join(errors)))
|
||||
nodes = list(batch_client.compute_node.list(pool.id))
|
||||
# check pool allocation state
|
||||
if pool.allocation_state == batchmodels.AllocationState.resizing:
|
||||
nodes = []
|
||||
else:
|
||||
try:
|
||||
nodes = list(batch_client.compute_node.list(pool.id))
|
||||
except ssl.SSLError:
|
||||
# SSL error happens sometimes on paging... this is probably
|
||||
# a bug in the underlying msrest/msrestazure library that
|
||||
# is reusing the SSL connection improperly
|
||||
nodes = []
|
||||
# check if any nodes are in start task failed state
|
||||
if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
|
||||
for node in nodes)):
|
||||
|
@ -301,7 +331,10 @@ def _block_for_nodes_ready(
|
|||
_reboot_node(batch_client, pool.id, node.id, True)
|
||||
reboot_map[node.id] += 1
|
||||
# refresh node list to reflect rebooting states
|
||||
nodes = list(batch_client.compute_node.list(pool.id))
|
||||
try:
|
||||
nodes = list(batch_client.compute_node.list(pool.id))
|
||||
except ssl.SSLError:
|
||||
nodes = []
|
||||
else:
|
||||
# fast path check for start task failures in non-reboot mode
|
||||
logger.error(
|
||||
|
@ -321,7 +354,6 @@ def _block_for_nodes_ready(
|
|||
pool.target_low_priority_nodes) and
|
||||
all(node.state in stopping_states for node in nodes)):
|
||||
if any(node.state not in end_states for node in nodes):
|
||||
# list nodes of pool
|
||||
list_nodes(batch_client, config)
|
||||
raise RuntimeError(
|
||||
('Node(s) of pool {} not in {} state. Please inspect the '
|
||||
|
@ -336,14 +368,48 @@ def _block_for_nodes_ready(
|
|||
i = 0
|
||||
logger.debug(
|
||||
('waiting for {} dedicated nodes and {} low priority nodes '
|
||||
'to reach desired state').format(
|
||||
'to reach desired state in pool {} with '
|
||||
'allocation_state={}').format(
|
||||
pool.target_dedicated_nodes,
|
||||
pool.target_low_priority_nodes))
|
||||
for node in nodes:
|
||||
logger.debug('{}: {}'.format(node.id, node.state))
|
||||
pool.target_low_priority_nodes,
|
||||
pool.id,
|
||||
pool.allocation_state))
|
||||
if len(nodes) < 10:
|
||||
for node in nodes:
|
||||
logger.debug('{}: {}'.format(node.id, node.state))
|
||||
else:
|
||||
logger.debug(_node_state_counts(nodes))
|
||||
time.sleep(10)
|
||||
|
||||
|
||||
def _node_state_counts(nodes):
    # type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
    """Tally how many of the given nodes are in each compute node state
    :param list nodes: list of nodes
    :rtype: NodeStateCountCollection
    :return: node state count collection
    """
    observed = [node.state for node in nodes]
    # each field of the collection is named after the ComputeNodeState
    # member it counts, so the member can be looked up by field name
    tally = {}
    for field in NodeStateCountCollection._fields:
        tally[field] = observed.count(
            getattr(batchmodels.ComputeNodeState, field))
    return NodeStateCountCollection(**tally)
|
||||
|
||||
|
||||
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
|
||||
# type: (batch.BatchServiceClient, dict, str,
|
||||
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
|
||||
|
@ -751,25 +817,45 @@ def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
|
|||
_reboot_node(batch_client, pool_id, node_id, False)
|
||||
|
||||
|
||||
def del_node(batch_client, config, all_start_task_failed, node_id):
    # type: (batch.BatchServiceClient, dict, bool, str) -> None
    """Delete a node, or all start task failed nodes, in a pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes;
        when set, node_id is not consulted
    :param str node_id: node id to delete
    :raises ValueError: if all_start_task_failed is not set and node_id is
        none or empty
    """
    node_ids = []
    pool_id = settings.pool_id(config)
    if all_start_task_failed:
        # let the service filter so that only start task failed nodes
        # are returned
        nodes = list(
            batch_client.compute_node.list(
                pool_id=pool_id,
                compute_node_list_options=batchmodels.ComputeNodeListOptions(
                    filter='state eq \'starttaskfailed\'',
                ),
            ))
        # each candidate node is confirmed individually
        for node in nodes:
            if util.confirm_action(
                    config, 'delete node {} from {} pool'.format(
                        node.id, pool_id)):
                node_ids.append(node.id)
    else:
        if util.is_none_or_empty(node_id):
            raise ValueError('node id is invalid')
        if util.confirm_action(
                config, 'delete node {} from {} pool'.format(
                    node_id, pool_id)):
            node_ids.append(node_id)
    # nothing matched or nothing was confirmed
    if util.is_none_or_empty(node_ids):
        logger.warning('no nodes to delete from pool: {}'.format(pool_id))
        return
    logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
    # all selected nodes are removed with a single request
    batch_client.pool.remove_nodes(
        pool_id=pool_id,
        node_remove_parameter=batchmodels.NodeRemoveParameter(
            node_list=node_ids,
        )
    )
|
||||
|
||||
|
|
|
@ -2280,15 +2280,16 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
|
|||
util.subprocess_with_output(ssh_cmd)
|
||||
|
||||
|
||||
def action_pool_delnode(batch_client, config, all_start_task_failed, nodeid):
    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Pool Delnode
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes
    :param str nodeid: nodeid to delete
    """
    # thin pass-through to the batch module
    batch.del_node(batch_client, config, all_start_task_failed, nodeid)
|
||||
|
||||
|
||||
def action_pool_rebootnode(
|
||||
|
|
|
@ -129,8 +129,8 @@ It is recommended to follow the steps outlined on
|
|||
[this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
|
||||
to install Batch Shipyard on a Python3 installation rather than the default
|
||||
Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
|
||||
the system defaulted Python 2.7, the installation will work that environment
|
||||
as well.
|
||||
the system defaulted Python 2.7, the installation will work with that
|
||||
environment as well.
|
||||
|
||||
The `install.sh` script supports isolated installation through a virtual
|
||||
environment so that other system-wide or user python dependencies are left
|
||||
|
|
|
@ -456,6 +456,9 @@ Azure Storage.
|
|||
pool configuration file
|
||||
* `--wait` will wait for deletion to complete
|
||||
* `delnode` will delete the specified node from the pool
|
||||
* `--all-start-task-failed` will delete all nodes in the start task
|
||||
failed state
|
||||
* `--nodeid` is the node id to delete
|
||||
* `dsu` will delete the SSH user defined in the pool configuration file
|
||||
from all nodes in the specified pool
|
||||
* `grls` will retrieve all of the remote login settings for every node
|
||||
|
|
|
@ -20,9 +20,6 @@ once they are available for N-series VMs.
|
|||
are available for N-series VMs.
|
||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||
available for N-series VMs.
|
||||
* `gpu` property should be specified with the following members:
|
||||
* `nvidia_driver` property contains the following members:
|
||||
* `source` is a URL for the driver installer .run file
|
||||
|
||||
### Global Configuration
|
||||
The global configuration should set the following properties:
|
||||
|
|
|
@ -15,11 +15,6 @@
|
|||
"ssh": {
|
||||
"username": "docker"
|
||||
},
|
||||
"gpu": {
|
||||
"nvidia_driver": {
|
||||
"source": "<URL for nvidia driver for STANDARD_NV VMs>"
|
||||
}
|
||||
},
|
||||
"reboot_on_start_task_failed": false,
|
||||
"block_until_all_global_resources_loaded": true
|
||||
}
|
||||
|
|
|
@ -119,6 +119,16 @@ if [ -z $version ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
contains() {
    # Return 0 if $2 occurs within $1, otherwise return 1.
    # Removing the shortest prefix that matches "*$substring" alters the
    # string only when the substring is present. $substring is expanded
    # unquoted, so it is interpreted as a shell pattern.
    string="$1"
    substring="$2"
    [ "${string#*$substring}" != "$string" ]
}
|
||||
|
||||
check_for_buggy_ntfs_mount() {
|
||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||
set +e
|
||||
|
@ -187,7 +197,7 @@ refresh_package_index() {
|
|||
fi
|
||||
let retries=retries-1
|
||||
if [ $retries -eq 0 ]; then
|
||||
echo "Could not update package index"
|
||||
echo "ERROR: Could not update package index"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
|
@ -213,7 +223,7 @@ install_packages() {
|
|||
fi
|
||||
let retries=retries-1
|
||||
if [ $retries -eq 0 ]; then
|
||||
echo "Could not install packages: $*"
|
||||
echo "ERROR: Could not install packages: $*"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
|
@ -221,6 +231,35 @@ install_packages() {
|
|||
set -e
|
||||
}
|
||||
|
||||
docker_pull_image() {
    # Pull a docker image, retrying on transient registry errors.
    #   $1: image to pull
    # A failed pull whose output contains "toomanyrequests" or
    # "connection reset by peer" is retried, up to 60 attempts with a
    # random 1-5 second pause between attempts. Any other failure, or
    # running out of attempts, exits the script with the exit code of
    # the failed pull.
    image=$1
    # disable errexit so that a failed pull can be inspected
    set +e
    retries=60
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull "$image" 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be
        # invoked directly; wrapping it in [ ] hands its name to test as
        # a string operand and the check never succeeds.
        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
            # printf is used because bash echo does not expand \n
            printf 'WARNING: will retry:\n%s\n' "$pull_out"
        else
            printf 'ERROR:\n%s\n' "$pull_out"
            exit $rc
        fi
        retries=$((retries - 1))
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        sleep $(( (RANDOM % 5) + 1 ))
    done
    set -e
}
|
||||
|
||||
# check sdb1 mount
|
||||
check_for_buggy_ntfs_mount
|
||||
|
||||
|
@ -269,10 +308,10 @@ fi
|
|||
|
||||
# check if we're coming up from a reboot
|
||||
if [ -f $cascadefailed ]; then
|
||||
echo "$cascadefailed file exists, assuming cascade failure during node prep"
|
||||
echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
|
||||
exit 1
|
||||
elif [ -f $nodeprepfinished ]; then
|
||||
echo "$nodeprepfinished file exists, assuming successful completion of node prep"
|
||||
echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -343,11 +382,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
|
|||
repo=https://download.docker.com/linux/debian
|
||||
dockerversion=${dockerversion}debian
|
||||
else
|
||||
echo "unsupported sku: $sku for offer: $offer"
|
||||
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
|
||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
||||
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||
exit 1
|
||||
fi
|
||||
# reload network settings
|
||||
|
@ -375,7 +414,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
|
|||
fi
|
||||
let retries=retries-1
|
||||
if [ $retries -eq 0 ]; then
|
||||
echo "Could not add key for docker repo"
|
||||
echo "ERROR: Could not add key for docker repo"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
|
@ -470,7 +509,7 @@ EOF
|
|||
set +e
|
||||
while :
|
||||
do
|
||||
echo "Attempting to create nvidia-docker volume with version $nvdriverver"
|
||||
echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
|
||||
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
|
||||
if [ $? -eq 0 ]; then
|
||||
break
|
||||
|
@ -479,7 +518,7 @@ EOF
|
|||
NV_DIFF=$((($NV_NOW-$NV_START)/60))
|
||||
# fail after 5 minutes of attempts
|
||||
if [ $NV_DIFF -ge 5 ]; then
|
||||
echo "could not create nvidia-docker volume"
|
||||
echo "ERROR: could not create nvidia-docker volume"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
|
@ -507,7 +546,7 @@ EOF
|
|||
elif [ $server_type == "glusterfs" ]; then
|
||||
install_packages $offer glusterfs-client acl
|
||||
else
|
||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
@ -525,12 +564,12 @@ EOF
|
|||
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
|
||||
# ensure container only support
|
||||
if [ $cascadecontainer -eq 0 ]; then
|
||||
echo "only supported through shipyard container"
|
||||
echo "ERROR: only supported through shipyard container"
|
||||
exit 1
|
||||
fi
|
||||
# gpu is not supported on these offers
|
||||
if [ ! -z $gpu ]; then
|
||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
||||
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||
exit 1
|
||||
fi
|
||||
if [[ $sku == 7.* ]]; then
|
||||
|
@ -542,7 +581,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
|||
gfsenable="systemctl enable glusterd"
|
||||
rpcbindenable="systemctl enable rpcbind"
|
||||
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4
|
||||
echo "oracle linux is not supported at this time"
|
||||
echo "ERROR: oracle linux is not supported at this time"
|
||||
exit 1
|
||||
else
|
||||
srvenable="chkconfig docker on"
|
||||
|
@ -552,7 +591,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
|||
rpcbindenable="chkconfig rpcbind on"
|
||||
fi
|
||||
else
|
||||
echo "unsupported sku: $sku for offer: $offer"
|
||||
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||
exit 1
|
||||
fi
|
||||
# reload network settings
|
||||
|
@ -605,7 +644,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
|||
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
|
||||
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
|
||||
else
|
||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
@ -613,12 +652,12 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
|||
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||
# ensure container only support
|
||||
if [ $cascadecontainer -eq 0 ]; then
|
||||
echo "only supported through shipyard container"
|
||||
echo "ERROR: only supported through shipyard container"
|
||||
exit 1
|
||||
fi
|
||||
# gpu is not supported on these offers
|
||||
if [ ! -z $gpu ]; then
|
||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
||||
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||
exit 1
|
||||
fi
|
||||
# reload network settings
|
||||
|
@ -648,7 +687,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
|||
SUSEConnect -p sle-module-containers/12/x86_64 -r ''
|
||||
fi
|
||||
if [ -z $repodir ]; then
|
||||
echo "unsupported sku: $sku for offer: $offer"
|
||||
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||
exit 1
|
||||
fi
|
||||
# update index
|
||||
|
@ -694,7 +733,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
|||
zypper -n --gpg-auto-import-keys ref
|
||||
install_packages $offer glusterfs acl
|
||||
else
|
||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
@ -702,7 +741,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
|||
# if hpc sku, set up intel mpi
|
||||
if [[ $offer == sles-hpc* ]]; then
|
||||
if [ $sku != "12-sp1" ]; then
|
||||
echo "unsupported sku for intel mpi setup on SLES"
|
||||
echo "ERROR: unsupported sku for intel mpi setup on SLES"
|
||||
exit 1
|
||||
fi
|
||||
install_packages $offer lsb
|
||||
|
@ -712,13 +751,13 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
|||
fi
|
||||
fi
|
||||
else
|
||||
echo "unsupported offer: $offer (sku: $sku)"
|
||||
echo "ERROR: unsupported offer: $offer (sku: $sku)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# retrieve docker images related to data movement
|
||||
docker pull alfpark/blobxfer:$blobxferversion
|
||||
docker pull alfpark/batch-shipyard:tfm-$version
|
||||
docker_pull_image alfpark/blobxfer:$blobxferversion
|
||||
docker_pull_image alfpark/batch-shipyard:tfm-$version
|
||||
|
||||
# login to registry server
|
||||
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
||||
|
@ -734,15 +773,15 @@ if [ ! -z $sc_args ]; then
|
|||
for sc_arg in ${sc_args[@]}; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
|
||||
echo "Creating host directory for storage cluster $sc_arg at $mountpoint"
|
||||
echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
|
||||
mkdir -p $mountpoint
|
||||
chmod 777 $mountpoint
|
||||
echo "Adding $mountpoint to fstab"
|
||||
echo "INFO: Adding $mountpoint to fstab"
|
||||
# eval fstab var to expand vars (this is ok since it is set by shipyard)
|
||||
fstab_entry="${fstabs[$i]}"
|
||||
echo $fstab_entry >> /etc/fstab
|
||||
tail -n1 /etc/fstab
|
||||
echo "Mounting $mountpoint"
|
||||
echo "INFO: Mounting $mountpoint"
|
||||
START=$(date -u +"%s")
|
||||
set +e
|
||||
while :
|
||||
|
@ -755,14 +794,14 @@ if [ ! -z $sc_args ]; then
|
|||
DIFF=$((($NOW-$START)/60))
|
||||
# fail after 5 minutes of attempts
|
||||
if [ $DIFF -ge 5 ]; then
|
||||
echo "Could not mount storage cluster $sc_arg on: $mountpoint"
|
||||
echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
fi
|
||||
done
|
||||
set -e
|
||||
echo "$mountpoint mounted."
|
||||
echo "INFO: $mountpoint mounted."
|
||||
i=$(($i + 1))
|
||||
done
|
||||
fi
|
||||
|
@ -805,6 +844,8 @@ p2p=$p2p
|
|||
`env | grep DOCKER_LOGIN_`
|
||||
EOF
|
||||
chmod 600 $envfile
|
||||
# pull image
|
||||
docker_pull_image alfpark/batch-shipyard:cascade-$version
|
||||
# launch container
|
||||
docker run $detached --net=host --env-file $envfile \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
|
@ -847,7 +888,7 @@ if [ $p2penabled -eq 0 ]; then
|
|||
wait $cascadepid
|
||||
rc=$?
|
||||
if [ $rc -ne 0 ]; then
|
||||
echo "cascade exited with non-zero exit code: $rc"
|
||||
echo "ERROR: cascade exited with non-zero exit code: $rc"
|
||||
rm -f $nodeprepfinished
|
||||
exit $rc
|
||||
fi
|
||||
|
@ -859,7 +900,7 @@ rm -f $cascadefailed
|
|||
|
||||
# block until images ready if specified
|
||||
if [ ! -z $block ]; then
|
||||
echo "blocking until images ready: $block"
|
||||
echo "INFO: blocking until images ready: $block"
|
||||
IFS=',' read -ra RES <<< "$block"
|
||||
declare -a missing
|
||||
while :
|
||||
|
@ -870,7 +911,7 @@ if [ ! -z $block ]; then
|
|||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -eq 0 ]; then
|
||||
echo "all docker images present"
|
||||
echo "INFO: all docker images present"
|
||||
break
|
||||
else
|
||||
unset missing
|
||||
|
|
|
@ -81,6 +81,16 @@ done
|
|||
shift $((OPTIND-1))
|
||||
[ "$1" = "--" ] && shift
|
||||
|
||||
contains() {
    # Return 0 if $2 occurs within $1, otherwise return 1.
    # Removing the shortest prefix that matches "*$substring" alters the
    # string only when the substring is present. $substring is expanded
    # unquoted, so it is interpreted as a shell pattern.
    string="$1"
    substring="$2"
    [ "${string#*$substring}" != "$string" ]
}
|
||||
|
||||
check_for_buggy_ntfs_mount() {
|
||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||
set +e
|
||||
|
@ -249,6 +259,35 @@ install_azurefile_docker_volume_driver() {
|
|||
./azurefile-dockervolume-create.sh
|
||||
}
|
||||
|
||||
docker_pull_image() {
    # Pull a docker image, retrying on transient registry errors.
    #   $1: image to pull
    # A failed pull whose output contains "toomanyrequests" or
    # "connection reset by peer" is retried, up to 60 attempts with a
    # random 1-5 second pause between attempts. Any other failure, or
    # running out of attempts, exits the script with the exit code of
    # the failed pull.
    image=$1
    # disable errexit so that a failed pull can be inspected
    set +e
    retries=60
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull "$image" 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be
        # invoked directly; wrapping it in [ ] hands its name to test as
        # a string operand and the check never succeeds.
        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
            # printf is used because bash echo does not expand \n
            printf 'WARNING: will retry:\n%s\n' "$pull_out"
        else
            printf 'ERROR:\n%s\n' "$pull_out"
            exit $rc
        fi
        retries=$((retries - 1))
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        sleep $(( (RANDOM % 5) + 1 ))
    done
    set -e
}
|
||||
|
||||
# try to get /etc/lsb-release
|
||||
if [ -e /etc/lsb-release ]; then
|
||||
. /etc/lsb-release
|
||||
|
@ -397,8 +436,8 @@ if [ ! -z $sc_args ]; then
|
|||
fi
|
||||
|
||||
# retrieve docker images related to data movement
|
||||
docker pull alfpark/blobxfer:$blobxferversion
|
||||
docker pull alfpark/batch-shipyard:tfm-$version
|
||||
docker_pull_image alfpark/blobxfer:$blobxferversion
|
||||
docker_pull_image alfpark/batch-shipyard:tfm-$version
|
||||
|
||||
# login to registry server
|
||||
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
||||
|
@ -442,6 +481,8 @@ p2p=$p2p
|
|||
`env | grep DOCKER_LOGIN_`
|
||||
EOF
|
||||
chmod 600 $envfile
|
||||
# pull image
|
||||
docker_pull_image alfpark/batch-shipyard:cascade-$version
|
||||
# launch container
|
||||
docker run $detached --net=host --env-file $envfile \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
|
|
|
@ -1181,6 +1181,10 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
|
|||
|
||||
|
||||
@pool.command('delnode')
|
||||
@click.option(
|
||||
'--all-start-task-failed',
|
||||
is_flag=True,
|
||||
help='Deleted all nodes with start task failed state')
|
||||
@click.option(
|
||||
'--nodeid', help='NodeId of compute node in pool to delete')
|
||||
@common_options
|
||||
|
@ -1188,10 +1192,11 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
|
|||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def pool_delnode(ctx, nodeid):
|
||||
def pool_delnode(ctx, all_start_task_failed, nodeid):
|
||||
"""Delete a node from a pool"""
|
||||
ctx.initialize_for_batch()
|
||||
convoy.fleet.action_pool_delnode(ctx.batch_client, ctx.config, nodeid)
|
||||
convoy.fleet.action_pool_delnode(
|
||||
ctx.batch_client, ctx.config, all_start_task_failed, nodeid)
|
||||
|
||||
|
||||
@pool.command('rebootnode')
|
||||
|
|
Загрузка…
Ссылка в новой задаче