More improvements for scale out robustness

- Add --all-start-task-failed to delnode
- Reduce node output on pool allocation wait with number of nodes > 10
This commit is contained in:
Fred Park 2017-06-30 23:50:21 -07:00
Родитель 06188c1944
Коммит 2a48885da1
12 изменённых файлов: 270 добавлений и 87 удалений

3
.gitignore поставляемый
Просмотреть файл

@ -90,6 +90,7 @@ celerybeat-schedule
# project specific ignores # project specific ignores
shipyard shipyard
shipyard.cmd
ssh_docker_tunnel_shipyard.sh ssh_docker_tunnel_shipyard.sh
id_rsa_shipyard* id_rsa_shipyard*
resources/azurefile-dockervolume-create.sh resources/azurefile-dockervolume-create.sh
@ -97,4 +98,4 @@ resources/azurefile-dockervolumedriver
resources/azurefile-dockervolumedriver.env resources/azurefile-dockervolumedriver.env
resources/docker-registry-v2.tar.gz resources/docker-registry-v2.tar.gz
resources/nvidia-docker.deb resources/nvidia-docker.deb
resources/nvidia-driver.run resources/nvidia-driver*.run

Просмотреть файл

@ -2,6 +2,17 @@
## [Unreleased] ## [Unreleased]
### Added
- `--all-start-task-failed` parameter for `pool delnode`
### Changed
- Improve robustness of docker image pulls within node prep scripts
- Restrict node list queries until pool allocation state emerges from resizing
### Fixed
- Remove nvidia gpu driver property from FFmpeg recipe
- Further improve retry logic for docker image pulls in cascade
## [2.8.0rc2] - 2017-06-30 ## [2.8.0rc2] - 2017-06-30
### Added ### Added
- Support Mac OS X and Windows Subsystem for Linux installations via - Support Mac OS X and Windows Subsystem for Linux installations via

Просмотреть файл

@ -395,6 +395,18 @@ class DockerSaveThread(threading.Thread):
_DIRECTDL_DOWNLOADING.remove(self.resource) _DIRECTDL_DOWNLOADING.remove(self.resource)
_DIRECTDL.remove(self.resource) _DIRECTDL.remove(self.resource)
def _check_pull_output_overload(self, stdout: str, stderr: str) -> bool:
"""Check output for registry overload errors
:param str stdout: stdout
:param str stderr: stderr
:rtype: bool
:return: if error appears to be overload from registry
"""
if ('toomanyrequests' in stdout or 'toomanyrequests' in stderr or
'connection reset by peer' in stderr):
return True
return False
def _pull(self, image: str) -> tuple: def _pull(self, image: str) -> tuple:
"""Docker image pull with registry normalization """Docker image pull with registry normalization
:param str image: image to pull :param str image: image to pull
@ -414,7 +426,9 @@ class DockerSaveThread(threading.Thread):
shell=True, shell=True,
universal_newlines=True) universal_newlines=True)
stdout, stderr = proc.communicate() stdout, stderr = proc.communicate()
if proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE: if (proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE and
not _pub and
not self._check_pull_output_overload(stdout, stderr)):
logger.warning( logger.warning(
'could not pull from private registry, attempting ' 'could not pull from private registry, attempting '
'Docker Public Hub instead') 'Docker Public Hub instead')
@ -445,31 +459,19 @@ class DockerSaveThread(threading.Thread):
_record_perf('pull-start', 'img={}'.format(image)) _record_perf('pull-start', 'img={}'.format(image))
start = datetime.datetime.now() start = datetime.datetime.now()
logger.info('pulling image {} from {}'.format(image, _REGISTRY)) logger.info('pulling image {} from {}'.format(image, _REGISTRY))
npa_errors = 0
while True: while True:
rc, stdout, stderr = self._pull(image) rc, stdout, stderr = self._pull(image)
if rc != 0: if rc == 0:
fail = True
if 'toomanyrequests' in stdout or 'toomanyrequests' in stderr:
logger.error(
'Too many requests issued to registry server, '
'retrying...')
fail = False
time.sleep(random.randint(5, 30))
elif 'no pull access' in stdout or 'no pull access' in stderr:
npa_errors += 1
if npa_errors < 3:
fail = False
logger.error(
'No pull access to registry server, retrying in '
'case of temporary overload...')
time.sleep(random.randint(1, 10))
if fail:
raise RuntimeError(
'docker pull failed: stdout={} stderr={}'.format(
stdout, stderr))
else:
break break
elif self._check_pull_output_overload(stdout, stderr):
logger.error(
'Too many requests issued to registry server, '
'retrying...')
time.sleep(random.randint(5, 30))
else:
raise RuntimeError(
'docker pull failed: stdout={} stderr={}'.format(
stdout, stderr))
diff = (datetime.datetime.now() - start).total_seconds() diff = (datetime.datetime.now() - start).total_seconds()
logger.debug('took {} sec to pull docker image {} from {}'.format( logger.debug('took {} sec to pull docker image {} from {}'.format(
diff, image, _REGISTRY)) diff, image, _REGISTRY))

Просмотреть файл

@ -30,6 +30,7 @@ from builtins import ( # noqa
bytes, dict, int, list, object, range, str, ascii, chr, hex, input, bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
next, oct, open, pow, round, super, filter, map, zip) next, oct, open, pow, round, super, filter, map, zip)
# stdlib imports # stdlib imports
import collections
import datetime import datetime
import fnmatch import fnmatch
import getpass import getpass
@ -39,6 +40,7 @@ try:
except ImportError: except ImportError:
import pathlib import pathlib
import os import os
import ssl
import tempfile import tempfile
import time import time
# non-stdlib imports # non-stdlib imports
@ -65,6 +67,23 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
elevation_level=batchmodels.ElevationLevel.admin, elevation_level=batchmodels.ElevationLevel.admin,
) )
) )
# Immutable summary of how many pool compute nodes are in each Azure Batch
# node state; emitted instead of a per-node listing when a pool has many
# nodes. NOTE(review): field order is part of the public interface for any
# positional construction/unpacking — do not reorder.
NodeStateCountCollection = collections.namedtuple(
    'NodeStateCountCollection', [
        'creating',
        'idle',
        'leaving_pool',
        'offline',
        'preempted',
        'rebooting',
        'reimaging',
        'running',
        'start_task_failed',
        'starting',
        'unknown',
        'unusable',
        'waiting_for_start_task',
    ]
)
def get_batch_account(batch_mgmt_client, config): def get_batch_account(batch_mgmt_client, config):
@ -262,6 +281,7 @@ def _block_for_nodes_ready(
pool.target_dedicated_nodes == 0)): pool.target_dedicated_nodes == 0)):
fatal_resize_error = True fatal_resize_error = True
if fatal_resize_error: if fatal_resize_error:
list_nodes(batch_client, config)
raise RuntimeError( raise RuntimeError(
'Fatal resize errors encountered for pool {}: {}'.format( 'Fatal resize errors encountered for pool {}: {}'.format(
pool.id, os.linesep.join(errors))) pool.id, os.linesep.join(errors)))
@ -269,7 +289,17 @@ def _block_for_nodes_ready(
logger.error( logger.error(
'Resize errors encountered for pool {}: {}'.format( 'Resize errors encountered for pool {}: {}'.format(
pool.id, os.linesep.join(errors))) pool.id, os.linesep.join(errors)))
nodes = list(batch_client.compute_node.list(pool.id)) # check pool allocation state
if pool.allocation_state == batchmodels.AllocationState.resizing:
nodes = []
else:
try:
nodes = list(batch_client.compute_node.list(pool.id))
except ssl.SSLError:
# SSL error happens sometimes on paging... this is probably
# a bug in the underlying msrest/msrestazure library that
# is reusing the SSL connection improperly
nodes = []
# check if any nodes are in start task failed state # check if any nodes are in start task failed state
if (any(node.state == batchmodels.ComputeNodeState.start_task_failed if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
for node in nodes)): for node in nodes)):
@ -301,7 +331,10 @@ def _block_for_nodes_ready(
_reboot_node(batch_client, pool.id, node.id, True) _reboot_node(batch_client, pool.id, node.id, True)
reboot_map[node.id] += 1 reboot_map[node.id] += 1
# refresh node list to reflect rebooting states # refresh node list to reflect rebooting states
nodes = list(batch_client.compute_node.list(pool.id)) try:
nodes = list(batch_client.compute_node.list(pool.id))
except ssl.SSLError:
nodes = []
else: else:
# fast path check for start task failures in non-reboot mode # fast path check for start task failures in non-reboot mode
logger.error( logger.error(
@ -321,7 +354,6 @@ def _block_for_nodes_ready(
pool.target_low_priority_nodes) and pool.target_low_priority_nodes) and
all(node.state in stopping_states for node in nodes)): all(node.state in stopping_states for node in nodes)):
if any(node.state not in end_states for node in nodes): if any(node.state not in end_states for node in nodes):
# list nodes of pool
list_nodes(batch_client, config) list_nodes(batch_client, config)
raise RuntimeError( raise RuntimeError(
('Node(s) of pool {} not in {} state. Please inspect the ' ('Node(s) of pool {} not in {} state. Please inspect the '
@ -336,14 +368,48 @@ def _block_for_nodes_ready(
i = 0 i = 0
logger.debug( logger.debug(
('waiting for {} dedicated nodes and {} low priority nodes ' ('waiting for {} dedicated nodes and {} low priority nodes '
'to reach desired state').format( 'to reach desired state in pool {} with '
'allocation_state={}').format(
pool.target_dedicated_nodes, pool.target_dedicated_nodes,
pool.target_low_priority_nodes)) pool.target_low_priority_nodes,
for node in nodes: pool.id,
logger.debug('{}: {}'.format(node.id, node.state)) pool.allocation_state))
if len(nodes) < 10:
for node in nodes:
logger.debug('{}: {}'.format(node.id, node.state))
else:
logger.debug(_node_state_counts(nodes))
time.sleep(10) time.sleep(10)
def _node_state_counts(nodes):
    # type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
    """Collate counts of nodes in each compute node state.
    :param list nodes: list of nodes
    :rtype: NodeStateCountCollection
    :return: node state count collection
    """
    # single pass over the nodes instead of one list.count() scan per state
    state_counts = collections.Counter(node.state for node in nodes)
    # Counter returns 0 for absent states, matching list.count() semantics
    return NodeStateCountCollection(
        creating=state_counts[batchmodels.ComputeNodeState.creating],
        idle=state_counts[batchmodels.ComputeNodeState.idle],
        leaving_pool=state_counts[
            batchmodels.ComputeNodeState.leaving_pool],
        offline=state_counts[batchmodels.ComputeNodeState.offline],
        preempted=state_counts[batchmodels.ComputeNodeState.preempted],
        rebooting=state_counts[batchmodels.ComputeNodeState.rebooting],
        reimaging=state_counts[batchmodels.ComputeNodeState.reimaging],
        running=state_counts[batchmodels.ComputeNodeState.running],
        start_task_failed=state_counts[
            batchmodels.ComputeNodeState.start_task_failed],
        starting=state_counts[batchmodels.ComputeNodeState.starting],
        unknown=state_counts[batchmodels.ComputeNodeState.unknown],
        unusable=state_counts[batchmodels.ComputeNodeState.unusable],
        waiting_for_start_task=state_counts[
            batchmodels.ComputeNodeState.waiting_for_start_task],
    )
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None): def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
# type: (batch.BatchServiceClient, dict, str, # type: (batch.BatchServiceClient, dict, str,
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode] # List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
@ -751,25 +817,45 @@ def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
_reboot_node(batch_client, pool_id, node_id, False) _reboot_node(batch_client, pool_id, node_id, False)
def del_node(batch_client, config, node_id): def del_node(batch_client, config, all_start_task_failed, node_id):
# type: (batch.BatchServiceClient, dict, str) -> None # type: (batch.BatchServiceClient, dict, bool, str) -> None
"""Delete a node in a pool """Delete a node in a pool
:param batch_client: The batch client to use. :param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict :param dict config: configuration dict
:param bool all_start_task_failed: delete all start task failed nodes
:param str node_id: node id to delete :param str node_id: node id to delete
""" """
if util.is_none_or_empty(node_id): node_ids = []
raise ValueError('node id is invalid')
pool_id = settings.pool_id(config) pool_id = settings.pool_id(config)
if not util.confirm_action( if all_start_task_failed:
config, 'delete node {} from {} pool'.format(node_id, pool_id)): nodes = list(
batch_client.compute_node.list(
pool_id=pool_id,
compute_node_list_options=batchmodels.ComputeNodeListOptions(
filter='state eq \'starttaskfailed\'',
),
))
for node in nodes:
if util.confirm_action(
config, 'delete node {} from {} pool'.format(
node.id, pool_id)):
node_ids.append(node.id)
else:
if util.is_none_or_empty(node_id):
raise ValueError('node id is invalid')
if util.confirm_action(
config, 'delete node {} from {} pool'.format(
node_id, pool_id)):
node_ids.append(node_id)
if util.is_none_or_empty(node_ids):
logger.warning('no nodes to delete from pool: {}'.format(pool_id))
return return
logger.info('Deleting node {} from pool {}'.format(node_id, pool_id)) logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
batch_client.pool.remove_nodes( batch_client.pool.remove_nodes(
pool_id=pool_id, pool_id=pool_id,
node_remove_parameter=batchmodels.NodeRemoveParameter( node_remove_parameter=batchmodels.NodeRemoveParameter(
node_list=[node_id], node_list=node_ids,
) )
) )

Просмотреть файл

@ -2280,15 +2280,16 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
util.subprocess_with_output(ssh_cmd) util.subprocess_with_output(ssh_cmd)
def action_pool_delnode(batch_client, config, nodeid): def action_pool_delnode(batch_client, config, all_start_task_failed, nodeid):
# type: (batchsc.BatchServiceClient, dict, str) -> None # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
"""Action: Pool Delnode """Action: Pool Delnode
:param azure.batch.batch_service_client.BatchServiceClient batch_client: :param azure.batch.batch_service_client.BatchServiceClient batch_client:
batch client batch client
:param dict config: configuration dict :param dict config: configuration dict
:param bool all_start_task_failed: delete all start task failed nodes
:param str nodeid: nodeid to delete :param str nodeid: nodeid to delete
""" """
batch.del_node(batch_client, config, nodeid) batch.del_node(batch_client, config, all_start_task_failed, nodeid)
def action_pool_rebootnode( def action_pool_rebootnode(

Просмотреть файл

@ -129,8 +129,8 @@ It is recommended to follow the steps outlined on
[this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx) [this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
to install Batch Shipyard on a Python3 installation rather than the default to install Batch Shipyard on a Python3 installation rather than the default
Python 2.7 that is shipped with Mac OS X. However, if you prefer to use Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
the system defaulted Python 2.7, the installation will work that environment the system defaulted Python 2.7, the installation will work with that
as well. environment as well.
The `install.sh` script supports isolated installation through a virtual The `install.sh` script supports isolated installation through a virtual
environment so that other system-wide or user python dependencies are left environment so that other system-wide or user python dependencies are left

Просмотреть файл

@ -456,6 +456,9 @@ Azure Storage.
pool configuration file pool configuration file
* `--wait` will wait for deletion to complete * `--wait` will wait for deletion to complete
* `delnode` will delete the specified node from the pool * `delnode` will delete the specified node from the pool
* `--all-start-task-failed` will delete all nodes in the start task
failed state
* `--nodeid` is the node id to delete
* `dsu` will delete the SSH user defined in the pool configuration file * `dsu` will delete the SSH user defined in the pool configuration file
from all nodes in the specified pool from all nodes in the specified pool
* `grls` will retrieve all of the remote login settings for every node * `grls` will retrieve all of the remote login settings for every node

Просмотреть файл

@ -20,9 +20,6 @@ once they are available for N-series VMs.
are available for N-series VMs. are available for N-series VMs.
* `sku` should be `16.04-LTS`. Other skus will be supported once they are * `sku` should be `16.04-LTS`. Other skus will be supported once they are
available for N-series VMs. available for N-series VMs.
* `gpu` property should be specified with the following members:
* `nvidia_driver` property contains the following members:
* `source` is a URL for the driver installer .run file
### Global Configuration ### Global Configuration
The global configuration should set the following properties: The global configuration should set the following properties:

Просмотреть файл

@ -15,11 +15,6 @@
"ssh": { "ssh": {
"username": "docker" "username": "docker"
}, },
"gpu": {
"nvidia_driver": {
"source": "<URL for nvidia driver for STANDARD_NV VMs>"
}
},
"reboot_on_start_task_failed": false, "reboot_on_start_task_failed": false,
"block_until_all_global_resources_loaded": true "block_until_all_global_resources_loaded": true
} }

Просмотреть файл

@ -119,6 +119,16 @@ if [ -z $version ]; then
exit 1 exit 1
fi fi
contains() {
    # Test whether string $1 contains substring $2.
    # Uses POSIX suffix-stripping: ${string#*$substring} removes the shortest
    # prefix ending with the substring; if the result equals the original
    # string, the substring was not present.
    # Returns 0 (true) if present, 1 (false) otherwise.
    string="$1"
    substring="$2"
    if test "${string#*$substring}" != "$string"; then
        return 0
    else
        return 1
    fi
}
check_for_buggy_ntfs_mount() { check_for_buggy_ntfs_mount() {
# Check to ensure sdb1 mount is not mounted as ntfs # Check to ensure sdb1 mount is not mounted as ntfs
set +e set +e
@ -187,7 +197,7 @@ refresh_package_index() {
fi fi
let retries=retries-1 let retries=retries-1
if [ $retries -eq 0 ]; then if [ $retries -eq 0 ]; then
echo "Could not update package index" echo "ERROR: Could not update package index"
exit 1 exit 1
fi fi
sleep 1 sleep 1
@ -213,7 +223,7 @@ install_packages() {
fi fi
let retries=retries-1 let retries=retries-1
if [ $retries -eq 0 ]; then if [ $retries -eq 0 ]; then
echo "Could not install packages: $*" echo "ERROR: Could not install packages: $*"
exit 1 exit 1
fi fi
sleep 1 sleep 1
@ -221,6 +231,35 @@ install_packages() {
set -e set -e
} }
docker_pull_image() {
    # Pull a docker image, retrying on transient registry overload
    # (toomanyrequests rate limiting or connection resets). Any other
    # pull failure is fatal.
    # $1: docker image to pull
    image=$1
    set +e
    retries=60
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull $image 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be
        # invoked directly; wrapping it in [ ... ] is a test(1) syntax
        # error that would make the retry branch unreachable.
        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
            echo "WARNING: will retry:"
            echo "$pull_out"
        else
            echo "ERROR:"
            echo "$pull_out"
            exit $rc
        fi
        let retries=retries-1
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        # random backoff of 1-5 seconds between attempts
        sleep $[($RANDOM % 5) + 1]s
    done
    set -e
}
# check sdb1 mount # check sdb1 mount
check_for_buggy_ntfs_mount check_for_buggy_ntfs_mount
@ -269,10 +308,10 @@ fi
# check if we're coming up from a reboot # check if we're coming up from a reboot
if [ -f $cascadefailed ]; then if [ -f $cascadefailed ]; then
echo "$cascadefailed file exists, assuming cascade failure during node prep" echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
exit 1 exit 1
elif [ -f $nodeprepfinished ]; then elif [ -f $nodeprepfinished ]; then
echo "$nodeprepfinished file exists, assuming successful completion of node prep" echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
exit 0 exit 0
fi fi
@ -343,11 +382,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
repo=https://download.docker.com/linux/debian repo=https://download.docker.com/linux/debian
dockerversion=${dockerversion}debian dockerversion=${dockerversion}debian
else else
echo "unsupported sku: $sku for offer: $offer" echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1 exit 1
fi fi
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
echo "gpu unsupported on this sku: $sku for offer $offer" echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1 exit 1
fi fi
# reload network settings # reload network settings
@ -375,7 +414,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
fi fi
let retries=retries-1 let retries=retries-1
if [ $retries -eq 0 ]; then if [ $retries -eq 0 ]; then
echo "Could not add key for docker repo" echo "ERROR: Could not add key for docker repo"
exit 1 exit 1
fi fi
sleep 1 sleep 1
@ -470,7 +509,7 @@ EOF
set +e set +e
while : while :
do do
echo "Attempting to create nvidia-docker volume with version $nvdriverver" echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
break break
@ -479,7 +518,7 @@ EOF
NV_DIFF=$((($NV_NOW-$NV_START)/60)) NV_DIFF=$((($NV_NOW-$NV_START)/60))
# fail after 5 minutes of attempts # fail after 5 minutes of attempts
if [ $NV_DIFF -ge 5 ]; then if [ $NV_DIFF -ge 5 ]; then
echo "could not create nvidia-docker volume" echo "ERROR: could not create nvidia-docker volume"
exit 1 exit 1
fi fi
sleep 1 sleep 1
@ -507,7 +546,7 @@ EOF
elif [ $server_type == "glusterfs" ]; then elif [ $server_type == "glusterfs" ]; then
install_packages $offer glusterfs-client acl install_packages $offer glusterfs-client acl
else else
echo "Unknown file server type ${sc[0]} for ${sc[1]}" echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1 exit 1
fi fi
done done
@ -525,12 +564,12 @@ EOF
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
# ensure container only support # ensure container only support
if [ $cascadecontainer -eq 0 ]; then if [ $cascadecontainer -eq 0 ]; then
echo "only supported through shipyard container" echo "ERROR: only supported through shipyard container"
exit 1 exit 1
fi fi
# gpu is not supported on these offers # gpu is not supported on these offers
if [ ! -z $gpu ]; then if [ ! -z $gpu ]; then
echo "gpu unsupported on this sku: $sku for offer $offer" echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1 exit 1
fi fi
if [[ $sku == 7.* ]]; then if [[ $sku == 7.* ]]; then
@ -542,7 +581,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
gfsenable="systemctl enable glusterd" gfsenable="systemctl enable glusterd"
rpcbindenable="systemctl enable rpcbind" rpcbindenable="systemctl enable rpcbind"
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4 # TODO, in order to support docker > 1.9, need to upgrade to UEKR4
echo "oracle linux is not supported at this time" echo "ERROR: oracle linux is not supported at this time"
exit 1 exit 1
else else
srvenable="chkconfig docker on" srvenable="chkconfig docker on"
@ -552,7 +591,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
rpcbindenable="chkconfig rpcbind on" rpcbindenable="chkconfig rpcbind on"
fi fi
else else
echo "unsupported sku: $sku for offer: $offer" echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1 exit 1
fi fi
# reload network settings # reload network settings
@ -605,7 +644,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
else else
echo "Unknown file server type ${sc[0]} for ${sc[1]}" echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1 exit 1
fi fi
done done
@ -613,12 +652,12 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
# ensure container only support # ensure container only support
if [ $cascadecontainer -eq 0 ]; then if [ $cascadecontainer -eq 0 ]; then
echo "only supported through shipyard container" echo "ERROR: only supported through shipyard container"
exit 1 exit 1
fi fi
# gpu is not supported on these offers # gpu is not supported on these offers
if [ ! -z $gpu ]; then if [ ! -z $gpu ]; then
echo "gpu unsupported on this sku: $sku for offer $offer" echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1 exit 1
fi fi
# reload network settings # reload network settings
@ -648,7 +687,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
SUSEConnect -p sle-module-containers/12/x86_64 -r '' SUSEConnect -p sle-module-containers/12/x86_64 -r ''
fi fi
if [ -z $repodir ]; then if [ -z $repodir ]; then
echo "unsupported sku: $sku for offer: $offer" echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1 exit 1
fi fi
# update index # update index
@ -694,7 +733,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
zypper -n --gpg-auto-import-keys ref zypper -n --gpg-auto-import-keys ref
install_packages $offer glusterfs acl install_packages $offer glusterfs acl
else else
echo "Unknown file server type ${sc[0]} for ${sc[1]}" echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1 exit 1
fi fi
done done
@ -702,7 +741,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
# if hpc sku, set up intel mpi # if hpc sku, set up intel mpi
if [[ $offer == sles-hpc* ]]; then if [[ $offer == sles-hpc* ]]; then
if [ $sku != "12-sp1" ]; then if [ $sku != "12-sp1" ]; then
echo "unsupported sku for intel mpi setup on SLES" echo "ERROR: unsupported sku for intel mpi setup on SLES"
exit 1 exit 1
fi fi
install_packages $offer lsb install_packages $offer lsb
@ -712,13 +751,13 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
fi fi
fi fi
else else
echo "unsupported offer: $offer (sku: $sku)" echo "ERROR: unsupported offer: $offer (sku: $sku)"
exit 1 exit 1
fi fi
# retrieve docker images related to data movement # retrieve docker images related to data movement
docker pull alfpark/blobxfer:$blobxferversion docker_pull_image alfpark/blobxfer:$blobxferversion
docker pull alfpark/batch-shipyard:tfm-$version docker_pull_image alfpark/batch-shipyard:tfm-$version
# login to registry server # login to registry server
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@ -734,15 +773,15 @@ if [ ! -z $sc_args ]; then
for sc_arg in ${sc_args[@]}; do for sc_arg in ${sc_args[@]}; do
IFS=':' read -ra sc <<< "$sc_arg" IFS=':' read -ra sc <<< "$sc_arg"
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]} mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
echo "Creating host directory for storage cluster $sc_arg at $mountpoint" echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
mkdir -p $mountpoint mkdir -p $mountpoint
chmod 777 $mountpoint chmod 777 $mountpoint
echo "Adding $mountpoint to fstab" echo "INFO: Adding $mountpoint to fstab"
# eval fstab var to expand vars (this is ok since it is set by shipyard) # eval fstab var to expand vars (this is ok since it is set by shipyard)
fstab_entry="${fstabs[$i]}" fstab_entry="${fstabs[$i]}"
echo $fstab_entry >> /etc/fstab echo $fstab_entry >> /etc/fstab
tail -n1 /etc/fstab tail -n1 /etc/fstab
echo "Mounting $mountpoint" echo "INFO: Mounting $mountpoint"
START=$(date -u +"%s") START=$(date -u +"%s")
set +e set +e
while : while :
@ -755,14 +794,14 @@ if [ ! -z $sc_args ]; then
DIFF=$((($NOW-$START)/60)) DIFF=$((($NOW-$START)/60))
# fail after 5 minutes of attempts # fail after 5 minutes of attempts
if [ $DIFF -ge 5 ]; then if [ $DIFF -ge 5 ]; then
echo "Could not mount storage cluster $sc_arg on: $mountpoint" echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
exit 1 exit 1
fi fi
sleep 1 sleep 1
fi fi
done done
set -e set -e
echo "$mountpoint mounted." echo "INFO: $mountpoint mounted."
i=$(($i + 1)) i=$(($i + 1))
done done
fi fi
@ -805,6 +844,8 @@ p2p=$p2p
`env | grep DOCKER_LOGIN_` `env | grep DOCKER_LOGIN_`
EOF EOF
chmod 600 $envfile chmod 600 $envfile
# pull image
docker_pull_image alfpark/batch-shipyard:cascade-$version
# launch container # launch container
docker run $detached --net=host --env-file $envfile \ docker run $detached --net=host --env-file $envfile \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \
@ -847,7 +888,7 @@ if [ $p2penabled -eq 0 ]; then
wait $cascadepid wait $cascadepid
rc=$? rc=$?
if [ $rc -ne 0 ]; then if [ $rc -ne 0 ]; then
echo "cascade exited with non-zero exit code: $rc" echo "ERROR: cascade exited with non-zero exit code: $rc"
rm -f $nodeprepfinished rm -f $nodeprepfinished
exit $rc exit $rc
fi fi
@ -859,7 +900,7 @@ rm -f $cascadefailed
# block until images ready if specified # block until images ready if specified
if [ ! -z $block ]; then if [ ! -z $block ]; then
echo "blocking until images ready: $block" echo "INFO: blocking until images ready: $block"
IFS=',' read -ra RES <<< "$block" IFS=',' read -ra RES <<< "$block"
declare -a missing declare -a missing
while : while :
@ -870,7 +911,7 @@ if [ ! -z $block ]; then
fi fi
done done
if [ ${#missing[@]} -eq 0 ]; then if [ ${#missing[@]} -eq 0 ]; then
echo "all docker images present" echo "INFO: all docker images present"
break break
else else
unset missing unset missing

Просмотреть файл

@ -81,6 +81,16 @@ done
shift $((OPTIND-1)) shift $((OPTIND-1))
[ "$1" = "--" ] && shift [ "$1" = "--" ] && shift
contains() {
    # Test whether string $1 contains substring $2.
    # Uses POSIX suffix-stripping: ${string#*$substring} removes the shortest
    # prefix ending with the substring; if the result equals the original
    # string, the substring was not present.
    # Returns 0 (true) if present, 1 (false) otherwise.
    string="$1"
    substring="$2"
    if test "${string#*$substring}" != "$string"; then
        return 0
    else
        return 1
    fi
}
check_for_buggy_ntfs_mount() { check_for_buggy_ntfs_mount() {
# Check to ensure sdb1 mount is not mounted as ntfs # Check to ensure sdb1 mount is not mounted as ntfs
set +e set +e
@ -249,6 +259,35 @@ install_azurefile_docker_volume_driver() {
./azurefile-dockervolume-create.sh ./azurefile-dockervolume-create.sh
} }
docker_pull_image() {
    # Pull a docker image, retrying on transient registry overload
    # (toomanyrequests rate limiting or connection resets). Any other
    # pull failure is fatal.
    # $1: docker image to pull
    image=$1
    set +e
    retries=60
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull $image 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be
        # invoked directly; wrapping it in [ ... ] is a test(1) syntax
        # error that would make the retry branch unreachable.
        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
            echo "WARNING: will retry:"
            echo "$pull_out"
        else
            echo "ERROR:"
            echo "$pull_out"
            exit $rc
        fi
        let retries=retries-1
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        # random backoff of 1-5 seconds between attempts
        sleep $[($RANDOM % 5) + 1]s
    done
    set -e
}
# try to get /etc/lsb-release # try to get /etc/lsb-release
if [ -e /etc/lsb-release ]; then if [ -e /etc/lsb-release ]; then
. /etc/lsb-release . /etc/lsb-release
@ -397,8 +436,8 @@ if [ ! -z $sc_args ]; then
fi fi
# retrieve docker images related to data movement # retrieve docker images related to data movement
docker pull alfpark/blobxfer:$blobxferversion docker_pull_image alfpark/blobxfer:$blobxferversion
docker pull alfpark/batch-shipyard:tfm-$version docker_pull_image alfpark/batch-shipyard:tfm-$version
# login to registry server # login to registry server
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@ -442,6 +481,8 @@ p2p=$p2p
`env | grep DOCKER_LOGIN_` `env | grep DOCKER_LOGIN_`
EOF EOF
chmod 600 $envfile chmod 600 $envfile
# pull image
docker_pull_image alfpark/batch-shipyard:cascade-$version
# launch container # launch container
docker run $detached --net=host --env-file $envfile \ docker run $detached --net=host --env-file $envfile \
-v /var/run/docker.sock:/var/run/docker.sock \ -v /var/run/docker.sock:/var/run/docker.sock \

Просмотреть файл

@ -1181,6 +1181,10 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
@pool.command('delnode') @pool.command('delnode')
@click.option(
'--all-start-task-failed',
is_flag=True,
help='Delete all nodes with start task failed state')
@click.option( @click.option(
'--nodeid', help='NodeId of compute node in pool to delete') '--nodeid', help='NodeId of compute node in pool to delete')
@common_options @common_options
@ -1188,10 +1192,11 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
@keyvault_options @keyvault_options
@aad_options @aad_options
@pass_cli_context @pass_cli_context
def pool_delnode(ctx, nodeid): def pool_delnode(ctx, all_start_task_failed, nodeid):
"""Delete a node from a pool""" """Delete a node from a pool"""
ctx.initialize_for_batch() ctx.initialize_for_batch()
convoy.fleet.action_pool_delnode(ctx.batch_client, ctx.config, nodeid) convoy.fleet.action_pool_delnode(
ctx.batch_client, ctx.config, all_start_task_failed, nodeid)
@pool.command('rebootnode') @pool.command('rebootnode')