More improvements for scale out robustness
- Add --all-start-task-failed to delnode - Reduce node output on pool allocation wait with number of nodes > 10
This commit is contained in:
Родитель
06188c1944
Коммит
2a48885da1
|
@ -90,6 +90,7 @@ celerybeat-schedule
|
||||||
|
|
||||||
# project specific ignores
|
# project specific ignores
|
||||||
shipyard
|
shipyard
|
||||||
|
shipyard.cmd
|
||||||
ssh_docker_tunnel_shipyard.sh
|
ssh_docker_tunnel_shipyard.sh
|
||||||
id_rsa_shipyard*
|
id_rsa_shipyard*
|
||||||
resources/azurefile-dockervolume-create.sh
|
resources/azurefile-dockervolume-create.sh
|
||||||
|
@ -97,4 +98,4 @@ resources/azurefile-dockervolumedriver
|
||||||
resources/azurefile-dockervolumedriver.env
|
resources/azurefile-dockervolumedriver.env
|
||||||
resources/docker-registry-v2.tar.gz
|
resources/docker-registry-v2.tar.gz
|
||||||
resources/nvidia-docker.deb
|
resources/nvidia-docker.deb
|
||||||
resources/nvidia-driver.run
|
resources/nvidia-driver*.run
|
||||||
|
|
11
CHANGELOG.md
11
CHANGELOG.md
|
@ -2,6 +2,17 @@
|
||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `--all-start-task-failed` parameter for `pool delnode`
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Improve robustness of docker image pulls within node prep scripts
|
||||||
|
- Restrict node list queries until pool allocation state emerges from resizing
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Remove nvidia gpu driver property from FFmpeg recipe
|
||||||
|
- Further improve retry logic for docker image pulls in cascade
|
||||||
|
|
||||||
## [2.8.0rc2] - 2017-06-30
|
## [2.8.0rc2] - 2017-06-30
|
||||||
### Added
|
### Added
|
||||||
- Support Mac OS X and Windows Subsystem for Linux installations via
|
- Support Mac OS X and Windows Subsystem for Linux installations via
|
||||||
|
|
|
@ -395,6 +395,18 @@ class DockerSaveThread(threading.Thread):
|
||||||
_DIRECTDL_DOWNLOADING.remove(self.resource)
|
_DIRECTDL_DOWNLOADING.remove(self.resource)
|
||||||
_DIRECTDL.remove(self.resource)
|
_DIRECTDL.remove(self.resource)
|
||||||
|
|
||||||
|
def _check_pull_output_overload(self, stdout: str, stderr: str) -> bool:
|
||||||
|
"""Check output for registry overload errors
|
||||||
|
:param str stdout: stdout
|
||||||
|
:param str stderr: stderr
|
||||||
|
:rtype: bool
|
||||||
|
:return: if error appears to be overload from registry
|
||||||
|
"""
|
||||||
|
if ('toomanyrequests' in stdout or 'toomanyrequests' in stderr or
|
||||||
|
'connection reset by peer' in stderr):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def _pull(self, image: str) -> tuple:
|
def _pull(self, image: str) -> tuple:
|
||||||
"""Docker image pull with registry normalization
|
"""Docker image pull with registry normalization
|
||||||
:param str image: image to pull
|
:param str image: image to pull
|
||||||
|
@ -414,7 +426,9 @@ class DockerSaveThread(threading.Thread):
|
||||||
shell=True,
|
shell=True,
|
||||||
universal_newlines=True)
|
universal_newlines=True)
|
||||||
stdout, stderr = proc.communicate()
|
stdout, stderr = proc.communicate()
|
||||||
if proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE:
|
if (proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE and
|
||||||
|
not _pub and
|
||||||
|
not self._check_pull_output_overload(stdout, stderr)):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
'could not pull from private registry, attempting '
|
'could not pull from private registry, attempting '
|
||||||
'Docker Public Hub instead')
|
'Docker Public Hub instead')
|
||||||
|
@ -445,31 +459,19 @@ class DockerSaveThread(threading.Thread):
|
||||||
_record_perf('pull-start', 'img={}'.format(image))
|
_record_perf('pull-start', 'img={}'.format(image))
|
||||||
start = datetime.datetime.now()
|
start = datetime.datetime.now()
|
||||||
logger.info('pulling image {} from {}'.format(image, _REGISTRY))
|
logger.info('pulling image {} from {}'.format(image, _REGISTRY))
|
||||||
npa_errors = 0
|
|
||||||
while True:
|
while True:
|
||||||
rc, stdout, stderr = self._pull(image)
|
rc, stdout, stderr = self._pull(image)
|
||||||
if rc != 0:
|
if rc == 0:
|
||||||
fail = True
|
|
||||||
if 'toomanyrequests' in stdout or 'toomanyrequests' in stderr:
|
|
||||||
logger.error(
|
|
||||||
'Too many requests issued to registry server, '
|
|
||||||
'retrying...')
|
|
||||||
fail = False
|
|
||||||
time.sleep(random.randint(5, 30))
|
|
||||||
elif 'no pull access' in stdout or 'no pull access' in stderr:
|
|
||||||
npa_errors += 1
|
|
||||||
if npa_errors < 3:
|
|
||||||
fail = False
|
|
||||||
logger.error(
|
|
||||||
'No pull access to registry server, retrying in '
|
|
||||||
'case of temporary overload...')
|
|
||||||
time.sleep(random.randint(1, 10))
|
|
||||||
if fail:
|
|
||||||
raise RuntimeError(
|
|
||||||
'docker pull failed: stdout={} stderr={}'.format(
|
|
||||||
stdout, stderr))
|
|
||||||
else:
|
|
||||||
break
|
break
|
||||||
|
elif self._check_pull_output_overload(stdout, stderr):
|
||||||
|
logger.error(
|
||||||
|
'Too many requests issued to registry server, '
|
||||||
|
'retrying...')
|
||||||
|
time.sleep(random.randint(5, 30))
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
'docker pull failed: stdout={} stderr={}'.format(
|
||||||
|
stdout, stderr))
|
||||||
diff = (datetime.datetime.now() - start).total_seconds()
|
diff = (datetime.datetime.now() - start).total_seconds()
|
||||||
logger.debug('took {} sec to pull docker image {} from {}'.format(
|
logger.debug('took {} sec to pull docker image {} from {}'.format(
|
||||||
diff, image, _REGISTRY))
|
diff, image, _REGISTRY))
|
||||||
|
|
116
convoy/batch.py
116
convoy/batch.py
|
@ -30,6 +30,7 @@ from builtins import ( # noqa
|
||||||
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
|
bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
|
||||||
next, oct, open, pow, round, super, filter, map, zip)
|
next, oct, open, pow, round, super, filter, map, zip)
|
||||||
# stdlib imports
|
# stdlib imports
|
||||||
|
import collections
|
||||||
import datetime
|
import datetime
|
||||||
import fnmatch
|
import fnmatch
|
||||||
import getpass
|
import getpass
|
||||||
|
@ -39,6 +40,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import pathlib
|
import pathlib
|
||||||
import os
|
import os
|
||||||
|
import ssl
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
# non-stdlib imports
|
# non-stdlib imports
|
||||||
|
@ -65,6 +67,23 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
|
||||||
elevation_level=batchmodels.ElevationLevel.admin,
|
elevation_level=batchmodels.ElevationLevel.admin,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# Tally of compute nodes per node state; populated by _node_state_counts
# with one count per field
NodeStateCountCollection = collections.namedtuple(
    'NodeStateCountCollection', (
        'creating', 'idle', 'leaving_pool', 'offline', 'preempted',
        'rebooting', 'reimaging', 'running', 'start_task_failed',
        'starting', 'unknown', 'unusable', 'waiting_for_start_task',
    )
)
|
||||||
|
|
||||||
|
|
||||||
def get_batch_account(batch_mgmt_client, config):
|
def get_batch_account(batch_mgmt_client, config):
|
||||||
|
@ -262,6 +281,7 @@ def _block_for_nodes_ready(
|
||||||
pool.target_dedicated_nodes == 0)):
|
pool.target_dedicated_nodes == 0)):
|
||||||
fatal_resize_error = True
|
fatal_resize_error = True
|
||||||
if fatal_resize_error:
|
if fatal_resize_error:
|
||||||
|
list_nodes(batch_client, config)
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
'Fatal resize errors encountered for pool {}: {}'.format(
|
'Fatal resize errors encountered for pool {}: {}'.format(
|
||||||
pool.id, os.linesep.join(errors)))
|
pool.id, os.linesep.join(errors)))
|
||||||
|
@ -269,7 +289,17 @@ def _block_for_nodes_ready(
|
||||||
logger.error(
|
logger.error(
|
||||||
'Resize errors encountered for pool {}: {}'.format(
|
'Resize errors encountered for pool {}: {}'.format(
|
||||||
pool.id, os.linesep.join(errors)))
|
pool.id, os.linesep.join(errors)))
|
||||||
nodes = list(batch_client.compute_node.list(pool.id))
|
# check pool allocation state
|
||||||
|
if pool.allocation_state == batchmodels.AllocationState.resizing:
|
||||||
|
nodes = []
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
nodes = list(batch_client.compute_node.list(pool.id))
|
||||||
|
except ssl.SSLError:
|
||||||
|
# SSL error happens sometimes on paging... this is probably
|
||||||
|
# a bug in the underlying msrest/msrestazure library that
|
||||||
|
# is reusing the SSL connection improperly
|
||||||
|
nodes = []
|
||||||
# check if any nodes are in start task failed state
|
# check if any nodes are in start task failed state
|
||||||
if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
|
if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
|
||||||
for node in nodes)):
|
for node in nodes)):
|
||||||
|
@ -301,7 +331,10 @@ def _block_for_nodes_ready(
|
||||||
_reboot_node(batch_client, pool.id, node.id, True)
|
_reboot_node(batch_client, pool.id, node.id, True)
|
||||||
reboot_map[node.id] += 1
|
reboot_map[node.id] += 1
|
||||||
# refresh node list to reflect rebooting states
|
# refresh node list to reflect rebooting states
|
||||||
nodes = list(batch_client.compute_node.list(pool.id))
|
try:
|
||||||
|
nodes = list(batch_client.compute_node.list(pool.id))
|
||||||
|
except ssl.SSLError:
|
||||||
|
nodes = []
|
||||||
else:
|
else:
|
||||||
# fast path check for start task failures in non-reboot mode
|
# fast path check for start task failures in non-reboot mode
|
||||||
logger.error(
|
logger.error(
|
||||||
|
@ -321,7 +354,6 @@ def _block_for_nodes_ready(
|
||||||
pool.target_low_priority_nodes) and
|
pool.target_low_priority_nodes) and
|
||||||
all(node.state in stopping_states for node in nodes)):
|
all(node.state in stopping_states for node in nodes)):
|
||||||
if any(node.state not in end_states for node in nodes):
|
if any(node.state not in end_states for node in nodes):
|
||||||
# list nodes of pool
|
|
||||||
list_nodes(batch_client, config)
|
list_nodes(batch_client, config)
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
('Node(s) of pool {} not in {} state. Please inspect the '
|
('Node(s) of pool {} not in {} state. Please inspect the '
|
||||||
|
@ -336,14 +368,48 @@ def _block_for_nodes_ready(
|
||||||
i = 0
|
i = 0
|
||||||
logger.debug(
|
logger.debug(
|
||||||
('waiting for {} dedicated nodes and {} low priority nodes '
|
('waiting for {} dedicated nodes and {} low priority nodes '
|
||||||
'to reach desired state').format(
|
'to reach desired state in pool {} with '
|
||||||
|
'allocation_state={}').format(
|
||||||
pool.target_dedicated_nodes,
|
pool.target_dedicated_nodes,
|
||||||
pool.target_low_priority_nodes))
|
pool.target_low_priority_nodes,
|
||||||
for node in nodes:
|
pool.id,
|
||||||
logger.debug('{}: {}'.format(node.id, node.state))
|
pool.allocation_state))
|
||||||
|
if len(nodes) < 10:
|
||||||
|
for node in nodes:
|
||||||
|
logger.debug('{}: {}'.format(node.id, node.state))
|
||||||
|
else:
|
||||||
|
logger.debug(_node_state_counts(nodes))
|
||||||
time.sleep(10)
|
time.sleep(10)
|
||||||
|
|
||||||
|
|
||||||
|
def _node_state_counts(nodes):
    # type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
    """Tally how many of the given nodes are in each compute node state
    :param list nodes: list of nodes
    :rtype: NodeStateCountCollection
    :return: collection holding, for each state field, the number of
        nodes whose state equals that state
    """
    states = batchmodels.ComputeNodeState
    # snapshot the states once, then count occurrences per state
    observed = [node.state for node in nodes]
    tally = observed.count
    return NodeStateCountCollection(
        creating=tally(states.creating),
        idle=tally(states.idle),
        leaving_pool=tally(states.leaving_pool),
        offline=tally(states.offline),
        preempted=tally(states.preempted),
        rebooting=tally(states.rebooting),
        reimaging=tally(states.reimaging),
        running=tally(states.running),
        start_task_failed=tally(states.start_task_failed),
        starting=tally(states.starting),
        unknown=tally(states.unknown),
        unusable=tally(states.unusable),
        waiting_for_start_task=tally(states.waiting_for_start_task),
    )
|
||||||
|
|
||||||
|
|
||||||
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
|
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
|
||||||
# type: (batch.BatchServiceClient, dict, str,
|
# type: (batch.BatchServiceClient, dict, str,
|
||||||
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
|
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
|
||||||
|
@ -751,25 +817,45 @@ def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
|
||||||
_reboot_node(batch_client, pool_id, node_id, False)
|
_reboot_node(batch_client, pool_id, node_id, False)
|
||||||
|
|
||||||
|
|
||||||
def del_node(batch_client, config, all_start_task_failed, node_id):
    # type: (batch.BatchServiceClient, dict, bool, str) -> None
    """Delete a node, or all start task failed nodes, from a pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes;
        when set, node_id is not consulted
    :param str node_id: node id to delete
    :raises ValueError: if all_start_task_failed is not set and node_id
        is None or empty
    """
    node_ids = []
    pool_id = settings.pool_id(config)
    if all_start_task_failed:
        # let the service filter by state so only failed nodes are returned
        nodes = list(
            batch_client.compute_node.list(
                pool_id=pool_id,
                compute_node_list_options=batchmodels.ComputeNodeListOptions(
                    filter='state eq \'starttaskfailed\'',
                ),
            ))
        # each node is confirmed individually before being queued
        for node in nodes:
            if util.confirm_action(
                    config, 'delete node {} from {} pool'.format(
                        node.id, pool_id)):
                node_ids.append(node.id)
    else:
        if util.is_none_or_empty(node_id):
            raise ValueError('node id is invalid')
        if util.confirm_action(
                config, 'delete node {} from {} pool'.format(
                    node_id, pool_id)):
            node_ids.append(node_id)
    # nothing matched or every confirmation was declined
    if not node_ids:
        logger.warning('no nodes to delete from pool: {}'.format(pool_id))
        return
    logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
    # NOTE(review): all queued nodes are sent in a single remove request;
    # confirm the service limit on node_list size for very large pools
    batch_client.pool.remove_nodes(
        pool_id=pool_id,
        node_remove_parameter=batchmodels.NodeRemoveParameter(
            node_list=node_ids,
        )
    )
|
||||||
|
|
||||||
|
|
|
@ -2280,15 +2280,16 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
|
||||||
util.subprocess_with_output(ssh_cmd)
|
util.subprocess_with_output(ssh_cmd)
|
||||||
|
|
||||||
|
|
||||||
def action_pool_delnode(batch_client, config, all_start_task_failed, nodeid):
    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Pool Delnode
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
    :param bool all_start_task_failed: delete all start task failed nodes
    :param str nodeid: nodeid to delete
    """
    # thin pass-through; validation and confirmation happen in batch.del_node
    batch.del_node(batch_client, config, all_start_task_failed, nodeid)
|
||||||
|
|
||||||
|
|
||||||
def action_pool_rebootnode(
|
def action_pool_rebootnode(
|
||||||
|
|
|
@ -129,8 +129,8 @@ It is recommended to follow the steps outlined on
|
||||||
[this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
|
[this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
|
||||||
to install Batch Shipyard on a Python3 installation rather than the default
|
to install Batch Shipyard on a Python3 installation rather than the default
|
||||||
Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
|
Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
|
||||||
the system defaulted Python 2.7, the installation will work that environment
|
the system defaulted Python 2.7, the installation will work with that
|
||||||
as well.
|
environment as well.
|
||||||
|
|
||||||
The `install.sh` script supports isolated installation through a virtual
|
The `install.sh` script supports isolated installation through a virtual
|
||||||
environment so that other system-wide or user python dependencies are left
|
environment so that other system-wide or user python dependencies are left
|
||||||
|
|
|
@ -456,6 +456,9 @@ Azure Storage.
|
||||||
pool configuration file
|
pool configuration file
|
||||||
* `--wait` will wait for deletion to complete
|
* `--wait` will wait for deletion to complete
|
||||||
* `delnode` will delete the specified node from the pool
|
* `delnode` will delete the specified node from the pool
|
||||||
|
* `--all-start-task-failed` will delete all nodes in the start task
|
||||||
|
failed state
|
||||||
|
* `--nodeid` is the node id to delete
|
||||||
* `dsu` will delete the SSH user defined in the pool configuration file
|
* `dsu` will delete the SSH user defined in the pool configuration file
|
||||||
from all nodes in the specified pool
|
from all nodes in the specified pool
|
||||||
* `grls` will retrieve all of the remote login settings for every node
|
* `grls` will retrieve all of the remote login settings for every node
|
||||||
|
|
|
@ -20,9 +20,6 @@ once they are available for N-series VMs.
|
||||||
are available for N-series VMs.
|
are available for N-series VMs.
|
||||||
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
* `sku` should be `16.04-LTS`. Other skus will be supported once they are
|
||||||
available for N-series VMs.
|
available for N-series VMs.
|
||||||
* `gpu` property should be specified with the following members:
|
|
||||||
* `nvidia_driver` property contains the following members:
|
|
||||||
* `source` is a URL for the driver installer .run file
|
|
||||||
|
|
||||||
### Global Configuration
|
### Global Configuration
|
||||||
The global configuration should set the following properties:
|
The global configuration should set the following properties:
|
||||||
|
|
|
@ -15,11 +15,6 @@
|
||||||
"ssh": {
|
"ssh": {
|
||||||
"username": "docker"
|
"username": "docker"
|
||||||
},
|
},
|
||||||
"gpu": {
|
|
||||||
"nvidia_driver": {
|
|
||||||
"source": "<URL for nvidia driver for STANDARD_NV VMs>"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"reboot_on_start_task_failed": false,
|
"reboot_on_start_task_failed": false,
|
||||||
"block_until_all_global_resources_loaded": true
|
"block_until_all_global_resources_loaded": true
|
||||||
}
|
}
|
||||||
|
|
|
@ -119,6 +119,16 @@ if [ -z $version ]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Test whether one string contains another as a literal substring.
#   $1: string to search
#   $2: substring to look for
# Returns 0 if $2 occurs in $1, 1 otherwise. An empty $2 yields 1 because
# stripping an empty match leaves the string unchanged.
contains() {
    local string="$1"
    local substring="$2"
    # Strip the shortest prefix ending in the substring; if anything was
    # stripped the substring is present. The inner quotes force a literal
    # match so glob characters (*, ?, [) in the substring are not treated
    # as a pattern.
    if [ "${string#*"$substring"}" != "$string" ]; then
        return 0
    fi
    return 1
}
|
||||||
|
|
||||||
check_for_buggy_ntfs_mount() {
|
check_for_buggy_ntfs_mount() {
|
||||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||||
set +e
|
set +e
|
||||||
|
@ -187,7 +197,7 @@ refresh_package_index() {
|
||||||
fi
|
fi
|
||||||
let retries=retries-1
|
let retries=retries-1
|
||||||
if [ $retries -eq 0 ]; then
|
if [ $retries -eq 0 ]; then
|
||||||
echo "Could not update package index"
|
echo "ERROR: Could not update package index"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
|
@ -213,7 +223,7 @@ install_packages() {
|
||||||
fi
|
fi
|
||||||
let retries=retries-1
|
let retries=retries-1
|
||||||
if [ $retries -eq 0 ]; then
|
if [ $retries -eq 0 ]; then
|
||||||
echo "Could not install packages: $*"
|
echo "ERROR: Could not install packages: $*"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
|
@ -221,6 +231,35 @@ install_packages() {
|
||||||
set -e
|
set -e
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Pull a docker image, retrying while the registry looks overloaded.
#   $1: image to pull
# Up to 60 attempts are made. A failed pull is retried only when its combined
# stdout/stderr contains "toomanyrequests" or "connection reset by peer"; any
# other failure exits the script with docker's exit code. A random 1-5 second
# sleep separates attempts. errexit is disabled for the duration of the loop
# and re-enabled once the pull succeeds.
docker_pull_image() {
    image=$1
    set +e
    retries=60
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull "$image" 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be invoked
        # directly; wrapped in [ ] its name is just a string operand to test,
        # which never runs the function and so never allows a retry.
        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
            # printf so the newline is real; bash echo prints \n literally
            printf 'WARNING: will retry:\n%s\n' "$pull_out"
        else
            printf 'ERROR:\n%s\n' "$pull_out"
            exit $rc
        fi
        retries=$((retries - 1))
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        # random backoff so many nodes do not retry in lockstep
        sleep $((RANDOM % 5 + 1))s
    done
    set -e
}
|
||||||
|
|
||||||
# check sdb1 mount
|
# check sdb1 mount
|
||||||
check_for_buggy_ntfs_mount
|
check_for_buggy_ntfs_mount
|
||||||
|
|
||||||
|
@ -269,10 +308,10 @@ fi
|
||||||
|
|
||||||
# check if we're coming up from a reboot
|
# check if we're coming up from a reboot
|
||||||
if [ -f $cascadefailed ]; then
|
if [ -f $cascadefailed ]; then
|
||||||
echo "$cascadefailed file exists, assuming cascade failure during node prep"
|
echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
|
||||||
exit 1
|
exit 1
|
||||||
elif [ -f $nodeprepfinished ]; then
|
elif [ -f $nodeprepfinished ]; then
|
||||||
echo "$nodeprepfinished file exists, assuming successful completion of node prep"
|
echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -343,11 +382,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
|
||||||
repo=https://download.docker.com/linux/debian
|
repo=https://download.docker.com/linux/debian
|
||||||
dockerversion=${dockerversion}debian
|
dockerversion=${dockerversion}debian
|
||||||
else
|
else
|
||||||
echo "unsupported sku: $sku for offer: $offer"
|
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
|
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
|
||||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# reload network settings
|
# reload network settings
|
||||||
|
@ -375,7 +414,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
|
||||||
fi
|
fi
|
||||||
let retries=retries-1
|
let retries=retries-1
|
||||||
if [ $retries -eq 0 ]; then
|
if [ $retries -eq 0 ]; then
|
||||||
echo "Could not add key for docker repo"
|
echo "ERROR: Could not add key for docker repo"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
|
@ -470,7 +509,7 @@ EOF
|
||||||
set +e
|
set +e
|
||||||
while :
|
while :
|
||||||
do
|
do
|
||||||
echo "Attempting to create nvidia-docker volume with version $nvdriverver"
|
echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
|
||||||
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
|
docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
|
||||||
if [ $? -eq 0 ]; then
|
if [ $? -eq 0 ]; then
|
||||||
break
|
break
|
||||||
|
@ -479,7 +518,7 @@ EOF
|
||||||
NV_DIFF=$((($NV_NOW-$NV_START)/60))
|
NV_DIFF=$((($NV_NOW-$NV_START)/60))
|
||||||
# fail after 5 minutes of attempts
|
# fail after 5 minutes of attempts
|
||||||
if [ $NV_DIFF -ge 5 ]; then
|
if [ $NV_DIFF -ge 5 ]; then
|
||||||
echo "could not create nvidia-docker volume"
|
echo "ERROR: could not create nvidia-docker volume"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
|
@ -507,7 +546,7 @@ EOF
|
||||||
elif [ $server_type == "glusterfs" ]; then
|
elif [ $server_type == "glusterfs" ]; then
|
||||||
install_packages $offer glusterfs-client acl
|
install_packages $offer glusterfs-client acl
|
||||||
else
|
else
|
||||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
@ -525,12 +564,12 @@ EOF
|
||||||
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
|
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
|
||||||
# ensure container only support
|
# ensure container only support
|
||||||
if [ $cascadecontainer -eq 0 ]; then
|
if [ $cascadecontainer -eq 0 ]; then
|
||||||
echo "only supported through shipyard container"
|
echo "ERROR: only supported through shipyard container"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# gpu is not supported on these offers
|
# gpu is not supported on these offers
|
||||||
if [ ! -z $gpu ]; then
|
if [ ! -z $gpu ]; then
|
||||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
if [[ $sku == 7.* ]]; then
|
if [[ $sku == 7.* ]]; then
|
||||||
|
@ -542,7 +581,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
||||||
gfsenable="systemctl enable glusterd"
|
gfsenable="systemctl enable glusterd"
|
||||||
rpcbindenable="systemctl enable rpcbind"
|
rpcbindenable="systemctl enable rpcbind"
|
||||||
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4
|
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4
|
||||||
echo "oracle linux is not supported at this time"
|
echo "ERROR: oracle linux is not supported at this time"
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
srvenable="chkconfig docker on"
|
srvenable="chkconfig docker on"
|
||||||
|
@ -552,7 +591,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
||||||
rpcbindenable="chkconfig rpcbind on"
|
rpcbindenable="chkconfig rpcbind on"
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "unsupported sku: $sku for offer: $offer"
|
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# reload network settings
|
# reload network settings
|
||||||
|
@ -605,7 +644,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
||||||
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
|
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
|
||||||
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
|
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
|
||||||
else
|
else
|
||||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
@ -613,12 +652,12 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
|
||||||
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||||
# ensure container only support
|
# ensure container only support
|
||||||
if [ $cascadecontainer -eq 0 ]; then
|
if [ $cascadecontainer -eq 0 ]; then
|
||||||
echo "only supported through shipyard container"
|
echo "ERROR: only supported through shipyard container"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# gpu is not supported on these offers
|
# gpu is not supported on these offers
|
||||||
if [ ! -z $gpu ]; then
|
if [ ! -z $gpu ]; then
|
||||||
echo "gpu unsupported on this sku: $sku for offer $offer"
|
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# reload network settings
|
# reload network settings
|
||||||
|
@ -648,7 +687,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||||
SUSEConnect -p sle-module-containers/12/x86_64 -r ''
|
SUSEConnect -p sle-module-containers/12/x86_64 -r ''
|
||||||
fi
|
fi
|
||||||
if [ -z $repodir ]; then
|
if [ -z $repodir ]; then
|
||||||
echo "unsupported sku: $sku for offer: $offer"
|
echo "ERROR: unsupported sku: $sku for offer: $offer"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# update index
|
# update index
|
||||||
|
@ -694,7 +733,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||||
zypper -n --gpg-auto-import-keys ref
|
zypper -n --gpg-auto-import-keys ref
|
||||||
install_packages $offer glusterfs acl
|
install_packages $offer glusterfs acl
|
||||||
else
|
else
|
||||||
echo "Unknown file server type ${sc[0]} for ${sc[1]}"
|
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
@ -702,7 +741,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||||
# if hpc sku, set up intel mpi
|
# if hpc sku, set up intel mpi
|
||||||
if [[ $offer == sles-hpc* ]]; then
|
if [[ $offer == sles-hpc* ]]; then
|
||||||
if [ $sku != "12-sp1" ]; then
|
if [ $sku != "12-sp1" ]; then
|
||||||
echo "unsupported sku for intel mpi setup on SLES"
|
echo "ERROR: unsupported sku for intel mpi setup on SLES"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
install_packages $offer lsb
|
install_packages $offer lsb
|
||||||
|
@ -712,13 +751,13 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "unsupported offer: $offer (sku: $sku)"
|
echo "ERROR: unsupported offer: $offer (sku: $sku)"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# retrieve docker images related to data movement
|
# retrieve docker images related to data movement
|
||||||
docker pull alfpark/blobxfer:$blobxferversion
|
docker_pull_image alfpark/blobxfer:$blobxferversion
|
||||||
docker pull alfpark/batch-shipyard:tfm-$version
|
docker_pull_image alfpark/batch-shipyard:tfm-$version
|
||||||
|
|
||||||
# login to registry server
|
# login to registry server
|
||||||
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
||||||
|
@ -734,15 +773,15 @@ if [ ! -z $sc_args ]; then
|
||||||
for sc_arg in ${sc_args[@]}; do
|
for sc_arg in ${sc_args[@]}; do
|
||||||
IFS=':' read -ra sc <<< "$sc_arg"
|
IFS=':' read -ra sc <<< "$sc_arg"
|
||||||
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
|
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
|
||||||
echo "Creating host directory for storage cluster $sc_arg at $mountpoint"
|
echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
|
||||||
mkdir -p $mountpoint
|
mkdir -p $mountpoint
|
||||||
chmod 777 $mountpoint
|
chmod 777 $mountpoint
|
||||||
echo "Adding $mountpoint to fstab"
|
echo "INFO: Adding $mountpoint to fstab"
|
||||||
# eval fstab var to expand vars (this is ok since it is set by shipyard)
|
# eval fstab var to expand vars (this is ok since it is set by shipyard)
|
||||||
fstab_entry="${fstabs[$i]}"
|
fstab_entry="${fstabs[$i]}"
|
||||||
echo $fstab_entry >> /etc/fstab
|
echo $fstab_entry >> /etc/fstab
|
||||||
tail -n1 /etc/fstab
|
tail -n1 /etc/fstab
|
||||||
echo "Mounting $mountpoint"
|
echo "INFO: Mounting $mountpoint"
|
||||||
START=$(date -u +"%s")
|
START=$(date -u +"%s")
|
||||||
set +e
|
set +e
|
||||||
while :
|
while :
|
||||||
|
@ -755,14 +794,14 @@ if [ ! -z $sc_args ]; then
|
||||||
DIFF=$((($NOW-$START)/60))
|
DIFF=$((($NOW-$START)/60))
|
||||||
# fail after 5 minutes of attempts
|
# fail after 5 minutes of attempts
|
||||||
if [ $DIFF -ge 5 ]; then
|
if [ $DIFF -ge 5 ]; then
|
||||||
echo "Could not mount storage cluster $sc_arg on: $mountpoint"
|
echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 1
|
sleep 1
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
set -e
|
set -e
|
||||||
echo "$mountpoint mounted."
|
echo "INFO: $mountpoint mounted."
|
||||||
i=$(($i + 1))
|
i=$(($i + 1))
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
@ -805,6 +844,8 @@ p2p=$p2p
|
||||||
`env | grep DOCKER_LOGIN_`
|
`env | grep DOCKER_LOGIN_`
|
||||||
EOF
|
EOF
|
||||||
chmod 600 $envfile
|
chmod 600 $envfile
|
||||||
|
# pull image
|
||||||
|
docker_pull_image alfpark/batch-shipyard:cascade-$version
|
||||||
# launch container
|
# launch container
|
||||||
docker run $detached --net=host --env-file $envfile \
|
docker run $detached --net=host --env-file $envfile \
|
||||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||||
|
@ -847,7 +888,7 @@ if [ $p2penabled -eq 0 ]; then
|
||||||
wait $cascadepid
|
wait $cascadepid
|
||||||
rc=$?
|
rc=$?
|
||||||
if [ $rc -ne 0 ]; then
|
if [ $rc -ne 0 ]; then
|
||||||
echo "cascade exited with non-zero exit code: $rc"
|
echo "ERROR: cascade exited with non-zero exit code: $rc"
|
||||||
rm -f $nodeprepfinished
|
rm -f $nodeprepfinished
|
||||||
exit $rc
|
exit $rc
|
||||||
fi
|
fi
|
||||||
|
@ -859,7 +900,7 @@ rm -f $cascadefailed
|
||||||
|
|
||||||
# block until images ready if specified
|
# block until images ready if specified
|
||||||
if [ ! -z $block ]; then
|
if [ ! -z $block ]; then
|
||||||
echo "blocking until images ready: $block"
|
echo "INFO: blocking until images ready: $block"
|
||||||
IFS=',' read -ra RES <<< "$block"
|
IFS=',' read -ra RES <<< "$block"
|
||||||
declare -a missing
|
declare -a missing
|
||||||
while :
|
while :
|
||||||
|
@ -870,7 +911,7 @@ if [ ! -z $block ]; then
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
if [ ${#missing[@]} -eq 0 ]; then
|
if [ ${#missing[@]} -eq 0 ]; then
|
||||||
echo "all docker images present"
|
echo "INFO: all docker images present"
|
||||||
break
|
break
|
||||||
else
|
else
|
||||||
unset missing
|
unset missing
|
||||||
|
|
|
@ -81,6 +81,16 @@ done
|
||||||
shift $((OPTIND-1))
|
shift $((OPTIND-1))
|
||||||
[ "$1" = "--" ] && shift
|
[ "$1" = "--" ] && shift
|
||||||
|
|
||||||
|
# Test whether one string occurs literally inside another.
#
# Arguments:
#   $1 - string to search in
#   $2 - substring to look for (matched literally, never as a glob)
# Returns:
#   0 if $2 occurs in $1, 1 otherwise. An empty $2 yields 1, because
#   stripping an empty prefix leaves $1 unchanged.
contains() {
    # Strip the shortest prefix ending in the substring; if anything was
    # removed, the substring is present. The inner quotes around $2 keep
    # characters such as * ? [ from being interpreted as pattern syntax.
    if test "${1#*"$2"}" != "$1"; then
        return 0
    else
        return 1
    fi
}
|
||||||
|
|
||||||
check_for_buggy_ntfs_mount() {
|
check_for_buggy_ntfs_mount() {
|
||||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||||
set +e
|
set +e
|
||||||
|
@ -249,6 +259,35 @@ install_azurefile_docker_volume_driver() {
|
||||||
./azurefile-dockervolume-create.sh
|
./azurefile-dockervolume-create.sh
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Pull a docker image, retrying on transient registry failures.
#
# Arguments:
#   $1 - image reference to pull
# Behavior:
#   Runs `docker pull` up to 60 times, sleeping a random 1-5 seconds between
#   attempts. Only failures whose output contains "toomanyrequests" or
#   "connection reset by peer" are retried; any other failure prints the
#   pull output and exits the script with docker's exit code. If all
#   attempts are used up, the script exits with the last exit code.
# Side effects:
#   errexit is switched off while pulling and unconditionally switched back
#   on (set -e) before returning.
docker_pull_image() {
    # Keep working variables local so callers' globals (e.g. rc) are not
    # clobbered. pull_out is declared separately from its assignment so
    # that $? reflects docker's exit status rather than that of `local`.
    local image=$1
    local retries=60
    local pull_out
    local rc
    set +e
    while [ $retries -gt 0 ]; do
        pull_out=$(docker pull "$image" 2>&1)
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "$pull_out"
            break
        fi
        # non-zero exit code: check if pull output has toomanyrequests or
        # connection resets. contains is a shell function and must be
        # invoked directly; placing it inside [ ] hands the words to test,
        # which errors out and never reaches the retry branch.
        if contains "$pull_out" "toomanyrequests" || \
                contains "$pull_out" "connection reset by peer"; then
            # printf is used because bash's echo does not expand \n
            printf 'WARNING: will retry:\n%s\n' "$pull_out"
        else
            printf 'ERROR:\n%s\n' "$pull_out"
            exit $rc
        fi
        retries=$((retries - 1))
        if [ $retries -le 0 ]; then
            echo "ERROR: Could not pull docker image: $image"
            exit $rc
        fi
        # random backoff so many nodes do not retry in lockstep
        sleep $(( (RANDOM % 5) + 1 ))s
    done
    set -e
}
|
||||||
|
|
||||||
# try to get /etc/lsb-release
|
# try to get /etc/lsb-release
|
||||||
if [ -e /etc/lsb-release ]; then
|
if [ -e /etc/lsb-release ]; then
|
||||||
. /etc/lsb-release
|
. /etc/lsb-release
|
||||||
|
@ -397,8 +436,8 @@ if [ ! -z $sc_args ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# retrieve docker images related to data movement
|
# retrieve docker images related to data movement
|
||||||
docker pull alfpark/blobxfer:$blobxferversion
|
docker_pull_image alfpark/blobxfer:$blobxferversion
|
||||||
docker pull alfpark/batch-shipyard:tfm-$version
|
docker_pull_image alfpark/batch-shipyard:tfm-$version
|
||||||
|
|
||||||
# login to registry server
|
# login to registry server
|
||||||
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
|
||||||
|
@ -442,6 +481,8 @@ p2p=$p2p
|
||||||
`env | grep DOCKER_LOGIN_`
|
`env | grep DOCKER_LOGIN_`
|
||||||
EOF
|
EOF
|
||||||
chmod 600 $envfile
|
chmod 600 $envfile
|
||||||
|
# pull image
|
||||||
|
docker_pull_image alfpark/batch-shipyard:cascade-$version
|
||||||
# launch container
|
# launch container
|
||||||
docker run $detached --net=host --env-file $envfile \
|
docker run $detached --net=host --env-file $envfile \
|
||||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||||
|
|
|
@ -1181,6 +1181,10 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
|
||||||
|
|
||||||
|
|
||||||
@pool.command('delnode')
|
@pool.command('delnode')
|
||||||
|
@click.option(
|
||||||
|
'--all-start-task-failed',
|
||||||
|
is_flag=True,
|
||||||
|
help='Deleted all nodes with start task failed state')
|
||||||
@click.option(
|
@click.option(
|
||||||
'--nodeid', help='NodeId of compute node in pool to delete')
|
'--nodeid', help='NodeId of compute node in pool to delete')
|
||||||
@common_options
|
@common_options
|
||||||
|
@ -1188,10 +1192,11 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
|
||||||
@keyvault_options
|
@keyvault_options
|
||||||
@aad_options
|
@aad_options
|
||||||
@pass_cli_context
|
@pass_cli_context
|
||||||
def pool_delnode(ctx, nodeid):
|
def pool_delnode(ctx, all_start_task_failed, nodeid):
|
||||||
"""Delete a node from a pool"""
|
"""Delete a node from a pool"""
|
||||||
ctx.initialize_for_batch()
|
ctx.initialize_for_batch()
|
||||||
convoy.fleet.action_pool_delnode(ctx.batch_client, ctx.config, nodeid)
|
convoy.fleet.action_pool_delnode(
|
||||||
|
ctx.batch_client, ctx.config, all_start_task_failed, nodeid)
|
||||||
|
|
||||||
|
|
||||||
@pool.command('rebootnode')
|
@pool.command('rebootnode')
|
||||||
|
|
Загрузка…
Ссылка в новой задаче