Add wait option for pool resize
- Fix TMPDIR sed command - Add generated shipyard script to gitignore
This commit is contained in:
Родитель
eb2f108e86
Коммит
efb8c3105f
|
@ -92,6 +92,7 @@ ENV/
|
|||
.ropeproject
|
||||
|
||||
# project specific ignores
|
||||
shipyard
|
||||
ssh_docker_tunnel_shipyard.sh
|
||||
id_rsa_shipyard*
|
||||
resources/azurefile-dockervolume-create.sh
|
||||
|
|
|
@ -11,7 +11,7 @@ RUN apk update \
|
|||
&& pip3 install --no-cache-dir --upgrade pip \
|
||||
&& git clone https://github.com/Azure/batch-shipyard.git /opt/batch-shipyard \
|
||||
&& cd /opt/batch-shipyard \
|
||||
&& rm -rf .git* .travis.yml install.sh shipyard \
|
||||
&& rm -rf .git* .travis.yml install.sh \
|
||||
&& pip3 install -r requirements.txt \
|
||||
&& apk del --purge \
|
||||
build-base python3-dev openssl-dev libffi-dev git \
|
||||
|
|
|
@ -29,6 +29,7 @@ import argparse
|
|||
import copy
|
||||
import datetime
|
||||
import json
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
# non-stdlib imports
|
||||
|
@ -282,7 +283,7 @@ def graph_data(data: dict, sizes: dict, offer: str, sku: str):
|
|||
mintime = start
|
||||
if start > maxtime:
|
||||
maxtime = start
|
||||
print('delta:', maxtime - mintime)
|
||||
print('nodeready variance:', maxtime - mintime)
|
||||
total_gr = 0
|
||||
total_ac = 0
|
||||
with open(dat_fname, 'w') as f:
|
||||
|
@ -383,10 +384,21 @@ def main():
|
|||
# get command-line args
|
||||
args = parseargs()
|
||||
|
||||
if args.configdir is not None:
|
||||
if args.credentials is None:
|
||||
args.credentials = str(
|
||||
pathlib.Path(args.configdir, 'credentials.json'))
|
||||
if args.config is None:
|
||||
args.config = str(pathlib.Path(args.configdir, 'config.json'))
|
||||
if args.pool is None:
|
||||
args.pool = str(pathlib.Path(args.configdir, 'pool.json'))
|
||||
|
||||
if args.credentials is None:
|
||||
raise ValueError('credentials json not specified')
|
||||
if args.config is None:
|
||||
raise ValueError('config json not specified')
|
||||
if args.pool is None:
|
||||
raise ValueError('pool json not specified')
|
||||
|
||||
with open(args.credentials, 'r') as f:
|
||||
config = json.load(f)
|
||||
|
@ -408,7 +420,9 @@ def parseargs():
|
|||
:return: parsed arguments
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Shipyard perf graph generator')
|
||||
description='Batch Shipyard perf graph generator')
|
||||
parser.add_argument(
|
||||
'--configdir', help='json config dir')
|
||||
parser.add_argument(
|
||||
'--credentials', help='credentials json config')
|
||||
parser.add_argument(
|
||||
|
|
|
@ -107,7 +107,7 @@ def parseargs():
|
|||
:return: parsed arguments
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Shipyard perf recorder')
|
||||
description='Batch Shipyard perf recorder')
|
||||
parser.add_argument('source', help='event source')
|
||||
parser.add_argument('event', help='event')
|
||||
parser.add_argument('--ts', help='timestamp (posix)')
|
||||
|
|
|
@ -215,16 +215,19 @@ def _retrieve_outputs_from_failed_nodes(batch_client, config, nodeid=None):
|
|||
|
||||
|
||||
def _block_for_nodes_ready(
|
||||
batch_client, config, node_state, pool_id, reboot_on_failed):
|
||||
batch_client, config, stopping_states, end_states, pool_id,
|
||||
reboot_on_failed):
|
||||
# type: (batch.BatchServiceClient, dict,
|
||||
# List[batchmodels.ComputeNodeState],
|
||||
# List[batchmodels.ComputeNodeState], str,
|
||||
# bool) -> List[batchmodels.ComputeNode]
|
||||
"""Wait for nodes to enter "ready": steady state and all nodes in
|
||||
specified states
|
||||
"""Wait for pool to enter steady state and all nodes to enter stopping
|
||||
states
|
||||
:param batch_client: The batch client to use.
|
||||
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
|
||||
:param dict config: configuration dict
|
||||
:param list node_state: list of acceptable node states
|
||||
:param list stopping_states: list of node states to stop polling
|
||||
:param list end_states: list of acceptable end states
|
||||
:param str pool_id: pool id
|
||||
:param bool reboot_on_failed: reboot node on failed start state
|
||||
:rtype: list
|
||||
|
@ -232,7 +235,7 @@ def _block_for_nodes_ready(
|
|||
"""
|
||||
logger.info(
|
||||
'waiting for all nodes in pool {} to reach one of: {!r}'.format(
|
||||
pool_id, node_state))
|
||||
pool_id, stopping_states))
|
||||
i = 0
|
||||
reboot_map = {}
|
||||
while True:
|
||||
|
@ -289,15 +292,15 @@ def _block_for_nodes_ready(
|
|||
'non-transient, please submit an issue on '
|
||||
'GitHub.').format(pool.id))
|
||||
if (len(nodes) >= pool.target_dedicated and
|
||||
all(node.state in node_state for node in nodes)):
|
||||
if any(node.state != batchmodels.ComputeNodeState.idle
|
||||
for node in nodes):
|
||||
all(node.state in stopping_states for node in nodes)):
|
||||
if any(node.state not in end_states for node in nodes):
|
||||
# list nodes of pool
|
||||
list_nodes(batch_client, config)
|
||||
raise RuntimeError(
|
||||
'Node(s) of pool {} not in idle state. Please inspect '
|
||||
'the state of nodes in the pool. Please retry pool '
|
||||
'creation by deleting and recreating the pool.')
|
||||
('Node(s) of pool {} not in {} state. Please inspect '
|
||||
'the state of nodes in the pool. Please retry pool '
|
||||
'creation by deleting and recreating the pool.').format(
|
||||
pool.id, end_states))
|
||||
else:
|
||||
return nodes
|
||||
i += 1
|
||||
|
@ -310,30 +313,38 @@ def _block_for_nodes_ready(
|
|||
time.sleep(10)
|
||||
|
||||
|
||||
def wait_for_pool_ready(batch_client, config, pool_id):
|
||||
# type: (batch.BatchServiceClient, dict, str) ->
|
||||
# List[batchmodels.ComputeNode]
|
||||
"""Wait for pool to enter "ready": steady state and all nodes idle
|
||||
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
|
||||
# type: (batch.BatchServiceClient, dict, str,
|
||||
# List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
|
||||
"""Wait for pool to enter steady state and all nodes in end states
|
||||
:param batch_client: The batch client to use.
|
||||
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
|
||||
:param dict config: configuration dict
|
||||
:param str pool_id: pool id
|
||||
:param list addl_end_states: additional end states
|
||||
:rtype: list
|
||||
:return: list of nodes
|
||||
"""
|
||||
# wait for pool idle
|
||||
node_state = frozenset(
|
||||
(batchmodels.ComputeNodeState.starttaskfailed,
|
||||
batchmodels.ComputeNodeState.unusable,
|
||||
batchmodels.ComputeNodeState.idle)
|
||||
)
|
||||
base_stopping_states = [
|
||||
batchmodels.ComputeNodeState.starttaskfailed,
|
||||
batchmodels.ComputeNodeState.unusable,
|
||||
batchmodels.ComputeNodeState.idle
|
||||
]
|
||||
base_end_states = [batchmodels.ComputeNodeState.idle]
|
||||
try:
|
||||
reboot_on_failed = config[
|
||||
'pool_specification']['reboot_on_start_task_failed']
|
||||
except KeyError:
|
||||
reboot_on_failed = False
|
||||
if addl_end_states is not None and len(addl_end_states) > 0:
|
||||
base_stopping_states.extend(addl_end_states)
|
||||
base_end_states.extend(addl_end_states)
|
||||
stopping_states = frozenset(base_stopping_states)
|
||||
end_states = frozenset(base_end_states)
|
||||
nodes = _block_for_nodes_ready(
|
||||
batch_client, config, node_state, pool_id, reboot_on_failed)
|
||||
batch_client, config, stopping_states, end_states, pool_id,
|
||||
reboot_on_failed)
|
||||
list_nodes(batch_client, config, nodes=nodes)
|
||||
return nodes
|
||||
|
||||
|
||||
|
@ -556,16 +567,21 @@ def list_pools(batch_client):
|
|||
logger.error('no pools found')
|
||||
|
||||
|
||||
def resize_pool(batch_client, config):
|
||||
# type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None
|
||||
def resize_pool(batch_client, config, wait=False):
|
||||
# type: (azure.batch.batch_service_client.BatchServiceClient, dict,
|
||||
# bool) -> list
|
||||
"""Resize a pool
|
||||
:param batch_client: The batch client to use.
|
||||
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
|
||||
:param dict config: configuration dict
|
||||
:param bool wait: wait for operation to complete
|
||||
:rtype: list or None
|
||||
:return: list of nodes if wait or None
|
||||
"""
|
||||
pool_id = config['pool_specification']['id']
|
||||
vm_count = int(config['pool_specification']['vm_count'])
|
||||
logger.info('Resizing pool {} to {}'.format(pool_id, vm_count))
|
||||
logger.info('Resizing pool {} to {} compute nodes'.format(
|
||||
pool_id, vm_count))
|
||||
batch_client.pool.resize(
|
||||
pool_id=pool_id,
|
||||
pool_resize_parameter=batchmodels.PoolResizeParameter(
|
||||
|
@ -573,6 +589,10 @@ def resize_pool(batch_client, config):
|
|||
resize_timeout=datetime.timedelta(minutes=20),
|
||||
)
|
||||
)
|
||||
if wait:
|
||||
return wait_for_pool_ready(
|
||||
batch_client, config, pool_id,
|
||||
addl_end_states=[batchmodels.ComputeNodeState.running])
|
||||
|
||||
|
||||
def del_pool(batch_client, config):
|
||||
|
@ -1037,16 +1057,18 @@ def terminate_tasks(batch_client, config, jobid=None, taskid=None, wait=False):
|
|||
continue
|
||||
|
||||
|
||||
def list_nodes(batch_client, config):
|
||||
# type: (batch.BatchServiceClient, dict) -> None
|
||||
def list_nodes(batch_client, config, nodes=None):
|
||||
# type: (batch.BatchServiceClient, dict, list) -> None
|
||||
"""Get a list of nodes
|
||||
:param batch_client: The batch client to use.
|
||||
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
|
||||
:param dict config: configuration dict
|
||||
:param list nodes: list of nodes
|
||||
"""
|
||||
pool_id = config['pool_specification']['id']
|
||||
logger.debug('listing nodes for pool {}'.format(pool_id))
|
||||
nodes = batch_client.compute_node.list(pool_id)
|
||||
if nodes is None:
|
||||
nodes = batch_client.compute_node.list(pool_id)
|
||||
for node in nodes:
|
||||
logger.info(
|
||||
('node_id={} [state={} scheduling_state={} ip_address={} '
|
||||
|
|
|
@ -1049,14 +1049,15 @@ def action_pool_delete(
|
|||
time.sleep(3)
|
||||
|
||||
|
||||
def action_pool_resize(batch_client, blob_client, config):
|
||||
def action_pool_resize(batch_client, blob_client, config, wait):
|
||||
# type: (batch.BatchServiceClient, azureblob.BlockBlobService,
|
||||
# dict) -> None
|
||||
# dict, bool) -> None
|
||||
"""Resize pool that may contain glusterfs
|
||||
:param batch_client: The batch client to use.
|
||||
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
|
||||
:param azure.storage.blob.BlockBlobService blob_client: blob client
|
||||
:param dict config: configuration dict
|
||||
:param bool wait: wait for operation to complete
|
||||
"""
|
||||
pool_id = config['pool_specification']['id']
|
||||
# check if this is a glusterfs-enabled pool
|
||||
|
@ -1081,13 +1082,12 @@ def action_pool_resize(batch_client, blob_client, config):
|
|||
if gluster_present:
|
||||
for node in batch_client.compute_node.list(pool_id):
|
||||
old_nodes[node.id] = node.ip_address
|
||||
logger.debug('forcing wait to True due to glusterfs')
|
||||
wait = True
|
||||
# resize pool
|
||||
convoy.batch.resize_pool(batch_client, config)
|
||||
nodes = convoy.batch.resize_pool(batch_client, config, wait)
|
||||
# add brick for new nodes
|
||||
if gluster_present:
|
||||
# wait for nodes to reach idle
|
||||
nodes = convoy.batch.wait_for_pool_ready(
|
||||
batch_client, config, pool_id)
|
||||
# get internal ip addresses of new nodes
|
||||
new_nodes = [
|
||||
node.ip_address for node in nodes if node.id not in old_nodes
|
||||
|
|
|
@ -10,7 +10,7 @@ git clone https://github.com/Azure/batch-shipyard.git
|
|||
or [download the latest release](https://github.com/Azure/batch-shipyard/releases).
|
||||
|
||||
Batch Shipyard includes an installation script to simplify installation on
|
||||
a variety of recent platforms. This installation script can be used
|
||||
a variety of recent Linux distributions. This installation script can be used
|
||||
regardless of if you obtained Batch Shipyard through `git clone` or
|
||||
downloading a release package.
|
||||
|
||||
|
@ -70,9 +70,9 @@ line-endings (CRLF) then compute nodes will fail to start properly.
|
|||
## Manual Installation
|
||||
### Requirements
|
||||
The Batch Shipyard tool is written in Python. The client script is compatible
|
||||
with Python 2.7 or 3.3+. You will also need to install the
|
||||
[Azure Batch](https://pypi.python.org/pypi/azure-batch) and
|
||||
[Azure Storage](https://pypi.python.org/pypi/azure-storage) python packages.
|
||||
with Python 2.7 or 3.3+. You will also need to install dependent Python
|
||||
packages including the [Azure Batch](https://pypi.python.org/pypi/azure-batch)
|
||||
and [Azure Storage](https://pypi.python.org/pypi/azure-storage) packages.
|
||||
Installation can be performed using the [requirements.txt](../requirements.txt)
|
||||
file via the command `pip install --upgrade --user -r requirements.txt`
|
||||
(or via `pip3` for python3). If `pip` is not installed on your system,
|
||||
|
@ -80,7 +80,7 @@ please continue reading below. Note that this `pip` command should be run
|
|||
for every Batch Shipyard upgrade if not using `install.sh`.
|
||||
|
||||
Batch Shipyard has some Python dependencies which require a valid compiler,
|
||||
ssl, ffi, and python development libraries to be installed due to the
|
||||
ssl, ffi, and Python development libraries to be installed due to the
|
||||
[cryptography](https://pypi.python.org/pypi/cryptography) dependency on Linux.
|
||||
For Windows, binary wheels will be installed for dependencies, thus no
|
||||
development environment is needed. The following are example commands to
|
||||
|
@ -109,13 +109,11 @@ pip install --upgrade pip
|
|||
#### Note about Python 3.3+
|
||||
If installing for Python 3.3+, then simply use the Python3 equivalents for
|
||||
the python dependencies. For example, on Ubuntu/Debian:
|
||||
|
||||
```
|
||||
apt-get update
|
||||
apt-get install -y build-essential libssl-dev libffi-dev libpython3-dev python3-dev python3-pip
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
would install the proper dependencies for Python3.
|
||||
|
||||
### Data Movement Support
|
||||
|
|
|
@ -181,6 +181,7 @@ in the specified pool
|
|||
* `listnodes` will list all nodes in the specified pool
|
||||
* `resize` will resize the pool to the `vm_count` specified in the pool
|
||||
configuration file
|
||||
* `--wait` will wait for resize to complete
|
||||
|
||||
## Storage Command
|
||||
The `storage` command has the following sub-commands:
|
||||
|
|
|
@ -274,11 +274,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
|
|||
rm -f /var/lib/docker/network/files/local-kv.db
|
||||
if [ $name == "debian-jessie" ]; then
|
||||
mkdir -p /mnt/resource/docker-tmp
|
||||
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
|
||||
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
|
||||
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
|
||||
else
|
||||
mkdir -p /mnt/docker-tmp
|
||||
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/docker-tmp\"|;b};$q1' /etc/default/docker
|
||||
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/docker-tmp\" >> /etc/default/docker
|
||||
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\" >> /etc/default/docker
|
||||
fi
|
||||
|
||||
|
@ -429,7 +429,7 @@ EOF
|
|||
yum install -y docker-engine
|
||||
# modify docker opts
|
||||
mkdir -p /mnt/resource/docker-tmp
|
||||
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
|
||||
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
|
||||
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
|
||||
sed -i '/^\[Service\]/a EnvironmentFile=-/etc/default/docker' /lib/systemd/system/docker.service
|
||||
sed -i '/^ExecStart=/ s/$/ $DOCKER_OPTS/' /lib/systemd/system/docker.service
|
||||
|
@ -501,7 +501,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
|
|||
zypper -n in docker
|
||||
# modify docker opts, docker opts in /etc/sysconfig/docker
|
||||
mkdir -p /mnt/resource/docker-tmp
|
||||
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
|
||||
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
|
||||
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/sysconfig/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/sysconfig/docker
|
||||
systemctl daemon-reload
|
||||
# start docker service and enable docker daemon on boot
|
||||
|
|
|
@ -341,13 +341,15 @@ def pool_del(ctx, wait):
|
|||
|
||||
|
||||
@pool.command('resize')
|
||||
@click.option(
|
||||
'--wait', is_flag=True, help='Wait for pool resize to complete')
|
||||
@common_options
|
||||
@pass_cli_context
|
||||
def pool_resize(ctx):
|
||||
def pool_resize(ctx, wait):
|
||||
"""Resize a pool"""
|
||||
_setup_context(ctx)
|
||||
convoy.fleet.action_pool_resize(
|
||||
ctx.batch_client, ctx.blob_client, ctx.config)
|
||||
ctx.batch_client, ctx.blob_client, ctx.config, wait=wait)
|
||||
|
||||
|
||||
@pool.command('grls')
|
||||
|
|
Загрузка…
Ссылка в новой задаче