Add wait option for pool resize

- Fix TMPDIR sed command
- Add generated shipyard script to gitignore
This commit is contained in:
Fred Park 2016-10-30 01:33:15 -07:00
Родитель eb2f108e86
Коммит efb8c3105f
10 изменённых файлов: 89 добавлений и 51 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -92,6 +92,7 @@ ENV/
.ropeproject
# project specific ignores
shipyard
ssh_docker_tunnel_shipyard.sh
id_rsa_shipyard*
resources/azurefile-dockervolume-create.sh

Просмотреть файл

@ -11,7 +11,7 @@ RUN apk update \
&& pip3 install --no-cache-dir --upgrade pip \
&& git clone https://github.com/Azure/batch-shipyard.git /opt/batch-shipyard \
&& cd /opt/batch-shipyard \
&& rm -rf .git* .travis.yml install.sh shipyard \
&& rm -rf .git* .travis.yml install.sh \
&& pip3 install -r requirements.txt \
&& apk del --purge \
build-base python3-dev openssl-dev libffi-dev git \

Просмотреть файл

@ -29,6 +29,7 @@ import argparse
import copy
import datetime
import json
import pathlib
import subprocess
import sys
# non-stdlib imports
@ -282,7 +283,7 @@ def graph_data(data: dict, sizes: dict, offer: str, sku: str):
mintime = start
if start > maxtime:
maxtime = start
print('delta:', maxtime - mintime)
print('nodeready variance:', maxtime - mintime)
total_gr = 0
total_ac = 0
with open(dat_fname, 'w') as f:
@ -383,10 +384,21 @@ def main():
# get command-line args
args = parseargs()
if args.configdir is not None:
if args.credentials is None:
args.credentials = str(
pathlib.Path(args.configdir, 'credentials.json'))
if args.config is None:
args.config = str(pathlib.Path(args.configdir, 'config.json'))
if args.pool is None:
args.pool = str(pathlib.Path(args.configdir, 'pool.json'))
if args.credentials is None:
raise ValueError('credentials json not specified')
if args.config is None:
raise ValueError('config json not specified')
if args.pool is None:
raise ValueError('pool json not specified')
with open(args.credentials, 'r') as f:
config = json.load(f)
@ -408,7 +420,9 @@ def parseargs():
:return: parsed arguments
"""
parser = argparse.ArgumentParser(
description='Shipyard perf graph generator')
description='Batch Shipyard perf graph generator')
parser.add_argument(
'--configdir', help='json config dir')
parser.add_argument(
'--credentials', help='credentials json config')
parser.add_argument(

Просмотреть файл

@ -107,7 +107,7 @@ def parseargs():
:return: parsed arguments
"""
parser = argparse.ArgumentParser(
description='Shipyard perf recorder')
description='Batch Shipyard perf recorder')
parser.add_argument('source', help='event source')
parser.add_argument('event', help='event')
parser.add_argument('--ts', help='timestamp (posix)')

Просмотреть файл

@ -215,16 +215,19 @@ def _retrieve_outputs_from_failed_nodes(batch_client, config, nodeid=None):
def _block_for_nodes_ready(
batch_client, config, node_state, pool_id, reboot_on_failed):
batch_client, config, stopping_states, end_states, pool_id,
reboot_on_failed):
# type: (batch.BatchServiceClient, dict,
# List[batchmodels.ComputeNodeState],
# List[batchmodels.ComputeNodeState], str,
# bool) -> List[batchmodels.ComputeNode]
"""Wait for nodes to enter "ready": steady state and all nodes in
specified states
"""Wait for pool to enter steady state and all nodes to enter stopping
states
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict
:param list node_state: list of acceptable node states
:param list stopping_states: list of node states to stop polling
:param list end_states: list of acceptable end states
:param str pool_id: pool id
:param bool reboot_on_failed: reboot node on failed start state
:rtype: list
@ -232,7 +235,7 @@ def _block_for_nodes_ready(
"""
logger.info(
'waiting for all nodes in pool {} to reach one of: {!r}'.format(
pool_id, node_state))
pool_id, stopping_states))
i = 0
reboot_map = {}
while True:
@ -289,15 +292,15 @@ def _block_for_nodes_ready(
'non-transient, please submit an issue on '
'GitHub.').format(pool.id))
if (len(nodes) >= pool.target_dedicated and
all(node.state in node_state for node in nodes)):
if any(node.state != batchmodels.ComputeNodeState.idle
for node in nodes):
all(node.state in stopping_states for node in nodes)):
if any(node.state not in end_states for node in nodes):
# list nodes of pool
list_nodes(batch_client, config)
raise RuntimeError(
'Node(s) of pool {} not in idle state. Please inspect '
'the state of nodes in the pool. Please retry pool '
'creation by deleting and recreating the pool.')
('Node(s) of pool {} not in {} state. Please inspect '
'the state of nodes in the pool. Please retry pool '
'creation by deleting and recreating the pool.').format(
pool.id, end_states))
else:
return nodes
i += 1
@ -310,30 +313,38 @@ def _block_for_nodes_ready(
time.sleep(10)
def wait_for_pool_ready(batch_client, config, pool_id):
# type: (batch.BatchServiceClient, dict, str) ->
# List[batchmodels.ComputeNode]
"""Wait for pool to enter "ready": steady state and all nodes idle
def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
    # type: (batch.BatchServiceClient, dict, str,
    #        List[batchmodels.ComputeNodeState]) -> List[batchmodels.ComputeNode]
"""Wait for pool to enter steady state and all nodes in end states
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict
:param str pool_id: pool id
:param list addl_end_states: additional end states
:rtype: list
:return: list of nodes
"""
# wait for pool idle
node_state = frozenset(
(batchmodels.ComputeNodeState.starttaskfailed,
batchmodels.ComputeNodeState.unusable,
batchmodels.ComputeNodeState.idle)
)
base_stopping_states = [
batchmodels.ComputeNodeState.starttaskfailed,
batchmodels.ComputeNodeState.unusable,
batchmodels.ComputeNodeState.idle
]
base_end_states = [batchmodels.ComputeNodeState.idle]
try:
reboot_on_failed = config[
'pool_specification']['reboot_on_start_task_failed']
except KeyError:
reboot_on_failed = False
if addl_end_states is not None and len(addl_end_states) > 0:
base_stopping_states.extend(addl_end_states)
base_end_states.extend(addl_end_states)
stopping_states = frozenset(base_stopping_states)
end_states = frozenset(base_end_states)
nodes = _block_for_nodes_ready(
batch_client, config, node_state, pool_id, reboot_on_failed)
batch_client, config, stopping_states, end_states, pool_id,
reboot_on_failed)
list_nodes(batch_client, config, nodes=nodes)
return nodes
@ -556,16 +567,21 @@ def list_pools(batch_client):
logger.error('no pools found')
def resize_pool(batch_client, config):
# type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None
def resize_pool(batch_client, config, wait=False):
# type: (azure.batch.batch_service_client.BatchServiceClient, dict,
# bool) -> list
"""Resize a pool
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict
:param bool wait: wait for operation to complete
:rtype: list or None
:return: list of nodes if wait or None
"""
pool_id = config['pool_specification']['id']
vm_count = int(config['pool_specification']['vm_count'])
logger.info('Resizing pool {} to {}'.format(pool_id, vm_count))
logger.info('Resizing pool {} to {} compute nodes'.format(
pool_id, vm_count))
batch_client.pool.resize(
pool_id=pool_id,
pool_resize_parameter=batchmodels.PoolResizeParameter(
@ -573,6 +589,10 @@ def resize_pool(batch_client, config):
resize_timeout=datetime.timedelta(minutes=20),
)
)
if wait:
return wait_for_pool_ready(
batch_client, config, pool_id,
addl_end_states=[batchmodels.ComputeNodeState.running])
def del_pool(batch_client, config):
@ -1037,16 +1057,18 @@ def terminate_tasks(batch_client, config, jobid=None, taskid=None, wait=False):
continue
def list_nodes(batch_client, config):
# type: (batch.BatchServiceClient, dict) -> None
def list_nodes(batch_client, config, nodes=None):
# type: (batch.BatchServiceClient, dict, list) -> None
"""Get a list of nodes
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param dict config: configuration dict
    :param list nodes: list of nodes
"""
pool_id = config['pool_specification']['id']
logger.debug('listing nodes for pool {}'.format(pool_id))
nodes = batch_client.compute_node.list(pool_id)
if nodes is None:
nodes = batch_client.compute_node.list(pool_id)
for node in nodes:
logger.info(
('node_id={} [state={} scheduling_state={} ip_address={} '

Просмотреть файл

@ -1049,14 +1049,15 @@ def action_pool_delete(
time.sleep(3)
def action_pool_resize(batch_client, blob_client, config):
def action_pool_resize(batch_client, blob_client, config, wait):
# type: (batch.BatchServiceClient, azureblob.BlockBlobService,
# dict) -> None
# dict, bool) -> None
"""Resize pool that may contain glusterfs
:param batch_client: The batch client to use.
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
:param azure.storage.blob.BlockBlobService blob_client: blob client
:param dict config: configuration dict
:param bool wait: wait for operation to complete
"""
pool_id = config['pool_specification']['id']
# check if this is a glusterfs-enabled pool
@ -1081,13 +1082,12 @@ def action_pool_resize(batch_client, blob_client, config):
if gluster_present:
for node in batch_client.compute_node.list(pool_id):
old_nodes[node.id] = node.ip_address
logger.debug('forcing wait to True due to glusterfs')
wait = True
# resize pool
convoy.batch.resize_pool(batch_client, config)
nodes = convoy.batch.resize_pool(batch_client, config, wait)
# add brick for new nodes
if gluster_present:
# wait for nodes to reach idle
nodes = convoy.batch.wait_for_pool_ready(
batch_client, config, pool_id)
# get internal ip addresses of new nodes
new_nodes = [
node.ip_address for node in nodes if node.id not in old_nodes

Просмотреть файл

@ -10,7 +10,7 @@ git clone https://github.com/Azure/batch-shipyard.git
or [download the latest release](https://github.com/Azure/batch-shipyard/releases).
Batch Shipyard includes an installation script to simplify installation on
a variety of recent platforms. This installation script can be used
a variety of recent Linux distributions. This installation script can be used
regardless of if you obtained Batch Shipyard through `git clone` or
downloading a release package.
@ -70,9 +70,9 @@ line-endings (CRLF) then compute nodes will fail to start properly.
## Manual Installation
### Requirements
The Batch Shipyard tool is written in Python. The client script is compatible
with Python 2.7 or 3.3+. You will also need to install the
[Azure Batch](https://pypi.python.org/pypi/azure-batch) and
[Azure Storage](https://pypi.python.org/pypi/azure-storage) python packages.
with Python 2.7 or 3.3+. You will also need to install dependent Python
packages including the [Azure Batch](https://pypi.python.org/pypi/azure-batch)
and [Azure Storage](https://pypi.python.org/pypi/azure-storage) packages.
Installation can be performed using the [requirements.txt](../requirements.txt)
file via the command `pip install --upgrade --user -r requirements.txt`
(or via `pip3` for python3). If `pip` is not installed on your system,
@ -80,7 +80,7 @@ please continue reading below. Note that this `pip` command should be run
for every Batch Shipyard upgrade if not using `install.sh`.
Batch Shipyard has some Python dependencies which require a valid compiler,
ssl, ffi, and python development libraries to be installed due to the
ssl, ffi, and Python development libraries to be installed due to the
[cryptography](https://pypi.python.org/pypi/cryptography) dependency on Linux.
For Windows, binary wheels will be installed for dependencies, thus no
development environment is needed. The following are example commands to
@ -109,13 +109,11 @@ pip install --upgrade pip
#### Note about Python 3.3+
If installing for Python 3.3+, then simply use the Python3 equivalents for
the python dependencies. For example, on Ubuntu/Debian:
```
apt-get update
apt-get install -y build-essential libssl-dev libffi-dev libpython3-dev python3-dev python3-pip
pip install --upgrade pip
```
would install the proper dependencies for Python3.
### Data Movement Support

Просмотреть файл

@ -181,6 +181,7 @@ in the specified pool
* `listnodes` will list all nodes in the specified pool
* `resize` will resize the pool to the `vm_count` specified in the pool
configuration file
* `--wait` will wait for resize to complete
## Storage Command
The `storage` command has the following sub-commands:

Просмотреть файл

@ -274,11 +274,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
rm -f /var/lib/docker/network/files/local-kv.db
if [ $name == "debian-jessie" ]; then
mkdir -p /mnt/resource/docker-tmp
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
else
mkdir -p /mnt/docker-tmp
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/docker-tmp\"|;b};$q1' /etc/default/docker
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\" >> /etc/default/docker
fi
@ -429,7 +429,7 @@ EOF
yum install -y docker-engine
# modify docker opts
mkdir -p /mnt/resource/docker-tmp
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
sed -i '/^\[Service\]/a EnvironmentFile=-/etc/default/docker' /lib/systemd/system/docker.service
sed -i '/^ExecStart=/ s/$/ $DOCKER_OPTS/' /lib/systemd/system/docker.service
@ -501,7 +501,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
zypper -n in docker
# modify docker opts, docker opts in /etc/sysconfig/docker
mkdir -p /mnt/resource/docker-tmp
sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker
sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/sysconfig/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/sysconfig/docker
systemctl daemon-reload
# start docker service and enable docker daemon on boot

Просмотреть файл

@ -341,13 +341,15 @@ def pool_del(ctx, wait):
@pool.command('resize')
@click.option(
'--wait', is_flag=True, help='Wait for pool resize to complete')
@common_options
@pass_cli_context
def pool_resize(ctx):
def pool_resize(ctx, wait):
"""Resize a pool"""
_setup_context(ctx)
convoy.fleet.action_pool_resize(
ctx.batch_client, ctx.blob_client, ctx.config)
ctx.batch_client, ctx.blob_client, ctx.config, wait=wait)
@pool.command('grls')