diff --git a/.gitignore b/.gitignore index 4ef76f4..6110950 100644 --- a/.gitignore +++ b/.gitignore @@ -92,6 +92,7 @@ ENV/ .ropeproject # project specific ignores +shipyard ssh_docker_tunnel_shipyard.sh id_rsa_shipyard* resources/azurefile-dockervolume-create.sh diff --git a/Dockerfile b/Dockerfile index 3420757..b9ddb2e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ RUN apk update \ && pip3 install --no-cache-dir --upgrade pip \ && git clone https://github.com/Azure/batch-shipyard.git /opt/batch-shipyard \ && cd /opt/batch-shipyard \ - && rm -rf .git* .travis.yml install.sh shipyard \ + && rm -rf .git* .travis.yml install.sh \ && pip3 install -r requirements.txt \ && apk del --purge \ build-base python3-dev openssl-dev libffi-dev git \ diff --git a/cascade/graph.py b/cascade/graph.py index 09a5df5..68fc415 100755 --- a/cascade/graph.py +++ b/cascade/graph.py @@ -29,6 +29,7 @@ import argparse import copy import datetime import json +import pathlib import subprocess import sys # non-stdlib imports @@ -282,7 +283,7 @@ def graph_data(data: dict, sizes: dict, offer: str, sku: str): mintime = start if start > maxtime: maxtime = start - print('delta:', maxtime - mintime) + print('nodeready variance:', maxtime - mintime) total_gr = 0 total_ac = 0 with open(dat_fname, 'w') as f: @@ -383,10 +384,21 @@ def main(): # get command-line args args = parseargs() + if args.configdir is not None: + if args.credentials is None: + args.credentials = str( + pathlib.Path(args.configdir, 'credentials.json')) + if args.config is None: + args.config = str(pathlib.Path(args.configdir, 'config.json')) + if args.pool is None: + args.pool = str(pathlib.Path(args.configdir, 'pool.json')) + if args.credentials is None: raise ValueError('credentials json not specified') if args.config is None: raise ValueError('config json not specified') + if args.pool is None: + raise ValueError('pool json not specified') with open(args.credentials, 'r') as f: config = json.load(f) @@ -408,7 +420,9 
@@ def parseargs(): :return: parsed arguments """ parser = argparse.ArgumentParser( - description='Shipyard perf graph generator') + description='Batch Shipyard perf graph generator') + parser.add_argument( + '--configdir', help='json config dir') parser.add_argument( '--credentials', help='credentials json config') parser.add_argument( diff --git a/cascade/perf.py b/cascade/perf.py index fd31683..2d2a0e9 100755 --- a/cascade/perf.py +++ b/cascade/perf.py @@ -107,7 +107,7 @@ def parseargs(): :return: parsed arguments """ parser = argparse.ArgumentParser( - description='Shipyard perf recorder') + description='Batch Shipyard perf recorder') parser.add_argument('source', help='event source') parser.add_argument('event', help='event') parser.add_argument('--ts', help='timestamp (posix)') diff --git a/convoy/batch.py b/convoy/batch.py index 6310fa8..b17a2ef 100644 --- a/convoy/batch.py +++ b/convoy/batch.py @@ -215,16 +215,19 @@ def _retrieve_outputs_from_failed_nodes(batch_client, config, nodeid=None): def _block_for_nodes_ready( - batch_client, config, node_state, pool_id, reboot_on_failed): + batch_client, config, stopping_states, end_states, pool_id, + reboot_on_failed): # type: (batch.BatchServiceClient, dict, + # List[batchmodels.ComputeNodeState], # List[batchmodels.ComputeNodeState], str, # bool) -> List[batchmodels.ComputeNode] - """Wait for nodes to enter "ready": steady state and all nodes in - specified states + """Wait for pool to enter steady state and all nodes to enter stopping + states :param batch_client: The batch client to use. 
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict - :param list node_state: list of acceptable node states + :param list stopping_states: list of node states to stop polling + :param list end_states: list of acceptable end states :param str pool_id: pool id :param bool reboot_on_failed: reboot node on failed start state :rtype: list @@ -232,7 +235,7 @@ def _block_for_nodes_ready( """ logger.info( 'waiting for all nodes in pool {} to reach one of: {!r}'.format( - pool_id, node_state)) + pool_id, stopping_states)) i = 0 reboot_map = {} while True: @@ -289,15 +292,15 @@ def _block_for_nodes_ready( 'non-transient, please submit an issue on ' 'GitHub.').format(pool.id)) if (len(nodes) >= pool.target_dedicated and - all(node.state in node_state for node in nodes)): - if any(node.state != batchmodels.ComputeNodeState.idle - for node in nodes): + all(node.state in stopping_states for node in nodes)): + if any(node.state not in end_states for node in nodes): # list nodes of pool list_nodes(batch_client, config) raise RuntimeError( - 'Node(s) of pool {} not in idle state. Please inspect ' - 'the state of nodes in the pool. Please retry pool ' - 'creation by deleting and recreating the pool.') + ('Node(s) of pool {} not in {} state. Please inspect ' + 'the state of nodes in the pool. 
Please retry pool ' + 'creation by deleting and recreating the pool.').format( + pool.id, end_states)) else: return nodes i += 1 @@ -310,30 +313,38 @@ time.sleep(10) -def wait_for_pool_ready(batch_client, config, pool_id): - # type: (batch.BatchServiceClient, dict, str) -> - # List[batchmodels.ComputeNode] - """Wait for pool to enter "ready": steady state and all nodes idle +def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None): + # type: (batch.BatchServiceClient, dict, str, + # List[batchmodels.ComputeNodeState]) -> List[batchmodels.ComputeNode] + """Wait for pool to enter steady state and all nodes in end states :param batch_client: The batch client to use. :type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict :param str pool_id: pool id + :param list addl_end_states: additional end states :rtype: list :return: list of nodes """ - # wait for pool idle - node_state = frozenset( - (batchmodels.ComputeNodeState.starttaskfailed, - batchmodels.ComputeNodeState.unusable, - batchmodels.ComputeNodeState.idle) - ) + base_stopping_states = [ + batchmodels.ComputeNodeState.starttaskfailed, + batchmodels.ComputeNodeState.unusable, + batchmodels.ComputeNodeState.idle + ] + base_end_states = [batchmodels.ComputeNodeState.idle] try: reboot_on_failed = config[ 'pool_specification']['reboot_on_start_task_failed'] except KeyError: reboot_on_failed = False + if addl_end_states is not None and len(addl_end_states) > 0: + base_stopping_states.extend(addl_end_states) + base_end_states.extend(addl_end_states) + stopping_states = frozenset(base_stopping_states) + end_states = frozenset(base_end_states) nodes = _block_for_nodes_ready( - batch_client, config, node_state, pool_id, reboot_on_failed) + batch_client, config, stopping_states, end_states, pool_id, + reboot_on_failed) + list_nodes(batch_client, config, nodes=nodes) return nodes @@ -556,16 +567,21 @@ def 
list_pools(batch_client): logger.error('no pools found') -def resize_pool(batch_client, config): - # type: (azure.batch.batch_service_client.BatchServiceClient, dict) -> None +def resize_pool(batch_client, config, wait=False): + # type: (azure.batch.batch_service_client.BatchServiceClient, dict, + # bool) -> list """Resize a pool :param batch_client: The batch client to use. :type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict + :param bool wait: wait for operation to complete + :rtype: list or None + :return: list of nodes if wait or None """ pool_id = config['pool_specification']['id'] vm_count = int(config['pool_specification']['vm_count']) - logger.info('Resizing pool {} to {}'.format(pool_id, vm_count)) + logger.info('Resizing pool {} to {} compute nodes'.format( + pool_id, vm_count)) batch_client.pool.resize( pool_id=pool_id, pool_resize_parameter=batchmodels.PoolResizeParameter( @@ -573,6 +589,10 @@ def resize_pool(batch_client, config): resize_timeout=datetime.timedelta(minutes=20), ) ) + if wait: + return wait_for_pool_ready( + batch_client, config, pool_id, + addl_end_states=[batchmodels.ComputeNodeState.running]) def del_pool(batch_client, config): @@ -1037,16 +1057,18 @@ def terminate_tasks(batch_client, config, jobid=None, taskid=None, wait=False): continue -def list_nodes(batch_client, config): - # type: (batch.BatchServiceClient, dict) -> None +def list_nodes(batch_client, config, nodes=None): + # type: (batch.BatchServiceClient, dict, list) -> None """Get a list of nodes :param batch_client: The batch client to use. 
:type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param dict config: configuration dict + :param list nodes: list of nodes """ pool_id = config['pool_specification']['id'] logger.debug('listing nodes for pool {}'.format(pool_id)) - nodes = batch_client.compute_node.list(pool_id) + if nodes is None: + nodes = batch_client.compute_node.list(pool_id) for node in nodes: logger.info( ('node_id={} [state={} scheduling_state={} ip_address={} ' diff --git a/convoy/fleet.py b/convoy/fleet.py index d5f38bf..a1401fb 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -1049,14 +1049,15 @@ def action_pool_delete( time.sleep(3) -def action_pool_resize(batch_client, blob_client, config): +def action_pool_resize(batch_client, blob_client, config, wait): # type: (batch.BatchServiceClient, azureblob.BlockBlobService, - # dict) -> None + # dict, bool) -> None """Resize pool that may contain glusterfs :param batch_client: The batch client to use. :type batch_client: `azure.batch.batch_service_client.BatchServiceClient` :param azure.storage.blob.BlockBlobService blob_client: blob client :param dict config: configuration dict + :param bool wait: wait for operation to complete """ pool_id = config['pool_specification']['id'] # check if this is a glusterfs-enabled pool @@ -1081,13 +1082,12 @@ def action_pool_resize(batch_client, blob_client, config): if gluster_present: for node in batch_client.compute_node.list(pool_id): old_nodes[node.id] = node.ip_address + logger.debug('forcing wait to True due to glusterfs') + wait = True # resize pool - convoy.batch.resize_pool(batch_client, config) + nodes = convoy.batch.resize_pool(batch_client, config, wait) # add brick for new nodes if gluster_present: - # wait for nodes to reach idle - nodes = convoy.batch.wait_for_pool_ready( - batch_client, config, pool_id) # get internal ip addresses of new nodes new_nodes = [ node.ip_address for node in nodes if node.id not in old_nodes diff --git 
a/docs/01-batch-shipyard-installation.md b/docs/01-batch-shipyard-installation.md index 9e5830a..1d7884b 100644 --- a/docs/01-batch-shipyard-installation.md +++ b/docs/01-batch-shipyard-installation.md @@ -10,7 +10,7 @@ git clone https://github.com/Azure/batch-shipyard.git or [download the latest release](https://github.com/Azure/batch-shipyard/releases). Batch Shipyard includes an installation script to simplify installation on -a variety of recent platforms. This installation script can be used +a variety of recent Linux distributions. This installation script can be used regardless of if you obtained Batch Shipyard through `git clone` or downloading a release package. @@ -70,9 +70,9 @@ line-endings (CRLF) then compute nodes will fail to start properly. ## Manual Installation ### Requirements The Batch Shipyard tool is written in Python. The client script is compatible -with Python 2.7 or 3.3+. You will also need to install the -[Azure Batch](https://pypi.python.org/pypi/azure-batch) and -[Azure Storage](https://pypi.python.org/pypi/azure-storage) python packages. +with Python 2.7 or 3.3+. You will also need to install dependent Python +packages including the [Azure Batch](https://pypi.python.org/pypi/azure-batch) +and [Azure Storage](https://pypi.python.org/pypi/azure-storage) packages. Installation can be performed using the [requirements.txt](../requirements.txt) file via the command `pip install --upgrade --user -r requirements.txt` (or via `pip3` for python3). If `pip` is not installed on your system, @@ -80,7 +80,7 @@ please continue reading below. Note that this `pip` command should be run for every Batch Shipyard upgrade if not using `install.sh`. Batch Shipyard has some Python dependencies which require a valid compiler, -ssl, ffi, and python development libraries to be installed due to the +ssl, ffi, and Python development libraries to be installed due to the [cryptography](https://pypi.python.org/pypi/cryptography) dependency on Linux. 
For Windows, binary wheels will be installed for dependencies, thus no development environment is needed. The following are example commands to @@ -109,13 +109,11 @@ pip install --upgrade pip ####Note about Python 3.3+ If installing for Python 3.3+, then simply use the Python3 equivalents for the python dependencies. For example, on Ubuntu/Debian: - ``` apt-get update apt-get install -y build-essential libssl-dev libffi-dev libpython3-dev python3-dev python3-pip pip install --upgrade pip ``` - would install the proper dependencies for Python3. ###Data Movement Support diff --git a/docs/20-batch-shipyard-usage.md b/docs/20-batch-shipyard-usage.md index 75da29a..0dc7662 100644 --- a/docs/20-batch-shipyard-usage.md +++ b/docs/20-batch-shipyard-usage.md @@ -181,6 +181,7 @@ in the specified pool * `listnodes` will list all nodes in the specified pool * `resize` will resize the pool to the `vm_count` specified in the pool configuration file + * `--wait` will wait for resize to complete ## Storage Command The `storage` command has the following sub-commands: diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh index 26c2980..701cb1b 100755 --- a/scripts/shipyard_nodeprep.sh +++ b/scripts/shipyard_nodeprep.sh @@ -274,11 +274,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then rm -f /var/lib/docker/network/files/local-kv.db if [ $name == "debian-jessie" ]; then mkdir -p /mnt/resource/docker-tmp - sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker + sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> 
/etc/default/docker else mkdir -p /mnt/docker-tmp - sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/docker-tmp\"|;b};$q1' /etc/default/docker + sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/docker-tmp\" >> /etc/default/docker sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\" >> /etc/default/docker fi @@ -429,7 +429,7 @@ EOF yum install -y docker-engine # modify docker opts mkdir -p /mnt/resource/docker-tmp - sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker + sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker sed -i '/^\[Service\]/a EnvironmentFile=-/etc/default/docker' /lib/systemd/system/docker.service sed -i '/^ExecStart=/ s/$/ $DOCKER_OPTS/' /lib/systemd/system/docker.service @@ -501,7 +501,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then zypper -n in docker # modify docker opts, docker opts in /etc/sysconfig/docker mkdir -p /mnt/resource/docker-tmp - sed -i -e '/export TMPDIR=.*/,${s||export TMPDIR=\"/mnt/resource/docker-tmp\"|;b};$q1' /etc/default/docker + sed -i -e 's,.*export TMPDIR=.*,export TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock 
-g /mnt/resource/docker\"|;b};$q1' /etc/sysconfig/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/sysconfig/docker systemctl daemon-reload # start docker service and enable docker daemon on boot diff --git a/shipyard.py b/shipyard.py index c788561..88eae16 100755 --- a/shipyard.py +++ b/shipyard.py @@ -341,13 +341,15 @@ def pool_del(ctx, wait): @pool.command('resize') +@click.option( + '--wait', is_flag=True, help='Wait for pool resize to complete') @common_options @pass_cli_context -def pool_resize(ctx): +def pool_resize(ctx, wait): """Resize a pool""" _setup_context(ctx) convoy.fleet.action_pool_resize( - ctx.batch_client, ctx.blob_client, ctx.config) + ctx.batch_client, ctx.blob_client, ctx.config, wait=wait) @pool.command('grls')