More improvements for scale out robustness

- Add --all-start-task-failed to delnode
- Reduce node output on pool allocation wait with number of nodes > 10
2017-06-30 23:50:21 -07:00 · 2017-06-30 23:50:21 -07:00 · 2a48885da1
--- a/.gitignore
+++ b/.gitignore
@ -90,6 +90,7 @@ celerybeat-schedule

 # project specific ignores
 shipyard
+shipyard.cmd
 ssh_docker_tunnel_shipyard.sh
 id_rsa_shipyard*
 resources/azurefile-dockervolume-create.sh
@ -97,4 +98,4 @@ resources/azurefile-dockervolumedriver
 resources/azurefile-dockervolumedriver.env
 resources/docker-registry-v2.tar.gz
 resources/nvidia-docker.deb
-resources/nvidia-driver.run
+resources/nvidia-driver*.run
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,6 +2,17 @@

 ## [Unreleased]

+### Added
+- `--all-start-task-failed` parameter for `pool delnode`
+
+### Changed
+- Improve robustness of docker image pulls within node prep scripts
+- Restrict node list queries until pool allocation state emerges from resizing
+
+### Fixed
+- Remove nvidia gpu driver property from FFmpeg recipe
+- Further improve retry logic for docker image pulls in cascade
+
 ## [2.8.0rc2] - 2017-06-30
 ### Added
 - Support Mac OS X and Windows Subsystem for Linux installations via
--- a/cascade/cascade.py
+++ b/cascade/cascade.py
@ -395,6 +395,18 @@ class DockerSaveThread(threading.Thread):
                    _DIRECTDL_DOWNLOADING.remove(self.resource)
                    _DIRECTDL.remove(self.resource)

+    def _check_pull_output_overload(self, stdout: str, stderr: str) -> bool:
+        """Determine whether docker pull output indicates registry overload
+        :param str stdout: captured stdout of the pull
+        :param str stderr: captured stderr of the pull
+        :rtype: bool
+        :return: True if 'toomanyrequests' appears in either stream or
+            'connection reset by peer' appears in stderr
+        """
+        throttled = any('toomanyrequests' in out for out in (stdout, stderr))
+        return throttled or 'connection reset by peer' in stderr
+
    def _pull(self, image: str) -> tuple:
        """Docker image pull with registry normalization
        :param str image: image to pull
@ -414,7 +426,9 @@ class DockerSaveThread(threading.Thread):
            shell=True,
            universal_newlines=True)
        stdout, stderr = proc.communicate()
-        if proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE:
+        if (proc.returncode != 0 and _ALLOW_PUBLIC_PULL_WITH_PRIVATE and
+                not _pub and
+                not self._check_pull_output_overload(stdout, stderr)):
            logger.warning(
                'could not pull from private registry, attempting '
                'Docker Public Hub instead')
@ -445,31 +459,19 @@ class DockerSaveThread(threading.Thread):
        _record_perf('pull-start', 'img={}'.format(image))
        start = datetime.datetime.now()
        logger.info('pulling image {} from {}'.format(image, _REGISTRY))
-        npa_errors = 0
        while True:
            rc, stdout, stderr = self._pull(image)
-            if rc != 0:
-                fail = True
-                if 'toomanyrequests' in stdout or 'toomanyrequests' in stderr:
-                    logger.error(
-                        'Too many requests issued to registry server, '
-                        'retrying...')
-                    fail = False
-                    time.sleep(random.randint(5, 30))
-                elif 'no pull access' in stdout or 'no pull access' in stderr:
-                    npa_errors += 1
-                    if npa_errors < 3:
-                        fail = False
-                        logger.error(
-                            'No pull access to registry server, retrying in '
-                            'case of temporary overload...')
-                    time.sleep(random.randint(1, 10))
-                if fail:
-                    raise RuntimeError(
-                        'docker pull failed: stdout={} stderr={}'.format(
-                            stdout, stderr))
-            else:
+            if rc == 0:
                break
+            elif self._check_pull_output_overload(stdout, stderr):
+                logger.error(
+                    'Too many requests issued to registry server, '
+                    'retrying...')
+                time.sleep(random.randint(5, 30))
+            else:
+                raise RuntimeError(
+                    'docker pull failed: stdout={} stderr={}'.format(
+                        stdout, stderr))
        diff = (datetime.datetime.now() - start).total_seconds()
        logger.debug('took {} sec to pull docker image {} from {}'.format(
            diff, image, _REGISTRY))
--- a/convoy/batch.py
+++ b/convoy/batch.py
@ -30,6 +30,7 @@ from builtins import (  # noqa
    bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
    next, oct, open, pow, round, super, filter, map, zip)
 # stdlib imports
+import collections
 import datetime
 import fnmatch
 import getpass
@ -39,6 +40,7 @@ try:
 except ImportError:
    import pathlib
 import os
+import ssl
 import tempfile
 import time
 # non-stdlib imports
@ -65,6 +67,23 @@ _RUN_ELEVATED = batchmodels.UserIdentity(
        elevation_level=batchmodels.ElevationLevel.admin,
    )
 )
+# Per-state tally of the compute nodes in a pool; every field carries the
+# name of the batchmodels.ComputeNodeState member that _node_state_counts
+# counts into it
+NodeStateCountCollection = collections.namedtuple(
+    'NodeStateCountCollection', [
+        'creating',
+        'idle',
+        'leaving_pool',
+        'offline',
+        'preempted',
+        'rebooting',
+        'reimaging',
+        'running',
+        'start_task_failed',
+        'starting',
+        'unknown',
+        'unusable',
+        'waiting_for_start_task',
+    ]
+)


 def get_batch_account(batch_mgmt_client, config):
@ -262,6 +281,7 @@ def _block_for_nodes_ready(
                         pool.target_dedicated_nodes == 0)):
                    fatal_resize_error = True
            if fatal_resize_error:
+                list_nodes(batch_client, config)
                raise RuntimeError(
                    'Fatal resize errors encountered for pool {}: {}'.format(
                        pool.id, os.linesep.join(errors)))
@ -269,7 +289,17 @@ def _block_for_nodes_ready(
                logger.error(
                    'Resize errors encountered for pool {}: {}'.format(
                        pool.id, os.linesep.join(errors)))
-        nodes = list(batch_client.compute_node.list(pool.id))
+        # check pool allocation state
+        if pool.allocation_state == batchmodels.AllocationState.resizing:
+            nodes = []
+        else:
+            try:
+                nodes = list(batch_client.compute_node.list(pool.id))
+            except ssl.SSLError:
+                # SSL error happens sometimes on paging... this is probably
+                # a bug in the underlying msrest/msrestazure library that
+                # is reusing the SSL connection improperly
+                nodes = []
        # check if any nodes are in start task failed state
        if (any(node.state == batchmodels.ComputeNodeState.start_task_failed
                for node in nodes)):
@ -301,7 +331,10 @@ def _block_for_nodes_ready(
                    _reboot_node(batch_client, pool.id, node.id, True)
                    reboot_map[node.id] += 1
                # refresh node list to reflect rebooting states
-                nodes = list(batch_client.compute_node.list(pool.id))
+                try:
+                    nodes = list(batch_client.compute_node.list(pool.id))
+                except ssl.SSLError:
+                    nodes = []
            else:
                # fast path check for start task failures in non-reboot mode
                logger.error(
@ -321,7 +354,6 @@ def _block_for_nodes_ready(
                 pool.target_low_priority_nodes) and
                all(node.state in stopping_states for node in nodes)):
            if any(node.state not in end_states for node in nodes):
-                # list nodes of pool
                list_nodes(batch_client, config)
                raise RuntimeError(
                    ('Node(s) of pool {} not in {} state. Please inspect the '
@ -336,14 +368,48 @@ def _block_for_nodes_ready(
            i = 0
            logger.debug(
                ('waiting for {} dedicated nodes and {} low priority nodes '
-                 'to reach desired state').format(
+                 'to reach desired state in pool {} with '
+                 'allocation_state={}').format(
                     pool.target_dedicated_nodes,
-                     pool.target_low_priority_nodes))
-            for node in nodes:
-                logger.debug('{}: {}'.format(node.id, node.state))
+                     pool.target_low_priority_nodes,
+                     pool.id,
+                     pool.allocation_state))
+            if len(nodes) < 10:
+                for node in nodes:
+                    logger.debug('{}: {}'.format(node.id, node.state))
+            else:
+                logger.debug(_node_state_counts(nodes))
        time.sleep(10)


+def _node_state_counts(nodes):
+    # type: (List[batchmodels.ComputeNode]) -> NodeStateCountCollection
+    """Tally how many nodes are in each compute node state
+    :param list nodes: list of nodes
+    :rtype: NodeStateCountCollection
+    :return: per-state node counts
+    """
+    observed = [node.state for node in nodes]
+    # every field of the collection is named after the ComputeNodeState
+    # member it counts, so the tally is built by walking the field names
+    tally = {
+        name: observed.count(getattr(batchmodels.ComputeNodeState, name))
+        for name in NodeStateCountCollection._fields
+    }
+    return NodeStateCountCollection(**tally)
+
+
 def wait_for_pool_ready(batch_client, config, pool_id, addl_end_states=None):
    # type: (batch.BatchServiceClient, dict, str,
    #        List[batchmodels.ComputeNode]) -> List[batchmodels.ComputeNode]
@ -751,25 +817,45 @@ def reboot_nodes(batch_client, config, all_start_task_failed, node_id):
        _reboot_node(batch_client, pool_id, node_id, False)


-def del_node(batch_client, config, node_id):
-    # type: (batch.BatchServiceClient, dict, str) -> None
+def del_node(batch_client, config, all_start_task_failed, node_id):
+    # type: (batch.BatchServiceClient, dict, bool, str) -> None
    """Delete a node in a pool
    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.batch_service_client.BatchServiceClient`
    :param dict config: configuration dict
+    :param bool all_start_task_failed: delete all start task failed nodes
    :param str node_id: node id to delete
    """
-    if util.is_none_or_empty(node_id):
-        raise ValueError('node id is invalid')
+    node_ids = []
    pool_id = settings.pool_id(config)
-    if not util.confirm_action(
-            config, 'delete node {} from {} pool'.format(node_id, pool_id)):
+    if all_start_task_failed:
+        nodes = list(
+            batch_client.compute_node.list(
+                pool_id=pool_id,
+                compute_node_list_options=batchmodels.ComputeNodeListOptions(
+                    filter='state eq \'starttaskfailed\'',
+                ),
+            ))
+        for node in nodes:
+            if util.confirm_action(
+                    config, 'delete node {} from {} pool'.format(
+                        node.id, pool_id)):
+                node_ids.append(node.id)
+    else:
+        if util.is_none_or_empty(node_id):
+            raise ValueError('node id is invalid')
+        if util.confirm_action(
+                config, 'delete node {} from {} pool'.format(
+                    node_id, pool_id)):
+            node_ids.append(node_id)
+    if util.is_none_or_empty(node_ids):
+        logger.warning('no nodes to delete from pool: {}'.format(pool_id))
        return
-    logger.info('Deleting node {} from pool {}'.format(node_id, pool_id))
+    logger.info('Deleting nodes {} from pool {}'.format(node_ids, pool_id))
    batch_client.pool.remove_nodes(
        pool_id=pool_id,
        node_remove_parameter=batchmodels.NodeRemoveParameter(
-            node_list=[node_id],
+            node_list=node_ids,
        )
    )

--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@ -2280,15 +2280,16 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
    util.subprocess_with_output(ssh_cmd)


-def action_pool_delnode(batch_client, config, nodeid):
-    # type: (batchsc.BatchServiceClient, dict, str) -> None
+def action_pool_delnode(batch_client, config, all_start_task_failed, nodeid):
+    # type: (batchsc.BatchServiceClient, dict, bool, str) -> None
    """Action: Pool Delnode
    :param azure.batch.batch_service_client.BatchServiceClient batch_client:
        batch client
    :param dict config: configuration dict
+    :param bool all_start_task_failed: delete all start task failed nodes
    :param str nodeid: nodeid to delete
    """
-    batch.del_node(batch_client, config, nodeid)
+    batch.del_node(batch_client, config, all_start_task_failed, nodeid)


 def action_pool_rebootnode(
--- a/docs/01-batch-shipyard-installation.md
+++ b/docs/01-batch-shipyard-installation.md
@ -129,8 +129,8 @@ It is recommended to follow the steps outlined on
 [this guide](http://docs.python-guide.org/en/latest/starting/install3/osx/#install3-osx)
 to install Batch Shipyard on a Python3 installation rather than the default
 Python 2.7 that is shipped with Mac OS X. However, if you prefer to use
-the system defaulted Python 2.7, the installation will work that environment
-as well.
+the system defaulted Python 2.7, the installation will work with that
+environment as well.

 The `install.sh` script supports isolated installation through a virtual
 environment so that other system-wide or user python dependencies are left
--- a/docs/20-batch-shipyard-usage.md
+++ b/docs/20-batch-shipyard-usage.md
@ -456,6 +456,9 @@ Azure Storage.
    pool configuration file
  * `--wait` will wait for deletion to complete
 * `delnode` will delete the specified node from the pool
+  * `--all-start-task-failed` will delete all nodes in the start task
+    failed state
+  * `--nodeid` is the node id to delete
 * `dsu` will delete the SSH user defined in the pool configuration file
 from all nodes in the specified pool
 * `grls` will retrieve all of the remote login settings for every node
--- a/recipes/FFmpeg-GPU/README.md
+++ b/recipes/FFmpeg-GPU/README.md
@ -20,9 +20,6 @@ once they are available for N-series VMs.
 are available for N-series VMs.
 * `sku` should be `16.04-LTS`. Other skus will be supported once they are
 available for N-series VMs.
-* `gpu` property should be specified with the following members:
-  * `nvidia_driver` property contains the following members:
-    * `source` is a URL for the driver installer .run file

 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/FFmpeg-GPU/config/pool.json
+++ b/recipes/FFmpeg-GPU/config/pool.json
@ -15,11 +15,6 @@
        "ssh": {
            "username": "docker"
        },
-        "gpu": {
-            "nvidia_driver": {
-                "source": "<URL for nvidia driver for STANDARD_NV VMs>"
-            }
-        },
        "reboot_on_start_task_failed": false,
        "block_until_all_global_resources_loaded": true
    }
--- a/scripts/shipyard_nodeprep.sh
+++ b/scripts/shipyard_nodeprep.sh
@ -119,6 +119,16 @@ if [ -z $version ]; then
    exit 1
 fi

+# Test whether a string contains a substring.
+# $1: string to search
+# $2: substring to look for
+# Returns 0 if $2 occurs in $1, 1 otherwise.
+contains() {
+    string="$1"
+    substring="$2"
+    # quote the substring so it is matched literally instead of being
+    # interpreted as a glob pattern by the prefix-removal expansion
+    if test "${string#*"$substring"}" != "$string"; then
+        return 0
+    else
+        return 1
+    fi
+}
+
 check_for_buggy_ntfs_mount() {
    # Check to ensure sdb1 mount is not mounted as ntfs
    set +e
@ -187,7 +197,7 @@ refresh_package_index() {
        fi
        let retries=retries-1
        if [ $retries -eq 0 ]; then
-            echo "Could not update package index"
+            echo "ERROR: Could not update package index"
            exit 1
        fi
        sleep 1
@ -213,7 +223,7 @@ install_packages() {
        fi
        let retries=retries-1
        if [ $retries -eq 0 ]; then
-            echo "Could not install packages: $*"
+            echo "ERROR: Could not install packages: $*"
            exit 1
        fi
        sleep 1
@ -221,6 +231,35 @@ install_packages() {
    set -e
 }

+# Pull a docker image, retrying on transient registry errors.
+# $1: image to pull
+# Retries up to 60 times, sleeping a random 1-5 seconds between attempts,
+# while the pull output contains "toomanyrequests" or
+# "connection reset by peer". Any other failure, or running out of
+# retries, exits the script with the exit code of docker pull.
+# errexit is disabled for the duration of the loop and re-enabled on return.
+docker_pull_image() {
+    image=$1
+    set +e
+    retries=60
+    while [ $retries -gt 0 ]; do
+        pull_out=$(docker pull "$image" 2>&1)
+        rc=$?
+        if [ $rc -eq 0 ]; then
+            echo "$pull_out"
+            break
+        fi
+        # non-zero exit code: check if pull output has toomanyrequests or
+        # connection resets. contains is a shell function and has to be
+        # invoked directly; placing it inside [ ] would pass its name to
+        # test as a plain word and the retry branch would never be taken
+        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
+            # printf is used because bash's echo does not expand \n
+            printf 'WARNING: will retry:\n%s\n' "$pull_out"
+        else
+            printf 'ERROR:\n%s\n' "$pull_out"
+            exit $rc
+        fi
+        retries=$((retries - 1))
+        if [ $retries -le 0 ]; then
+            echo "ERROR: Could not pull docker image: $image"
+            exit $rc
+        fi
+        sleep $((RANDOM % 5 + 1))s
+    done
+    set -e
+}
+
 # check sdb1 mount
 check_for_buggy_ntfs_mount

@ -269,10 +308,10 @@ fi

 # check if we're coming up from a reboot
 if [ -f $cascadefailed ]; then
-    echo "$cascadefailed file exists, assuming cascade failure during node prep"
+    echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
    exit 1
 elif [ -f $nodeprepfinished ]; then
-    echo "$nodeprepfinished file exists, assuming successful completion of node prep"
+    echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
    exit 0
 fi

@ -343,11 +382,11 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
        repo=https://download.docker.com/linux/debian
        dockerversion=${dockerversion}debian
    else
-        echo "unsupported sku: $sku for offer: $offer"
+        echo "ERROR: unsupported sku: $sku for offer: $offer"
        exit 1
    fi
    if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
-        echo "gpu unsupported on this sku: $sku for offer $offer"
+        echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
        exit 1
    fi
    # reload network settings
@ -375,7 +414,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
        fi
        let retries=retries-1
        if [ $retries -eq 0 ]; then
-            echo "Could not add key for docker repo"
+            echo "ERROR: Could not add key for docker repo"
            exit 1
        fi
        sleep 1
@ -470,7 +509,7 @@ EOF
        set +e
        while :
        do
-            echo "Attempting to create nvidia-docker volume with version $nvdriverver"
+            echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
            docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
            if [ $? -eq 0 ]; then
                break
@ -479,7 +518,7 @@ EOF
                NV_DIFF=$((($NV_NOW-$NV_START)/60))
                # fail after 5 minutes of attempts
                if [ $NV_DIFF -ge 5 ]; then
-                    echo "could not create nvidia-docker volume"
+                    echo "ERROR: could not create nvidia-docker volume"
                    exit 1
                fi
                sleep 1
@ -507,7 +546,7 @@ EOF
            elif [ $server_type == "glusterfs" ]; then
                install_packages $offer glusterfs-client acl
            else
-                echo "Unknown file server type ${sc[0]} for ${sc[1]}"
+                echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
                exit 1
            fi
        done
@ -525,12 +564,12 @@ EOF
 elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
    # ensure container only support
    if [ $cascadecontainer -eq 0 ]; then
-        echo "only supported through shipyard container"
+        echo "ERROR: only supported through shipyard container"
        exit 1
    fi
    # gpu is not supported on these offers
    if [ ! -z $gpu ]; then
-        echo "gpu unsupported on this sku: $sku for offer $offer"
+        echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
        exit 1
    fi
    if [[ $sku == 7.* ]]; then
@ -542,7 +581,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
            gfsenable="systemctl enable glusterd"
            rpcbindenable="systemctl enable rpcbind"
            # TODO, in order to support docker > 1.9, need to upgrade to UEKR4
-            echo "oracle linux is not supported at this time"
+            echo "ERROR: oracle linux is not supported at this time"
            exit 1
        else
            srvenable="chkconfig docker on"
@ -552,7 +591,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
            rpcbindenable="chkconfig rpcbind on"
        fi
    else
-        echo "unsupported sku: $sku for offer: $offer"
+        echo "ERROR: unsupported sku: $sku for offer: $offer"
        exit 1
    fi
    # reload network settings
@ -605,7 +644,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
                sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
                install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
            else
-                echo "Unknown file server type ${sc[0]} for ${sc[1]}"
+                echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
                exit 1
            fi
        done
@ -613,12 +652,12 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
 elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
    # ensure container only support
    if [ $cascadecontainer -eq 0 ]; then
-        echo "only supported through shipyard container"
+        echo "ERROR: only supported through shipyard container"
        exit 1
    fi
    # gpu is not supported on these offers
    if [ ! -z $gpu ]; then
-        echo "gpu unsupported on this sku: $sku for offer $offer"
+        echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
        exit 1
    fi
    # reload network settings
@ -648,7 +687,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
            SUSEConnect -p sle-module-containers/12/x86_64 -r ''
        fi
        if [ -z $repodir ]; then
-            echo "unsupported sku: $sku for offer: $offer"
+            echo "ERROR: unsupported sku: $sku for offer: $offer"
            exit 1
        fi
        # update index
@ -694,7 +733,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
                    zypper -n --gpg-auto-import-keys ref
                    install_packages $offer glusterfs acl
                else
-                    echo "Unknown file server type ${sc[0]} for ${sc[1]}"
+                    echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
                    exit 1
                fi
            done
@ -702,7 +741,7 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
        # if hpc sku, set up intel mpi
        if [[ $offer == sles-hpc* ]]; then
            if [ $sku != "12-sp1" ]; then
-                echo "unsupported sku for intel mpi setup on SLES"
+                echo "ERROR: unsupported sku for intel mpi setup on SLES"
                exit 1
            fi
            install_packages $offer lsb
@ -712,13 +751,13 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
        fi
    fi
 else
-    echo "unsupported offer: $offer (sku: $sku)"
+    echo "ERROR: unsupported offer: $offer (sku: $sku)"
    exit 1
 fi

 # retrieve docker images related to data movement
-docker pull alfpark/blobxfer:$blobxferversion
-docker pull alfpark/batch-shipyard:tfm-$version
+docker_pull_image alfpark/blobxfer:$blobxferversion
+docker_pull_image alfpark/batch-shipyard:tfm-$version

 # login to registry server
 if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@ -734,15 +773,15 @@ if [ ! -z $sc_args ]; then
    for sc_arg in ${sc_args[@]}; do
        IFS=':' read -ra sc <<< "$sc_arg"
        mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
-        echo "Creating host directory for storage cluster $sc_arg at $mountpoint"
+        echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
        mkdir -p $mountpoint
        chmod 777 $mountpoint
-        echo "Adding $mountpoint to fstab"
+        echo "INFO: Adding $mountpoint to fstab"
        # eval fstab var to expand vars (this is ok since it is set by shipyard)
        fstab_entry="${fstabs[$i]}"
        echo $fstab_entry >> /etc/fstab
        tail -n1 /etc/fstab
-        echo "Mounting $mountpoint"
+        echo "INFO: Mounting $mountpoint"
        START=$(date -u +"%s")
        set +e
        while :
@ -755,14 +794,14 @@ if [ ! -z $sc_args ]; then
                DIFF=$((($NOW-$START)/60))
                # fail after 5 minutes of attempts
                if [ $DIFF -ge 5 ]; then
-                    echo "Could not mount storage cluster $sc_arg on: $mountpoint"
+                    echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
                    exit 1
                fi
                sleep 1
            fi
        done
        set -e
-        echo "$mountpoint mounted."
+        echo "INFO: $mountpoint mounted."
        i=$(($i + 1))
    done
 fi
@ -805,6 +844,8 @@ p2p=$p2p
 `env | grep DOCKER_LOGIN_`
 EOF
    chmod 600 $envfile
+    # pull image
+    docker_pull_image alfpark/batch-shipyard:cascade-$version
    # launch container
    docker run $detached --net=host --env-file $envfile \
        -v /var/run/docker.sock:/var/run/docker.sock \
@ -847,7 +888,7 @@ if [ $p2penabled -eq 0 ]; then
    wait $cascadepid
    rc=$?
    if [ $rc -ne 0 ]; then
-        echo "cascade exited with non-zero exit code: $rc"
+        echo "ERROR: cascade exited with non-zero exit code: $rc"
        rm -f $nodeprepfinished
        exit $rc
    fi
@ -859,7 +900,7 @@ rm -f $cascadefailed

 # block until images ready if specified
 if [ ! -z $block ]; then
-    echo "blocking until images ready: $block"
+    echo "INFO: blocking until images ready: $block"
    IFS=',' read -ra RES <<< "$block"
    declare -a missing
    while :
@ -870,7 +911,7 @@ if [ ! -z $block ]; then
            fi
        done
        if [ ${#missing[@]} -eq 0 ]; then
-            echo "all docker images present"
+            echo "INFO: all docker images present"
            break
        else
            unset missing
--- a/scripts/shipyard_nodeprep_customimage.sh
+++ b/scripts/shipyard_nodeprep_customimage.sh
@ -81,6 +81,16 @@ done
 shift $((OPTIND-1))
 [ "$1" = "--" ] && shift

+# Test whether a string contains a substring.
+# $1: string to search
+# $2: substring to look for
+# Returns 0 if $2 occurs in $1, 1 otherwise.
+contains() {
+    string="$1"
+    substring="$2"
+    # quote the substring so it is matched literally instead of being
+    # interpreted as a glob pattern by the prefix-removal expansion
+    if test "${string#*"$substring"}" != "$string"; then
+        return 0
+    else
+        return 1
+    fi
+}
+
 check_for_buggy_ntfs_mount() {
    # Check to ensure sdb1 mount is not mounted as ntfs
    set +e
@ -249,6 +259,35 @@ install_azurefile_docker_volume_driver() {
    ./azurefile-dockervolume-create.sh
 }

+# Pull a docker image, retrying on transient registry errors.
+# $1: image to pull
+# Retries up to 60 times, sleeping a random 1-5 seconds between attempts,
+# while the pull output contains "toomanyrequests" or
+# "connection reset by peer". Any other failure, or running out of
+# retries, exits the script with the exit code of docker pull.
+# errexit is disabled for the duration of the loop and re-enabled on return.
+docker_pull_image() {
+    image=$1
+    set +e
+    retries=60
+    while [ $retries -gt 0 ]; do
+        pull_out=$(docker pull "$image" 2>&1)
+        rc=$?
+        if [ $rc -eq 0 ]; then
+            echo "$pull_out"
+            break
+        fi
+        # non-zero exit code: check if pull output has toomanyrequests or
+        # connection resets. contains is a shell function and has to be
+        # invoked directly; placing it inside [ ] would pass its name to
+        # test as a plain word and the retry branch would never be taken
+        if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
+            # printf is used because bash's echo does not expand \n
+            printf 'WARNING: will retry:\n%s\n' "$pull_out"
+        else
+            printf 'ERROR:\n%s\n' "$pull_out"
+            exit $rc
+        fi
+        retries=$((retries - 1))
+        if [ $retries -le 0 ]; then
+            echo "ERROR: Could not pull docker image: $image"
+            exit $rc
+        fi
+        sleep $((RANDOM % 5 + 1))s
+    done
+    set -e
+}
+
 # try to get /etc/lsb-release
 if [ -e /etc/lsb-release ]; then
    . /etc/lsb-release
@ -397,8 +436,8 @@ if [ ! -z $sc_args ]; then
 fi

 # retrieve docker images related to data movement
-docker pull alfpark/blobxfer:$blobxferversion
-docker pull alfpark/batch-shipyard:tfm-$version
+docker_pull_image alfpark/blobxfer:$blobxferversion
+docker_pull_image alfpark/batch-shipyard:tfm-$version

 # login to registry server
 if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
@ -442,6 +481,8 @@ p2p=$p2p
 `env | grep DOCKER_LOGIN_`
 EOF
 chmod 600 $envfile
+# pull image
+docker_pull_image alfpark/batch-shipyard:cascade-$version
 # launch container
 docker run $detached --net=host --env-file $envfile \
    -v /var/run/docker.sock:/var/run/docker.sock \
--- a/shipyard.py
+++ b/shipyard.py
@ -1181,6 +1181,10 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):


@pool.command('delnode')
+@click.option(
+    '--all-start-task-failed',
+    is_flag=True,
+    help='Deleted all nodes with start task failed state')
@click.option(
    '--nodeid', help='NodeId of compute node in pool to delete')
@common_options
@ -1188,10 +1192,11 @@ def pool_ssh(ctx, cardinal, nodeid, tty, command):
@keyvault_options
@aad_options
@pass_cli_context
-def pool_delnode(ctx, nodeid):
+def pool_delnode(ctx, all_start_task_failed, nodeid):
    """Delete a node from a pool"""
    ctx.initialize_for_batch()
-    convoy.fleet.action_pool_delnode(ctx.batch_client, ctx.config, nodeid)
+    convoy.fleet.action_pool_delnode(
+        ctx.batch_client, ctx.config, all_start_task_failed, nodeid)


@pool.command('rebootnode')