Allow CentOS 7.3 on NC/NV

2017-07-06 11:12:05 -07:00 · 2017-07-06 11:12:05 -07:00 · 8eb2197d23
--- a/.gitignore
+++ b/.gitignore
@ -97,5 +97,5 @@ resources/azurefile-dockervolume-create.sh
 resources/azurefile-dockervolumedriver
 resources/azurefile-dockervolumedriver.env
 resources/docker-registry-v2.tar.gz
-resources/nvidia-docker.deb
+resources/nvidia-docker.*
 resources/nvidia-driver*.run
--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@ -73,6 +73,16 @@ _AZUREFILE_DVD_BIN = {
    ),
    'target': 'resources/azurefile-dockervolumedriver'
 }
 # nvidia-docker 1.0.1 RPM package descriptor: the download url, the sha256
 # hex digest recorded for that file, and the local target path of the
 # downloaded package. Shared by the 'centos' and 'centos-hpc' entries of
 # _NVIDIA_DOCKER.
 # NOTE(review): presumably the sha256 is verified after download -- confirm
 # against the download helper.
 __NVIDIA_DOCKER_RPM = {
    'url': (
        'https://github.com/NVIDIA/nvidia-docker/releases/download/'
        'v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm'
    ),
    'sha256': (
        'f05dfe7fe655ed39c399db0d6362e351b059f2708c3e6da17f590a000237ec3a'
    ),
    'target': 'resources/nvidia-docker.rpm'
 }
 _NVIDIA_DOCKER = {
    'ubuntuserver': {
        'url': (
@ -84,6 +94,8 @@ _NVIDIA_DOCKER = {
        ),
        'target': 'resources/nvidia-docker.deb'
    },
    'centos': __NVIDIA_DOCKER_RPM,
    'centos-hpc': __NVIDIA_DOCKER_RPM,
 }
 _NVIDIA_DRIVER = {
    'compute': {
@ -339,9 +351,6 @@ def _setup_nvidia_docker_package(blob_client, config):
    :return: package path
    """
    offer = settings.pool_offer(config, lower=True)
    if offer != 'ubuntuserver':
        raise ValueError('Offer {} is unsupported with nvidia docker'.format(
            offer))
    pkg = pathlib.Path(_ROOT_PATH, _NVIDIA_DOCKER[offer]['target'])
    # check to see if package is downloaded
    if (not pkg.exists() or
@ -1534,6 +1543,7 @@ def _adjust_settings_for_pool_creation(config):
    # enforce publisher/offer/sku restrictions
    allowed = False
    shipyard_container_required = True
    # oracle linux is not supported due to UEKR4 requirement
    if publisher == 'canonical':
        if offer == 'ubuntuserver':
            if sku.startswith('14.04'):
@ -1560,13 +1570,10 @@ def _adjust_settings_for_pool_creation(config):
        elif offer == 'opensuse-leap':
            if sku >= '42':
                allowed = True
-    # check for valid image if gpu, currently only ubuntu 16.04 is supported
+    # check if allowed for gpu (if gpu vm size)
-    if (settings.is_gpu_pool(pool.vm_size) and
+    if allowed:
-            util.is_none_or_empty(node_agent) and
+        allowed = settings.gpu_configuration_check(
-            (publisher != 'canonical' and offer != 'ubuntuserver' and
+            config, vm_size=pool.vm_size)
             sku < '16.04')):
        allowed = False
    # oracle linux is not supported due to UEKR4 requirement
    if not allowed and util.is_none_or_empty(node_agent):
        raise ValueError(
            ('Unsupported Docker Host VM Config, publisher={} offer={} '
--- a/convoy/settings.py
+++ b/convoy/settings.py
@ -403,6 +403,37 @@ def get_gpu_type_from_vm_size(vm_size):
        return None
 def gpu_configuration_check(config, vm_size=None):
    # type: (dict, str) -> bool
    """Check if the pool OS image is allowed with a GPU VM size

    Non-GPU VM sizes and custom images (a custom image node agent is
    configured) are always allowed. For platform images on a GPU VM size,
    only Canonical UbuntuServer with sku 16.04 or later and OpenLogic
    CentOS/CentOS-HPC with sku 7.3 are allowed.
    :param dict config: configuration dict
    :param str vm_size: vm size; if none or empty, the vm size from the
        pool settings in config is used
    :rtype: bool
    :return: if configuration is allowed
    """
    # fall back to the pool vm size if no vm size was given
    if util.is_none_or_empty(vm_size):
        vm_size = pool_settings(config).vm_size
    # if this is not a gpu sku, always allow
    if not is_gpu_pool(vm_size):
        return True
    # always allow gpu with custom images
    node_agent = pool_custom_image_node_agent(config)
    if util.is_not_empty(node_agent):
        return True
    # check for platform image support
    publisher = pool_publisher(config, lower=True)
    offer = pool_offer(config, lower=True)
    sku = pool_sku(config, lower=True)
    if publisher == 'canonical' and offer == 'ubuntuserver':
        # skus are compared lexicographically as strings; use >= so that
        # a bare '16.04' sku is accepted along with '16.04-lts', '16.10'
        return sku >= '16.04'
    if publisher == 'openlogic' and offer in ('centos', 'centos-hpc'):
        return sku == '7.3'
    return False
 def is_rdma_pool(vm_size):
    # type: (str) -> bool
    """Check if pool is IB/RDMA capable
--- a/docs/20-batch-shipyard-usage.md
+++ b/docs/20-batch-shipyard-usage.md
@ -20,10 +20,11 @@ you can invoke as:
 shipyard.cmd
 ```
-If on Mac, you will need to invoke the Python interpreter and pass
+If you installed manually (i.e., did not use the installer scripts), then
-the script as an argument. For example:
+you will need to invoke the Python interpreter and pass the script as an
 argument. For example:
 ```
-python shipyard.py
+python3 shipyard.py
 ```
 The `-h` or `--help` option will list the available options, which are
--- a/recipes/CNTK-GPU-OpenMPI/README.md
+++ b/recipes/CNTK-GPU-OpenMPI/README.md
@ -16,12 +16,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 * `inter_node_communication_enabled` must be set to `true`
 * `max_tasks_per_node` must be set to 1 or omitted
--- a/recipes/Caffe-GPU/README.md
+++ b/recipes/Caffe-GPU/README.md
@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/Chainer-GPU/README.md
+++ b/recipes/Chainer-GPU/README.md
@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Chainer is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/FFmpeg-GPU/README.md
+++ b/recipes/FFmpeg-GPU/README.md
@ -14,12 +14,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because FFmpeg is for transforming
 audio/video, it is best to choose `NV` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/Keras+Theano-GPU/README.md
+++ b/recipes/Keras+Theano-GPU/README.md
@ -14,12 +14,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Keras+Theano is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/MXNet-GPU/README.md
+++ b/recipes/MXNet-GPU/README.md
@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because MXNet is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 * `inter_node_communication_enabled` must be set to `true`
 * `max_tasks_per_node` must be set to 1 or omitted
--- a/recipes/NAMD-GPU/README.md
+++ b/recipes/NAMD-GPU/README.md
@ -15,13 +15,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because NAMD is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 * `max_tasks_per_node` must be set to 1 or omitted
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/TensorFlow-Distributed/README.md
+++ b/recipes/TensorFlow-Distributed/README.md
@ -16,12 +16,14 @@ GPUs:
 instances feature M60 GPUs for visualization workloads. Because TensorFlow is
 a GPU-accelerated compute application, it is best to choose `NC` VM instances.
 If not using GPUs, another appropriate SKU can be selected.
-* `publisher` should be `Canonical` if using GPUs. Other publishers will be
+* `vm_configuration` is the VM configuration
-supported once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer` if using GPUs. Other offers will be
+    * `publisher` should be `Canonical` or `OpenLogic` if using GPUs. Other
-supported once they are available for N-series VMs.
+      supported publishers can be used if not.
-* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic
-once they are available for N-series VMs.
+      if using GPUs. Other supported offers can be used if not.
    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS if using
      GPUs. Other supported skus can be used if not.
 If on multiple CPUs:
 * `max_tasks_per_node` must be set to 1 or omitted
--- a/recipes/TensorFlow-GPU/README.md
+++ b/recipes/TensorFlow-GPU/README.md
@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because TensorFlow is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/recipes/Torch-GPU/README.md
+++ b/recipes/Torch-GPU/README.md
@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Torch is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
+* `vm_configuration` is the VM configuration
-once they are available for N-series VMs.
+  * `platform_image` specifies to use a platform image
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
+    * `publisher` should be `Canonical` or `OpenLogic`.
-are available for N-series VMs.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 available for N-series VMs.
 ### Global Configuration
 The global configuration should set the following properties:
--- a/scripts/shipyard_nodeprep.sh
+++ b/scripts/shipyard_nodeprep.sh
@ -154,6 +154,93 @@ check_for_nvidia_card() {
    fi
 }
 # Install the NVIDIA driver and nvidia-docker on a GPU node, then create
 # the nvidia-docker driver volume.
 # Arguments:
 #   $1 - offer name; "ubuntuserver" and offers matching centos* are handled
 # Globals read:
 #   gpu - colon-separated string <flag>:<driver installer>:<nvidia-docker pkg>;
 #         the flag is compared against "True" to enable the NV-series steps
 # Exits with status 1 if the offer is unsupported or if the nvidia-docker
 # volume cannot be created within 5 minutes of attempts.
 install_nvidia_software() {
    offer=$1
    shift
    # check for nvidia card
    check_for_nvidia_card
    # split the gpu arg on ':' into the GPUARGS array:
    # [0] = NV-series flag, [1] = driver installer, [2] = nvidia-docker package
    IFS=':' read -ra GPUARGS <<< "$gpu"
    nvdriver=${GPUARGS[1]}
    nvdocker=${GPUARGS[2]}
    # remove nouveau
    rmmod nouveau
    # purge nouveau off system
    # NOTE(review): apt-get is invoked without -y here while yum uses -y --
    # confirm this cannot block on a prompt during unattended node prep
    if [ $offer == "ubuntuserver" ]; then
        apt-get --purge remove xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04
    elif [[ $offer == centos* ]]; then
        yum erase -y xorg-x11-drv-nouveau
    else
        echo "ERROR: unsupported distribution for nvidia/GPU, offer: $offer"
        exit 1
    fi
    # blacklist nouveau from being loaded if rebooted
 cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF
 blacklist nouveau
 blacklist lbm-nouveau
 options nouveau modeset=0
 alias nouveau off
 alias lbm-nouveau off
 EOF
    # get development essentials for nvidia driver
    # (on centos the kernel-devel package is pinned to the running kernel)
    if [ $offer == "ubuntuserver" ]; then
        install_packages $offer build-essential
    elif [[ $offer == centos* ]]; then
        install_packages $offer gcc binutils make "kernel-devel-$(uname -r)"
    fi
    # get additional dependency if NV-series VMs
    if [ ${GPUARGS[0]} == "True" ]; then
        if [ $offer == "ubuntuserver" ]; then
            install_packages $offer xserver-xorg-dev
        elif [[ $offer == centos* ]]; then
            install_packages $offer xorg-x11-server-devel
        fi
    fi
    # install driver; the installer is run from the current directory
    ./$nvdriver -s
    # add flag to config template for GRID driver
    if [ ${GPUARGS[0]} == "True" ]; then
        echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template
    fi
    # install nvidia-docker using the package manager matching the offer
    if [ $offer == "ubuntuserver" ]; then
        dpkg -i $nvdocker
    elif [[ $offer == centos* ]]; then
        rpm -Uvh $nvdocker
    fi
    # enable and start nvidia docker service
    systemctl enable nvidia-docker.service
    systemctl start nvidia-docker.service
    systemctl status nvidia-docker.service
    # get driver version: ninth space-delimited field of the "Kernel Module"
    # line in /proc/driver/nvidia/version
    nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9`
    echo nvidia driver version $nvdriverver detected
    # create the docker volume now to avoid volume driver conflicts for
    # tasks. run this in a loop as it can fail if triggered too quickly
    # after start
    NV_START=$(date -u +"%s")
    # turn off errexit so a failed volume create can be retried
    set +e
    while :
    do
        echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
        docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
        if [ $? -eq 0 ]; then
            docker volume list
            break
        else
            NV_NOW=$(date -u +"%s")
            # elapsed whole minutes since the first attempt
            NV_DIFF=$((($NV_NOW-$NV_START)/60))
            # fail after 5 minutes of attempts
            if [ $NV_DIFF -ge 5 ]; then
                echo "ERROR: could not create nvidia-docker volume"
                exit 1
            fi
            sleep 1
        fi
    done
    # turn errexit back on for the rest of the script
    set -e
 }
 install_azurefile_docker_volume_driver() {
    chown root:root azurefile-dockervolumedriver*
    chmod 755 azurefile-dockervolumedriver
@ -178,6 +265,8 @@ install_azurefile_docker_volume_driver() {
    # create docker volumes
    chmod +x azurefile-dockervolume-create.sh
    ./azurefile-dockervolume-create.sh
    # list volumes
    docker volume list
 }
 refresh_package_index() {
@ -464,67 +553,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
    $srvstatus
    # install gpu related items
    if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
-        # check for nvidia card
+        install_nvidia_software $offer
        check_for_nvidia_card
        # split arg into two
        IFS=':' read -ra GPUARGS <<< "$gpu"
        # remove nouveau
        rmmod nouveau
        apt-get --purge remove xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04
        # blacklist nouveau from being loaded if rebooted
 cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF
 blacklist nouveau
 blacklist lbm-nouveau
 options nouveau modeset=0
 alias nouveau off
 alias lbm-nouveau off
 EOF
        nvdriver=${GPUARGS[1]}
        nvdocker=${GPUARGS[2]}
        # get development essentials for nvidia driver
        install_packages $offer build-essential
        # get additional dependency if NV-series VMs
        if [ ${GPUARGS[0]} == "True" ]; then
            install_packages $offer xserver-xorg-dev
        fi
        # install driver
        ./$nvdriver -s
        # add flag to config template for GRID driver
        if [ ${GPUARGS[0]} == "True" ]; then
            echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template
        fi
        # install nvidia-docker
        dpkg -i $nvdocker
        # enable and start nvidia docker service
        systemctl enable nvidia-docker.service
        systemctl start nvidia-docker.service
        systemctl status nvidia-docker.service
        # get driver version
        nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9`
        echo nvidia driver version $nvdriverver detected
        # create the docker volume now to avoid volume driver conflicts for
        # tasks. run this in a loop as it can fail if triggered too quickly
        # after start
        NV_START=$(date -u +"%s")
        set +e
        while :
        do
            echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
            docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
            if [ $? -eq 0 ]; then
                break
            else
                NV_NOW=$(date -u +"%s")
                NV_DIFF=$((($NV_NOW-$NV_START)/60))
                # fail after 5 minutes of attempts
                if [ $NV_DIFF -ge 5 ]; then
                    echo "ERROR: could not create nvidia-docker volume"
                    exit 1
                fi
                sleep 1
            fi
        done
        set -e
    fi
    # set up glusterfs
    if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
@ -568,7 +597,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
        exit 1
    fi
    # gpu is not supported on these offers
-    if [ ! -z $gpu ]; then
+    if [[ ! -z $gpu ]] && [[ $offer != centos* ]]; then
        echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
        exit 1
    fi
@ -618,6 +647,10 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
    if [ $azurefile -eq 1 ]; then
        install_azurefile_docker_volume_driver $offer $sku
    fi
    # install gpu related items
    if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
        install_nvidia_software $offer
    fi
    # set up glusterfs
    if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
        install_packages $offer epel-release centos-release-gluster38