From 8eb2197d232975a05d44d669cab0541be65d39da Mon Sep 17 00:00:00 2001
From: Fred Park <fred.park@microsoft.com>
Date: Thu, 6 Jul 2017 11:12:05 -0700
Subject: [PATCH] Allow CentOS 7.3 on NC/NV

---
 .gitignore                               |   2 +-
 convoy/fleet.py                          |  27 ++--
 convoy/settings.py                       |  31 +++++
 docs/20-batch-shipyard-usage.md          |   7 +-
 recipes/CNTK-GPU-OpenMPI/README.md       |  11 +-
 recipes/Caffe-GPU/README.md              |  11 +-
 recipes/Chainer-GPU/README.md            |  11 +-
 recipes/FFmpeg-GPU/README.md             |  11 +-
 recipes/Keras+Theano-GPU/README.md       |  11 +-
 recipes/MXNet-GPU/README.md              |  11 +-
 recipes/NAMD-GPU/README.md               |  12 +-
 recipes/TensorFlow-Distributed/README.md |  14 +-
 recipes/TensorFlow-GPU/README.md         |  11 +-
 recipes/Torch-GPU/README.md              |  11 +-
 scripts/shipyard_nodeprep.sh             | 157 ++++++++++++++---------
 15 files changed, 201 insertions(+), 137 deletions(-)

diff --git a/.gitignore b/.gitignore
index 53af23f..68fd9c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,5 +97,5 @@ resources/azurefile-dockervolume-create.sh
 resources/azurefile-dockervolumedriver
 resources/azurefile-dockervolumedriver.env
 resources/docker-registry-v2.tar.gz
-resources/nvidia-docker.deb
+resources/nvidia-docker.*
 resources/nvidia-driver*.run
diff --git a/convoy/fleet.py b/convoy/fleet.py
index e9d894b..94f8df4 100644
--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@@ -73,6 +73,16 @@ _AZUREFILE_DVD_BIN = {
     ),
     'target': 'resources/azurefile-dockervolumedriver'
 }
+__NVIDIA_DOCKER_RPM = {  # NOTE(review): siblings use one leading underscore
+    'url': (  # nvidia-docker v1.0.1 rpm, used for both centos offers
+        'https://github.com/NVIDIA/nvidia-docker/releases/download/'
+        'v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm'
+    ),
+    'sha256': (
+        'f05dfe7fe655ed39c399db0d6362e351b059f2708c3e6da17f590a000237ec3a'
+    ),
+    'target': 'resources/nvidia-docker.rpm'  # joined onto _ROOT_PATH
+}
 _NVIDIA_DOCKER = {
     'ubuntuserver': {
         'url': (
@@ -84,6 +94,8 @@ _NVIDIA_DOCKER = {
         ),
         'target': 'resources/nvidia-docker.deb'
     },
+    'centos': __NVIDIA_DOCKER_RPM,
+    'centos-hpc': __NVIDIA_DOCKER_RPM,
 }
 _NVIDIA_DRIVER = {
     'compute': {
@@ -339,9 +351,6 @@ def _setup_nvidia_docker_package(blob_client, config):
     :return: package path
     """
     offer = settings.pool_offer(config, lower=True)
-    if offer != 'ubuntuserver':
-        raise ValueError('Offer {} is unsupported with nvidia docker'.format(
-            offer))
     pkg = pathlib.Path(_ROOT_PATH, _NVIDIA_DOCKER[offer]['target'])
     # check to see if package is downloaded
     if (not pkg.exists() or
@@ -1534,6 +1543,7 @@ def _adjust_settings_for_pool_creation(config):
     # enforce publisher/offer/sku restrictions
     allowed = False
     shipyard_container_required = True
+    # oracle linux is not supported due to UEKR4 requirement
     if publisher == 'canonical':
         if offer == 'ubuntuserver':
             if sku.startswith('14.04'):
@@ -1560,13 +1570,10 @@ def _adjust_settings_for_pool_creation(config):
         elif offer == 'opensuse-leap':
             if sku >= '42':
                 allowed = True
-    # check for valid image if gpu, currently only ubuntu 16.04 is supported
-    if (settings.is_gpu_pool(pool.vm_size) and
-            util.is_none_or_empty(node_agent) and
-            (publisher != 'canonical' and offer != 'ubuntuserver' and
-             sku < '16.04')):
-        allowed = False
-    # oracle linux is not supported due to UEKR4 requirement
+    # check if allowed for gpu (if gpu vm size)
+    if allowed:
+        allowed = settings.gpu_configuration_check(
+            config, vm_size=pool.vm_size)
     if not allowed and util.is_none_or_empty(node_agent):
         raise ValueError(
             ('Unsupported Docker Host VM Config, publisher={} offer={} '
diff --git a/convoy/settings.py b/convoy/settings.py
index f3ad8aa..59892eb 100644
--- a/convoy/settings.py
+++ b/convoy/settings.py
@@ -403,6 +403,37 @@ def get_gpu_type_from_vm_size(vm_size):
         return None
 
 
+def gpu_configuration_check(config, vm_size=None):
+    # type: (dict, str) -> bool
+    """Check if OS is allowed with a GPU VM
+    :param dict config: configuration dict
+    :param str vm_size: vm size, if not specified taken from pool settings
+    :rtype: bool
+    :return: if configuration is allowed
+    """
+    # fall back to the vm size in the pool settings if none was given
+    if util.is_none_or_empty(vm_size):
+        vm_size = pool_settings(config).vm_size
+    # if this is not a gpu sku, always allow
+    if not is_gpu_pool(vm_size):
+        return True
+    # always allow gpu with custom images
+    node_agent = pool_custom_image_node_agent(config)
+    if util.is_not_empty(node_agent):
+        return True
+    # check for platform image support
+    publisher = pool_publisher(config, lower=True)
+    offer = pool_offer(config, lower=True)
+    sku = pool_sku(config, lower=True)
+    if publisher == 'canonical':
+        # inclusive bound so that a bare '16.04' sku is accepted as well
+        return offer == 'ubuntuserver' and sku >= '16.04'
+    if publisher == 'openlogic':
+        return offer in ('centos', 'centos-hpc') and sku == '7.3'
+    # gpus are not supported on any other platform image publisher
+    return False
+
+
 def is_rdma_pool(vm_size):
     # type: (str) -> bool
     """Check if pool is IB/RDMA capable
diff --git a/docs/20-batch-shipyard-usage.md b/docs/20-batch-shipyard-usage.md
index 6b4e5a5..67ded24 100644
--- a/docs/20-batch-shipyard-usage.md
+++ b/docs/20-batch-shipyard-usage.md
@@ -20,10 +20,11 @@ you can invoke as:
 shipyard.cmd
 ```
 
-If on Mac, you will need to invoke the Python interpreter and pass
-the script as an argument. For example:
+If you installed manually (i.e., did not use the installer scripts), then
+you will need to invoke the Python interpreter and pass the script as an
+argument. For example:
 ```
-python shipyard.py
+python3 shipyard.py
 ```
 
 The `-h` or `--help` option will list the available options, which are
diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md
index 5451f10..66dbfdf 100644
--- a/recipes/CNTK-GPU-OpenMPI/README.md
+++ b/recipes/CNTK-GPU-OpenMPI/README.md
@@ -16,12 +16,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 * `inter_node_communication_enabled` must be set to `true`
 * `max_tasks_per_node` must be set to 1 or omitted
 
diff --git a/recipes/Caffe-GPU/README.md b/recipes/Caffe-GPU/README.md
index 707b6f4..d791da9 100644
--- a/recipes/Caffe-GPU/README.md
+++ b/recipes/Caffe-GPU/README.md
@@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/Chainer-GPU/README.md b/recipes/Chainer-GPU/README.md
index 236f9f6..c03c8b5 100644
--- a/recipes/Chainer-GPU/README.md
+++ b/recipes/Chainer-GPU/README.md
@@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/FFmpeg-GPU/README.md b/recipes/FFmpeg-GPU/README.md
index 9c67d59..56260c5 100644
--- a/recipes/FFmpeg-GPU/README.md
+++ b/recipes/FFmpeg-GPU/README.md
@@ -14,12 +14,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because FFmpeg is for transforming
 audio/video, it is best to choose `NV` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/Keras+Theano-GPU/README.md b/recipes/Keras+Theano-GPU/README.md
index c8d4cf5..cd39238 100644
--- a/recipes/Keras+Theano-GPU/README.md
+++ b/recipes/Keras+Theano-GPU/README.md
@@ -14,12 +14,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md
index 56570b6..879e278 100644
--- a/recipes/MXNet-GPU/README.md
+++ b/recipes/MXNet-GPU/README.md
@@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 * `inter_node_communication_enabled` must be set to `true`
 * `max_tasks_per_node` must be set to 1 or omitted
 
diff --git a/recipes/NAMD-GPU/README.md b/recipes/NAMD-GPU/README.md
index c4119f5..792eb1b 100644
--- a/recipes/NAMD-GPU/README.md
+++ b/recipes/NAMD-GPU/README.md
@@ -15,13 +15,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because NAMD is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
-* `max_tasks_per_node` must be set to 1 or omitted
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md
index 136f88a..bff7cea 100644
--- a/recipes/TensorFlow-Distributed/README.md
+++ b/recipes/TensorFlow-Distributed/README.md
@@ -16,12 +16,14 @@ GPUs:
 instances feature M60 GPUs for visualization workloads. Because TensorFlow is
 a GPU-accelerated compute application, it is best to choose `NC` VM instances.
 If not using GPUs, another appropriate SKU can be selected.
-* `publisher` should be `Canonical` if using GPUs. Other publishers will be
-supported once they are available for N-series VMs.
-* `offer` should be `UbuntuServer` if using GPUs. Other offers will be
-supported once they are available for N-series VMs.
-* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported
-once they are available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic` if using GPUs. Other
+      supported publishers can be used if not.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic
+      if using GPUs. Other supported offers can be used if not.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS if using
+      GPUs. Other supported skus can be used if not.
 
 If on multiple CPUs:
 * `max_tasks_per_node` must be set to 1 or omitted
diff --git a/recipes/TensorFlow-GPU/README.md b/recipes/TensorFlow-GPU/README.md
index 92d762e..6e77dcf 100644
--- a/recipes/TensorFlow-GPU/README.md
+++ b/recipes/TensorFlow-GPU/README.md
@@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because TensorFlow is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/recipes/Torch-GPU/README.md b/recipes/Torch-GPU/README.md
index e283dee..688ba92 100644
--- a/recipes/Torch-GPU/README.md
+++ b/recipes/Torch-GPU/README.md
@@ -13,12 +13,11 @@ The pool configuration should enable the following properties:
 K80 GPUs for GPU compute acceleration while `NV` VM instances feature
 M60 GPUs for visualization workloads. Because Torch is a GPU-accelerated
 compute application, it is best to choose `NC` VM instances.
-* `publisher` should be `Canonical`. Other publishers will be supported
-once they are available for N-series VMs.
-* `offer` should be `UbuntuServer`. Other offers will be supported once they
-are available for N-series VMs.
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are
-available for N-series VMs.
+* `vm_configuration` is the VM configuration
+  * `platform_image` specifies to use a platform image
+    * `publisher` should be `Canonical` or `OpenLogic`.
+    * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic.
+    * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS.
 
 ### Global Configuration
 The global configuration should set the following properties:
diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh
index e5b6ddb..a37b38f 100755
--- a/scripts/shipyard_nodeprep.sh
+++ b/scripts/shipyard_nodeprep.sh
@@ -154,6 +154,93 @@ check_for_nvidia_card() {
     fi
 }
 
+install_nvidia_software() {
+    # Install the nvidia driver and nvidia-docker, then create the driver volume.
+    # $1: offer; also reads the global $gpu as "<is NV-series>:<driver>:<nvidia-docker pkg>"
+    offer=$1
+    shift
+    # check for nvidia card
+    check_for_nvidia_card
+    # split arg into three: NV-series flag, driver file, nvidia-docker package
+    IFS=':' read -ra GPUARGS <<< "$gpu"
+    nvdriver=${GPUARGS[1]}
+    nvdocker=${GPUARGS[2]}
+    # remove nouveau only if loaded, rmmod fails on a module that is not loaded
+    if grep -q '^nouveau ' /proc/modules; then rmmod nouveau; fi
+    # purge nouveau off system
+    if [ "$offer" == "ubuntuserver" ]; then
+        apt-get --purge remove -y xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04
+    elif [[ $offer == centos* ]]; then
+        yum erase -y xorg-x11-drv-nouveau
+    else
+        echo "ERROR: unsupported distribution for nvidia/GPU, offer: $offer"
+        exit 1
+    fi
+    # blacklist nouveau from being loaded if rebooted
+cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF
+blacklist nouveau
+blacklist lbm-nouveau
+options nouveau modeset=0
+alias nouveau off
+alias lbm-nouveau off
+EOF
+    # get development essentials for nvidia driver
+    if [ "$offer" == "ubuntuserver" ]; then
+        install_packages "$offer" build-essential
+    elif [[ $offer == centos* ]]; then
+        install_packages "$offer" gcc binutils make "kernel-devel-$(uname -r)"
+    fi
+    # get additional dependency if NV-series VMs
+    if [ "${GPUARGS[0]}" == "True" ]; then
+        if [ "$offer" == "ubuntuserver" ]; then
+            install_packages "$offer" xserver-xorg-dev
+        elif [[ $offer == centos* ]]; then
+            install_packages "$offer" xorg-x11-server-devel
+        fi
+    fi
+    # install driver
+    "./$nvdriver" -s
+    # add flag to config template for GRID driver
+    if [ "${GPUARGS[0]}" == "True" ]; then
+        echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template
+    fi
+    # install nvidia-docker
+    if [ "$offer" == "ubuntuserver" ]; then
+        dpkg -i "$nvdocker"
+    elif [[ $offer == centos* ]]; then
+        rpm -Uvh "$nvdocker"
+    fi
+    # enable and start nvidia docker service
+    systemctl enable nvidia-docker.service
+    systemctl start nvidia-docker.service
+    systemctl status nvidia-docker.service
+    # get driver version
+    nvdriverver=$(grep "Kernel Module" /proc/driver/nvidia/version | cut -d ' ' -f 9)
+    echo "nvidia driver version $nvdriverver detected"
+    # create the docker volume now to avoid volume driver conflicts for
+    # tasks. run this in a loop as it can fail if triggered too quickly
+    # after start
+    NV_START=$(date -u +"%s")
+    set +e
+    while :; do
+        echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
+        if docker volume create -d nvidia-docker --name "nvidia_driver_$nvdriverver"; then
+            docker volume list
+            break
+        else
+            NV_NOW=$(date -u +"%s")
+            NV_DIFF=$((($NV_NOW-$NV_START)/60))
+            # fail after 5 minutes of attempts
+            if [ $NV_DIFF -ge 5 ]; then
+                echo "ERROR: could not create nvidia-docker volume"
+                exit 1
+            fi
+            sleep 1
+        fi
+    done
+    set -e
+}
+
 install_azurefile_docker_volume_driver() {
     chown root:root azurefile-dockervolumedriver*
     chmod 755 azurefile-dockervolumedriver
@@ -178,6 +265,8 @@ install_azurefile_docker_volume_driver() {
     # create docker volumes
     chmod +x azurefile-dockervolume-create.sh
     ./azurefile-dockervolume-create.sh
+    # list volumes
+    docker volume list
 }
 
 refresh_package_index() {
@@ -464,67 +553,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
     $srvstatus
     # install gpu related items
     if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
-        # check for nvidia card
-        check_for_nvidia_card
-        # split arg into two
-        IFS=':' read -ra GPUARGS <<< "$gpu"
-        # remove nouveau
-        rmmod nouveau
-        apt-get --purge remove xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04
-        # blacklist nouveau from being loaded if rebooted
-cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF
-blacklist nouveau
-blacklist lbm-nouveau
-options nouveau modeset=0
-alias nouveau off
-alias lbm-nouveau off
-EOF
-        nvdriver=${GPUARGS[1]}
-        nvdocker=${GPUARGS[2]}
-        # get development essentials for nvidia driver
-        install_packages $offer build-essential
-        # get additional dependency if NV-series VMs
-        if [ ${GPUARGS[0]} == "True" ]; then
-            install_packages $offer xserver-xorg-dev
-        fi
-        # install driver
-        ./$nvdriver -s
-        # add flag to config template for GRID driver
-        if [ ${GPUARGS[0]} == "True" ]; then
-            echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template
-        fi
-        # install nvidia-docker
-        dpkg -i $nvdocker
-        # enable and start nvidia docker service
-        systemctl enable nvidia-docker.service
-        systemctl start nvidia-docker.service
-        systemctl status nvidia-docker.service
-        # get driver version
-        nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9`
-        echo nvidia driver version $nvdriverver detected
-        # create the docker volume now to avoid volume driver conflicts for
-        # tasks. run this in a loop as it can fail if triggered too quickly
-        # after start
-        NV_START=$(date -u +"%s")
-        set +e
-        while :
-        do
-            echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
-            docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver
-            if [ $? -eq 0 ]; then
-                break
-            else
-                NV_NOW=$(date -u +"%s")
-                NV_DIFF=$((($NV_NOW-$NV_START)/60))
-                # fail after 5 minutes of attempts
-                if [ $NV_DIFF -ge 5 ]; then
-                    echo "ERROR: could not create nvidia-docker volume"
-                    exit 1
-                fi
-                sleep 1
-            fi
-        done
-        set -e
+        install_nvidia_software $offer
     fi
     # set up glusterfs
     if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
@@ -568,7 +597,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
         exit 1
     fi
     # gpu is not supported on these offers
-    if [ ! -z $gpu ]; then
+    if [[ ! -z $gpu ]] && [[ $offer != centos* ]]; then
         echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
         exit 1
     fi
@@ -618,6 +647,10 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l
     if [ $azurefile -eq 1 ]; then
         install_azurefile_docker_volume_driver $offer $sku
     fi
+    # install gpu related items
+    if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
+        install_nvidia_software $offer
+    fi
     # set up glusterfs
     if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
         install_packages $offer epel-release centos-release-gluster38