diff --git a/.gitignore b/.gitignore index 53af23f..68fd9c3 100644 --- a/.gitignore +++ b/.gitignore @@ -97,5 +97,5 @@ resources/azurefile-dockervolume-create.sh resources/azurefile-dockervolumedriver resources/azurefile-dockervolumedriver.env resources/docker-registry-v2.tar.gz -resources/nvidia-docker.deb +resources/nvidia-docker.* resources/nvidia-driver*.run diff --git a/convoy/fleet.py b/convoy/fleet.py index e9d894b..94f8df4 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -73,6 +73,17 @@ _AZUREFILE_DVD_BIN = { ), 'target': 'resources/azurefile-dockervolumedriver' } +# nvidia-docker rpm package shared by the centos and centos-hpc offers +_NVIDIA_DOCKER_RPM = { + 'url': ( + 'https://github.com/NVIDIA/nvidia-docker/releases/download/' + 'v1.0.1/nvidia-docker-1.0.1-1.x86_64.rpm' + ), + 'sha256': ( + 'f05dfe7fe655ed39c399db0d6362e351b059f2708c3e6da17f590a000237ec3a' + ), + 'target': 'resources/nvidia-docker.rpm' +} _NVIDIA_DOCKER = { 'ubuntuserver': { 'url': ( @@ -84,6 +95,8 @@ _NVIDIA_DOCKER = { ), 'target': 'resources/nvidia-docker.deb' }, + 'centos': _NVIDIA_DOCKER_RPM, + 'centos-hpc': _NVIDIA_DOCKER_RPM, } _NVIDIA_DRIVER = { 'compute': { @@ -339,9 +352,9 @@ def _setup_nvidia_docker_package(blob_client, config): :return: package path """ offer = settings.pool_offer(config, lower=True) - if offer != 'ubuntuserver': + if offer not in _NVIDIA_DOCKER: raise ValueError('Offer {} is unsupported with nvidia docker'.format( offer)) pkg = pathlib.Path(_ROOT_PATH, _NVIDIA_DOCKER[offer]['target']) # check to see if package is downloaded if (not pkg.exists() or @@ -1534,6 +1547,7 @@ def _adjust_settings_for_pool_creation(config): # enforce publisher/offer/sku restrictions allowed = False shipyard_container_required = True + # oracle linux is not supported due to UEKR4 requirement if publisher == 'canonical': if offer == 'ubuntuserver': if sku.startswith('14.04'): @@ -1560,13 +1574,10 @@ def _adjust_settings_for_pool_creation(config): elif offer == 'opensuse-leap': if sku >= '42': allowed = True - # check for valid image if gpu, currently only ubuntu 16.04 is
supported - if (settings.is_gpu_pool(pool.vm_size) and - util.is_none_or_empty(node_agent) and - (publisher != 'canonical' and offer != 'ubuntuserver' and - sku < '16.04')): - allowed = False - # oracle linux is not supported due to UEKR4 requirement + # check if allowed for gpu (if gpu vm size) + if allowed: + allowed = settings.gpu_configuration_check( + config, vm_size=pool.vm_size) if not allowed and util.is_none_or_empty(node_agent): raise ValueError( ('Unsupported Docker Host VM Config, publisher={} offer={} ' diff --git a/convoy/settings.py b/convoy/settings.py index f3ad8aa..59892eb 100644 --- a/convoy/settings.py +++ b/convoy/settings.py @@ -403,6 +403,37 @@ def get_gpu_type_from_vm_size(vm_size): return None +def gpu_configuration_check(config, vm_size=None): + # type: (dict, str) -> bool + """Check if OS is allowed with a GPU VM + :param dict config: configuration dict + :param str vm_size: vm size, taken from the pool settings if not given + :rtype: bool + :return: if configuration is allowed + """ + # if this is not a gpu sku, always allow + if util.is_none_or_empty(vm_size): + vm_size = pool_settings(config).vm_size + if not is_gpu_pool(vm_size): + return True + # always allow gpu with custom images + node_agent = pool_custom_image_node_agent(config) + if util.is_not_empty(node_agent): + return True + # check for platform image support + publisher = pool_publisher(config, lower=True) + offer = pool_offer(config, lower=True) + sku = pool_sku(config, lower=True) + if (publisher == 'canonical' and offer == 'ubuntuserver' and + sku >= '16.04'): + return True + elif (publisher == 'openlogic' and + (offer == 'centos' or offer == 'centos-hpc') and sku == '7.3'): + return True + else: + return False + + def is_rdma_pool(vm_size): # type: (str) -> bool """Check if pool is IB/RDMA capable diff --git a/docs/20-batch-shipyard-usage.md b/docs/20-batch-shipyard-usage.md index 6b4e5a5..67ded24 100644 --- a/docs/20-batch-shipyard-usage.md +++ b/docs/20-batch-shipyard-usage.md @@ -20,10 +20,11 @@ you can invoke
as: shipyard.cmd ``` -If on Mac, you will need to invoke the Python interpreter and pass -the script as an argument. For example: +If you installed manually (i.e., did not use the installer scripts), then +you will need to invoke the Python interpreter and pass the script as an +argument. For example: ``` -python shipyard.py +python3 shipyard.py ``` The `-h` or `--help` option will list the available options, which are diff --git a/recipes/CNTK-GPU-OpenMPI/README.md b/recipes/CNTK-GPU-OpenMPI/README.md index 5451f10..66dbfdf 100644 --- a/recipes/CNTK-GPU-OpenMPI/README.md +++ b/recipes/CNTK-GPU-OpenMPI/README.md @@ -16,12 +16,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. 
* `inter_node_communication_enabled` must be set to `true` * `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/Caffe-GPU/README.md b/recipes/Caffe-GPU/README.md index 707b6f4..d791da9 100644 --- a/recipes/Caffe-GPU/README.md +++ b/recipes/Caffe-GPU/README.md @@ -13,12 +13,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. ### Global Configuration The global configuration should set the following properties: diff --git a/recipes/Chainer-GPU/README.md b/recipes/Chainer-GPU/README.md index 236f9f6..c03c8b5 100644 --- a/recipes/Chainer-GPU/README.md +++ b/recipes/Chainer-GPU/README.md @@ -13,12 +13,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. 
-* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. ### Global Configuration The global configuration should set the following properties: diff --git a/recipes/FFmpeg-GPU/README.md b/recipes/FFmpeg-GPU/README.md index 9c67d59..56260c5 100644 --- a/recipes/FFmpeg-GPU/README.md +++ b/recipes/FFmpeg-GPU/README.md @@ -14,12 +14,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because FFmpeg is for transforming audio/video, it is best to choose `NV` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. 
### Global Configuration The global configuration should set the following properties: diff --git a/recipes/Keras+Theano-GPU/README.md b/recipes/Keras+Theano-GPU/README.md index c8d4cf5..cd39238 100644 --- a/recipes/Keras+Theano-GPU/README.md +++ b/recipes/Keras+Theano-GPU/README.md @@ -14,12 +14,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because Caffe is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. ### Global Configuration The global configuration should set the following properties: diff --git a/recipes/MXNet-GPU/README.md b/recipes/MXNet-GPU/README.md index 56570b6..879e278 100644 --- a/recipes/MXNet-GPU/README.md +++ b/recipes/MXNet-GPU/README.md @@ -13,12 +13,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because CNTK is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. 
Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. * `inter_node_communication_enabled` must be set to `true` * `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/NAMD-GPU/README.md b/recipes/NAMD-GPU/README.md index c4119f5..792eb1b 100644 --- a/recipes/NAMD-GPU/README.md +++ b/recipes/NAMD-GPU/README.md @@ -15,13 +15,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because NAMD is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. -* `max_tasks_per_node` must be set to 1 or omitted +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. ### Global Configuration The global configuration should set the following properties: diff --git a/recipes/TensorFlow-Distributed/README.md b/recipes/TensorFlow-Distributed/README.md index 136f88a..bff7cea 100644 --- a/recipes/TensorFlow-Distributed/README.md +++ b/recipes/TensorFlow-Distributed/README.md @@ -16,12 +16,14 @@ GPUs: instances feature M60 GPUs for visualization workloads. 
Because TensorFlow is a GPU-accelerated compute application, it is best to choose `NC` VM instances. If not using GPUs, another appropriate SKU can be selected. -* `publisher` should be `Canonical` if using GPUs. Other publishers will be -supported once they are available for N-series VMs. -* `offer` should be `UbuntuServer` if using GPUs. Other offers will be -supported once they are available for N-series VMs. -* `sku` should be `16.04-LTS` if using GPUs. Other skus will be supported -once they are available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic` if using GPUs. Other + supported publishers can be used if not. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic + if using GPUs. Other supported offers can be used if not. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS if using + GPUs. Other supported skus can be used if not. If on multiple CPUs: * `max_tasks_per_node` must be set to 1 or omitted diff --git a/recipes/TensorFlow-GPU/README.md b/recipes/TensorFlow-GPU/README.md index 92d762e..6e77dcf 100644 --- a/recipes/TensorFlow-GPU/README.md +++ b/recipes/TensorFlow-GPU/README.md @@ -13,12 +13,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because TensorFlow is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. 
+* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. ### Global Configuration The global configuration should set the following properties: diff --git a/recipes/Torch-GPU/README.md b/recipes/Torch-GPU/README.md index e283dee..688ba92 100644 --- a/recipes/Torch-GPU/README.md +++ b/recipes/Torch-GPU/README.md @@ -13,12 +13,11 @@ The pool configuration should enable the following properties: K80 GPUs for GPU compute acceleration while `NV` VM instances feature M60 GPUs for visualization workloads. Because Torch is a GPU-accelerated compute application, it is best to choose `NC` VM instances. -* `publisher` should be `Canonical`. Other publishers will be supported -once they are available for N-series VMs. -* `offer` should be `UbuntuServer`. Other offers will be supported once they -are available for N-series VMs. -* `sku` should be `16.04-LTS`. Other skus will be supported once they are -available for N-series VMs. +* `vm_configuration` is the VM configuration + * `platform_image` specifies to use a platform image + * `publisher` should be `Canonical` or `OpenLogic`. + * `offer` should be `UbuntuServer` for Canonical or `CentOS` for OpenLogic. + * `sku` should be `16.04-LTS` for Ubuntu or `7.3` for CentOS. 
### Global Configuration The global configuration should set the following properties: diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh index e5b6ddb..a37b38f 100755 --- a/scripts/shipyard_nodeprep.sh +++ b/scripts/shipyard_nodeprep.sh @@ -154,6 +154,93 @@ check_for_nvidia_card() { fi } +install_nvidia_software() { + offer=$1 + shift + # check for nvidia card + check_for_nvidia_card + # split gpu arg on ':' into NV-series flag, driver file and nvidia-docker package + IFS=':' read -ra GPUARGS <<< "$gpu" + nvdriver=${GPUARGS[1]} + nvdocker=${GPUARGS[2]} + # remove nouveau; tolerate failure when the module is not loaded + rmmod nouveau || true + # purge nouveau off system + if [ "$offer" == "ubuntuserver" ]; then + apt-get --purge remove -y xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04 + elif [[ $offer == centos* ]]; then + yum erase -y xorg-x11-drv-nouveau + else + echo "ERROR: unsupported distribution for nvidia/GPU, offer: $offer" + exit 1 + fi + # blacklist nouveau from being loaded if rebooted +cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF +blacklist nouveau +blacklist lbm-nouveau +options nouveau modeset=0 +alias nouveau off +alias lbm-nouveau off +EOF + # get development essentials for nvidia driver + if [ "$offer" == "ubuntuserver" ]; then + install_packages $offer build-essential + elif [[ $offer == centos* ]]; then + install_packages $offer gcc binutils make "kernel-devel-$(uname -r)" + fi + # get additional dependency if NV-series VMs + if [ "${GPUARGS[0]}" == "True" ]; then + if [ "$offer" == "ubuntuserver" ]; then + install_packages $offer xserver-xorg-dev + elif [[ $offer == centos* ]]; then + install_packages $offer xorg-x11-server-devel + fi + fi + # install driver + "./$nvdriver" -s + # add flag to config template for GRID driver + if [ "${GPUARGS[0]}" == "True" ]; then + echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template + fi + # install nvidia-docker + if [ "$offer" == "ubuntuserver" ]; then + dpkg -i "$nvdocker" + elif [[ $offer == centos* ]]; then + rpm -Uvh "$nvdocker" + fi + # enable and start nvidia docker service + systemctl
enable nvidia-docker.service + systemctl start nvidia-docker.service + systemctl status nvidia-docker.service + # get driver version + nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9` + echo nvidia driver version $nvdriverver detected + # create the docker volume now to avoid volume driver conflicts for + # tasks. run this in a loop as it can fail if triggered too quickly + # after start + NV_START=$(date -u +"%s") + set +e + while : + do + echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver" + docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver + if [ $? -eq 0 ]; then + docker volume list + break + else + NV_NOW=$(date -u +"%s") + NV_DIFF=$((($NV_NOW-$NV_START)/60)) + # fail after 5 minutes of attempts + if [ $NV_DIFF -ge 5 ]; then + echo "ERROR: could not create nvidia-docker volume" + exit 1 + fi + sleep 1 + fi + done + set -e +} + install_azurefile_docker_volume_driver() { chown root:root azurefile-dockervolumedriver* chmod 755 azurefile-dockervolumedriver @@ -178,6 +265,8 @@ install_azurefile_docker_volume_driver() { # create docker volumes chmod +x azurefile-dockervolume-create.sh ./azurefile-dockervolume-create.sh + # list volumes + docker volume list } refresh_package_index() { @@ -464,67 +553,7 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then $srvstatus # install gpu related items if [ ! -z $gpu ] && [ !
-f $nodeprepfinished ]; then - # check for nvidia card - check_for_nvidia_card - # split arg into two - IFS=':' read -ra GPUARGS <<< "$gpu" - # remove nouveau - rmmod nouveau - apt-get --purge remove xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04 - # blacklist nouveau from being loaded if rebooted -cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF -blacklist nouveau -blacklist lbm-nouveau -options nouveau modeset=0 -alias nouveau off -alias lbm-nouveau off -EOF - nvdriver=${GPUARGS[1]} - nvdocker=${GPUARGS[2]} - # get development essentials for nvidia driver - install_packages $offer build-essential - # get additional dependency if NV-series VMs - if [ ${GPUARGS[0]} == "True" ]; then - install_packages $offer xserver-xorg-dev - fi - # install driver - ./$nvdriver -s - # add flag to config template for GRID driver - if [ ${GPUARGS[0]} == "True" ]; then - echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template - fi - # install nvidia-docker - dpkg -i $nvdocker - # enable and start nvidia docker service - systemctl enable nvidia-docker.service - systemctl start nvidia-docker.service - systemctl status nvidia-docker.service - # get driver version - nvdriverver=`cat /proc/driver/nvidia/version | grep "Kernel Module" | cut -d ' ' -f 9` - echo nvidia driver version $nvdriverver detected - # create the docker volume now to avoid volume driver conflicts for - # tasks. run this in a loop as it can fail if triggered too quickly - # after start - NV_START=$(date -u +"%s") - set +e - while : - do - echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver" - docker volume create -d nvidia-docker --name nvidia_driver_$nvdriverver - if [ $? 
-eq 0 ]; then - break - else - NV_NOW=$(date -u +"%s") - NV_DIFF=$((($NV_NOW-$NV_START)/60)) - # fail after 5 minutes of attempts - if [ $NV_DIFF -ge 5 ]; then - echo "ERROR: could not create nvidia-docker volume" - exit 1 - fi - sleep 1 - fi - done - set -e + install_nvidia_software $offer fi # set up glusterfs if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then @@ -568,7 +597,7 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l exit 1 fi # gpu is not supported on these offers - if [ ! -z $gpu ]; then + if [[ ! -z $gpu ]] && [[ $offer != centos* ]]; then echo "ERROR: gpu unsupported on this sku: $sku for offer $offer" exit 1 fi @@ -618,6 +647,10 @@ elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-l if [ $azurefile -eq 1 ]; then install_azurefile_docker_volume_driver $offer $sku fi + # install gpu related items + if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then + install_nvidia_software $offer + fi # set up glusterfs if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then install_packages $offer epel-release centos-release-gluster38