From 983a7eed4575675233c2a51c6611a2c77d34095f Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 4 May 2017 22:30:44 -0700 Subject: [PATCH] Node prep script improvements - Blacklist nouveau universally on GPU VMs - Change URL retrieval to requests - Update requirements to latest --- convoy/fleet.py | 35 +++++++++++++++++++--------- docs/02-batch-shipyard-quickstart.md | 3 ++- requirements.txt | 5 ++-- scripts/shipyard_nodeprep.sh | 17 +++++++------- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/convoy/fleet.py b/convoy/fleet.py index e244555..31518b3 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -36,11 +36,8 @@ try: import pathlib2 as pathlib except ImportError: import pathlib +import requests import time -try: - import urllib.request as urllibreq -except ImportError: - import urllib as urllibreq import uuid # non-stdlib imports import azure.batch.models as batchmodels @@ -63,6 +60,7 @@ from .version import __version__ logger = logging.getLogger(__name__) util.setup_logger(logger) # global defines +_REQUEST_CHUNK_SIZE = 4194304 _ROOT_PATH = pathlib.Path(__file__).resolve().parent.parent _AZUREFILE_DVD_BIN = { 'url': ( @@ -72,6 +70,7 @@ _AZUREFILE_DVD_BIN = { 'sha256': ( '288f809a1290ea8daf89d222507bda9b3709a9665cec8b70354a50252395e127' ), + 'target': 'resources/azurefile-dockervolumedriver' } _NVIDIA_DOCKER = { 'ubuntuserver': { @@ -275,12 +274,18 @@ def _setup_nvidia_driver_package(blob_client, config, vm_size): raise RuntimeError( 'Cannot proceed with deployment due to non-agreement with ' 'license for NVIDIA driver') + else: + logger.info('NVIDIA Software License accepted') # download driver logger.debug('downloading NVIDIA driver to {}'.format( _NVIDIA_DRIVER['target'])) - response = urllibreq.urlopen(_NVIDIA_DRIVER[gpu_type]['url']) + response = requests.get(_NVIDIA_DRIVER[gpu_type]['url'], stream=True) with pkg.open('wb') as f: - f.write(response.read()) + for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE): + if 
chunk: + f.write(chunk) + logger.debug('wrote {} bytes to {}'.format( + pkg.stat().st_size, _NVIDIA_DRIVER['target'])) # check sha256 if (util.compute_sha256_for_file(pkg, False) != _NVIDIA_DRIVER[gpu_type]['sha256']): @@ -308,9 +313,13 @@ def _setup_nvidia_docker_package(blob_client, config): # download package logger.debug('downloading NVIDIA docker to {}'.format( _NVIDIA_DOCKER[offer]['target'])) - response = urllibreq.urlopen(_NVIDIA_DOCKER[offer]['url']) + response = requests.get(_NVIDIA_DOCKER[offer]['url'], stream=True) with pkg.open('wb') as f: - f.write(response.read()) + for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE): + if chunk: + f.write(chunk) + logger.debug('wrote {} bytes to {}'.format( + pkg.stat().st_size, _NVIDIA_DOCKER[offer]['target'])) # check sha256 if (util.compute_sha256_for_file(pkg, False) != _NVIDIA_DOCKER[offer]['sha256']): @@ -331,15 +340,19 @@ def _setup_azurefile_volume_driver(blob_client, config): offer = settings.pool_offer(config, lower=True) sku = settings.pool_sku(config, lower=True) # check to see if binary is downloaded - bin = pathlib.Path(_ROOT_PATH, 'resources/azurefile-dockervolumedriver') + bin = pathlib.Path(_ROOT_PATH, _AZUREFILE_DVD_BIN['target']) if (not bin.exists() or util.compute_sha256_for_file(bin, False) != _AZUREFILE_DVD_BIN['sha256']): # download package logger.debug('downloading Azure File Docker Volume Driver') - response = urllibreq.urlopen(_AZUREFILE_DVD_BIN['url']) + response = requests.get(_AZUREFILE_DVD_BIN['url'], stream=True) with bin.open('wb') as f: - f.write(response.read()) + for chunk in response.iter_content(chunk_size=_REQUEST_CHUNK_SIZE): + if chunk: + f.write(chunk) + logger.debug('wrote {} bytes to {}'.format( + bin.stat().st_size, _AZUREFILE_DVD_BIN['target'])) # check sha256 if (util.compute_sha256_for_file(bin, False) != _AZUREFILE_DVD_BIN['sha256']): diff --git a/docs/02-batch-shipyard-quickstart.md b/docs/02-batch-shipyard-quickstart.md index f278932..405ec53 100644 
--- a/docs/02-batch-shipyard-quickstart.md +++ b/docs/02-batch-shipyard-quickstart.md @@ -22,7 +22,8 @@ to your local machine has been completed. 2. Create a directory to hold your configuration files. For this quickstart guide, create a directory named `config`. 3. Copy the sample configuration files from the Deep Learning framework recipe -of your choice to the `config` directory: +of your choice to the `config` directory (please note that some Docker images +are very large, such as CNTK, which will lead to longer pool allocation time): * [CNTK-CPU-OpenMPI](../recipes/CNTK-CPU-OpenMPI/config/singlenode/) * [Caffe-CPU](../recipes/Caffe-CPU/config/) * [Chainer-CPU](../recipes/Chainer-CPU/config/) diff --git a/requirements.txt b/requirements.txt index e4f27f0..fb7ab14 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,8 @@ azure-batch==2.0.1 azure-keyvault==0.2.0 azure-mgmt-batch==3.0.1 azure-mgmt-compute==1.0.0rc1 -azure-mgmt-network==1.0.0rc2 -azure-mgmt-resource==1.0.0rc1 +azure-mgmt-network==1.0.0rc3 +azure-mgmt-resource==1.0.0rc2 azure-storage==0.34.0 blobxfer==0.12.1 click==6.7 @@ -12,4 +12,5 @@ future==0.16.0 msrest==0.4.7 msrestazure==0.4.7 pathlib2==2.2.1; python_version < '3.5' +requests==2.13.0 scandir==1.5; python_version < '3.5' diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh index cefa54a..0123f5d 100755 --- a/scripts/shipyard_nodeprep.sh +++ b/scripts/shipyard_nodeprep.sh @@ -132,13 +132,15 @@ check_for_buggy_ntfs_mount() { check_for_nvidia_card() { set +e - lspci - lspci | grep -i nvidia > /dev/null + out=$(lspci) + echo "$out" | grep -i nvidia > /dev/null if [ $? -ne 0 ]; then + echo "$out" echo "ERROR: No Nvidia card(s) detected!" 
exit 1 fi set -e + echo "$out" } install_azurefile_docker_volume_driver() { @@ -376,12 +378,10 @@ if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then check_for_nvidia_card # split arg into two IFS=':' read -ra GPUARGS <<< "$gpu" - # take special actions if we're on NV-series VMs - if [ ${GPUARGS[0]} == "True" ]; then - # remove nouveau - apt-get --purge remove xserver-xorg-video-nouveau - rmmod nouveau - # blacklist nouveau from being loaded if rebooted + # remove nouveau + apt-get --purge remove xserver-xorg-video-nouveau + rmmod nouveau + # blacklist nouveau from being loaded if rebooted cat > /etc/modprobe.d/blacklist-nouveau.conf << EOF blacklist nouveau blacklist lbm-nouveau @@ -389,7 +389,6 @@ options nouveau modeset=0 alias nouveau off alias lbm-nouveau off EOF - fi nvdriver=${GPUARGS[1]} nvdocker=${GPUARGS[2]} # get development essentials for nvidia driver