diff --git a/config_templates/pool.yaml b/config_templates/pool.yaml index b944bbf..84e6447 100644 --- a/config_templates/pool.yaml +++ b/config_templates/pool.yaml @@ -122,6 +122,7 @@ pool_specification: gpu: nvidia_driver: source: https://some.url + ignore_warnings: false batch_insights_enabled: false prometheus: node_exporter: diff --git a/convoy/fleet.py b/convoy/fleet.py index 745b671..2fd4e4f 100644 --- a/convoy/fleet.py +++ b/convoy/fleet.py @@ -1719,6 +1719,14 @@ def _construct_pool_object( value=','.join(pool_settings.prometheus.ca_options) ) ) + # gpu env vars + if pool_settings.gpu_ignore_warnings: + pool.start_task.environment_settings.append( + batchmodels.EnvironmentSetting( + name='SHIPYARD_GPU_IGNORE_WARNINGS', + value='1' + ) + ) # batch insights if pool_settings.batch_insights_enabled: pool.start_task.environment_settings.append( diff --git a/convoy/settings.py b/convoy/settings.py index 74e8e8c..c93516f 100644 --- a/convoy/settings.py +++ b/convoy/settings.py @@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple( 'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable', 'container_runtimes_install', 'container_runtimes_default', 'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips', + 'gpu_ignore_warnings', ] ) SSHSettings = collections.namedtuple( @@ -1396,9 +1397,11 @@ def pool_settings(config): (rac.starting_port > 49000 and rac.starting_port <= 55000) or rac.starting_port > 64536): raise ValueError('starting_port is invalid or in a reserved range') - # gpu driver + # gpu settings + gpu = _kv_read_checked(conf, 'gpu', default={}) + gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False) try: - gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source') + gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source') except KeyError: gpu_driver = None # additional node prep @@ -1474,6 +1477,7 @@ def pool_settings(config): password=rdp_password, ), gpu_driver=gpu_driver, + 
gpu_ignore_warnings=gpu_ignore_warnings, additional_node_prep=additional_node_prep, virtual_network=virtual_network_settings( conf, diff --git a/docs/13-batch-shipyard-configuration-pool.md b/docs/13-batch-shipyard-configuration-pool.md index f46c1b2..f13bb65 100644 --- a/docs/13-batch-shipyard-configuration-pool.md +++ b/docs/13-batch-shipyard-configuration-pool.md @@ -137,6 +137,7 @@ pool_specification: gpu: nvidia_driver: source: https://some.url + ignore_warnings: false batch_insights_enabled: false prometheus: node_exporter: @@ -565,9 +566,14 @@ commands to execute on node start. * (optional) `gpu` property defines additional information for NVIDIA GPU-enabled VMs. If not specified, Batch Shipyard will automatically download the driver for the `vm_size` specified. - * `nvidia_driver` property contains the following required members: - * `source` is the source url to download the driver. This should be - the silent-installable driver package. + * (optional) `nvidia_driver` property contains the following members: + * (required) `source` is the source url to download the driver. This + should be the silent-installable driver package. + * (optional) `ignore_warnings` property allows overriding the default + behavior to place the node in start task failed state if during node + prep there are warnings of possible GPU issues such as infoROM + corruption. It is recommended not to set this value to `true`. The + default, if not specified, is `false`. * (optional) `batch_insights_enabled` property enables [Batch Insights](https://github.com/Azure/batch-insights) monitoring for the pool.
This provides simple non-realtime, host-based monitoring through diff --git a/schemas/pool.yaml b/schemas/pool.yaml index 481f1b4..6e633eb 100644 --- a/schemas/pool.yaml +++ b/schemas/pool.yaml @@ -323,6 +323,9 @@ mapping: mapping: source: type: str + required: true + ignore_warnings: + type: bool additional_node_prep: type: map mapping: diff --git a/scripts/shipyard_nodeprep.sh b/scripts/shipyard_nodeprep.sh index ce4be2d..33c869f 100755 --- a/scripts/shipyard_nodeprep.sh +++ b/scripts/shipyard_nodeprep.sh @@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() { } query_nvidia_card() { + if [ "$SHIPYARD_GPU_IGNORE_WARNINGS" -eq 1 ]; then + set +e + fi nvidia-smi -q nvidia-smi + if [ "$SHIPYARD_GPU_IGNORE_WARNINGS" -eq 1 ]; then + set -e + fi } check_for_nvidia_on_custom_or_native() { @@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}" echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB" echo "Install LIS: $lis" echo "GPU: $gpu" +echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS" echo "Azure Blob: $azureblob" echo "Azure File: $azurefile" echo "GlusterFS on compute: $gluster_on_compute"