Add ignore GPU warnings option

2019-12-12 20:42:47 +00:00 · 2019-12-12 20:42:47 +00:00 · a54f872326
--- a/config_templates/pool.yaml
+++ b/config_templates/pool.yaml
@ -122,6 +122,7 @@ pool_specification:
  gpu:
    nvidia_driver:
      source: https://some.url
+    ignore_warnings: false
  batch_insights_enabled: false
  prometheus:
    node_exporter:
--- a/convoy/fleet.py
+++ b/convoy/fleet.py
@ -1719,6 +1719,14 @@ def _construct_pool_object(
                        value=','.join(pool_settings.prometheus.ca_options)
                    )
                )
+        # gpu env vars
+        if pool_settings.gpu_ignore_warnings:
+            pool.start_task.environment_settings.append(
+                batchmodels.EnvironmentSetting(
+                    name='SHIPYARD_GPU_IGNORE_WARNINGS',
+                    value='1'
+                )
+            )
    # batch insights
    if pool_settings.batch_insights_enabled:
        pool.start_task.environment_settings.append(
--- a/convoy/settings.py
+++ b/convoy/settings.py
@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple(
        'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
        'container_runtimes_install', 'container_runtimes_default',
        'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
+        'gpu_ignore_warnings',
    ]
 )
 SSHSettings = collections.namedtuple(
@ -1396,9 +1397,11 @@ def pool_settings(config):
            (rac.starting_port > 49000 and rac.starting_port <= 55000) or
            rac.starting_port > 64536):
        raise ValueError('starting_port is invalid or in a reserved range')
-    # gpu driver
+    # gpu settings
+    gpu = _kv_read_checked(conf, 'gpu', default={})
+    gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False)
    try:
-        gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source')
+        gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source')
    except KeyError:
        gpu_driver = None
    # additional node prep
@ -1474,6 +1477,7 @@ def pool_settings(config):
            password=rdp_password,
        ),
        gpu_driver=gpu_driver,
+        gpu_ignore_warnings=gpu_ignore_warnings,
        additional_node_prep=additional_node_prep,
        virtual_network=virtual_network_settings(
            conf,
--- a/docs/13-batch-shipyard-configuration-pool.md
+++ b/docs/13-batch-shipyard-configuration-pool.md
@ -137,6 +137,7 @@ pool_specification:
  gpu:
    nvidia_driver:
      source: https://some.url
+    ignore_warnings: false
  batch_insights_enabled: false
  prometheus:
    node_exporter:
@ -565,9 +566,14 @@ commands to execute on node start.
 * (optional) `gpu` property defines additional information for NVIDIA
 GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
 the driver for the `vm_size` specified.
-    * `nvidia_driver` property contains the following required members:
-        * `source` is the source url to download the driver. This should be
-          the silent-installable driver package.
+    * (optional) `nvidia_driver` property contains the following members:
+        * (required) `source` is the source url to download the driver. This
+          should be the silent-installable driver package.
+    * (optional) `ignore_warnings` property allows overriding the default
+      behavior to place the node in start task failed state if during node
+      prep there are warnings of possible GPU issues such as infoROM
+      corruption. It is recommended not to set this value to `true`. The
+      default, if not specified, is `false`.
 * (optional) `batch_insights_enabled` property enables
 [Batch Insights](https://github.com/Azure/batch-insights) monitoring for
 the pool. This provides simple non-realtime, host-based monitoring through
--- a/schemas/pool.yaml
+++ b/schemas/pool.yaml
@ -323,6 +323,9 @@ mapping:
            mapping:
              source:
                type: str
+                required: true
+          ignore_warnings:
+            type: bool
      additional_node_prep:
        type: map
        mapping:
--- a/scripts/shipyard_nodeprep.sh
+++ b/scripts/shipyard_nodeprep.sh
@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() {
 }

 query_nvidia_card() {
+    if [ "${SHIPYARD_GPU_IGNORE_WARNINGS:-0}" -eq 1 ]; then  # unset/empty => 0
+        set +e  # errexit off so a non-zero nvidia-smi exit is tolerated
+    fi
    nvidia-smi -q
    nvidia-smi
+    if [ "${SHIPYARD_GPU_IGNORE_WARNINGS:-0}" -eq 1 ]; then  # same guard as above
+        set -e  # turn errexit back on for the rest of the script
+    fi
 }

 check_for_nvidia_on_custom_or_native() {
@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}"
 echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
 echo "Install LIS: $lis"
 echo "GPU: $gpu"
+echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS"
 echo "Azure Blob: $azureblob"
 echo "Azure File: $azurefile"
 echo "GlusterFS on compute: $gluster_on_compute"