Add ignore GPU warnings option
This commit is contained in:
Родитель
7a4595d44f
Коммит
a54f872326
|
@ -122,6 +122,7 @@ pool_specification:
|
|||
gpu:
|
||||
nvidia_driver:
|
||||
source: https://some.url
|
||||
ignore_warnings: false
|
||||
batch_insights_enabled: false
|
||||
prometheus:
|
||||
node_exporter:
|
||||
|
|
|
@ -1719,6 +1719,14 @@ def _construct_pool_object(
|
|||
value=','.join(pool_settings.prometheus.ca_options)
|
||||
)
|
||||
)
|
||||
# gpu env vars
|
||||
if pool_settings.gpu_ignore_warnings:
|
||||
pool.start_task.environment_settings.append(
|
||||
batchmodels.EnvironmentSetting(
|
||||
name='SHIPYARD_GPU_IGNORE_WARNINGS',
|
||||
value='1'
|
||||
)
|
||||
)
|
||||
# batch insights
|
||||
if pool_settings.batch_insights_enabled:
|
||||
pool.start_task.environment_settings.append(
|
||||
|
|
|
@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple(
|
|||
'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
|
||||
'container_runtimes_install', 'container_runtimes_default',
|
||||
'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
|
||||
'gpu_ignore_warnings',
|
||||
]
|
||||
)
|
||||
SSHSettings = collections.namedtuple(
|
||||
|
@ -1396,9 +1397,11 @@ def pool_settings(config):
|
|||
(rac.starting_port > 49000 and rac.starting_port <= 55000) or
|
||||
rac.starting_port > 64536):
|
||||
raise ValueError('starting_port is invalid or in a reserved range')
|
||||
# gpu driver
|
||||
# gpu settings
|
||||
gpu = _kv_read_checked(conf, 'gpu', default={})
|
||||
gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False)
|
||||
try:
|
||||
gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source')
|
||||
gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source')
|
||||
except KeyError:
|
||||
gpu_driver = None
|
||||
# additional node prep
|
||||
|
@ -1474,6 +1477,7 @@ def pool_settings(config):
|
|||
password=rdp_password,
|
||||
),
|
||||
gpu_driver=gpu_driver,
|
||||
gpu_ignore_warnings=gpu_ignore_warnings,
|
||||
additional_node_prep=additional_node_prep,
|
||||
virtual_network=virtual_network_settings(
|
||||
conf,
|
||||
|
|
|
@ -137,6 +137,7 @@ pool_specification:
|
|||
gpu:
|
||||
nvidia_driver:
|
||||
source: https://some.url
|
||||
ignore_warnings: false
|
||||
batch_insights_enabled: false
|
||||
prometheus:
|
||||
node_exporter:
|
||||
|
@ -565,9 +566,14 @@ commands to execute on node start.
|
|||
* (optional) `gpu` property defines additional information for NVIDIA
|
||||
GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
|
||||
the driver for the `vm_size` specified.
|
||||
* `nvidia_driver` property contains the following required members:
|
||||
* `source` is the source url to download the driver. This should be
|
||||
the silent-installable driver package.
|
||||
* (optional) `nvidia_driver` property contains the following members:
|
||||
* (required) `source` is the source url to download the driver. This
|
||||
should be the silent-installable driver package.
|
||||
* (optional) `ignore_warnings` property allows overriding the default
|
||||
behavior to place the node in start task failed state if during node
|
||||
prep there are warnings of possible GPU issues such as infoROM
|
||||
corruption. It is recommended not to set this value to `true`. The
|
||||
default, if not specified, is `false`.
|
||||
* (optional) `batch_insights_enabled` property enables
|
||||
[Batch Insights](https://github.com/Azure/batch-insights) monitoring for
|
||||
the pool. This provides simple non-realtime, host-based monitoring through
|
||||
|
|
|
@ -323,6 +323,9 @@ mapping:
|
|||
mapping:
|
||||
source:
|
||||
type: str
|
||||
required: true
|
||||
ignore_warnings:
|
||||
type: bool
|
||||
additional_node_prep:
|
||||
type: map
|
||||
mapping:
|
||||
|
|
|
@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() {
|
|||
}
|
||||
|
||||
# Query the NVIDIA GPU(s) on this node by running nvidia-smi, first in full
# query mode (-q) and then in the default summary mode.
#
# Environment:
#   SHIPYARD_GPU_IGNORE_WARNINGS  when set to 1, errexit is disabled around
#       the nvidia-smi calls so a non-zero exit status from them does not
#       abort the script. The variable is only exported when the pool option
#       is enabled, so an unset/empty value is treated as 0.
query_nvidia_card() {
    # Default to 0 so the integer test below never sees an empty operand
    # (which would emit "integer expression expected" on stderr).
    local ignore_warnings="${SHIPYARD_GPU_IGNORE_WARNINGS:-0}"
    if [ "$ignore_warnings" -eq 1 ]; then
        set +e
    fi
    nvidia-smi -q
    nvidia-smi
    # Restore errexit only if it was turned off above.
    if [ "$ignore_warnings" -eq 1 ]; then
        set -e
    fi
}
|
||||
|
||||
check_for_nvidia_on_custom_or_native() {
|
||||
|
@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}"
|
|||
echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
|
||||
echo "Install LIS: $lis"
|
||||
echo "GPU: $gpu"
|
||||
echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS"
|
||||
echo "Azure Blob: $azureblob"
|
||||
echo "Azure File: $azurefile"
|
||||
echo "GlusterFS on compute: $gluster_on_compute"
|
||||
|
|
Загрузка…
Ссылка в новой задаче