Add ignore GPU warnings option

This commit is contained in:
Fred Park 2019-12-12 20:42:47 +00:00
Родитель 7a4595d44f
Коммит a54f872326
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 3C4D545F457737EB
6 изменённых файлов: 34 добавлений и 5 удалений

Просмотреть файл

@ -122,6 +122,7 @@ pool_specification:
gpu:
nvidia_driver:
source: https://some.url
ignore_warnings: false
batch_insights_enabled: false
prometheus:
node_exporter:

Просмотреть файл

@ -1719,6 +1719,14 @@ def _construct_pool_object(
value=','.join(pool_settings.prometheus.ca_options)
)
)
# gpu env vars
if pool_settings.gpu_ignore_warnings:
pool.start_task.environment_settings.append(
batchmodels.EnvironmentSetting(
name='SHIPYARD_GPU_IGNORE_WARNINGS',
value='1'
)
)
# batch insights
if pool_settings.batch_insights_enabled:
pool.start_task.environment_settings.append(

Просмотреть файл

@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple(
'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
'container_runtimes_install', 'container_runtimes_default',
'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
'gpu_ignore_warnings',
]
)
SSHSettings = collections.namedtuple(
@ -1396,9 +1397,11 @@ def pool_settings(config):
(rac.starting_port > 49000 and rac.starting_port <= 55000) or
rac.starting_port > 64536):
raise ValueError('starting_port is invalid or in a reserved range')
# gpu driver
# gpu settings
gpu = _kv_read_checked(conf, 'gpu', default={})
gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False)
try:
gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source')
gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source')
except KeyError:
gpu_driver = None
# additional node prep
@ -1474,6 +1477,7 @@ def pool_settings(config):
password=rdp_password,
),
gpu_driver=gpu_driver,
gpu_ignore_warnings=gpu_ignore_warnings,
additional_node_prep=additional_node_prep,
virtual_network=virtual_network_settings(
conf,

Просмотреть файл

@ -137,6 +137,7 @@ pool_specification:
gpu:
nvidia_driver:
source: https://some.url
ignore_warnings: false
batch_insights_enabled: false
prometheus:
node_exporter:
@ -565,9 +566,14 @@ commands to execute on node start.
* (optional) `gpu` property defines additional information for NVIDIA
GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
the driver for the `vm_size` specified.
* `nvidia_driver` property contains the following required members:
* `source` is the source url to download the driver. This should be
the silent-installable driver package.
* (optional) `nvidia_driver` property contains the following members:
* (required) `source` is the source url to download the driver. This
should be the silent-installable driver package.
* (optional) `ignore_warnings` property allows overriding the default
behavior to place the node in start task failed state if during node
prep there are warnings of possible GPU issues such as infoROM
corruption. It is recommended not to set this value to `true`. The
default, if not specified, is `false`.
* (optional) `batch_insights_enabled` property enables
[Batch Insights](https://github.com/Azure/batch-insights) monitoring for
the pool. This provides simple non-realtime, host-based monitoring through

Просмотреть файл

@ -323,6 +323,9 @@ mapping:
mapping:
source:
type: str
required: true
ignore_warnings:
type: bool
additional_node_prep:
type: map
mapping:

Просмотреть файл

@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() {
}
query_nvidia_card() {
    # Query the NVIDIA card(s) with nvidia-smi: first the full detailed
    # report (-q), then the default summary output.
    #
    # If SHIPYARD_GPU_IGNORE_WARNINGS is "1", errexit is turned off while
    # the queries run so a non-zero nvidia-smi exit status does not abort
    # the script; errexit is turned back on afterwards.
    #
    # The variable may be unset or empty when the option is not enabled,
    # so default it to 0 and compare as a string. A numeric test
    # ([ "" -eq 1 ]) would print "integer expression expected" and an
    # unset reference would fail under `set -u`.
    local ignore_warnings="${SHIPYARD_GPU_IGNORE_WARNINGS:-0}"
    if [ "$ignore_warnings" = "1" ]; then
        set +e
    fi
    nvidia-smi -q
    nvidia-smi
    if [ "$ignore_warnings" = "1" ]; then
        # NOTE(review): assumes the script was running with errexit
        # enabled before this function was called - confirm.
        set -e
    fi
}
check_for_nvidia_on_custom_or_native() {
@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}"
echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
echo "Install LIS: $lis"
echo "GPU: $gpu"
echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS"
echo "Azure Blob: $azureblob"
echo "Azure File: $azurefile"
echo "GlusterFS on compute: $gluster_on_compute"