Add ignore GPU warnings option

This commit is contained in:
Fred Park 2019-12-12 20:42:47 +00:00
Родитель 7a4595d44f
Коммит a54f872326
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 3C4D545F457737EB
6 изменённых файлов: 34 добавлений и 5 удалений

Просмотреть файл

@ -122,6 +122,7 @@ pool_specification:
gpu: gpu:
nvidia_driver: nvidia_driver:
source: https://some.url source: https://some.url
ignore_warnings: false
batch_insights_enabled: false batch_insights_enabled: false
prometheus: prometheus:
node_exporter: node_exporter:

Просмотреть файл

@ -1719,6 +1719,14 @@ def _construct_pool_object(
value=','.join(pool_settings.prometheus.ca_options) value=','.join(pool_settings.prometheus.ca_options)
) )
) )
# gpu env vars
if pool_settings.gpu_ignore_warnings:
pool.start_task.environment_settings.append(
batchmodels.EnvironmentSetting(
name='SHIPYARD_GPU_IGNORE_WARNINGS',
value='1'
)
)
# batch insights # batch insights
if pool_settings.batch_insights_enabled: if pool_settings.batch_insights_enabled:
pool.start_task.environment_settings.append( pool.start_task.environment_settings.append(

Просмотреть файл

@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple(
'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable', 'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
'container_runtimes_install', 'container_runtimes_default', 'container_runtimes_install', 'container_runtimes_default',
'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips', 'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
'gpu_ignore_warnings',
] ]
) )
SSHSettings = collections.namedtuple( SSHSettings = collections.namedtuple(
@ -1396,9 +1397,11 @@ def pool_settings(config):
(rac.starting_port > 49000 and rac.starting_port <= 55000) or (rac.starting_port > 49000 and rac.starting_port <= 55000) or
rac.starting_port > 64536): rac.starting_port > 64536):
raise ValueError('starting_port is invalid or in a reserved range') raise ValueError('starting_port is invalid or in a reserved range')
# gpu driver # gpu settings
gpu = _kv_read_checked(conf, 'gpu', default={})
gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False)
try: try:
gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source') gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source')
except KeyError: except KeyError:
gpu_driver = None gpu_driver = None
# additional node prep # additional node prep
@ -1474,6 +1477,7 @@ def pool_settings(config):
password=rdp_password, password=rdp_password,
), ),
gpu_driver=gpu_driver, gpu_driver=gpu_driver,
gpu_ignore_warnings=gpu_ignore_warnings,
additional_node_prep=additional_node_prep, additional_node_prep=additional_node_prep,
virtual_network=virtual_network_settings( virtual_network=virtual_network_settings(
conf, conf,

Просмотреть файл

@ -137,6 +137,7 @@ pool_specification:
gpu: gpu:
nvidia_driver: nvidia_driver:
source: https://some.url source: https://some.url
ignore_warnings: false
batch_insights_enabled: false batch_insights_enabled: false
prometheus: prometheus:
node_exporter: node_exporter:
@ -565,9 +566,14 @@ commands to execute on node start.
* (optional) `gpu` property defines additional information for NVIDIA * (optional) `gpu` property defines additional information for NVIDIA
GPU-enabled VMs. If not specified, Batch Shipyard will automatically download GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
the driver for the `vm_size` specified. the driver for the `vm_size` specified.
* `nvidia_driver` property contains the following required members: * (optional) `nvidia_driver` property contains the following members:
* `source` is the source url to download the driver. This should be * (required) `source` is the source url to download the driver. This
the silent-installable driver package. should be the silent-installable driver package.
* (optional) `ignore_warnings` property allows overriding the default
behavior of placing the node in the start task failed state if, during
node prep, there are warnings of possible GPU issues such as infoROM
corruption. It is recommended not to set this value to `true`. The
default, if not specified, is `false`.
* (optional) `batch_insights_enabled` property enables * (optional) `batch_insights_enabled` property enables
[Batch Insights](https://github.com/Azure/batch-insights) monitoring for [Batch Insights](https://github.com/Azure/batch-insights) monitoring for
the pool. This provides simple non-realtime, host-based monitoring through the pool. This provides simple non-realtime, host-based monitoring through

Просмотреть файл

@ -323,6 +323,9 @@ mapping:
mapping: mapping:
source: source:
type: str type: str
required: true
ignore_warnings:
type: bool
additional_node_prep: additional_node_prep:
type: map type: map
mapping: mapping:

Просмотреть файл

@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() {
} }
# Query the NVIDIA card(s) via nvidia-smi and print the results.
#
# When SHIPYARD_GPU_IGNORE_WARNINGS is "1", errexit is temporarily disabled
# so that a non-zero exit from nvidia-smi does not abort node prep. The
# variable is only exported when the pool option is enabled, so it is
# defaulted to "0" here to avoid a "integer expression expected" error from
# `test` (and a hard failure under `set -u`) when it is unset or empty.
query_nvidia_card() {
    local ignore_warnings="${SHIPYARD_GPU_IGNORE_WARNINGS:-0}"
    if [ "$ignore_warnings" = "1" ]; then
        set +e
    fi
    nvidia-smi -q
    nvidia-smi
    if [ "$ignore_warnings" = "1" ]; then
        # restore errexit for the remainder of the script
        set -e
    fi
}
check_for_nvidia_on_custom_or_native() { check_for_nvidia_on_custom_or_native() {
@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}"
echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB" echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
echo "Install LIS: $lis" echo "Install LIS: $lis"
echo "GPU: $gpu" echo "GPU: $gpu"
echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS"
echo "Azure Blob: $azureblob" echo "Azure Blob: $azureblob"
echo "Azure File: $azurefile" echo "Azure File: $azurefile"
echo "GlusterFS on compute: $gluster_on_compute" echo "GlusterFS on compute: $gluster_on_compute"