Add ignore GPU warnings option
This commit is contained in:
Родитель
7a4595d44f
Коммит
a54f872326
|
@ -122,6 +122,7 @@ pool_specification:
|
||||||
gpu:
|
gpu:
|
||||||
nvidia_driver:
|
nvidia_driver:
|
||||||
source: https://some.url
|
source: https://some.url
|
||||||
|
ignore_warnings: false
|
||||||
batch_insights_enabled: false
|
batch_insights_enabled: false
|
||||||
prometheus:
|
prometheus:
|
||||||
node_exporter:
|
node_exporter:
|
||||||
|
|
|
@ -1719,6 +1719,14 @@ def _construct_pool_object(
|
||||||
value=','.join(pool_settings.prometheus.ca_options)
|
value=','.join(pool_settings.prometheus.ca_options)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# gpu env vars
|
||||||
|
if pool_settings.gpu_ignore_warnings:
|
||||||
|
pool.start_task.environment_settings.append(
|
||||||
|
batchmodels.EnvironmentSetting(
|
||||||
|
name='SHIPYARD_GPU_IGNORE_WARNINGS',
|
||||||
|
value='1'
|
||||||
|
)
|
||||||
|
)
|
||||||
# batch insights
|
# batch insights
|
||||||
if pool_settings.batch_insights_enabled:
|
if pool_settings.batch_insights_enabled:
|
||||||
pool.start_task.environment_settings.append(
|
pool.start_task.environment_settings.append(
|
||||||
|
|
|
@ -231,6 +231,7 @@ PoolSettings = collections.namedtuple(
|
||||||
'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
|
'certificates', 'prometheus', 'upload_diagnostics_logs_on_unusable',
|
||||||
'container_runtimes_install', 'container_runtimes_default',
|
'container_runtimes_install', 'container_runtimes_default',
|
||||||
'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
|
'per_job_auto_scratch', 'batch_insights_enabled', 'public_ips',
|
||||||
|
'gpu_ignore_warnings',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
SSHSettings = collections.namedtuple(
|
SSHSettings = collections.namedtuple(
|
||||||
|
@ -1396,9 +1397,11 @@ def pool_settings(config):
|
||||||
(rac.starting_port > 49000 and rac.starting_port <= 55000) or
|
(rac.starting_port > 49000 and rac.starting_port <= 55000) or
|
||||||
rac.starting_port > 64536):
|
rac.starting_port > 64536):
|
||||||
raise ValueError('starting_port is invalid or in a reserved range')
|
raise ValueError('starting_port is invalid or in a reserved range')
|
||||||
# gpu driver
|
# gpu settings
|
||||||
|
gpu = _kv_read_checked(conf, 'gpu', default={})
|
||||||
|
gpu_ignore_warnings = _kv_read(gpu, 'ignore_warnings', default=False)
|
||||||
try:
|
try:
|
||||||
gpu_driver = _kv_read_checked(conf['gpu']['nvidia_driver'], 'source')
|
gpu_driver = _kv_read_checked(gpu['nvidia_driver'], 'source')
|
||||||
except KeyError:
|
except KeyError:
|
||||||
gpu_driver = None
|
gpu_driver = None
|
||||||
# additional node prep
|
# additional node prep
|
||||||
|
@ -1474,6 +1477,7 @@ def pool_settings(config):
|
||||||
password=rdp_password,
|
password=rdp_password,
|
||||||
),
|
),
|
||||||
gpu_driver=gpu_driver,
|
gpu_driver=gpu_driver,
|
||||||
|
gpu_ignore_warnings=gpu_ignore_warnings,
|
||||||
additional_node_prep=additional_node_prep,
|
additional_node_prep=additional_node_prep,
|
||||||
virtual_network=virtual_network_settings(
|
virtual_network=virtual_network_settings(
|
||||||
conf,
|
conf,
|
||||||
|
|
|
@ -137,6 +137,7 @@ pool_specification:
|
||||||
gpu:
|
gpu:
|
||||||
nvidia_driver:
|
nvidia_driver:
|
||||||
source: https://some.url
|
source: https://some.url
|
||||||
|
ignore_warnings: false
|
||||||
batch_insights_enabled: false
|
batch_insights_enabled: false
|
||||||
prometheus:
|
prometheus:
|
||||||
node_exporter:
|
node_exporter:
|
||||||
|
@ -565,9 +566,14 @@ commands to execute on node start.
|
||||||
* (optional) `gpu` property defines additional information for NVIDIA
|
* (optional) `gpu` property defines additional information for NVIDIA
|
||||||
GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
|
GPU-enabled VMs. If not specified, Batch Shipyard will automatically download
|
||||||
the driver for the `vm_size` specified.
|
the driver for the `vm_size` specified.
|
||||||
* `nvidia_driver` property contains the following required members:
|
* (optional) `nvidia_driver` property contains the following members:
|
||||||
* `source` is the source url to download the driver. This should be
|
* (required) `source` is the source url to download the driver. This
|
||||||
the silent-installable driver package.
|
should be the silent-installable driver package.
|
||||||
|
* (optional) `ignore_warnings` property allows overriding the default
|
||||||
|
behavior to place the node in start task failed state if during node
|
||||||
|
prep there are warnings of possible GPU issues such as infoROM
|
||||||
|
corruption. It is recommended not to set this value to `true`. The
|
||||||
|
default, if not specified, is `false`.
|
||||||
* (optional) `batch_insights_enabled` property enables
|
* (optional) `batch_insights_enabled` property enables
|
||||||
[Batch Insights](https://github.com/Azure/batch-insights) monitoring for
|
[Batch Insights](https://github.com/Azure/batch-insights) monitoring for
|
||||||
the pool. This provides simple non-realtime, host-based monitoring through
|
the pool. This provides simple non-realtime, host-based monitoring through
|
||||||
|
|
|
@ -323,6 +323,9 @@ mapping:
|
||||||
mapping:
|
mapping:
|
||||||
source:
|
source:
|
||||||
type: str
|
type: str
|
||||||
|
required: true
|
||||||
|
ignore_warnings:
|
||||||
|
type: bool
|
||||||
additional_node_prep:
|
additional_node_prep:
|
||||||
type: map
|
type: map
|
||||||
mapping:
|
mapping:
|
||||||
|
|
|
@ -654,8 +654,14 @@ enable_nvidia_persistence_mode() {
|
||||||
}
|
}
|
||||||
|
|
||||||
query_nvidia_card() {
|
query_nvidia_card() {
|
||||||
|
if [ "$SHIPYARD_GPU_IGNORE_WARNINGS" -eq 1 ]; then
|
||||||
|
set +e
|
||||||
|
fi
|
||||||
nvidia-smi -q
|
nvidia-smi -q
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
|
if [ "$SHIPYARD_GPU_IGNORE_WARNINGS" -eq 1 ]; then
|
||||||
|
set -e
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
check_for_nvidia_on_custom_or_native() {
|
check_for_nvidia_on_custom_or_native() {
|
||||||
|
@ -1847,6 +1853,7 @@ echo "Storage cluster mounts (${#sc_args[@]}): ${sc_args[*]}"
|
||||||
echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
|
echo "Custom mount: $SHIPYARD_CUSTOM_MOUNTS_FSTAB"
|
||||||
echo "Install LIS: $lis"
|
echo "Install LIS: $lis"
|
||||||
echo "GPU: $gpu"
|
echo "GPU: $gpu"
|
||||||
|
echo "GPU ignore warnings: $SHIPYARD_GPU_IGNORE_WARNINGS"
|
||||||
echo "Azure Blob: $azureblob"
|
echo "Azure Blob: $azureblob"
|
||||||
echo "Azure File: $azurefile"
|
echo "Azure File: $azurefile"
|
||||||
echo "GlusterFS on compute: $gluster_on_compute"
|
echo "GlusterFS on compute: $gluster_on_compute"
|
||||||
|
|
Загрузка…
Ссылка в новой задаче