Lock gpu clocks using nvidia-smi to avoid overheating

For now only applies to Windows Nvidia (GTX 1660) bots
where I tested this. Default clocks boost to 1800+
reporting 110W+ initial power usage which can't be dissipated by
the setup, and the gpu quickly hits ~90C and enters aggressive
throttling. Locking to 1410MHz yields a much more modest 50~60W usage
with reasonable gpu temps in the 70C~80C range over long runs at
100% utilization.

Also log gpu temperature before/after running tests.

Bug: angleproject:7671
Change-Id: Id43bede3b1d6a445f8caee6bbbaa43f7f380199b
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/4031495
Reviewed-by: Yuly Novikov <ynovikov@chromium.org>
Reviewed-by: Amirali Abdolrashidi <abdolrashidi@google.com>
Commit-Queue: Roman Lavrov <romanl@google.com>
This commit is contained in:
Roman Lavrov 2022-11-16 14:39:00 -05:00 коммит произвёл Angle LUCI CQ
Родитель 2dde73576a
Коммит b205207a0a
1 изменённых файлов: 43 добавлений и 1 удалений

Просмотреть файл

@ -8,6 +8,7 @@
# Runs ANGLE perf tests using some statistical averaging. # Runs ANGLE perf tests using some statistical averaging.
import argparse import argparse
import contextlib
import fnmatch import fnmatch
import glob import glob
import importlib import importlib
@ -434,6 +435,46 @@ def _split_shard_samples(tests, samples_per_test, shard_count, shard_index):
return [test for (test, sample) in shard_test_samples] return [test for (test, sample) in shard_test_samples]
def _should_lock_gpu_clocks():
    """Decide whether GPU clocks should be locked for this test run.

    Locking is only enabled on Windows, and only when the stripped output
    of `nvidia-smi --query-gpu=gpu_name` is exactly 'GeForce GTX 1660'.

    Returns:
        True if the clocks should be locked, False otherwise (including when
        nvidia-smi is missing or exits with an error).
    """
    if not angle_test_util.IsWindows():
        return False

    query_cmd = ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader']
    try:
        gpu_info = subprocess.check_output(query_cmd).decode()
    except FileNotFoundError:
        # expected in some cases, e.g. non-nvidia bots
        return False
    except subprocess.CalledProcessError as e:
        # nvidia-smi is installed but exited non-zero. Clock locking is a
        # best-effort mitigation, so skip it rather than abort the test run.
        logging.warning('nvidia-smi --query-gpu=gpu_name failed: %s' % e)
        return False

    logging.info('nvidia-smi --query-gpu=gpu_name output: %s' % gpu_info)

    return gpu_info.strip() == 'GeForce GTX 1660'
def _log_nvidia_gpu_temperature():
    """Query the current GPU temperature via nvidia-smi and log it at INFO level."""
    query_cmd = ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']
    raw_output = subprocess.check_output(query_cmd)
    temperature = raw_output.decode().strip()
    message = 'Current GPU temperature: %s ' % temperature
    logging.info(message)
@contextlib.contextmanager
def _maybe_lock_gpu_clocks():
    """Context manager that locks GPU clocks for the duration of the body.

    When _should_lock_gpu_clocks() returns False this is a no-op. Otherwise
    the GPU clocks are locked via nvidia-smi before the body runs and reset
    afterwards, with the GPU temperature logged before and after.
    """
    if not _should_lock_gpu_clocks():
        yield
        return

    # Lock to 1410Mhz (`nvidia-smi --query-supported-clocks=gr --format=csv`)
    lgc_out = subprocess.check_output(['nvidia-smi', '--lock-gpu-clocks=1410,1410']).decode()

    # Enter the try block as soon as the lock command has succeeded, so that
    # the clocks are reset even if logging or the temperature query below
    # raises before the body gets a chance to run.
    try:
        logging.info('Lock GPU clocks output: %s' % lgc_out)
        _log_nvidia_gpu_temperature()
        yield
    finally:
        rgc_out = subprocess.check_output(['nvidia-smi', '--reset-gpu-clocks']).decode()
        logging.info('Reset GPU clocks output: %s' % rgc_out)
        _log_nvidia_gpu_temperature()
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--isolated-script-test-output', type=str) parser.add_argument('--isolated-script-test-output', type=str)
@ -571,6 +612,7 @@ def main():
logging.info('Running %d test%s' % (len(tests), 's' if len(tests) > 1 else ' ')) logging.info('Running %d test%s' % (len(tests), 's' if len(tests) > 1 else ' '))
try: try:
with _maybe_lock_gpu_clocks():
results, histograms = _run_tests(tests, args, extra_flags, env) results, histograms = _run_tests(tests, args, extra_flags, env)
except _MaxErrorsException: except _MaxErrorsException:
logging.error('Error count exceeded max errors (%d). Aborting.' % args.max_errors) logging.error('Error count exceeded max errors (%d). Aborting.' % args.max_errors)