зеркало из https://github.com/AvaloniaUI/angle.git
Lock gpu clocks using nvidia-smi to avoid overheating
For now only applies to Windows Nvidia (GTX 1660) bots where I tested this. Default clocks boost to 1800+ reporting 110W+ initial power usage which can't be dissipated by the setup, and the gpu quickly hits ~90C and enters aggressive throttling. Locking to 1410MHz yields a much more modest 50~60W usage with reasonable gpu temps in the 70C~80C range over long runs at 100% utilization. Also log gpu temperature before/after running tests. Bug: angleproject:7671 Change-Id: Id43bede3b1d6a445f8caee6bbbaa43f7f380199b Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/4031495 Reviewed-by: Yuly Novikov <ynovikov@chromium.org> Reviewed-by: Amirali Abdolrashidi <abdolrashidi@google.com> Commit-Queue: Roman Lavrov <romanl@google.com>
This commit is contained in:
Родитель
2dde73576a
Коммит
b205207a0a
|
@ -8,6 +8,7 @@
|
|||
# Runs ANGLE perf tests using some statistical averaging.
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import fnmatch
|
||||
import glob
|
||||
import importlib
|
||||
|
@ -434,6 +435,46 @@ def _split_shard_samples(tests, samples_per_test, shard_count, shard_index):
|
|||
return [test for (test, sample) in shard_test_samples]
|
||||
|
||||
|
||||
def _should_lock_gpu_clocks():
    """Decide whether this machine's GPU clocks should be locked.

    Only the Windows Nvidia GTX 1660 bots are targeted; machines where
    nvidia-smi is not installed opt out automatically.
    """
    if not angle_test_util.IsWindows():
        return False

    query_cmd = ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader']
    try:
        raw_output = subprocess.check_output(query_cmd)
    except FileNotFoundError:
        # expected in some cases, e.g. non-nvidia bots
        return False
    gpu_info = raw_output.decode()

    logging.info('nvidia-smi --query-gpu=gpu_name output: %s' % gpu_info)

    return gpu_info.strip() == 'GeForce GTX 1660'
|
||||
|
||||
|
||||
def _log_nvidia_gpu_temperature():
    """Query nvidia-smi for the current GPU temperature and log it."""
    query_cmd = [
        'nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader'
    ]
    temperature = subprocess.check_output(query_cmd).decode().strip()
    logging.info('Current GPU temperature: %s ' % temperature)
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _maybe_lock_gpu_clocks():
    """Context manager that locks GPU clocks for the duration of the body.

    On eligible machines (see _should_lock_gpu_clocks) the graphics clock is
    pinned to 1410MHz to keep power draw and temperature manageable during
    long 100%-utilization runs; the clocks are always reset on exit.
    On all other machines this is a no-op.
    """
    if not _should_lock_gpu_clocks():
        yield
        return

    # Lock to 1410Mhz (`nvidia-smi --query-supported-clocks=gr --format=csv`)
    lgc_out = subprocess.check_output(['nvidia-smi', '--lock-gpu-clocks=1410,1410']).decode()
    logging.info('Lock GPU clocks output: %s' % lgc_out)
    try:
        # Inside the try so that a failing temperature query cannot leave the
        # clocks locked: once the lock call above has succeeded, the reset in
        # the finally block must run no matter what raises afterwards.
        _log_nvidia_gpu_temperature()
        yield
    finally:
        rgc_out = subprocess.check_output(['nvidia-smi', '--reset-gpu-clocks']).decode()
        logging.info('Reset GPU clocks output: %s' % rgc_out)
        _log_nvidia_gpu_temperature()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--isolated-script-test-output', type=str)
|
||||
|
@ -571,6 +612,7 @@ def main():
|
|||
logging.info('Running %d test%s' % (len(tests), 's' if len(tests) > 1 else ' '))
|
||||
|
||||
try:
|
||||
with _maybe_lock_gpu_clocks():
|
||||
results, histograms = _run_tests(tests, args, extra_flags, env)
|
||||
except _MaxErrorsException:
|
||||
logging.error('Error count exceeded max errors (%d). Aborting.' % args.max_errors)
|
||||
|
|
Загрузка…
Ссылка в новой задаче