зеркало из https://github.com/AvaloniaUI/angle.git
Lock gpu clocks using nvidia-smi to avoid overheating
For now this only applies to the Windows Nvidia (GTX 1660) bots where I tested it. Default clocks boost to 1800+, reporting 110W+ initial power usage, which can't be dissipated by the setup, so the GPU quickly hits ~90C and enters aggressive throttling. Locking to 1410MHz yields a much more modest 50~60W usage with reasonable GPU temps in the 70C~80C range over long runs at 100% utilization. Also log GPU temperature before/after running tests. Bug: angleproject:7671 Change-Id: Id43bede3b1d6a445f8caee6bbbaa43f7f380199b Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/4031495 Reviewed-by: Yuly Novikov <ynovikov@chromium.org> Reviewed-by: Amirali Abdolrashidi <abdolrashidi@google.com> Commit-Queue: Roman Lavrov <romanl@google.com>
This commit is contained in:
Родитель
2dde73576a
Коммит
b205207a0a
|
@ -8,6 +8,7 @@
|
||||||
# Runs ANGLE perf tests using some statistical averaging.
|
# Runs ANGLE perf tests using some statistical averaging.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import contextlib
|
||||||
import fnmatch
|
import fnmatch
|
||||||
import glob
|
import glob
|
||||||
import importlib
|
import importlib
|
||||||
|
@ -434,6 +435,46 @@ def _split_shard_samples(tests, samples_per_test, shard_count, shard_index):
|
||||||
return [test for (test, sample) in shard_test_samples]
|
return [test for (test, sample) in shard_test_samples]
|
||||||
|
|
||||||
|
|
||||||
|
def _should_lock_gpu_clocks():
    """Decide whether the GPU clocks should be locked for this run.

    Returns:
        True only when running on Windows and nvidia-smi reports the GPU name
        as exactly 'GeForce GTX 1660' (after stripping whitespace); False when
        not on Windows or when the nvidia-smi executable cannot be found.
    """
    if angle_test_util.IsWindows():
        query_cmd = ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv,noheader']
        try:
            raw_output = subprocess.check_output(query_cmd)
        except FileNotFoundError:
            # A missing nvidia-smi is expected in some cases, e.g. non-nvidia bots.
            return False

        gpu_name = raw_output.decode()
        message = 'nvidia-smi --query-gpu=gpu_name output: %s' % gpu_name
        logging.info(message)
        return gpu_name.strip() == 'GeForce GTX 1660'

    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _log_nvidia_gpu_temperature():
    """Query nvidia-smi for the current GPU temperature and log it at INFO level."""
    query_cmd = ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']
    raw_output = subprocess.check_output(query_cmd)
    temperature = raw_output.decode().strip()
    message = 'Current GPU temperature: %s ' % temperature
    logging.info(message)
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
def _maybe_lock_gpu_clocks():
    """Context manager that locks the GPU clocks around the `with` body.

    When _should_lock_gpu_clocks() is false this simply yields and does
    nothing else. Otherwise it locks the GPU clocks via nvidia-smi, logs the
    GPU temperature, runs the body, and then resets the clocks and logs the
    temperature again.

    Once the lock command has succeeded, the clocks are always reset on exit,
    including when the pre-run logging or the body itself raises.

    Raises:
        subprocess.CalledProcessError: if an nvidia-smi invocation exits with
            a non-zero status.
    """
    if not _should_lock_gpu_clocks():
        yield
        return

    # Lock to 1410MHz (`nvidia-smi --query-supported-clocks=gr --format=csv`)
    lock_raw = subprocess.check_output(['nvidia-smi', '--lock-gpu-clocks=1410,1410'])
    try:
        # Everything after the lock is inside the try so that a failure in the
        # decode or temperature query cannot leave the clocks locked.
        logging.info('Lock GPU clocks output: %s', lock_raw.decode())
        _log_nvidia_gpu_temperature()
        yield
    finally:
        reset_out = subprocess.check_output(['nvidia-smi', '--reset-gpu-clocks']).decode()
        logging.info('Reset GPU clocks output: %s', reset_out)
        _log_nvidia_gpu_temperature()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('--isolated-script-test-output', type=str)
|
parser.add_argument('--isolated-script-test-output', type=str)
|
||||||
|
@ -571,6 +612,7 @@ def main():
|
||||||
logging.info('Running %d test%s' % (len(tests), 's' if len(tests) > 1 else ' '))
|
logging.info('Running %d test%s' % (len(tests), 's' if len(tests) > 1 else ' '))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
with _maybe_lock_gpu_clocks():
|
||||||
results, histograms = _run_tests(tests, args, extra_flags, env)
|
results, histograms = _run_tests(tests, args, extra_flags, env)
|
||||||
except _MaxErrorsException:
|
except _MaxErrorsException:
|
||||||
logging.error('Error count exceeded max errors (%d). Aborting.' % args.max_errors)
|
logging.error('Error count exceeded max errors (%d). Aborting.' % args.max_errors)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче