Runner - Add signal handler in runner (#530)

Add signal handler in runner to gracefully exit when receiving SIGINT
(<kbd>Ctrl</kbd>+<kbd>C</kbd>) or SIGTERM during benchmark execution.
This commit is contained in:
Yifan Xiong 2023-05-23 17:25:35 +08:00 коммит произвёл GitHub
Родитель 4c0d96e5d8
Коммит a1cd3c9475
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 24 добавлений и 3 удалений

Просмотреть файл

@ -198,6 +198,7 @@ setup(
'types-pkg_resources',
'types-pyyaml',
'typing-extensions>=3.10',
'urllib3<2.0',
'vcrpy>=4.1.1',
'yapf==0.31.0',
],

Просмотреть файл

@ -59,11 +59,12 @@ class AnsibleClient():
self._config['cmdline'] += ' --ask-pass --ask-become-pass'
logger.info(self._config)
def run(self, ansible_config, sudo=False): # pragma: no cover
def run(self, ansible_config, cancel_callback=None, sudo=False): # pragma: no cover
"""Run Ansible runner.
Args:
ansible_config (dict): Ansible config dict.
cancel_callback (Callable): Ansible runner cancel callback.
sudo (bool): Run as sudo or not. Defaults to False.
Returns:
@ -73,7 +74,7 @@ class AnsibleClient():
logger.info('Run as sudo ...')
ansible_config['cmdline'] += ' --become'
with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
r = ansible_runner.run(private_data_dir=tmpdir, cancel_callback=cancel_callback, **ansible_config)
logger.debug(r.stats)
if r.rc == 0:
logger.info('Run succeed, return code {}.'.format(r.rc))

Просмотреть файл

@ -4,8 +4,10 @@
"""SuperBench Runner."""
import os
import sys
import json
import random
import signal
from pathlib import Path
from pprint import pformat
from collections import defaultdict
@ -233,6 +235,18 @@ class SuperBenchRunner():
)
)
def __signal_handler(self, signum, frame):
    """Clean up and exit the runner when SIGINT or SIGTERM is received.

    Args:
        signum (int): Number of the received signal.
        frame (FrameType): Stack frame passed in by the signal machinery (unused here).
    """
    # Any signal other than SIGINT/SIGTERM falls through without action.
    if signum not in (signal.SIGINT, signal.SIGTERM):
        return
    signal_name = signal.Signals(signum).name
    logger.info('Killed by %s, exiting ...', signal_name)
    self.cleanup()
    # Exit status is 128 plus the signal number.
    sys.exit(128 + signum)
def __create_results_summary(self): # pragma: no cover
"""Create the result summary file of all nodes."""
all_results = list()
@ -438,12 +452,17 @@ class SuperBenchRunner():
# we do not expect timeout in ansible unless subprocess hangs
ansible_runner_config['timeout'] = timeout + 60
rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
# overwrite ansible runner's default signal handler with main process's
rc = self._ansible_client.run(
ansible_runner_config, cancel_callback=lambda: None, sudo=(not self._docker_config.skip)
)
return rc
def run(self):
"""Run the SuperBench benchmarks distributedly."""
self.check_env()
signal.signal(signal.SIGINT, self.__signal_handler)
signal.signal(signal.SIGTERM, self.__signal_handler)
for benchmark_name in self._sb_benchmarks:
if benchmark_name not in self._sb_enabled_benchmarks:
continue