Monitor - Support cgroup V2 when reading system metrics. (#491)

**Description**
Since Ubuntu 22.04 uses cgroup V2, the cgroup file structure has changed.
Modify the monitor to support both cgroup V1 and V2.
This commit is contained in:
guoshzhao 2023-03-22 16:33:18 +08:00 коммит произвёл GitHub
Родитель dbeba8056b
Коммит a9b45a072e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 89 добавлений и 43 удалений

Просмотреть файл

@ -129,7 +129,7 @@ class DeviceManager:
Return:
remapped_metrics (dict): the row remapped information, None means failed to get the data.
"""
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quiet=True)
if output.returncode == 0:
begin = output.stdout.find('Remapped Rows')
end = output.stdout.find('Temperature', begin)

Просмотреть файл

@ -10,12 +10,12 @@ import shlex
from superbench.common.utils import stdout_logger
def run_command(command, quite=False, flush_output=False):
def run_command(command, quiet=False, flush_output=False):
"""Run command in string format, return the result with stdout and stderr.
Args:
command (str): command to run.
quite (bool): no stdout display of the command if quite is True.
quiet (bool): no stdout display of the command if quiet is True.
flush_output (bool): enable real-time output flush or not when running the command.
Return:
@ -31,7 +31,7 @@ def run_command(command, quite=False, flush_output=False):
output = ''
for line in process.stdout:
output += line
if not quite:
if not quiet:
stdout_logger.log(line)
process.wait()
retcode = process.poll()
@ -45,6 +45,6 @@ def run_command(command, quite=False, flush_output=False):
result = subprocess.run(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
)
if not quite:
if not quiet:
stdout_logger.log(result.stdout)
return result

Просмотреть файл

@ -39,6 +39,16 @@ class Monitor(multiprocessing.Process):
self.__output_handler = open(self.__output_file, 'a')
self.__cgroup = 1
output = run_command('grep cgroup /proc/filesystems', quiet=True)
if output.returncode != 0:
logger.error('Failed to check the cgroup version, will assume using cgroup V1.')
else:
if 'cgroup2' in output.stdout:
self.__cgroup = 2
logger.info('cgroup version: {}.'.format(self.__cgroup))
def __preprocess(self):
"""Preprocess/preparation operations before the monitoring.
@ -67,11 +77,20 @@ class Monitor(multiprocessing.Process):
container_pid = output.stdout
try:
self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
self._mem_file = glob.glob(
'/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
)[0]
self._net_file = '/proc/{}/net/dev'.format(container_pid)
if self.__cgroup == 1:
self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
self._mem_file = glob.glob(
'/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
)[0]
self._net_file = '/proc/{}/net/dev'.format(container_pid)
else:
self._cpu_file = glob.glob(
'/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id)
)[0]
self._mem_file = glob.glob(
'/sys/fs/cgroup/system.slice/docker-{}*.scope/memory.stat'.format(container_id)
)[0]
self._net_file = '/proc/net/dev'
except BaseException as e:
logger.error(
'Faild to get the cpu/mem/net file - container: {}, error message: {}'.format(
@ -80,8 +99,12 @@ class Monitor(multiprocessing.Process):
)
return False
else:
self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
if self.__cgroup == 1:
self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
else:
self._cpu_file = '/sys/fs/cgroup/cpu.stat'
self._mem_file = '/sys/fs/cgroup/memory.stat'
self._net_file = '/proc/net/dev'
return True
@ -215,13 +238,21 @@ class Monitor(multiprocessing.Process):
system_time = 0
try:
with open(self._cpu_file, 'r') as f:
for line in f:
items = line.split()
if items[0] == 'user':
user_time = int(items[1])
elif items[1] == 'system':
system_time = int(items[1])
return user_time + system_time
if self.__cgroup == 1:
for line in f:
items = line.split()
if items[0] == 'user':
user_time = int(items[1])
elif items[0] == 'system':
system_time = int(items[1])
else:
for line in f:
items = line.split()
if items[0] == 'user_usec':
user_time = int(items[1]) / 10000
elif items[0] == 'system_usec':
system_time = int(items[1]) / 10000
return user_time + system_time
except BaseException as e:
logger.error('Failed to read process cpu ticks information - error message: {}'.format(str(e)))

Просмотреть файл

@ -4,35 +4,50 @@
"""Tests for Monitor module."""
import numbers
import tempfile
import unittest
import shutil
import pathlib
from tests.helper import decorator
from superbench.monitor import Monitor
from superbench.monitor import MonitorRecord
@decorator.cuda_test
def test_monitor():
"""Test the module Monitor."""
monitor = Monitor(None, 1, 10, 'file')
monitor._Monitor__preprocess()
record = MonitorRecord()
monitor._Monitor__sample_host_metrics(record)
assert (isinstance(record.cpu_usage, numbers.Number))
assert (record.net_receive)
assert (record.net_transmit)
for key, value in record.net_receive.items():
assert ('_receive_bw' in key)
isinstance(value, numbers.Number)
for key, value in record.net_transmit.items():
assert ('_transmit_bw' in key)
isinstance(value, numbers.Number)
class MonitorTestCase(unittest.TestCase):
"""A class for Monitor test cases."""
def setUp(self):
"""Hook method for setting up the test fixture before exercising it."""
self.sb_output_dir = tempfile.mkdtemp()
monitor._Monitor__sample_gpu_metrics(record)
gpu_list_metrics = [
record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
]
for metric in gpu_list_metrics:
assert (metric)
for value in metric:
def tearDown(self):
"""Hook method for deconstructing the test fixture after testing it."""
shutil.rmtree(self.sb_output_dir)
@decorator.cuda_test
def test_monitor(self):
"""Test the module Monitor."""
log_file = pathlib.Path(self.sb_output_dir) / 'monitor.log'
monitor = Monitor(None, 1, 10, str(log_file))
monitor._Monitor__preprocess()
record = MonitorRecord()
monitor._Monitor__sample_host_metrics(record)
assert (isinstance(record.cpu_usage, numbers.Number))
assert (record.net_receive)
assert (record.net_transmit)
for key, value in record.net_receive.items():
assert ('_receive_bw' in key)
isinstance(value, numbers.Number)
for key, value in record.net_transmit.items():
assert ('_transmit_bw' in key)
isinstance(value, numbers.Number)
monitor._Monitor__sample_gpu_metrics(record)
gpu_list_metrics = [
record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
]
for metric in gpu_list_metrics:
assert (metric)
for value in metric:
isinstance(value, numbers.Number)