Monitor - Support cgroup V2 when reading system metrics. (#491)
**Description** Since Ubuntu 22.04 uses cgroup V2 and its file structure differs from cgroup V1, modify the monitor to support both cgroup V1 and V2.
This commit is contained in:
Родитель
dbeba8056b
Коммит
a9b45a072e
|
@ -129,7 +129,7 @@ class DeviceManager:
|
|||
Return:
|
||||
remapped_metrics (dict): the row remapped information, None means failed to get the data.
|
||||
"""
|
||||
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
|
||||
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quiet=True)
|
||||
if output.returncode == 0:
|
||||
begin = output.stdout.find('Remapped Rows')
|
||||
end = output.stdout.find('Temperature', begin)
|
||||
|
|
|
@ -10,12 +10,12 @@ import shlex
|
|||
from superbench.common.utils import stdout_logger
|
||||
|
||||
|
||||
def run_command(command, quite=False, flush_output=False):
|
||||
def run_command(command, quiet=False, flush_output=False):
|
||||
"""Run command in string format, return the result with stdout and stderr.
|
||||
|
||||
Args:
|
||||
command (str): command to run.
|
||||
quite (bool): no stdout display of the command if quite is True.
|
||||
quiet (bool): no stdout display of the command if quiet is True.
|
||||
flush_output (bool): enable real-time output flush or not when running the command.
|
||||
|
||||
Return:
|
||||
|
@ -31,7 +31,7 @@ def run_command(command, quite=False, flush_output=False):
|
|||
output = ''
|
||||
for line in process.stdout:
|
||||
output += line
|
||||
if not quite:
|
||||
if not quiet:
|
||||
stdout_logger.log(line)
|
||||
process.wait()
|
||||
retcode = process.poll()
|
||||
|
@ -45,6 +45,6 @@ def run_command(command, quite=False, flush_output=False):
|
|||
result = subprocess.run(
|
||||
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
|
||||
)
|
||||
if not quite:
|
||||
if not quiet:
|
||||
stdout_logger.log(result.stdout)
|
||||
return result
|
||||
|
|
|
@ -39,6 +39,16 @@ class Monitor(multiprocessing.Process):
|
|||
|
||||
self.__output_handler = open(self.__output_file, 'a')
|
||||
|
||||
self.__cgroup = 1
|
||||
output = run_command('grep cgroup /proc/filesystems', quiet=True)
|
||||
if output.returncode != 0:
|
||||
logger.error('Failed to check the cgroup version, will assume using cgroup V1.')
|
||||
else:
|
||||
if 'cgroup2' in output.stdout:
|
||||
self.__cgroup = 2
|
||||
|
||||
logger.info('cgroup version: {}.'.format(self.__cgroup))
|
||||
|
||||
def __preprocess(self):
|
||||
"""Preprocess/preparation operations before the monitoring.
|
||||
|
||||
|
@ -67,11 +77,20 @@ class Monitor(multiprocessing.Process):
|
|||
container_pid = output.stdout
|
||||
|
||||
try:
|
||||
self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
|
||||
self._mem_file = glob.glob(
|
||||
'/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
|
||||
)[0]
|
||||
self._net_file = '/proc/{}/net/dev'.format(container_pid)
|
||||
if self.__cgroup == 1:
|
||||
self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
|
||||
self._mem_file = glob.glob(
|
||||
'/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
|
||||
)[0]
|
||||
self._net_file = '/proc/{}/net/dev'.format(container_pid)
|
||||
else:
|
||||
self._cpu_file = glob.glob(
|
||||
'/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id)
|
||||
)[0]
|
||||
self._mem_file = glob.glob(
|
||||
'/sys/fs/cgroup/system.slice/docker-{}*.scope/memory.stat'.format(container_id)
|
||||
)[0]
|
||||
self._net_file = '/proc/net/dev'
|
||||
except BaseException as e:
|
||||
logger.error(
|
||||
'Faild to get the cpu/mem/net file - container: {}, error message: {}'.format(
|
||||
|
@ -80,8 +99,12 @@ class Monitor(multiprocessing.Process):
|
|||
)
|
||||
return False
|
||||
else:
|
||||
self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
|
||||
self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
|
||||
if self.__cgroup == 1:
|
||||
self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
|
||||
self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
|
||||
else:
|
||||
self._cpu_file = '/sys/fs/cgroup/cpu.stat'
|
||||
self._mem_file = '/sys/fs/cgroup/memory.stat'
|
||||
self._net_file = '/proc/net/dev'
|
||||
|
||||
return True
|
||||
|
@ -215,13 +238,21 @@ class Monitor(multiprocessing.Process):
|
|||
system_time = 0
|
||||
try:
|
||||
with open(self._cpu_file, 'r') as f:
|
||||
for line in f:
|
||||
items = line.split()
|
||||
if items[0] == 'user':
|
||||
user_time = int(items[1])
|
||||
elif items[1] == 'system':
|
||||
system_time = int(items[1])
|
||||
return user_time + system_time
|
||||
if self.__cgroup == 1:
|
||||
for line in f:
|
||||
items = line.split()
|
||||
if items[0] == 'user':
|
||||
user_time = int(items[1])
|
||||
elif items[0] == 'system':
|
||||
system_time = int(items[1])
|
||||
else:
|
||||
for line in f:
|
||||
items = line.split()
|
||||
if items[0] == 'user_usec':
|
||||
user_time = int(items[1]) / 10000
|
||||
elif items[0] == 'system_usec':
|
||||
system_time = int(items[1]) / 10000
|
||||
return user_time + system_time
|
||||
except BaseException as e:
|
||||
logger.error('Failed to read process cpu ticks information - error message: {}'.format(str(e)))
|
||||
|
||||
|
|
|
@ -4,35 +4,50 @@
|
|||
"""Tests for Monitor module."""
|
||||
|
||||
import numbers
|
||||
import tempfile
|
||||
import unittest
|
||||
import shutil
|
||||
import pathlib
|
||||
|
||||
from tests.helper import decorator
|
||||
from superbench.monitor import Monitor
|
||||
from superbench.monitor import MonitorRecord
|
||||
|
||||
|
||||
@decorator.cuda_test
|
||||
def test_monitor():
|
||||
"""Test the module Monitor."""
|
||||
monitor = Monitor(None, 1, 10, 'file')
|
||||
monitor._Monitor__preprocess()
|
||||
record = MonitorRecord()
|
||||
monitor._Monitor__sample_host_metrics(record)
|
||||
assert (isinstance(record.cpu_usage, numbers.Number))
|
||||
assert (record.net_receive)
|
||||
assert (record.net_transmit)
|
||||
for key, value in record.net_receive.items():
|
||||
assert ('_receive_bw' in key)
|
||||
isinstance(value, numbers.Number)
|
||||
for key, value in record.net_transmit.items():
|
||||
assert ('_transmit_bw' in key)
|
||||
isinstance(value, numbers.Number)
|
||||
class MonitorTestCase(unittest.TestCase):
|
||||
"""A class for Monitor test cases."""
|
||||
def setUp(self):
|
||||
"""Hook method for setting up the test fixture before exercising it."""
|
||||
self.sb_output_dir = tempfile.mkdtemp()
|
||||
|
||||
monitor._Monitor__sample_gpu_metrics(record)
|
||||
gpu_list_metrics = [
|
||||
record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
|
||||
record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
|
||||
]
|
||||
for metric in gpu_list_metrics:
|
||||
assert (metric)
|
||||
for value in metric:
|
||||
def tearDown(self):
|
||||
"""Hook method for deconstructing the test fixture after testing it."""
|
||||
shutil.rmtree(self.sb_output_dir)
|
||||
|
||||
@decorator.cuda_test
|
||||
def test_monitor(self):
|
||||
"""Test the module Monitor."""
|
||||
log_file = pathlib.Path(self.sb_output_dir) / 'monitor.log'
|
||||
monitor = Monitor(None, 1, 10, str(log_file))
|
||||
monitor._Monitor__preprocess()
|
||||
record = MonitorRecord()
|
||||
monitor._Monitor__sample_host_metrics(record)
|
||||
assert (isinstance(record.cpu_usage, numbers.Number))
|
||||
assert (record.net_receive)
|
||||
assert (record.net_transmit)
|
||||
for key, value in record.net_receive.items():
|
||||
assert ('_receive_bw' in key)
|
||||
isinstance(value, numbers.Number)
|
||||
for key, value in record.net_transmit.items():
|
||||
assert ('_transmit_bw' in key)
|
||||
isinstance(value, numbers.Number)
|
||||
|
||||
monitor._Monitor__sample_gpu_metrics(record)
|
||||
gpu_list_metrics = [
|
||||
record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
|
||||
record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
|
||||
]
|
||||
for metric in gpu_list_metrics:
|
||||
assert (metric)
|
||||
for value in metric:
|
||||
isinstance(value, numbers.Number)
|
||||
|
|
Загрузка…
Ссылка в новой задаче