Monitor - Support cgroup V2 when reading system metrics. (#491)

**Description** Since Ubuntu 22.04 uses cgroup V2, whose file structure differs from that of cgroup V1, modify the monitor to work with both cgroup V1 and V2.
2023-03-22 16:33:18 +08:00 · 2023-03-22 16:33:18 +08:00 · a9b45a072e
--- a/superbench/common/utils/device_manager.py
+++ b/superbench/common/utils/device_manager.py
@ -129,7 +129,7 @@ class DeviceManager:
        Return:
            remapped_metrics (dict): the row remapped information, None means failed to get the data.
        """
-        output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
+        output = process.run_command('nvidia-smi -i {} -q'.format(idx), quiet=True)
        if output.returncode == 0:
            begin = output.stdout.find('Remapped Rows')
            end = output.stdout.find('Temperature', begin)
--- a/superbench/common/utils/process.py
+++ b/superbench/common/utils/process.py
@ -10,12 +10,12 @@ import shlex
 from superbench.common.utils import stdout_logger


-def run_command(command, quite=False, flush_output=False):
+def run_command(command, quiet=False, flush_output=False):
    """Run command in string format, return the result with stdout and stderr.

    Args:
        command (str): command to run.
-        quite (bool): no stdout display of the command if quite is True.
+        quiet (bool): no stdout display of the command if quiet is True.
        flush_output (bool): enable real-time output flush or not when running the command.

    Return:
@ -31,7 +31,7 @@ def run_command(command, quite=False, flush_output=False):
            output = ''
            for line in process.stdout:
                output += line
-                if not quite:
+                if not quiet:
                    stdout_logger.log(line)
            process.wait()
            retcode = process.poll()
@ -45,6 +45,6 @@ def run_command(command, quite=False, flush_output=False):
        result = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
        )
-        if not quite:
+        if not quiet:
            stdout_logger.log(result.stdout)
        return result
--- a/superbench/monitor/monitor.py
+++ b/superbench/monitor/monitor.py
@ -39,6 +39,16 @@ class Monitor(multiprocessing.Process):

        self.__output_handler = open(self.__output_file, 'a')

+        self.__cgroup = 1
+        output = run_command('grep cgroup /proc/filesystems', quiet=True)
+        if output.returncode != 0:
+            logger.error('Failed to check the cgroup version, will assume using cgroup V1.')
+        else:
+            if 'cgroup2' in output.stdout:
+                self.__cgroup = 2
+
+        logger.info('cgroup version: {}.'.format(self.__cgroup))
+
    def __preprocess(self):
        """Preprocess/preparation operations before the monitoring.

@ -67,11 +77,20 @@ class Monitor(multiprocessing.Process):
            container_pid = output.stdout

            try:
-                self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
-                self._mem_file = glob.glob(
-                    '/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
-                )[0]
-                self._net_file = '/proc/{}/net/dev'.format(container_pid)
+                if self.__cgroup == 1:
+                    self._cpu_file = glob.glob('/sys/fs/cgroup/cpuacct/docker/{}*/cpuacct.stat'.format(container_id))[0]
+                    self._mem_file = glob.glob(
+                        '/sys/fs/cgroup/memory/docker/{}*/memory.usage_in_bytes'.format(container_id)
+                    )[0]
+                    self._net_file = '/proc/{}/net/dev'.format(container_pid)
+                else:
+                    self._cpu_file = glob.glob(
+                        '/sys/fs/cgroup/system.slice/docker-{}*.scope/cpu.stat'.format(container_id)
+                    )[0]
+                    self._mem_file = glob.glob(
+                        '/sys/fs/cgroup/system.slice/docker-{}*.scope/memory.stat'.format(container_id)
+                    )[0]
+                    self._net_file = '/proc/net/dev'
            except BaseException as e:
                logger.error(
                    'Faild to get the cpu/mem/net file - container: {}, error message: {}'.format(
@ -80,8 +99,12 @@ class Monitor(multiprocessing.Process):
                )
                return False
        else:
-            self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
-            self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
+            if self.__cgroup == 1:
+                self._cpu_file = '/sys/fs/cgroup/cpuacct/cpuacct.stat'
+                self._mem_file = '/sys/fs/cgroup/memory/memory.usage_in_bytes'
+            else:
+                self._cpu_file = '/sys/fs/cgroup/cpu.stat'
+                self._mem_file = '/sys/fs/cgroup/memory.stat'
            self._net_file = '/proc/net/dev'

        return True
@ -215,13 +238,21 @@ class Monitor(multiprocessing.Process):
        system_time = 0
        try:
            with open(self._cpu_file, 'r') as f:
-                for line in f:
-                    items = line.split()
-                    if items[0] == 'user':
-                        user_time = int(items[1])
-                    elif items[1] == 'system':
-                        system_time = int(items[1])
-                return user_time + system_time
+                if self.__cgroup == 1:
+                    for line in f:
+                        items = line.split()
+                        if items[0] == 'user':
+                            user_time = int(items[1])
+                        elif items[0] == 'system':
+                            system_time = int(items[1])
+                else:
+                    for line in f:
+                        items = line.split()
+                        if items[0] == 'user_usec':
+                            user_time = int(items[1]) / 10000
+                        elif items[0] == 'system_usec':
+                            system_time = int(items[1]) / 10000
+            return user_time + system_time
        except BaseException as e:
            logger.error('Failed to read process cpu ticks information - error message: {}'.format(str(e)))

--- a/tests/monitor/test_monitor.py
+++ b/tests/monitor/test_monitor.py
@ -4,35 +4,50 @@
 """Tests for Monitor module."""

 import numbers
+import tempfile
+import unittest
+import shutil
+import pathlib

 from tests.helper import decorator
 from superbench.monitor import Monitor
 from superbench.monitor import MonitorRecord


-@decorator.cuda_test
-def test_monitor():
-    """Test the module Monitor."""
-    monitor = Monitor(None, 1, 10, 'file')
-    monitor._Monitor__preprocess()
-    record = MonitorRecord()
-    monitor._Monitor__sample_host_metrics(record)
-    assert (isinstance(record.cpu_usage, numbers.Number))
-    assert (record.net_receive)
-    assert (record.net_transmit)
-    for key, value in record.net_receive.items():
-        assert ('_receive_bw' in key)
-        isinstance(value, numbers.Number)
-    for key, value in record.net_transmit.items():
-        assert ('_transmit_bw' in key)
-        isinstance(value, numbers.Number)
+class MonitorTestCase(unittest.TestCase):
+    """A class for Monitor test cases."""
+    def setUp(self):
+        """Hook method for setting up the test fixture before exercising it."""
+        self.sb_output_dir = tempfile.mkdtemp()

-    monitor._Monitor__sample_gpu_metrics(record)
-    gpu_list_metrics = [
-        record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
-        record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
-    ]
-    for metric in gpu_list_metrics:
-        assert (metric)
-        for value in metric:
+    def tearDown(self):
+        """Hook method for deconstructing the test fixture after testing it."""
+        shutil.rmtree(self.sb_output_dir)
+
+    @decorator.cuda_test
+    def test_monitor(self):
+        """Test the module Monitor."""
+        log_file = pathlib.Path(self.sb_output_dir) / 'monitor.log'
+        monitor = Monitor(None, 1, 10, str(log_file))
+        monitor._Monitor__preprocess()
+        record = MonitorRecord()
+        monitor._Monitor__sample_host_metrics(record)
+        assert (isinstance(record.cpu_usage, numbers.Number))
+        assert (record.net_receive)
+        assert (record.net_transmit)
+        for key, value in record.net_receive.items():
+            assert ('_receive_bw' in key)
            isinstance(value, numbers.Number)
+        for key, value in record.net_transmit.items():
+            assert ('_transmit_bw' in key)
+            isinstance(value, numbers.Number)
+
+        monitor._Monitor__sample_gpu_metrics(record)
+        gpu_list_metrics = [
+            record.gpu_usage, record.gpu_temperature, record.gpu_power_limit, record.gpu_mem_used, record.gpu_mem_total,
+            record.gpu_corrected_ecc, record.gpu_uncorrected_ecc
+        ]
+        for metric in gpu_list_metrics:
+            assert (metric)
+            for value in metric:
+                isinstance(value, numbers.Number)