Executor - Add stdout logging util module and enable real-time log flushing in executor (#445)

**Description**
Add a stdout logging util module and enable real-time log flushing in the executor.

**Major Revision**
- Add a stdout logging util module to redirect stdout into a log file
- Enable stdout logging in the executor to write benchmark output to both stdout and the file `sb-bench.log`
- Enable real-time log flushing in `run_command` of micro-benchmarks through the `log_flushing` config

**Minor Revision**
- Add a `log_n_steps` argument to enable regular step-time logging in model benchmarks
- Update related docs
Authored by Yuting Jiang on 2022-12-30 17:40:28 +08:00, committed by GitHub
Parent f2634d8608
Commit 9dfefce350
No key found matching this signature; GPG key ID: 4AEE18F83AFDEB23
18 changed files: 216 additions and 12 deletions


@@ -68,6 +68,7 @@ Here're the details about work directory structure for SuperBench Runner.
│   │   └── rank-0            # output for each rank in each benchmark
│   │       ├── results.json  # raw results
│   │       └── monitor.jsonl # monitor results (optional)
│   ├── sb-bench.log          # SuperBench benchmarks' runtime log for debugging
│   └── sb-exec.log           # collected SuperBench Executor log
├── sb-run.log                # SuperBench Runner log
├── sb.config.yaml            # SuperBench configuration snapshot
@@ -98,6 +99,7 @@ The `/root` directory is mounted from `$HOME/sb-workspace` on the host path.
│   ├── results.json   # raw results
│   └── monitor.jsonl  # monitor results (optional)
├── sb.config.yaml     # SuperBench configuration snapshot
├── sb-bench.log       # SuperBench benchmarks' runtime log for debugging
└── sb.env             # SuperBench runtime environment variables
```


@@ -336,10 +336,11 @@ A list of models to run, only supported in model-benchmark.
Parameters for the benchmark to use, varying for different benchmarks.

There are four common parameters for all benchmarks:
* run_count: how many times the user wants to run this benchmark, default value is 1.
* duration: the elapsed time of the benchmark in seconds. It works for all model-benchmarks, but for micro-benchmarks, benchmark authors should consume it themselves.
* log_raw_data: log raw data into a file instead of saving it into the result object, default value is `False`. Benchmarks that have large raw output may want to set it to `True`, such as `nccl-bw`/`rccl-bw`.
* log_flushing: real-time log flushing, default value is `False`.

For model-benchmarks, there are some parameters that can control the elapsed time.

* duration: the elapsed time of the benchmark in seconds.


@@ -74,6 +74,12 @@ class Benchmark(ABC):
            default=False,
            help='Log raw data into file instead of saving it into result object.',
        )
        self._parser.add_argument(
            '--log_flushing',
            action='store_true',
            default=False,
            help='Real-time log flushing.',
        )

    def get_configurable_settings(self):
        """Get all the configurable settings.


@@ -122,7 +122,7 @@ class DockerBenchmark(Benchmark):
                    self._curr_run_index, self._name, self._commands[cmd_idx]
                )
            )
            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
            if output.returncode != 0:
                self._result.set_return_code(ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE)
                logger.error(


@@ -173,7 +173,7 @@ class MicroBenchmarkWithInvoke(MicroBenchmark):
                )
            )
            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
            if output.returncode != 0:
                self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                logger.error(


@@ -8,7 +8,7 @@ import time
import statistics
from abc import abstractmethod

from superbench.common.utils import logger, stdout_logger
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark
from superbench.benchmarks.context import Enum
@@ -131,6 +131,14 @@ class ModelBenchmark(Benchmark):
            help='Enable option to use full float32 precision.',
        )

        self._parser.add_argument(
            '--log_n_steps',
            type=int,
            default=0,
            required=False,
            help='Real-time log every n steps.',
        )

    @abstractmethod
    def _judge_gpu_availability(self):
        """Judge GPUs' availability according to arguments and running environment."""
@@ -435,3 +443,16 @@ class ModelBenchmark(Benchmark):
        """Print environments or dependencies information."""
        # TODO: will implement it when add real benchmarks in the future.
        pass

    def _log_step_time(self, curr_step, precision, duration):
        """Log step time into stdout regularly.

        Args:
            curr_step (int): the index of current step.
            precision (Precision): precision of model and input data, such as float32, float16.
            duration (list): the durations of all steps.
        """
        if self._args.log_n_steps and curr_step % self._args.log_n_steps == 0:
            step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \
                else statistics.mean(duration[-self._args.log_n_steps:])
            stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n')
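For illustration (hypothetical numbers), with `--log_n_steps 100` a model benchmark would emit a line roughly every 100 steps, where the step time is the mean over the most recent 100 step durations (or over all collected durations while fewer than 100 are available). A standalone sketch of that windowed mean:

```python
import statistics

log_n_steps = 100                     # assumed value of --log_n_steps
duration = [5.2, 5.1, 5.3, 5.2, 5.4]  # made-up step times in milliseconds, for illustration only
curr_step = 500

if log_n_steps and curr_step % log_n_steps == 0:
    # Average over the last log_n_steps entries, or over all of them if fewer exist.
    window = duration if len(duration) < log_n_steps else duration[-log_n_steps:]
    print(f'model-benchmark - float16: step {curr_step}, step time {statistics.mean(window)}')
```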


@@ -151,6 +151,7 @@ class PytorchBERT(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end, check_frequency):
                    return duration
@@ -179,6 +180,7 @@ class PytorchBERT(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end):
                    return duration


@@ -114,6 +114,7 @@ class PytorchCNN(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end, check_frequency):
                    return duration
@@ -143,6 +144,7 @@ class PytorchCNN(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end):
                    return duration


@@ -145,6 +145,7 @@ class PytorchGPT2(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end, check_frequency):
                    return duration
@@ -173,6 +174,7 @@ class PytorchGPT2(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end):
                    return duration


@@ -154,6 +154,7 @@ class PytorchLSTM(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end, check_frequency):
                    return duration
@@ -183,6 +184,7 @@ class PytorchLSTM(PytorchBase):
                if curr_step > self._args.num_warmup:
                    # Save the step time of every training/inference step, unit is millisecond.
                    duration.append((end - start) * 1000)
                    self._log_step_time(curr_step, precision, duration)
                if self._is_finished(curr_step, end):
                    return duration


@@ -5,6 +5,7 @@
from superbench.common.utils.azure import get_vm_size
from superbench.common.utils.logging import SuperBenchLogger, logger
from superbench.common.utils.stdout_logging import StdLogger, stdout_logger
from superbench.common.utils.file_handler import rotate_dir, create_sb_output_dir, get_sb_config
from superbench.common.utils.lazy_import import LazyImport
from superbench.common.utils.process import run_command
@@ -16,6 +17,8 @@ device_manager = LazyImport('superbench.common.utils.device_manager')
__all__ = [
    'LazyImport',
    'SuperBenchLogger',
    'StdLogger',
    'stdout_logger',
    'create_sb_output_dir',
    'device_manager',
    'get_sb_config',


@@ -129,7 +129,7 @@ class DeviceManager:
        Return:
            remapped_metrics (dict): the row remapped information, None means failed to get the data.
        """
        output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
        if output.returncode == 0:
            begin = output.stdout.find('Remapped Rows')
            end = output.stdout.find('Temperature', begin)


@@ -4,19 +4,47 @@
"""Process Utility."""

import subprocess
import os
import shlex

from superbench.common.utils import stdout_logger


def run_command(command, quite=False, flush_output=False):
    """Run command in string format, return the result with stdout and stderr.

    Args:
        command (str): command to run.
        quite (bool): no stdout display of the command if quite is True.
        flush_output (bool): enable real-time output flush or not when running the command.

    Return:
        result (subprocess.CompletedProcess): The return value from subprocess.run().
    """
    if flush_output:
        process = None
        try:
            args = shlex.split(command)
            process = subprocess.Popen(
                args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
            )
            output = ''
            for line in process.stdout:
                output += line
                if not quite:
                    stdout_logger.log(line)
            process.wait()
            retcode = process.poll()
            return subprocess.CompletedProcess(args=args, returncode=retcode, stdout=output)
        except Exception as e:
            if process:
                process.kill()
                process.wait()
            return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
    else:
        result = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
        )
        if not quite:
            stdout_logger.log(result.stdout)
        return result
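As a minimal usage sketch of the updated helper (the commands below are arbitrary examples chosen for illustration, not part of this change):

```python
from superbench.common.utils import run_command

# Stream each output line through stdout_logger (terminal + sb-bench.log) while the command runs.
result = run_command('ping -c 3 localhost', flush_output=True)
print(result.returncode)

# Run quietly: the output is still captured in the returned CompletedProcess,
# but nothing is mirrored to stdout.
result = run_command('echo hello', quite=True)
print(result.stdout)
```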


@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""SuperBench stdout logging module."""

import sys


class StdLogger:
    """Logger class to enable or disable redirecting STDOUT and STDERR to a file."""
    class StdoutLoggerStream:
        """StdoutLoggerStream class which redirects sys.stdout to a file."""
        def __init__(self, filename, rank):
            """Init the class with filename.

            Args:
                filename (str): the path of the file to save the log.
                rank (int): the rank id.
            """
            self._terminal = sys.stdout
            self._rank = rank
            self._log_file_handler = open(filename, 'a')

        def __getattr__(self, attr):
            """Override __getattr__.

            Args:
                attr (str): Attribute name.

            Returns:
                Any: Attribute value.
            """
            return getattr(self._terminal, attr)

        def write(self, message):
            """Write the message to the stream.

            Args:
                message (str): the message to log.
            """
            message = f'[{self._rank}]: {message}'
            self._terminal.write(message)
            self._log_file_handler.write(message)
            self._log_file_handler.flush()

        def flush(self):
            """Override flush."""
            pass

        def restore(self):
            """Restore sys.stdout and close the file."""
            self._log_file_handler.close()
            sys.stdout = self._terminal

    def add_file_handler(self, filename):
        """Set the path of the file to save the log.

        Args:
            filename (str): the path of the file to save the log.
        """
        self.filename = filename

    def __init__(self):
        """Init the logger."""
        self.logger_stream = None

    def start(self, rank):
        """Start the logger to redirect sys.stdout to the file.

        Args:
            rank (int): the rank id.
        """
        self.logger_stream = self.StdoutLoggerStream(self.filename, rank)
        sys.stdout = self.logger_stream
        sys.stderr = sys.stdout

    def stop(self):
        """Restore sys.stdout to the terminal."""
        if self.logger_stream is not None:
            self.logger_stream.restore()

    def log(self, message):
        """Write the message into the logger.

        Args:
            message (str): the message to log.
        """
        if self.logger_stream:
            self.logger_stream.write(message)
        else:
            sys.stdout.write(message)


stdout_logger = StdLogger()
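As a rough sketch of the intended lifecycle, mirroring how the executor wires this up later in this diff (the file path and rank below are arbitrary values for illustration):

```python
from superbench.common.utils import stdout_logger

# Point the logger at a file, then redirect stdout/stderr for the current rank.
stdout_logger.add_file_handler('/tmp/sb-bench.log')  # arbitrary path for illustration
stdout_logger.start(rank=0)                          # every line gets a "[0]: " rank prefix

print('benchmark output')          # goes to both the terminal and the log file
stdout_logger.log('extra line\n')  # explicit logging through the same stream

# Restore sys.stdout and close the log file when the benchmark run finishes.
stdout_logger.stop()
```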


@@ -10,7 +10,7 @@ from pathlib import Path
from omegaconf import ListConfig

from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir, stdout_logger
from superbench.common.devices import GPU
from superbench.monitor import Monitor
@@ -29,6 +29,7 @@ class SuperBenchExecutor():
        self._output_path = Path(sb_output_dir).expanduser().resolve()

        self.__set_logger('sb-exec.log')
        self.__set_stdout_logger(self._output_path / 'sb-bench.log')
        logger.debug('Executor uses config: %s.', self._sb_config)
        logger.debug('Executor writes to: %s.', str(self._output_path))
@@ -46,6 +47,16 @@ class SuperBenchExecutor():
        """
        SuperBenchLogger.add_handler(logger.logger, filename=str(self._output_path / filename))

    def __set_stdout_logger(self, filename):
        """Set stdout logger and redirect logs and stdout into the file.

        Args:
            filename (str): Log file name.
        """
        stdout_logger.add_file_handler(filename)
        stdout_logger.start(self.__get_rank_id())
        SuperBenchLogger.add_handler(logger.logger, filename=filename)

    def __validate_sb_config(self):
        """Validate SuperBench config object.
@@ -244,5 +255,6 @@ class SuperBenchExecutor():
            if monitor:
                monitor.stop()
            stdout_logger.stop()

            self.__write_benchmark_results(benchmark_name, benchmark_results)
            os.chdir(cwd)


@@ -158,6 +158,8 @@ def test_arguments_related_interfaces():
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]
@@ -194,6 +196,8 @@ def test_preprocess():
  --duration int        The elapsed time of benchmark in seconds.
  --force_fp32          Enable option to use full float32 precision.
  --hidden_size int     Hidden size.
  --log_flushing        Real-time log flushing.
  --log_n_steps int     Real-time log every n steps.
  --log_raw_data        Log raw data into file instead of saving it into
                        result object.
  --model_action ModelAction [ModelAction ...]


@@ -114,6 +114,7 @@ def test_get_benchmark_configurable_settings():
    expected = """optional arguments:
  --duration int      The elapsed time of benchmark in seconds.
  --log_flushing      Real-time log flushing.
  --log_raw_data      Log raw data into file instead of saving it into result
                      object.
  --lower_bound int   The lower bound for accumulation.


@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Test common.util.process."""

from superbench.common.utils import run_command


def test_run_command():
    """Test run_command."""
    command = 'echo 123'
    expected_output = '123\n'
    output = run_command(command)
    assert (output.stdout == expected_output)
    assert (output.returncode == 0)
    output = run_command(command, flush_output=True)
    assert (output.stdout == expected_output)
    assert (output.returncode == 0)

    command = 'abb'
    output = run_command(command)
    assert (output.returncode != 0)
    output = run_command(command, flush_output=True)
    assert (output.returncode != 0)