Executor - Add stdout logging util module and enable real-time log flushing in executor (#445)

**Description**
Add stdout logging util module and enable real-time log flushing in executor

**Major Revision**
- Add stdout logging util module to redirect stdout into a log file
- Enable stdout logging in the executor to write benchmark output to both stdout and the file `sb-bench.log`
- Enable real-time log flushing in `run_command` of micro-benchmarks through the `log_flushing` config

**Minor Revision**
- Add `log_n_steps` arg to enable regular step time logging in model benchmarks
- Update related docs
This commit is contained in:
Yuting Jiang 2022-12-30 17:40:28 +08:00 committed by GitHub
Parent f2634d8608
Commit 9dfefce350
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
18 changed files: 216 additions and 12 deletions

View file

@ -68,6 +68,7 @@ Here're the details about work directory structure for SuperBench Runner.
│ │ └── rank-0 # output for each rank in each benchmark
│ │ ├── results.json # raw results
│ │ └── monitor.jsonl # monitor results (optional)
│ ├── sb-bench.log # SuperBench benchmarks' runtime log for debugging
│ └── sb-exec.log # collected SuperBench Executor log
├── sb-run.log # SuperBench Runner log
├── sb.config.yaml # SuperBench configuration snapshot
@ -98,6 +99,7 @@ The `/root` directory is mounted from `$HOME/sb-workspace` on the host path.
│ ├── results.json # raw results
│ └── monitor.jsonl # monitor results (optional)
├── sb.config.yaml # SuperBench configuration snapshot
├── sb-bench.log # SuperBench benchmarks' runtime log for debugging
└── sb.env # SuperBench runtime environment variables
```

View file

@ -336,10 +336,11 @@ A list of models to run, only supported in model-benchmark.
Parameters for the benchmark to use, varying for different benchmarks.
There are three common parameters for all benchmarks:
There are four common parameters for all benchmarks:
* run_count: how many times the user wants to run this benchmark, default value is 1.
* duration: the elapsed time of the benchmark in seconds. It works for all model-benchmarks; for micro-benchmarks, benchmark authors need to consume it themselves.
* log_raw_data: log raw data into a file instead of saving it into the result object, default value is `False`. Benchmarks that produce large raw output, such as `nccl-bw`/`rccl-bw`, may want to set it to `True`.
* log_flushing: enable real-time log flushing, default value is `False`.
For Model-Benchmark, there are some parameters that can control the elapsed time.
* duration: the elapsed time of the benchmark in seconds.
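
As an illustration of how these common parameters are consumed, below is a minimal sketch (assuming SuperBench is installed; `kernel-launch` and the parameter values are only illustrative) that passes the new `--log_flushing` flag together with the other common flags through `BenchmarkRegistry`:

```python
# Minimal sketch: pass the common benchmark parameters, including the new
# --log_flushing flag, as a parameter string. 'kernel-launch' is used here
# purely as an example of a registered micro-benchmark.
from superbench.benchmarks import BenchmarkRegistry, Platform

context = BenchmarkRegistry.create_benchmark_context(
    'kernel-launch',
    platform=Platform.CUDA,
    parameters='--run_count 2 --log_raw_data --log_flushing',
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
    print(benchmark.name, benchmark.return_code, benchmark.result)
```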

View file

@ -74,6 +74,12 @@ class Benchmark(ABC):
default=False,
help='Log raw data into file instead of saving it into result object.',
)
self._parser.add_argument(
'--log_flushing',
action='store_true',
default=False,
help='Real-time log flushing.',
)
def get_configurable_settings(self):
"""Get all the configurable settings.

View file

@ -122,7 +122,7 @@ class DockerBenchmark(Benchmark):
self._curr_run_index, self._name, self._commands[cmd_idx]
)
)
output = run_command(self._commands[cmd_idx])
output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
if output.returncode != 0:
self._result.set_return_code(ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE)
logger.error(

View file

@ -173,7 +173,7 @@ class MicroBenchmarkWithInvoke(MicroBenchmark):
)
)
output = run_command(self._commands[cmd_idx])
output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
if output.returncode != 0:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
logger.error(

View file

@ -8,7 +8,7 @@ import time
import statistics
from abc import abstractmethod
from superbench.common.utils import logger
from superbench.common.utils import logger, stdout_logger
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark
from superbench.benchmarks.context import Enum
@ -131,6 +131,14 @@ class ModelBenchmark(Benchmark):
help='Enable option to use full float32 precision.',
)
self._parser.add_argument(
'--log_n_steps',
type=int,
default=0,
required=False,
help='Real-time log every n steps.',
)
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
@ -435,3 +443,16 @@ class ModelBenchmark(Benchmark):
"""Print environments or dependencies information."""
# TODO: will implement it when add real benchmarks in the future.
pass
def _log_step_time(self, curr_step, precision, duration):
"""Log step time into stdout regularly.
Args:
curr_step (int): the index of current step
precision (Precision): precision of model and input data, such as float32, float16.
duration (list): the durations of all steps
"""
if self._args.log_n_steps and curr_step % self._args.log_n_steps == 0:
step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \
else statistics.mean(duration[-self._args.log_n_steps:])
stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n')
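
The averaging rule above can be read as: log the mean of the most recent `log_n_steps` step times, falling back to the mean of everything collected so far when fewer steps are available. A standalone sketch (the helper name is hypothetical, not part of the benchmark class):

```python
# Hypothetical standalone reproduction of the step-time averaging used in
# _log_step_time: average the last `log_n_steps` durations (milliseconds),
# or all collected durations if fewer than `log_n_steps` exist yet.
import statistics

def mean_of_recent_steps(duration, log_n_steps):
    recent = duration if len(duration) < log_n_steps else duration[-log_n_steps:]
    return statistics.mean(recent)

print(mean_of_recent_steps([10.0, 12.0, 11.0, 13.0], 2))  # 12.0, mean of the last two steps
```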

View file

@ -151,6 +151,7 @@ class PytorchBERT(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration
@ -179,6 +180,7 @@ class PytorchBERT(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end):
return duration

View file

@ -114,6 +114,7 @@ class PytorchCNN(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration
@ -143,6 +144,7 @@ class PytorchCNN(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end):
return duration

View file

@ -145,6 +145,7 @@ class PytorchGPT2(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration
@ -173,6 +174,7 @@ class PytorchGPT2(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end):
return duration

View file

@ -154,6 +154,7 @@ class PytorchLSTM(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end, check_frequency):
return duration
@ -183,6 +184,7 @@ class PytorchLSTM(PytorchBase):
if curr_step > self._args.num_warmup:
# Save the step time of every training/inference step, unit is millisecond.
duration.append((end - start) * 1000)
self._log_step_time(curr_step, precision, duration)
if self._is_finished(curr_step, end):
return duration

View file

@ -5,6 +5,7 @@
from superbench.common.utils.azure import get_vm_size
from superbench.common.utils.logging import SuperBenchLogger, logger
from superbench.common.utils.stdout_logging import StdLogger, stdout_logger
from superbench.common.utils.file_handler import rotate_dir, create_sb_output_dir, get_sb_config
from superbench.common.utils.lazy_import import LazyImport
from superbench.common.utils.process import run_command
@ -16,6 +17,8 @@ device_manager = LazyImport('superbench.common.utils.device_manager')
__all__ = [
'LazyImport',
'SuperBenchLogger',
'StdLogger',
'stdout_logger',
'create_sb_output_dir',
'device_manager',
'get_sb_config',

View file

@ -129,7 +129,7 @@ class DeviceManager:
Return:
remapped_metrics (dict): the remapped rows information, None means failed to get the data.
"""
output = process.run_command('nvidia-smi -i {} -q'.format(idx))
output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
if output.returncode == 0:
begin = output.stdout.find('Remapped Rows')
end = output.stdout.find('Temperature', begin)

View file

@ -4,19 +4,47 @@
"""Process Utility."""
import subprocess
import os
import shlex
from superbench.common.utils import stdout_logger
def run_command(command):
def run_command(command, quite=False, flush_output=False):
"""Run command in string format, return the result with stdout and stderr.
Args:
command (str): command to run.
quite (bool): no stdout display of the command if quite is True.
flush_output (bool): enable real-time output flush or not when running the command.
Return:
result (subprocess.CompletedProcess): The return value from subprocess.run().
"""
result = subprocess.run(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
)
return result
if flush_output:
process = None
try:
args = shlex.split(command)
process = subprocess.Popen(
args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
)
output = ''
for line in process.stdout:
output += line
if not quite:
stdout_logger.log(line)
process.wait()
retcode = process.poll()
return subprocess.CompletedProcess(args=args, returncode=retcode, stdout=output)
except Exception as e:
if process:
process.kill()
process.wait()
return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
else:
result = subprocess.run(
command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
)
if not quite:
stdout_logger.log(result.stdout)
return result
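
A short usage sketch of the updated helper (assuming `superbench` is importable; the command is illustrative): with `flush_output=True` each output line is forwarded to `stdout_logger` as soon as it is produced, while the full output is still collected into the returned `CompletedProcess`.

```python
from superbench.common.utils import run_command

# Real-time flushing: each line is logged as it appears, and stdout is still captured.
result = run_command('echo hello', flush_output=True)
assert result.returncode == 0 and result.stdout == 'hello\n'

# Quiet mode: skip the stdout_logger output but keep the captured stdout.
result = run_command('echo hello', quite=True)
assert result.stdout == 'hello\n'
```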

View file

@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""SuperBench stdout logging module."""
import sys
class StdLogger:
"""Logger class to enable or disable to redirect STDOUT and STDERR to file."""
class StdoutLoggerStream:
"""StdoutLoggerStream class which redirect the sys.stdout to file."""
def __init__(self, filename, rank):
"""Init the class with filename.
Args:
filename (str): the path of the file to save the log
rank (int): the rank id
"""
self._terminal = sys.stdout
self._rank = rank
self._log_file_handler = open(filename, 'a')
def __getattr__(self, attr):
"""Override __getattr__.
Args:
attr (str): Attribute name.
Returns:
Any: Attribute value.
"""
return getattr(self._terminal, attr)
def write(self, message):
"""Write the message to the stream.
Args:
message (str): the message to log.
"""
message = f'[{self._rank}]: {message}'
self._terminal.write(message)
self._log_file_handler.write(message)
self._log_file_handler.flush()
def flush(self):
"""Override flush."""
pass
def restore(self):
"""Restore sys.stdout and close the file."""
self._log_file_handler.close()
sys.stdout = self._terminal
def add_file_handler(self, filename):
"""Init the class with filename.
Args:
filename (str): the path of file to save the log
"""
self.filename = filename
def __init__(self):
"""Init the logger."""
self.logger_stream = None
def start(self, rank):
"""Start the logger to redirect the sys.stdout to file.
Args:
rank (int): the rank id
"""
self.logger_stream = self.StdoutLoggerStream(self.filename, rank)
sys.stdout = self.logger_stream
sys.stderr = sys.stdout
def stop(self):
"""Restore the sys.stdout to termital."""
if self.logger_stream is not None:
self.logger_stream.restore()
def log(self, message):
"""Write the message into the logger.
Args:
message (str): the message to log.
"""
if self.logger_stream:
self.logger_stream.write(message)
else:
sys.stdout.write(message)
stdout_logger = StdLogger()
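
A usage sketch of the `stdout_logger` singleton defined above (the log path and rank are illustrative): once started, anything written to `sys.stdout`/`sys.stderr` is prefixed with the rank, shown on the terminal, and appended to the log file; `stop()` restores the original stream.

```python
from superbench.common.utils import stdout_logger

stdout_logger.add_file_handler('/tmp/sb-bench.log')  # illustrative log path
stdout_logger.start(0)                 # redirect stdout/stderr for rank 0
print('benchmark output')              # echoed to the terminal and written to the file, prefixed with '[0]: '
stdout_logger.log('direct message\n')  # explicit logging through the same stream
stdout_logger.stop()                   # restore sys.stdout and close the log file
```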

View file

@ -10,7 +10,7 @@ from pathlib import Path
from omegaconf import ListConfig
from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir
from superbench.common.utils import SuperBenchLogger, logger, rotate_dir, stdout_logger
from superbench.common.devices import GPU
from superbench.monitor import Monitor
@ -29,6 +29,7 @@ class SuperBenchExecutor():
self._output_path = Path(sb_output_dir).expanduser().resolve()
self.__set_logger('sb-exec.log')
self.__set_stdout_logger(self._output_path / 'sb-bench.log')
logger.debug('Executor uses config: %s.', self._sb_config)
logger.debug('Executor writes to: %s.', str(self._output_path))
@ -46,6 +47,16 @@ class SuperBenchExecutor():
"""
SuperBenchLogger.add_handler(logger.logger, filename=str(self._output_path / filename))
def __set_stdout_logger(self, filename):
"""Set stdout logger and redirect logs and stdout into the file.
Args:
filename (str): Log file name.
"""
stdout_logger.add_file_handler(filename)
stdout_logger.start(self.__get_rank_id())
SuperBenchLogger.add_handler(logger.logger, filename=filename)
def __validate_sb_config(self):
"""Validate SuperBench config object.
@ -244,5 +255,6 @@ class SuperBenchExecutor():
if monitor:
monitor.stop()
stdout_logger.stop()
self.__write_benchmark_results(benchmark_name, benchmark_results)
os.chdir(cwd)

View file

@ -158,6 +158,8 @@ def test_arguments_related_interfaces():
--duration int The elapsed time of benchmark in seconds.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
--log_n_steps int Real-time log every n steps.
--log_raw_data Log raw data into file instead of saving it into
result object.
--model_action ModelAction [ModelAction ...]
@ -194,6 +196,8 @@ def test_preprocess():
--duration int The elapsed time of benchmark in seconds.
--force_fp32 Enable option to use full float32 precision.
--hidden_size int Hidden size.
--log_flushing Real-time log flushing.
--log_n_steps int Real-time log every n steps.
--log_raw_data Log raw data into file instead of saving it into
result object.
--model_action ModelAction [ModelAction ...]

View file

@ -114,6 +114,7 @@ def test_get_benchmark_configurable_settings():
expected = """optional arguments:
--duration int The elapsed time of benchmark in seconds.
--log_flushing Real-time log flushing.
--log_raw_data Log raw data into file instead of saving it into result
object.
--lower_bound int The lower bound for accumulation.

View file

@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Test common.util.process."""
from superbench.common.utils import run_command
def test_run_command():
"""Test run_command."""
command = 'echo 123'
expected_output = '123\n'
output = run_command(command)
assert (output.stdout == expected_output)
assert (output.returncode == 0)
output = run_command(command, flush_output=True)
assert (output.stdout == expected_output)
assert (output.returncode == 0)
command = 'abb'
output = run_command(command)
assert (output.returncode != 0)
output = run_command(command, flush_output=True)
assert (output.returncode != 0)