Executor - Add stdout logging util module and enable real-time logging flushing in executor (#445)
**Description**

Add a stdout logging util module and enable real-time log flushing in the executor.

**Major Revision**

- Add a stdout logging util module to redirect stdout into a file log.
- Enable stdout logging in the executor to write benchmark output into both stdout and the file `sb-bench.log`.
- Enable real-time log flushing in `run_command` of micro-benchmarks through the `log_flushing` config.

**Minor Revision**

- Add a `log_n_steps` argument to enable regular step-time logging in model benchmarks.
- Update related docs.
This commit is contained in:

Parent: f2634d8608
Commit: 9dfefce350
@@ -68,6 +68,7 @@ Here're the details about work directory structure for SuperBench Runner.
 │   │   └── rank-0              # output for each rank in each benchmark
 │   │       ├── results.json    # raw results
 │   │       └── monitor.jsonl   # monitor results (optional)
+│   ├── sb-bench.log            # SuperBench benchmarks' runtime log for debugging
 │   └── sb-exec.log             # collected SuperBench Executor log
 ├── sb-run.log                  # SuperBench Runner log
 ├── sb.config.yaml              # SuperBench configuration snapshot

@@ -98,6 +99,7 @@ The `/root` directory is mounted from `$HOME/sb-workspace` on the host path.
 │   ├── results.json            # raw results
 │   └── monitor.jsonl           # monitor results (optional)
 ├── sb.config.yaml              # SuperBench configuration snapshot
+├── sb-bench.log                # SuperBench benchmarks' runtime log for debugging
 └── sb.env                      # SuperBench runtime environment variables
 ```
@@ -336,10 +336,11 @@ A list of models to run, only supported in model-benchmark.

 Parameters for benchmark to use, varying for different benchmarks.

-There are three common parameters for all benchmarks:
+There are four common parameters for all benchmarks:
 * run_count: how many times the user wants to run this benchmark, default value is 1.
 * duration: the elapsed time of benchmark in seconds. It works for all model-benchmarks; for micro-benchmarks, benchmark authors should consume it by themselves.
 * log_raw_data: log raw data into file instead of saving it into result object, default value is `False`. Benchmarks that have large raw output may want to set it to `True`, such as `nccl-bw`/`rccl-bw`.
+* log_flushing: real-time log flushing, default value is `False`.

 For Model-Benchmark, there are some parameters that can control the elapsed time.
 * duration: the elapsed time of benchmark in seconds.
@@ -74,6 +74,12 @@ class Benchmark(ABC):
             default=False,
             help='Log raw data into file instead of saving it into result object.',
         )
+        self._parser.add_argument(
+            '--log_flushing',
+            action='store_true',
+            default=False,
+            help='Real-time log flushing.',
+        )

     def get_configurable_settings(self):
         """Get all the configurable settings.
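To illustrate how the common parameters above behave once parsed, here is a minimal, self-contained sketch. It is not the real `Benchmark` parser, just a standard-library `argparse` mirror of the flags documented above, with the same defaults:

```python
# Minimal sketch of the four common benchmark flags described above.
# This is NOT the real Benchmark parser, only an argparse mirror of it.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--run_count', type=int, default=1, help='How many times to run this benchmark.')
parser.add_argument('--duration', type=int, default=0, help='The elapsed time of benchmark in seconds.')
parser.add_argument('--log_raw_data', action='store_true', default=False, help='Log raw data into file.')
parser.add_argument('--log_flushing', action='store_true', default=False, help='Real-time log flushing.')

# '--log_flushing' in the benchmark parameters simply flips a boolean that is
# later forwarded to run_command(..., flush_output=args.log_flushing).
args = parser.parse_args('--log_flushing'.split())
assert args.log_flushing is True and args.log_raw_data is False
```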
@@ -122,7 +122,7 @@ class DockerBenchmark(Benchmark):
                     self._curr_run_index, self._name, self._commands[cmd_idx]
                 )
             )
-            output = run_command(self._commands[cmd_idx])
+            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
             if output.returncode != 0:
                 self._result.set_return_code(ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE)
                 logger.error(
@@ -173,7 +173,7 @@ class MicroBenchmarkWithInvoke(MicroBenchmark):
                 )
             )

-            output = run_command(self._commands[cmd_idx])
+            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
             if output.returncode != 0:
                 self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 logger.error(
@@ -8,7 +8,7 @@ import time
 import statistics
 from abc import abstractmethod

-from superbench.common.utils import logger
+from superbench.common.utils import logger, stdout_logger
 from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
 from superbench.benchmarks.base import Benchmark
 from superbench.benchmarks.context import Enum

@@ -131,6 +131,14 @@ class ModelBenchmark(Benchmark):
             help='Enable option to use full float32 precision.',
         )

+        self._parser.add_argument(
+            '--log_n_steps',
+            type=int,
+            default=0,
+            required=False,
+            help='Real-time log every n steps.',
+        )
+
     @abstractmethod
     def _judge_gpu_availability(self):
         """Judge GPUs' availability according to arguments and running environment."""
@@ -435,3 +443,16 @@ class ModelBenchmark(Benchmark):
         """Print environments or dependencies information."""
         # TODO: will implement it when add real benchmarks in the future.
         pass
+
+    def _log_step_time(self, curr_step, precision, duration):
+        """Log step time into stdout regularly.
+
+        Args:
+            curr_step (int): the index of current step.
+            precision (Precision): precision of model and input data, such as float32, float16.
+            duration (list): the durations of all steps.
+        """
+        if self._args.log_n_steps and curr_step % self._args.log_n_steps == 0:
+            step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \
+                else statistics.mean(duration[-self._args.log_n_steps:])
+            stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n')
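As a quick illustration of the windowing logic in `_log_step_time` above, the sketch below uses made-up step times: with `log_n_steps = 4`, step 8 logs the mean of the last four durations only.

```python
# Stand-alone illustration of the _log_step_time() window (numbers are made up).
import statistics

log_n_steps = 4
duration = [10.1, 10.3, 9.9, 10.0, 10.2, 9.8, 10.4, 10.0]    # step times in milliseconds
curr_step = len(duration)                                     # 8, a multiple of log_n_steps

if log_n_steps and curr_step % log_n_steps == 0:
    step_time = statistics.mean(duration) if len(duration) < log_n_steps \
        else statistics.mean(duration[-log_n_steps:])
    print(f'step {curr_step}, step time {step_time}')         # mean of the last 4 values: 10.1
```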
@@ -151,6 +151,7 @@ class PytorchBERT(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
                     return duration

@@ -179,6 +180,7 @@ class PytorchBERT(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end):
                     return duration

@@ -114,6 +114,7 @@ class PytorchCNN(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
                     return duration

@@ -143,6 +144,7 @@ class PytorchCNN(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end):
                     return duration

@@ -145,6 +145,7 @@ class PytorchGPT2(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
                     return duration

@@ -173,6 +174,7 @@ class PytorchGPT2(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end):
                     return duration

@@ -154,6 +154,7 @@ class PytorchLSTM(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end, check_frequency):
                     return duration

@@ -183,6 +184,7 @@ class PytorchLSTM(PytorchBase):
                 if curr_step > self._args.num_warmup:
                     # Save the step time of every training/inference step, unit is millisecond.
                     duration.append((end - start) * 1000)
+                    self._log_step_time(curr_step, precision, duration)
                 if self._is_finished(curr_step, end):
                     return duration

@@ -5,6 +5,7 @@

 from superbench.common.utils.azure import get_vm_size
 from superbench.common.utils.logging import SuperBenchLogger, logger
+from superbench.common.utils.stdout_logging import StdLogger, stdout_logger
 from superbench.common.utils.file_handler import rotate_dir, create_sb_output_dir, get_sb_config
 from superbench.common.utils.lazy_import import LazyImport
 from superbench.common.utils.process import run_command

@@ -16,6 +17,8 @@ device_manager = LazyImport('superbench.common.utils.device_manager')
 __all__ = [
     'LazyImport',
     'SuperBenchLogger',
+    'StdLogger',
+    'stdout_logger',
     'create_sb_output_dir',
     'device_manager',
     'get_sb_config',
@@ -129,7 +129,7 @@ class DeviceManager:
         Return:
             remapped_metrics (dict): the row remapped information, None means failed to get the data.
         """
-        output = process.run_command('nvidia-smi -i {} -q'.format(idx))
+        output = process.run_command('nvidia-smi -i {} -q'.format(idx), quite=True)
         if output.returncode == 0:
             begin = output.stdout.find('Remapped Rows')
             end = output.stdout.find('Temperature', begin)
@@ -4,19 +4,47 @@
 """Process Utility."""

 import subprocess
+import os
+import shlex
+
+from superbench.common.utils import stdout_logger


-def run_command(command):
+def run_command(command, quite=False, flush_output=False):
     """Run command in string format, return the result with stdout and stderr.

     Args:
         command (str): command to run.
+        quite (bool): no stdout display of the command if quite is True.
+        flush_output (bool): enable real-time output flush or not when running the command.

     Return:
         result (subprocess.CompletedProcess): The return value from subprocess.run().
     """
-    result = subprocess.run(
-        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
-    )
-
-    return result
+    if flush_output:
+        process = None
+        try:
+            args = shlex.split(command)
+            process = subprocess.Popen(
+                args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
+            )
+            output = ''
+            for line in process.stdout:
+                output += line
+                if not quite:
+                    stdout_logger.log(line)
+            process.wait()
+            retcode = process.poll()
+            return subprocess.CompletedProcess(args=args, returncode=retcode, stdout=output)
+        except Exception as e:
+            if process:
+                process.kill()
+                process.wait()
+            return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
+    else:
+        result = subprocess.run(
+            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+        )
+        if not quite:
+            stdout_logger.log(result.stdout)
+        return result
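A short usage sketch of the updated helper (the echoed command is an arbitrary example): with `flush_output=True` each output line is streamed through `stdout_logger` as it arrives while the full output is still returned in the `CompletedProcess`; `quite=True` suppresses the streaming in either mode.

```python
from superbench.common.utils import run_command

# Streamed line by line via stdout_logger and still captured in result.stdout.
result = run_command('echo hello', flush_output=True)
assert result.returncode == 0 and result.stdout == 'hello\n'

# Buffered mode (default); quite=True skips the stdout_logger echo entirely.
quiet_result = run_command('echo hello', quite=True)
assert quiet_result.stdout == 'hello\n'
```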
@@ -0,0 +1,94 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""SuperBench stdout logging module."""
+
+import sys
+
+
+class StdLogger:
+    """Logger class to enable or disable redirecting STDOUT and STDERR to file."""
+    class StdoutLoggerStream:
+        """StdoutLoggerStream class which redirects the sys.stdout to file."""
+        def __init__(self, filename, rank):
+            """Init the class with filename.
+
+            Args:
+                filename (str): the path of the file to save the log.
+                rank (int): the rank id.
+            """
+            self._terminal = sys.stdout
+            self._rank = rank
+            self._log_file_handler = open(filename, 'a')
+
+        def __getattr__(self, attr):
+            """Override __getattr__.
+
+            Args:
+                attr (str): Attribute name.
+
+            Returns:
+                Any: Attribute value.
+            """
+            return getattr(self._terminal, attr)
+
+        def write(self, message):
+            """Write the message to the stream.
+
+            Args:
+                message (str): the message to log.
+            """
+            message = f'[{self._rank}]: {message}'
+            self._terminal.write(message)
+            self._log_file_handler.write(message)
+            self._log_file_handler.flush()
+
+        def flush(self):
+            """Override flush."""
+            pass
+
+        def restore(self):
+            """Restore sys.stdout and close the file."""
+            self._log_file_handler.close()
+            sys.stdout = self._terminal
+
+    def add_file_handler(self, filename):
+        """Init the class with filename.
+
+        Args:
+            filename (str): the path of the file to save the log.
+        """
+        self.filename = filename
+
+    def __init__(self):
+        """Init the logger."""
+        self.logger_stream = None
+
+    def start(self, rank):
+        """Start the logger to redirect the sys.stdout to file.
+
+        Args:
+            rank (int): the rank id.
+        """
+        self.logger_stream = self.StdoutLoggerStream(self.filename, rank)
+        sys.stdout = self.logger_stream
+        sys.stderr = sys.stdout
+
+    def stop(self):
+        """Restore the sys.stdout to terminal."""
+        if self.logger_stream is not None:
+            self.logger_stream.restore()
+
+    def log(self, message):
+        """Write the message into the logger.
+
+        Args:
+            message (str): the message to log.
+        """
+        if self.logger_stream:
+            self.logger_stream.write(message)
+        else:
+            sys.stdout.write(message)
+
+
+stdout_logger = StdLogger()
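A minimal usage sketch of the new `stdout_logger` singleton (the log path and rank below are placeholders, not values from the commit): after `start()`, both ordinary `print()` output and explicit `log()` calls land in the file, prefixed with the rank id, and `stop()` restores `sys.stdout`.

```python
import os
import tempfile

from superbench.common.utils import stdout_logger

log_file = os.path.join(tempfile.gettempdir(), 'sb-bench-demo.log')   # placeholder path

stdout_logger.add_file_handler(log_file)   # remember where to write
stdout_logger.start(0)                     # redirect sys.stdout/sys.stderr, rank 0 as an example

print('benchmark output line')             # goes to terminal and file, prefixed with [0]:
stdout_logger.log('explicit message\n')    # same destinations, explicit call

stdout_logger.stop()                       # close the file and restore sys.stdout
```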
@@ -10,7 +10,7 @@ from pathlib import Path
 from omegaconf import ListConfig

 from superbench.benchmarks import Platform, Framework, BenchmarkRegistry
-from superbench.common.utils import SuperBenchLogger, logger, rotate_dir
+from superbench.common.utils import SuperBenchLogger, logger, rotate_dir, stdout_logger
 from superbench.common.devices import GPU
 from superbench.monitor import Monitor

@@ -29,6 +29,7 @@ class SuperBenchExecutor():
         self._output_path = Path(sb_output_dir).expanduser().resolve()

         self.__set_logger('sb-exec.log')
+        self.__set_stdout_logger(self._output_path / 'sb-bench.log')
         logger.debug('Executor uses config: %s.', self._sb_config)
         logger.debug('Executor writes to: %s.', str(self._output_path))

@@ -46,6 +47,16 @@ class SuperBenchExecutor():
         """
         SuperBenchLogger.add_handler(logger.logger, filename=str(self._output_path / filename))

+    def __set_stdout_logger(self, filename):
+        """Set stdout logger and redirect logs and stdout into the file.
+
+        Args:
+            filename (str): Log file name.
+        """
+        stdout_logger.add_file_handler(filename)
+        stdout_logger.start(self.__get_rank_id())
+        SuperBenchLogger.add_handler(logger.logger, filename=filename)
+
     def __validate_sb_config(self):
         """Validate SuperBench config object.

@@ -244,5 +255,6 @@ class SuperBenchExecutor():

         if monitor:
             monitor.stop()
+        stdout_logger.stop()
         self.__write_benchmark_results(benchmark_name, benchmark_results)
         os.chdir(cwd)
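Putting the executor changes together, the runtime sequence looks roughly like the following (a simplified sketch, not the actual executor code; the file name matches the docs above and the rank id and command are examples):

```python
from superbench.common.utils import run_command, stdout_logger

stdout_logger.add_file_handler('sb-bench.log')    # same file documented in the work directory
stdout_logger.start(0)                            # one redirect per rank, 0 here as an example

# Every benchmark command launched with flush_output=True is now streamed into
# both the console and sb-bench.log in real time instead of only after it exits.
run_command('echo "fake benchmark output"', flush_output=True)

stdout_logger.stop()                              # restore sys.stdout when the executor finishes
```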
@@ -158,6 +158,8 @@ def test_arguments_related_interfaces():
     --duration int        The elapsed time of benchmark in seconds.
     --force_fp32          Enable option to use full float32 precision.
     --hidden_size int     Hidden size.
+    --log_flushing        Real-time log flushing.
+    --log_n_steps int     Real-time log every n steps.
     --log_raw_data        Log raw data into file instead of saving it into
                           result object.
     --model_action ModelAction [ModelAction ...]

@@ -194,6 +196,8 @@ def test_preprocess():
     --duration int        The elapsed time of benchmark in seconds.
     --force_fp32          Enable option to use full float32 precision.
     --hidden_size int     Hidden size.
+    --log_flushing        Real-time log flushing.
+    --log_n_steps int     Real-time log every n steps.
     --log_raw_data        Log raw data into file instead of saving it into
                           result object.
     --model_action ModelAction [ModelAction ...]
@@ -114,6 +114,7 @@ def test_get_benchmark_configurable_settings():

     expected = """optional arguments:
   --duration int        The elapsed time of benchmark in seconds.
+  --log_flushing        Real-time log flushing.
   --log_raw_data        Log raw data into file instead of saving it into result
                         object.
   --lower_bound int     The lower bound for accumulation.
@@ -0,0 +1,24 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Test common.util.process."""
+
+from superbench.common.utils import run_command
+
+
+def test_run_command():
+    """Test run_command."""
+    command = 'echo 123'
+    expected_output = '123\n'
+    output = run_command(command)
+    assert (output.stdout == expected_output)
+    assert (output.returncode == 0)
+    output = run_command(command, flush_output=True)
+    assert (output.stdout == expected_output)
+    assert (output.returncode == 0)
+
+    command = 'abb'
+    output = run_command(command)
+    assert (output.returncode != 0)
+    output = run_command(command, flush_output=True)
+    assert (output.returncode != 0)