Merge branch 'xiongyf/support-te-fp8' of https://github.com/microsoft/superbenchmark into xiongyf/support-te-fp8

Commit a6da958d6f
@@ -409,6 +409,19 @@ Test the performance of large scale matmul operation with multiple GPUs:

| pytorch-sharding-matmul/allreduce_time | time (ms) | Time of sharding matmul using allreduce. |
| pytorch-sharding-matmul/allgather_time | time (ms) | Time of sharding matmul using allgather. |

### `dist-inference`

#### Introduction

Test the performance of distributed model inference.
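
This benchmark is launched via `torch.distributed`; for example, with the example script added in this commit (assuming one node with 8 GPUs):

```bash
python3 -m torch.distributed.launch --nproc_per_node=8 examples/benchmarks/dist_inference.py
```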
#### Metrics

| Name                                             | Unit      | Description                                            |
|--------------------------------------------------|-----------|--------------------------------------------------------|
| pytorch-dist-inference/step_times                | time (ms) | Average time of model inference runs.                  |
| pytorch-dist-inference/step_times_${percentile}  | time (ms) | Tail (50,90,95,99,99.9) time of model inference runs.  |

## Storage Benchmarks

### `disk-benchmark`
@@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for distributed inference with pytorch.

Commands to run:
    python3 -m torch.distributed.launch --nproc_per_node=8 examples/benchmarks/dist_inference.py
"""

from superbench.benchmarks import Framework, BenchmarkRegistry
from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context('dist-inference', parameters='', framework=Framework.PYTORCH)

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
@@ -15,6 +15,7 @@ from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import Cu
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.dist_inference import DistInference
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_hpl_performance import CpuHplBenchmark

@@ -43,6 +44,7 @@ __all__ = [
    'CudaNcclBwBenchmark',
    'CudnnBenchmark',
    'DiskBenchmark',
    'DistInference',
    'GPCNetBenchmark',
    'GemmFlopsBenchmark',
    'GpuBurnBenchmark',
@@ -0,0 +1,455 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the distributed inference benchmark."""

import os
import time

import torch
import torch.nn.functional as F
import torch.distributed as dist

from superbench.common.utils import logger
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode, Precision
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.context import Enum


class ComputationKernelType(Enum):
    """The Enum class representing different computation kernel types."""
    ADDMM = 'addmm'
    MATMUL = 'matmul'
    MUL = 'mul'


class CommunicationKernelType(Enum):
    """The Enum class representing different communication kernel types."""
    ALLGATHER = 'allgather'
    ALLREDUCE = 'allreduce'
    ALLTOALL = 'alltoall'


class ActivationKernelType(Enum):
    """The Enum class representing different activation kernel types."""
    RELU = 'relu'
    SIGMOID = 'sigmoid'
    TANH = 'tanh'


class DistInferenceModel(torch.nn.Module):
    """The model class for the distributed inference benchmark."""
    def __init__(
        self, input_size, hidden_size, num_layers, computation, communication, activation, precision, num_ranks, device
    ):
        """Constructor.

        Args:
            input_size (int): input data dimension.
            hidden_size (int): hidden layer dimension.
            num_layers (int): number of layers in the model.
            computation (ComputationKernelType): type of computation kernel of this model.
            communication (CommunicationKernelType): type of communication kernel of this model.
            activation (ActivationKernelType): type of activation kernel of this model.
            precision (Precision): data type of this model.
            num_ranks (int): number of ranks this model runs on.
            device (torch.device): device this model runs on.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
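        # Parameters are randomly initialized; the benchmark measures step latency, not model quality.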
        self.weights = torch.rand(
            self.input_size, self.hidden_size, dtype=getattr(torch, precision.value), device=device
        )
        self.bias = torch.rand(self.hidden_size, dtype=getattr(torch, precision.value), device=device)
        self.num_ranks = num_ranks
        self.step_times = []

        self.__init_computation_kernels(computation)
        self.__init_communication_kernels(communication)
        self.__init_activation_kernels(activation)

    def __init_computation_kernels(self, computation):
        """Select computation kernel according to option.

        Args:
            computation (ComputationKernelType): the type of the computation kernel to run.
        """
        self.computation_kernel = None
        if computation == ComputationKernelType.ADDMM:
            self.computation_kernel = lambda x: torch.addmm(self.bias, x, self.weights)
        elif computation == ComputationKernelType.MATMUL:
            self.computation_kernel = lambda x: torch.matmul(x, self.weights)
        elif computation == ComputationKernelType.MUL:
            self.computation_kernel = lambda x: torch.mul(x, x)

    def __init_communication_kernels(self, communication):
        """Select communication kernel according to option.

        Args:
            communication (CommunicationKernelType): the type of the communication kernel to run.
        """
        self.communication_kernel = None
        if communication == CommunicationKernelType.ALLGATHER:
            self.communication_kernel = self.__all_gather_wrapper
        elif communication == CommunicationKernelType.ALLREDUCE:
            self.communication_kernel = self.__all_reduce_wrapper
        elif communication == CommunicationKernelType.ALLTOALL:
            self.communication_kernel = self.__all_to_all_wrapper

    def __init_activation_kernels(self, activation):
        """Select activation kernel according to option.

        Args:
            activation (ActivationKernelType): the type of the activation kernel to run.
        """
        self.activation_kernel = None
        if activation == ActivationKernelType.RELU:
            self.activation_kernel = F.relu
        elif activation == ActivationKernelType.SIGMOID:
            self.activation_kernel = F.sigmoid
        elif activation == ActivationKernelType.TANH:
            self.activation_kernel = F.tanh

    def __all_gather_wrapper(self, x):
        """All-gather wrapper with output initialization.

        Args:
            x (Tensor): input.

        Return:
            Tensor after all-gather.
        """
        output = torch.empty(
            [x.shape[0] * self.num_ranks] + list(x.shape[1:]), dtype=x.dtype, device=x.device
        )
        dist.all_gather_into_tensor(output, x)
        return output

    def __all_reduce_wrapper(self, x):
        """All-reduce wrapper.

        Args:
            x (Tensor): input.

        Return:
            Tensor after all-reduce.
        """
        dist.all_reduce(x)
        return x

    def __all_to_all_wrapper(self, x):
        """All-to-all wrapper with output initialization.

        Args:
            x (Tensor): input.

        Return:
            Tensor after all-to-all.
        """
        output = torch.empty_like(x)
        dist.all_to_all_single(output, x)
        return output

    def forward(self, x):
        """Do forward loops.

        Args:
            x (Tensor): input.

        Return:
            Tensor after the whole inference process.
        """
        activation_out = None
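        # Each iteration runs one layer on the input: computation kernel, collective communication kernel, activation.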
        for i in range(self.num_layers):
            computation_out = self.computation_kernel(x)
            communication_out = self.communication_kernel(computation_out)
            activation_out = self.activation_kernel(communication_out)
        return activation_out


class DistInference(MicroBenchmark):
    """The distributed inference benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)
        self.__world_size = 1
        self.__local_rank = 0
        torch.backends.cudnn.benchmark = True
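        # Enable cuDNN autotuning, which can speed up kernels when input sizes are fixed.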
        self.__device = None
        self.__cuda_available = False

    def __timer(self):
        """Return the current time, ensuring all previous CUDA events have finished.

        If there is no GPU present, this defaults to `time.time()`; otherwise it will
        synchronize CUDA before measuring the time.

        Return:
            Current time in seconds.
        """
        if self.__cuda_available:
            torch.cuda.synchronize()
        return time.time()

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--batch_size',
            type=int,
            default=64,
            required=False,
            help='Batch size.',
        )
        self._parser.add_argument(
            '--input_size',
            type=int,
            default=1024,
            required=False,
            help='Input dimension size.',
        )
        self._parser.add_argument(
            '--hidden_size',
            type=int,
            default=1024,
            required=False,
            help='Hidden size.',
        )
        self._parser.add_argument(
            '--num_layers',
            type=int,
            default=1,
            required=False,
            help='Number of compute-communicate-activate layers.',
        )
        self._parser.add_argument(
            '--computation_kernel',
            type=ComputationKernelType,
            default=ComputationKernelType.MATMUL,
            required=False,
            help='Computation kernel type. E.g. {}.'.format(' '.join(ComputationKernelType.get_values())),
        )
        self._parser.add_argument(
            '--communication_kernel',
            type=CommunicationKernelType,
            default=CommunicationKernelType.ALLREDUCE,
            required=False,
            help='Communication kernel type. E.g. {}.'.format(' '.join(CommunicationKernelType.get_values())),
        )
        self._parser.add_argument(
            '--activation_kernel',
            type=ActivationKernelType,
            default=ActivationKernelType.RELU,
            required=False,
            help='Activation kernel type. E.g. {}.'.format(' '.join(ActivationKernelType.get_values())),
        )
        self._parser.add_argument(
            '--precision',
            type=Precision,
            default=Precision.FLOAT32,
            required=False,
            help='Model precision. E.g. {}.'.format(' '.join(Precision.get_values())),
        )
        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=50,
            required=False,
            help='Number of warmup steps.',
        )
        self._parser.add_argument(
            '--num_steps',
            type=int,
            default=10000,
            required=False,
            help='Number of test steps.',
        )
        self._parser.add_argument(
            '--distributed_impl',
            type=DistributedImpl,
            default=DistributedImpl.DDP,
            required=False,
            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
        )
        self._parser.add_argument(
            '--distributed_backend',
            type=DistributedBackend,
            default=DistributedBackend.NCCL,
            required=False,
            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        if self._args.distributed_impl != DistributedImpl.DDP:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
            logger.error(
                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
                    self._name, self._args.distributed_impl
                )
            )
            return False

        try:
            torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
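            # WORLD_SIZE and LOCAL_RANK are populated by the torch.distributed launcher.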
            self.__world_size = int(os.environ['WORLD_SIZE'])
            self.__local_rank = int(os.environ['LOCAL_RANK'])
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
            torch.distributed.destroy_process_group()
            logger.error('Initialize distributed env failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
            return False

        if torch.cuda.is_available():
            torch.cuda.set_device(self.__local_rank)
            self.__device = torch.device('cuda:{}'.format(self.__local_rank))
            self.__cuda_available = True
        else:
            self.__device = torch.device('cpu:{}'.format(self.__local_rank))
            self.__cuda_available = False

        return True

    def _prepare_model(
        self, input_size, hidden_size, num_layers, computation, communication, activation, precision, num_ranks
    ):
        """Prepare model.

        Args:
            input_size (int): input data dimension.
            hidden_size (int): hidden layer dimension.
            num_layers (int): number of layers in the model.
            computation (ComputationKernelType): type of computation kernel of this model.
            communication (CommunicationKernelType): type of communication kernel of this model.
            activation (ActivationKernelType): type of activation kernel of this model.
            precision (Precision): data type of this model.
            num_ranks (int): number of ranks this model runs on.

        Return:
            The prepared model.
        """
        model = DistInferenceModel(
            input_size, hidden_size, num_layers, computation, communication, activation, precision, num_ranks,
            self.__device
        )
        model = model.to(dtype=getattr(torch, precision.value))
        if self.__cuda_available:
            model = model.cuda()
        return model

    def _run_model(self, model, batch_size, input_size, precision, device, num_warmup, num_steps):
        """Run model and collect step times.

        Args:
            model (torch.nn.Module): model to run.
            batch_size (int): batch size of input data.
            input_size (int): input data dimension.
            precision (Precision): data type of this model.
            device (torch.device): device this model runs on.
            num_warmup (int): number of warm-up runs.
            num_steps (int): number of test runs.

        Return:
            Model step times collected.
        """
        data = torch.rand(batch_size, input_size, dtype=getattr(torch, precision.value), device=device)

        # warm up
        for i in range(num_warmup):
            model(data)

        # run and collect results
        step_times = [0.] * num_steps
        for i in range(num_steps):
            start = self.__timer()
            model(data)
            end = self.__timer()
            step_times[i] = (end - start) * 1000

        return step_times

    def _process_data(self, step_times):
        """Process data.

        Args:
            step_times (List[float]): Model step times collected.

        Return:
            True if _process_data succeeds.
        """
        if not self._process_numeric_result('step_times', step_times, cal_percentile=True):
            return False
        return True

    def _benchmark(self):
        """Implementation for benchmarking.

        Return:
            True if _benchmark succeeds.
        """
        batch_size = self._args.batch_size
        input_size = self._args.input_size
        hidden_size = self._args.hidden_size
        num_layers = self._args.num_layers
        computation = self._args.computation_kernel
        communication = self._args.communication_kernel
        activation = self._args.activation_kernel
        precision = self._args.precision
        num_warmup = self._args.num_warmup
        num_steps = self._args.num_steps

        if self.__local_rank == 0:
            logger.info(
                'Distributed Inference - using {} GPUs: '
                'batch_size={}, input_size={}, hidden_size={}, num_layers={}, '
                'computation_kernel={}, communication_kernel={}, activation_kernel={}, precision={}, '
                'num_warmup={} num_steps={}'.format(
                    self.__world_size, batch_size, input_size, hidden_size, num_layers, computation, communication,
                    activation, precision, num_warmup, num_steps
                )
            )

        # Prepare model
        model = self._prepare_model(
            input_size, hidden_size, num_layers, computation, communication, activation, precision, self.__world_size
        )

        # Run model
        step_times = self._run_model(model, batch_size, input_size, precision, self.__device, num_warmup, num_steps)

        # Process data and return
        return self._process_data(step_times)

    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('pytorch-dist-inference', DistInference, parameters='')
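
For reference, a minimal sketch of driving the registered benchmark programmatically (parameter names mirror `add_parser_arguments()` above; the flag values shown are illustrative assumptions):

```python
from superbench.benchmarks import BenchmarkRegistry, Framework

# A sketch, assuming the process was started by the torch.distributed launcher
# so that WORLD_SIZE and LOCAL_RANK are set.
context = BenchmarkRegistry.create_benchmark_context(
    'dist-inference',
    parameters='--num_layers 4 --communication_kernel allgather --num_steps 1000',
    framework=Framework.PYTORCH,
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
    print(benchmark.name, benchmark.return_code, benchmark.result)
```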

@@ -26,6 +26,13 @@ superbench:
          node_num: 1
      frameworks:
        - pytorch
    dist_inference_pytorch_mode: &dist_inference_pytorch_mode
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
          env:
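            # Assumption: disabled so the NCCL watchdog does not abort long-running collectives during benchmarking.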
            NCCL_ASYNC_ERROR_HANDLING: '0'
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 64
@@ -159,6 +166,8 @@ superbench:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    dist-inference:
      <<: *dist_inference_pytorch_mode
    ib-traffic:
      enable: false
      modes:
@@ -22,6 +22,13 @@ superbench:
          node_num: 1
      frameworks:
        - pytorch
    dist_inference_pytorch_mode: &dist_inference_pytorch_mode
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
          env:
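            # Assumption: disabled so the NCCL watchdog does not abort long-running collectives during benchmarking.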
            NCCL_ASYNC_ERROR_HANDLING: '0'
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 64
@@ -165,6 +172,8 @@ superbench:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    dist-inference:
      <<: *dist_inference_pytorch_mode
    ib-traffic:
      enable: false
      modes:
@@ -0,0 +1,96 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for distributed inference benchmark."""

import unittest

from tests.helper import decorator
import tests.benchmarks.utils as utils
from superbench.benchmarks \
    import BenchmarkRegistry, Framework, BenchmarkType, ReturnCode, Precision, DistributedImpl, DistributedBackend
from superbench.benchmarks.micro_benchmarks.dist_inference \
    import DistInference, ComputationKernelType, CommunicationKernelType, ActivationKernelType
from superbench.common.utils import network


# TODO - replace unittest.skip('no multiple GPUs') with a skipIfNoMultiGPUs decorator
@unittest.skip('no multiple GPUs')
@decorator.cuda_test
@decorator.pytorch_test
def test_pytorch_dist_inference_normal():
    """Test pytorch-dist-inference benchmark on distributed normal case."""
    context = BenchmarkRegistry.create_benchmark_context('dist-inference', parameters='', framework=Framework.PYTORCH)
    world_size = 2
    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
    results = utils.simulated_ddp_distributed_benchmark(context, world_size)
    assert (results)
    for benchmark in results:
        # Check basic information.
        assert (benchmark)
        assert (isinstance(benchmark, DistInference))
        assert (benchmark.name == 'pytorch-dist-inference')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check predefined parameters of dist-inference benchmark.
        assert (benchmark._args.batch_size == 64)
        assert (benchmark._args.input_size == 1024)
        assert (benchmark._args.hidden_size == 1024)
        assert (benchmark._args.num_layers == 1)
        assert (benchmark._args.computation_kernel == ComputationKernelType.MATMUL)
        assert (benchmark._args.communication_kernel == CommunicationKernelType.ALLREDUCE)
        assert (benchmark._args.activation_kernel == ActivationKernelType.RELU)
        assert (benchmark._args.precision == Precision.FLOAT32)
        assert (benchmark._args.num_warmup == 50)
        assert (benchmark._args.num_steps == 10000)
        assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
        assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)

        # Check results and metrics.
        assert (benchmark.run_count == 1)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        # step_times
        assert (len(benchmark.raw_data) == 1)
        # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
        assert (len(benchmark.result) == 7)


@decorator.cuda_test
@decorator.pytorch_test
def test_pytorch_dist_inference_fake_distributed():
    """Test pytorch-dist-inference benchmark on single gpu."""
    context = BenchmarkRegistry.create_benchmark_context('dist-inference', parameters='', framework=Framework.PYTORCH)
    port = network.get_free_port()
    assert (port)
    utils.setup_simulated_ddp_distributed_env(1, 0, port)
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, DistInference))
    assert (benchmark.name == 'pytorch-dist-inference')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check predefined parameters of dist-inference benchmark.
    assert (benchmark._args.batch_size == 64)
    assert (benchmark._args.input_size == 1024)
    assert (benchmark._args.hidden_size == 1024)
    assert (benchmark._args.num_layers == 1)
    assert (benchmark._args.computation_kernel == ComputationKernelType.MATMUL)
    assert (benchmark._args.communication_kernel == CommunicationKernelType.ALLREDUCE)
    assert (benchmark._args.activation_kernel == ActivationKernelType.RELU)
    assert (benchmark._args.precision == Precision.FLOAT32)
    assert (benchmark._args.num_warmup == 50)
    assert (benchmark._args.num_steps == 10000)
    assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
    assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    # step_times
    assert (len(benchmark.raw_data) == 1)
    # return code + (avg, 50th, 90th, 95th, 99th, 99.9th)
    assert (len(benchmark.result) == 7)

    utils.clean_simulated_ddp_distributed_env()