Analyzer: Add feature - Add result summary in excel, md, and html formats (#320)

**Description**
Add a result summary in excel, md, and html formats; a usage sketch follows the revision list below.

**Major Revision**
- Add a ResultSummary class to support result summaries in excel, md, and html formats.
- Abstract a RuleBase class for the functions shared by DataDiagnosis and ResultSummary.
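
A minimal usage sketch of the new class (not part of the commit itself); the file paths below are placeholders, and the rule file follows the schema of the test_summary_rules.yaml added in the tests:

```python
from superbench.analyzer import ResultSummary

# Paths are placeholders; rule_file uses the same schema as the test_summary_rules.yaml added below.
ResultSummary().run(
    raw_data_file='outputs/results-summary.jsonl',  # raw benchmark results in jsonl format
    rule_file='summary-rules.yaml',                 # summary rules (categories, statistics, metrics, aggregate)
    output_dir='outputs',                           # results_summary.xlsx / .md / .html is written here
    output_format='md',                             # one of 'excel', 'md', 'html'
    round=2,                                        # number of significant decimal digits
)
```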
Yuting Jiang 2022-03-24 15:32:01 +08:00 committed by GitHub
Parent c5aa4f4e38
Commit 84fed1ce18
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
15 changed files: 846 additions and 9 deletions


@@ -6,5 +6,7 @@
from superbench.analyzer.rule_base import RuleBase
from superbench.analyzer.data_diagnosis import DataDiagnosis
from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
from superbench.analyzer.summary_op import SummaryOp, SummaryType
from superbench.analyzer.result_summary import ResultSummary
__all__ = ['DataDiagnosis', 'DiagnosisRuleType', 'RuleOp', 'RuleBase']
__all__ = ['DataDiagnosis', 'DiagnosisRuleType', 'RuleOp', 'RuleBase', 'SummaryOp', 'SummaryType', 'ResultSummary']


@@ -7,6 +7,7 @@ import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from superbench.common.utils import logger
@@ -210,3 +211,42 @@ def round_significant_decimal_places(df, digit, cols):
lambda x: float(format_significant_str % x) if abs(x) < 1 else round(x, digit), na_action='ignore'
)
return df
def aggregate(raw_data_df, pattern=None):
r"""Aggregate data of multiple ranks or multiple devices.
By default, aggregate results of multiple ranks like 'metric:\\d+' for most metrics.
For example, aggregate the results of kernel-launch overhead
from 8 GPU devices into one collection.
If pattern is given, use the pattern to match each metric, replace the matched part with '*'
to generate an aggregated metric name, and then aggregate these metrics' data.
Args:
raw_data_df (DataFrame): raw data
pattern (str): regex pattern with groups used to aggregate metric names. Defaults to None.
Returns:
DataFrame: the dataframe of aggregated data
"""
try:
metric_store = {}
metrics = list(raw_data_df.columns)
for metric in metrics:
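# by default, derive the aggregated metric name by stripping the per-rank suffix, e.g. 'metric:0' -> 'metric'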
short = metric.strip(metric.split(':')[-1]).strip(':')
if pattern:
match = re.search(pattern, metric)
if match:
metric_in_list = list(metric)
for i in range(1, len(match.groups()) + 1):
metric_in_list[match.start(i):match.end(i)] = '*'
short = ''.join(metric_in_list)
if short not in metric_store:
metric_store[short] = []
metric_store[short].extend(raw_data_df[metric].tolist())
df = pd.DataFrame()
for short in metric_store:
df = pd.concat([df, pd.DataFrame(metric_store[short], columns=[short])], axis=1)
return df
except Exception as e:
logger.error('DataAnalyzer: aggregate failed, msg: {}'.format(str(e)))
return None
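# Illustration only (not part of this diff), mirroring the unit test added below:
# with no pattern, per-rank columns 'a:0'/'a:1' collapse into a single 'a' column;
# with a grouped pattern, the matched group in the metric name is replaced by '*'.
example_df = pd.DataFrame([[1, 2], [3, 4]], columns=['a:0', 'a:1'])
print(aggregate(example_df))                      # one column 'a': [1, 3, 2, 4]
example_df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=['ib_1_a', 'ib_2_a', 'ib_1_b', 'ib_2_b'])
print(aggregate(example_df, pattern='ib_(.)_.'))  # columns 'ib_*_a': [1, 5, 2, 6], 'ib_*_b': [3, 7, 4, 8]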


@@ -256,7 +256,7 @@ class DataDiagnosis(RuleBase):
except Exception as e:
logger.error('DataDiagnosis: output json data failed, msg: {}'.format(str(e)))
def gen_md_lines(self, data_not_accept_df, rules, round):
def generate_md_lines(self, data_not_accept_df, rules, round):
"""Convert DataFrame into markdown lines.
Args:
@@ -290,7 +290,7 @@ class DataDiagnosis(RuleBase):
data_not_accept_df = data_analysis.round_significant_decimal_places(
data_not_accept_df, round, [metric]
)
lines = file_handler.gen_md_table(data_not_accept_df, header)
lines = file_handler.generate_md_table(data_not_accept_df, header)
return lines
def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel', round=2):
@@ -319,7 +319,7 @@ class DataDiagnosis(RuleBase):
output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
self.output_diagnosis_in_json(data_not_accept_df, output_path)
elif output_format == 'md' or output_format == 'html':
lines = self.gen_md_lines(data_not_accept_df, self._sb_rules, round)
lines = self.generate_md_lines(data_not_accept_df, self._sb_rules, round)
if output_format == 'md':
output_path = str(Path(output_dir) / 'diagnosis_summary.md')
file_handler.output_lines_in_md(lines, output_path)


@@ -10,6 +10,7 @@ import json
import jsonlines
import pandas as pd
import yaml
from openpyxl.styles import Alignment
import markdown
from superbench.common.utils import logger
@@ -158,7 +159,7 @@ def output_excel_data_not_accept(writer, data_not_accept_df, rules):
logger.warning('FileHandler: excel_data_output - data_not_accept_df is not DataFrame.')
def gen_md_table(data_df, header):
def generate_md_table(data_df, header):
"""Generate table text in markdown format.
| header[0] | header[1] |
@@ -221,3 +222,29 @@ def output_lines_in_html(lines, output_path):
f.writelines(html_str)
except Exception as e:
logger.error('FileHandler: html_data_output - {}'.format(str(e)))
def merge_column_in_excel(ws, row, column):
"""Merge cells in the selected index of column with continuous same contents.
Args:
ws (worksheet): the worksheet of the excel to process
row (int): the max row index to merge
column (int): the index of the column to merge
"""
dict_from = {}
aligncenter = Alignment(horizontal='center', vertical='center')
# record continuous row index (start, end) with the same content
for row_index in range(1, row + 1):
value = str(ws.cell(row_index, column).value)
if value not in dict_from:
dict_from[value] = [row_index, row_index]
else:
dict_from[value][1] = dict_from[value][1] + 1
# merge the cells
for value in dict_from.values():
if value[0] != value[1]:
ws.merge_cells(start_row=value[0], start_column=column, end_row=value[1], end_column=column)
# align center for merged cells
for i in range(1, row + 1):
ws.cell(row=i, column=column).alignment = aligncenter


@@ -0,0 +1,251 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""A module for Result Summary."""
import re
from pathlib import Path
import pandas as pd
from superbench.common.utils import logger
from superbench.analyzer import file_handler
from superbench.analyzer.summary_op import SummaryOp, SummaryType
from superbench.analyzer import RuleBase
from superbench.analyzer import data_analysis
class ResultSummary(RuleBase):
"""Result summary class."""
def _check_rules(self, rule, name):
"""Check whether the formart of the rule is valid.
Args:
rule (dict): the rule
name (str): the rule name
Returns:
dict: the rule for the metric
"""
# check if rule is supported
super()._check_and_format_rules(rule, name)
if 'metrics' not in rule:
logger.log_and_raise(exception=Exception, msg='{} lack of metrics'.format(name))
if 'statistics' not in rule:
logger.log_and_raise(exception=Exception, msg='{} lack of function'.format(name))
# convert single statistic str to list
if not isinstance(rule['statistics'], list):
rule['statistics'] = [rule['statistics']]
# check statistics format, should be SummaryType or p\d\d?
for function in rule['statistics']:
try:
if not (re.fullmatch(r'p\d\d?', function) or isinstance(SummaryType(function), SummaryType)):
logger.log_and_raise(
exception=Exception, msg='{} has invalid statistics name {}'.format(name, function)
)
except Exception:
logger.log_and_raise(
exception=Exception, msg='{} has invalid statistics name {}'.format(name, function)
)
# check aggregate format, should be None or bool or pattern in regex with () group
if 'aggregate' in rule and not isinstance(rule['aggregate'],
bool) and not re.search(r'\(.*\)', rule['aggregate']):
logger.log_and_raise(exception=Exception, msg='{} aggregate must be bool type'.format(name))
return rule
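# Illustration only (not part of this diff): a rule dict that passes _check_rules,
# shaped like the rules exercised in the tests added below:
#   {'categories': 'KernelLaunch',
#    'statistics': ['mean', 'p90', 'min', 'max'],      # SummaryType names or 'p<1-99>'
#    'metrics': ['kernel-launch/event_overhead:\\d+'],
#    'aggregate': True}                                 # True, or a regex containing a () group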
def _parse_rules(self, rules):
"""Parse the rules for result summary.
Args:
rules (dict): rules from rule yaml file
Returns:
bool: return True if successfully get all rules, otherwise False.
"""
try:
if not rules:
logger.error('ResultSummary: get rules failed')
return False
self._sb_rules = {}
self._enable_metrics = set()
benchmark_rules = rules['superbench']['rules']
for rule in benchmark_rules:
benchmark_rules[rule] = self._check_rules(benchmark_rules[rule], rule)
self._sb_rules[rule] = {}
self._sb_rules[rule]['name'] = rule
self._sb_rules[rule]['categories'] = benchmark_rules[rule]['categories']
self._sb_rules[rule]['metrics'] = {}
self._sb_rules[rule]['statistics'] = benchmark_rules[rule]['statistics']
self._sb_rules[rule][
'aggregate'] = benchmark_rules[rule]['aggregate'] if 'aggregate' in benchmark_rules[rule] else False
super()._get_metrics(rule, benchmark_rules)
return True
except Exception as e:
logger.error('ResultSummary: parse rules failed - {}'.format(str(e)))
return False
def _format_summary_of_rule(self, category, summary_df_of_rule):
"""Format summary_df of a rule info list of lines.
Args:
category (str): category in the rule
summary_df_of_rule (DataFrame): summary df of a rule; the columns are metrics, the index is statistics
Returns:
list: list of summary lines like [category, metric, statistic, value]
"""
summary = []
metrics = summary_df_of_rule.columns
for metric in metrics:
for statistic in summary_df_of_rule.index:
summary.append([category, metric, statistic, summary_df_of_rule.loc[statistic, metric]])
return summary
def _merge_summary(self, summary):
"""Merge summary of multiple rules into DataFrame.
Args:
summary (dict): summary dict, the keys are categories, the values are summary lines for the category
Returns:
DataFrame: summary of all rules
"""
summary_df = pd.DataFrame()
for category in summary:
for i in range(len(summary[category])):
summary_df = summary_df.append([summary[category][i]], ignore_index=True)
return summary_df
def _generate_summary(self, round):
r"""Generate summay dict of all rules.
For each rule, aggregate the data by user-defined pattern or ranks (:\\d+), calculate
the list of statistics of aggregated metrics, then format the summary in {category, lines}.
Args:
round (int): the number of decimal digits
Returns:
dict: summary dict, the keys are categories, the values are summary lines for the category
"""
summary = {}
for rule in self._sb_rules:
metrics = list(self._sb_rules[rule]['metrics'].keys())
category = self._sb_rules[rule]['categories']
data_df_of_rule = self._raw_data_df[metrics]
if self._sb_rules[rule]['aggregate']:
# if aggregate is True, aggregate in ranks
if self._sb_rules[rule]['aggregate'] is True:
data_df_of_rule = data_analysis.aggregate(data_df_of_rule)
# if aggregate is not empty and is a pattern in regex, aggregate according to pattern
else:
data_df_of_rule = data_analysis.aggregate(data_df_of_rule, self._sb_rules[rule]['aggregate'])
statistics = self._sb_rules[rule]['statistics']
summary_df_of_rule = pd.DataFrame(columns=sorted(data_df_of_rule.columns))
for statistic_name in statistics:
# get SummaryOp and calculate statistics
# if statistic_name is 'p\d\d?', SummaryOp should be percentile
if str.startswith(statistic_name, 'p'):
rule_op = SummaryOp.get_summary_func(SummaryType('percentile'))
val = int(statistic_name.strip('p'))
summary_df_of_rule.loc[statistic_name] = rule_op(data_df_of_rule, val)
else:
rule_op = SummaryOp.get_summary_func(SummaryType(statistic_name))
summary_df_of_rule.loc[statistic_name] = rule_op(data_df_of_rule)
# format values to n significant decimal digits
if round and isinstance(round, int):
summary_df_of_rule = data_analysis.round_significant_decimal_places(
summary_df_of_rule, round, list(summary_df_of_rule.columns)
)
# format summary_df of a rule to list of lines
summary_lines_of_rule = self._format_summary_of_rule(category, summary_df_of_rule)
summary[category] = summary_lines_of_rule
return summary
def generate_md_lines(self, summary):
"""Generate text in markdown foramt.
Use category to be the 2nd-header, use tables to show the data
Args:
summary (dict): summary dict, the keys are categories, the values are summary lines for the category
Returns:
list: lines in markdown format
"""
lines = []
for category in summary:
lines.append('## {}\n'.format(category))
summary_df = pd.DataFrame(summary[category])
summary_df = summary_df.drop(columns=0, axis=1)
header = ['metric', 'statistics', 'values']
table_lines = file_handler.generate_md_table(summary_df, header)
lines.extend(table_lines)
lines.append('\n')
return lines
def output_summary_in_excel(self, raw_data_df, summary, output_path):
"""Output result summary in excel foramt.
Args:
raw_data_df (DataFrame): the DataFrame of raw data df
summary (DataFrame): the DataFrame of summary
output_path (str): the path of output file
"""
try:
writer = pd.ExcelWriter(output_path, engine='openpyxl')
# check whether writer is valid
if not isinstance(writer, pd.ExcelWriter):
logger.error('ResultSummary: excel_data_output - invalid file path.')
return
# output the raw data in 'Raw Data' sheet
file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data')
# output the result summary in 'Summary' sheet
if isinstance(summary, pd.DataFrame) and not summary.empty:
summary.to_excel(writer, 'Summary', index=False, header=False)
worksheet = writer.sheets['Summary']
row = worksheet.max_row
# merge cells in 'category' column with the same category
file_handler.merge_column_in_excel(worksheet, row, 1)
else:
logger.error('ResultSummary: excel_data_output - summary is empty.')
writer.save()
except Exception as e:
logger.error('ResultSummary: excel_data_output - {}'.format(str(e)))
def run(self, raw_data_file, rule_file, output_dir, output_format, round=2):
"""Run the main process of result summary.
Args:
raw_data_file (str): the path of raw data jsonl file.
rule_file (str): the path of the rule yaml file
output_dir (str): the directory of output file
output_format (str): the format of the output, 'excel' or 'md' or 'html'
round (int): the number of decimal digits
"""
try:
rules = self._preprocess(raw_data_file, rule_file)
# parse rules for result summary
if not self._parse_rules(rules):
return
# generate result summary for each category
summary = self._generate_summary(round)
# output result summary to file
output_path = ''
if output_format == 'excel':
output_path = str(Path(output_dir) / 'results_summary.xlsx')
summary_df = self._merge_summary(summary)
self.output_summary_in_excel(self._raw_data_df, summary_df, output_path)
elif output_format == 'md':
output_path = str(Path(output_dir) / 'results_summary.md')
lines = self.generate_md_lines(summary)
file_handler.output_lines_in_md(lines, output_path)
elif output_format == 'html':
output_path = str(Path(output_dir) / 'results_summary.html')
lines = self.generate_md_lines(summary)
file_handler.output_lines_in_html(lines, output_path)
else:
logger.error('ResultSummary: output failed - unsupported output format')
logger.info('ResultSummary: Output results to {}'.format(output_path))
except Exception as e:
logger.error('ResultSummary: run failed - {}'.format(str(e)))


@@ -0,0 +1,157 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""A module for result summary ops."""
from typing import Dict, Callable
import numbers
from superbench.benchmarks.context import Enum
from superbench.common.utils import logger
class SummaryType(Enum):
"""The Enum class representing different summary ops."""
MEAN = 'mean'
PENCENTILE = 'percentile'
MIN = 'min'
MAX = 'max'
STD = 'std'
COUNT = 'count'
class SummaryOp:
"""SummaryOp class to maintain all summary functions."""
functions: Dict[SummaryType, Callable] = dict()
@classmethod
def add_summary_func(cls, summary_type):
"""Add summary fuction.
Args:
summary_type (SummaryType): The type of summary function.
Return:
decorator (Callable): return the decorator to add the summary function.
"""
def decorator(func):
cls.functions[summary_type] = func
return func
return decorator
@classmethod
def get_summary_func(cls, summary_type):
"""Get summary fuction by summary_type.
Args:
summary_type (SummaryType): The type of summary function.
Return:
func (Callable): summary function, None means invalid summary type.
"""
if summary_type in cls.functions:
return cls.functions[summary_type]
return None
@staticmethod
def _check_raw_data_df(raw_data_df):
"""Check whether raw_data_df is empty or None.
Args:
raw_data_df (DataFrame): raw data df
"""
if raw_data_df is None or raw_data_df.empty:
logger.log_and_raise(exception=Exception, msg='empty data in summary op')
@staticmethod
def mean(raw_data_df):
"""Mean of raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
Returns:
Series: mean of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
return raw_data_df.mean()
@staticmethod
def percentile(raw_data_df, val):
"""Pencentile$(val) of raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
val (numbers.Number): the pencentile value, 1-99
Returns:
Series: mean of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
if not isinstance(val, numbers.Number) or val < 1 or val > 99:
logger.log_and_raise(exception=Exception, msg='val in pencentile should be 1-99')
return raw_data_df.quantile(val / 100)
@staticmethod
def min(raw_data_df):
"""The min of values for each column in raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
Returns:
Series: min of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
return raw_data_df.min()
@staticmethod
def max(raw_data_df):
"""The max of values for each column in raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
Returns:
Series: max of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
return raw_data_df.max()
@staticmethod
def std(raw_data_df):
"""The std of values for each column in raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
Returns:
Series: std of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
return raw_data_df.std(axis=0, skipna=True)
@staticmethod
def count(raw_data_df):
"""The number of values for each column in raw_data_df.
Args:
raw_data_df (DataFrame): raw data df
Returns:
Series: count of raw_data_df
"""
SummaryOp._check_raw_data_df(raw_data_df)
return raw_data_df.count()
SummaryOp.add_summary_func(SummaryType.MEAN)(SummaryOp.mean)
SummaryOp.add_summary_func(SummaryType.PENCENTILE)(SummaryOp.percentile)
SummaryOp.add_summary_func(SummaryType.MIN)(SummaryOp.min)
SummaryOp.add_summary_func(SummaryType.MAX)(SummaryOp.max)
SummaryOp.add_summary_func(SummaryType.STD)(SummaryOp.std)
SummaryOp.add_summary_func(SummaryType.COUNT)(SummaryOp.count)


@@ -73,3 +73,10 @@ class TestDataAnalysis(unittest.TestCase):
pd.testing.assert_frame_equal(df, pd.DataFrame([[0.0046, 500.6789], [1.53, 100.7424]], columns=['a', 'b']))
df = data_analysis.round_significant_decimal_places(df, 2, 'b')
pd.testing.assert_frame_equal(df, pd.DataFrame([[0.0046, 500.68], [1.53, 100.74]], columns=['a', 'b']))
# Test aggregate
df = pd.DataFrame([[1, 2], [3, 4]], columns=['a:0', 'a:1'])
df = data_analysis.aggregate(df)
pd.testing.assert_frame_equal(df, pd.DataFrame({'a': [1, 3, 2, 4]}))
df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=['ib_1_a', 'ib_2_a', 'ib_1_b', 'ib_2_b'])
df = data_analysis.aggregate(df, pattern='ib_(.)_.')
pd.testing.assert_frame_equal(df, pd.DataFrame({'ib_*_a': [1, 5, 2, 6], 'ib_*_b': [3, 7, 4, 8]}))


@@ -196,8 +196,8 @@ class TestDataDiagnosis(unittest.TestCase):
assert ('Category' in line)
assert ('Defective Details' in line)
assert ('Index' in line)
# Test - gen_md_lines
lines = diag1.gen_md_lines(data_not_accept_df, diag1._sb_rules, 2)
# Test - generate_md_lines
lines = diag1.generate_md_lines(data_not_accept_df, diag1._sb_rules, 2)
assert (lines)
expected_md_file = str(self.parent_path / '../data/diagnosis_summary.md')
with open(expected_md_file, 'r') as f:


@@ -48,8 +48,8 @@ class TestFileHandler(unittest.TestCase):
assert (not baseline)
baseline = file_handler.read_baseline(test_baseline_file)
assert (baseline)
# Test - gen_md_table
# Test - generate_md_table
data_df = pd.DataFrame([[1, 2], [3, 4]])
lines = file_handler.gen_md_table(data_df, header=['A', 'B'])
lines = file_handler.generate_md_table(data_df, header=['A', 'B'])
expected_lines = ['| A | B |\n', '| --- | --- |\n', '| 1 | 2 |\n', '| 3 | 4 |\n']
assert (lines == expected_lines)


@@ -0,0 +1,149 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for ResultSummary module."""
import unittest
import yaml
from pathlib import Path
import pandas as pd
from superbench.analyzer import ResultSummary
import superbench.analyzer.file_handler as file_handler
class TestResultSummary(unittest.TestCase):
"""Test for ResultSummary class."""
def setUp(self):
"""Method called to prepare the test fixture."""
self.parent_path = Path(__file__).parent
self.output_excel_file = str(self.parent_path / 'results_summary.xlsx')
self.output_md_file = str(self.parent_path / 'results_summary.md')
self.output_html_file = str(self.parent_path / 'results_summary.html')
self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
self.test_raw_data = str(self.parent_path / 'test_results.jsonl')
self.test_rule_file = str(self.parent_path / 'test_summary_rules.yaml')
def tearDown(self):
"""Method called after the test method has been called and the result recorded."""
for file in [self.output_excel_file, self.test_rule_file_fake, self.output_md_file, self.output_html_file]:
p = Path(file)
if p.is_file():
p.unlink()
def test_result_summary(self):
"""Test result summary class."""
rs1 = ResultSummary()
rs1._raw_data_df = file_handler.read_raw_data(self.test_raw_data)
rs1._benchmark_metrics_dict = rs1._get_metrics_by_benchmarks(list(rs1._raw_data_df))
# Test - _check_rules
# Negative case
false_rules = [
{
'categories': 'KernelLaunch',
'metrics': ['kernel-launch/event_overhead:\\d+']
}, {
'categories': 'KernelLaunch',
'statistics': 'abb',
'metrics': ['kernel-launch/event_overhead:\\d+']
}, {
'categories': 'KernelLaunch',
'statistics': 'mean',
'metrics': ['kernel-launch/event_overhead:\\d+'],
'aggregate': 'abb'
}
]
metric = 'kernel-launch/event_overhead:0'
for rules in false_rules:
self.assertRaises(Exception, rs1._check_rules, rules, metric)
# Positive case
true_rules = [
{
'categories': 'KernelLaunch',
'statistics': 'mean',
'metrics': ['kernel-launch/event_overhead:\\d+'],
'aggregate': True
},
{
'categories': 'KernelLaunch',
'statistics': ['mean', 'p50'],
'metrics': ['kernel-launch/event_overhead:\\d+']
},
{
'categories': 'KernelLaunch',
'statistics': 'mean',
'metrics': ['kernel-launch/event_overhead:\\d+'],
'aggregate': 'kernel-launch/event_overhead(:\\d+)'
},
]
for rules in true_rules:
assert (rs1._check_rules(rules, metric))
# Test - _parse_rules
# Negative case
rs2 = ResultSummary()
fake_rules = file_handler.read_rules(self.test_rule_file_fake)
assert (rs2._parse_rules(fake_rules) is False)
rs2._raw_data_df = file_handler.read_raw_data(self.test_raw_data)
rs2._benchmark_metrics_dict = rs2._get_metrics_by_benchmarks(list(rs2._raw_data_df))
p = Path(self.test_rule_file)
with p.open() as f:
rules = yaml.load(f, Loader=yaml.SafeLoader)
rules['superbench']['rules']['fake'] = false_rules[0]
with open(self.test_rule_file_fake, 'w') as f:
yaml.dump(rules, f)
assert (rs1._parse_rules(fake_rules) is False)
# Positive case
rules = file_handler.read_rules(self.test_rule_file)
assert (rs1._parse_rules(rules))
# Test - _generate_summary
summary = rs1._generate_summary(round=2)
assert (len(summary) == 3)
# Test - _merge_summary
expected_summary_merge = [
['KernelLaunch', 'kernel-launch/event_overhead', 'mean', 0.0097],
['KernelLaunch', 'kernel-launch/event_overhead', 'p90', 0.006],
['KernelLaunch', 'kernel-launch/event_overhead', 'min', 0.0055],
['KernelLaunch', 'kernel-launch/event_overhead', 'max', 0.1],
['KernelLaunch', 'kernel-launch/wall_overhead', 'mean', 0.01],
['KernelLaunch', 'kernel-launch/wall_overhead', 'p90', 0.011],
['KernelLaunch', 'kernel-launch/wall_overhead', 'min', 0.01],
['KernelLaunch', 'kernel-launch/wall_overhead', 'max', 0.011],
['NCCL', 'nccl-bw/allreduce_8388608_busbw:0', 'mean', 89.51],
['RDMA', 'ib-loopback/IB_write_8388608_Avg_*:0', 'mean', 23925.84]
]
expected_summary_merge_df = pd.DataFrame(expected_summary_merge)
summary_merge_df = rs1._merge_summary(summary)
pd.testing.assert_frame_equal(expected_summary_merge_df, summary_merge_df)
def test_result_summary_run(self):
"""Test for the run process of result summary."""
# Test - output in excel
ResultSummary().run(self.test_raw_data, self.test_rule_file, str(self.parent_path), 'excel', round=2)
excel_file = pd.ExcelFile(self.output_excel_file, engine='openpyxl')
data_sheet_name = 'Summary'
summary = excel_file.parse(data_sheet_name, header=None)
expect_result_file = pd.ExcelFile(str(self.parent_path / '../data/results_summary.xlsx'), engine='openpyxl')
expect_result = expect_result_file.parse(data_sheet_name, header=None)
pd.testing.assert_frame_equal(summary, expect_result)
# Test - output in md
ResultSummary().run(self.test_raw_data, self.test_rule_file, str(self.parent_path), 'md', round=2)
expected_md_file = str(self.parent_path / '../data/results_summary.md')
with open(expected_md_file, 'r') as f:
expect_result = f.read()
with open(self.output_md_file, 'r') as f:
summary = f.read()
assert (summary == expect_result)
# Test - output in html
ResultSummary().run(self.test_raw_data, self.test_rule_file, str(self.parent_path), 'html', round=2)
expected_html_file = str(self.parent_path / '../data/results_summary.html')
with open(expected_html_file, 'r') as f:
expect_result = f.read()
with open(self.output_html_file, 'r') as f:
summary = f.read()
assert (summary == expect_result)


@@ -0,0 +1,26 @@
# SuperBench rules
version: v0.4
superbench:
rules:
kernel_launch:
statistics:
- mean
- p90
- min
- max
aggregate: True
categories: KernelLaunch
metrics:
- kernel-launch/event_overhead
- kernel-launch/wall_overhead
nccl:
statistics: mean
categories: NCCL
metrics:
- nccl-bw/allreduce_8388608_busbw
ib-loopback:
statistics: mean
categories: RDMA
metrics:
- ib-loopback/IB_write_8388608_Avg_\d+
aggregate: ib-loopback/IB_write_.*_Avg_(\d+)


@@ -0,0 +1,70 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for SummaryOp module."""
import unittest
from numpy import NaN, float64
import pandas as pd
from superbench.analyzer import SummaryOp, SummaryType
class TestSummaryOp(unittest.TestCase):
"""Test for Summary Ops."""
def test_rule_op(self):
"""Test for defined rule operators."""
# Test - get_summary_func
# Negative case
assert (not SummaryOp.get_summary_func('fake'))
# Positive case
summary_op = SummaryOp.get_summary_func(SummaryType.MEAN)
assert (summary_op == SummaryOp.mean)
summary_op = SummaryOp.get_summary_func(SummaryType.PENCENTILE)
assert (summary_op == SummaryOp.percentile)
summary_op = SummaryOp.get_summary_func(SummaryType.MIN)
assert (summary_op == SummaryOp.min)
summary_op = SummaryOp.get_summary_func(SummaryType.MAX)
assert (summary_op == SummaryOp.max)
summary_op = SummaryOp.get_summary_func(SummaryType.STD)
assert (summary_op == SummaryOp.std)
summary_op = SummaryOp.get_summary_func(SummaryType.COUNT)
assert (summary_op == SummaryOp.count)
# Test - _check_raw_data_df
# Negative case
empty_data_df = pd.DataFrame()
self.assertRaises(Exception, SummaryOp._check_raw_data_df, empty_data_df)
self.assertRaises(Exception, SummaryOp._check_raw_data_df, None)
data1 = [[1, 2, 3, 4], [4, 5, 6], [7, 8]]
raw_data_df = pd.DataFrame(data1, columns=['a', 'b', 'c', 'd'])
# Test - mean
result = SummaryOp.mean(raw_data_df)
expectedResult = pd.Series([4.0, 5.0, 4.5, 4.0], index=['a', 'b', 'c', 'd'])
pd.testing.assert_series_equal(result, expectedResult)
# Test - min
result = SummaryOp.min(raw_data_df)
expectedResult = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'], dtype=float64)
pd.testing.assert_series_equal(result, expectedResult)
# Test - max
result = SummaryOp.max(raw_data_df)
expectedResult = pd.Series([7, 8, 6, 4], index=['a', 'b', 'c', 'd'], dtype=float64)
pd.testing.assert_series_equal(result, expectedResult)
# Test - std
result = SummaryOp.std(raw_data_df)
print(result)
expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64)
pd.testing.assert_series_equal(result, expectedResult)
# Test - count
result = SummaryOp.count(raw_data_df)
print(result)
expectedResult = pd.Series([3, 3, 2, 1], index=['a', 'b', 'c', 'd'])
pd.testing.assert_series_equal(result, expectedResult)
# Test - percentile
result = SummaryOp.percentile(raw_data_df, 50)
print(result)
expectedResult = pd.Series([4.0, 5.0, 4.5, 4.0], index=['a', 'b', 'c', 'd'], dtype=float64)
pd.testing.assert_series_equal(result, expectedResult, check_names=False)
self.assertRaises(Exception, SummaryOp.percentile, 200)


@@ -0,0 +1,86 @@
<h2>KernelLaunch</h2>
<table>
<thead>
<tr>
<th>metric</th>
<th>statistics</th>
<th>values</th>
</tr>
</thead>
<tbody>
<tr>
<td>kernel-launch/event_overhead</td>
<td>mean</td>
<td>0.0097</td>
</tr>
<tr>
<td>kernel-launch/event_overhead</td>
<td>p90</td>
<td>0.006</td>
</tr>
<tr>
<td>kernel-launch/event_overhead</td>
<td>min</td>
<td>0.0055</td>
</tr>
<tr>
<td>kernel-launch/event_overhead</td>
<td>max</td>
<td>0.1</td>
</tr>
<tr>
<td>kernel-launch/wall_overhead</td>
<td>mean</td>
<td>0.01</td>
</tr>
<tr>
<td>kernel-launch/wall_overhead</td>
<td>p90</td>
<td>0.011</td>
</tr>
<tr>
<td>kernel-launch/wall_overhead</td>
<td>min</td>
<td>0.01</td>
</tr>
<tr>
<td>kernel-launch/wall_overhead</td>
<td>max</td>
<td>0.011</td>
</tr>
</tbody>
</table>
<h2>NCCL</h2>
<table>
<thead>
<tr>
<th>metric</th>
<th>statistics</th>
<th>values</th>
</tr>
</thead>
<tbody>
<tr>
<td>nccl-bw/allreduce_8388608_busbw:0</td>
<td>mean</td>
<td>89.51</td>
</tr>
</tbody>
</table>
<h2>RDMA</h2>
<table>
<thead>
<tr>
<th>metric</th>
<th>statistics</th>
<th>values</th>
</tr>
</thead>
<tbody>
<tr>
<td>ib-loopback/IB_write_8388608_Avg_*:0</td>
<td>mean</td>
<td>23925.84</td>
</tr>
</tbody>
</table>


@@ -0,0 +1,22 @@
## KernelLaunch
| metric | statistics | values |
| --- | --- | --- |
| kernel-launch/event_overhead | mean | 0.0097 |
| kernel-launch/event_overhead | p90 | 0.006 |
| kernel-launch/event_overhead | min | 0.0055 |
| kernel-launch/event_overhead | max | 0.1 |
| kernel-launch/wall_overhead | mean | 0.01 |
| kernel-launch/wall_overhead | p90 | 0.011 |
| kernel-launch/wall_overhead | min | 0.01 |
| kernel-launch/wall_overhead | max | 0.011 |
## NCCL
| metric | statistics | values |
| --- | --- | --- |
| nccl-bw/allreduce_8388608_busbw:0 | mean | 89.51 |
## RDMA
| metric | statistics | values |
| --- | --- | --- |
| ib-loopback/IB_write_8388608_Avg_*:0 | mean | 23925.84 |

Binary data
tests/data/results_summary.xlsx Normal file

Binary file not shown.