Analyzer: Add Feature - Output results of all nodes in data diagnosis (#336)

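**Description** Add an `output_all` option to data diagnosis so that results for all nodes, not only the defective ones, can be output as JSON; the existing JSON Lines writer is renamed to `output_diagnosis_in_jsonl`.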
2022-04-10 18:57:15 +08:00 · 2022-04-10 18:57:15 +08:00 · 55b0f9d239
--- a/superbench/analyzer/data_diagnosis.py
+++ b/superbench/analyzer/data_diagnosis.py
@ -7,6 +7,7 @@ from pathlib import Path
 import json

 import pandas as pd
+import numpy as np

 from superbench.common.utils import logger
 from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
@ -209,6 +210,48 @@ class DataDiagnosis(RuleBase):
            logger.error('DataDiagnosis: run diagnosis rules failed, message: {}'.format(str(e)))
        return data_not_accept_df, label_df

+    def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
+        """Output diagnosis results of all nodes.
+
+        Args:
+            raw_data_df (DataFrame): raw data
+            data_not_accept_df (DataFrame): defective nodes' detailed information
+
+        Returns:
+            DataFrame: all nodes' detailed information including ['Accept','#Issues','Category','Issue_Details']
+        """
+        append_columns = ['Accept', '#Issues', 'Category', 'Issue_Details']
+        all_data_df = (raw_data_df[self._enable_metrics]).astype('float64')
+
+        if data_not_accept_df.shape[0] == 0:
+            all_data_df['Accept'] = [True for i in range(len(all_data_df))]
+            all_data_df['#Issues'] = [0 for i in range(len(all_data_df))]
+            all_data_df['Category'] = [None for i in range(len(all_data_df))]
+            all_data_df['Issue_Details'] = [None for i in range(len(all_data_df))]
+
+        elif data_not_accept_df.shape[0] > 0:
+            data_not_accept_df['Accept'] = [False for i in range(len(data_not_accept_df))]
+            data_not_accept_df['#Issues'] = data_not_accept_df['Defective Details'].map(lambda x: len(x.split(',')))
+            data_not_accept_df = data_not_accept_df.rename(columns={'Defective Details': 'Issue_Details'})
+            for index in range(len(append_columns)):
+                if append_columns[index] not in data_not_accept_df:
+                    logger.warning(
+                        'DataDiagnosis: output_all_nodes_results - column {} not found in data_not_accept_df.'.format(
+                            append_columns[index]
+                        )
+                    )
+                    all_data_df[append_columns[index]] = None
+                else:
+                    all_data_df = all_data_df.merge(
+                        data_not_accept_df[[append_columns[index]]], left_index=True, right_index=True, how='left'
+                    )
+            all_data_df['Accept'] = all_data_df['Accept'].replace(np.nan, True)
+            all_data_df['#Issues'] = all_data_df['#Issues'].replace(np.nan, 0)
+
+        all_data_df = all_data_df.replace(np.nan, '')
+
+        return all_data_df
+
    def output_diagnosis_in_excel(self, raw_data_df, data_not_accept_df, output_path, rules):
        """Output the raw_data_df and data_not_accept_df results into excel file.

@ -230,7 +273,7 @@ class DataDiagnosis(RuleBase):
        except Exception as e:
            logger.error('DataDiagnosis: excel_data_output - {}'.format(str(e)))

-    def output_diagnosis_in_json(self, data_not_accept_df, output_path):
+    def output_diagnosis_in_jsonl(self, data_not_accept_df, output_path):
        """Output data_not_accept_df into jsonl file.

        Args:
@ -256,6 +299,20 @@ class DataDiagnosis(RuleBase):
        except Exception as e:
            logger.error('DataDiagnosis: output json data failed, msg: {}'.format(str(e)))

+    def output_diagnosis_in_json(self, data_not_accept_df, output_path):
+        """Output data_not_accept_df into json file.
+
+        Args:
+            data_not_accept_df (DataFrame): the DataFrame to output
+            output_path (str): the path of output json file
+        """
+        data_not_accept_df['Index'] = data_not_accept_df.index
+        data_not_accept_json = data_not_accept_df.to_json(orient='records')
+        data_not_accept = json.loads(data_not_accept_json)
+        p = Path(output_path)
+        with p.open('w') as f:
+            json.dump(data_not_accept, f, indent=4)
+
    def generate_md_lines(self, data_not_accept_df, rules, round):
        """Convert DataFrame into markdown lines.

@ -293,7 +350,9 @@ class DataDiagnosis(RuleBase):
        lines = file_handler.generate_md_table(data_not_accept_df, header)
        return lines

-    def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel', round=2):
+    def run(
+        self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel', output_all=False, round=2
+    ):
        """Run the data diagnosis and output the results.

        Args:
@ -301,6 +360,7 @@ class DataDiagnosis(RuleBase):
            rule_file (str): The path of baseline yaml file
            baseline_file (str): The path of baseline json file
            output_dir (str): the directory of output file
+            output_all (bool): output diagnosis results for all nodes
            output_format (str): the format of the output, 'excel' or 'json'
            round (int): the number of decimal digits
        """
@ -312,12 +372,21 @@ class DataDiagnosis(RuleBase):
            data_not_accept_df, label_df = self.run_diagnosis_rules(rules, baseline)
            logger.info('DataDiagnosis: Processed finished')
            output_path = ''
+            # generate all nodes' info
+            if output_all:
+                output_path = str(Path(output_dir) / 'diagnosis_summary.json')
+                data_not_accept_df = self.output_all_nodes_results(self._raw_data_df, data_not_accept_df)
+            # output according to the requested format
            if output_format == 'excel':
                output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
                self.output_diagnosis_in_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
            elif output_format == 'json':
-                output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
-                self.output_diagnosis_in_json(data_not_accept_df, output_path)
+                if output_all:
+                    output_path = str(Path(output_dir) / 'diagnosis_summary.json')
+                    self.output_diagnosis_in_json(data_not_accept_df, output_path)
+                else:
+                    output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
+                    self.output_diagnosis_in_jsonl(data_not_accept_df, output_path)
            elif output_format == 'md' or output_format == 'html':
                lines = self.generate_md_lines(data_not_accept_df, self._sb_rules, round)
                if output_format == 'md':
--- a/tests/analyzer/test_data_diagnosis.py
+++ b/tests/analyzer/test_data_diagnosis.py
@ -24,12 +24,13 @@ class TestDataDiagnosis(unittest.TestCase):
        self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')
        self.output_md_file = str(self.parent_path / 'diagnosis_summary.md')
        self.output_html_file = str(self.parent_path / 'diagnosis_summary.html')
+        self.output_all_json_file = str(self.parent_path / 'diagnosis_summary.json')

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        for file in [
            self.output_excel_file, self.output_json_file, self.test_rule_file_fake, self.output_md_file,
-            self.output_html_file
+            self.output_html_file, self.output_all_json_file
        ]:
            p = Path(file)
            if p.is_file():
@ -185,8 +186,8 @@ class TestDataDiagnosis(unittest.TestCase):
        assert (len(data_not_accept_read_from_excel) == 2)
        assert ('Category' in data_not_accept_read_from_excel)
        assert ('Defective Details' in data_not_accept_read_from_excel)
-        # Test - output in json
-        diag1.output_diagnosis_in_json(data_not_accept_df, self.output_json_file)
+        # Test - output in jsonl
+        diag1.output_diagnosis_in_jsonl(data_not_accept_df, self.output_json_file)
        assert (Path(self.output_json_file).is_file())
        with Path(self.output_json_file).open() as f:
            data_not_accept_read_from_json = f.readlines()
@ -203,6 +204,30 @@ class TestDataDiagnosis(unittest.TestCase):
        with open(expected_md_file, 'r') as f:
            expect_result = f.readlines()
        assert (lines == expect_result)
+        # Test - output_all_nodes_results
+        # case 1: 1 accept, 2 not accept
+        data_df = diag1.output_all_nodes_results(diag1._raw_data_df, data_not_accept_df)
+        assert (len(data_df) == 3)
+        assert (not data_df.loc['sb-validation-01']['Accept'])
+        assert (data_df.loc['sb-validation-02']['Accept'])
+        assert (not data_df.loc['sb-validation-03']['Accept'])
+        assert ('Category' in data_df)
+        assert ('Issue_Details' in data_df)
+        # case 2: 3 accept, 0 not accept
+        data_df_all_accept = diag1.output_all_nodes_results(diag1._raw_data_df, pd.DataFrame())
+        assert (len(data_df_all_accept) == 3)
+        assert (data_df_all_accept.loc['sb-validation-01']['Accept'])
+        assert (data_df_all_accept.loc['sb-validation-02']['Accept'])
+        assert (data_df_all_accept.loc['sb-validation-03']['Accept'])
+        # Test - output in json
+        diag1.output_diagnosis_in_json(data_df, self.output_all_json_file)
+        assert (Path(self.output_all_json_file).is_file())
+        expected_result_file = str(self.parent_path / '../data/diagnosis_summary.json')
+        with Path(self.output_all_json_file).open() as f:
+            data_not_accept_read_from_json = f.read()
+        with Path(expected_result_file).open() as f:
+            expect_result = f.read()
+        assert (data_not_accept_read_from_json == expect_result)

    def test_data_diagnosis_run(self):
        """Test for the run process of rule-based data diagnosis."""
@ -228,7 +253,7 @@ class TestDataDiagnosis(unittest.TestCase):
            expect_result = f.read()
        assert (data_not_accept_read_from_json == expect_result)
        # Test - output in md
-        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'md', 2)
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'md', round=2)
        assert (Path(self.output_md_file).is_file())
        expected_md_file = str(self.parent_path / '../data/diagnosis_summary.md')
        with open(expected_md_file, 'r') as f:
@ -237,7 +262,7 @@ class TestDataDiagnosis(unittest.TestCase):
            summary = f.read()
        assert (summary == expect_result)
        # Test - output in html
-        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'html', 2)
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'html', round=2)
        assert (Path(self.output_html_file).is_file())
        expected_html_file = str(self.parent_path / '../data/diagnosis_summary.html')
        with open(expected_html_file, 'r') as f:
@ -245,6 +270,17 @@ class TestDataDiagnosis(unittest.TestCase):
        with open(self.output_html_file, 'r') as f:
            summary = f.read()
        assert (summary == expect_result)
+        # Test - output all nodes results
+        DataDiagnosis().run(
+            test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json', output_all=True
+        )
+        assert (Path(self.output_all_json_file).is_file())
+        expected_result_file = str(self.parent_path / '../data/diagnosis_summary.json')
+        with Path(self.output_all_json_file).open() as f:
+            data_not_accept_read_from_json = f.read()
+        with Path(expected_result_file).open() as f:
+            expect_result = f.read()
+        assert (data_not_accept_read_from_json == expect_result)

    def test_mutli_rules(self):
        """Test multi rules check feature."""
--- a/tests/data/diagnosis_summary.json
+++ b/tests/data/diagnosis_summary.json
@ -0,0 +1,125 @@
+[
+    {
+        "kernel-launch/event_overhead:0": 0.1,
+        "kernel-launch/event_overhead:1": 0.00595,
+        "kernel-launch/event_overhead:2": 0.00557,
+        "kernel-launch/event_overhead:3": 0.0055,
+        "kernel-launch/event_overhead:4": 0.00592,
+        "kernel-launch/event_overhead:5": 0.00589,
+        "kernel-launch/event_overhead:6": 0.00572,
+        "kernel-launch/event_overhead:7": 0.0059,
+        "kernel-launch/return_code": 0.0,
+        "kernel-launch/wall_overhead:0": 0.01026,
+        "kernel-launch/wall_overhead:1": 0.01026,
+        "kernel-launch/wall_overhead:2": 0.01046,
+        "kernel-launch/wall_overhead:3": 0.01049,
+        "kernel-launch/wall_overhead:4": 0.01063,
+        "kernel-launch/wall_overhead:5": 0.01006,
+        "kernel-launch/wall_overhead:6": 0.01045,
+        "kernel-launch/wall_overhead:7": 0.01071,
+        "mem-bw/D2H_Mem_BW:0": 24.3,
+        "mem-bw/D2H_Mem_BW:1": 24.6,
+        "mem-bw/D2H_Mem_BW:2": 24.5,
+        "mem-bw/D2H_Mem_BW:3": 24.6,
+        "mem-bw/D2H_Mem_BW:4": 24.3,
+        "mem-bw/D2H_Mem_BW:5": 24.3,
+        "mem-bw/D2H_Mem_BW:6": 23.9,
+        "mem-bw/D2H_Mem_BW:7": 24.6,
+        "mem-bw/H2D_Mem_BW:0": 25.6,
+        "mem-bw/H2D_Mem_BW:1": 25.8,
+        "mem-bw/H2D_Mem_BW:2": 26.0,
+        "mem-bw/H2D_Mem_BW:3": 26.1,
+        "mem-bw/H2D_Mem_BW:4": 26.2,
+        "mem-bw/H2D_Mem_BW:5": 25.8,
+        "mem-bw/H2D_Mem_BW:6": 25.3,
+        "mem-bw/H2D_Mem_BW:7": 26.1,
+        "mem-bw/return_code": 0.0,
+        "Accept": false,
+        "#Issues": 1.0,
+        "Category": "KernelLaunch",
+        "Issue_Details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)",
+        "Index": "sb-validation-01"
+    },
+    {
+        "kernel-launch/event_overhead:0": 0.00595,
+        "kernel-launch/event_overhead:1": 0.00595,
+        "kernel-launch/event_overhead:2": 0.00557,
+        "kernel-launch/event_overhead:3": 0.0055,
+        "kernel-launch/event_overhead:4": 0.00592,
+        "kernel-launch/event_overhead:5": 0.00589,
+        "kernel-launch/event_overhead:6": 0.00572,
+        "kernel-launch/event_overhead:7": 0.0059,
+        "kernel-launch/return_code": 0.0,
+        "kernel-launch/wall_overhead:0": 0.01026,
+        "kernel-launch/wall_overhead:1": 0.01026,
+        "kernel-launch/wall_overhead:2": 0.01046,
+        "kernel-launch/wall_overhead:3": 0.01049,
+        "kernel-launch/wall_overhead:4": 0.01063,
+        "kernel-launch/wall_overhead:5": 0.01006,
+        "kernel-launch/wall_overhead:6": 0.01045,
+        "kernel-launch/wall_overhead:7": 0.01071,
+        "mem-bw/D2H_Mem_BW:0": 24.3,
+        "mem-bw/D2H_Mem_BW:1": 24.6,
+        "mem-bw/D2H_Mem_BW:2": 24.5,
+        "mem-bw/D2H_Mem_BW:3": 24.6,
+        "mem-bw/D2H_Mem_BW:4": 24.3,
+        "mem-bw/D2H_Mem_BW:5": 24.3,
+        "mem-bw/D2H_Mem_BW:6": 23.9,
+        "mem-bw/D2H_Mem_BW:7": 24.6,
+        "mem-bw/H2D_Mem_BW:0": 25.6,
+        "mem-bw/H2D_Mem_BW:1": 25.8,
+        "mem-bw/H2D_Mem_BW:2": 26.0,
+        "mem-bw/H2D_Mem_BW:3": 26.1,
+        "mem-bw/H2D_Mem_BW:4": 26.2,
+        "mem-bw/H2D_Mem_BW:5": 25.8,
+        "mem-bw/H2D_Mem_BW:6": 25.3,
+        "mem-bw/H2D_Mem_BW:7": 26.1,
+        "mem-bw/return_code": 0.0,
+        "Accept": true,
+        "#Issues": 0.0,
+        "Category": "",
+        "Issue_Details": "",
+        "Index": "sb-validation-02"
+    },
+    {
+        "kernel-launch/event_overhead:0": 0.00596,
+        "kernel-launch/event_overhead:1": 0.00595,
+        "kernel-launch/event_overhead:2": 0.00557,
+        "kernel-launch/event_overhead:3": 0.0055,
+        "kernel-launch/event_overhead:4": 0.00592,
+        "kernel-launch/event_overhead:5": 0.00589,
+        "kernel-launch/event_overhead:6": 0.00572,
+        "kernel-launch/event_overhead:7": 0.0059,
+        "kernel-launch/return_code": 0.0,
+        "kernel-launch/wall_overhead:0": 0.01026,
+        "kernel-launch/wall_overhead:1": 0.01026,
+        "kernel-launch/wall_overhead:2": 0.01046,
+        "kernel-launch/wall_overhead:3": 0.01049,
+        "kernel-launch/wall_overhead:4": 0.01063,
+        "kernel-launch/wall_overhead:5": 0.01006,
+        "kernel-launch/wall_overhead:6": 0.01045,
+        "kernel-launch/wall_overhead:7": 0.01071,
+        "mem-bw/D2H_Mem_BW:0": "",
+        "mem-bw/D2H_Mem_BW:1": "",
+        "mem-bw/D2H_Mem_BW:2": "",
+        "mem-bw/D2H_Mem_BW:3": "",
+        "mem-bw/D2H_Mem_BW:4": "",
+        "mem-bw/D2H_Mem_BW:5": "",
+        "mem-bw/D2H_Mem_BW:6": "",
+        "mem-bw/D2H_Mem_BW:7": "",
+        "mem-bw/H2D_Mem_BW:0": "",
+        "mem-bw/H2D_Mem_BW:1": "",
+        "mem-bw/H2D_Mem_BW:2": "",
+        "mem-bw/H2D_Mem_BW:3": "",
+        "mem-bw/H2D_Mem_BW:4": "",
+        "mem-bw/H2D_Mem_BW:5": "",
+        "mem-bw/H2D_Mem_BW:6": "",
+        "mem-bw/H2D_Mem_BW:7": "",
+        "mem-bw/return_code": 1.0,
+        "Accept": false,
+        "#Issues": 17.0,
+        "Category": "FailedTest,Mem",
+        "Issue_Details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)",
+        "Index": "sb-validation-03"
+    }
+]