[Better Engineering] Bump ruff to 0.0.278 and fix new lint errors (#16789)
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #16789

Bump ruff to 0.0.278 and fix new lint errors. I added `noqa` comments to suppress all existing RUF012 errors (RUF012 requires mutable class variables to be annotated with `ClassVar`), as well as all new PERF issues.

Signed-off-by: Justin Chu <justinchu@microsoft.com>
Parent: d3295f4329
Commit: d79515041c
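For readers unfamiliar with the new rule: RUF012 wants mutable class-level attributes annotated with `typing.ClassVar` so they are not mistaken for instance attributes. A minimal sketch of the two options this PR chooses between (the class and attribute names are illustrative, not from the codebase):

```python
from typing import ClassVar


class Registry:
    # Lint-clean form: annotate the shared mutable attribute with ClassVar.
    handlers: ClassVar[dict] = {}

    # What this PR does for pre-existing code: keep the attribute as-is
    # and suppress the new rule instead of changing annotations.
    legacy_cache = {}  # noqa: RUF012
```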
@@ -683,7 +683,7 @@ def get_onnx_example(op_name):
 try:
 mod = importlib.import_module(m)
 module = m
-except ImportError:
+except ImportError: # noqa: PERF203
 continue
 if module is None:
 # Unable to find an example for 'op_name'.
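The PERF203 suppressions in this and later hunks all follow one pattern: ruff flags `try`/`except` inside a loop body because the exception machinery adds per-iteration overhead, but these call sites need per-item recovery, so the commit silences the rule rather than restructuring the loop. A self-contained sketch of the flagged shape (the module list is illustrative):

```python
import importlib

candidates = ["json", "not_a_real_module"]  # illustrative module names

found = []
for name in candidates:
    try:  # try/except inside a loop body is what PERF203 flags
        found.append(importlib.import_module(name))
    except ImportError:  # noqa: PERF203 -- per-item recovery is intended here
        continue
```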
@@ -86,7 +86,7 @@ for x in [
 try:
 r = sess.run([output_name], {input_name: x})
 print(f"Shape={x.shape} and predicted labels={r}")
-except (RuntimeError, InvalidArgument) as e:
+except (RuntimeError, InvalidArgument) as e: # noqa: PERF203
 print(f"ERROR with Shape={x.shape} - {e}")

 for x in [
@@ -99,7 +99,7 @@ for x in [
 try:
 r = sess.run(None, {input_name: x})
 print(f"Shape={x.shape} and predicted probabilities={r[1]}")
-except (RuntimeError, InvalidArgument) as e:
+except (RuntimeError, InvalidArgument) as e: # noqa: PERF203
 print(f"ERROR with Shape={x.shape} - {e}")

 #########################
@@ -114,5 +114,5 @@ for x in [
 try:
 r = sess.run([output_name], {input_name: x})
 print(f"Shape={x.shape} and predicted labels={r}")
-except (RuntimeError, InvalidArgument) as e:
+except (RuntimeError, InvalidArgument) as e: # noqa: PERF203
 print(f"ERROR with Shape={x.shape} - {e}")
@@ -66,7 +66,7 @@ class OnnxRuntimeBackend(Backend):
 " Got Domain '{}' version '{}'.".format(domain, opset.version)
 )
 return False, error_message
-except AttributeError:
+except AttributeError: # noqa: PERF203
 # for some CI pipelines accessing helper.OP_SET_ID_VERSION_MAP
 # is generating attribute error. TODO investigate the pipelines to
 # fix this error. Falling back to a simple version check when this error is encountered
@@ -188,11 +188,10 @@ class Session:
 self._enable_fallback = True

 def _validate_input(self, feed_input_names):
-# import pdb; pdb.set_trace()
 missing_input_names = []
 for input in self._inputs_meta:
 if input.name not in feed_input_names and not input.type.startswith("optional"):
-missing_input_names.append(input.name)
+missing_input_names.append(input.name) # noqa: PERF401
 if missing_input_names:
 raise ValueError(
 f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})."
@@ -219,7 +218,7 @@ class Session:
 return self._sess.run(output_names, input_feed, run_options)
 except C.EPFail as err:
 if self._enable_fallback:
-print(f"EP Error: {str(err)} using {self._providers}")
+print(f"EP Error: {err!s} using {self._providers}")
 print(f"Falling back to {self._fallback_providers} and retrying.")
 self.set_providers(self._fallback_providers)
 # Fallback only once.
@@ -260,7 +259,7 @@ class Session:
 return invoke(self._sess, output_names, input_dict_ort_values, run_options)
 except C.EPFail as err:
 if self._enable_fallback:
-print(f"EP Error: {str(err)} using {self._providers}")
+print(f"EP Error: {err!s} using {self._providers}")
 print(f"Falling back to {self._fallback_providers} and retrying.")
 self.set_providers(self._fallback_providers)
 # Fallback only once.
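PERF401 and PERF402, suppressed throughout the rest of this diff, flag append-in-a-loop patterns that a list comprehension or `list.extend` could replace. A hedged sketch of both rewrites ruff suggests, using made-up data:

```python
from types import SimpleNamespace

nodes = [SimpleNamespace(op_type="Constant", name="c0"),
         SimpleNamespace(op_type="Add", name="a0")]

# PERF401: conditional append that a list comprehension can replace.
names = []
for node in nodes:
    if node.op_type == "Constant":
        names.append(node.name)  # noqa: PERF401
names = [node.name for node in nodes if node.op_type == "Constant"]  # lint-clean

# PERF402: unconditional copy loop that list() or list.extend can replace.
copies = []
for node in nodes:
    copies.append(node)  # noqa: PERF402
copies = list(nodes)  # lint-clean
```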
@@ -96,7 +96,7 @@ class ParseIOInfoAction(argparse.Action):

 try:
 comp_strs = io_str.split(";")
-except ValueError:
+except ValueError: # noqa: PERF203
 parser.error(f"{opt_str}: {io_meta_name} info must be separated by ';'")

 if len(comp_strs) != 3:
@@ -78,7 +78,7 @@ def _test_batched_gemm(
 for i in range(batch):
 try:
 np.testing.assert_allclose(my_cs[i], ref_cs[i], rtol=bounds[i])
-except Exception as err:
+except Exception as err: # noqa: PERF203
 header = "*" * 30 + impl + "*" * 30
 print(header, bounds[i])
 print(err)
@@ -182,7 +182,7 @@ def _test_gemm_softmax_gemm_permute(
 is_zero_tol, atol, rtol = 1e-3, 2e-2, 1e-2
 not_close_to_zeros = np.abs(ref) > is_zero_tol
 np.testing.assert_allclose(out[not_close_to_zeros], ref[not_close_to_zeros], atol=atol, rtol=rtol)
-except Exception as err:
+except Exception as err: # noqa: PERF203
 header = "*" * 30 + impl + "*" * 30
 print(header)
 print(err)
@@ -58,7 +58,7 @@ def _test_gemm(func, dtype: str, transa: bool, transb: bool, m: int, n: int, k:

 try:
 np.testing.assert_allclose(my_c, ref_c, rtol=bound)
-except Exception as err:
+except Exception as err: # noqa: PERF203
 header = "*" * 30 + impl + "*" * 30
 print(header)
 print(err)
@@ -82,7 +82,7 @@ def _test_strided_batched_gemm(
 for i in range(batch):
 try:
 np.testing.assert_allclose(my_c[i], ref_c[i], rtol=bounds[i])
-except Exception as err:
+except Exception as err: # noqa: PERF203
 header = "*" * 30 + impl + "*" * 30
 print(header, bounds[i])
 print(err)
@@ -96,7 +96,7 @@ def unregister():
 for name in _registered_ops:
 try:
 torch.onnx.unregister_custom_op_symbolic(name, _OPSET_VERSION)
-except AttributeError:
+except AttributeError: # noqa: PERF203
 # The symbolic_registry module was removed in PyTorch 1.13.
 # We are importing it here for backwards compatibility
 # because unregister_custom_op_symbolic is not available before PyTorch 1.12
@@ -142,7 +142,7 @@ def gen_to_channel_first_perm(rank):
 perm.append(0)
 perm.append(rank - 1)
 for i in range(1, rank - 1):
-perm.append(i)
+perm.append(i) # noqa: PERF402

 return perm

@@ -152,7 +152,7 @@ def gen_to_channel_last_perm(rank):
 perm = []
 perm.append(0)
 for i in range(2, rank):
-perm.append(i)
+perm.append(i) # noqa: PERF402
 perm.append(1)

 return perm
@@ -370,7 +370,7 @@ class HistogramCalibrater(CalibraterBase):
 self.tensors_to_calibrate, value_infos = self.select_tensors_to_calibrate(self.model)
 for tensor in self.tensors_to_calibrate:
 if tensor not in self.model_original_outputs:
-self.model.graph.output.append(value_infos[tensor])
+self.model.graph.output.append(value_infos[tensor]) # noqa: PERF401

 onnx.save(
 self.model,
@@ -176,7 +176,7 @@ class ONNXModel:
 for output in node.output:
 if output in input_name_to_nodes:
 for node in input_name_to_nodes[output]:
-children.append(node)
+children.append(node) # noqa: PERF402
 return children

 def get_parents(self, node, output_name_to_node=None):
@@ -186,7 +186,7 @@ class ONNXModel:
 parents = []
 for input in node.input:
 if input in output_name_to_node:
-parents.append(output_name_to_node[input])
+parents.append(output_name_to_node[input]) # noqa: PERF401
 return parents

 def get_parent(self, node, idx, output_name_to_node=None):
@@ -222,7 +222,7 @@ class ONNXModel:
 for node in graph.node:
 for node_input in node.input:
 if node_input == initializer.name:
-nodes.append(node)
+nodes.append(node) # noqa: PERF401
 return nodes

 @staticmethod
@@ -379,7 +379,7 @@ class ONNXModel:
 and not self.is_graph_output(node.output[0])
 and node.output[0] not in input_name_to_nodes
 ):
-unused_nodes.append(node)
+unused_nodes.append(node) # noqa: PERF401

 self.remove_nodes(unused_nodes)

@@ -145,7 +145,7 @@ def collect_activations(

 intermediate_outputs = []
 for input_d in input_reader:
-intermediate_outputs.append(inference_session.run(None, input_d))
+intermediate_outputs.append(inference_session.run(None, input_d)) # noqa: PERF401
 if not intermediate_outputs:
 raise RuntimeError("No data is collected while running augmented model!")

@@ -9,6 +9,7 @@ import logging
 import tempfile
 import traceback
 from pathlib import Path
+from typing import Optional

 import onnx

@@ -32,7 +33,7 @@ def quant_pre_process(
 verbose: int = 0,
 save_as_external_data: bool = False,
 all_tensors_to_one_file: bool = False,
-external_data_location: str = None,
+external_data_location: Optional[str] = None,
 external_data_size_threshold: int = 1024,
 ) -> None:
 """Shape inference and model optimization, in preparation for quantization.
@@ -747,7 +747,7 @@ class SymbolicShapeInference:
 else:
 lhs_reduce_dim = -1
 rhs_reduce_dim = -2
-new_shape = [*self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]), lhs_shape[-2]] + [rhs_shape[-1]]
+new_shape = [*self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]), lhs_shape[-2], rhs_shape[-1]]
 # merge reduce dim
 self._check_merged_dims(
 [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]],
@@ -1008,13 +1008,13 @@ class SymbolicShapeInference:
 right_ellipsis_index = right_equation.find(b"...")
 if right_ellipsis_index != -1:
 for i in range(num_ellipsis_indices):
-new_sympy_shape.append(shape[i])
+new_sympy_shape.append(shape[i]) # noqa: PERF401
 for c in right_equation:
 if c != 46: # c != b'.'
-new_sympy_shape.append(letter_to_dim[c])
+new_sympy_shape.append(letter_to_dim[c]) # noqa: PERF401
 else:
 for i in range(num_ellipsis_indices):
-new_sympy_shape.append(shape[i])
+new_sympy_shape.append(shape[i]) # noqa: PERF401
 for c in left_equation:
 if c != 44 and c != 46: # c != b',' and c != b'.':
 if c in num_letter_occurrences:
@@ -138,7 +138,7 @@ def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_input
 logger.info(loaded_input)
 shape = []
 for j in all_inputs_shape[i]:
-shape.append(str(j))
+shape.append(str(j)) # noqa: PERF401
 shape = "x".join(shape)
 shape = name + ":" + shape
 input_shape.append(shape)
@@ -266,7 +266,7 @@ def get_ort_session_inputs_and_outputs(name, session, ort_input):
 for i in range(len(session.get_inputs())):
 sess_inputs[session.get_inputs()[i].name] = ort_input[i]
 for i in range(len(session.get_outputs())):
-sess_outputs.append(session.get_outputs()[i].name)
+sess_outputs.append(session.get_outputs()[i].name) # noqa: PERF401
 return (sess_inputs, sess_outputs)


@@ -406,7 +406,7 @@ def inference_ort(
 runtime = runtime[1:] # remove warmup
 runtimes += runtime

-except Exception as e:
+except Exception as e: # noqa: PERF203
 logger.error(e)
 if track_memory:
 end_memory_tracking(p, success)
@@ -605,7 +605,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch):
 # abs(desired-actual) < rtol * abs(desired) + atol
 try:
 np.testing.assert_allclose(ref_o, o, rtol, atol)
-except Exception as e:
+except Exception as e: # noqa: PERF203
 if percentage_in_allowed_threshold(e, percent_mismatch):
 continue
 logger.error(e)
@@ -1194,7 +1194,7 @@ def read_success_from_file(success_file):
 with open(success_file) as success:
 csv_reader = csv.DictReader(success)
 for row in csv_reader:
-success_results.append(row)
+success_results.append(row) # noqa: PERF402

 success_json = json.loads(json.dumps(success_results, indent=4))
 return success_json
@@ -2051,7 +2051,7 @@ class ParseDictArgAction(argparse.Action):
 for kv in values.split(","):
 try:
 k, v = kv.split("=")
-except ValueError:
+except ValueError: # noqa: PERF203
 parser.error(f"argument {option_string}: Expected '=' between key and value")

 if k in dict_arg:
@@ -146,7 +146,7 @@ def get_memory(memory, model_group):
 memory_columns = [model_title]
 for provider in provider_list:
 if cpu not in provider:
-memory_columns.append(provider + memory_ending)
+memory_columns.append(provider + memory_ending) # noqa: PERF401
 memory_db_columns = [
 model_title,
 cuda,
@@ -273,7 +273,7 @@ def get_latency(latency, model_group):

 latency_columns = [model_title]
 for provider in provider_list:
-latency_columns.append(provider + avg_ending)
+latency_columns.append(provider + avg_ending) # noqa: PERF401
 latency_db_columns = table_headers
 latency = adjust_columns(latency, latency_columns, latency_db_columns, model_group)
 return latency
@@ -75,7 +75,7 @@ def main():

 model_list = []
 for link in links:
-model_list.append(get_model_info(link))
+model_list.append(get_model_info(link)) # noqa: PERF401
 write_json(model_list)


@@ -417,7 +417,7 @@ def run_pytorch(
 result.update(get_latency_result(runtimes, batch_size))
 logger.info(result)
 results.append(result)
-except RuntimeError as e:
+except RuntimeError as e: # noqa: PERF203
 logger.exception(e)
 torch.cuda.empty_cache()

@@ -572,7 +572,7 @@ def run_tensorflow(
 result.update(get_latency_result(runtimes, batch_size))
 logger.info(result)
 results.append(result)
-except RuntimeError as e:
+except RuntimeError as e: # noqa: PERF203
 logger.exception(e)
 from numba import cuda

@@ -249,7 +249,7 @@ def output_summary(results, csv_filename, args):
 data_names.append(f"b{batch_size}")
 else:
 for sequence_length in args.sequence_lengths:
-data_names.append(f"b{batch_size}_s{sequence_length}")
+data_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401

 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
 csv_writer.writeheader()
@@ -386,7 +386,7 @@ def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device): # n
 # for each test run.

 for i in output_buffer_max_sizes:
-output_buffers.append(torch.empty(i, dtype=torch.float32, device=device))
+output_buffers.append(torch.empty(i, dtype=torch.float32, device=device)) # noqa: PERF401


 def set_random_seed(seed=123):
@@ -197,7 +197,7 @@ def tf2pt_pipeline_test():
 input = torch.randint(low=0, high=config.vocab_size - 1, size=(4, 128), dtype=torch.long)
 try:
 model(input)
-except RuntimeError as e:
+except RuntimeError as e: # noqa: PERF203
 logger.exception(e)


@@ -124,7 +124,7 @@ class PackingMode:
 attributes = []
 for attr in attention.attribute:
 if attr.name in ["num_heads", "qkv_hidden_sizes", "scale"]:
-attributes.append(attr)
+attributes.append(attr) # noqa: PERF401

 packed_attention.attribute.extend(attributes)
 packed_attention.domain = "com.microsoft"
@@ -342,7 +342,7 @@ def convert_float_to_float16(
 # For Resize/GroupNorm, attribute data type cannot be changed
 if n.op_type not in ["Resize", "GroupNorm"]:
 for attr in n.attribute:
-next_level.append(attr)
+next_level.append(attr) # noqa: PERF402
 else:
 mixed_float_type_node_list.append(n)

@@ -351,7 +351,7 @@ def convert_float_to_float16(
 if isinstance(q, onnx_proto.AttributeProto):
 next_level.append(q.g)
 for n in q.graphs:
-next_level.append(n)
+next_level.append(n) # noqa: PERF402
 q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
 for n in q.tensors:
 n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val) # noqa: PLW2901
@@ -4,7 +4,7 @@
 # --------------------------------------------------------------------------

 from logging import getLogger
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

 from fusion_base import Fusion
 from fusion_utils import FusionUtils
@@ -428,7 +428,7 @@ class FusionEmbedLayerNoMask(Fusion):
 word_embedding_gather: NodeProto,
 position_embedding_gather: NodeProto,
 segment_embedding_gather: Union[None, NodeProto],
-position_ids: str = None,
+position_ids: Optional[str] = None,
 embedding_sum_output=False,
 ):
 """Create an EmbedLayerNormalization node. Note that segment embedding is optional.
@@ -103,7 +103,7 @@ class FusionGroupNorm(Fusion):

 group_norm_name = self.model.create_node_name("GroupNorm", name_prefix="GroupNorm")

-if weight_elements not in [320, 640, 960, 1280, 1920, 2560] + [128, 256, 512]:
+if weight_elements not in [320, 640, 960, 1280, 1920, 2560, 128, 256, 512]:
 logger.info("GroupNorm channels=%d", weight_elements)

 gamma = helper.make_tensor(
@@ -150,7 +150,7 @@ def output_summary(results: List[Dict[str, Any]], csv_filename: str, metric_name
 key_names = []
 for sequence_length in sequence_lengths:
 for batch_size in batch_sizes:
-key_names.append(f"b{batch_size}_s{sequence_length}")
+key_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401

 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + key_names)
 csv_writer.writeheader()
@@ -364,7 +364,7 @@ def main(args):
 # Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
 copy_outputs = []
 for output in ort_outputs:
-copy_outputs.append(output.cpu().numpy())
+copy_outputs.append(output.cpu().numpy()) # noqa: PERF401

 if gpt2helper.compare_outputs(
 outputs,
@@ -404,7 +404,7 @@ def main(args):
 "onnxruntime_latency": f"{ort_latency:.2f}",
 }
 csv_writer.writerow(row)
-except Exception:
+except Exception: # noqa: PERF203
 logger.error("Exception", exc_info=True)
 return None

@@ -75,7 +75,7 @@ class MyGPT2Model(GPT2Model):
 for i in range(num_layer):
 # Since transformers v4.*, past key and values are separated outputs.
 # Here we concate them into one tensor to be compatible with Attention operator.
-present.append(
+present.append( # noqa: PERF401
 torch.cat(
 (result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)),
 dim=0,
@@ -134,7 +134,7 @@ def load_results_from_csv(csv_path):
 with open(csv_path, newline="") as csvfile:
 reader = csv.DictReader(csvfile)
 for row in reader:
-rows.append(row)
+rows.append(row) # noqa: PERF402
 return rows


@@ -256,7 +256,7 @@ def run_significance_test(rows, output_csv_path):
 utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu(
 a, b, use_continuity=True, alternative="two-sided"
 ) # TODO: shall we use one-sided: less or greater according to "top1_match_rate"
-except ValueError: # ValueError: All numbers are identical in mannwhitneyu
+except ValueError: # ValueError: All numbers are identical in mannwhitneyu # noqa: PERF203
 utest_statistic = None
 utest_pvalue = None
 ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True)
@@ -645,7 +645,7 @@ def run_tests(

 args = parse_arguments(f"{arguments} -t {test_times}".split(" "))
 latency_results = launch_test(args)
-except KeyboardInterrupt as exc:
+except KeyboardInterrupt as exc: # noqa: PERF203
 raise RuntimeError("Keyboard Interrupted") from exc
 except Exception:
 traceback.print_exc()
@@ -687,7 +687,7 @@ def output_summary(results, csv_filename, data_field="average_latency_ms"):
 data_names = []
 for sequence_length in sequence_lengths:
 for batch_size in batch_sizes:
-data_names.append(f"b{batch_size}_s{sequence_length}")
+data_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401

 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
 csv_writer.writeheader()
@@ -645,7 +645,7 @@ class OnnxruntimeCudaStableDiffusionPipeline(StableDiffusionPipeline):
 @torch.no_grad()
 def __call__(
 self,
-prompt: Union[str, List[str]] = None,
+prompt: Optional[Union[str, List[str]]] = None,
 num_inference_steps: int = 50,
 guidance_scale: float = 7.5,
 negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -798,7 +798,7 @@ class OnnxruntimeTensorRTStableDiffusionPipeline(StableDiffusionPipeline):
 @torch.no_grad()
 def __call__(
 self,
-prompt: Union[str, List[str]] = None,
+prompt: Optional[Union[str, List[str]]] = None,
 num_inference_steps: int = 50,
 guidance_scale: float = 7.5,
 negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -9,7 +9,7 @@ import os
 import sys
 import tempfile
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union

 import numpy
 import onnx
@@ -38,7 +38,7 @@ class T5DecoderInit(torch.nn.Module):
 decoder: torch.nn.Module,
 lm_head: torch.nn.Module,
 config: Union[T5Config, MT5Config],
-decoder_start_token_id: int = None,
+decoder_start_token_id: Optional[int] = None,
 ):
 super().__init__()
 self.decoder = decoder
@@ -204,10 +204,10 @@ class T5DecoderInputs:

 past = []
 for _ in range(2 * num_layers):
-past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device))
+past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401

 for _ in range(2 * num_layers):
-past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device))
+past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401
 else:
 past = None

@@ -9,7 +9,7 @@ import os
 import sys
 import tempfile
 from pathlib import Path
-from typing import List, Union
+from typing import List, Optional, Union

 import numpy
 import onnx
@@ -36,7 +36,7 @@ class WhisperDecoderInit(torch.nn.Module):
 self,
 decoder: torch.nn.Module,
 config: WhisperConfig,
-decoder_start_token_id: int = None,
+decoder_start_token_id: Optional[int] = None,
 ):
 super().__init__()
 self.decoder = decoder
@@ -167,10 +167,10 @@ class WhisperDecoderInputs:

 past = []
 for _ in range(2 * num_layers):
-past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device))
+past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401

 for _ in range(2 * num_layers):
-past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device))
+past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401
 else:
 past = None

@@ -79,7 +79,7 @@ class OnnxModel:
 all_nodes = []
 for graph in self.graphs():
 for node in graph.node:
-all_nodes.append(node)
+all_nodes.append(node) # noqa: PERF402
 return all_nodes

 def graph(self):
@@ -108,14 +108,14 @@ class OnnxModel:
 input_names = []
 for graph in self.graphs():
 for input in graph.input:
-input_names.append(input.name)
+input_names.append(input.name) # noqa: PERF401
 return input_names

 def get_graphs_output_names(self):
 output_names = []
 for graph in self.graphs():
 for output in graph.output:
-output_names.append(output.name)
+output_names.append(output.name) # noqa: PERF401
 return output_names

 def get_graph_by_node(self, node):
@@ -217,7 +217,7 @@ class OnnxModel:
 nodes = []
 for node in self.nodes():
 if node.op_type == op_type:
-nodes.append(node)
+nodes.append(node) # noqa: PERF401
 return nodes

 def get_children(self, node, input_name_to_nodes=None):
@@ -228,7 +228,7 @@ class OnnxModel:
 for output in node.output:
 if output in input_name_to_nodes:
 for node in input_name_to_nodes[output]:
-children.append(node)
+children.append(node) # noqa: PERF402
 return children

 def get_parents(self, node, output_name_to_node=None):
@@ -238,7 +238,7 @@ class OnnxModel:
 parents = []
 for input in node.input:
 if input in output_name_to_node:
-parents.append(output_name_to_node[input])
+parents.append(output_name_to_node[input]) # noqa: PERF401
 return parents

 def get_parent(self, node, i, output_name_to_node=None):
@@ -659,8 +659,8 @@ class OnnxModel:
 for vi in model.graph.value_info:
 if vi.name in name_vi:
 del name_vi[vi.name]
-for _, vi in name_vi.items():
-model.graph.value_info.append(vi)
+for vi in name_vi.values():
+model.graph.value_info.append(vi) # noqa: PERF402
 except Exception:
 logger.warning(
 "Failed to run symbolic shape inference. Please file an issue in https://github.com/microsoft/onnxruntime."
@@ -792,7 +792,7 @@ class OnnxModel:
 nodes = self.nodes()
 for node in nodes:
 if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes:
-unused_nodes.append(node)
+unused_nodes.append(node) # noqa: PERF401

 self.remove_nodes(unused_nodes)

@@ -829,10 +829,7 @@ class OnnxModel:
 all_nodes.append(last_node)
 all_nodes.extend(nodes)

-nodes_to_remove = []
-for node in self.model.graph.node:
-if node not in all_nodes:
-nodes_to_remove.append(node)
+nodes_to_remove = [node for node in self.model.graph.node if node not in all_nodes]

 self.remove_nodes(nodes_to_remove)

@@ -840,7 +837,7 @@ class OnnxModel:
 output_to_remove = []
 for output in self.model.graph.output:
 if output.name not in outputs:
-output_to_remove.append(output)
+output_to_remove.append(output) # noqa: PERF401
 for output in output_to_remove:
 self.model.graph.output.remove(output)

@@ -848,9 +845,7 @@ class OnnxModel:
 input_to_remove = []
 if allow_remove_graph_inputs:
 input_name_to_nodes = self.input_name_to_nodes()
-for input in self.model.graph.input:
-if input.name not in input_name_to_nodes:
-input_to_remove.append(input)
+input_to_remove = [input for input in self.model.graph.input if input.name not in input_name_to_nodes]
 for input in input_to_remove:
 self.model.graph.input.remove(input)

@@ -887,7 +882,7 @@ class OnnxModel:
 if allow_remove_graph_inputs:
 for input in graph.input:
 if input.name not in remaining_input_names:
-inputs_to_remove.append(input)
+inputs_to_remove.append(input) # noqa: PERF401
 for input in inputs_to_remove:
 graph.input.remove(input)

@@ -1063,7 +1058,7 @@ class OnnxModel:
 graph_inputs = []
 for input in self.model.graph.input:
 if self.get_initializer(input.name) is None:
-graph_inputs.append(input)
+graph_inputs.append(input) # noqa: PERF401
 return graph_inputs

 def get_opset_version(self):
@@ -1217,7 +1212,7 @@ class OnnxModel:
 sub_graphs.append(attr.g)

 for g in attr.graphs:
-sub_graphs.append(g)
+sub_graphs.append(g) # noqa: PERF402

 if isinstance(attr.t, TensorProto) and attr.t.data_type == TensorProto.FLOAT16:
 return True
@@ -47,7 +47,7 @@ class UnetOnnxModel(BertOnnxModel):
 nodes_to_remove = []
 for div in div_nodes:
 if self.find_constant_input(div, 1.0) == 1:
-nodes_to_remove.append(div)
+nodes_to_remove.append(div) # noqa: PERF401

 for node in nodes_to_remove:
 self.replace_input_of_all_nodes(node.output[0], node.input[0])
@@ -16,7 +16,7 @@ import tempfile
 from collections import deque # noqa: F401
 from datetime import datetime
 from pathlib import Path # noqa: F401
-from typing import List
+from typing import List, Optional

 import numpy as np
 import onnx
@@ -78,7 +78,7 @@ class BertOnnxModelShapeOptimizer(OnnxModel):
 shape_inputs = []
 for node in self.model.graph.node:
 if node.op_type == "Reshape":
-shape_inputs.append(node.input[1])
+shape_inputs.append(node.input[1]) # noqa: PERF401

 return shape_inputs

@@ -287,7 +287,7 @@ class BertOnnxModelShapeOptimizer(OnnxModel):
 input_mask: str,
 enable_shape_opt: bool,
 enable_reshape_opt: bool,
-output_names: List[str] = None,
+output_names: Optional[List[str]] = None,
 batch_size=1,
 sequence_length=128,
 verbose=False,
@@ -37,7 +37,7 @@ class TestInferenceSession(unittest.TestCase):
 <https://onnx.ai/onnx/api/numpy_helper.html#onnx.numpy_helper.float8e5m2_to_float32>`_.
 """

-dtypes = {"FLOAT": np.float32, "FLOAT16": np.float16}
+dtypes = frozenset({"FLOAT": np.float32, "FLOAT16": np.float16})
 x = np.array(
 [0.4068359375, 352, 416, 336, 304, 272, -248, -100, 1e-4, 1e-2, 416, 432, 1e5, np.inf, -np.inf, np.nan],
 dtype=np.float32,
@@ -406,7 +406,7 @@ class TestInferenceSession(unittest.TestCase):
 run_base_test2()
 run_advanced_test()

-except OSError:
+except OSError: # noqa: PERF203
 continue
 else:
 break
@@ -49,7 +49,7 @@ class TestSymbolicShapeInference(unittest.TestCase):

 # https://github.com/onnx/models/issues/562
 if any(model_name in str(filename) for model_name in skipped_models):
-print(f"Skip symbolic shape inference on : {str(filename)}")
+print(f"Skip symbolic shape inference on : {filename!s}")
 continue

 print("Running symbolic shape inference on : " + str(filename))
@@ -20,7 +20,8 @@ class TestTrainingDropout(unittest.TestCase):

 @unittest.skip(
 "Temporarily disable this test. The graph below will trigger ORT to "
-"sort backward graph before forward graph which gives incorrect result."
+"sort backward graph before forward graph which gives incorrect result. "
+"https://github.com/microsoft/onnxruntime/issues/16801"
 )
 def test_training_and_eval_dropout(self):
 class TwoDropoutNet(nn.Module):
@@ -35,7 +35,7 @@ class TestDataReader(CalibrationDataReader):
 self.count = 4
 self.input_data_list = []
 for _ in range(self.count):
-self.input_data_list.append(np.random.normal(0, 0.33, [1, 3, 1, 3]).astype(np.float32))
+self.input_data_list.append(np.random.normal(0, 0.33, [1, 3, 1, 3]).astype(np.float32)) # noqa: PERF401

 def get_next(self):
 if self.preprocess_flag:
@@ -93,7 +93,7 @@ class TestDataReader(CalibrationDataReader):
 self.count = 2
 self.input_data_list = []
 for _ in range(self.count):
-self.input_data_list.append(np.random.normal(0, 0.33, input_shape).astype(np.float32))
+self.input_data_list.append(np.random.normal(0, 0.33, input_shape).astype(np.float32)) # noqa: PERF401

 def get_next(self):
 if self.preprocess_flag:
@@ -144,7 +144,7 @@ class TestSaveActivations(unittest.TestCase):
 data_reader.rewind()
 oracle_outputs = []
 for input_d in data_reader:
-oracle_outputs.append(infer_session.run(None, input_d))
+oracle_outputs.append(infer_session.run(None, input_d)) # noqa: PERF401

 output_dict = {}
 output_info = infer_session.get_outputs()
@@ -342,7 +342,7 @@ def generate_test_data(
 path = os.path.join(output_path, "test_data_set_" + str(test_case))
 try:
 os.mkdir(path)
-except OSError:
+except OSError: # noqa: PERF203
 print("Creation of the directory %s failed" % path)
 else:
 print("Successfully created the directory %s " % path)
@@ -451,7 +451,7 @@ def generate_test_data(
 path = os.path.join(output_path, "test_data_set_" + str(test_case))
 try:
 os.mkdir(path)
-except OSError:
+except OSError: # noqa: PERF203
 print("Creation of the directory %s failed" % path)
 else:
 print("Successfully created the directory %s " % path)
@@ -491,7 +491,7 @@ class T5Attention(nn.Module):
 # attn_output = self.o(attn_output) # ORT places this matmul outside of MHA op

 present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
-outputs = (attn_output,) + (present_key_value_state,)
+outputs = (attn_output, present_key_value_state)

 return outputs

@@ -628,7 +628,7 @@ class T5Attention(nn.Module):
 if past_key_value is not None and self.is_static_kv:
 output = torch.tensor(ort_output)
 else:
-output = (torch.tensor(ort_output[0]),) + ((torch.tensor(ort_output[1]), torch.tensor(ort_output[2])),)
+output = (torch.tensor(ort_output[0]), (torch.tensor(ort_output[1]), torch.tensor(ort_output[2])))

 return output

@@ -53,7 +53,7 @@ class CombineZeroCheckpoint:
 self.weight_shape_map = dict()
 self.sharded_params = set()

-def _split_name(self, name):
+def _split_name(self, name: str):
 name_split = name.split("_view_")
 view_num = None
 if len(name_split) > 1:
@@ -69,7 +69,7 @@ class CombineZeroCheckpoint:
 elif name_split[0].endswith("_fp16"):
 mp_suffix = "_fp16"
 param_name = name_split[0]
-if optimizer_key != "": # noqa: PLC1901
+if optimizer_key:
 param_name = param_name.split(optimizer_key)[1]
 param_name = param_name.split("_fp16")[0]
 return param_name, optimizer_key, view_num, mp_suffix
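The PLC1901 cleanups in this and the later checkpoint hunks replace an empty-string comparison with a truthiness check, which is equivalent for `str` values; where the explicit comparison is kept (see the later `InputDensityObserver` hunks), only the no-longer-needed `noqa` is dropped. A one-line equivalence check:

```python
for s in ("", "adam_"):  # empty and non-empty string cases
    assert (s != "") == bool(s)
```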
@@ -1207,10 +1207,10 @@ class LossScaler:
 self,
 loss_scale_input_name,
 is_dynamic_scale,
-loss_scale=float(1 << 16), # noqa: B008
+loss_scale=float(1 << 16),
 up_scale_window=2000,
 min_loss_scale=1.0,
-max_loss_scale=float(1 << 24), # noqa: B008
+max_loss_scale=float(1 << 24),
 ):
 super().__init__()
 self.loss_scale_input_name_ = loss_scale_input_name
@@ -107,7 +107,7 @@ def dtype_torch_to_numpy(torch_dtype):
 elif torch_dtype == torch.bool:
 return np.bool_
 else:
-raise ValueError(f"torch_dtype ({str(torch_dtype)}) type is not supported by Numpy")
+raise ValueError(f"torch_dtype ({torch_dtype!s}) type is not supported by Numpy")


 def dtype_onnx_to_torch(onnx_type):
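Several hunks, including the one above and the earlier `Session` fallback prints, rewrite `str(...)`/`repr(...)` calls inside f-strings to the `!s`/`!r` conversion flags (ruff's RUF010). The output is identical, since both spellings end up calling `str()`/`repr()`:

```python
err = ValueError("boom")
assert f"EP Error: {str(err)}" == f"EP Error: {err!s}"
assert f"{repr(err)}" == f"{err!r}"
```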
@@ -88,10 +88,10 @@ class DynamicLossScaler(LossScaler):
 def __init__(
 self,
 automatic_update=True,
-loss_scale=float(1 << 16), # noqa: B008
+loss_scale=float(1 << 16),
 up_scale_window=2000,
 min_loss_scale=1.0,
-max_loss_scale=float(1 << 24), # noqa: B008
+max_loss_scale=float(1 << 24),
 ):
 super().__init__(loss_scale)
 self.automatic_update = automatic_update
@@ -145,7 +145,7 @@ def _order_paths(paths, D_groups, H_groups):
 world_rank = _utils.state_dict_trainer_options_world_rank_key()

 for path in paths:
-trainer_options_path_tuples.append(
+trainer_options_path_tuples.append( # noqa: PERF401
 (_checkpoint_storage.load(path, key=_utils.state_dict_trainer_options_key()), path)
 )

@@ -365,7 +365,7 @@ def _get_parallellism_groups(data_parallel_size, horizontal_parallel_size, world
 for data_group_id in range(num_data_groups):
 data_group_ranks = []
 for r in range(data_parallel_size):
-data_group_ranks.append(data_group_id + horizontal_parallel_size * r)
+data_group_ranks.append(data_group_id + horizontal_parallel_size * r) # noqa: PERF401
 data_groups.append(data_group_ranks)

 num_horizontal_groups = world_size // horizontal_parallel_size
@@ -373,7 +373,7 @@ def _get_parallellism_groups(data_parallel_size, horizontal_parallel_size, world
 for hori_group_id in range(num_horizontal_groups):
 hori_group_ranks = []
 for r in range(horizontal_parallel_size):
-hori_group_ranks.append(hori_group_id * horizontal_parallel_size + r)
+hori_group_ranks.append(hori_group_id * horizontal_parallel_size + r) # noqa: PERF401
 horizontal_groups.append(hori_group_ranks)

 return data_groups, horizontal_groups
@@ -665,10 +665,10 @@ class _CombineZeroCheckpoint:
 self.clean_state_dict = clean_state_dict
 self.world_size = int(self.checkpoint_files[0].split("ZeRO")[1].split(".")[2]) + 1
 assert len(self.checkpoint_files) == self.world_size, f"Could not find {self.world_size} files"
-self.weight_shape_map = dict()
+self.weight_shape_map = {}
 self.sharded_params = set()

-def _split_name(self, name):
+def _split_name(self, name: str):
 name_split = name.split("_view_")
 view_num = None
 if len(name_split) > 1:
@@ -684,7 +684,7 @@ class _CombineZeroCheckpoint:
 elif name_split[0].endswith("_fp16"):
 mp_suffix = "_fp16"
 param_name = name_split[0]
-if optimizer_key != "": # noqa: PLC1901
+if optimizer_key:
 param_name = param_name.split(optimizer_key)[1]
 param_name = param_name.split("_fp16")[0]
 return param_name, optimizer_key, view_num, mp_suffix
@@ -187,7 +187,7 @@ class AdamW(onnxblock_module.ForwardBlock):

 # Prepare the tensor sequence inputs for params and moments
 for input_name in [params_name, gradients_name, first_order_moments_name, second_order_moments_name]:
-onnx_model.graph.input.append(
+onnx_model.graph.input.append( # noqa: PERF401
 onnx.helper.make_tensor_sequence_value_info(input_name, trainable_parameters[0].data_type, None)
 )

@@ -48,7 +48,7 @@ class LegacyMegatronLMModifier(FP16OptimizerModifier):
 fp32_params = []
 for param_group in target.optimizer.param_groups:
 for param in param_group["params"]:
-fp32_params.append(param)
+fp32_params.append(param) # noqa: PERF402
 #### THIS IS THE ORIGINAL IMPLEMENTATION ####
 # return self.clip_grad_norm(fp32_params, max_norm, norm_type)
 #### END OF THE ORIGINAL IMPLEMENTATION ####
@@ -69,10 +69,10 @@ class LegacyMegatronLMModifier(FP16OptimizerModifier):
 params = []
 for group in target.fp16_groups:
 for param in group:
-params.append(param)
+params.append(param) # noqa: PERF402
 for group in target.fp32_from_fp32_groups:
 for param in group:
-params.append(param)
+params.append(param) # noqa: PERF402
 #### THIS IS THE ORIGINAL IMPLEMENTATION ####
 # self.overflow = self.loss_scaler.has_overflow(params)
 #### END OF THE ORIGINAL IMPLEMENTATION ####
@@ -55,7 +55,7 @@ class _OptimizerConfig:
 "Each dict inside 'params' must contain a {'params' : [model parameter names]} entry"
 " and additional entries for custom hyper parameter values"
 )
-for k, _ in group.items():
+for k in group:
 if k != "params":
 assert (
 k in defaults or k.replace("_coef", "") in defaults
@@ -48,7 +48,7 @@ def _write(source_code, ext, extra=""):


 class PyCodeCache:
-cache = dict()
+cache = dict() # noqa: RUF012
 clear = staticmethod(cache.clear)

 @classmethod
@@ -67,7 +67,7 @@ class PyCodeCache:


 class ModuleCache:
-cache = dict()
+cache = dict() # noqa: RUF012
 clear = staticmethod(cache.clear)

 @classmethod
@@ -250,7 +250,7 @@ class TritonCodegen(NodeVisitor):
 elif isinstance(ir_node, ReduceForLoopEnd):
 indent -= 4

-_COMPUTE_CODE_TEMPLATES = {
+_COMPUTE_CODE_TEMPLATES = { # noqa: RUF012
 "Add": "{indent}{o0} = {i0} + {i1}\n",
 "Sub": "{indent}{o0} = {i0} - {i1}\n",
 "Mul": "{indent}{o0} = {i0} * {i1}\n",
@@ -113,7 +113,7 @@ def _infer_dropout(node: NodeProto, input_infos: List[TensorInfo], graph: GraphP


 class TypeAndShapeInfer:
-_INFER_FUNC_MAP = {
+_INFER_FUNC_MAP = { # noqa: RUF012
 "Add": _infer_elementwise,
 "Sub": _infer_elementwise,
 "Mul": _infer_elementwise,
@@ -294,8 +294,8 @@ class GraphLowering:
 producers[output] = node
 for input in node.input:
 if input in producers:
-precessors[node.name].append(producers[input])
-for _, value in precessors.items():
+precessors[node.name].append(producers[input]) # noqa: PERF401
+for value in precessors.values():
 value.sort(key=sorted_nodes.index, reverse=True)
 for idx in range(len(sorted_nodes) - 1, -1, -1):
 node = sorted_nodes[idx]
@@ -441,7 +441,9 @@ class GraphLowering:
 assert isinstance(sub_nodes[nxt], ReduceForLoopEnd)
 for reduce_node in sub_nodes[nxt].reduce_nodes:
 if reduce_node.outputs[0].name in output_name_map:
-reduce_store_nodes.append(IONode(reduce_node.outputs[0], kernel_node.offset_calc, False))
+reduce_store_nodes.append( # noqa: PERF401
+IONode(reduce_node.outputs[0], kernel_node.offset_calc, False)
+)
 new_sub_nodes.append(sub_nodes[nxt])
 nxt += 1
 cur = nxt
@@ -86,7 +86,7 @@ class SortedGraph:
 name_map = {}
 for idx, input in enumerate(self._graph.input):
 shape_str = str(self._input_shapes[idx]).replace(" ", "")
-graph_inputs.append(f"({str(input.type.tensor_type.elem_type)},{shape_str})")
+graph_inputs.append(f"({input.type.tensor_type.elem_type!s},{shape_str})")
 name_map[input.name] = f"i{idx}"
 graph_inputs_str = ",".join(graph_inputs)

@@ -110,7 +110,7 @@ class SortedGraph:
 for node_idx, node in enumerate(self._sorted_nodes):
 inputs = []
 for input in node.input:
-inputs.append(name_map.get(input, input))
+inputs.append(name_map.get(input, input)) # noqa: PERF401
 inputs_str = ",".join(inputs)
 outputs = []
 for idx, output in enumerate(node.output):
@@ -127,7 +127,7 @@ class SortedGraph:
 attributes_str = ",".join(attributes)
 nodes.append(f"{node.op_type}[{attributes_str}]({inputs_str})->({outputs_str})")
 nodes_str = ",".join(nodes)
-return f"{graph_inputs_str}|{str(len(self._graph.output))}|{constants_str}|{nodes_str}"
+return f"{graph_inputs_str}|{len(self._graph.output)!s}|{constants_str}|{nodes_str}"

 def __hash__(self):
 return hash(str(self))
@@ -180,7 +180,7 @@ class SortedGraph:
 else:
 input_infos = []
 for input in node.input:
-input_infos.append(self._node_arg_infos[input])
+input_infos.append(self._node_arg_infos[input]) # noqa: PERF401
 output_infos = TypeAndShapeInfer.infer(node, input_infos, self._graph)
 for idx, output in enumerate(node.output):
 self._node_arg_infos[output] = output_infos[idx]
@@ -52,7 +52,7 @@ def topological_sort(inputs: List[str], nodes: List[NodeProto]) -> List[NodeProt
 continue
 for consumer in non_const_nodes:
 if output in consumer.input:
-output_consumers[node.name].append(consumer)
+output_consumers[node.name].append(consumer) # noqa: PERF401

 # Topological sort.
 visited = set()
@@ -357,10 +357,10 @@ def transform_slice_scel(graph):
 all_nodes = []
 for node in graph.node:
 if node not in remove_nodes:
-all_nodes.append(node)
+all_nodes.append(node) # noqa: PERF401

 for node in triton_nodes:
-all_nodes.append(node)
+all_nodes.append(node) # noqa: PERF402

 graph.ClearField("node")
 graph.node.extend(all_nodes)
@@ -56,8 +56,8 @@ def _to_gradient_definition(gradient):


 class CustomGradientRegistry:
-_GRADIENTS = {}
-_STOP_GRADIENT_EDGES = {}
+_GRADIENTS = {} # noqa: RUF012
+_STOP_GRADIENT_EDGES = {} # noqa: RUF012

 @classmethod
 def register(cls, domain, name, attributes, fn):
@@ -88,7 +88,7 @@ def wrap_custom_export_function(original_func: Callable) -> Callable:


 class CustomOpSymbolicRegistry:
-_SYMBOLICS = {}
+_SYMBOLICS = {} # noqa: RUF012

 @classmethod
 def register(cls, name, domain, fn):
@@ -138,7 +138,7 @@ class _OutputIdentityOp(torch.autograd.Function):


 class _PrimitiveType:
-_primitive_types = {int, bool, float}
+_primitive_types = {int, bool, float} # noqa: RUF012

 @staticmethod
 def is_primitive_type(value):
@@ -153,7 +153,7 @@ class _PrimitiveType:
 # If `value` is a boolean, save the value of the boolean in dtype.
 # This way, if the value changes from one forward call to the next, the schema will mismatch,
 # and the model will be re-exported.
-return f"{str(type(value))}_{value}" if isinstance(value, bool) else str(type(value))
+return f"{type(value)!s}_{value}" if isinstance(value, bool) else str(type(value))


 def flatten_kwargs(kwargs, device):
@@ -146,7 +146,7 @@ class InputDensityObserver:
 self._tensor_to_node_map.clear()
 for node in model.graph.node:
 for output_name in node.output:
-if output_name != "": # noqa: PLC1901
+if output_name != "":
 self._tensor_to_node_map[output_name] = node

 self._initialize_embedding_padding_inspector(model, user_input_names)
@@ -440,7 +440,7 @@ class InputDensityObserver:
 self._stats.clear()

 def _try_get_node_from_its_output(self, name):
-if name == "" or name not in self._tensor_to_node_map: # noqa: PLC1901
+if name == "" or name not in self._tensor_to_node_map:
 return None

 return self._tensor_to_node_map[name]
@@ -36,7 +36,7 @@ class _IteratedORTModule(torch.nn.Module):
 self._it = count - 1
 self._ortmodules = []
 for idx in range(count):
-self._ortmodules.append(
+self._ortmodules.append( # noqa: PERF401
 ORTModule(
 module,
 debug_options=DebugOptions(
@@ -113,9 +113,9 @@ class HierarchicalORTModule(torch.nn.Module):
 # We cannot skip module in allowlist because it's possible that a module is called multiple times
 # so that we still need to know the number of different input sets and use _IteratedORTModule to handle it.
 handle_pool.append(module.register_forward_pre_hook(record_args))
-for _, sub_module in module._modules.items():
+for sub_module in module._modules.values():
 if isinstance(sub_module, torch.nn.ModuleList):
-for _, sub_module_item in sub_module._modules.items():
+for sub_module_item in sub_module._modules.values():
 recursive_hook(sub_module_item)
 else:
 recursive_hook(sub_module)
@@ -142,7 +142,7 @@ class HierarchicalORTModule(torch.nn.Module):
 except Exception as e:
 if self._log_level <= LogLevel.WARNING:
 warnings.warn(
-f"Failed to export module with type {type(module).__name__}. Error message: {str(e)}",
+f"Failed to export module with type {type(module).__name__}. Error message: {e!s}",
 UserWarning,
 )
 return False
@@ -176,9 +176,9 @@ class HierarchicalORTModule(torch.nn.Module):
 # No sub-module exists, so this module is a leaf
 return

-for _, sub_module in sub_module_dict.items():
+for sub_module in sub_module_dict.values():
 if isinstance(sub_module, torch.nn.ModuleList):
-for _, sub_module_item in sub_module._modules.items():
+for sub_module_item in sub_module._modules.values():
 check_exportable(sub_module_item)
 else:
 check_exportable(sub_module)
@@ -268,7 +268,7 @@ class HierarchicalORTModule(torch.nn.Module):
 recursive_wrap(self._original_module, self._save_onnx, self._name_prefix)
 if self._log_level <= LogLevel.WARNING:
 warnings.warn(
-f"Wrapped module: {str(self._original_module)}.",
+f"Wrapped module: {self._original_module!s}.",
 UserWarning,
 )
 self._initialized = True
@@ -49,7 +49,7 @@ def _load_propagate_cast_ops(ortmodule_config_accessor, data):

 key_to_function_mapping = {"Strategy": _update_strategy, "Level": _update_level, "Allow": _update_allow}

-for key, _ in data.PropagateCastOps.__dict__.items():
+for key in data.PropagateCastOps.__dict__:
 key_to_function_mapping[key]()


@@ -162,7 +162,7 @@ def _load_debug_options(ortmodule_config_accessor, data):
 "SaveONNXPath": _update_onnx_path,
 }

-for key, _ in data.DebugOptions.__dict__.items():
+for key in data.DebugOptions.__dict__:
 key_to_function_mapping[key]()

 debug_options = DebugOptions(log_level=log_level, save_onnx=save_onnx, onnx_prefix=onnx_prefix)
@@ -301,5 +301,5 @@ def load_from_json(ortmodule, path=None):
 # update the debug config for both train and eval modes
 ortmodule_config_accessor = ortmodule._torch_module._execution_manager(training_mode)
 # iterate over the json data instead of checking for keys in json to catch key errors
-for key, _ in data.__dict__.items():
+for key in data.__dict__:
 load_functions[key](ortmodule_config_accessor, data)
@@ -9,7 +9,7 @@ from onnx.onnx_ml_pb2 import GraphProto


 class GraphTransformerRegistry:
-_TRANSFORMER_FUNCS = {}
+_TRANSFORMER_FUNCS = {} # noqa: RUF012

 @classmethod
 def register(cls, target_modules: str, devices: str, priority: int, fn: Callable[[GraphProto], None]):
@@ -20,7 +20,7 @@ def _list_extensions(path):
 for root, _, files in os.walk(path):
 for name in files:
 if name.lower() == "setup.py":
-extensions.append(os.path.join(root, name))
+extensions.append(os.path.join(root, name)) # noqa: PERF401
 return extensions


@@ -933,7 +933,7 @@ class ORTTrainer:
 # so output will be on the same device as input.
 try:
 torch.device(target_device)
-except Exception:
+except Exception: # noqa: PERF203
 # in this case, input/output must on CPU
 assert input.device.type == "cpu"
 target_device = "cpu"
@@ -482,7 +482,7 @@ class ORTTrainerOptions:
 def __repr__(self):
 return "{%s}" % str(
 ", ".join(
-f"'{k}': {repr(v)}"
+f"'{k}': {v!r}"
 for (k, v) in self.__dict__.items()
 if k not in ["_original_opts", "_validated_opts", "_main_class_name"]
 )
@@ -26,7 +26,7 @@ def find_input_node(model, arg):
 for node in model.graph.node:
 for output in node.output:
 if output == arg:
-result.append(node)
+result.append(node) # noqa: PERF401
 return result[0] if len(result) == 1 else None


@@ -35,7 +35,7 @@ def find_output_node(model, arg):
 for node in model.graph.node:
 for input in node.input:
 if input == arg:
-result.append(node)
+result.append(node) # noqa: PERF401
 return result[0] if len(result) == 1 else result


@@ -189,7 +189,7 @@ def find_nodes(graph, op_type):
 nodes = []
 for node in graph.node:
 if node.op_type == op_type:
-nodes.append(node)
+nodes.append(node) # noqa: PERF401
 return nodes


@@ -382,10 +382,10 @@ def layer_norm_transform(model):
 all_nodes = []
 for node in graph.node:
 if node not in remove_nodes:
-all_nodes.append(node)
+all_nodes.append(node) # noqa: PERF401

 for node in layer_norm_nodes:
-all_nodes.append(node)
+all_nodes.append(node) # noqa: PERF402

 graph.ClearField("node")
 graph.node.extend(all_nodes)
@@ -210,7 +210,7 @@ def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str
 if hasattr(output_arg, "meta") and "val" in output_arg.meta:
 # Select outputs with "val" information. Without "val",
 # it's not possible access output_arg.meta["val"].device.
-output_args.append(output_arg.meta["val"])
+output_args.append(output_arg.meta["val"]) # noqa: PERF401
 return _infer_ep_from_device(*output_args)
 graph_module_str = graph_module.print_readable(print_output=False)
 raise ValueError(f"No output node is found in graph_module: {graph_module_str}")
@@ -64,7 +64,7 @@ class Test_PostPasses(unittest.TestCase): # noqa: N801
 nodes = []
 for node in model.graph.node:
 if node.op_type == node_type:
-nodes.append(node)
+nodes.append(node) # noqa: PERF401
 return nodes

 def get_name(self, name):
@@ -159,7 +159,7 @@ def test_checkpoint_storage_saved_dict_matches_loaded(checkpoint_storage_test_pa
 )
 def test_checkpoint_storage_saving_non_supported_types_fails(checkpoint_storage_test_parameterized_setup):
 to_save = checkpoint_storage_test_parameterized_setup
-with pytest.raises(Exception):
+with pytest.raises(Exception): # noqa: B017
 _checkpoint_storage.save(to_save, pytest.checkpoint_path)


@@ -233,7 +233,7 @@ def test_checkpoint_storage_saving_and_loading_empty_dictionaries_succeeds(check


 def test_checkpoint_storage_load_file_that_does_not_exist_fails(checkpoint_storage_test_setup):
-with pytest.raises(Exception):
+with pytest.raises(Exception): # noqa: B017
 _checkpoint_storage.load(pytest.checkpoint_path)


@@ -20,7 +20,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):

 values = []
 for _ in range(total_dims):
-values.append(rng.randint(0, vocab_size - 1))
+values.append(rng.randint(0, vocab_size - 1)) # noqa: PERF401

 return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()

@@ -36,7 +36,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):

 values = []
 for _ in range(total_dims):
-values.append(rng.random() * scale)
+values.append(rng.random() * scale) # noqa: PERF401

 return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()

@@ -213,7 +213,7 @@ def test_hierarchical_ortmodule():
    call_backward(y_ref)
    g_ref = []
    for param in m.parameters():
        g_ref.append(param.grad.detach())
        g_ref.append(param.grad.detach())  # noqa: PERF401

    m.zero_grad()
@@ -224,7 +224,7 @@ def test_hierarchical_ortmodule():
    call_backward(y)
    g = []
    for param in m.parameters():
        g.append(param.grad.detach())
        g.append(param.grad.detach())  # noqa: PERF401

    # Some sub-modules become ORTModule.
    assert expected_num_ortmodule == count_ortmodule(m)
@@ -173,10 +173,10 @@ def layer_norm_transform(model_proto):
    all_nodes = []
    for node in graph_proto.node:
        if node not in removed_nodes:
            all_nodes.append(node)
            all_nodes.append(node)  # noqa: PERF401

    for node in layer_norm_nodes:
        all_nodes.append(node)
        all_nodes.append(node)  # noqa: PERF402

    graph_proto.ClearField("node")
    graph_proto.node.extend(all_nodes)
@@ -13,7 +13,7 @@ def find_single_output_node(model, arg):
    for node in model.graph.node:
        for input in node.input:
            if input == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -63,7 +63,7 @@ def fix_transpose(model):
    for n in model.graph.node:
        for input in n.input:
            if input == weight.name:
                result.append(n)
                result.append(n)  # noqa: PERF401
    if len(result) > 1:
        continue
    perm = node.attribute[0]
@@ -93,7 +93,7 @@ def fix_transpose(model):
    old_ws = []
    for t in transpose:
        if find_single_output_node(model, t[1].name) is None:
            old_ws.append(find_weight_index(model, t[1].name))
            old_ws.append(find_weight_index(model, t[1].name))  # noqa: PERF401
    old_ws.sort(reverse=True)
    for w_i in old_ws:
        del model.graph.initializer[w_i]
@@ -78,7 +78,7 @@ class TestOnnxOpsOrtModule(unittest.TestCase):
        self.assertIn('op_type: "%s"' % name, str(onnx_graph_inf))
        for onnx_model in [onnx_graph_inf, onnx_graph_train]:
            for oimp in onnx_model.opset_import:
                if oimp.domain == "":  # noqa: PLC1901
                if oimp.domain == "":
                    self.assertEqual(oimp.version, 15)
        if op_grad_type is not None:
            if isinstance(op_grad_type, tuple):
@@ -1497,10 +1497,10 @@ def test_gradient_correctness_einsum(equation):
    rhs_op = equation[pos1 + 1 : pos2]
    lhs_shape = []
    for c in lhs_op:
        lhs_shape.append(SIZE_MAP[c.upper()])
        lhs_shape.append(SIZE_MAP[c.upper()])  # noqa: PERF401
    rhs_shape = []
    for c in rhs_op:
        rhs_shape.append(SIZE_MAP[c.upper()])
        rhs_shape.append(SIZE_MAP[c.upper()])  # noqa: PERF401

    pt_model = NeuralNetEinsum(lhs_shape[-1]).to(device)
    ort_model = ORTModule(copy.deepcopy(pt_model))
@@ -1577,7 +1577,7 @@ def test_gradient_correctness_einsum_2():
    random.shuffle(output_candidates)
    output_candidates = output_candidates[:8]
    for output_candidate in [list(candidate) for candidate in output_candidates]:
        all_cases.append((lhs_candidate, rhs_candidate, output_candidate))
        all_cases.append((lhs_candidate, rhs_candidate, output_candidate))  # noqa: PERF401

    for case in all_cases:
        equation = to_string(case[0]) + "," + to_string(case[1]) + "->" + to_string(case[2])
@@ -1587,10 +1587,10 @@ def test_gradient_correctness_einsum_2():
    rhs_op = equation[pos1 + 1 : pos2]
    lhs_shape = []
    for c in lhs_op:
        lhs_shape.append(SIZE_MAP[c.upper()])
        lhs_shape.append(SIZE_MAP[c.upper()])  # noqa: PERF401
    rhs_shape = []
    for c in rhs_op:
        rhs_shape.append(SIZE_MAP[c.upper()])
        rhs_shape.append(SIZE_MAP[c.upper()])  # noqa: PERF401

    pt_model = NeuralNetEinsum(lhs_shape[-1]).to(device)
    ort_model = ORTModule(copy.deepcopy(pt_model))
@@ -5895,7 +5895,7 @@ def test_ops_for_padding_elimination(test_cases):
    result = []
    for node in model.graph.node:
        if arg in node.output:
            result.append(node)
            result.append(node)  # noqa: PERF401
    return result[0].op_type if len(result) == 1 else None

gathergrad_input_optypes = [find_input_node_type(training_model, arg) for arg in gathergrad_node.input]
@@ -136,7 +136,7 @@ def _torch_layer_norm(input, weight, bias, **kwargs):


class TorchFuncExecutor:
    _INFER_FUNC_MAP = {
    _INFER_FUNC_MAP = {  # noqa: RUF012
        "Add": _torch_add,
        "Sub": _torch_sub,
        "Mul": _torch_mul,
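RUF012 asks for mutable class attributes to be annotated with typing.ClassVar so they read as shared class-level state rather than a default for an instance field; here the dict is suppressed instead. A minimal sketch of the annotated form, using real torch functions in place of the module's private helpers:

    from typing import Callable, ClassVar, Dict

    import torch

    class TorchFuncExecutor:
        # ClassVar marks this as a class-level lookup table, which is
        # what RUF012 wants for mutable class attributes.
        _INFER_FUNC_MAP: ClassVar[Dict[str, Callable]] = {
            "Add": torch.add,
            "Sub": torch.sub,
            "Mul": torch.mul,
        }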
@@ -112,7 +112,7 @@ def optimizer_parameters(model):
    no_decay_param_group = []
    for initializer in model.graph.initializer:
        if any(key in initializer.name for key in no_decay_keys):
            no_decay_param_group.append(initializer.name)
            no_decay_param_group.append(initializer.name)  # noqa: PERF401
    params = [
        {
            "params": no_decay_param_group,
@@ -134,7 +134,7 @@ def load_bert_onnx_model():


class CustomLossScaler(amp.LossScaler):
    def __init__(self, loss_scale=float(1 << 16)):  # noqa: B008
    def __init__(self, loss_scale=float(1 << 16)):
        super().__init__(loss_scale)
        self._initial_loss_scale = loss_scale
        self.loss_scale = loss_scale
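Note the direction of this hunk: the B008 suppression is removed, not added. B008 warns about function calls in argument defaults because they run once at definition time; float(1 << 16) returns an immutable constant, so the warning was arguably a false positive, and after the upgrade the noqa is apparently no longer needed. Where B008 does matter, the standard escape is a None sentinel, sketched here:

    # Sketch: the sentinel pattern for defaults that must be computed
    # (or stay unshared) per call, which is what B008 guards against.
    def append_log(entry, log=None):
        if log is None:
            log = []  # a fresh list per call, not one shared list
        log.append(entry)
        return log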
@@ -151,7 +151,7 @@ class CustomLossScaler(amp.LossScaler):


class LegacyCustomLossScaler:
    def __init__(self, loss_scale=float(1 << 16)):  # noqa: B008
    def __init__(self, loss_scale=float(1 << 16)):
        self._initial_loss_scale = loss_scale
        self.loss_scale_ = loss_scale
@@ -28,7 +28,7 @@ pytorch_110 = StrictVersion(".".join(torch.__version__.split(".")[:2])) >= Stric

def get_model_opset(model_onnx):
    for op in model_onnx.opset_import:
        if op.domain == "":  # noqa: PLC1901
        if op.domain == "":
            return op.version
    return None
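The PLC1901 suppressions come off for the same reason: ruff 0.0.278 seemingly no longer raises PLC1901 (compare-to-empty-string) under this configuration, so the comments had become unused noqa directives. Both spellings are shown in this sketch; the explicit comparison kept by the codebase is arguably clearer when the value cannot be None:

    domain = ""

    if not domain:  # what PLC1901 used to suggest: rely on falsiness
        print("default ONNX domain")

    if domain == "":  # the explicit comparison this codebase keeps
        print("default ONNX domain")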
@@ -390,10 +390,10 @@ def testOptimizerConfig(optim_name, lr, alpha, default_alpha):

    # 1:1 mapping between defaults and params's hyper parameters
    for param in params:
        for k, _ in param.items():
        for k in param:
            if k != "params":
                assert k in cfg.defaults, "hyper parameter {k} not present in one of the parameter params"
    for k, _ in cfg.defaults.items():
    for k in cfg.defaults:
        for param in cfg.params:
            assert k in param, "hyper parameter {k} not present in one of the parameter params"
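This hunk is a real rewrite rather than a suppression: iterating a dict yields its keys, so `for k, _ in d.items()` unpacks a value that is never read. A sketch of the equivalence:

    defaults = {"lr": 1e-3, "alpha": 0.9}

    # Before: materializes (key, value) pairs and discards the value.
    for k, _ in defaults.items():
        print(k)

    # After: iterates keys directly; same output, less work.
    for k in defaults:
        print(k)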
@@ -1039,7 +1039,7 @@ def testORTTrainerInternalUseContribOps(enable_onnx_contrib_ops):
    # Training loop
    data, targets = batcher_fn(train_data, 0)
    if not enable_onnx_contrib_ops and not pytorch_110:
        with pytest.raises(Exception):
        with pytest.raises(Exception):  # noqa: B017
            _, _ = trainer.train_step(data, targets)
    else:
        _, _ = trainer.train_step(data, targets)
@@ -193,8 +193,6 @@ def main():

    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args, trainer, device, train_loader, epoch)
        import pdb  # noqa: F401

        test_with_trainer(args, trainer, device, test_loader)
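The stray `import pdb` is deleted outright; with T10 (flake8-debugger) newly enabled, leftover debugger imports and set_trace() calls fail lint instead of reaching CI. A sketch of what the rule catches:

    def train_loop(batches):
        for batch in batches:
            # import pdb; pdb.set_trace()  # T10 flags this if uncommented
            print("processing", batch)

    train_loop([1, 2, 3])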
@@ -15,7 +15,7 @@ def get_gpu_lines(path):
    reader = csv.reader(f, delimiter=",")
    for row in reader:
        if row[2].find("TotalDurationNs") < 0:
            lines.append(row)
            lines.append(row)  # noqa: PERF401
    return lines
@@ -19,7 +19,7 @@ class Comparisons:
    def float_le(tolerance=None):
        actual_tolerance = 0.0 if tolerance is None else tolerance
        return Comparison(
            name="less than or equal to" + (f" (tolerance: {str(actual_tolerance)})" if tolerance is not None else ""),
            name="less than or equal to" + (f" (tolerance: {actual_tolerance!s})" if tolerance is not None else ""),
            fn=(lambda actual, expected: float(actual) <= float(expected) + actual_tolerance),
        )
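RUF010 prefers f-string conversion flags to explicit str()/repr() calls inside the braces: `{x!s}` produces the same text as `{str(x)}` without the extra name lookup and call. The same rewrite recurs in the build scripts further down. A one-line sketch:

    tolerance = 0.5
    assert f"tol={tolerance!s}" == f"tol={str(tolerance)}"  # identical output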
@@ -28,7 +28,7 @@ def find_input_node(model, arg):
    for node in model.graph.node:
        for output in node.output:
            if output == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -37,7 +37,7 @@ def find_output_node(model, arg):
    for node in model.graph.node:
        for input in node.input:
            if input == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -136,7 +136,7 @@ def process_concat(model):
    assert reshape_node.op_type == "Reshape"
    new_nodes[get_node_index(model, reshape_node)] = shape
    for n in fuse_nodes:
        delete_nodes.append(get_node_index(model, n))
        delete_nodes.append(get_node_index(model, n))  # noqa: PERF401

    # insert new shape to reshape
    index = 0
@@ -189,7 +189,7 @@ def fix_transpose(model):
    for n in model.graph.node:
        for input in n.input:
            if input == weight.name:
                result.append(n)
                result.append(n)  # noqa: PERF401
    if len(result) > 1:
        continue
    perm = node.attribute[0]
@@ -280,7 +280,7 @@ def remove_input_ids_check_subgraph(model):

    remove_node_index = []
    for n in removed_nodes:
        remove_node_index.append(get_node_index(model, n))
        remove_node_index.append(get_node_index(model, n))  # noqa: PERF401

    remove_node_index = list(set(remove_node_index))
    remove_node_index.sort(reverse=True)
@@ -141,10 +141,10 @@ def main():
    all_nodes = []
    for node in graph_proto.node:
        if node not in removed_nodes:
            all_nodes.append(node)
            all_nodes.append(node)  # noqa: PERF401

    for node in layer_norm_nodes:
        all_nodes.append(node)
        all_nodes.append(node)  # noqa: PERF402

    graph_proto.ClearField("node")
    graph_proto.node.extend(all_nodes)
@@ -26,7 +26,7 @@ def find_input_node(model, arg):
    for node in model.graph.node:
        for output in node.output:
            if output == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -35,7 +35,7 @@ def find_output_node(model, arg):
    for node in model.graph.node:
        for input in node.input:
            if input == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -94,7 +94,7 @@ def process_concat(model):
    if node.op_type == "Concat":
        input_nodes = []
        for input in node.input:
            input_nodes.append(find_input_node(model, input))
            input_nodes.append(find_input_node(model, input))  # noqa: PERF401
        # figure out target shape
        shape = []
        for input_node in input_nodes:
@@ -116,7 +116,7 @@ def process_concat(model):
    assert reshape_node.op_type == "Reshape"
    new_nodes[get_node_index(model, reshape_node)] = shape
    for n in fuse_nodes:
        delete_nodes.append(get_node_index(model, n))
        delete_nodes.append(get_node_index(model, n))  # noqa: PERF401
    # insert new shape to reshape
    index = 0
    for reshape_node_index in new_nodes:
@@ -218,7 +218,7 @@ def fix_transpose(model):
    for n in model.graph.node:
        for input in n.input:
            if input == weight.name:
                result.append(n)
                result.append(n)  # noqa: PERF401
    if len(result) > 1:
        continue
    perm = node.attribute[0]
@@ -242,7 +242,7 @@ def fix_transpose(model):
    old_ws = []
    for t in transpose:
        if find_output_node(model, t[1].name) is None:
            old_ws.append(find_weight_index(model, t[1].name))
            old_ws.append(find_weight_index(model, t[1].name))  # noqa: PERF401
    old_ws.sort(reverse=True)
    for w_i in old_ws:
        del model.graph.initializer[w_i]
@@ -34,7 +34,7 @@ def find_input_node(model, arg):
    for node in model.graph.node:
        for output in node.output:
            if output == arg:
                result.append(node)
                result.append(node)  # noqa: PERF401
    return result[0] if len(result) == 1 else None
@@ -30,11 +30,11 @@ def process_file(onnx_file):
    if node.op_type == "ATen":
        for attr in node.attribute:
            if attr.name == "operator":
                aten_ops.append(f"{node.name}: {attr.s.decode('utf-8')}")
                aten_ops.append(f"{node.name}: {attr.s.decode('utf-8')}")  # noqa: PERF401
    if node.op_type == "PythonOp":
        for attr in node.attribute:
            if attr.name == "name":
                python_ops.append(f"{node.name}: {attr.s.decode('utf-8')}")
                python_ops.append(f"{node.name}: {attr.s.decode('utf-8')}")  # noqa: PERF401

    # Look for stand-alone Dropout node in *_execution_model_<mode>.onnx graph.
    # Examine whether it should be fused with surrounding Add ops into BiasDropout node.
@@ -49,7 +49,7 @@ def split_graph(model, split_edge_groups):
    element_types.append(1)
    for info in model.graph.value_info:
        if info.name == id:
            output_shapes.append(info.type)
            output_shapes.append(info.type)  # noqa: PERF401

    send_input_signal_name = "send_input_signal" + str(cut_index)
    send_signal = model.graph.input.add()
@@ -279,14 +279,14 @@ def generate_subgraph(model, start_nodes, identity_node_list):
    # remove added identity node before copy to subgraph
    identity_node_index = []
    for n in identity_node_list:
        identity_node_index.append(get_identity_index_for_deleting(main_graph.graph.node, n))
        identity_node_index.append(get_identity_index_for_deleting(main_graph.graph.node, n))  # noqa: PERF401
    identity_node_index.sort(reverse=True)

    for i in reversed(range(len(main_graph.graph.node))):
        try:
            if i in identity_node_index:
                del main_graph.graph.node[i]
        except Exception:
        except Exception:  # noqa: PERF203
            print("error deleting identity node", i)

    all_visited_nodes = []
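PERF203 flags try/except inside a loop because the handler setup is paid on every iteration (much cheaper on CPython 3.11+, but still noise); the noqa keeps the defensive per-item handling, which is the right call when one bad item should not abort the rest. When it may, the handler can be hoisted, as in this sketch:

    items = ["1", "2", "oops", "4"]

    # Kept pattern: per-item handler, one bad item is skipped (PERF203).
    parsed = []
    for item in items:
        try:
            parsed.append(int(item))
        except ValueError:
            print("skipping", item)

    # Hoisted pattern: one handler; any bad item aborts the whole batch.
    try:
        parsed_all = [int(item) for item in items]
    except ValueError:
        parsed_all = []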
@@ -316,19 +316,19 @@ def generate_subgraph(model, start_nodes, identity_node_list):
    # gather visited nodes
    visited_nodes = []
    for n in visited0:
        visited_nodes.append(get_index(main_graph.graph.node, n))
        visited_nodes.append(get_index(main_graph.graph.node, n))  # noqa: PERF401
    visited_nodes.sort(reverse=True)

    # gather visited inputs
    visited_inputs = []
    for n in inputs0:
        visited_inputs.append(get_index(main_graph.graph.input, n))
        visited_inputs.append(get_index(main_graph.graph.input, n))  # noqa: PERF401
    visited_inputs.sort(reverse=True)

    # gather visited outputs
    visited_outputs = []
    for n in outputs0:
        visited_outputs.append(get_index(main_graph.graph.output, n))
        visited_outputs.append(get_index(main_graph.graph.output, n))  # noqa: PERF401
    visited_outputs.sort(reverse=True)

    for i in reversed(range(len(main_graph.graph.node))):
@@ -337,7 +337,7 @@ def generate_subgraph(model, start_nodes, identity_node_list):
                del subgraph.graph.node[i]
            else:
                del main_graph.graph.node[i]
        except Exception:
        except Exception:  # noqa: PERF203
            print("error deleting node", i)

    for i in reversed(range(len(main_graph.graph.input))):
@@ -346,7 +346,7 @@ def generate_subgraph(model, start_nodes, identity_node_list):
                del subgraph.graph.input[i]
            else:
                del main_graph.graph.input[i]
        except Exception:
        except Exception:  # noqa: PERF203
            print("error deleting inputs", i)

    for i in reversed(range(len(main_graph.graph.output))):
@@ -355,7 +355,7 @@ def generate_subgraph(model, start_nodes, identity_node_list):
                del subgraph.graph.output[i]
            else:
                del main_graph.graph.output[i]
        except Exception:
        except Exception:  # noqa: PERF203
            print("error deleting outputs ", i)

    print("model", str(model_count), " length ", len(subgraph.graph.node))
@@ -45,20 +45,22 @@ reportMissingImports = false
# NOTE: Do not create an exclude list. Edit .lintrunner.toml instead
target-version = "py38"
select = [
    "B",  # flake8-bugbear
    "E",  # pycodestyle
    "F",  # Pyflakes
    "W",  # pycodestyle
    "B",  # flake8-bugbear
    "N",  # pep8-naming
    "ISC",  # flake8-implicit-str-concat
    "YTT",  # flake8-2020
    "RUF",  # Ruff-specific rules
    "SIM",  # flake8-simplify
    "UP",  # pyupgrade
    "N",  # pep8-naming
    "NPY",  # numpy
    "PERF",  # Perflint
    "PLC",  # pylint conventions
    "PLE",  # pylint errors
    "PLW",  # pylint warnings
    "PLC",  # pylint conventions
    "NPY",  # numpy
    "RUF",  # Ruff-specific rules
    "SIM",  # flake8-simplify
    "T10",  # flake8-debugger
    "UP",  # pyupgrade
    "W",  # pycodestyle
    "YTT",  # flake8-2020
]
# NOTE: Refrain from growing the ignore list unless for exceptional cases.
# Always include a comment to explain why.
@@ -1,9 +1,9 @@
# This file is auto updated by dependabot
lintrunner-adapters>=0.8.0
# RUFF, RUFF-FIX
ruff==0.0.261
ruff==0.0.278
# BLACK-ISORT
black==23.3.0
black==23.7.0
isort==5.12.0
# PYLINT
pylint==2.17.2
@@ -120,7 +120,8 @@ def main():
    "--file",
    str(SCRIPT_DIR / "Dockerfile"),
    *docker_build_image_args,
] + [str(SCRIPT_DIR)]
    str(SCRIPT_DIR),
]

run(docker_build_image_cmd)
@@ -154,7 +155,10 @@ def main():
    # enable use of Ctrl-C to stop when running interactively
    docker_run_interactive_args = ["-it"] if sys.stdin.isatty() else []

    docker_container_build_cmd = [args.docker_path, "run", *docker_run_interactive_args] + [
    docker_container_build_cmd = [
        args.docker_path,
        "run",
        *docker_run_interactive_args,
        f"--name={args.docker_container_name}" if args.docker_container_name is not None else "--rm",
        f"--volume={working_dir}:/workspace/shared",
        args.docker_image_tag,
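Both docker-command hunks replace `[...] + [...]` concatenation with a single list literal using * unpacking (the shape RUF005 recommends): one allocation, and each argument keeps its own line under the formatter. A sketch:

    import sys

    interactive = ["-it"] if sys.stdin.isatty() else []

    # Before: builds two lists, then a third from their concatenation.
    cmd = ["docker", "run"] + interactive + ["image:tag"]

    # After: a single literal with unpacking; same result.
    cmd = ["docker", "run", *interactive, "image:tag"]
    print(cmd)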
@@ -102,7 +102,7 @@ def convert_and_save(metadata, header_file, out_dir, out_obj_file):
    # convert constants
    constants = []
    for k, v in m["constants"].items():
        constants.append(f'{{ "{k}", {str(v)}}}')
        constants.append(f'{{ "{k}", {v!s}}}')
    meta_ele.append(f"{{ { ', '.join(constants) } }}")

c_metadata.append(f"{{ { ', '.join(meta_ele) } }}")
@@ -86,7 +86,7 @@ def main():
    manylinux_build_scripts_folder = Path(args.manylinux_src) / "docker" / "build_scripts"
    dest = Path(args.context) / "build_scripts"
    if dest.exists():
        log.info(f"Deleting: {str(dest)}")
        log.info(f"Deleting: {dest!s}")
        shutil.rmtree(str(dest))
    shutil.copytree(str(manylinux_build_scripts_folder), str(dest))
    src_entrypoint_file = str(Path(args.manylinux_src) / "docker" / "manylinux-entrypoint")
@@ -110,7 +110,7 @@ ort_common_from = "" if not ort_common_ver else ("node" if RELEASE_NODE else ("w
print("====== output environment variables ======")
print(f"##vso[task.setvariable variable=ORT_COMMON_FROM]{ort_common_from}")

if tag == "latest" or tag == "" or tag == "rc":  # noqa: PLC1901
if tag == "latest" or tag == "" or tag == "rc":
    if not RELEASE_NODE or not RELEASE_WEB or not RELEASE_REACT_NATIVE:
        raise Exception("@latest or @rc build must release all packages (node, web, react-native)")
    if count_ort_node_common_tgz != 1:
@@ -137,7 +137,7 @@ print(f"ort_node_ver={ort_node_ver}")
print(f"ort_web_ver={ort_web_ver}")
print(f"ort_react_native_ver={ort_react_native_ver}")

if tag == "latest" or tag == "":  # noqa: PLC1901
if tag == "latest" or tag == "":
    print("Publishing @latest ...")
    if not source_branch.startswith("refs/heads/rel-"):
        raise Exception('@latest build must publish from source branch "refs/heads/rel-*"')
@@ -164,5 +164,5 @@ if (
    and "+" not in ort_web_ver.replace("-rev", "")
    and "+" not in ort_react_native_ver.replace("-rev", "")
):
    if tag != "latest" and tag != "":  # noqa: PLC1901
    if tag != "latest" and tag != "":
        raise Exception("default version without decorator can only be published in @latest tag")
Some files were not shown because too many files changed in this diff.