From faea42af95253771c530242b917376c36b4d75ec Mon Sep 17 00:00:00 2001
From: Justin Chu
Date: Wed, 13 Mar 2024 10:00:32 -0700
Subject: [PATCH] Bump ruff to 0.3.2 and black to 24 (#19878)

### Motivation and Context

Routine updates

---
 cgmanifests/generate_cgmanifest.py | 4 +-
 .../examples/plot_train_convert_predict.py | 2 +-
 .../python/tools/microbench/benchmark.py | 9 +-
 .../tools/quantization/base_quantizer.py | 12 +-
 .../python/tools/quantization/calibrate.py | 6 +-
 .../quantization/matmul_4bits_quantizer.py | 2 +-
 .../python/tools/quantization/onnx_model.py | 6 +-
 .../tools/quantization/operators/concat.py | 4 +-
 .../tools/quantization/operators/gemm.py | 4 +-
 .../tools/quantization/qdq_quantizer.py | 4 +-
 .../python/tools/quantization/quant_utils.py | 4 +-
 .../python/tools/symbolic_shape_infer.py | 58 +--
 .../python/tools/tensorrt/perf/benchmark.py | 4 +-
 .../tools/tensorrt/perf/benchmark_wrapper.py | 6 +-
 .../python/tools/transformers/benchmark.py | 4 +-
 .../tools/transformers/benchmark_helper.py | 2 +-
 .../tools/transformers/bert_perf_test.py | 6 +-
 .../transformers/compare_bert_results.py | 16 +-
 .../python/tools/transformers/float16.py | 4 +-
 .../transformers/fusion_attention_unet.py | 8 +-
 .../tools/transformers/fusion_embedlayer.py | 9 +-
 .../transformers/fusion_qordered_gelu.py | 8 +-
 .../transformers/fusion_qordered_layernorm.py | 8 +-
 .../models/gpt2/benchmark_gpt2.py | 2 +-
 .../transformers/models/gpt2/gpt2_helper.py | 6 +-
 .../transformers/models/gpt2/gpt2_parity.py | 2 +-
 .../transformers/models/llama/benchmark.py | 34 +-
 .../models/llama/convert_to_onnx.py | 8 +-
 .../models/longformer/benchmark_longformer.py | 4 +-
 .../models/phi2/inference_example.py | 31 +-
 .../stable_diffusion/diffusion_models.py | 1 -
 .../models/whisper/convert_to_onnx.py | 6 +-
 .../models/whisper/whisper_chain.py | 8 +-
 .../models/whisper/whisper_helper.py | 2 +-
 .../tools/transformers/onnx_model_phi.py | 6 +-
 .../tools/transformers/onnx_model_unet.py | 2 +-
 .../tools/transformers/shape_optimizer.py | 4 +-
 .../reduction_test_cases_generator.py | 4 +-
 onnxruntime/test/providers/cpu/rnn/GRU.py | 2 +-
 onnxruntime/test/providers/cpu/rnn/LSTM.py | 2 +-
 .../test/python/quantization/test_op_pad.py | 8 +-
 .../test_quantizeblockwise_4bits.py | 8 +-
 .../transformers/bert_model_generator.py | 24 +-
 .../transformers/conformer_model_generator.py | 4 +-
 .../transformers/gpt2_model_generator.py | 360 ++++++++++--------
 .../sharded_moe/test_sharded_moe.py | 68 ++--
 .../generate_tiny_keras2onnx_bert_models.py | 4 +-
 .../python/transformers/test_flash_attn.py | 16 +-
 .../transformers/whisper_model_generator.py | 4 +-
 .../custom_op_test_float8.py | 1 +
 .../adamw_test/adamw_test_data_generator.py | 2 +-
 .../transform/fusion/embed_layer_norm_gen.py | 260 +++++++------
 .../orttraining/python/training/__init__.py | 4 +-
 .../training/optim/_apex_amp_modifier.py | 1 -
 .../python/training/ort_triton/_lowering.py | 2 +-
 .../training/ort_triton/kernel/_flash_attn.py | 2 +-
 .../ortmodule/_fallback_exceptions.py | 12 -
 .../ortmodule/_graph_execution_manager.py | 7 +-
 .../python/training/ortmodule/_logger.py | 8 +-
 .../training/ortmodule/_runtime_inspector.py | 12 +-
 .../training/ortmodule/_training_manager.py | 4 +-
 .../python/training/ortmodule/_utils.py | 2 +-
 .../python/training/ortmodule/options.py | 1 -
 .../test/external_custom_ops/setup.py | 4 +-
 .../orttraining/test/python/_test_commons.py | 2 +-
 .../orttraining/test/python/_test_helpers.py | 1 -
 .../test/python/orttraining_test_gru.py | 4 +-
.../test/python/orttraining_test_lstm.py | 16 +- .../orttraining_test_ort_apis_onnxblock.py | 8 +- .../python/orttraining_test_ortmodule_api.py | 30 +- ...training_test_ortmodule_bert_classifier.py | 2 +- ...test_ortmodule_bert_classifier_autocast.py | 2 +- ...g_test_ortmodule_deepspeed_zero_stage_1.py | 25 +- ...t_ortmodule_fairscale_sharded_optimizer.py | 16 +- .../orttraining_test_ortmodule_onnx_ops.py | 1 + .../python/orttraining_test_ortmodule_poc.py | 18 +- .../test/python/qat_poc_example/quantize.py | 2 +- orttraining/tools/amdgpu/script/rocprof.py | 12 +- .../tools/ci_test/run_bert_perf_test.py | 4 +- .../tools/scripts/nv_run_pretraining.py | 12 +- orttraining/tools/scripts/watch_experiment.py | 12 +- pyproject.toml | 12 +- requirements-lintrunner.txt | 4 +- setup.py | 14 +- tools/ci_build/build.py | 20 +- tools/ci_build/clean_docker_image_cache.py | 6 +- tools/ci_build/get_docker_image.py | 6 +- .../github/android/build_aar_package.py | 8 +- .../apple/build_and_assemble_apple_pods.py | 4 +- .../github/apple/build_apple_framework.py | 8 +- .../ort_minimal/check_build_binary_size.py | 4 +- .../windows/post_binary_sizes_to_dashboard.py | 2 +- tools/ci_build/op_registration_utils.py | 2 - tools/ci_build/op_registration_validator.py | 2 +- tools/doc/rename_folders.py | 1 + .../nuget/generate_nuspec_for_native_nuget.py | 10 +- tools/python/dump_ort_model.py | 18 +- ...ptimizer_opset_version_updates_required.py | 4 +- tools/python/gen_contrib_doc.py | 6 +- .../python/util/convert_onnx_models_to_ort.py | 14 +- .../check_model_can_use_ort_mobile_pkg.py | 2 +- .../operator_type_usage_processors.py | 12 +- .../ort_format_model/ort_model_processor.py | 8 +- 103 files changed, 702 insertions(+), 764 deletions(-) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 81181d3ccfb20..3cecbb0cc977f 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -115,8 +115,8 @@ def normalize_path_separators(path): submodule_lines = proc.stdout.splitlines() for submodule_line in submodule_lines: (absolute_path, url, commit) = submodule_line.split(" ") - git_deps[GitDep(commit, url)] = "git submodule at {}".format( - normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR)) + git_deps[GitDep(commit, url)] = ( + f"git submodule at {normalize_path_separators(os.path.relpath(absolute_path, REPO_DIR))}" ) with open(os.path.join(SCRIPT_DIR, "..", "cmake", "deps.txt")) as f: diff --git a/docs/python/examples/plot_train_convert_predict.py b/docs/python/examples/plot_train_convert_predict.py index dcbc84b20767a..44b6bb74c29df 100644 --- a/docs/python/examples/plot_train_convert_predict.py +++ b/docs/python/examples/plot_train_convert_predict.py @@ -134,7 +134,7 @@ def loop(X_test, fct, n=None): nrow = X_test.shape[0] if n is None: n = nrow - for i in range(0, n): + for i in range(n): im = i % nrow fct(X_test[im : im + 1]) diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py index a52740d45956c..a5936afcfe13e 100644 --- a/onnxruntime/python/tools/microbench/benchmark.py +++ b/onnxruntime/python/tools/microbench/benchmark.py @@ -147,20 +147,17 @@ def __init__(self, args): @classmethod @abstractmethod - def create_inputs_outputs(cls, op_param): - ... + def create_inputs_outputs(cls, op_param): ... def add_case(self, op_param, model): self.cases += [(op_param, model)] @abstractmethod - def create_cases(self): - ... + def create_cases(self): ... 
@classmethod @abstractmethod - def case_profile(cls, op_param, time): - ... + def case_profile(cls, op_param, time): ... def benchmark(self): self.create_cases() diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index 6fa88a9e44232..667d7047c1fbd 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -187,17 +187,13 @@ def check_opset_version(self): if opset_version == 10: logging.warning( - "The original model opset version is {}, which does not support node fusions. Please update the model to opset >= 11 for better performance.".format( - opset_version - ) + f"The original model opset version is {opset_version}, which does not support node fusions. Please update the model to opset >= 11 for better performance." ) return 10 if opset_version < 10: logging.warning( - "The original model opset version is {}, which does not support quantization. Please update the model to opset >= 11. Updating the model automatically to opset 11. Please verify the quantized model.".format( - opset_version - ) + f"The original model opset version is {opset_version}, which does not support quantization. Please update the model to opset >= 11. Updating the model automatically to opset 11. Please verify the quantized model." ) self.model.model.opset_import.remove(ai_onnx_domain[0]) self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 11)]) @@ -205,9 +201,9 @@ def check_opset_version(self): if opset_version < 19 and self.weight_qType == onnx.TensorProto.FLOAT8E4M3FN: logging.warning( - "The original model opset version is {}, which does not support quantization to float 8. " + f"The original model opset version is {opset_version}, which does not support quantization to float 8. " "Please update the model to opset >= 19. Updating the model automatically to opset 19. " - "Please verify the quantized model.".format(opset_version) + "Please verify the quantized model." 
) self.model.model.opset_import.remove(ai_onnx_domain[0]) self.model.model.opset_import.extend([onnx.helper.make_opsetid("", 19)]) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 624049b244580..971cc203f4d73 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -918,11 +918,7 @@ def compute_entropy(self): thresholds_dict = {} # per tensor thresholds print(f"Number of tensors : {len(histogram_dict)}") - print( - "Number of histogram bins : {} (The number may increase depends on the data it collects)".format( - self.num_bins - ) - ) + print(f"Number of histogram bins : {self.num_bins} (The number may increase depends on the data it collects)") print(f"Number of quantized bins : {self.num_quantized_bins}") for tensor, histogram in histogram_dict.items(): diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index a1916e806c5c0..f4bcd508960a1 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -216,7 +216,7 @@ def pack_on_row_fast_248bit(pack_tensor, ori_int_tensor, bits): pack_tensor = pack_tensor.T if bits in [2, 4, 8]: compress_ratio = pack_tensor.element_size() * 8 // bits - for j in range(0, compress_ratio): + for j in range(compress_ratio): pack_tensor[0:] |= ori_int_tensor[j::compress_ratio] << (bits * (j)) else: raise NotImplementedError("Only 2,4,8 bits are supported.") diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 46d245d353a07..716dd1eacec6a 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -79,11 +79,7 @@ def _clean_initializers_helper(graph, model): graph.input.remove(name_to_input[initializer.name]) except StopIteration: if model.ir_version < 4: - print( - "Warning: invalid weight name {} found in the graph (not a graph input)".format( - initializer.name - ) - ) + print(f"Warning: invalid weight name {initializer.name} found in the graph (not a graph input)") requesting_tensor_names.difference_update(input.name for input in graph.input) diff --git a/onnxruntime/python/tools/quantization/operators/concat.py b/onnxruntime/python/tools/quantization/operators/concat.py index a4f359cf56847..57fcec9cd380b 100644 --- a/onnxruntime/python/tools/quantization/operators/concat.py +++ b/onnxruntime/python/tools/quantization/operators/concat.py @@ -30,7 +30,7 @@ def quantize(self): zero_point_names, scale_names, nodes, - ) = self.quantizer.quantize_activation(node, [*range(0, len(node.input))]) + ) = self.quantizer.quantize_activation(node, [*range(len(node.input))]) if not data_found or q_input_names is None: return super().quantize() @@ -52,7 +52,7 @@ def quantize(self): qnode_name = node.name + "_quant" if node.name else "" qlconcat_inputs = [output_scale_name, output_zp_name] - for i in range(0, len(q_input_names)): + for i in range(len(q_input_names)): qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]]) qlconcat_node = onnx.helper.make_node( "QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs diff --git a/onnxruntime/python/tools/quantization/operators/gemm.py b/onnxruntime/python/tools/quantization/operators/gemm.py index 
32fdb729635a8..d269c8fb47bd1 100644 --- a/onnxruntime/python/tools/quantization/operators/gemm.py +++ b/onnxruntime/python/tools/quantization/operators/gemm.py @@ -157,7 +157,5 @@ def quantize(self): set_default_beta(self.node) else: logging.warning( - "Bias of Gemm node '{}' is not constant. Please exclude this node for better performance.".format( - self.node.name - ) + f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance." ) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index e221a2d57db8b..1875c552fab9c 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -153,9 +153,7 @@ def _is_tensor_quantizable(self, tensor_name): return True else: logging.warning( - "failed to infer the type of tensor: {}. Skip to quantize it. Please check if it is expected.".format( - tensor_name - ) + f"failed to infer the type of tensor: {tensor_name}. Skip to quantize it. Please check if it is expected." ) return False diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 036f49b420734..131e55458fb86 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -276,7 +276,7 @@ def compute_scale_zp_float8(element_type, std): from onnx.reference.custom_element_types import float8e4m3fn zp_dtype = float8e4m3fn - all_values = [float8e4m3_to_float32(i) for i in range(0, 256)] + all_values = [float8e4m3_to_float32(i) for i in range(256)] values = numpy.array( [f for f in all_values if not numpy.isnan(f) and not numpy.isinf(f)], dtype=numpy.float32 ) @@ -530,7 +530,7 @@ def get_elem_index(elem_name, elem_list): Helper function to return index of an item in a node list """ elem_idx = -1 - for i in range(0, len(elem_list)): + for i in range(len(elem_list)): if elem_list[i] == elem_name: elem_idx = i return elem_idx diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 4b029f9b172b0..8a911071864aa 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -282,7 +282,7 @@ def _add_suggested_merge(self, symbols, apply=False): # when nothing to map to, use the shorter one if map_to is None: if self.verbose_ > 0: - logger.warning("Potential unsafe merge between symbolic expressions: ({})".format(",".join(symbols))) + logger.warning("Potential unsafe merge between symbolic expressions: (%s)", ",".join(symbols)) symbols_list = list(symbols) lens = [len(s) for s in symbols_list] map_to = symbols_list[lens.index(min(lens))] @@ -335,10 +335,7 @@ def _merge_symbols(self, dims): int_dim = is_int.index(1) if self.verbose_ > 0: logger.debug( - "dim {} has been merged with value {}".format( - unique_dims[:int_dim] + unique_dims[int_dim + 1 :], - unique_dims[int_dim], - ) + f"dim {unique_dims[:int_dim] + unique_dims[int_dim + 1 :]} has been merged with value {unique_dims[int_dim]}" ) self._check_merged_dims(unique_dims, allow_broadcast=False) return unique_dims[int_dim] @@ -379,7 +376,7 @@ def _broadcast_shapes(self, shape1, shape2): if self.auto_merge_: self._add_suggested_merge([dim1, dim2], apply=True) else: - logger.warning("unsupported broadcast between " + str(dim1) + " " + str(dim2)) + logger.warning("unsupported broadcast between " + str(dim1) + " " + 
str(dim2)) # noqa: G003 new_shape = [new_dim, *new_shape] return new_shape @@ -663,12 +660,7 @@ def _new_symbolic_dim(self, prefix, dim): def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): return self._new_symbolic_dim( - "{}{}_{}_o{}_".format( - node.op_type, - self.prefix_, - list(self.out_mp_.graph.node).index(node), - out_idx, - ), + f"{node.op_type}{self.prefix_}_{list(self.out_mp_.graph.node).index(node)}_o{out_idx}_", dim, ) @@ -1216,9 +1208,7 @@ def _infer_Loop(self, node): # noqa: N802 if need_second_infer: if self.verbose_ > 2: logger.debug( - "Rerun Loop: {}({}...), because of sequence in loop carried variables".format( - node.name, node.output[0] - ) + f"Rerun Loop: {node.name}({node.output[0]}...), because of sequence in loop carried variables" ) self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False) @@ -1843,7 +1833,7 @@ def handle_negative_index(index, bound): axes = self._try_get_value(node, 3) steps = self._try_get_value(node, 4) if axes is None and not (starts is None and ends is None): - axes = list(range(0, len(starts if starts is not None else ends))) + axes = list(range(len(starts if starts is not None else ends))) if steps is None and not (starts is None and ends is None): steps = [1] * len(starts if starts is not None else ends) axes = as_list(axes, keep_none=True) @@ -2669,11 +2659,9 @@ def get_prereq(node): break if self.verbose_ > 2: - logger.debug(node.op_type + ": " + node.name) + logger.debug(node.op_type + ": " + node.name) # noqa: G003 for i, name in enumerate(node.input): - logger.debug( - " Input {}: {} {}".format(i, name, "initializer" if name in self.initializers_ else "") - ) + logger.debug(" Input %s: %s %s", i, name, "initializer" if name in self.initializers_ else "") # onnx automatically merge dims with value, i.e. 
Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb'] # symbolic shape inference needs to apply merge of 'aaa' -> 1000 in this case @@ -2722,7 +2710,7 @@ def get_prereq(node): seq_cls_type = out_type.sequence_type.elem_type.WhichOneof("value") if seq_cls_type == "tensor_type": logger.debug( - " {}: sequence of {} {}".format( + " {}: sequence of {} {}".format( # noqa: G001 node.output[i_o], str(get_shape_from_value_info(vi)), onnx.TensorProto.DataType.Name( @@ -2740,14 +2728,10 @@ def get_prereq(node): out_type_undefined = out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED if self.verbose_ > 2: logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(out_shape), - onnx.TensorProto.DataType.Name(vi.type.tensor_type.elem_type), - ) + f" {node.output[i_o]}: {out_shape!s} {onnx.TensorProto.DataType.Name(vi.type.tensor_type.elem_type)}" ) if node.output[i_o] in self.sympy_data_: - logger.debug(" Sympy Data: " + str(self.sympy_data_[node.output[i_o]])) + logger.debug(" Sympy Data: " + str(self.sympy_data_[node.output[i_o]])) # noqa: G003 # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain if ( @@ -2848,24 +2832,16 @@ def get_prereq(node): if self.verbose_ > 0: if is_unknown_op: logger.debug( - "Possible unknown op: {} node: {}, guessing {} shape".format( - node.op_type, node.name, vi.name - ) + f"Possible unknown op: {node.op_type} node: {node.name}, guessing {vi.name} shape" ) if self.verbose_ > 2: - logger.debug( - " {}: {} {}".format( - node.output[i_o], - str(new_shape), - vi.type.tensor_type.elem_type, - ) - ) + logger.debug(f" {node.output[i_o]}: {new_shape!s} {vi.type.tensor_type.elem_type}") self.run_ = True continue # continue the inference after guess, no need to stop as no merge is needed if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined: - logger.debug("Stopping at incomplete shape inference at " + node.op_type + ": " + node.name) + logger.debug("Stopping at incomplete shape inference at %s: %s", node.op_type, node.name) logger.debug("node inputs:") for i in node.input: if i in self.known_vi_: @@ -2879,7 +2855,7 @@ def get_prereq(node): else: logger.debug(f"not in known_vi_ for {o}") if self.auto_merge_ and not out_type_undefined: - logger.debug("Merging: " + str(self.suggested_merge_)) + logger.debug("Merging: " + str(self.suggested_merge_)) # noqa: G003 return False self.run_ = False @@ -2964,9 +2940,9 @@ def parse_arguments(): if __name__ == "__main__": args = parse_arguments() - logger.info("input model: " + args.input) + logger.info("input model: " + args.input) # noqa: G003 if args.output: - logger.info("output model " + args.output) + logger.info("output model " + args.output) # noqa: G003 logger.info("Doing symbolic shape inference...") out_mp = SymbolicShapeInference.infer_shapes( onnx.load(args.input), diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 20bb8a71dc35f..8af074f24acc9 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -790,7 +790,7 @@ def skip_ep(model_name, ep, model_to_fail_ep): # if ep in fail_ep_list and fail_ep_list[ep] == "runtime error": if ep in fail_ep_list: - logger.info("Skip testing " + model_name + " using " + ep + " since it has some issues.") + logger.info("Skip testing " + model_name + " using " + ep + " since it has some issues.") # noqa: G003 return True return False @@ -925,7 +925,7 @@ def find_model_path(path): 
logger.info(target_model_path) if len(target_model_path) > 1: - logger.error("We expect to find only one model in " + path) + logger.error("We expect to find only one model in " + path) # noqa: G003 raise return target_model_path[0] diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 93d41551c7121..f12d4599817b7 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -80,9 +80,9 @@ def main(): benchmark = is_benchmark_mode(args.running_mode) # noqa: F405 for model, model_info in models.items(): - logger.info("\n" + "=" * 40 + "=" * len(model)) # noqa: F405 - logger.info("=" * 20 + model + "=" * 20) # noqa: F405 - logger.info("=" * 40 + "=" * len(model)) # noqa: F405 + logger.info("\n" + "=" * 40 + "=" * len(model)) # noqa: F405, G003 + logger.info("=" * 20 + model + "=" * 20) # noqa: F405, G003 + logger.info("=" * 40 + "=" * len(model)) # noqa: F405, G003 model_info["model_name"] = model diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 89f9947688583..9baafbbfff0e3 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -802,7 +802,7 @@ def main(): try: os.mkdir(args.cache_dir) except OSError: - logger.error("Creation of the directory %s failed" % args.cache_dir) + logger.error("Creation of the directory %s failed" % args.cache_dir) # noqa: G002 enable_torch = "torch" in args.engines enable_torch2 = "torch2" in args.engines @@ -921,7 +921,7 @@ def main(): args, ) except Exception: - logger.error("Exception", exc_info=True) + logger.exception("Exception") time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") if model_fusion_statistics: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index c9c815f01e053..66f7a63447764 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -142,7 +142,7 @@ def create_onnxruntime_session( session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) except Exception: - logger.error("Exception", exc_info=True) + logger.error("Exception", exc_info=True) # noqa: G201 return session diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 9c743a83819c3..17c5d3602bb3b 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -232,9 +232,9 @@ def onnxruntime_inference(session, all_inputs, output_names): def to_string(model_path, session, test_setting): sess_options = session.get_session_options() option = f"model={os.path.basename(model_path)}," - option += "graph_optimization_level={},intra_op_num_threads={},".format( - sess_options.graph_optimization_level, sess_options.intra_op_num_threads - ).replace("GraphOptimizationLevel.ORT_", "") + option += f"graph_optimization_level={sess_options.graph_optimization_level},intra_op_num_threads={sess_options.intra_op_num_threads},".replace( + "GraphOptimizationLevel.ORT_", "" + ) option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length}," option += f"test_cases={test_setting.test_cases},test_times={test_setting.test_times}," diff --git 
a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index 61e4c97c75c8c..0c5125e74c8a4 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -59,16 +59,10 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-1, atol=1e-3): print(f"abs_diff={abs_diff}") if diff_count == 0: - print( - "100% passed for {} random inputs given thresholds (rtol={}, atol={}).".format( - len(baseline_results), rtol, atol - ) - ) + print(f"100% passed for {len(baseline_results)} random inputs given thresholds (rtol={rtol}, atol={atol}).") else: print( - "WARNING: {} out of {} results NOT passed for thresholds (rtol={}, atol={}).".format( - diff_count, len(baseline_results), rtol, atol - ) + f"WARNING: {diff_count} out of {len(baseline_results)} results NOT passed for thresholds (rtol={rtol}, atol={atol})." ) print(f"maximum absolute difference={max_abs_diff}") @@ -117,11 +111,7 @@ def run_test( baseline_model, all_inputs, use_gpu, disable_optimization=True ) if verbose: - print( - "baseline average latency (all optimizations disabled): {} ms".format( - statistics.mean(baseline_latency) * 1000 - ) - ) + print(f"baseline average latency (all optimizations disabled): {statistics.mean(baseline_latency) * 1000} ms") if output_dir is not None: for i, inputs in enumerate(all_inputs): diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index 48c79b1d5fa0f..2398bb9d6031b 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -411,9 +411,7 @@ def convert_float_to_float16( value_info_list.append(make_value_info_from_tensor(value.initializer)) if value.fp32_nodes and not force_fp16_initializers: logger.info( - "initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format( - value.fp16_nodes - ) + f"initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{value.fp16_nodes}" ) # Some operators have data type fixed as float for some input. Add a float16 to float cast for those inputs. 
diff --git a/onnxruntime/python/tools/transformers/fusion_attention_unet.py b/onnxruntime/python/tools/transformers/fusion_attention_unet.py index 9a353e7e2d675..048c13cdb1e2c 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_unet.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_unet.py @@ -373,9 +373,7 @@ def create_attention_node( else "MultiHeadAttention ({})".format( "self attention with packed qkv" if self.enable_packed_qkv - else "cross attention with packed kv" - if self.enable_packed_kv - else "cross attention" + else "cross attention with packed kv" if self.enable_packed_kv else "cross attention" ) ) self.increase_counter(counter_name) @@ -843,9 +841,7 @@ def create_attention_node_lora( else "MultiHeadAttention ({})".format( "self attention with packed qkv" if self.enable_packed_qkv - else "cross attention with packed kv" - if self.enable_packed_kv - else "cross attention" + else "cross attention with packed kv" if self.enable_packed_kv else "cross attention" ) ) self.increase_counter(counter_name) diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index 42156d9123383..70ff57f0626e1 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -345,18 +345,13 @@ def check_embedding(self, word_embedding_gather, segment_embedding_gather, posit and input_ids_shape[1] == position_ids_shape[1] ): logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format( - input_ids_shape, position_ids_shape - ) + f"Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {input_ids_shape} vs {position_ids_shape}" ) return False if segment_ids and not self.shape_infer.compare_shape(input_ids, segment_ids): logger.info( - "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format( - input_ids_shape, - self.shape_infer.get_edge_shape(segment_ids), - ) + f"Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {input_ids_shape} != {self.shape_infer.get_edge_shape(segment_ids)}" ) return False diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py index 6c44bb11e24dc..5f395b364eb6f 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py @@ -75,9 +75,11 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): if not self.model.is_safe_to_fuse_nodes( subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, + ( + [node.output[0], downstream_quantize_node.output[0]] + if downstream_shape_node is not None + else downstream_quantize_node.output + ), input_name_to_nodes, output_name_to_node, ): diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py index cf2b357721757..5ec6dadc1e677 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py @@ -77,9 +77,11 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): if not self.model.is_safe_to_fuse_nodes( 
subgraph_nodes, - [node.output[0], downstream_quantize_node.output[0]] - if downstream_shape_node is not None - else downstream_quantize_node.output, + ( + [node.output[0], downstream_quantize_node.output[0]] + if downstream_shape_node is not None + else downstream_quantize_node.output + ), input_name_to_nodes, output_name_to_node, ): diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py index e48f0adc832c5..6d6a057574a17 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -400,7 +400,7 @@ def main(args): } csv_writer.writerow(row) except Exception: - logger.error("Exception", exc_info=True) + logger.error("Exception", exc_info=True) # noqa: G201 return None logger.info(f"Results are saved to file {csv_filename}") diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index e01585ae84163..9153193a4974a 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -630,7 +630,7 @@ def pytorch_inference(model, inputs: Gpt2Inputs, total_runs: int = 0): latency.append(time.time() - start) average_latency = sum(latency) * 1000 / len(latency) - logger.debug("PyTorch inference time = {} ms".format(format(average_latency, ".2f"))) + logger.debug("PyTorch inference time = {} ms".format(format(average_latency, ".2f"))) # noqa: G001 return outputs, average_latency @@ -662,7 +662,7 @@ def onnxruntime_inference(ort_session, inputs: Gpt2Inputs, total_runs: int = 0): latency.append(time.time() - start) average_latency = sum(latency) * 1000 / len(latency) - logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, ".2f"))) + logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, ".2f"))) # noqa: G001 return ort_outputs, average_latency @@ -741,7 +741,7 @@ def onnxruntime_inference_with_binded_io( latency.append(time.time() - start) average_latency = sum(latency) * 1000 / len(latency) - logger.debug("OnnxRuntime with IO binding inference time = {} ms".format(format(average_latency, ".2f"))) + logger.debug("OnnxRuntime with IO binding inference time = %.2f ms", average_latency) return ort_outputs, average_latency diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py index 4823f0d5874dd..b039f1351b1d0 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py @@ -179,7 +179,7 @@ def print_wins(wins, rows, test_name): for row in rows: if row["run_id"] == key: logger.info( - "{:02d}: WINs={:02d}, run_id={}, latency={:5.2f}, top1_match={:.4f}, size={}_MB, experiment={}, {}".format( + "{:02d}: WINs={:02d}, run_id={}, latency={:5.2f}, top1_match={:.4f}, size={}_MB, experiment={}, {}".format( # noqa: G001 rank, value, key, diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark.py b/onnxruntime/python/tools/transformers/models/llama/benchmark.py index f597cead40331..bfe108d21a595 100644 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/llama/benchmark.py @@ -55,11 +55,7 @@ def get_inputs(args: argparse.Namespace, 
ort_model_inputs_len: int): max_seq_len = ( 2048 if args.benchmark_type == "ort-msft" - else 16384 - if "codellama" in temp_name - else 4096 - if "llama2" in temp_name - else 2048 + else 16384 if "codellama" in temp_name else 4096 if "llama2" in temp_name else 2048 ) if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}: @@ -278,21 +274,25 @@ def time_fn(args, fn, inputs): outputs = fn(inputs) logger.info(outputs) - input_sync = ( # noqa: E731 - lambda *kwargs: args.io_binding.synchronize_inputs() + input_sync = lambda *kwargs: ( # noqa: E731 + args.io_binding.synchronize_inputs() if args.device != "cpu" and args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"} # ORT synchronize - else lambda *kwargs: torch.cuda.synchronize() - if args.device != "cpu" and torch.cuda.is_available() # PyTorch synchronize - else lambda *kwargs: None # no-op function - ) + else lambda *kwargs: ( + torch.cuda.synchronize() + if args.device != "cpu" and torch.cuda.is_available() # PyTorch synchronize + else lambda *kwargs: None + ) + ) # no-op function - output_sync = ( # noqa: E731 - lambda *kwargs: args.io_binding.synchronize_outputs() + output_sync = lambda *kwargs: ( # noqa: E731 + args.io_binding.synchronize_outputs() if args.device != "cpu" and args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"} # ORT synchronize - else lambda *kwargs: torch.cuda.synchronize() - if args.device != "cpu" and torch.cuda.is_available() # PyTorch synchronize - else lambda *kwargs: None # no-op function - ) + else lambda *kwargs: ( + torch.cuda.synchronize() + if args.device != "cpu" and torch.cuda.is_available() # PyTorch synchronize + else lambda *kwargs: None + ) + ) # no-op function for _ in warmup_range: input_sync() diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index c9ff384a4c856..1ad58327b7fc2 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -944,9 +944,11 @@ def main(): ort_quantization.quantize_dynamic( fp32_path, int8_path, - op_types_to_quantize=["MatMul", "Gemm", "Gather"] - if args.quantize_embedding_layer - else ["MatMul", "Gemm"], + op_types_to_quantize=( + ["MatMul", "Gemm", "Gather"] + if args.quantize_embedding_layer + else ["MatMul", "Gemm"] + ), per_channel=args.quantize_per_channel, reduce_range=args.quantize_reduce_range, use_external_data_format=True, diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 51a967cf22608..ab92a12343732 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -335,7 +335,7 @@ def test_ort(args, device) -> List[Dict[str, Any]]: onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx - optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx") + optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx") # noqa: PIE810 precision = "fp32" if not onnx_model_path.endswith("_fp16.onnx") else "fp16" model = load_torch_model(model_name, device) @@ -590,7 +590,7 @@ def run_tests( logger.info(f"ORT_LONGFORMER_COMPACT_MEMORY={compact_memory}") os.environ["ORT_LONGFORMER_USE_HALF4"] = "1" if use_half4 else "0" - 
logger.info("ORT_LONGFORMER_USE_HALF4={}".format("1" if use_half4 else "0")) + logger.info("ORT_LONGFORMER_USE_HALF4={}".format("1" if use_half4 else "0")) # noqa: G001 results = [] test_times = 1000 diff --git a/onnxruntime/python/tools/transformers/models/phi2/inference_example.py b/onnxruntime/python/tools/transformers/models/phi2/inference_example.py index 829334b46b469..eb66533f00834 100644 --- a/onnxruntime/python/tools/transformers/models/phi2/inference_example.py +++ b/onnxruntime/python/tools/transformers/models/phi2/inference_example.py @@ -121,9 +121,11 @@ def get_initial_inputs_and_outputs(self, encodings_dict): if not self.use_traced_inputs: for i in range(self.num_layers): past = torch.zeros(past_shape, device=self.device, dtype=self.torch_dtype) - inputs.update( - {f"past_key_{i}": past.contiguous(), f"past_value_{i}": past.clone().contiguous()} - ) if not self.packed_kv else inputs.update({f"past_{i}": past.contiguous()}) + ( + inputs.update({f"past_key_{i}": past.contiguous(), f"past_value_{i}": past.clone().contiguous()}) + if not self.packed_kv + else inputs.update({f"past_{i}": past.contiguous()}) + ) else: for i in range(self.num_layers): inputs.update( @@ -144,9 +146,13 @@ def get_initial_inputs_and_outputs(self, encodings_dict): ) for i in range(self.num_layers): present = torch.zeros(present_shape, device=self.device, dtype=self.torch_dtype) - outputs.update( - {f"present_key_{i}": present.contiguous(), f"present_value_{i}": present.contiguous()} - ) if not self.packed_kv else outputs.update({f"present_{i}": present.contiguous()}) + ( + outputs.update( + {f"present_key_{i}": present.contiguous(), f"present_value_{i}": present.contiguous()} + ) + if not self.packed_kv + else outputs.update({f"present_{i}": present.contiguous()}) + ) return inputs, outputs @@ -323,9 +329,16 @@ def generate_impl(self, encodings_dict, max_length, cuda_graph_annotation, bench ) for i in range(self.num_layers): present = torch.zeros(present_shape, device=self.device, dtype=self.torch_dtype) - outputs.update( - {f"present_key_{i}": present.contiguous(), f"present_value_{i}": present.clone().contiguous()} - ) if not self.packed_kv else outputs.update({f"present_{i}": present.contiguous()}) + ( + outputs.update( + { + f"present_key_{i}": present.contiguous(), + f"present_value_{i}": present.clone().contiguous(), + } + ) + if not self.packed_kv + else outputs.update({f"present_{i}": present.contiguous()}) + ) if benchmark: print( diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index 10af22e44d3a5..c2cfc165e32cf 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -414,7 +414,6 @@ def get_profile_id(self, batch_size, image_height, image_width, static_batch, st def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_image_shape): """For TensorRT""" - pass def get_shape_dict(self, batch_size, image_height, image_width): pass diff --git a/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py index 35211aab272e4..5921e4ed42936 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/whisper/convert_to_onnx.py @@ -414,9 +414,9 @@ def 
export_onnx_models( quantization.quantize_dynamic( onnx_path, output_path, - op_types_to_quantize=["MatMul", "Gemm", "Gather"] - if quantize_embedding_layer - else ["MatMul", "Gemm"], + op_types_to_quantize=( + ["MatMul", "Gemm", "Gather"] if quantize_embedding_layer else ["MatMul", "Gemm"] + ), use_external_data_format=use_external_data_format, per_channel=quantize_per_channel, reduce_range=quantize_reduce_range, diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py index 14691da4ad643..0b128f122e0f4 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py @@ -149,9 +149,11 @@ def chain_model(args): helper.make_attribute("translate_token_id", tokenizer.convert_tokens_to_ids(["<|translate|>"])[0]), helper.make_attribute("transcribe_token_id", tokenizer.convert_tokens_to_ids(["<|transcribe|>"])[0]), helper.make_attribute("start_of_lm_token_id", tokenizer.convert_tokens_to_ids(["<|startoflm|>"])[0]), - helper.make_attribute("no_speech_token_id", tokenizer.convert_tokens_to_ids(["<|nospeech|>"])[0]) - if args.output_no_speech_probs - else "", + ( + helper.make_attribute("no_speech_token_id", tokenizer.convert_tokens_to_ids(["<|nospeech|>"])[0]) + if args.output_no_speech_probs + else "" + ), helper.make_attribute("no_timestamps_token_id", tokenizer.convert_tokens_to_ids(["<|notimestamps|>"])[0]), helper.make_attribute("beginning_timestamp_token_id", tokenizer.convert_tokens_to_ids(["<|0.00|>"])[0]), helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size), diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py index 1b47b9426d983..adf7f69470ae7 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py @@ -334,7 +334,7 @@ def verify_onnx( try: from datasets import load_dataset except Exception as e: - logger.error(f"An error occurred while importing `datasets`: {e}", exc_info=True) + logger.error(f"An error occurred while importing `datasets`: {e}", exc_info=True) # noqa: G201 install_cmd = "pip install datasets" logger.warning(f"Could not import `datasets`. 
Attempting to install `datasets` via `{install_cmd}`.") os.system(install_cmd) diff --git a/onnxruntime/python/tools/transformers/onnx_model_phi.py b/onnxruntime/python/tools/transformers/onnx_model_phi.py index 0fdce29ae0fa0..05a27ba487f4d 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_phi.py +++ b/onnxruntime/python/tools/transformers/onnx_model_phi.py @@ -353,8 +353,10 @@ def process_graph_io(self, attn_op_type: AttentionOpType): elem_type=TensorProto.INT64, shape=[1], ) - new_inputs.extend([vi_iid, vi_step, vi_mask]) if not self.use_vllm else new_inputs.extend( - [vi_iid, vi_pid, vi_meta] + ( + new_inputs.extend([vi_iid, vi_step, vi_mask]) + if not self.use_vllm + else new_inputs.extend([vi_iid, vi_pid, vi_meta]) ) if self.use_attn: if "past_key" in vi.name: diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index 01298b3576eb1..77e24986f0fde 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -127,7 +127,7 @@ def optimize(self, options: Optional[FusionOptions] = None): with logging_redirect_tqdm(): steps = 18 - progress_bar = tqdm.tqdm(range(0, steps), initial=0, desc="fusion") + progress_bar = tqdm.tqdm(range(steps), initial=0, desc="fusion") self._optimize(options, progress_bar) else: logger.info("tqdm is not installed. Run optimization without progress bar") diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index ac62188662990..503930b23229f 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -133,9 +133,7 @@ def use_static_input(self, inputs, batch_size=1, max_seq_len=128): dim_proto.dim_value = max_seq_len elif dim_proto.HasField("dim_value") and dim_proto.dim_value != max_seq_len: raise ValueError( - "Unable to set dimension value to {} for axis {} of {}. Contradicts existing dimension value {}.".format( - max_seq_len, 1, input.name, dim_proto.dim_value - ) + f"Unable to set dimension value to {max_seq_len} for axis {1} of {input.name}. Contradicts existing dimension value {dim_proto.dim_value}." 
) def create_dummy_inputs( diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index 727351cae84ac..568a4649f3977 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -59,7 +59,7 @@ def PrintResult(op, axes, keepdims, res): # noqa: N802 print(" // expected values") print("{", end="") - for i in range(0, res.size): + for i in range(res.size): print("%5.6ff," % res.item(i)) print("})},") @@ -128,7 +128,7 @@ def PrintReenableOptimizations(): # noqa: N802 print("ReductionTestCases testcases = {") print("// input_data") print("{") - for i in range(0, input_data.size): + for i in range(input_data.size): print( "%5.6ff," % input_data.item(i), ) diff --git a/onnxruntime/test/providers/cpu/rnn/GRU.py b/onnxruntime/test/providers/cpu/rnn/GRU.py index 144acaf14db61..f141710cf31ef 100644 --- a/onnxruntime/test/providers/cpu/rnn/GRU.py +++ b/onnxruntime/test/providers/cpu/rnn/GRU.py @@ -84,7 +84,7 @@ def run(self): hidden_size = f_output.shape[3] output = np.empty((0, 2, batch_size, hidden_size), np.float32) - for x in range(0, seq_length): + for x in range(seq_length): output = np.append(output, f_output[x]) output = np.append(output, r_output_orig_input_order[x]) diff --git a/onnxruntime/test/providers/cpu/rnn/LSTM.py b/onnxruntime/test/providers/cpu/rnn/LSTM.py index 116ec3671bf01..49e28a93385a4 100644 --- a/onnxruntime/test/providers/cpu/rnn/LSTM.py +++ b/onnxruntime/test/providers/cpu/rnn/LSTM.py @@ -124,7 +124,7 @@ def run(self): output = np.empty((0, 2, batch_size, hidden_size), np.float32) # Y_h = np.empty((0, 2, batch_size, hidden_size), np.float32) # Y_c = np.empty((0, 2, hidden_size, hidden_size), np.float32) - for x in range(0, seq_length): + for x in range(seq_length): output = np.append(output, f_output[x]) output = np.append(output, r_output_orig_input_order[x]) diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index 03e29dd64f8a7..291bf42405d58 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -222,12 +222,8 @@ def verify_quantize_with_pad_mode( activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_i8_path = "qop_pad_{}_i8_{}{}_{}{}.onnx".format( - quantize_mode, - tag_pad_mode, - tag_constant_value, - activation_type_str, - weight_type_str, + model_i8_path = ( + f"qop_pad_{quantize_mode}_i8_{tag_pad_mode}{tag_constant_value}_{activation_type_str}{weight_type_str}.onnx" ) data_reader.rewind() self.quantize_model( diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py index 765825d4b86e3..97931acf03f42 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py @@ -122,9 +122,11 @@ def test_quantize_blockwise_4bits(self): dequantize_blockwise_4bits( quant_value_ref[c, k], scales_ref[c, k], - (zero_point_ref[c, k // 2] >> 4) - if (k & 1) - else (zero_point_ref[c, k // 2] & 0x0F), + ( + (zero_point_ref[c, k // 2] >> 4) + if (k 
& 1) + else (zero_point_ref[c, k // 2] & 0x0F) + ), min(block_size, rows - k * block_size), ), dequantize_blockwise_4bits( diff --git a/onnxruntime/test/python/transformers/bert_model_generator.py b/onnxruntime/test/python/transformers/bert_model_generator.py index 9b9409545615b..a84137f092e64 100644 --- a/onnxruntime/test/python/transformers/bert_model_generator.py +++ b/onnxruntime/test/python/transformers/bert_model_generator.py @@ -94,12 +94,16 @@ def create_bert_attention( perm=[0, 2, 3, 1], ), # mask nodes - helper.make_node("Unsqueeze", ["input_mask", "axes_1"], ["unsqueeze0_out"], "unsqueeze0") - if has_unsqueeze_two_inputs - else helper.make_node("Unsqueeze", ["input_mask"], ["unsqueeze0_out"], "unsqueeze0", axes=[1]), - helper.make_node("Unsqueeze", ["unsqueeze0_out", "axes_2"], ["unsqueeze1_out"], "unsqueeze1") - if has_unsqueeze_two_inputs - else helper.make_node("Unsqueeze", ["unsqueeze0_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[2]), + ( + helper.make_node("Unsqueeze", ["input_mask", "axes_1"], ["unsqueeze0_out"], "unsqueeze0") + if has_unsqueeze_two_inputs + else helper.make_node("Unsqueeze", ["input_mask"], ["unsqueeze0_out"], "unsqueeze0", axes=[1]) + ), + ( + helper.make_node("Unsqueeze", ["unsqueeze0_out", "axes_2"], ["unsqueeze1_out"], "unsqueeze1") + if has_unsqueeze_two_inputs + else helper.make_node("Unsqueeze", ["unsqueeze0_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[2]) + ), # when attention_mask is float type, no need to cast helper.make_node("Cast", ["unsqueeze1_out"], ["cast_out"], "cast", to=1) if not use_float_mask else None, helper.make_node( @@ -291,9 +295,11 @@ def create_tf2onnx_attention_3d(input_hidden_size=16, num_heads=4, head_size=4, helper.make_node("Add", ["einsum_k_out", "add_k_weight"], ["add_k_out"], "add_k"), helper.make_node("Mul", ["add_k_out", "mul_weight_1"], ["mul_k_out"], "mul_k"), # mask nodes - helper.make_node("Unsqueeze", ["input_mask", "axes_1"], ["unsqueeze0_out"], "unsqueeze0") - if has_unsqueeze_two_inputs - else helper.make_node("Unsqueeze", ["input_mask"], ["unsqueeze0_out"], "unsqueeze0", axes=[1, 2]), + ( + helper.make_node("Unsqueeze", ["input_mask", "axes_1"], ["unsqueeze0_out"], "unsqueeze0") + if has_unsqueeze_two_inputs + else helper.make_node("Unsqueeze", ["input_mask"], ["unsqueeze0_out"], "unsqueeze0", axes=[1, 2]) + ), helper.make_node( "Slice", ["unsqueeze0_out", "slice_start", "slice_end", "slice_axes", "slice_steps"], diff --git a/onnxruntime/test/python/transformers/conformer_model_generator.py b/onnxruntime/test/python/transformers/conformer_model_generator.py index 71e4f2b63cf4f..5b27a46ea0fdc 100644 --- a/onnxruntime/test/python/transformers/conformer_model_generator.py +++ b/onnxruntime/test/python/transformers/conformer_model_generator.py @@ -22,9 +22,7 @@ def get_tensor_and_weight(name: str, shape: List[int], random=False, zeros=False weights = ( [np.random.uniform(low, high) for _ in range(total_elements)] if random - else [0.0] * total_elements - if zeros - else [1.0] * total_elements + else [0.0] * total_elements if zeros else [1.0] * total_elements ) return helper.make_tensor(name, TensorProto.FLOAT, shape, weights), weights diff --git a/onnxruntime/test/python/transformers/gpt2_model_generator.py b/onnxruntime/test/python/transformers/gpt2_model_generator.py index 4a1b48d4d1b48..0865c87b70da7 100644 --- a/onnxruntime/test/python/transformers/gpt2_model_generator.py +++ b/onnxruntime/test/python/transformers/gpt2_model_generator.py @@ -41,15 +41,17 @@ def create_gpt2_attention(hidden_size=64, 
num_heads=4, max_seq_len=32, switch_ad ["fc_out"], "add_fc", ), - helper.make_node("Split", ["fc_out", "split_q_k_v"], ["q", "k", "v"], "split_qkv", axis=2) - if is_opset_13_or_newer - else helper.make_node( - "Split", - ["fc_out"], - ["q", "k", "v"], - "split_qkv", - axis=2, - split=[hidden_size, hidden_size, hidden_size], + ( + helper.make_node("Split", ["fc_out", "split_q_k_v"], ["q", "k", "v"], "split_qkv", axis=2) + if is_opset_13_or_newer + else helper.make_node( + "Split", + ["fc_out"], + ["q", "k", "v"], + "split_qkv", + axis=2, + split=[hidden_size, hidden_size, hidden_size], + ) ), # q nodes helper.make_node("Reshape", ["q", "reshape_x_shape"], ["reshape_q_out"], "reshape_q"), @@ -79,19 +81,23 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad perm=[0, 2, 1, 3], ), # past - helper.make_node("Split", ["past", "split_1_1"], ["split_k", "split_v"], "split_past", axis=0) - if is_opset_13_or_newer - else helper.make_node( - "Split", - ["past"], - ["split_k", "split_v"], - "split_past", - axis=0, - split=[1, 1], + ( + helper.make_node("Split", ["past", "split_1_1"], ["split_k", "split_v"], "split_past", axis=0) + if is_opset_13_or_newer + else helper.make_node( + "Split", + ["past"], + ["split_k", "split_v"], + "split_past", + axis=0, + split=[1, 1], + ) + ), + ( + helper.make_node("Squeeze", ["split_k", "axes_0"], ["past_k"], "squeeze_past_k") + if is_opset_13_or_newer + else helper.make_node("Squeeze", ["split_k"], ["past_k"], "squeeze_past_k", axes=[0]) ), - helper.make_node("Squeeze", ["split_k", "axes_0"], ["past_k"], "squeeze_past_k") - if is_opset_13_or_newer - else helper.make_node("Squeeze", ["split_k"], ["past_k"], "squeeze_past_k", axes=[0]), helper.make_node( "Concat", ["past_k", "transpose_k_out"], @@ -106,9 +112,11 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad "transpose_concat_k", perm=[0, 1, 3, 2], ), - helper.make_node("Squeeze", ["split_v", "axes_0"], ["past_v"], "squeeze_past_v") - if is_opset_13_or_newer - else helper.make_node("Squeeze", ["split_v"], ["past_v"], "squeeze_past_v", axes=[0]), + ( + helper.make_node("Squeeze", ["split_v", "axes_0"], ["past_v"], "squeeze_past_v") + if is_opset_13_or_newer + else helper.make_node("Squeeze", ["split_v"], ["past_v"], "squeeze_past_v", axes=[0]) + ), helper.make_node( "Concat", ["past_v", "transpose_v_out"], @@ -117,33 +125,37 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad axis=-2, ), # present - helper.make_node( - "Unsqueeze", - ["concat_k_out", "axes_0"], - ["concat_k_unsqueeze_out"], - "concat_k_unsqueeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["concat_k_out"], - ["concat_k_unsqueeze_out"], - "concat_k_unsqueeze", - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["concat_k_out", "axes_0"], + ["concat_k_unsqueeze_out"], + "concat_k_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["concat_k_out"], + ["concat_k_unsqueeze_out"], + "concat_k_unsqueeze", + axes=[0], + ) ), - helper.make_node( - "Unsqueeze", - ["concat_v_out", "axes_0"], - ["concat_v_unsqueeze_out"], - "concat_v_unsqueeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["concat_v_out"], - ["concat_v_unsqueeze_out"], - "concat_v_unsqueeze", - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["concat_v_out", "axes_0"], + ["concat_v_unsqueeze_out"], + "concat_v_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + 
["concat_v_out"], + ["concat_v_unsqueeze_out"], + "concat_v_unsqueeze", + axes=[0], + ) ), helper.make_node( "Concat", @@ -159,19 +171,21 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ["transpose_q_shape_slice_out"], "transpose_q_shape_slice", ), - helper.make_node( - "Squeeze", - ["transpose_q_shape_slice_out", "axes_0"], - ["transpose_q_shape_slice_squeeze_out"], - "transpose_q_shape_slice_squeeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Squeeze", - ["transpose_q_shape_slice_out"], - ["transpose_q_shape_slice_squeeze_out"], - "transpose_q_shape_slice_squeeze", - axes=[0], + ( + helper.make_node( + "Squeeze", + ["transpose_q_shape_slice_out", "axes_0"], + ["transpose_q_shape_slice_squeeze_out"], + "transpose_q_shape_slice_squeeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Squeeze", + ["transpose_q_shape_slice_out"], + ["transpose_q_shape_slice_squeeze_out"], + "transpose_q_shape_slice_squeeze", + axes=[0], + ) ), helper.make_node("Shape", ["concat_k_out"], ["concat_k_shape_out"], "concat_k_shape"), helper.make_node( @@ -180,19 +194,21 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ["concat_k_shape_slice_out"], "concat_k_shape_slice", ), - helper.make_node( - "Squeeze", - ["concat_k_shape_slice_out", "axes_0"], - ["concat_k_shape_slice_squeeze_out"], - "concat_k_shape_slice_squeeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Squeeze", - ["concat_k_shape_slice_out"], - ["concat_k_shape_slice_squeeze_out"], - "concat_k_shape_slice_squeeze", - axes=[0], + ( + helper.make_node( + "Squeeze", + ["concat_k_shape_slice_out", "axes_0"], + ["concat_k_shape_slice_squeeze_out"], + "concat_k_shape_slice_squeeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Squeeze", + ["concat_k_shape_slice_out"], + ["concat_k_shape_slice_squeeze_out"], + "concat_k_shape_slice_squeeze", + axes=[0], + ) ), helper.make_node( "Sub", @@ -200,22 +216,26 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ["sub_out"], "sub", ), - helper.make_node("Unsqueeze", ["sub_out", "axes_0"], ["sub_unsqueeze_out"], "sub_unsqueeze") - if is_opset_13_or_newer - else helper.make_node("Unsqueeze", ["sub_out"], ["sub_unsqueeze_out"], "sub_unsqueeze", axes=[0]), - helper.make_node( - "Unsqueeze", - ["concat_k_shape_slice_squeeze_out", "axes_0"], - ["concat_k_shape_slice_squeeze_unsqueeze_out"], - "concat_k_shape_slice_squeeze_unsqueeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["concat_k_shape_slice_squeeze_out"], - ["concat_k_shape_slice_squeeze_unsqueeze_out"], - "concat_k_shape_slice_squeeze_unsqueeze", - axes=[0], + ( + helper.make_node("Unsqueeze", ["sub_out", "axes_0"], ["sub_unsqueeze_out"], "sub_unsqueeze") + if is_opset_13_or_newer + else helper.make_node("Unsqueeze", ["sub_out"], ["sub_unsqueeze_out"], "sub_unsqueeze", axes=[0]) + ), + ( + helper.make_node( + "Unsqueeze", + ["concat_k_shape_slice_squeeze_out", "axes_0"], + ["concat_k_shape_slice_squeeze_unsqueeze_out"], + "concat_k_shape_slice_squeeze_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["concat_k_shape_slice_squeeze_out"], + ["concat_k_shape_slice_squeeze_unsqueeze_out"], + "concat_k_shape_slice_squeeze_unsqueeze", + axes=[0], + ) ), helper.make_node( "Slice", @@ -255,23 +275,27 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ["input_mask_reshape_out"], "input_mask_reshape", ), - helper.make_node( 
- "Unsqueeze", - ["input_mask_reshape_out", "axes_1"], - ["unsqueeze0_out"], - "unsqueeze0", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["input_mask_reshape_out"], - ["unsqueeze0_out"], - "unsqueeze0", - axes=[1], + ( + helper.make_node( + "Unsqueeze", + ["input_mask_reshape_out", "axes_1"], + ["unsqueeze0_out"], + "unsqueeze0", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["input_mask_reshape_out"], + ["unsqueeze0_out"], + "unsqueeze0", + axes=[1], + ) + ), + ( + helper.make_node("Unsqueeze", ["unsqueeze0_out", "axes_2"], ["unsqueeze1_out"], "unsqueeze1") + if is_opset_13_or_newer + else helper.make_node("Unsqueeze", ["unsqueeze0_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[2]) ), - helper.make_node("Unsqueeze", ["unsqueeze0_out", "axes_2"], ["unsqueeze1_out"], "unsqueeze1") - if is_opset_13_or_newer - else helper.make_node("Unsqueeze", ["unsqueeze0_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[2]), helper.make_node("Sub", ["sub_weight", "unsqueeze1_out"], ["mask_sub_out"], "sub_mask"), helper.make_node("Mul", ["mask_sub_out", "mul_weight"], ["mul_mask_out"], "mul_mask"), # qk nodes @@ -322,33 +346,37 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ["qkv_shape_slice_out"], "qkv_shape_slice", ), - helper.make_node( - "Squeeze", - ["qkv_shape_slice_out", "axes_0"], - ["qkv_shape_slice_squeeze_out"], - "qkv_shape_slice_squeeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Squeeze", - ["qkv_shape_slice_out"], - ["qkv_shape_slice_squeeze_out"], - "qkv_shape_slice_squeeze", - axes=[0], + ( + helper.make_node( + "Squeeze", + ["qkv_shape_slice_out", "axes_0"], + ["qkv_shape_slice_squeeze_out"], + "qkv_shape_slice_squeeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Squeeze", + ["qkv_shape_slice_out"], + ["qkv_shape_slice_squeeze_out"], + "qkv_shape_slice_squeeze", + axes=[0], + ) ), - helper.make_node( - "Unsqueeze", - ["qkv_shape_slice_squeeze_out", "axes_0"], - ["qkv_shape_slice_squeeze_unsqueeze_out"], - "qkv_shape_slice_squeeze_unsqueeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["qkv_shape_slice_squeeze_out"], - ["qkv_shape_slice_squeeze_unsqueeze_out"], - "qkv_shape_slice_squeeze_unsqueeze", - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["qkv_shape_slice_squeeze_out", "axes_0"], + ["qkv_shape_slice_squeeze_unsqueeze_out"], + "qkv_shape_slice_squeeze_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["qkv_shape_slice_squeeze_out"], + ["qkv_shape_slice_squeeze_unsqueeze_out"], + "qkv_shape_slice_squeeze_unsqueeze", + axes=[0], + ) ), helper.make_node( "Concat", @@ -387,33 +415,37 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad "shape_qkv_gather_0", axis=0, ), - helper.make_node( - "Unsqueeze", - ["qkv_shape_1", "axes_0"], - ["qkv_shape_1_unsqueeze_out"], - "qkv_shape_1_unsqueeze", - ) - if is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["qkv_shape_1"], - ["qkv_shape_1_unsqueeze_out"], - "qkv_shape_1_unsqueeze", - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["qkv_shape_1", "axes_0"], + ["qkv_shape_1_unsqueeze_out"], + "qkv_shape_1_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["qkv_shape_1"], + ["qkv_shape_1_unsqueeze_out"], + "qkv_shape_1_unsqueeze", + axes=[0], + ) ), - helper.make_node( - "Unsqueeze", - ["qkv_shape_0", "axes_0"], - ["qkv_shape_0_unsqueeze_out"], - "qkv_shape_0_unsqueeze", - ) - if 
is_opset_13_or_newer - else helper.make_node( - "Unsqueeze", - ["qkv_shape_0"], - ["qkv_shape_0_unsqueeze_out"], - "qkv_shape_0_unsqueeze", - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["qkv_shape_0", "axes_0"], + ["qkv_shape_0_unsqueeze_out"], + "qkv_shape_0_unsqueeze", + ) + if is_opset_13_or_newer + else helper.make_node( + "Unsqueeze", + ["qkv_shape_0"], + ["qkv_shape_0_unsqueeze_out"], + "qkv_shape_0_unsqueeze", + axes=[0], + ) ), helper.make_node( "Concat", @@ -767,9 +799,11 @@ def create_gpt2_fused_embedlayer( "", "ids", ], - ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index", "embedding_sum"] - if output_embedding_sum - else ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index"], + ( + ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index", "embedding_sum"] + if output_embedding_sum + else ["EmbedLayerNormalization_0_output", "EmbedLayerNormalization_0_dummy_mask_index"] + ), "EmbedLayerNormalization_0", domain="com.microsoft", epsilon=epsilon, diff --git a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py index af835d2906e87..fd1d58cd2a3b8 100644 --- a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py +++ b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py @@ -59,39 +59,41 @@ def create_moe_onnx_graph( ): use_sharded_moe = local_experts_start_index >= 0 nodes = [ - helper.make_node( - "MoE", - [ - "input", - "router_probs", - "fc1_experts_weights", - "fc2_experts_weights", - "fc1_experts_bias", - "fc2_experts_bias", - ], - ["output"], - "MoE_0", - k=1, - activation_type="gelu", - domain="com.microsoft", - ) - if not use_sharded_moe - else helper.make_node( - "ShardedMoE", - [ - "input", - "router_probs", - "fc1_experts_weights", - "fc2_experts_weights", - "fc1_experts_bias", - "fc2_experts_bias", - ], - ["output"], - "MoE_0", - k=1, - activation_type="gelu", - local_experts_start_index=local_experts_start_index, - domain="com.microsoft", + ( + helper.make_node( + "MoE", + [ + "input", + "router_probs", + "fc1_experts_weights", + "fc2_experts_weights", + "fc1_experts_bias", + "fc2_experts_bias", + ], + ["output"], + "MoE_0", + k=1, + activation_type="gelu", + domain="com.microsoft", + ) + if not use_sharded_moe + else helper.make_node( + "ShardedMoE", + [ + "input", + "router_probs", + "fc1_experts_weights", + "fc2_experts_weights", + "fc1_experts_bias", + "fc2_experts_bias", + ], + ["output"], + "MoE_0", + k=1, + activation_type="gelu", + local_experts_start_index=local_experts_start_index, + domain="com.microsoft", + ) ), ] diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index c42c42c3ca170..0086ce0d289c7 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -403,9 +403,7 @@ def generate_test_data( evalTime = timeit.default_timer() - start_time # noqa: N806 if outputs[0].tolist() != result[0].tolist(): print( - "Error: not same result after optimization. 
use_cpu={}, no_opt_output={}, opt_output={}".format( - use_cpu, result[0].tolist(), outputs[1].tolist() - ) + f"Error: not same result after optimization. use_cpu={use_cpu}, no_opt_output={result[0].tolist()}, opt_output={outputs[1].tolist()}" ) print(f"** Evaluation done in total {evalTime} secs") diff --git a/onnxruntime/test/python/transformers/test_flash_attn.py b/onnxruntime/test/python/transformers/test_flash_attn.py index 90d28872d3cc8..b784c83329c76 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn.py +++ b/onnxruntime/test/python/transformers/test_flash_attn.py @@ -229,9 +229,11 @@ def create_group_query_attention_graph_prompt( [ config.batch_size, config.q_sequence_length, - (config.num_heads * config.head_size) - if not packed - else (config.num_heads * config.head_size + 2 * config.kv_num_heads * config.head_size), + ( + (config.num_heads * config.head_size) + if not packed + else (config.num_heads * config.head_size + 2 * config.kv_num_heads * config.head_size) + ), ], ), helper.make_tensor_value_info( @@ -415,9 +417,11 @@ def create_group_query_attention_graph_past( [ config.batch_size, config.sequence_length, - (config.num_heads * config.head_size) - if not packed - else (config.num_heads * config.head_size + 2 * config.kv_num_heads * config.head_size), + ( + (config.num_heads * config.head_size) + if not packed + else (config.num_heads * config.head_size + 2 * config.kv_num_heads * config.head_size) + ), ], ), helper.make_tensor_value_info( diff --git a/onnxruntime/test/python/transformers/whisper_model_generator.py b/onnxruntime/test/python/transformers/whisper_model_generator.py index 71d1a4cbdceeb..a57b45cbc5ea3 100644 --- a/onnxruntime/test/python/transformers/whisper_model_generator.py +++ b/onnxruntime/test/python/transformers/whisper_model_generator.py @@ -22,9 +22,7 @@ def get_tensor_and_weight(name: str, shape: List[int], random=False, zeros=False weights = ( [np.random.uniform(low, high) for _ in range(total_elements)] if random - else [0.0] * total_elements - if zeros - else [1.0] * total_elements + else [0.0] * total_elements if zeros else [1.0] * total_elements ) return helper.make_tensor(name, TensorProto.FLOAT, shape, weights), weights diff --git a/onnxruntime/test/testdata/custom_op_library/custom_op_test_float8.py b/onnxruntime/test/testdata/custom_op_library/custom_op_test_float8.py index 84cf71455f84a..6db8e8fe660f8 100644 --- a/onnxruntime/test/testdata/custom_op_library/custom_op_test_float8.py +++ b/onnxruntime/test/testdata/custom_op_library/custom_op_test_float8.py @@ -1,6 +1,7 @@ """ This file was used to generate model `custom_op_test_float8.py`. 
""" + from onnx import TensorProto from onnx.checker import check_model from onnx.helper import make_graph, make_model, make_node, make_opsetid, make_tensor_value_info diff --git a/onnxruntime/test/testdata/test_data_generation/adamw_test/adamw_test_data_generator.py b/onnxruntime/test/testdata/test_data_generation/adamw_test/adamw_test_data_generator.py index 4c1e3a70de1c7..443444044bb8d 100644 --- a/onnxruntime/test/testdata/test_data_generation/adamw_test/adamw_test_data_generator.py +++ b/onnxruntime/test/testdata/test_data_generation/adamw_test/adamw_test_data_generator.py @@ -190,7 +190,7 @@ def main(): device_candidates = ["cuda", "cpu"] test_data_step_count = 11 for device in device_candidates: - for adam_mode in range(0, 2): + for adam_mode in range(2): generate_adamw_single_weight_tests(adam_mode, test_data_step_count, device) generate_adamw_multiple_weights_tests(adam_mode, test_data_step_count, device) diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py index ed06495b42beb..54fe7b808bf12 100644 --- a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py @@ -21,19 +21,21 @@ def GenerateNodes(model_name, has_cast, suffix=""): # noqa: N802 ["gather0_out" + suffix], "gather0" + suffix, ), - helper.make_node( - "Unsqueeze", - ["gather0_out" + suffix, "axes_0"], - ["unsqueeze0_out" + suffix], - "unsqueeze0" + suffix, - ) - if opset_version == 13 - else helper.make_node( - "Unsqueeze", - ["gather0_out" + suffix], - ["unsqueeze0_out" + suffix], - "unsqueeze0" + suffix, - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["gather0_out" + suffix, "axes_0"], + ["unsqueeze0_out" + suffix], + "unsqueeze0" + suffix, + ) + if opset_version == 13 + else helper.make_node( + "Unsqueeze", + ["gather0_out" + suffix], + ["unsqueeze0_out" + suffix], + "unsqueeze0" + suffix, + axes=[0], + ) ), helper.make_node("Shape", ["input_ids" + suffix], ["shape2_out" + suffix], "shape2" + suffix), helper.make_node( @@ -42,19 +44,21 @@ def GenerateNodes(model_name, has_cast, suffix=""): # noqa: N802 ["gather1_out" + suffix], "gather1" + suffix, ), - helper.make_node( - "Unsqueeze", - ["gather1_out" + suffix, "axes_0"], - ["unsqueeze1_out" + suffix], - "unsqueeze1" + suffix, - ) - if opset_version == 13 - else helper.make_node( - "Unsqueeze", - ["gather1_out" + suffix], - ["unsqueeze1_out" + suffix], - "unsqueeze1" + suffix, - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["gather1_out" + suffix, "axes_0"], + ["unsqueeze1_out" + suffix], + "unsqueeze1" + suffix, + ) + if opset_version == 13 + else helper.make_node( + "Unsqueeze", + ["gather1_out" + suffix], + ["unsqueeze1_out" + suffix], + "unsqueeze1" + suffix, + axes=[0], + ) ), helper.make_node( "Concat", @@ -80,19 +84,21 @@ def GenerateNodes(model_name, has_cast, suffix=""): # noqa: N802 ["range_out" + suffix], "range" + suffix, ), - helper.make_node( - "Unsqueeze", - ["range_out" + suffix, "axes_0"], - ["unsqueeze2_out" + suffix], - "unsqueeze2" + suffix, - ) - if opset_version == 13 - else helper.make_node( - "Unsqueeze", - ["range_out" + suffix], - ["unsqueeze2_out" + suffix], - "unsqueeze2" + suffix, - axes=[0], + ( + helper.make_node( + "Unsqueeze", + ["range_out" + suffix, "axes_0"], + ["unsqueeze2_out" + suffix], + "unsqueeze2" + suffix, + ) + if opset_version == 13 + else helper.make_node( + "Unsqueeze", + ["range_out" + suffix], + ["unsqueeze2_out" + suffix], + 
"unsqueeze2" + suffix, + axes=[0], + ) ), helper.make_node( "Expand", @@ -145,21 +151,23 @@ def GenerateNodes(model_name, has_cast, suffix=""): # noqa: N802 "mask_cast" + suffix, to=6, ), - helper.make_node( - "ReduceSum", - ["mask_cast_out" + suffix, "axes_1"], - ["mask_index_out" + suffix], - "mask_index" + suffix, - keepdims=0, - ) - if opset_version == 13 - else helper.make_node( - "ReduceSum", - ["mask_cast_out" + suffix], - ["mask_index_out" + suffix], - "mask_index" + suffix, - axes=[1], - keepdims=0, + ( + helper.make_node( + "ReduceSum", + ["mask_cast_out" + suffix, "axes_1"], + ["mask_index_out" + suffix], + "mask_index" + suffix, + keepdims=0, + ) + if opset_version == 13 + else helper.make_node( + "ReduceSum", + ["mask_cast_out" + suffix], + ["mask_index_out" + suffix], + "mask_index" + suffix, + axes=[1], + keepdims=0, + ) ), helper.make_node( "Attention", @@ -372,21 +380,23 @@ def GenerateModel5(model_name): # noqa: N802 epsion=0.000009999999747378752, ), helper.make_node("Cast", ["input_mask"], ["mask_cast_out"], "mask_cast", to=6), - helper.make_node( - "ReduceSum", - ["mask_cast_out", "axes_1"], - ["mask_index_out"], - "mask_index", - keepdims=0, - ) - if opset_version == 13 - else helper.make_node( - "ReduceSum", - ["mask_cast_out"], - ["mask_index_out"], - "mask_index", - axes=[1], - keepdims=0, + ( + helper.make_node( + "ReduceSum", + ["mask_cast_out", "axes_1"], + ["mask_index_out"], + "mask_index", + keepdims=0, + ) + if opset_version == 13 + else helper.make_node( + "ReduceSum", + ["mask_cast_out"], + ["mask_index_out"], + "mask_index", + axes=[1], + keepdims=0, + ) ), helper.make_node( "Attention", @@ -514,14 +524,18 @@ def GenerateModel6(model_name): # noqa: N802 nodes = [ # LayerNorm subgraph helper.make_node("Shape", ["input_ids"], ["shape1_out"], "shape1"), helper.make_node("Gather", ["shape1_out", "indices_0"], ["gather0_out"], "gather0"), - helper.make_node("Unsqueeze", ["gather0_out", "axes_0"], ["unsqueeze0_out"], "unsqueeze0") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["gather0_out"], ["unsqueeze0_out"], "unsqueeze0", axes=[0]), + ( + helper.make_node("Unsqueeze", ["gather0_out", "axes_0"], ["unsqueeze0_out"], "unsqueeze0") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["gather0_out"], ["unsqueeze0_out"], "unsqueeze0", axes=[0]) + ), helper.make_node("Shape", ["input_ids"], ["shape2_out"], "shape2"), helper.make_node("Gather", ["shape2_out", "indices_1"], ["gather1_out"], "gather1"), - helper.make_node("Unsqueeze", ["gather1_out", "axes_0"], ["unsqueeze1_out"], "unsqueeze1") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["gather1_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[0]), + ( + helper.make_node("Unsqueeze", ["gather1_out", "axes_0"], ["unsqueeze1_out"], "unsqueeze1") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["gather1_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[0]) + ), helper.make_node( "Concat", ["unsqueeze0_out", "unsqueeze1_out"], @@ -533,9 +547,11 @@ def GenerateModel6(model_name): # noqa: N802 helper.make_node("Equal", ["reshape_out", "equal_init"], ["equal_out"], "equal"), helper.make_node("Where", ["equal_out", "where_init", "reshape_out"], ["where_out"], "where"), helper.make_node("Range", ["start_0", "gather1_out", "delta_1"], ["range_out"], "range"), - helper.make_node("Unsqueeze", ["range_out", "axes_0"], ["unsqueeze2_out"], "unsqueeze2") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["range_out"], ["unsqueeze2_out"], "unsqueeze2", axes=[0]), + ( 
+ helper.make_node("Unsqueeze", ["range_out", "axes_0"], ["unsqueeze2_out"], "unsqueeze2") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["range_out"], ["unsqueeze2_out"], "unsqueeze2", axes=[0]) + ), helper.make_node("Expand", ["unsqueeze2_out", "where_out"], ["expand_out"], "expand"), helper.make_node("Gather", ["pos_embed", "expand_out"], ["pos_gather_out"], "pos_gather"), helper.make_node("Gather", ["word_embed", "input_ids"], ["word_gather_out"], "word_gather"), @@ -556,21 +572,23 @@ def GenerateModel6(model_name): # noqa: N802 epsion=0.000009999999747378752, ), helper.make_node("Cast", ["input_mask"], ["mask_cast_out"], "mask_cast", to=6), - helper.make_node( - "ReduceSum", - ["mask_cast_out", "axes_1"], - ["mask_index_out"], - "mask_index", - keepdims=0, - ) - if opset_version == 13 - else helper.make_node( - "ReduceSum", - ["mask_cast_out"], - ["mask_index_out"], - "mask_index", - axes=[1], - keepdims=0, + ( + helper.make_node( + "ReduceSum", + ["mask_cast_out", "axes_1"], + ["mask_index_out"], + "mask_index", + keepdims=0, + ) + if opset_version == 13 + else helper.make_node( + "ReduceSum", + ["mask_cast_out"], + ["mask_index_out"], + "mask_index", + axes=[1], + keepdims=0, + ) ), helper.make_node( "Attention", @@ -756,9 +774,11 @@ def GenerateNodes2(attention_heads): # noqa: N802 helper.make_node("Shape", ["input_ids"], ["shape0_out"], "shape0"), helper.make_node("Gather", ["shape0_out", "indices_1"], ["gather0_out"], "gather0"), helper.make_node("Range", ["start", "gather0_out", "delta"], ["range0_out"], "range0"), - helper.make_node("Unsqueeze", ["range0_out", "axes_0"], ["unsqueeze0_out"], "unsqueeze0") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["range0_out"], ["unsqueeze0_out"], "unsqueeze0", axes=[0]), + ( + helper.make_node("Unsqueeze", ["range0_out", "axes_0"], ["unsqueeze0_out"], "unsqueeze0") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["range0_out"], ["unsqueeze0_out"], "unsqueeze0", axes=[0]) + ), helper.make_node("Shape", ["input_ids"], ["shape1_out"], "shape1"), helper.make_node("Expand", ["unsqueeze0_out", "shape1_out"], ["expand_out"], "expand"), helper.make_node( @@ -778,21 +798,23 @@ def GenerateNodes2(attention_heads): # noqa: N802 epsion=0.000009999999747378752, ), helper.make_node("Cast", ["input_mask"], ["mask_cast_out"], "mask_cast", to=6), - helper.make_node( - "ReduceSum", - ["mask_cast_out", "axes_1"], - ["mask_index_out"], - "mask_index", - keepdims=0, - ) - if opset_version == 13 - else helper.make_node( - "ReduceSum", - ["mask_cast_out"], - ["mask_index_out"], - "mask_index", - axes=[1], - keepdims=0, + ( + helper.make_node( + "ReduceSum", + ["mask_cast_out", "axes_1"], + ["mask_index_out"], + "mask_index", + keepdims=0, + ) + if opset_version == 13 + else helper.make_node( + "ReduceSum", + ["mask_cast_out"], + ["mask_index_out"], + "mask_index", + axes=[1], + keepdims=0, + ) ), helper.make_node( "Attention", @@ -898,12 +920,16 @@ def GenerateModel9(model_name): # noqa: N802 helper.make_node("Expand", ["unsqueeze0_out", "shape_out"], ["expand_out"], "expand"), helper.make_node("Gather", ["shape_out", "indices_0"], ["gather1_out"], "gather1"), helper.make_node("Gather", ["shape_out", "indices_1"], ["gather2_out"], "gather2"), - helper.make_node("Unsqueeze", ["gather1_out", "axes_0"], ["unsqueeze1_out"], "unsqueeze1") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["gather1_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[0]), - helper.make_node("Unsqueeze", ["gather2_out", "axes_0"], 
["unsqueeze2_out"], "unsqueeze2") - if opset_version == 13 - else helper.make_node("Unsqueeze", ["gather2_out"], ["unsqueeze2_out"], "unsqueeze2", axes=[0]), + ( + helper.make_node("Unsqueeze", ["gather1_out", "axes_0"], ["unsqueeze1_out"], "unsqueeze1") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["gather1_out"], ["unsqueeze1_out"], "unsqueeze1", axes=[0]) + ), + ( + helper.make_node("Unsqueeze", ["gather2_out", "axes_0"], ["unsqueeze2_out"], "unsqueeze2") + if opset_version == 13 + else helper.make_node("Unsqueeze", ["gather2_out"], ["unsqueeze2_out"], "unsqueeze2", axes=[0]) + ), helper.make_node( "Concat", ["unsqueeze1_out", "unsqueeze2_out"], diff --git a/orttraining/orttraining/python/training/__init__.py b/orttraining/orttraining/python/training/__init__.py index a3c22686a1039..1da95dff94f9f 100644 --- a/orttraining/orttraining/python/training/__init__.py +++ b/orttraining/orttraining/python/training/__init__.py @@ -23,9 +23,9 @@ try: if is_ortmodule_available(): - from .ortmodule import ORTModule # noqa: F401 + from .ortmodule import ORTModule - __all__.append("ORTModule") + __all__ += ["ORTModule"] except ImportError: # That is OK iff this is not a ORTModule training package pass diff --git a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py index d7bbd249a000e..ff128c4da4259 100644 --- a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py +++ b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py @@ -15,7 +15,6 @@ class ApexAMPModifier(FP16OptimizerModifier): def __init__(self, optimizer, **kwargs) -> None: super().__init__(optimizer) - pass def can_be_modified(self): return self.check_requirements( diff --git a/orttraining/orttraining/python/training/ort_triton/_lowering.py b/orttraining/orttraining/python/training/ort_triton/_lowering.py index 5c848d2cecc58..4b580a0cc86de 100644 --- a/orttraining/orttraining/python/training/ort_triton/_lowering.py +++ b/orttraining/orttraining/python/training/ort_triton/_lowering.py @@ -312,7 +312,7 @@ def _group_nodes(self): for j in range(i + 1, len(groups)): if any(output in group_inputs for output in groups[j].nodes_groups[0].output): group_dependencies[i].add(j) - for k in range(0, i): + for k in range(i): if i in group_dependencies[k]: group_dependencies[k].add(j) diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py b/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py index 03bb0f4373d8d..f7b7c1ff08300 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py @@ -694,7 +694,7 @@ def _bwd_kernel( LSE += off_hb * seqlen_q_rounded if not SEQUENCE_PARALLEL: num_block_n = tl.cdiv(seqlen_k, BLOCK_N) - for start_n in range(0, num_block_n): + for start_n in range(num_block_n): _bwd_kernel_one_col_block( start_n, Q, diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py b/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py index 12780016a9ab1..871d3fff8ce3f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py @@ -10,8 +10,6 @@ class ORTModuleFallbackException(Exception): # noqa: N818 it can also be used for generic exception that require fallback """ - pass - class 
ORTModuleInitException(ORTModuleFallbackException): """Trigger fallback for ORTModule initialization related exceptions @@ -20,8 +18,6 @@ class ORTModuleInitException(ORTModuleFallbackException): including PyTorch version, missing ORTModule's PyTorch C++ extension binaries, etc. """ - pass - class ORTModuleDeviceException(ORTModuleFallbackException): """Trigger fallback for device related exceptions @@ -31,8 +27,6 @@ class ORTModuleDeviceException(ORTModuleFallbackException): This exception does not capture these scenarios. """ - pass - class ORTModuleIOError(ORTModuleFallbackException): """Trigger fallback for I/O related exceptions @@ -42,8 +36,6 @@ class ORTModuleIOError(ORTModuleFallbackException): This exception does not capture these scenarios. """ - pass - class ORTModuleTorchModelException(ORTModuleFallbackException): """Trigger fallback for PyTorch modules related exceptions @@ -52,8 +44,6 @@ class ORTModuleTorchModelException(ORTModuleFallbackException): checking type(model) over a hardcoded list of incompatible models. """ - pass - class ORTModuleONNXModelException(ORTModuleFallbackException): """Trigger fallback for ONNX model related exceptions @@ -61,8 +51,6 @@ class ORTModuleONNXModelException(ORTModuleFallbackException): This exception is raised during model conversion to ONNX and post-processing validation within ORTModule frontend. """ - pass - def wrap_exception( new_exception: ORTModuleFallbackException, raised_exception: Exception diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index c67b05758c5aa..568c92b71277f 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -188,7 +188,6 @@ def forward(self): This is an abstract method and must be overridden by a concrete implementation. This is the only method that the user should call on a concrete instance of the ExecutionManager All other methods are internal""" - pass def _build_graph(self, config): if self._runtime_options.use_static_shape: @@ -412,9 +411,9 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu # From some PyTorch version, autograd_inlining is a valid argument. # We allow it to be True if custom autograd function is disabled (where autograd.Function # anyway is not supported in ONNX until it can be inlined). 
- required_export_kwargs[ - "autograd_inlining" - ] = not self._runtime_options.enable_custom_autograd_function + required_export_kwargs["autograd_inlining"] = ( + not self._runtime_options.enable_custom_autograd_function + ) invalid_args = self._export_extra_kwargs.keys() & required_export_kwargs.keys() diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index a01db28374b8d..91b99d4323d6f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -267,9 +267,11 @@ def wrapper(graph_execution_manager, *args, **kwargs): on_exit=partial( _log_with_filter, graph_execution_manager._logger, - graph_execution_manager._debug_options.onnxruntime_log_filter - if self.is_ort_filter - else graph_execution_manager._debug_options.torch_exporter_filter, + ( + graph_execution_manager._debug_options.onnxruntime_log_filter + if self.is_ort_filter + else graph_execution_manager._debug_options.torch_exporter_filter + ), self.phase.to_string(), ), ): diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index 22e31466887a6..d3fe132609a90 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -433,9 +433,7 @@ def _print_embed_label_stats(self): total_token, valid_token_per_batch, ) in self._stats: - stat += "\t| {:<10} | {:<10} | {:<15} | {:<10} | {:<9.2f}% | {:<15} | {:<15} | {:<15} |\n".format( - step, input_type, input_name, padding_idx, density, valid_token, total_token, valid_token_per_batch - ) + stat += f"\t| {step:<10} | {input_type:<10} | {input_name:<15} | {padding_idx:<10} | {density:<9.2f}% | {valid_token:<15} | {total_token:<15} | {valid_token_per_batch:<15} |\n" stat += "<<<\n" self._logger.info(stat) self._stats.clear() @@ -697,9 +695,11 @@ def _get_user_config_without_freq(configs: str): [ f" - Plan {index}", ":", - "ON" - if all(cluster_id in user_configs_with_out_freq for cluster_id in cluster_ids_without_freq) - else "OFF", + ( + "ON" + if all(cluster_id in user_configs_with_out_freq for cluster_id in cluster_ids_without_freq) + else "OFF" + ), ":", cluster_id, saving_symbolic.freq if details else "", diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 73c32a2f51e41..5fa332d12f01c 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -171,10 +171,10 @@ def backward(ctx, *grad_outputs): for idx, grad_output in enumerate(grad_outputs): if idx in self._graph_info.output_grad_indices_non_differentiable: assert grad_output is None, ( - "ORT found the {}-th module output '{}' is " + f"ORT found the {idx}-th module output '{self._graph_info.user_output_names[idx]}' is " "non-differentiable according to the onnx graph. " "However, the gradient value is still provided by " - "PyTorch's autograd engine.".format(idx, self._graph_info.user_output_names[idx]) + "PyTorch's autograd engine." 
) continue diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index 91825fc492208..5faa1c62bae4f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -91,7 +91,7 @@ def _ortvalues_to_torch_tensor( # Second option makes it impossible to directly use `_from_dlpack` or # or `from_dlpack` from torch. # The best option would be to add boolean type in DLDataTypeCode. - for i in range(0, len(bool_indices)): + for i in range(len(bool_indices)): j = bool_indices[i] res[j] = res[j].to(torch.bool) diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index 7263a5719e262..1bde07dc29ba9 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -387,7 +387,6 @@ def _override_from_env_vars(self): try: import triton # noqa: F401 except ImportError: - pass self._logger.warning( "triton library missing. Please install triton with `pip install triton`. Triton feature will be off." ) diff --git a/orttraining/orttraining/test/external_custom_ops/setup.py b/orttraining/orttraining/test/external_custom_ops/setup.py index 435b83b818380..29383e3618346 100644 --- a/orttraining/orttraining/test/external_custom_ops/setup.py +++ b/orttraining/orttraining/test/external_custom_ops/setup.py @@ -28,9 +28,7 @@ def build_extension(self, ext): subprocess.check_call( [ "cmake", - "-DPYBIND11_PYTHON_VERSION={}.{}.{}".format( - sys.version_info.major, sys.version_info.minor, sys.version_info.micro - ), + f"-DPYBIND11_PYTHON_VERSION={sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}", f"-Dpybind11_DIR={pybind11.get_cmake_dir()}", f"-DONNX_INCLUDE={os.path.dirname(os.path.dirname(onnx.__file__))}", "-DONNXRUNTIME_EXTERNAL_INCLUDE={}".format( diff --git a/orttraining/orttraining/test/python/_test_commons.py b/orttraining/orttraining/test/python/_test_commons.py index fb7e62551de63..762c4c4d55f9f 100644 --- a/orttraining/orttraining/test/python/_test_commons.py +++ b/orttraining/orttraining/test/python/_test_commons.py @@ -25,5 +25,5 @@ def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, en completed_process = subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell) if log: - log.debug("Subprocess completed. Return code=" + str(completed_process.returncode)) + log.debug("Subprocess completed. 
Return code=%s", completed_process.returncode) return completed_process diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index 8f2a18b5ec00b..65043c10d8a01 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -288,7 +288,6 @@ def cpu_barrier_func(): def cuda_barrier_func(): torch.cuda.synchronize() - pass cuda = torch.device("cuda:0") run_evaluate_test_on_device_and_compare( diff --git a/orttraining/orttraining/test/python/orttraining_test_gru.py b/orttraining/orttraining/test/python/orttraining_test_gru.py index fcb7e13b1694f..c9e22bf7384af 100644 --- a/orttraining/orttraining/test/python/orttraining_test_gru.py +++ b/orttraining/orttraining/test/python/orttraining_test_gru.py @@ -355,9 +355,7 @@ def backward_np( prev_h = ( all_hidden_states[t - 1, 0, idx, :] if t > 0 - else initial_hidden_state[0, idx, :] - if initial_hidden_state is not None - else 0 + else initial_hidden_state[0, idx, :] if initial_hidden_state is not None else 0 ) grad_update_gate = (prev_h - hidden_gate) * grad_h diff --git a/orttraining/orttraining/test/python/orttraining_test_lstm.py b/orttraining/orttraining/test/python/orttraining_test_lstm.py index 2b296cf70c2c1..4debe73951b2f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_lstm.py +++ b/orttraining/orttraining/test/python/orttraining_test_lstm.py @@ -480,9 +480,7 @@ def backward_np( grad_forget_gate = grad_c * ( all_cell_states[t - 1, 0, idx, :] if t > 0 - else initial_cell_state[0, idx, :] - if initial_cell_state is not None - else 0 + else initial_cell_state[0, idx, :] if initial_cell_state is not None else 0 ) grad_control_gate = grad_c * input_gate @@ -522,9 +520,7 @@ def backward_np( prev_h = ( all_hidden_states[t - 1, 0, idx, :] if t > 0 - else initial_hidden_state[0, idx, :] - if initial_hidden_state is not None - else 0 + else initial_hidden_state[0, idx, :] if initial_hidden_state is not None else 0 ) grad_recurrence_weights[0, : self._hidden_size, :] += np.dot( np.expand_dims(grad_input_activation, axis=0).T, np.expand_dims(prev_h, axis=0) @@ -553,9 +549,7 @@ def backward_np( grad_peephole_weights[0, : self._hidden_size] += grad_input_activation * ( all_cell_states[t - 1, 0, idx, :] if t > 0 - else initial_cell_state[0, idx, :] - if initial_cell_state is not None - else 0 + else initial_cell_state[0, idx, :] if initial_cell_state is not None else 0 ) grad_peephole_weights[0, self._hidden_size : 2 * self._hidden_size] += ( grad_output_activation * all_cell_states[t, 0, idx, :] @@ -565,9 +559,7 @@ def backward_np( ] += grad_forget_activation * ( all_cell_states[t - 1, 0, idx, :] if t > 0 - else initial_cell_state[0, idx, :] - if initial_cell_state is not None - else 0 + else initial_cell_state[0, idx, :] if initial_cell_state is not None else 0 ) grad_c = grad_prev_c diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py index 3d41c8678278c..11df3fa347ff8 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py +++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py @@ -190,9 +190,11 @@ def _get_training_ort_inputs(x, target, pt_model, onnx_model, target_type=None): ort_inputs = { onnx_model.graph.input[0].name: _to_numpy(copy.deepcopy(x)), - onnx_model.graph.input[1].name: _to_numpy(copy.deepcopy(target)) - if target_type is 
None - else _to_numpy(copy.deepcopy(target).type(target_type)), + onnx_model.graph.input[1].name: ( + _to_numpy(copy.deepcopy(target)) + if target_type is None + else _to_numpy(copy.deepcopy(target).type(target_type)) + ), } if target_type is not None: ort_inputs[onnx_model.graph.input[1].name] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index f0261c776609e..7afad9145ed27 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -3797,7 +3797,7 @@ def forward(self, input1=None, input2=None): model.eval() # Must work because forward() and dict order match - y1, y2 = model(**{"input1": input1, "input2": input2}) + y1, y2 = model(input1=input1, input2=input2) assert y1 is not None assert y2 is not None if model._is_training(): @@ -3805,7 +3805,7 @@ def forward(self, input1=None, input2=None): loss.backward() # Must work even when forward() and dict order mismatch - y1, y2 = model(**{"input2": input2, "input1": input1}) + y1, y2 = model(input2=input2, input1=input1) assert y1 is not None assert y2 is not None if model._is_training(): @@ -3887,17 +3887,20 @@ def run_step(expected, a, b, c, d, e, f, y, z): None, None, ) - run_step( - a.item() + f.item(), **{"a": a, "b": None, "c": None, "d": None, "e": None, "f": f, "y": None, "z": None} - ) + run_step(a.item() + f.item(), a=a, b=None, c=None, d=None, e=None, f=f, y=None, z=None) run_step(a.item() + z.item(), a, None, None, None, None, None, None, z) - run_step( - a.item() + z.item(), **{"a": a, "b": None, "c": None, "d": None, "e": None, "f": None, "y": None, "z": z} - ) + run_step(a.item() + z.item(), a=a, b=None, c=None, d=None, e=None, f=None, y=None, z=z) run_step(a.item() + c.item() + y.item(), a, None, c, None, None, None, y, None) run_step( a.item() + c.item() + y.item(), - **{"a": a, "b": None, "c": c, "d": None, "e": None, "f": None, "y": y, "z": None}, + a=a, + b=None, + c=c, + d=None, + e=None, + f=None, + y=y, + z=None, ) run_step( a.item() + b.item() + c.item() + d.item() + e.item() + f.item() + y.item() + z.item(), @@ -3912,7 +3915,14 @@ def run_step(expected, a, b, c, d, e, f, y, z): ) run_step( a.item() + b.item() + c.item() + d.item() + e.item() + f.item() + y.item() + z.item(), - **{"a": a, "b": b, "c": c, "d": d, "e": e, "f": f, "y": y, "z": z}, + a=a, + b=b, + c=c, + d=d, + e=e, + f=f, + y=y, + z=z, ) del os.environ["ORTMODULE_SKIPCHECK_POLICY"] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index 3d92e0b323c19..a1a7d4660f266 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -441,7 +441,7 @@ def main(): # 4. 
Train loop (fine-tune) total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0 - for epoch_i in range(0, args.epochs): + for epoch_i in range(args.epochs): total_training_time += train(model, optimizer, scheduler, train_dataloader, epoch_i, device, args) if not args.pytorch_only and epoch_i == 0: epoch_0_training = total_training_time diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 87c8e66231a29..0d5aba1a1a5c4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -446,7 +446,7 @@ def main(): # 4. Train loop (fine-tune) total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0 - for epoch_i in range(0, args.epochs): + for epoch_i in range(args.epochs): total_training_time += train(model, optimizer, scaler, scheduler, train_dataloader, epoch_i, device, args) if not args.pytorch_only and epoch_i == 0: epoch_0_training = total_training_time diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py index 86e8d9aea1d37..5b28e9c52b480 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py @@ -8,6 +8,7 @@ --deepspeed_config=orttraining_test_ortmodule_deepspeed_zero_stage_1_config.json ``` """ + import argparse import time @@ -36,11 +37,7 @@ def forward(self, input1): def train(args, model, device, optimizer, loss_fn, train_loader, epoch): - print( - "\n======== Epoch {:} / {:} with batch size {:} ========".format( - epoch + 1, args.epochs, model.train_batch_size() - ) - ) + print(f"\n======== Epoch {epoch + 1} / {args.epochs} with batch size {model.train_batch_size()} ========") model.train() # Measure how long the training epoch takes. t0 = time.time() @@ -77,13 +74,7 @@ def train(args, model, device, optimizer, loss_fn, train_loader, epoch): curr_time = time.time() elapsed_time = curr_time - start_time print( - "[{:5}/{:5} ({:2.0f}%)]\tLoss: {:.6f}\tExecution time: {:.4f}".format( - iteration * len(data), - len(train_loader.dataset), - 100.0 * iteration / len(train_loader), - loss, - elapsed_time, - ) + f"[{iteration * len(data):5}/{len(train_loader.dataset):5} ({100.0 * iteration / len(train_loader):2.0f}%)]\tLoss: {loss:.6f}\tExecution time: {elapsed_time:.4f}" ) start_time = curr_time @@ -115,13 +106,7 @@ def test(args, model, device, loss_fn, test_loader): correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) print( - "\nTest set: Batch size: {:}, Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( - args.test_batch_size, - test_loss, - correct, - len(test_loader.dataset), - 100.0 * correct / len(test_loader.dataset), - ) + f"\nTest set: Batch size: {args.test_batch_size}, Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100.0 * correct / len(test_loader.dataset):.0f}%)\n" ) # Report the final accuracy for this validation run. 
@@ -251,7 +236,7 @@ def main(): # Train loop total_training_time, total_test_time, epoch_0_training = 0, 0, 0 - for epoch in range(0, args.epochs): + for epoch in range(args.epochs): total_training_time += train(args, model, device, optimizer, my_loss, train_loader, epoch) if not args.pytorch_only and epoch == 0: epoch_0_training = total_training_time diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py index 53e1928e2d2f3..4437611283122 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py @@ -123,13 +123,7 @@ def train_step(args, model, device, optimizer, loss_fn, train_loader, epoch): curr_time = time.time() elapsed_time = curr_time - start_time print( - "[{:5}/{:5} ({:2.0f}%)]\tLoss: {:.6f}\tExecution time: {:.4f}".format( - iteration * len(data), - len(train_loader.dataset), - 100.0 * iteration / len(train_loader), - loss, - elapsed_time, - ) + f"[{iteration * len(data):5}/{len(train_loader.dataset):5} ({100.0 * iteration / len(train_loader):2.0f}%)]\tLoss: {loss:.6f}\tExecution time: {elapsed_time:.4f}" ) start_time = curr_time @@ -160,13 +154,7 @@ def test(args, model, device, loss_fn, test_loader): correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) print( - "\nTest set: Batch size: {:}, Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( - args.test_batch_size, - test_loss, - correct, - len(test_loader.dataset), - 100.0 * correct / len(test_loader.dataset), - ) + f"\nTest set: Batch size: {args.test_batch_size}, Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100.0 * correct / len(test_loader.dataset):.0f}%)\n" ) # Report the final accuracy for this validation run. 
diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py index 2f240406b25b9..df0b5f195f0b9 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py @@ -1,6 +1,7 @@ """ @brief test log(time=3s) """ + import copy import unittest diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py index 1cb0b3626e54e..d6f84d94c2838 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py @@ -64,13 +64,7 @@ def train(args, model, device, optimizer, loss_fn, train_loader, epoch): curr_time = time.time() elapsed_time = curr_time - start_time print( - "[{:5}/{:5} ({:2.0f}%)]\tLoss: {:.6f}\tExecution time: {:.4f}".format( - iteration * len(data), - len(train_loader.dataset), - 100.0 * iteration / len(train_loader), - loss, - elapsed_time, - ) + f"[{iteration * len(data):5}/{len(train_loader.dataset):5} ({100.0 * iteration / len(train_loader):2.0f}%)]\tLoss: {loss:.6f}\tExecution time: {elapsed_time:.4f}" ) start_time = curr_time @@ -102,13 +96,7 @@ def test(args, model, device, loss_fn, test_loader): correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) print( - "\nTest set: Batch size: {:}, Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( - args.test_batch_size, - test_loss, - correct, - len(test_loader.dataset), - 100.0 * correct / len(test_loader.dataset), - ) + f"\nTest set: Batch size: {args.test_batch_size}, Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100.0 * correct / len(test_loader.dataset):.0f}%)\n" ) # Report the final accuracy for this validation run. @@ -221,7 +209,7 @@ def main(): # Train loop total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0 - for epoch in range(0, args.epochs): + for epoch in range(args.epochs): total_training_time += train(args, model, device, optimizer, my_loss, train_loader, epoch) if not args.pytorch_only and epoch == 0: epoch_0_training = total_training_time diff --git a/orttraining/orttraining/test/python/qat_poc_example/quantize.py b/orttraining/orttraining/test/python/qat_poc_example/quantize.py index 6d9ea284fd3ef..225fb2f8e81b4 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/quantize.py +++ b/orttraining/orttraining/test/python/qat_poc_example/quantize.py @@ -53,7 +53,7 @@ def quantize_static(input_model_dir, output_model_dir): logging.info( "Invoking onnxruntime.quantization.quantize_static with AddQDQPairToWeight=True and QuantizeBias=False.." ) - logging.info("Quantized model will be saved to %s." 
% output_model_dir) + logging.info("Quantized model will be saved to %s.", output_model_dir) quantization.quantize_static( input_model_dir, output_model_dir, diff --git a/orttraining/tools/amdgpu/script/rocprof.py b/orttraining/tools/amdgpu/script/rocprof.py index e5b107ba285bf..21dd8501f3f1d 100644 --- a/orttraining/tools/amdgpu/script/rocprof.py +++ b/orttraining/tools/amdgpu/script/rocprof.py @@ -68,18 +68,10 @@ def gpu_kernel_calls(activities): for name in groups: activities = groups[name] print( - "{}: N={}, calls={}, absolute={:.3f}s, percent={:.2f}%".format( - name, - len(activities), - gpu_kernel_calls(activities), - gpu_absolute_time(activities), - gpu_percent_time(activities), - ) + f"{name}: N={len(activities)}, calls={gpu_kernel_calls(activities)}, absolute={gpu_absolute_time(activities):.3f}s, percent={gpu_percent_time(activities):.2f}%" ) total = [item for name in groups for item in groups[name]] print( - "Total: N={}, calls={}, absolute={:.3f}s, percent={:.2f}%".format( - len(total), gpu_kernel_calls(total), gpu_absolute_time(total), gpu_percent_time(total) - ) + f"Total: N={len(total)}, calls={gpu_kernel_calls(total)}, absolute={gpu_absolute_time(total):.3f}s, percent={gpu_percent_time(total):.2f}%" ) diff --git a/orttraining/tools/ci_test/run_bert_perf_test.py b/orttraining/tools/ci_test/run_bert_perf_test.py index bb15d6f5965b6..13d5e9f140958 100644 --- a/orttraining/tools/ci_test/run_bert_perf_test.py +++ b/orttraining/tools/ci_test/run_bert_perf_test.py @@ -99,8 +99,8 @@ def main(): subprocess.run(cmds).check_returncode() # noqa: PLW1510 if c.expected_perf > 0.0: - json_filename = "onnxruntime_perf_metrics_{}.onnx_bert_{}_{}_Lamb.json".format( - model, precision_prefix, c.max_seq_length + json_filename = ( + f"onnxruntime_perf_metrics_{model}.onnx_bert_{precision_prefix}_{c.max_seq_length}_Lamb.json" ) with open(os.path.join(SCRIPT_DIR, "results", json_filename)) as json_file: results = json.load(json_file) diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py index f64460f3ff0b9..8c57101f72ddb 100644 --- a/orttraining/tools/scripts/nv_run_pretraining.py +++ b/orttraining/tools/scripts/nv_run_pretraining.py @@ -81,9 +81,11 @@ def __len__(self): def __getitem__(self, index): [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ - torch.from_numpy(input[index].astype(np.int64)) - if indice < 5 - else torch.from_numpy(np.asarray(input[index].astype(np.int64))) + ( + torch.from_numpy(input[index].astype(np.int64)) + if indice < 5 + else torch.from_numpy(np.asarray(input[index].astype(np.int64))) + ) for indice, input in enumerate(self.inputs) ] @@ -231,9 +233,7 @@ def setup_training(args): ) if args.train_batch_size % args.gradient_accumulation_steps != 0: raise ValueError( - "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format( - args.gradient_accumulation_steps, args.train_batch_size - ) + f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, batch size {args.train_batch_size} should be divisible" ) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps diff --git a/orttraining/tools/scripts/watch_experiment.py b/orttraining/tools/scripts/watch_experiment.py index aefa1f57cfc16..d2255b63c66b5 100644 --- a/orttraining/tools/scripts/watch_experiment.py +++ b/orttraining/tools/scripts/watch_experiment.py @@ -57,11 +57,7 @@ remote_root = args.remote_dir if run.get_status() in 
["Completed", "Failed", "Canceled"]: - print( - "Downloading Experiment files from remote directory: '{}' to local directory: '{}'".format( - remote_root, local_root - ) - ) + print(f"Downloading Experiment files from remote directory: '{remote_root}' to local directory: '{local_root}'") files = [f for f in run.get_file_names() if f.startswith(remote_root)] for remote_path in files: local_path = os.path.join(local_root, os.path.basename(remote_path)) @@ -71,11 +67,7 @@ event = Event() session = Session() - print( - "Streaming Experiment files from remote directory: '{}' to local directory: '{}'".format( - remote_root, local_root - ) - ) + print(f"Streaming Experiment files from remote directory: '{remote_root}' to local directory: '{local_root}'") watcher = RunWatcher( run, local_root=local_root, remote_root=remote_root, executor=executor, event=event, session=session ) diff --git a/pyproject.toml b/pyproject.toml index 97515cb9fa62b..8fe114d4692c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,19 +44,26 @@ reportMissingImports = false [tool.ruff] # NOTE: Do not create an exclude list. Edit .lintrunner.toml instead target-version = "py38" + +[tool.ruff.lint] select = [ "B", # flake8-bugbear "E", # pycodestyle "F", # Pyflakes + "FURB", # refurb + "G", # flake8-logging-format "ISC", # flake8-implicit-str-concat "N", # pep8-naming "NPY", # numpy "PERF", # Perflint + "PIE", # flake8-pie "PLC", # pylint conventions "PLE", # pylint errors "PLW", # pylint warnings + "PYI", # flake8-pyi "RUF", # Ruff-specific rules "SIM", # flake8-simplify + "SLOT", # flake8-slots "T10", # flake8-debugger "UP", # pyupgrade "W", # pycodestyle @@ -67,12 +74,15 @@ select = [ ignore = [ "B028", # FIXME: Add stacklevel to warnings "E501", # Line length controlled by black + "G004", # FIXME: Enable when the rule can be autofixed "N803", # Argument casing "N812", # Allow import torch.nn.functional as F "N999", # Module names "NPY002", # np.random.Generator may not always fit our use cases "PERF203", # "try-except-in-loop" only affects Python <3.11, and the improvement is minor; can have false positives "PERF401", # List comprehensions are not always readable + "PYI041", # May create confusion + "PYI024", # May create confusion "SIM102", # We don't perfer always combining if branches "SIM108", # We don't encourage ternary operators "SIM114", # Don't combine if branches for debugability @@ -84,7 +94,7 @@ unfixable = [ "SIM112", # Use upper case for env vars ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # NOTE: Refrain from growing the ignore list unless for exceptional cases. # Prefer inline ignores with `noqa: xxx`. # Eventually this list should become empty. 
diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt
index 6836d5df69324..d19ebe379b50b 100644
--- a/requirements-lintrunner.txt
+++ b/requirements-lintrunner.txt
@@ -1,9 +1,9 @@
 # This file is auto updated by dependabot
 lintrunner-adapters>=0.11.0
 # RUFF
-ruff==0.2.1
+ruff==0.3.2
 # BLACK-ISORT
-black==23.10.1
+black==24.2.0
 isort==5.12.0
 # CLANGFORMAT
 clang-format==17.0.4
diff --git a/setup.py b/setup.py
index 9a5fc29dd5e02..ac7a70b991fbf 100644
--- a/setup.py
+++ b/setup.py
@@ -257,7 +257,7 @@ def run(self):
             auditwheel_cmd = ["auditwheel", "-v", "repair", "-w", self.dist_dir, file]
             for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies:
                 auditwheel_cmd += ["--exclude", i]
-            logger.info("Running {}".format(" ".join([shlex.quote(arg) for arg in auditwheel_cmd])))
+            logger.info("Running %s", " ".join([shlex.quote(arg) for arg in auditwheel_cmd]))
             try:
                 subprocess.run(auditwheel_cmd, check=True, stdout=subprocess.PIPE)
             finally:
@@ -614,9 +614,7 @@ def reformat_run_count(count_str):
     # TODO: this is the last time we have to do this!!!
     # We shall bump up release number right after release cut.
     if ort_version.major == 1 and ort_version.minor == 8 and ort_version.micro == 0:
-        version_number = "{major}.{minor}.{macro}".format(
-            major=ort_version.major, minor=ort_version.minor + 1, macro=ort_version.micro
-        )
+        version_number = f"{ort_version.major}.{ort_version.minor + 1}.{ort_version.micro}"
         version_number = version_number + ".dev" + build_suffix
 
@@ -667,9 +665,11 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm
             else:
                 print(
                     "Error getting cudart version. ",
-                    "did not find any cudart library"
-                    if not cudart_versions or len(cudart_versions) == 0
-                    else "found multiple cudart libraries",
+                    (
+                        "did not find any cudart library"
+                        if not cudart_versions or len(cudart_versions) == 0
+                        else "found multiple cudart libraries"
+                    ),
                 )
         elif rocm_version:
             f.write(f"rocm_version = '{rocm_version}'\n")
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 1056c4ed84510..067f151844b1b 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -38,8 +38,6 @@ def version_to_tuple(version: str) -> tuple:
 class BaseError(Exception):
     """Base class for errors originating from build.py."""
 
-    pass
-
 
 class BuildError(BaseError):
     """Error from running build steps."""
@@ -89,7 +87,7 @@ def _openvino_verify_device_type(device_read):
         res = True
     elif device_read in choices1:
         res = True
-    elif device_read.startswith("HETERO:") or device_read.startswith("MULTI:") or device_read.startswith("AUTO:"):
+    elif device_read.startswith(("HETERO:", "MULTI:", "AUTO:")):
         res = True
         comma_separated_devices = device_read.split(":")
         comma_separated_devices = comma_separated_devices[1].split(",")
@@ -118,7 +116,7 @@ def invalid_hetero_build():
         print("pick the build type for specific Hardware Device from following options: ", choices)
         print("(or) from the following options with graph partitioning disabled: ", choices1)
         print("\n")
-        if not (device_read.startswith("HETERO") or device_read.startswith("MULTI") or device_read.startswith("AUTO")):
+        if not (device_read.startswith(("HETERO", "MULTI", "AUTO"))):
             invalid_hetero_build()
         sys.exit("Wrong Build Type selected")
 
@@ -1721,9 +1719,7 @@ def setup_cuda_vars(args):
         if not cuda_home_valid or (not is_windows() and not cudnn_home_valid):
             raise BuildError(
                 "cuda_home and cudnn_home paths must be specified and valid.",
-                "cuda_home='{}' valid={}. cudnn_home='{}' valid={}".format(
-                    cuda_home, cuda_home_valid, cudnn_home, cudnn_home_valid
-                ),
+                f"cuda_home='{cuda_home}' valid={cuda_home_valid}. cudnn_home='{cudnn_home}' valid={cudnn_home_valid}",
             )
 
     return cuda_home, cudnn_home
@@ -2489,11 +2485,11 @@ def diff_file(path, regenerate_qualifiers=""):
             nonlocal have_diff
             have_diff = True
             log.warning(
-                "The updated document {} is different from the checked in version. "
-                "Please regenerate the file{}, or copy the updated version from the "
-                "CI build's published artifacts if applicable.".format(path, regenerate_qualifiers)
+                f"The updated document {path} is different from the checked in version. "
+                f"Please regenerate the file{regenerate_qualifiers}, or copy the updated version from the "
+                "CI build's published artifacts if applicable."
             )
-            log.debug("diff:\n" + diff)
+            log.debug("diff:\n" + diff)  # noqa: G003
 
     diff_file(opkernel_doc_path, " with CPU, CUDA and DML execution providers enabled")
     diff_file(contrib_op_doc_path)
@@ -2508,7 +2504,7 @@ def diff_file(path, regenerate_qualifiers=""):
 
 
 def main():
-    log.debug("Command line arguments:\n {}".format(" ".join(shlex.quote(arg) for arg in sys.argv[1:])))
+    log.debug("Command line arguments:\n {}".format(" ".join(shlex.quote(arg) for arg in sys.argv[1:])))  # noqa: G001
 
     args = parse_arguments()
diff --git a/tools/ci_build/clean_docker_image_cache.py b/tools/ci_build/clean_docker_image_cache.py
index f9b41ce31f92a..8ec2b6b438176 100755
--- a/tools/ci_build/clean_docker_image_cache.py
+++ b/tools/ci_build/clean_docker_image_cache.py
@@ -237,13 +237,13 @@ def main():
     def sorted_image_names(image_infos):
         return sorted([get_image_name(image_info) for image_info in image_infos])
 
-    log.debug("All images:\n{}".format("\n".join(sorted_image_names(all_images))))
-    log.debug("Valid images:\n{}".format("\n".join(sorted_image_names(valid_images))))
+    log.debug("All images:\n{}".format("\n".join(sorted_image_names(all_images))))  # noqa: G001
+    log.debug("Valid images:\n{}".format("\n".join(sorted_image_names(valid_images))))  # noqa: G001
 
     images_to_clean = all_images - valid_images
     image_names_to_clean = sorted_image_names(images_to_clean)
 
-    log.info("Images to clean:\n{}".format("\n".join(image_names_to_clean)))
+    log.info("Images to clean:\n{}".format("\n".join(image_names_to_clean)))  # noqa: G001
 
     if args.dry_run:
         log.info("Dry run, no images will be cleaned.")
diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py
index 2ce1764c96327..99ecaf677f339 100755
--- a/tools/ci_build/get_docker_image.py
+++ b/tools/ci_build/get_docker_image.py
@@ -56,11 +56,7 @@ def parse_args():
 def main():
     args = parse_args()
 
-    log.debug(
-        "Dockerfile: {}, context: {}, docker build args: '{}'".format(
-            args.dockerfile, args.context, args.docker_build_args
-        )
-    )
+    log.debug(f"Dockerfile: {args.dockerfile}, context: {args.context}, docker build args: '{args.docker_build_args}'")
 
     use_container_registry = args.container_registry is not None
diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py
index f9688a1453e12..3aaced63dd410 100644
--- a/tools/ci_build/github/android/build_aar_package.py
+++ b/tools/ci_build/github/android/build_aar_package.py
@@ -149,9 +149,11 @@ def _build_aar(args):
         "-DminSdkVer=" + str(build_settings["android_min_sdk_version"]),
         "-DtargetSdkVer=" + str(build_settings["android_target_sdk_version"]),
         "-DbuildVariant=" + str(build_settings["build_variant"]),
-        "-DENABLE_TRAINING_APIS=1"
-        if "--enable_training_apis" in build_settings["build_params"]
-        else "-DENABLE_TRAINING_APIS=0",
+        (
+            "-DENABLE_TRAINING_APIS=1"
+            if "--enable_training_apis" in build_settings["build_params"]
+            else "-DENABLE_TRAINING_APIS=0"
+        ),
     ]
 
     # clean, build, and publish to a local directory
diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
index 006dc4c33ffce..6188c7d7c0678 100755
--- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
+++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
@@ -86,9 +86,7 @@ def run(arg_list, cwd=None):
     import shlex
     import subprocess
 
-    log.info(
-        "Running subprocess in '{}'\n {}".format(cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in arg_list]))
-    )
+    log.info("Running subprocess in '%s'\n %s", cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in arg_list]))
 
     return subprocess.run(arg_list, check=True, cwd=cwd)
diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py
index 5137a0644b2e7..7b8a87632f5c7 100644
--- a/tools/ci_build/github/apple/build_apple_framework.py
+++ b/tools/ci_build/github/apple/build_apple_framework.py
@@ -65,9 +65,11 @@ def _build_for_apple_sysroot(
             build_dir_current_arch,
             build_config,
             build_config + "-" + sysroot,
-            "onnxruntime.framework"
-            if build_dynamic_framework
-            else os.path.join("static_framework", "onnxruntime.framework"),
+            (
+                "onnxruntime.framework"
+                if build_dynamic_framework
+                else os.path.join("static_framework", "onnxruntime.framework")
+            ),
         )
         ort_libs.append(os.path.join(framework_dir, "onnxruntime"))
diff --git a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
index ea4a3fd32b18b..40debff3b2fef 100644
--- a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
+++ b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py
@@ -31,9 +31,7 @@ def _check_binary_size(path, readelf, threshold, os_str, arch, build_config):
 
     if threshold is not None and sections_total > threshold:
         raise RuntimeError(
-            "Sections total size for {} of {} exceeds threshold of {} by {}. On-disk size={}".format(
-                path, sections_total, threshold, sections_total - threshold, ondisk_size
-            )
+            f"Sections total size for {path} of {sections_total} exceeds threshold of {threshold} by {sections_total - threshold}. On-disk size={ondisk_size}"
         )
diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py
index acca4fb13c45a..a9667fe4d0654 100644
--- a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py
+++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py
@@ -49,7 +49,7 @@ def get_binary_sizes(size_data_file):
             break
         linedata = line.strip().split(",")
         tablerow = {}
-        for i in range(0, len(headers)):
+        for i in range(len(headers)):
             if headers[i] == "size":
                 tablerow[headers[i]] = int(linedata[i])
             else:
diff --git a/tools/ci_build/op_registration_utils.py b/tools/ci_build/op_registration_utils.py
index 3fd01253a3e37..811ce424eae10 100644
--- a/tools/ci_build/op_registration_utils.py
+++ b/tools/ci_build/op_registration_utils.py
@@ -104,14 +104,12 @@ def process_registration(
         :param end_version: End version or None if unversioned registration
         :param type: Type or types used in registration, if this is a typed registration
         """
-        pass
 
     def process_other_line(self, line):
         """
        Process a line that does not contain a kernel registration
         :param line: Original line
         """
-        pass
 
     def ok(self):
         """
diff --git a/tools/ci_build/op_registration_validator.py b/tools/ci_build/op_registration_validator.py
index 5c7edfa88a48b..d92050a31f967 100644
--- a/tools/ci_build/op_registration_validator.py
+++ b/tools/ci_build/op_registration_validator.py
@@ -45,7 +45,7 @@ def domain_and_op_str(self):
 
 
 def _log_registration_error(r: RegistrationInfo, message: str):
-    log.error("Invalid registration for {}. {}\n{}".format(r.domain_and_op_str(), message, "".join(r.lines)))
+    log.error("Invalid registration for %s. %s\n%s", r.domain_and_op_str(), message, "".join(r.lines))
 
 
 class RegistrationValidator(op_registration_utils.RegistrationProcessor):
diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py
index cc64775ae158d..90d800f2a4498 100644
--- a/tools/doc/rename_folders.py
+++ b/tools/doc/rename_folders.py
@@ -3,6 +3,7 @@
 This extension does not publish any folder starting with `_`.
 These folders need to be renamed.
 """
+
 import os
 import re
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index 09fe99d36cc34..d5139f00e2f04 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -324,10 +324,12 @@ def generate_metadata(line_list, args):
     generate_owners(metadata_list, "Microsoft")
     generate_description(metadata_list, args.package_name)
     generate_copyright(metadata_list, "\xc2\xa9 " + "Microsoft Corporation. All rights reserved.")
-    generate_tags(
-        metadata_list, "ONNX ONNX Runtime Machine Learning"
-    ) if "Microsoft.ML.OnnxRuntime.Training." in args.package_name else generate_tags(
-        metadata_list, "native ONNX ONNXRuntime-Training Learning-on-The-Edge On-Device-Training MachineLearning"
+    (
+        generate_tags(metadata_list, "ONNX ONNX Runtime Machine Learning")
+        if "Microsoft.ML.OnnxRuntime.Training." in args.package_name
+        else generate_tags(
+            metadata_list, "native ONNX ONNXRuntime-Training Learning-on-The-Edge On-Device-Training MachineLearning"
+        )
     )
     generate_icon(metadata_list, "ORT_icon_for_light_bg.png")
     generate_license(metadata_list)
diff --git a/tools/python/dump_ort_model.py b/tools/python/dump_ort_model.py
index 2177c42f5bc35..b9e3bfa0d3bcd 100644
--- a/tools/python/dump_ort_model.py
+++ b/tools/python/dump_ort_model.py
@@ -29,10 +29,10 @@ def __init__(self, model_path: str):
 
     def _dump_initializers(self, graph: fbs.Graph):
         print("Initializers:")
-        for idx in range(0, graph.InitializersLength()):
+        for idx in range(graph.InitializersLength()):
             tensor = graph.Initializers(idx)
             dims = []
-            for dim in range(0, tensor.DimsLength()):
+            for dim in range(tensor.DimsLength()):
                 dims.append(tensor.Dims(dim))
 
             print(f"{tensor.Name().decode()} data_type={tensor.DataType()} dims={dims}")
@@ -40,7 +40,7 @@ def _dump_initializers(self, graph: fbs.Graph):
 
     def _dump_nodeargs(self, graph: fbs.Graph):
         print("NodeArgs:")
-        for idx in range(0, graph.NodeArgsLength()):
+        for idx in range(graph.NodeArgsLength()):
             node_arg = graph.NodeArgs(idx)
             type = node_arg.Type()
             if not type:
@@ -57,7 +57,7 @@ def _dump_nodeargs(self, graph: fbs.Graph):
             shape = tensor_type_and_shape.Shape()
             if shape:
                 dims = []
-                for dim in range(0, shape.DimLength()):
+                for dim in range(shape.DimLength()):
                     d = shape.Dim(dim).Value()
                     if d.DimType() == fbs.DimensionValueType.DimensionValueType.VALUE:
                         dims.append(str(d.DimValue()))
@@ -76,8 +76,8 @@ def _dump_node(self, node: fbs.Node):
         domain = node.Domain().decode() or "ai.onnx"  # empty domain defaults to ai.onnx
         since_version = node.SinceVersion()
 
-        inputs = [node.Inputs(i).decode() for i in range(0, node.InputsLength())]
-        outputs = [node.Outputs(i).decode() for i in range(0, node.OutputsLength())]
+        inputs = [node.Inputs(i).decode() for i in range(node.InputsLength())]
+        outputs = [node.Outputs(i).decode() for i in range(node.OutputsLength())]
         print(
             f"{node.Index()}:{node.Name().decode()}({domain}:{optype}:{since_version}) "
             f'inputs=[{",".join(inputs)}] outputs=[{",".join(outputs)}]'
@@ -91,12 +91,12 @@ def _dump_graph(self, graph: fbs.Graph):
         self._dump_initializers(graph)
         self._dump_nodeargs(graph)
         print("Nodes:")
-        for i in range(0, graph.NodesLength()):
+        for i in range(graph.NodesLength()):
             node = graph.Nodes(i)
             self._dump_node(node)
 
             # Read all the attributes
-            for j in range(0, node.AttributesLength()):
+            for j in range(node.AttributesLength()):
                 attr = node.Attributes(j)
                 attr_type = attr.Type()
                 if attr_type == fbs.AttributeType.AttributeType.GRAPH:
@@ -107,7 +107,7 @@ def _dump_graph(self, graph: fbs.Graph):
                     # the ONNX spec doesn't currently define any operators that have multiple graphs in an attribute
                     # so entering this 'elif' isn't currently possible
                     print(f"## Subgraphs for {node.OpType().decode()}.{attr.Name().decode()} ##")
-                    for k in range(0, attr.GraphsLength()):
+                    for k in range(attr.GraphsLength()):
                         print(f"## Subgraph {k} ##")
                         self._dump_graph(attr.Graphs(k))
                         print(f"## End Subgraph {k} ##")
diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py
index 8a5e57b51e38d..b46f7e4a54d9c 100644
--- a/tools/python/find_optimizer_opset_version_updates_required.py
+++ b/tools/python/find_optimizer_opset_version_updates_required.py
@@ -199,9 +199,7 @@ def find_potential_issues(root_dir, op_to_opset):
                 latest = op_to_opset[op]
                 if int(latest) != int(last_version):
                     log.warning(
-                        "Newer opset found for {}. Latest:{} Optimizer support ends at {}. File:{}".format(
-                            op, latest, last_version, file
-                        )
+                        f"Newer opset found for {op}. Latest:{latest} Optimizer support ends at {last_version}. File:{file}"
                     )
             else:
                 log.error(f"Failed to find version information for {op}. File:{file}")
diff --git a/tools/python/gen_contrib_doc.py b/tools/python/gen_contrib_doc.py
index accab96bd3593..ab9421b395326 100644
--- a/tools/python/gen_contrib_doc.py
+++ b/tools/python/gen_contrib_doc.py
@@ -359,11 +359,7 @@ def main(output_path: str, domain_filter: [str]):
 
         for _, namemap in supportmap:
             for n, schema, versions in namemap:  # noqa: B007
-                s = ' * {}<a href="#{}">{}</a>\n'.format(
-                    support_level_str(schema.support_level),
-                    format_name_with_domain(domain, n),
-                    format_name_with_domain(domain, n),
-                )
+                s = f' * {support_level_str(schema.support_level)}<a href="#{format_name_with_domain(domain, n)}">{format_name_with_domain(domain, n)}</a>\n'
                 fout.write(s)
 
         fout.write("\n")
diff --git a/tools/python/util/convert_onnx_models_to_ort.py b/tools/python/util/convert_onnx_models_to_ort.py
index 18bba78661796..08e840092bc22 100644
--- a/tools/python/util/convert_onnx_models_to_ort.py
+++ b/tools/python/util/convert_onnx_models_to_ort.py
@@ -302,9 +302,7 @@ def convert_onnx_models_to_ort(
 
     for optimization_style in optimization_styles:
         print(
-            "Converting models with optimization style '{}' and level '{}'".format(
-                optimization_style.name, optimization_level_str
-            )
+            f"Converting models with optimization style '{optimization_style.name}' and level '{optimization_level_str}'"
         )
 
         converted_models = _convert(
@@ -330,9 +328,9 @@
             )
             session_options_config_entries_for_second_conversion = session_options_config_entries.copy()
             # Limit the optimizations to those that can run in a model with runtime optimizations.
-            session_options_config_entries_for_second_conversion[
-                "optimization.minimal_build_optimizations"
-            ] = "apply"
+            session_options_config_entries_for_second_conversion["optimization.minimal_build_optimizations"] = (
+                "apply"
+            )
 
             print(
                 "Converting models again without runtime optimizations to generate a complete config file. "
@@ -351,9 +349,7 @@
             )
 
             print(
-                "Generating config file from ORT format models with optimization style '{}' and level '{}'".format(
-                    optimization_style.name, optimization_level_str
-                )
+                f"Generating config file from ORT format models with optimization style '{optimization_style.name}' and level '{optimization_level_str}'"
             )
 
             config_file = _create_config_file_path(
diff --git a/tools/python/util/mobile_helpers/check_model_can_use_ort_mobile_pkg.py b/tools/python/util/mobile_helpers/check_model_can_use_ort_mobile_pkg.py
index f8cc34e04afa0..548d4a8ba6c45 100644
--- a/tools/python/util/mobile_helpers/check_model_can_use_ort_mobile_pkg.py
+++ b/tools/python/util/mobile_helpers/check_model_can_use_ort_mobile_pkg.py
@@ -230,7 +230,7 @@ def run_check_with_model(
     if unsupported_ops:
         logger.info("Unsupported operators:")
         for entry in sorted(unsupported_ops):
-            logger.info("  " + entry)
+            logger.info("  " + entry)  # noqa: G003
 
     if unsupported:
         logger.info("\nModel is not supported by the pre-built package due to unsupported types and/or operators.")
diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py
index 22d7dff3e13b2..598549c42b60a 100644
--- a/tools/python/util/ort_format_model/operator_type_usage_processors.py
+++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py
@@ -92,7 +92,6 @@ def to_config_entry(self):
         Generate a configuration file entry in JSON format with the required types for the operator.
         :return: JSON string with required type information.
         """
-        pass
 
     @abstractmethod
     def from_config_entry(self, entry: str):
@@ -101,7 +100,6 @@ def from_config_entry(self, entry: str):
         NOTE: Any existing type information should be cleared prior to re-creating from a config file entry.
         :param entry: Configuration file entry
         """
-        pass
 
 
 class DefaultTypeUsageProcessor(TypeUsageProcessor):
@@ -182,9 +180,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
             # Don't know of any ops where the number of outputs changed across versions, so require a valid length
             if o >= node.OutputsLength():
                 raise RuntimeError(
-                    "Node has {} outputs. Tracker for {} incorrectly configured as it requires {}.".format(
-                        node.OutputsLength(), self.name, o
-                    )
+                    f"Node has {node.OutputsLength()} outputs. Tracker for {self.name} incorrectly configured as it requires {o}."
                 )
 
             type_str = value_name_to_typestr(node.Outputs(o), value_name_to_typeinfo)
@@ -514,7 +510,6 @@ def is_typed_registration_needed(self, domain: str, optype: str, type_registrati
         :param type_registration_str: Type string from kernel registration
         :return: True is required. False if not.
         """
-        pass
 
     @abstractmethod
     def get_cpp_entries(self):
@@ -522,7 +517,6 @@ def get_cpp_entries(self):
         Get the C++ code that specifies the operator types to enable.
         :return: List of strings. One line of C++ code per entry.
         """
-        pass
 
 
 class OperatorTypeUsageManager:
@@ -644,9 +638,7 @@ def __init__(self, globally_allowed_types: typing.Set[str]):
 
         if not globally_allowed_types.issubset(self._valid_allowed_types):
             raise ValueError(
-                "Globally allowed types must all be valid. Invalid types: {}".format(
-                    sorted(globally_allowed_types - self._valid_allowed_types)
-                )
+                f"Globally allowed types must all be valid. Invalid types: {sorted(globally_allowed_types - self._valid_allowed_types)}"
             )
 
         self._globally_allowed_types = globally_allowed_types
diff --git a/tools/python/util/ort_format_model/ort_model_processor.py b/tools/python/util/ort_format_model/ort_model_processor.py
index d3a07efe92aa5..b20f3a0cfd97d 100644
--- a/tools/python/util/ort_format_model/ort_model_processor.py
+++ b/tools/python/util/ort_format_model/ort_model_processor.py
@@ -35,7 +35,7 @@ def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}):  # noqa:
         :return: Dictionary of NodeArg name to TypeInfo
         """
         value_name_to_typeinfo = outer_scope_value_typeinfo.copy()
-        for j in range(0, graph.NodeArgsLength()):
+        for j in range(graph.NodeArgsLength()):
             n = graph.NodeArgs(j)
             value_name_to_typeinfo[n.Name()] = n.Type()  # TypeInfo for this NodeArg's name
 
@@ -57,7 +57,7 @@ def _process_graph(self, graph: fbs.Graph, outer_scope_value_typeinfo: dict):
         # Merge the TypeInfo for all values in this level of the graph with the outer scope value TypeInfo.
         value_name_to_typeinfo = OrtFormatModelProcessor._setup_type_info(graph, outer_scope_value_typeinfo)
 
-        for i in range(0, graph.NodesLength()):
+        for i in range(graph.NodesLength()):
             node = graph.Nodes(i)
             optype = node.OpType().decode()
 
@@ -69,7 +69,7 @@ def _process_graph(self, graph: fbs.Graph, outer_scope_value_typeinfo: dict):
                 self._op_type_processors.process_node(node, value_name_to_typeinfo)
 
             # Read all the attributes
-            for j in range(0, node.AttributesLength()):
+            for j in range(node.AttributesLength()):
                 attr = node.Attributes(j)
                 attr_type = attr.Type()
                 if attr_type == fbs.AttributeType.AttributeType.GRAPH:
@@ -77,7 +77,7 @@ def _process_graph(self, graph: fbs.Graph, outer_scope_value_typeinfo: dict):
                 elif attr_type == fbs.AttributeType.AttributeType.GRAPHS:
                     # the ONNX spec doesn't currently define any operators that have multiple graphs in an attribute
                     # so entering this 'elif' isn't currently possible
-                    for k in range(0, attr.GraphsLength()):
+                    for k in range(attr.GraphsLength()):
                         self._process_graph(attr.Graphs(k), value_name_to_typeinfo)

     def process(self):