From 88336ffa92e188e0866799dc559a9b6154980cf3 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 11 Jul 2024 13:35:08 +0800 Subject: [PATCH] Fix typos - 1st Wave (#21278) ### Description There are many typos reported by the reviewdog [Optional Lint] actions (example: https://github.com/microsoft/onnxruntime/actions/runs/9864564489/job/27239732367). This PR fixes some of them. ### Motivation and Context --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../NativeMethods.shared.cs | 2 +- .../OrtValue.shared.cs | 76 +++++++++---------- .../InferenceTest.cs | 8 +- .../Program.cs | 2 +- .../core/eager/ort_kernel_invoker.h | 5 +- .../core/framework/execution_provider.h | 2 +- .../onnxruntime/core/framework/op_kernel.h | 2 +- .../core/framework/stream_handles.h | 4 +- .../core/optimizer/graph_transformer_config.h | 4 +- js/web/lib/onnxjs/execution-plan.ts | 2 +- js/web/lib/onnxjs/graph.ts | 2 +- js/web/lib/onnxjs/util.ts | 2 +- js/web/lib/wasm/wasm-core-impl.ts | 2 +- js/web/script/test-runner-cli.ts | 2 +- .../cpu/attnlstm/attention_wrapper.cc | 2 +- onnxruntime/contrib_ops/cpu/cdist.cc | 2 +- onnxruntime/contrib_ops/cpu/inverse.cc | 2 +- .../cuda/bert/decoder_attention.cc | 2 +- .../cuda/bert/embed_layer_norm_impl.cu | 2 +- .../cuda/bert/flash_attention/flash_api.cc | 2 +- .../cuda/bert/longformer_attention_softmax.cu | 2 +- .../cross_attention/fmha_cross_attention.h | 4 +- .../cuda/collective/sharding_spec.h | 2 +- onnxruntime/contrib_ops/cuda/math/fft_ops.cc | 2 +- .../transformers/generation_device_helper.cc | 2 +- .../rocm/bert/group_query_attention.cu | 2 +- .../contrib_ops/rocm/math/gemm_float8.cu | 2 +- .../codegen/passes/op_ir_creator/nn/lstm.cc | 2 +- onnxruntime/core/common/cpuid_info.h | 4 +- onnxruntime/core/common/helper.cc | 2 +- onnxruntime/core/common/threadpool.cc | 2 +- .../contrib_ops/onnx_deprecated_operators.cc | 2 +- onnxruntime/core/optimizer/attention_fusion.h | 3 +- .../core/optimizer/attention_fusion_helper.h | 3 +- .../common_subexpression_elimination.cc | 3 +- onnxruntime/core/optimizer/gather_fusion.h | 2 +- onnxruntime/core/optimizer/gemm_sum_fusion.h | 4 +- .../core/optimizer/identity_elimination.h | 2 +- .../core/optimizer/isinf_reducesum_fusion.cc | 2 +- onnxruntime/core/optimizer/matmul_bn_fusion.h | 4 +- .../core/optimizer/transformer_memcpy.cc | 2 +- .../coreml/builders/impl/resize_op_builder.cc | 4 +- .../core/providers/dnnl/subgraph/dnnl_conv.h | 8 +- .../providers/dnnl/subgraph/dnnl_convgrad.h | 8 +- .../providers/dnnl/subgraph/dnnl_qattention.h | 2 +- .../nnapi_lib/nnapi_implementation.h | 2 +- .../builders/impl/elementwise_op_builder.h | 44 +++++------ .../tools/transformers/dev_benchmark.cmd | 4 +- .../tools/transformers/onnx_model_t5.py | 2 +- .../tools/transformers/run_benchmark.sh | 2 +- .../framework/insert_cast_transformer_test.cc | 4 +- onnxruntime/tool/etw/eparser.cc | 4 +- .../torch/custom_function_register.h | 8 +- .../core/framework/torch/dlpack_python.cc | 10 +-- .../core/framework/torch/dlpack_python.h | 10 +-- .../core/framework/torch/torch_proxy.h | 2 +- .../core/graph/gradient_builder_base.cc | 18 +++-- .../core/graph/gradient_builder_base.h | 4 +- .../core/graph/loss_function_registry.h | 2 +- .../compute_optimizer/padding_elimination.cc | 2 +- .../compute_optimizer/padding_elimination.h | 6 +- .../sceloss_compute_optimization.h | 4 +- .../core/optimizer/insert_output_rewriter.h | 4 +- .../memory_optimizer/recompute_analysis.cc | 4 +- .../memory_optimizer/recompute_analysis.h | 2 +-
.../memory_optimizer/transformer_specific.cc | 2 +- .../orttraining/core/optimizer/qdq_fusion.h | 2 +- .../core/optimizer/transpose_replacement.h | 4 +- .../training/optim/_apex_amp_modifier.py | 2 +- .../python/training/optim/_ds_modifier.py | 2 +- .../training/optim/_megatron_modifier.py | 2 +- .../ortmodule/_custom_gradient_registry.py | 2 +- .../ortmodule/_custom_op_symbolic_registry.py | 10 +-- .../training/ortmodule/_execution_agent.py | 2 +- .../ortmodule/_graph_execution_manager.py | 2 +- .../ortmodule/_graph_transition_manager.py | 2 +- .../ortmodule/_mem_efficient_grad_mgmt.py | 2 +- .../training/ortmodule/_training_manager.py | 4 +- .../ortmodule/graph_optimizers/utils.py | 4 +- .../python/training/ortmodule/options.py | 4 +- .../python/training/ortmodule/ortmodule.py | 2 +- 81 files changed, 194 insertions(+), 194 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 13d925e0fc2ee..44d2222dbce16 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -1357,7 +1357,7 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca OrtAllocatorType allocatorType, int identifier, OrtMemType memType, - out IntPtr /*(OrtMemoryInfo*)*/ allocatorInfo // memory ownership transfered to caller + out IntPtr /*(OrtMemoryInfo*)*/ allocatorInfo // memory ownership transferred to caller ); public static DOrtCreateMemoryInfo OrtCreateMemoryInfo; diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs index 163a2b394c4ae..5946e9fb1b165 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs @@ -22,7 +22,7 @@ public enum OnnxValueType ONNX_TYPE_MAP = 3, // It's a map ONNX_TYPE_OPAQUE = 4, // It's an experimental Opaque object ONNX_TYPE_SPARSETENSOR = 5, // It's a Sparse Tensor - ONNX_TYPE_OPTIONAL = 6, // It's an optional type that designates anything above (except UNKOWN) + ONNX_TYPE_OPTIONAL = 6, // It's an optional type that designates anything above (except UNKNOWN) } /// @@ -31,7 +31,7 @@ public enum OnnxValueType /// The class implements IDisposable and must /// be disposed of, otherwise native resources will leak /// and will eventually cause the application to slow down or crash. - /// + /// /// If the OrtValue instance is constructed over a managed memory, and it is not /// disposed properly, the pinned memory will continue to be pinned and interfere /// with GC operation. @@ -72,7 +72,7 @@ internal OrtValue(IntPtr handle, OnnxValueType onnxValueType) /// Constructor. The newly constructed OrtValue takes ownership of the native OrtValue instance /// and disposes of it when the OrtValue instance is disposed. The instance will take ownership and will /// dispose of compositeMembers instances. - /// + /// /// This constructor can only throw if OnnxType is not specified. /// /// native ortValue handle @@ -189,10 +189,10 @@ public OrtValue GetValue(int index, OrtAllocator allocator) /// /// Returns a ReadOnlySpan over tensor native buffer that /// provides a read-only view. - /// + /// /// Note, that the memory may be device allocated and, therefore, not accessible from the CPU. /// To get memory descriptor use GetTensorMemoryInfo(). - /// + /// /// OrtValue must contain a non-string tensor. 
/// The span is valid as long as the OrtValue instance is alive (not disposed). /// @@ -210,10 +210,10 @@ public ReadOnlySpan GetTensorDataAsSpan() where T : unmanaged /// This enables you to safely and efficiently modify the underlying /// native buffer in a type-safe manner. This is useful for example in IOBinding scenarios /// where you want to modify results of the inference and feed it back as input. - /// + /// /// Note, that the memory may be device allocated. /// To get memory descriptor use GetTensorMemoryInfo(). - /// + /// /// OrtValue must contain a non-string tensor. /// The span is valid as long as the OrtValue instance is alive (not disposed). /// @@ -237,7 +237,7 @@ public Span GetTensorMutableRawData() /// /// Fetch string tensor element buffer pointer at the specified index, /// convert/copy to UTF-16 char[] and return a ReadOnlyMemory instance. - /// + /// /// Obtain TensorTypeAndShape to get shape and element count. /// /// flat string tensor element index @@ -256,7 +256,7 @@ public ReadOnlyMemory GetStringElementAsMemory(int index) /// /// Fetch string tensor element buffer pointer at the specified index, /// copy/convert UTF-8 into a UTF-16 string and return it. - /// + /// /// Obtain TensorTypeAndShape to get shape and element count. /// /// flat string tensor element index @@ -279,7 +279,7 @@ public string GetStringElement(int index) /// /// Get a span over the native memory of the string tensor element. /// The span is valid as long as the OrtValue is valid. - /// + /// /// This is useful if you want to perform your own UTF-8 decoding or /// you do not care about decoding. /// Obtain TensorTypeAndShape to get shape and element count. @@ -483,7 +483,7 @@ private Span GetTensorBufferRawData(Type requestedType) /// This can be a piece of arbitrary memory that may be allocated by OrtAllocator (possibly on a device), /// a chunk of managed memory (must be pinned for the duration of OrtValue lifetime) or a memory that is allocated /// natively allocated using Marshal.AllocHGlobal(), stackalloc or other means (may be on a device). - /// + /// /// The resulting OrtValue does not own the underlying memory buffer and will not attempt to /// deallocate it. The caller must make sure that the memory remains valid for the duration of OrtValue lifetime. /// @@ -769,12 +769,12 @@ out IntPtr valueHandle /// Converts the string argument represented by ReadOnlySpan to UTF-8, /// allocates space in the native tensor and copies it into the native tensor memory. /// Typically, this is used to populate a new empty string tensor element. - /// + /// /// The number of elements is according to the shape supplied to CreateTensorWithEmptyStrings(). /// However, this API can also be used to overwrite any existing element within the string tensor. - /// + /// /// In general, to obtain the number of elements for any tensor, use GetTensorTypeAndShape() which - /// would return a disposable instance of TensorTypeAndShapeInfo. + /// would return a disposable instance of TensorTypeAndShapeInfo. /// Then call GetElementCount() or GetShape(). /// /// ReadOnlySpan over chars @@ -795,12 +795,12 @@ public void StringTensorSetElementAt(ReadOnlySpan str, int index) /// Converts the string argument represented by ReadOnlyMemory to UTF-8, /// allocates space in the native tensor and copies it into the native tensor memory. /// Typically, this is used to populate a new empty string tensor element. - /// + /// /// The number of elements is according to the shape supplied to CreateTensorWithEmptyStrings(). 
/// However, this API can also be used to overwrite any existing element within the string tensor. - /// + /// /// In general, to obtain the number of elements for any tensor, use GetTensorTypeAndShape() which - /// would return a disposable instance of TensorTypeAndShapeInfo. + /// would return a disposable instance of TensorTypeAndShapeInfo. /// Then call GetElementCount() or GetShape(). /// /// @@ -815,7 +815,7 @@ public void StringTensorSetElementAt(ReadOnlyMemory rom, int index) /// /// This API resizes String Tensor element to the requested amount of bytes (UTF-8) /// and copies the bytes from the supplied ReadOnlySpan into the native tensor memory (resized buffer). - /// + /// /// The API is useful for quick loading of utf8 data into the native tensor memory. /// /// read only span of bytes @@ -841,7 +841,7 @@ public void StringTensorSetElementAt(ReadOnlySpan utf8Bytes, int index) /// Creates an OrtValue that contains a string tensor. /// String tensors are always allocated on CPU. /// String data will be converted to UTF-8 and copied to native memory. - /// + /// /// Note, this is different from creating an OrtValue from other primitive data types /// where memory is pinned (if necessary) and the OrtValue points to that chunk of memory. /// @@ -885,10 +885,10 @@ public static OrtValue CreateFromStringTensor(Tensor tensor) /// Creates a sequence of OrtValues from a collection of OrtValues. /// All OrtValues in the collection must be of the same Onnx type. /// I.e. (Tensor, SparseTensor, Map, Sequence, etc.) - /// + /// /// The ortValues that are passed as argument are taken possession of by the newly /// created OrtValue. The caller should not dispose them, unless this call fails. - /// + /// /// The ortValues would be empty on successful return. /// /// a collection of OrtValues. On success the ortValues contained in the list @@ -978,24 +978,24 @@ public void ProcessSequence(SequenceElementVisitor visitor, OrtAllocator allocat /// Creates a map OrtValue with keys and values. /// On a high level the Onnxruntime representation of the map always consists of two /// OrtValues, keys and values. - /// + /// /// According to ONNX standard map keys can be unmanaged types only (or strings). /// Those keys are contained in a single tensor within OrtValue keys. - /// + /// /// Map values, on the other hand, can be composite types. The values parameter /// can either contain a single tensor with unmanaged map values with the same number of /// elements as the keys, or it can be a sequence of OrtValues, /// each of those can be a composite type (tensor, sequence, map). If it is a sequence, /// then the number of elements must match the number of elements in keys. - /// + /// /// Keys and values must be in the same order. - /// + /// /// ORT supports only a subset of types for keys and values, however, this API does not /// restrict it. - /// + /// /// The ortValues that are passed as argument are taken possession of by the newly /// created OrtValue. The caller should not dispose them, unless this call fails. - /// + /// /// Keys and values arguments will be set to null on success. /// /// Contains keys @@ -1031,10 +1031,10 @@ public static OrtValue CreateMap(ref OrtValue keys, ref OrtValue values) /// This API helps to quickly creates a map OrtValue with unmanaged (primitive) keys and values specified as arrays. /// This helps the user not to create OrtValues for keys and values separately and deal only with the final result. 
/// The map would consist of two tensors, one for keys and one for values. - /// + /// /// The OrtValues would be created on top of the managed memory arrays and use it directly. /// The number of elements in keys and values must be the same and they must be in order. - /// + /// /// The types must be unmanaged. /// /// keys type @@ -1078,10 +1078,10 @@ public static OrtValue CreateMap(K[] keys, V[] values) where K : unmanaged /// This helps the user not to create OrtValues for keys and values separately. /// The number of elements in keys and values must be the same and they must be in order. /// The map would consist of two tensors, one for keys and one for values. - /// + /// /// string keys would be converted to UTF-8 encoding and copied to an allocated native memory. /// The OrtValue for values would be created on top of the managed memory using it directly. - /// + /// /// The values type must be unmanaged. /// /// @@ -1128,13 +1128,13 @@ public static OrtValue CreateMapWithStringKeys(IReadOnlyCollection ke /// /// Creates a map OrtValue with non-string keys and string values. - /// + /// /// This helps the user not to create OrtValues for keys and values separately. /// The number of elements in keys and values must be the same and they must be in order. - /// + /// /// The OrtValue for keys would be created on top of the managed memory using it directly. /// string values would be converted to UTF-8 encoding and copied to an allocated native memory. - /// + /// /// /// unmanaged type of keys /// @@ -1182,17 +1182,17 @@ public static OrtValue CreateMapWithStringValues(K[] keys, IReadOnlyCollectio /// Typically, when one uses GetValue() API, it creates a copy of OrtValue /// that points to the same buffer as keys or values. This API helps to deal with those /// temporary instances and avoid leaks. - /// + /// /// According to ONNX standard map keys can be unmanaged types only (or strings). /// Those keys are contained in a single tensor within OrtValue keys. So you can query those /// directly from keys argument. - /// + /// /// Map values, on the other hand, can be composite types. The values parameter /// can either contain a single tensor with unmanaged map values with the same number of /// elements as the keys, or it can be a sequence of OrtValues, /// each of those can be a composite type (tensor, sequence, map). If it is a sequence, /// then the number of elements must match the number of elements in keys. - /// + /// /// Depending on the structure of the values, one will either directly query a single tensor /// from values, or will have to iterate over the sequence of OrtValues and visit each of those /// resulting in a recursive visitation. @@ -1204,7 +1204,7 @@ public static OrtValue CreateMapWithStringValues(K[] keys, IReadOnlyCollectio /// /// This API helps the user to process a map OrtValue without /// having to deal with the lifespan of intermediate OrtValues. - /// + /// /// each API value is fed to the vistor functor. 
/// /// visitor function diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs index d6a6b9627f418..0892e17fc97bc 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs @@ -116,7 +116,7 @@ public void TestSessionOptions() var directml_dll_path = AppDomain.CurrentDomain.BaseDirectory; SetDllDirectory(directml_dll_path); - + try { opt.AppendExecutionProvider_DML(0); @@ -124,7 +124,7 @@ public void TestSessionOptions() catch (OnnxRuntimeException ortException) { // if we run on a CI machine with the incorrect hardware we might get an error due to that. - // allow that as the call made it through to the DML EP so the C# layer is working correctly. + // allow that as the call made it through to the DML EP so the C# layer is working correctly. // any other exception type or error message is considered a failure. Assert.Contains("The specified device interface or feature level is not supported on this system.", ortException.Message); @@ -1895,7 +1895,7 @@ private void TestSharedAllocatorUsingCreateAndRegisterAllocator() sessionOptions.AddSessionConfigEntry("session.use_env_allocators", "1"); // Create two sessions to share the allocator - // Create a thrid session that DOES NOT use the allocator in the environment + // Create a third session that DOES NOT use the allocator in the environment using (var session1 = new InferenceSession(model, sessionOptions)) using (var session2 = new InferenceSession(model, sessionOptions)) using (var session3 = new InferenceSession(model)) // Use the default SessionOptions instance @@ -2127,7 +2127,7 @@ private void TestLoadAzureEP() } catch (Exception) { Assert.True(false); - } + } } } } diff --git a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Program.cs b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Program.cs index 9370a03f7fbeb..a005efa749a1c 100644 --- a/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Program.cs +++ b/csharp/tools/Microsoft.ML.OnnxRuntime.PerfTool/Program.cs @@ -32,7 +32,7 @@ class CommandOptions [Option('i', "input_file", Required = false, HelpText = "Input file.")] public string InputFile { get; set; } - [Option('p', Required = false, HelpText = "Run with parallel exection. Default is false")] + [Option('p', Required = false, HelpText = "Run with parallel execution. Default is false")] public bool ParallelExecution { get; set; } = false; [Option('o', "optimization_level", Required = false, HelpText = "Optimization Level. 
Default is 99, all optimization.")] diff --git a/include/onnxruntime/core/eager/ort_kernel_invoker.h b/include/onnxruntime/core/eager/ort_kernel_invoker.h index 1d1046742db4b..fcf92de2ee39a 100644 --- a/include/onnxruntime/core/eager/ort_kernel_invoker.h +++ b/include/onnxruntime/core/eager/ort_kernel_invoker.h @@ -24,7 +24,10 @@ class ORTInvoker { public: ORTInvoker(std::shared_ptr execution_provider, const logging::Logger& logger, - const IOnnxRuntimeOpSchemaRegistryList& custom_op_registries) : execution_provider_(std::move(execution_provider)), logger_(logger), custom_op_registries_(custom_op_registries) { + const IOnnxRuntimeOpSchemaRegistryList& custom_op_registries) + : execution_provider_(std::move(execution_provider)), + logger_(logger), + custom_op_registries_(custom_op_registries) { if (!execution_provider_) { ORT_THROW("Execution provider is nullptr"); } diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 16ad943a5f47e..49c3d1bdd088a 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -231,7 +231,7 @@ class IExecutionProvider { const std::reference_wrapper filtered_graph; }; - // Fusion approach that is suppported + // Fusion approach that is supported // !!! The "Function" FusionStyle is deprecated. // !!! If your EP is using this fusion style, please migrate it to "FilteredGraphViewer" style. enum class FusionStyle { diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index ddaa1ebd088a1..07625c38d8474 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -94,7 +94,7 @@ class OpKernel { } // Override this function to return a list of attributes the session can safely remove - // after it is intialized and saved. This option is useful to reduce memory usage + // after it is initialized and saved. This option is useful to reduce memory usage // when the kernel does not reuse the operator attributes but copies them. // All attributes returned by this method will be removed by method // PruneRemovableAttributes of they exists. diff --git a/include/onnxruntime/core/framework/stream_handles.h b/include/onnxruntime/core/framework/stream_handles.h index 26d78133b52fc..9c987f10ccadb 100644 --- a/include/onnxruntime/core/framework/stream_handles.h +++ b/include/onnxruntime/core/framework/stream_handles.h @@ -54,14 +54,14 @@ class Stream { // update its lookup table with the table snapshot in notification. // The memory reusing strategy is: // A kernel in current stream is safe to reuse another stream's memory chunk - // as long as the reused chunk's timestamp is less than the last synchonized + // as long as the reused chunk's timestamp is less than the last synchronized // timestamp recorded in the lookup table. // Get the current timestamp uint64_t GetCurrentTimestamp() const { return timestamp_; } // return the timestamp when the last synchronization happened between target stream and current stream. - // return 0 if no synchonization happened. + // return 0 if no synchronization happened. // if target_stream is nullptr, it means it is a sequence running on device doesn't support Stream (i.e. CPU) // we can safely return 0 in that case to save a lookup. 
uint64_t GetLastSyncTimestampWithTargetStream(Stream* target_stream) const { diff --git a/include/onnxruntime/core/optimizer/graph_transformer_config.h b/include/onnxruntime/core/optimizer/graph_transformer_config.h index c112d9b0480ab..6af48331270cd 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_config.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_config.h @@ -13,7 +13,7 @@ struct GraphTransformerConfiguration { /* * Cast propagation strategy. * One strategy is to insert casts around all the nodes with the allowed opcodes - * and reduce, by removing redundent-casts and back-to-back-casts etc., and + * and reduce, by removing redundant-casts and back-to-back-casts etc., and * the other is to propagate casts using flood-fill approach, expanding float16 regions in the graph * traversing the graph up/down. */ @@ -70,4 +70,4 @@ constexpr GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy constexpr bool operator==(GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy, GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy); constexpr bool operator!=(GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy, GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy); -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/js/web/lib/onnxjs/execution-plan.ts b/js/web/lib/onnxjs/execution-plan.ts index 5599087ab46f5..e155ff123f79d 100644 --- a/js/web/lib/onnxjs/execution-plan.ts +++ b/js/web/lib/onnxjs/execution-plan.ts @@ -92,7 +92,7 @@ export class ExecutionPlan { const inputTensors = inputList as Tensor[]; Logger.verbose( 'ExecPlan', - `Runing op:${thisOp.node.name} (${ + `Running op:${thisOp.node.name} (${ inputTensors.map((t, i) => `'${thisOp.node.inputs[i]}': ${t.type}[${t.dims.join(',')}]`).join(', ')})`); const outputList = await this.profiler.event( diff --git a/js/web/lib/onnxjs/graph.ts b/js/web/lib/onnxjs/graph.ts index f16da42815957..d444be2bf7ce0 100644 --- a/js/web/lib/onnxjs/graph.ts +++ b/js/web/lib/onnxjs/graph.ts @@ -674,7 +674,7 @@ class GraphImpl implements Graph, Graph.Transformer { } /** - * Delete the specifed node. Assume the node has one incoming input and the first output connected to other nodes. + * Delete the specified node. Assume the node has one incoming input and the first output connected to other nodes. * An input validation must be done before calling this function. * @param nodeIndex The index of node to be deleted */ diff --git a/js/web/lib/onnxjs/util.ts b/js/web/lib/onnxjs/util.ts index d697a8b3138cf..22c4e4c755f55 100644 --- a/js/web/lib/onnxjs/util.ts +++ b/js/web/lib/onnxjs/util.ts @@ -474,7 +474,7 @@ export class ProtoUtil { export class LongUtil { // This function is called to get a number from long type of data for attribute, dim, and ir version, // which values are signed integers. - // To make it more generic, add an optional paramter to convert to a unsigned number. + // To make it more generic, add an optional parameter to convert to a unsigned number. static longToNumber(n: Long|flatbuffers.Long|number, unsigned?: boolean) { if (Long.isLong(n)) { return n.toNumber(); diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 905bbf0621014..9fc8786192c5c 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -74,7 +74,7 @@ const initOrt = (numThreads: number, loggingLevel: number): void => { }; /** - * intialize runtime environment. 
+ * initialize runtime environment. * @param env passed in the environment config object. */ export const initRuntime = async(env: Env): Promise => { diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index 6718dcb639a47..fbde81524ccec 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -613,7 +613,7 @@ async function main() { // == The Problem == // every time when a test is completed, it will be added to the recovery page list. // if we run the test 100 times, there will be 100 previous tabs when we launch Edge again. - // this run out of resources quickly and fails the futher test. + // this run out of resources quickly and fails the further test. // and it cannot recover by itself because every time it is terminated forcely or crashes. // and the auto recovery feature has no way to disable by configuration/commandline/registry // diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc index 63ec5be8c2900..72c5a813e3d76 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc @@ -57,7 +57,7 @@ void AttentionWrapper::ProcessOutput(const gsl::span& rnn_cell_outpu if (has_attn_layer_) { // concat([p_cell_output, context]) * stack([attn_layer_cell_weights_, attn_layer_attn_weights_]) = // p_cell_output * attn_layer_cell_weights_ + context * attn_layer_attn_weights_ - // The first part is calulated above. Here just add the later. + // The first part is calculated above. Here just add the later. math::GemmEx(CblasNoTrans, CblasNoTrans, batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0}, attn_context_.data(), attn_context_depth_, diff --git a/onnxruntime/contrib_ops/cpu/cdist.cc b/onnxruntime/contrib_ops/cpu/cdist.cc index d0ed81a9a6dc3..736dbcfede2fc 100644 --- a/onnxruntime/contrib_ops/cpu/cdist.cc +++ b/onnxruntime/contrib_ops/cpu/cdist.cc @@ -67,7 +67,7 @@ static void CalculateSqeuclidean(const Tensor& a, const Tensor& b, Tensor& c, co threadpool); #else // the performance of this isn't great as the eigen matmul is single threaded by default - // if you're on x86 and care about performance try MKL first. if there's a good enough argument for optimising this + // if you're on x86 and care about performance try MKL first. if there's a good enough argument for optimizing this // we can look into it in the future. 
ORT_UNUSED_PARAMETER(threadpool); diff --git a/onnxruntime/contrib_ops/cpu/inverse.cc b/onnxruntime/contrib_ops/cpu/inverse.cc index 355b036e36d0a..54bd99d209574 100644 --- a/onnxruntime/contrib_ops/cpu/inverse.cc +++ b/onnxruntime/contrib_ops/cpu/inverse.cc @@ -53,7 +53,7 @@ struct Inverse::ComputeImpl { void operator()(const Tensor* input, Tensor* output, int64_t batch_num, int64_t rows, int64_t cols) const { auto batch_offset = batch_num * rows * cols; - // Direct cast to half as it just as MLFloat16 containes only uint16_t + // Direct cast to half as it just as MLFloat16 contains only uint16_t const auto* input_data = reinterpret_cast(input->Data() + batch_offset); auto* output_data = reinterpret_cast(output->MutableData() + batch_offset); diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc index ceee17c2a2d01..ee49f362564a6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc @@ -282,7 +282,7 @@ Status DecoderAttention::ComputeInternal(OpKernelContext* context) const { &one, reinterpret_cast(gemm_query_buffer_p.get()), n, device_prop, UseTF32())); // gemm_query_buffer in col-base: (h2, S*B) - // calcualte k, v + // calculate k, v n = 2 * hidden_size; k = hidden_size; if (!has_layer_state_ || !use_past_) { diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu index ae53eca541fa5..8a17e945df3f3 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu @@ -141,7 +141,7 @@ __global__ void EmbedLayerNormKernel( } __syncthreads(); - // 2. load pos/segment/word embeddings and add them toghether + // 2. load pos/segment/word embeddings and add them together // offset into embeddings is given by word_id * hidden_size const int position_offset = position_id * hidden_size; diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc index e04cdf369c6ac..90f0b94cafce8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc @@ -107,7 +107,7 @@ void set_params_fprop(Flash_fwd_params& params, params.scale_softmax = softmax_scale; params.scale_softmax_log2 = softmax_scale * M_LOG2E; - // In our API, causal/unidirectional determines if we only look at prior tokens. However, the flash API seperates + // In our API, causal/unidirectional determines if we only look at prior tokens. However, the flash API separates // local and causal, meaning when we have local window size params.is_causal = is_causal; if (is_causal && (window_size_left >= 0 || window_size_right != 0)) { diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu index d610051c77e50..2c251246267b7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu @@ -148,7 +148,7 @@ __launch_bounds__(blockSize) // following the Softmax. 
// // For now zero-out only [row_index - 2*attention_window, row_index + 2*attention_window], - // we can even be more agressive and reduce the zeroing out window size since + // we can even be more aggressive and reduce the zeroing out window size since // each row has entries in 3 blocks (3*attention_window size instead of 4*attention_window) int zero_start = row_index - 2 * attention_window; if (zero_start < 0) { diff --git a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h index 2c9dc3689f882..116b9fb80da4d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/cross_attention/fmha_cross_attention.h @@ -41,7 +41,7 @@ struct Gmem_params { // Hidden dim per head int32_t d; - // array of length b+1 holding prefix sum of actual sequence lenghts. + // array of length b+1 holding prefix sum of actual sequence lengths. int32_t* cu_seqlens; }; @@ -69,7 +69,7 @@ struct Fused_multihead_attention_params_mhca { // See https://confluence.nvidia.com/pages/viewpage.action?pageId=302779721 for details. bool enable_i2f_trick; - // array of length b+1 holding prefix sum of actual sequence lenghts + // array of length b+1 holding prefix sum of actual sequence lengths int32_t* cu_seqlens; // use C/32 Format. diff --git a/onnxruntime/contrib_ops/cuda/collective/sharding_spec.h b/onnxruntime/contrib_ops/cuda/collective/sharding_spec.h index 5abc50a61c9a3..5b47273bc8f2f 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharding_spec.h +++ b/onnxruntime/contrib_ops/cuda/collective/sharding_spec.h @@ -37,7 +37,7 @@ class DeviceMesh { // corresponding sharding spec is a string "S[1]S[0]". // If that 2-D tensor's value is np.array([[5, 6], [7, 8]]), // GPU 0/1/2/3 owns 5/7/6/8. Below is a visualization the sharding - // proccess. + // process. // - Start with a 2-D device mesh [[0, 1], [2, 3]] and // a 2-D tensor [[5, 6], [7, 8]] // - GPU: [[0, 1], [2, 3]], Tensor: [[5, 6], [7, 8]] diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc index 4b524dcf795a2..65bec758ae525 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc @@ -87,7 +87,7 @@ Status FFTBase::DoFFT(OpKernelContext* context, const Tensor* X, bool complex int64_t batch_size = (batch_ndim == 0 ? 1 : input_shape.SizeToDimension(batch_ndim)); // infer output shape - // copy the input shape up to the second last dimention + // copy the input shape up to the second last dimension std::vector output_dims, signal_dims; int i = 0; for (; i < batch_ndim + signal_ndim_ - 1; ++i) { diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 09d2dba7d203a..e047bd948434d 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -53,7 +53,7 @@ namespace GenerationCudaDeviceHelper { // e.g In the case of past(fp32) -> cast to fp16 -> Attention(fp16), the reorder // function will use the fp32 chunk size and cause the model silently generates // the incorrect results. -// TODO: Fix this issue. 
Either retrive the Attention op type from the graph or +// TODO: Fix this issue. Either retrieve the Attention op type from the graph or // check the type of past state as graph input should be same as Attention op type. // It might be better to forcefully require the same type since cast node generates // extra overhead. diff --git a/onnxruntime/contrib_ops/rocm/bert/group_query_attention.cu b/onnxruntime/contrib_ops/rocm/bert/group_query_attention.cu index 92c780d4a9d41..7a16eb38181aa 100644 --- a/onnxruntime/contrib_ops/rocm/bert/group_query_attention.cu +++ b/onnxruntime/contrib_ops/rocm/bert/group_query_attention.cu @@ -360,7 +360,7 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* ctx) const { max_thr_per_blk)); // NOTE: ORT: seqlens_k Indicates past sequence lengths for token generation case. - // we should call fmha with total sequence lenghts + // we should call fmha with total sequence lengths seqlens_k_tmp = GetScratchBuffer(batch_size * sizeof(int), ctx->GetComputeStream()); ORT_RETURN_IF_ERROR(LaunchSeqlensInc(hip_stream, seqlens_k_ptr, seqlens_k_tmp.get(), batch_size, sequence_length)); seqlens_k_ptr = seqlens_k_tmp.get(); diff --git a/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu b/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu index 1e175b37b02d8..b65841b359647 100644 --- a/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu +++ b/onnxruntime/contrib_ops/rocm/math/gemm_float8.cu @@ -78,7 +78,7 @@ Status GemmFloat8::ComputeInternal(OpKernelContext* ctx) const { auto m = !transA_ ? a_shape[0] : a_shape[1]; auto k = !transA_ ? a_shape[1] : a_shape[0]; - ORT_ENFORCE(k == (!transB_ ? b_shape[0] : b_shape[1])); // k is compatiable + ORT_ENFORCE(k == (!transB_ ? b_shape[0] : b_shape[1])); // k is compatible auto n = !transB_ ? b_shape[1] : b_shape[0]; TensorShapeVector output_shape = {m, n}; diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc index 5c2557142dd0e..88170bb56dd2d 100644 --- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc +++ b/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc @@ -9,7 +9,7 @@ namespace onnxruntime { namespace tvm_codegen { -// In the cell computation, we don't have the "direction" dimention and sequence dimension, +// In the cell computation, we don't have the "direction" dimension and sequence dimension, // which have been processed outside of the cell. // Here we implement an LTSM cell. 
// For those args (inputs/outputs) of hidden states we put AFTER regular args (inputs/outputs) diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 0bee36e4d10b3..4c9e7e80db49b 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -61,7 +61,7 @@ class CPUIDInfo { /** * @brief Some ARMv8 power efficient core has narrower 64b load/store - * that needs specialized optimiztion in kernels + * that needs specialized optimization in kernels * @return whether the indicated core has narrower load/store device */ bool IsCoreArmv8NarrowLd(uint32_t coreId) const { @@ -73,7 +73,7 @@ class CPUIDInfo { /** * @brief Some ARMv8 power efficient core has narrower 64b load/store - * that needs specialized optimiztion in kernels + * that needs specialized optimization in kernels * @return whether the current core has narrower load/store device */ bool IsCurrentCoreArmv8NarrowLd() const { diff --git a/onnxruntime/core/common/helper.cc b/onnxruntime/core/common/helper.cc index 7b7073634989d..6a52db73df106 100644 --- a/onnxruntime/core/common/helper.cc +++ b/onnxruntime/core/common/helper.cc @@ -56,7 +56,7 @@ void PrintFinalMessage(const char* msg) { #else // TODO, consider changing the output of the error message from std::cerr to logging when the // exceptions are disabled, since using std::cerr might increase binary size, and std::cerr output - // might not be easily accesible on some systems such as mobile + // might not be easily accessible on some systems such as mobile // TODO, see if we need to change the output of the error message from std::cerr to NSLog for iOS std::cerr << msg << std::endl; #endif diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index 10e117267e14b..7b62de799b6fc 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -636,7 +636,7 @@ bool ThreadPool::ShouldParallelize(const concurrency::ThreadPool* tp) { } int ThreadPool::DegreeOfParallelism(const concurrency::ThreadPool* tp) { - // When not using OpenMP, we parallelise over the N threads created by the pool + // When not using OpenMP, we parallelize over the N threads created by the pool // tp, plus 1 for the thread entering a loop. 
if (tp) { if (tp->force_hybrid_ || CPUIDInfo::GetCPUIDInfo().IsHybrid()) { diff --git a/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc b/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc index be39a2d7ec2b2..b1b7cf346a27c 100644 --- a/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc +++ b/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc @@ -395,7 +395,7 @@ ONNX_CONTRIB_OPERATOR_SET_SCHEMA( const auto input_rank = input_shape.dim_size(); if (input_rank != 4) fail_shape_inference("Input's shape must be 4-D"); - // parse necessary attributes for futher processing + // parse necessary attributes for further processing std::vector border; bool border_present = getRepeatedAttribute(ctx, "border", border); if (!border_present || border.size() != 4) diff --git a/onnxruntime/core/optimizer/attention_fusion.h b/onnxruntime/core/optimizer/attention_fusion.h index acb478da5f31a..befb66b5aa960 100644 --- a/onnxruntime/core/optimizer/attention_fusion.h +++ b/onnxruntime/core/optimizer/attention_fusion.h @@ -19,7 +19,8 @@ class AttentionFusion : public GraphTransformer { Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; private: - static bool FuseSubGraph(Node& layer_norm, const Node& add_after_layer_norm, Graph& graph, int64_t hidden_size, std::map& mask_index_map, const logging::Logger& logger); + static bool FuseSubGraph(Node& layer_norm, const Node& add_after_layer_norm, Graph& graph, int64_t hidden_size, + std::map& mask_index_map, const logging::Logger& logger); }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h index 6233e5b839bb9..ca744adddbeec 100644 --- a/onnxruntime/core/optimizer/attention_fusion_helper.h +++ b/onnxruntime/core/optimizer/attention_fusion_helper.h @@ -23,7 +23,8 @@ struct MatchGemmResult { }; // Compare the expected parameters (starts, ends, axes and step) -bool CheckSliceParameters(const Graph& graph, const Node& slice, const std::vector& input_indices, const std::vector& expected_values, const logging::Logger& logger) { +bool CheckSliceParameters(const Graph& graph, const Node& slice, const std::vector& input_indices, + const std::vector& expected_values, const logging::Logger& logger) { ORT_ENFORCE(input_indices.size() == expected_values.size() && input_indices.size() > 0); // Here assumes that the last element of input_indices is the maximum one. diff --git a/onnxruntime/core/optimizer/common_subexpression_elimination.cc b/onnxruntime/core/optimizer/common_subexpression_elimination.cc index 48df511d0c672..471e4ee7c03a3 100644 --- a/onnxruntime/core/optimizer/common_subexpression_elimination.cc +++ b/onnxruntime/core/optimizer/common_subexpression_elimination.cc @@ -491,7 +491,8 @@ Status CommonSubexpressionElimination::ApplyImpl(Graph& graph, bool& modified, i if (graph_outputs.count(output_def) > 0) { // Currently, we don't support eliminating the graph's outputs. 
- LOGS(logger, VERBOSE) << "Not eliminating output " << output_def->Name() << " of node " << node->Name() << "[" << node->OpType() << "] because it's the graph's output."; + LOGS(logger, VERBOSE) << "Not eliminating output " << output_def->Name() << " of node " << node->Name() + << "[" << node->OpType() << "] because it's the graph's output."; continue; } diff --git a/onnxruntime/core/optimizer/gather_fusion.h b/onnxruntime/core/optimizer/gather_fusion.h index 098278a77dafe..f431d98b3b827 100644 --- a/onnxruntime/core/optimizer/gather_fusion.h +++ b/onnxruntime/core/optimizer/gather_fusion.h @@ -10,7 +10,7 @@ namespace onnxruntime { /** @Class GatherSliceToSplitFusion -Fuse multiple Gather/Slice nodes that comsuming one output to one Split node. +Fuse multiple Gather/Slice nodes that consuming one output to one Split node. */ class GatherSliceToSplitFusion : public GraphTransformer { public: diff --git a/onnxruntime/core/optimizer/gemm_sum_fusion.h b/onnxruntime/core/optimizer/gemm_sum_fusion.h index 0e2ec104703f3..9b2fa22ecc317 100644 --- a/onnxruntime/core/optimizer/gemm_sum_fusion.h +++ b/onnxruntime/core/optimizer/gemm_sum_fusion.h @@ -12,14 +12,14 @@ namespace onnxruntime { Rewrite rule that fuses Gemm and Sum nodes to a single Gemm node. This fusion can be applied in the following scenario: -1) Sum at output of Gemm: when the output of a Gemm is immedietly summed with +1) Sum at output of Gemm: when the output of a Gemm is immediately summed with exactly one other element, we can fuse this Sum with Gemm by using the other Sum input as C, provided that the C input to the Gemm is missing. This is supported for opset >= 11, as this is when Gemm input C became optional. TODO: Support the Add use case: Sum(x, y) ~= Add. -This patterm is attempted to be triggered only on nodes with op type "Gemm". +This pattern is attempted to be triggered only on nodes with op type "Gemm". 
A --> Gemm --> D --> Sum --> E ^ ^ diff --git a/onnxruntime/core/optimizer/identity_elimination.h b/onnxruntime/core/optimizer/identity_elimination.h index 5e76275207c32..4b20edec12dfa 100644 --- a/onnxruntime/core/optimizer/identity_elimination.h +++ b/onnxruntime/core/optimizer/identity_elimination.h @@ -26,6 +26,6 @@ class EliminateIdentity : public RewriteRule { bool SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& logger) const override; Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; -}; // namespace onnxruntime +}; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/isinf_reducesum_fusion.cc b/onnxruntime/core/optimizer/isinf_reducesum_fusion.cc index bc0523d517ee6..7d249ea715e8d 100644 --- a/onnxruntime/core/optimizer/isinf_reducesum_fusion.cc +++ b/onnxruntime/core/optimizer/isinf_reducesum_fusion.cc @@ -41,7 +41,7 @@ Status IsInfReduceSumFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l auto input_defs = isinf_node.MutableInputDefs(); // see if there is a Cast before IsInf - // This will happen if input type is FP16 but IsInf doesnt support fp16, so it will be cast to float/double + // This will happen if input type is FP16 but IsInf doesn't support fp16, so it will be cast to float/double // This Cast can be skipped as we are replacing the subgraph with IsAllFinite, which supports FP16 auto cast1_node_iter = isinf_node.InputNodesBegin(); if (cast1_node_iter != isinf_node.InputNodesEnd() && diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.h b/onnxruntime/core/optimizer/matmul_bn_fusion.h index 7a43483cf37d4..39cd0dd186d53 100644 --- a/onnxruntime/core/optimizer/matmul_bn_fusion.h +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.h @@ -8,7 +8,7 @@ namespace onnxruntime { /* * This fusion submerges a BatchNormalization operator to it's super - * precedding MatMul operator, if and only if MatmulBNFusion::SatisfyCondition() + * preceding MatMul operator, if and only if MatmulBNFusion::SatisfyCondition() * is true. 
*/ class MatmulBNFusion : public RewriteRule { @@ -24,4 +24,4 @@ class MatmulBNFusion : public RewriteRule { Status Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; }; -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index 0d7ab70eba613..f1e94dd4fe9e4 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -11,7 +11,7 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { // implements MemCpy node insertion in graph transform -// note that GraphTransformer::Apply() is supposed to be stateless, so this cannot derive from GraphTranformer +// note that GraphTransformer::Apply() is supposed to be stateless, so this cannot derive from GraphTransformer class TransformerMemcpyImpl { public: TransformerMemcpyImpl(onnxruntime::Graph& graph, const std::string& provider) diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 6c2fcc2ace856..3400f09b4056f 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -259,13 +259,13 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa // Onnx spec requires output sizes to be a positive integer, so we are not checking that here if (output_size_h % input_size_h != 0) { LOGS(logger, VERBOSE) << "Resize: output_size_h: " << output_size_h - << " is not a mutliple of input_size_h: " << input_size_h; + << " is not a multiple of input_size_h: " << input_size_h; return false; } if (output_size_w % input_size_w != 0) { LOGS(logger, VERBOSE) << "Resize: output_size_w: " << output_size_w - << " is not a mutliple of input_size_w: " << input_size_w; + << " is not a multiple of input_size_w: " << input_size_w; return false; } } diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h index 718469f740d4d..831b10c3e147f 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_conv.h @@ -66,9 +66,9 @@ class DnnlConv { * - For Onnx a non-dilated kernel would be all 1s * - For OneDNN a non-dilated kernel would be all 0s * - * The memory dimentions returned is in the form expected for OneDNN each dilation dimention - * will be 1 less than the dilated dimention expected by Onnx specification. Be aware of this - * fact as 'dilations' are used in any calcuations since this could result in an off-by-one + * The memory dimensions returned is in the form expected for OneDNN each dilation dimension + * will be 1 less than the dilated dimension expected by Onnx specification. Be aware of this + * fact as 'dilations' are used in any calculations since this could result in an off-by-one * error. 
*/ dnnl::memory::dims GetDilations(DnnlNode& node, ConvShape shape); @@ -115,4 +115,4 @@ class DnnlConv { }; } // namespace ort_dnnl -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h index f0928974b1317..3a27788745ef0 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_convgrad.h @@ -49,9 +49,9 @@ class DnnlConvGrad { * - For Onnx a non-dilated kernel would be all 1s * - For OneDNN a non-dilated kernel would be all 0s * - * The memory dimentions returned is in the form expected for OneDNN each dilation dimention - * will be 1 less than the dilated dimention expected by Onnx specification. Be aware of this - * fact as 'dilations' are used in any calcuations since this could result in an off-by-one + * The memory dimensions returned is in the form expected for OneDNN each dilation dimension + * will be 1 less than the dilated dimension expected by Onnx specification. Be aware of this + * fact as 'dilations' are used in any calculations since this could result in an off-by-one * error. */ dnnl::memory::dims GetDilations(DnnlNode& node, ConvShape shape); @@ -62,4 +62,4 @@ class DnnlConvGrad { }; } // namespace ort_dnnl -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h index d1cea23fca245..dac4e743ea198 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_qattention.h @@ -20,7 +20,7 @@ class DnnlQAttention { MASK_INDEX = 5, INPUT_ZP = 6, WEIGHTS_ZP = 7, - PAST = 8 // not suppoted + PAST = 8 // not supported }; enum OutputTensors : int { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h index d3d6da8364b6e..2adf346332c66 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_lib/nnapi_implementation.h @@ -2098,7 +2098,7 @@ struct NnApi { * @param executionCallback The execution callback to set. * @param callbackContext The context to be passed to the callbacks when they * are invoked. The context object may be used by multiple threads - * simulatenously, so it must be thread-safe. + * simultaneously, so it must be thread-safe. 
*/ void (*SL_ANeuralNetworksDiagnostic_registerCallbacks)( ANeuralNetworksDiagnosticCompilationFinishedCallback compilationCallback, diff --git a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h index 534a839eeea7a..df2e429f58b2f 100644 --- a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h +++ b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h @@ -30,28 +30,28 @@ namespace onnxruntime { namespace vsi { namespace npu { -#define ELEMENTWISE_OP_BUILDER(onnx_op_type, vsinpu_op_kind) \ - class onnx_op_type##OpBuilder : public BaseOpBuilder { \ - bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer, \ - const Node* node) const override { \ - for (auto input : node->InputDefs()) { \ - if (*input->Type() == "tensor(int64)") { \ - LOGS_DEFAULT(WARNING) << "Int64 type is not suppoted as elementwise operation input."; \ - return false; \ - } \ - } \ - return true; \ - } \ - bool HandleBuildOp(vsi::npu::GraphEP* graph_ep, \ - std::vector>& inputs, \ - std::vector>& outputs, \ - const NodeUnit& node_unit) override { \ - LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op"; \ - auto op = graph_ep->GetGraph() -> CreateOperation(); \ - (*op).BindInputs(inputs).BindOutputs(outputs); \ - return true; \ - ; \ - } \ +#define ELEMENTWISE_OP_BUILDER(onnx_op_type, vsinpu_op_kind) \ + class onnx_op_type##OpBuilder : public BaseOpBuilder { \ + bool IsOpSupported(const onnxruntime::GraphViewer& graph_viewer, \ + const Node* node) const override { \ + for (auto input : node->InputDefs()) { \ + if (*input->Type() == "tensor(int64)") { \ + LOGS_DEFAULT(WARNING) << "Int64 type is not supported as elementwise operation input."; \ + return false; \ + } \ + } \ + return true; \ + } \ + bool HandleBuildOp(vsi::npu::GraphEP* graph_ep, \ + std::vector>& inputs, \ + std::vector>& outputs, \ + const NodeUnit& node_unit) override { \ + LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op"; \ + auto op = graph_ep->GetGraph() -> CreateOperation(); \ + (*op).BindInputs(inputs).BindOutputs(outputs); \ + return true; \ + ; \ + } \ }; ELEMENTWISE_OP_BUILDER(Add, Add); diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd index 7a9b3254a1708..82137de3c0f3b 100644 --- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd +++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd @@ -41,7 +41,7 @@ set input_counts=1 REM Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased set models_to_test=bert-base-cased -REM If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: +REM If you have multiple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: REM set CUDA_VISIBLE_DEVICES=1 REM This script will generate a logs file with a list of commands used in tests. 
@@ -163,4 +163,4 @@ IF %FileSize% LSS 10 goto :EOF python -c "import sys; lines=sys.stdin.readlines(); h=lines[0]; print(''.join([h]+list(sorted(set(lines)-set([h])))))" < %1 > sort_%1 FindStr "[^,]" sort_%1 > summary_%1 DEL sort_%1 -goto :EOF \ No newline at end of file +goto :EOF diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 95f40af3fd746..9cc4878e8022d 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -694,7 +694,7 @@ def __init__(self, model, num_heads, hidden_size): self.attention_fusion = FusionT5Attention(self, self.hidden_size, self.num_heads, self.attention_mask) self.layer_norm_fusion = FusionSimplifiedLayerNormalization(self) self.skip_layer_norm_fusion = FusionSkipSimplifiedLayerNormalization(self) - # TODO: consider retrive max_distance from model. + # TODO: consider retrieve max_distance from model. # math.log(max_distance / (num_buckets // 2)) self.rpb_fusion = FusionRelativePositionBiasBlock(self, 128) diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index 64d6ecde618f6..77d0c3a76624f 100755 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -62,7 +62,7 @@ input_counts=1 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased models_to_test="bert-base-cased roberta-base distilbert-base-uncased" -# If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: +# If you have multiple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: # export CUDA_VISIBLE_DEVICES=1 # This script will generate a logs file with a list of commands used in tests. diff --git a/onnxruntime/test/framework/insert_cast_transformer_test.cc b/onnxruntime/test/framework/insert_cast_transformer_test.cc index 1804c09043c7b..9278541b07512 100644 --- a/onnxruntime/test/framework/insert_cast_transformer_test.cc +++ b/onnxruntime/test/framework/insert_cast_transformer_test.cc @@ -139,7 +139,7 @@ TEST(TransformerTest, CastRemovalDoesNotLowerPrecisionTest) { status = graph.Resolve(); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); - // When casting f64 -> f32 -> f64 we should not be optimising away the cast since there is a loss of precision. + // When casting f64 -> f32 -> f64 we should not be optimizing away the cast since there is a loss of precision. EXPECT_EQ(graph.NumberOfNodes(), 2); } @@ -171,7 +171,7 @@ TEST(TransformerTest, CastRemovalDoesNotRemoveSignednessTest) { status = graph.Resolve(); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); - // When casting i32 -> ui32 -> i32 we should not be optimising away the cast since applying the casts produces a very different result. + // When casting i32 -> ui32 -> i32 we should not be optimizing away the cast since applying the casts produces a very different result. EXPECT_EQ(graph.NumberOfNodes(), 2); } diff --git a/onnxruntime/tool/etw/eparser.cc b/onnxruntime/tool/etw/eparser.cc index 526ba6de81966..ff0348d721eb7 100644 --- a/onnxruntime/tool/etw/eparser.cc +++ b/onnxruntime/tool/etw/eparser.cc @@ -7,7 +7,7 @@ // Get the length of the property data. For MOF-based events, the size is inferred from the data type // of the property. 
For manifest-based events, the property can specify the size of the property value -// using the length attribute. The length attribue can specify the size directly or specify the name +// using the length attribute. The length attribute can specify the size directly or specify the name // of another property in the event data that contains the size. If the property does not include the // length attribute, the size is inferred from the data type. The length will be zero for variable // length, null-terminated strings and structures. @@ -16,7 +16,7 @@ DWORD GetPropertyLength(PEVENT_RECORD pEvent, PTRACE_EVENT_INFO pInfo, USHORT i, // Get the size of the array. For MOF-based events, the size is specified in the declaration or using // the MAX qualifier. For manifest-based events, the property can specify the size of the array -// using the count attribute. The count attribue can specify the size directly or specify the name +// using the count attribute. The count attribute can specify the size directly or specify the name // of another property in the event data that contains the size. DWORD GetArraySize(PEVENT_RECORD pEvent, PTRACE_EVENT_INFO pInfo, USHORT i, PUSHORT ArraySize); diff --git a/orttraining/orttraining/core/framework/torch/custom_function_register.h b/orttraining/orttraining/core/framework/torch/custom_function_register.h index 67a991ea2cce3..762258a45221e 100644 --- a/orttraining/orttraining/core/framework/torch/custom_function_register.h +++ b/orttraining/orttraining/core/framework/torch/custom_function_register.h @@ -9,9 +9,7 @@ #include #include -namespace onnxruntime { -namespace language_interop_ops { -namespace torch { +namespace onnxruntime::language_interop_ops::torch { typedef std::vector (*CustomFunctionRunnerType)(const char* func_name_char, void* callback, @@ -124,6 +122,4 @@ class OrtTorchFunctionPool final { std::mutex mutex_; }; -} // namespace torch -} // namespace language_interop_ops -} // namespace onnxruntime +} // namespace onnxruntime::language_interop_ops::torch diff --git a/orttraining/orttraining/core/framework/torch/dlpack_python.cc b/orttraining/orttraining/core/framework/torch/dlpack_python.cc index d512dc72a438f..f9b237f051258 100644 --- a/orttraining/orttraining/core/framework/torch/dlpack_python.cc +++ b/orttraining/orttraining/core/framework/torch/dlpack_python.cc @@ -3,10 +3,7 @@ #include "orttraining/core/framework/torch/dlpack_python.h" -namespace onnxruntime { -namespace training { -namespace framework { -namespace torch { +namespace onnxruntime::training::framework::torch { static void DlpackCapsuleDestructor(PyObject* data) { DLManagedTensor* dlmanged_tensor = reinterpret_cast( @@ -35,7 +32,4 @@ OrtValue FromDlpack(PyObject* dlpack_tensor, const bool is_bool_tensor) { return ort_value; } -} // namespace torch -} // namespace framework -} // namespace training -} // namespace onnxruntime +} // namespace onnxruntime::training::framework::torch diff --git a/orttraining/orttraining/core/framework/torch/dlpack_python.h b/orttraining/orttraining/core/framework/torch/dlpack_python.h index 37bae2ab37025..9b641971dceac 100644 --- a/orttraining/orttraining/core/framework/torch/dlpack_python.h +++ b/orttraining/orttraining/core/framework/torch/dlpack_python.h @@ -8,10 +8,7 @@ #include "core/dlpack/dlpack_converter.h" #include "orttraining/core/framework/torch/python_common.h" -namespace onnxruntime { -namespace training { -namespace framework { -namespace torch { +namespace onnxruntime::training::framework::torch { // Allocate a new Capsule object, 
which takes the ownership of OrtValue. // Caller is responsible for releasing. @@ -22,7 +19,4 @@ PyObject* ToDlpack(OrtValue ort_value); // create a OrtValue. This function calls DlpackToOrtValue(...) to do the conversion. OrtValue FromDlpack(PyObject* dlpack_tensor, const bool is_bool_tensor); -} // namespace torch -} // namespace framework -} // namespace training -} // namespace onnxruntime +} // namespace onnxruntime::training::framework::torch diff --git a/orttraining/orttraining/core/framework/torch/torch_proxy.h b/orttraining/orttraining/core/framework/torch/torch_proxy.h index 450a5048aea44..b80acd6c4791a 100644 --- a/orttraining/orttraining/core/framework/torch/torch_proxy.h +++ b/orttraining/orttraining/core/framework/torch/torch_proxy.h @@ -22,7 +22,7 @@ namespace torch { // For handling temporary PyObject pointer newly created with Py_XXX APIs, here is our practice: // Convention: // Wrap those PyObject* in format of "PythonObjectPtr(Py_XXX(), PythonObjectDeleter)". -// Explaination: +// Explanation: // That means, for the PyObject* created by Py_XXX(), its refcnt will be decreased by one // in the PythonObjectDeleter which is triggered once lifetime of PythonObjectPtr instance // ends. diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.cc b/orttraining/orttraining/core/graph/gradient_builder_base.cc index d57675e8b8e20..4262acb636582 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_base.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_base.cc @@ -63,16 +63,20 @@ void ComputeBroadcastBackwardAxes( auto A_dim = A_dims[i].dim_param(), B_dim = B_dims[j].dim_param(); if (A_dim != B_dim) { - LOGS_DEFAULT(INFO) << "Gradient building for node " << node_name << ": symbolic dimension expects to match. " - << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; + LOGS_DEFAULT(INFO) + << "Gradient building for node " << node_name << ": symbolic dimension expects to match. " + << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) + << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; } } else if (A_dims[i].has_dim_param() && B_dims[j].has_dim_value()) { auto A_dim = A_dims[i].dim_param(); auto B_dim = B_dims[j].dim_value(); if (B_dim != 1) { - LOGS_DEFAULT(INFO) << "Gradient building for node " << node_name << ": symbolic broadcasting expects the B_dimension to be 1. " - << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; + LOGS_DEFAULT(INFO) + << "Gradient building for node " << node_name << ": symbolic broadcasting expects the B_dimension to be 1. " + << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) + << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; } else { if (B_axes) { B_axes->push_back(gsl::narrow_cast(k)); @@ -83,8 +87,10 @@ void ComputeBroadcastBackwardAxes( auto B_dim = B_dims[j].dim_param(); if (A_dim != 1) { - LOGS_DEFAULT(INFO) << "Gradient building for node " << node_name << ": symbolic broadcasting expects the A_dimension to be 1. 
" - << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; + LOGS_DEFAULT(INFO) + << "Gradient building for node " << node_name << ": symbolic broadcasting expects the A_dimension to be 1. " + << "A_dims:" << ToString(A_dims) << ", B_dims:" << ToString(B_dims) + << " This is a relaxing case, and the kernel might run into problem later if A_dims and B_dims turns out not broadcastable."; } else { if (A_axes) { A_axes->push_back(gsl::narrow_cast(k)); diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.h b/orttraining/orttraining/core/graph/gradient_builder_base.h index 2d8a87f6d4427..a4aa70c99eec1 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_base.h +++ b/orttraining/orttraining/core/graph/gradient_builder_base.h @@ -225,7 +225,9 @@ class GradientBuilderBase { } int OnnxOpSetVersion() const { - return graph_ != nullptr && graph_->DomainToVersionMap().find(kOnnxDomain) != graph_->DomainToVersionMap().end() ? graph_->DomainToVersionMap().at(kOnnxDomain) : -1; + return graph_ != nullptr && graph_->DomainToVersionMap().find(kOnnxDomain) != graph_->DomainToVersionMap().end() + ? graph_->DomainToVersionMap().at(kOnnxDomain) + : -1; } template diff --git a/orttraining/orttraining/core/graph/loss_function_registry.h b/orttraining/orttraining/core/graph/loss_function_registry.h index 76242276080ca..6c880b6a3d9f5 100644 --- a/orttraining/orttraining/core/graph/loss_function_registry.h +++ b/orttraining/orttraining/core/graph/loss_function_registry.h @@ -18,7 +18,7 @@ struct LossFunctionUsingOperator : public ILossFunction { class LossFunctionRegistry : public GenericRegistry { public: - // Register a list of non-operator loss functions stacitally. + // Register a list of non-operator loss functions statically. void RegisterNonOperatorLossFunctions(); // Register a operator loss function. diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc index 0d4291a3b8b31..4b6a9a6e594cd 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc @@ -508,7 +508,7 @@ Status PaddingElimination::ApplyImpl(Graph& graph, bool& modified, int graph_lev *embedding_node); // Add flatten pattern to each input node of the subgraph - // to flattern the shape of [batch_size, seqlen, ...] to [valid_token_count, ...] + // to flatten the shape of [batch_size, seqlen, ...] to [valid_token_count, ...] InsertFlattenPatternForInput(graph, *embedding_node, 1, squeeze_out_arg, logger); handled_input_count++; for (auto& node : candidate_inputs) { diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.h b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.h index cc3c90dac2d58..a09ee75c73aaf 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.h +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.h @@ -25,7 +25,7 @@ namespace onnxruntime { * * This transformer is implemented in the following steps: * 1. 
Iterate the graph and find the Embedding node that matches these requirements: - * 1.1 The 2nd input is a graph input and its rank > 2, with the first two dimensions, are: + * 1.1 Following a PythonOp(FlagAndPrintDensity) node, and its rank > 2, with the first two dimensions, are: * [batch_size, sequence_length]. Both dimensions can be symbolic or concrete dim values. * 1.2 The 3rd input(padding idx) is a scalar constant initializer, and should >= 0. * 2. Append embedding node in node_to_scan_list. @@ -54,6 +54,8 @@ namespace onnxruntime { * \ \ / / / * \_________________\_________________________/________________/______________________/ * | + * PythonOp (FlagAndPrintDensity) + * | * ATen:embedding * | * - - - - - - - - - - - -| @@ -68,7 +70,7 @@ namespace onnxruntime { * output * * - * After the transformation: + * After the transformation (PythonOp (FlagAndPrintDensity) is removed unless user need to print density for each step): * * input_ids [batch_size, seq_length] * | \ diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/sceloss_compute_optimization.h b/orttraining/orttraining/core/optimizer/compute_optimizer/sceloss_compute_optimization.h index cf34706115894..2204724bacf55 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/sceloss_compute_optimization.h +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/sceloss_compute_optimization.h @@ -32,10 +32,10 @@ namespace onnxruntime { * 2. Its 2nd output (log_prob) MUST NOT be a graph output and MUST NOT be consumed by other nodes. * 3. Its ignore_index exists and is a constant scalar value. * 4. Its 2nd input label's input node is not a `ShrunkGather` node (to avoid this transformer duplicated applied). - * 5. Its 2nd input label is 1) a graph input or 2) output of a Reshape node taking a graph input as its data input. + * 5. Following PythonOp (FlagAndPrintDensity). * * - * After the transformation: + * After the transformation (PythonOp (FlagAndPrintDensity) is removed unless user need to print density for each step): * labels [token_count] * \_______ * \ \ diff --git a/orttraining/orttraining/core/optimizer/insert_output_rewriter.h b/orttraining/orttraining/core/optimizer/insert_output_rewriter.h index 5e4bf5c5ce7a9..de000e00f1bf8 100644 --- a/orttraining/orttraining/core/optimizer/insert_output_rewriter.h +++ b/orttraining/orttraining/core/optimizer/insert_output_rewriter.h @@ -7,7 +7,7 @@ namespace onnxruntime { -// Rewrite rule that insert an addtional output to the matched node. +// Rewrite rule that insert an additional output to the matched node. class InsertMaxPoolOutput : public RewriteRule { public: InsertMaxPoolOutput() noexcept @@ -24,7 +24,7 @@ class InsertMaxPoolOutput : public RewriteRule { Status Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; }; -// Rewrite rule that insert an addtional output to the matched node. +// Rewrite rule that insert an additional output to the matched node. 
// Adding this second output to expose FW intermediate result for speeding up BW computation class InsertSoftmaxCrossEntropyLossOutput : public RewriteRule { public: diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 088fd345135db..8d110c692751e 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -60,9 +60,9 @@ using OpsetToIgnorableIndicesMap = InlinedHashMap; * Most recent revisited for ONNX v1.15.0 release - https://github.com/onnx/onnx/blob/b86cc54efce19530fb953e4b21f57e6b3888534c/docs/Operators.md * * We defined supported list explicitly instead of using a excluding list for the following reasons: - * 1. Some ops generate indeterministic results (for example using random number generator). We need evaluate whether + * 1. Some ops generate non-deterministic results (for example using a random number generator). We need to evaluate whether * this is a problem for recompute before adding the support, instead of fixing this after we find and try to - * fix convergence issues (which will be very hard if we have multiple indeterministic operators by default supported.) + * fix convergence issues (which will be very hard if we have multiple non-deterministic operators supported by default.) * 2. Some ops schema will be changed in new opsets, we need also check manually whether it is applicable to recompute * or not. * 3. Some ops are not supported in older opsets, we need to check whether it is applicable to recompute or not. diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h index 5aa05b0f02e0f..d87706ea98061 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h @@ -151,7 +151,7 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase { * @param compromise_stashed_activation Whether to compromise stashed activation, e.g. if we cannot find a * recomputable subgraph to save a stashed activation, we can compromise to find a recomputable subgraph to reduce the * size of stashed activation. - * @param can_compromise_stashed_activation A bool return value, to indicate there is opportunaties for finding a + * @param can_compromise_stashed_activation A bool return value, to indicate there are opportunities for finding a * compromised subgraph.
*/ std::unique_ptr CheckNodeForRecompute(const GraphViewer& graph_viewer, diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc index a4fbacc8a1f4c..dd585068a23ca 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/transformer_specific.cc @@ -88,7 +88,7 @@ std::tuple IsResidualNodeArg(const GraphViewer& ----------------------| | | | | - | SimplifiedLayerNormalization (layer boudary node) + | SimplifiedLayerNormalization (layer boundary node) | | | | | MistralAttention diff --git a/orttraining/orttraining/core/optimizer/qdq_fusion.h b/orttraining/orttraining/core/optimizer/qdq_fusion.h index 722565cffa803..3bf9a7909f7e2 100644 --- a/orttraining/orttraining/core/optimizer/qdq_fusion.h +++ b/orttraining/orttraining/core/optimizer/qdq_fusion.h @@ -14,7 +14,7 @@ This transformer will be used during QAT (Quantization Aware Training). For QAT an onnx graph that has Q->DQ nodes needs to be made ready for training. The output of the Q node is a quantized type. Backpropagation on quantized type is not supported in ort. So, we replace the occurrences of Q->DQ with FakeQuant which internally will perform the -Q->DQ opeeration and at the same time can support backpropagation. +Q->DQ operation and at the same time can support backpropagation. from: x (fp32) diff --git a/orttraining/orttraining/core/optimizer/transpose_replacement.h b/orttraining/orttraining/core/optimizer/transpose_replacement.h index c38e402339823..d2bbe2fdcfc19 100644 --- a/orttraining/orttraining/core/optimizer/transpose_replacement.h +++ b/orttraining/orttraining/core/optimizer/transpose_replacement.h @@ -13,10 +13,10 @@ namespace onnxruntime { Transpose is equivalent to a Reshape if: empty dimensions (which dim_value=1) can change place, not empty dimensions must be in - the same order in the permuted tenosr. + the same order in the permuted tensor. Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1). -This Rewrite rule replaces Transpose which meets the requirments with Reshape. +This Rewrite rule replaces Transpose which meets the requirements with Reshape. Because Transpose need memory copy while Reshape needn't, this replacement can save overhead for memory copy. It is attempted to be triggered only on nodes with op type "Transpose". 
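Note: the Transpose-to-Reshape condition described in transpose_replacement.h above is easy to sanity-check outside ONNX Runtime. The short NumPy sketch below is illustrative only (it assumes NumPy and a C-contiguous tensor, and is not taken from the ONNX Runtime sources): when a permutation only relocates size-1 axes and preserves the relative order of the non-trivial axes, the transposed tensor is element-for-element identical to a plain reshape, so no data copy is required.

import numpy as np

x = np.arange(1 * 1 * 1024 * 4096, dtype=np.float32).reshape(1, 1, 1024, 4096)

perm = (2, 0, 3, 1)                       # the example permutation from the comment above
transposed = np.transpose(x, perm)        # shape (1024, 1, 4096, 1)
reshaped = x.reshape(1024, 1, 4096, 1)    # pure metadata change, no data movement

assert transposed.shape == reshaped.shape
assert np.array_equal(transposed, reshaped)  # same element order, so Reshape is equivalent

# Counter-example: swapping the two non-trivial axes changes the element order,
# so that Transpose could not be replaced by a Reshape.
assert not np.array_equal(np.transpose(x, (3, 0, 2, 1)).reshape(1024, 1, 4096, 1), reshaped)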
diff --git a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py index ff128c4da4259..12eba90170fb9 100644 --- a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py +++ b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py @@ -26,7 +26,7 @@ def override_function(m_self): # noqa: N805 from onnxruntime.training.ortmodule.torch_cpp_extensions import fused_ops - warnings.warn("Apex AMP fp16_optimizer functions are overrided with faster implementation.", UserWarning) + warnings.warn("Apex AMP fp16_optimizer functions are overridden with faster implementation.", UserWarning) # Implementation adapted from https://github.com/NVIDIA/apex/blob/082f999a6e18a3d02306e27482cc7486dab71a50/apex/amp/_process_optimizer.py#L161 def post_backward_with_master_weights(self, scaler): diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py index 20f4f814e5476..55e2e08432137 100644 --- a/orttraining/orttraining/python/training/optim/_ds_modifier.py +++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py @@ -140,7 +140,7 @@ def can_be_modified(self): ) def override_function(self): - warnings.warn("DeepSpeed fp16_optimizer functions are overrided with faster implementation.", UserWarning) + warnings.warn("DeepSpeed fp16_optimizer functions are overridden with faster implementation.", UserWarning) def get_grad_norm_direct(target, gradients, params, norm_type=2): from onnxruntime.training.ortmodule.torch_cpp_extensions import fused_ops diff --git a/orttraining/orttraining/python/training/optim/_megatron_modifier.py b/orttraining/orttraining/python/training/optim/_megatron_modifier.py index 707727120c5cd..702eba77cb74a 100644 --- a/orttraining/orttraining/python/training/optim/_megatron_modifier.py +++ b/orttraining/orttraining/python/training/optim/_megatron_modifier.py @@ -27,7 +27,7 @@ def can_be_modified(self): ) def override_function(self): - warnings.warn("Megatron-LM fp16_optimizer functions are overrided with faster implementation.", UserWarning) + warnings.warn("Megatron-LM fp16_optimizer functions are overridden with faster implementation.", UserWarning) def clip_master_grads(target, max_norm, norm_type=2): """ diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index 75512cb8e8c88..a8590cea22887 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -238,7 +238,7 @@ def native_group_norm_gradient(): # PyTorch removed related backward functions with "vec" overload name since 1.13. The functions with no overload name -# are available for all versions, though they are not that convienent to use. +# are available for all versions, though they are not that convenient to use. 
def _upsample_gradient(backward_fn, dims): scales = ["" for _ in range(dims)] if "bicubic" in backward_fn: diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 0bd29b8d155c4..10e7f60b7da0f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -437,7 +437,7 @@ def permute_and_reshape_tensor( shape_tensor, ): # If matmul_output_axes and contraction_axes are contiguous in input tensor, - # we can move Reshape to before Transpose, so it's possible that the Transpoase is fused to MatMul. + # we can move Reshape to before Transpose, so it's possible that the Transpose is fused to MatMul. # Otherwise, we have to Transpose first to move those axes together and then Reshape. is_matmul_output_axes_contiguous = is_axes_contiguous(matmul_output_axes) is_contraction_axes_contiguous = is_axes_contiguous(contraction_axes) @@ -525,7 +525,7 @@ def permute_and_reshape_tensor( @register_symbolic("einsum", torch_version_end="1.13.0") @parse_args("s", "v") -def einsum_pre_troch_113(g, equation, tensor_list): +def einsum_pre_torch_113(g, equation, tensor_list): return einsum_internal(g, equation, tensor_list) @@ -540,12 +540,12 @@ def einsum_internal(g, equation, tensor_list): num_ops = len(tensors) assert num_ops > 0 - # Doesn't support implicit output is ellipsis or more than 2 oprands for now. - # Doesn't support ellipsis ('...') for now as not easy to get sizes of oprands. + # Doesn't support the case where the implicit output is an ellipsis, or more than 2 operands, for now. + # Doesn't support ellipsis ('...') for now as it is not easy to get the sizes of the operands. if num_ops != 2 or equation.find("->") == -1 or "." in equation: return g.op("Einsum", *tensors, equation_s=equation) - # Take "ks,ksm->sm" as example. After prcoess inputs, + # Take "ks,ksm->sm" as an example. After processing the inputs, # lhs_labels = [k,s], rhs_labels = [k,s,m], result_labels = [s,m]. lhs_labels, rhs_labels, result_labels = parse_equation(equation) diff --git a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py index 84d7bf6410966..047cd4c59d636 100644 --- a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py +++ b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py @@ -102,7 +102,7 @@ def __init__( ): """ :param path_or_bytes: filename or serialized ONNX or ORT format model in a byte string - :param fw_feed_names: Feed names for foward pass. + :param fw_feed_names: Feed names for forward pass. :param fw_outputs_device_info: Device info for fetches in forward pass. :param bw_fetches_names: Fetch names for backward pass. :param bw_outputs_device_info: Device info for fetches in backward pass.
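Referring back to the einsum_internal example above ("ks,ksm->sm"): the rewrite it describes, moving the shared label into a batch dimension and expressing the contraction as a batched MatMul, can be illustrated with plain NumPy. The sketch below only illustrates that decomposition (NumPy stands in for the Transpose/Reshape/MatMul ops the exporter would emit); it is an assumption-laden illustration, not code from this patch.

import numpy as np

rng = np.random.default_rng(0)
k, s, m = 4, 3, 5
lhs = rng.standard_normal((k, s))       # labels [k, s]
rhs = rng.standard_normal((k, s, m))    # labels [k, s, m]

reference = np.einsum("ks,ksm->sm", lhs, rhs)   # result labels [s, m]

# Put the shared label s in the batch position and keep k as the contraction axis:
#   lhs -> (s, 1, k), rhs -> (s, k, m), batched MatMul -> (s, 1, m), squeeze -> (s, m)
rewritten = np.matmul(lhs.T[:, None, :], rhs.transpose(1, 0, 2)).squeeze(1)

assert np.allclose(reference, rewritten)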
diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 18999ce2fa1ab..c1ff62a5faea7 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -312,7 +312,7 @@ def _initialize_graph_builder(self, post_export_processed_model_info: PostExport def __getstate__(self): state = copy.copy(self.__dict__) - # Remove any re-contructible/pybound object from the state + # Remove any re-constructible/pybound object from the state serialization_deny_list = [ "_onnx_models", "_graph_builder", diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py index 80bb00e0c3ac1..22627749c316c 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py @@ -871,7 +871,7 @@ def _get_exported_model( enable_zero_stage3_support, stage3_param_handle, flattened_module ): required_export_kwargs = { - "input_names": model_info_for_export.onnx_graph_input_names, # did not contains paramerter as its input yet + "input_names": model_info_for_export.onnx_graph_input_names, # did not contains parameters as its input yet "output_names": output_names, "opset_version": onnx_opset_version, "do_constant_folding": False, diff --git a/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py b/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py index 93d151ea1217d..fcab32f4356bc 100644 --- a/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py +++ b/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py @@ -130,7 +130,7 @@ def _create_param_retrieval_function( Args: trainable_named_params: The trainable named parameters. - param_trigger: The trigger tensor for pulling the weights. param_trigger is pre-alloced just once + param_trigger: The trigger tensor for pulling the weights. param_trigger is pre-allocated just once before model execution, later it will be reused by each iteration. This could save the unnecessary overhead allocating for each iteration run. diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 3708343a228fc..d5d5ce672224c 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -457,8 +457,8 @@ def _create_execution_agent(self): def __getstate__(self): state = super().__getstate__() - # Only top level classes are pickleable. So, _ORTModuleFunction is - # not pickleable. So, let's not pickle it, and redefine it when + # Only top level classes are picklable. So, _ORTModuleFunction is + # not picklable. So, let's not pickle it, and redefine it when # loading the state. 
del state["_forward_class"] return state diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py index e6e5ce56773e1..fbd98675aebe6 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py @@ -41,13 +41,13 @@ class GraphMatcher: * Second bool indicates it's producer node or consumer node for source node. * There is a list to describe the edge infos of this node to other nodes, each edge is a tuple with 3 integers, first integer is the index of the target node in the list, second integer is the output index of the edge, - and thrid integer is the input index of the edge. + and third integer is the input index of the edge. For each entry, GraphMatcher used the first edge to lookup target node, and try to use make sure the sug-graph also matches rest edge infos. Note that when lookup target node, it will only take the first matched node as target node. For example, if a source - node has multiple "MatMul" consumers nodes comsuming same output, only the first "MatMul" node will be returned. + node has multiple "MatMul" consumers nodes consuming same output, only the first "MatMul" node will be returned. You need to avoid using such confusing edge info as the first edge info for node lookup. Try to use other edge to avoid such confusion if possible. """ diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index 9145fb1712e88..2d036a5abcb5c 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -75,7 +75,7 @@ def __init__(self, log_level): def _extract_info(self, log_level): # get the log_level from os env variable - # OS environment variable log level superseeds the locally provided one + # OS environment variable log level supersedes the locally provided one self._validate(log_level) log_level = LogLevel[os.getenv(_LoggingOptions._log_level_environment_key, log_level.name)] return log_level @@ -197,7 +197,7 @@ class _MemoryOptimizationLevel(IntFlag): USER_SPECIFIED = 0 # Fully respect user-specified config TRANSFORMER_LAYERWISE_RECOMPUTE = ( - 1 # Enable all recomputable subgraphs (excluding compromised recomptable graphs) per layer + 1 # Enable all recomputable subgraphs (excluding compromised recomputable graphs) per layer ) TRANSFORMER_LAYERWISE_RECOMPUTE_WITH_COMPROMISE = 2 # Enable all recomputable subgraphs per layer diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py index ba6f7c2d0c03a..b291bfb2ba03c 100644 --- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py @@ -124,7 +124,7 @@ def forward(self, *inputs, **kwargs): The first call to forward performs setup and checking steps. During this call, ORTModule determines whether the module can be trained with ONNX Runtime. For this reason, the first forward call execution takes longer than subsequent calls. - Execution is interupted if ONNX Runtime cannot process the model for training. + Execution is interrupted if ONNX Runtime cannot process the model for training. Args: inputs: positional, variable positional inputs to the PyTorch module's forward method.
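For context on the forward() docstring above: typical ORTModule usage wraps an existing torch.nn.Module and leaves the PyTorch training loop unchanged. The sketch below is a minimal usage example under stated assumptions (the model, loss, and data are illustrative placeholders, and the onnxruntime-training package is assumed to be installed); it is not taken from this patch.

import torch
from onnxruntime.training.ortmodule import ORTModule

model = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU(), torch.nn.Linear(32, 4))
model = ORTModule(model)   # wrap once; the PyTorch module API stays the same

x = torch.randn(8, 16)
target = torch.randint(0, 4, (8,))

# The first forward call exports the module to ONNX and builds the training graph,
# so it is noticeably slower than subsequent iterations.
loss = torch.nn.functional.cross_entropy(model(x), target)
loss.backward()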