diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py
index 0e6555080..ec2209e6c 100644
--- a/torchtitan/config_manager.py
+++ b/torchtitan/config_manager.py
@@ -47,7 +47,7 @@ def __init__(self):
             "--job.config_file",
             type=str,
             default=None,
-            help="job config file",
+            help="Job config file",
         )

         # job level configs
@@ -55,38 +55,38 @@ def __init__(self):
             "--job.dump_folder",
             type=str,
             default="./torchtitan/outputs",
-            help="folder to dump job outputs",
+            help="Folder to dump job outputs",
         )
         self.parser.add_argument(
             "--job.description",
             type=str,
             default="default job",
-            help="description of the job",
+            help="Description of the job",
         )
         self.parser.add_argument(
             "--job.use_for_integration_test",
             default=False,
             action="store_true",
-            help="add this config to integration test suite",
+            help="Add this config to the integration test suite",
         )

         # profiling configs
         self.parser.add_argument(
             "--profiling.enable_profiling",
             action="store_true",
-            help="enable pytorch profiler",
+            help="Whether to enable pytorch profiler",
         )
         self.parser.add_argument(
             "--profiling.save_traces_folder",
             type=str,
-            default="profiling/traces",
-            help="trace file location",
+            default="profile_traces",
+            help="Trace files location",
         )
         self.parser.add_argument(
             "--profiling.profile_freq",
             type=int,
             default=10,
-            help="how often to collect profiler traces, in iterations",
+            help="How often to collect profiler traces, in iterations",
         )

         # metrics configs
@@ -94,24 +94,24 @@ def __init__(self):
             "--metrics.log_freq",
             type=int,
             default=10,
-            help="how often to log metrics to TensorBoard, in iterations",
+            help="How often to log metrics to TensorBoard, in iterations",
         )
         self.parser.add_argument(
             "--metrics.enable_color_printing",
             default=False,
             action="store_true",
-            help="whether to enable color printing",
+            help="Whether to enable color printing",
         )
         self.parser.add_argument(
             "--metrics.enable_tensorboard",
             action="store_true",
-            help="whether to log metrics to TensorBoard",
+            help="Whether to log metrics to TensorBoard",
         )
         self.parser.add_argument(
             "--metrics.save_tb_folder",
             type=str,
             default="tb",
-            help="folder to dump tensorboard state",
+            help="Folder to dump TensorBoard states",
         )

         # model configs
@@ -119,38 +119,38 @@ def __init__(self):
             "--model.name",
             type=str,
             default="llama",
-            help="which model to train",
+            help="Which model to train",
         )
         self.parser.add_argument(
             "--model.flavor",
             type=str,
             default="debugmodel",
-            help="which model config to train",
+            help="Which model config to train",
         )
         self.parser.add_argument(
             "--model.norm_type",
             type=str,
             default="rmsnorm",
-            help="Layer Normalization type to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
+            help="Type of layer normalization to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
         )
         self.parser.add_argument(
             "--model.tokenizer_path",
             type=str,
             default="./torchtitan/datasets/tokenizer/tokenizer.model",
-            help="tokenizer path",
+            help="Tokenizer path",
         )

         # optimizer configs
         self.parser.add_argument(
-            "--optimizer.name", type=str, default="AdamW", help="optimizer to use"
+            "--optimizer.name", type=str, default="AdamW", help="Optimizer to use"
         )
         self.parser.add_argument(
-            "--optimizer.lr", type=float, default=8e-4, help="learning rate to use"
+            "--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
         )

         # training configs
         self.parser.add_argument(
-            "--training.dataset", type=str, default="alpaca", help="dataset to use"
+            "--training.dataset", type=str, default="alpaca", help="Dataset to use"
         )
         self.parser.add_argument(
"--training.dataset_path", @@ -160,28 +160,28 @@ def __init__(self): loaded from this path instead of downloaded.""", ) self.parser.add_argument( - "--training.batch_size", type=int, default=8, help="batch size" + "--training.batch_size", type=int, default=8, help="Batch size" ) self.parser.add_argument( - "--training.seq_len", type=int, default=2048, help="sequence length" + "--training.seq_len", type=int, default=2048, help="Sequence length" ) self.parser.add_argument( "--training.warmup_steps", type=int, default=200, - help="steps for lr scheduler warmup", + help="Steps for lr scheduler warmup, normally 1/5 of --training.steps", ) self.parser.add_argument( "--training.max_norm", type=Union[float, int], default=1.0, - help="max norm for gradient clipping", + help="Max norm for gradient clipping", ) self.parser.add_argument( "--training.steps", type=int, default=10000, - help="how many train steps to run", + help="How many train steps to run", ) self.parser.add_argument( "--training.data_parallel_degree", @@ -199,18 +199,18 @@ def __init__(self): "--training.enable_loss_parallel", default=True, action="store_true", - help="whether to enable loss parallel when sequence parallel is enabled", + help="Whether to apply loss parallel when sequence parallel is enabled", ) self.parser.add_argument( "--training.pipeline_parallel_degree", type=int, default=1, - help="Pipeline Parallelism degree (default of 1 means disabled)", + help="Pipeline Parallelism degree. 1 means disabled.", ) self.parser.add_argument( "--training.compile", action="store_true", - help="Whether to compile the model.", + help="Whether to compile the model", ) self.parser.add_argument( "--training.fp8_linear", @@ -220,7 +220,7 @@ def __init__(self): "dynamic", "", ], # TODO: add "delayed" option back in when supported - help="Type of fp8 linear quantization to apply to the model", + help="Type of fp8 linear quantization to apply to the model ['', 'dynamic']", ) self.parser.add_argument( "--training.gc_freq", @@ -229,7 +229,7 @@ def __init__(self): help="Python garbage control scheduling interval, in steps", ) - # checkpoint configs + # checkpointing configs self.parser.add_argument( "--checkpoint.enable_checkpoint", action="store_true", @@ -248,19 +248,13 @@ def __init__(self): "--checkpoint.interval_type", type=str, default="steps", - help=""" - The checkpointing interval unit of measurement. - The default value is steps. - """, + help="Checkpointing interval unit of measurement ['step', 'seconds']", ) self.parser.add_argument( "--checkpoint.interval", type=int, default=500, - help=""" - Checkpointing interval. The unit of measurement is in seconds or - steps depending on --checkpoint.interval_type. 
- """, + help="Checkpointing interval, in steps or seconds depending on --checkpoint.interval_type", ) self.parser.add_argument( "--checkpoint.model_weights_only", @@ -284,18 +278,21 @@ def __init__(self): """, ) - # activation checkpointing + # activation checkpointing configs self.parser.add_argument( "--activation_checkpoint.mode", type=str, default="selective", - help=" ['none', 'full', 'selective'] = type of activation checkpointing to use", + help="Type of activation checkpointing to use ['none', 'full', 'selective']", ) self.parser.add_argument( "--activation_checkpoint.selective_ac_option", type=str, default="2", # 2 = checkpoint every other layer - help="['int', 'op'] = selective activation checkpointing options, 'int' for every nth layer, or 'op' for op level ac.", + help=""" + Selective activation checkpointing options ['int', 'op']. + 'int' (e.g., 2) for every nth layer, or 'op' for op level ac. + """, ) # communications library settings @@ -310,7 +307,7 @@ def __init__(self): type=int, default=100, help=( - "Timeout for communication operations after the first train step-" + "Timeout for communication operations after the first train step -- " "usually a tighter bound than during initialization." ), ) diff --git a/train.py b/train.py index 00327385d..2b4d5ee96 100644 --- a/train.py +++ b/train.py @@ -385,7 +385,7 @@ def loss_fn(pred, labels): ) if torch.distributed.get_rank() == 0: - logger.info("Sleeping for 2 seconds for others ranks to complete ") + logger.info("Sleeping 2 seconds for other ranks to complete") time.sleep(2) metric_logger.close() diff --git a/train_configs/debug_model.toml b/train_configs/debug_model.toml index 6eb623a5e..baf6bd553 100644 --- a/train_configs/debug_model.toml +++ b/train_configs/debug_model.toml @@ -19,7 +19,7 @@ save_tb_folder = "tb" [model] name = "llama" flavor = "debugmodel" -norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm +norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model" [optimizer] @@ -37,7 +37,7 @@ tensor_parallel_degree = 1 pipeline_parallel_degree = 1 fp8_linear = "" compile = false -dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M) +dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M) [checkpoint] enable_checkpoint = false diff --git a/train_configs/llama_13b.toml b/train_configs/llama_13b.toml index 4fc72c11c..7151df999 100644 --- a/train_configs/llama_13b.toml +++ b/train_configs/llama_13b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama" flavor = "13B" -norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm] +norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm] tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model" [optimizer] diff --git a/train_configs/llama_70b.toml b/train_configs/llama_70b.toml index 1878647db..110453d4f 100644 --- a/train_configs/llama_70b.toml +++ b/train_configs/llama_70b.toml @@ -18,7 +18,7 @@ save_tb_folder = "tb" [model] name = "llama" flavor = "70B" -norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm] +norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm] tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model" [optimizer] diff --git a/train_configs/llama_7b.toml b/train_configs/llama_7b.toml index 7e8f7f789..3b8711517 100644 --- a/train_configs/llama_7b.toml +++ 
@@ -17,7 +17,7 @@ save_tb_folder = "tb"
 [model]
 name = "llama"
 flavor = "7B"
-norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
+norm_type = "fused_rmsnorm"  # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
 tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

 [optimizer]
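For reference, the options renamed or re-documented above map one-to-one onto TOML sections and keys, the same way the `[model]` and `[checkpoint]` sections in the train_configs files do: `--profiling.save_traces_folder` becomes `save_traces_folder` under `[profiling]`, and so on. The snippet below is only an illustrative sketch of that mapping; the values are example choices, not defaults shipped in any train_configs file.

```toml
# Illustrative only: how the options touched in this diff would appear in a job
# TOML file. Section/key names mirror the "--<section>.<key>" argparse names;
# the values are examples, not shipped defaults.
[profiling]
enable_profiling = true
save_traces_folder = "profile_traces"  # matches the new default folder name
profile_freq = 10  # collect traces every 10 iterations

[checkpoint]
enable_checkpoint = true
interval_type = "steps"  # or "seconds"
interval = 500

[activation_checkpoint]
mode = "selective"  # none / full / selective
selective_ac_option = "2"  # every 2nd layer; use "op" for op-level AC
```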