polish toml files
ghstack-source-id: 287d31e9a14861244f1292f61604a296fb7d4e11
Pull Request resolved: #245
tianyu-l committed Apr 16, 2024
1 parent 241ae6f commit b022a96
Showing 6 changed files with 44 additions and 47 deletions.
79 changes: 38 additions & 41 deletions torchtitan/config_manager.py
@@ -47,110 +47,110 @@ def __init__(self):
"--job.config_file",
type=str,
default=None,
help="job config file",
help="Job config file",
)

# job level configs
self.parser.add_argument(
"--job.dump_folder",
type=str,
default="./torchtitan/outputs",
help="folder to dump job outputs",
help="Folder to dump job outputs",
)
self.parser.add_argument(
"--job.description",
type=str,
default="default job",
help="description of the job",
help="Description of the job",
)
self.parser.add_argument(
"--job.use_for_integration_test",
default=False,
action="store_true",
help="add this config to integration test suite",
help="Add this config to the integration test suite",
)

# profiling configs
self.parser.add_argument(
"--profiling.enable_profiling",
action="store_true",
help="enable pytorch profiler",
help="Whether to enable pytorch profiler",
)
self.parser.add_argument(
"--profiling.save_traces_folder",
type=str,
default="profiling/traces",
help="trace file location",
default="profile_traces",
help="Trace files location",
)
self.parser.add_argument(
"--profiling.profile_freq",
type=int,
default=10,
help="how often to collect profiler traces, in iterations",
help="How often to collect profiler traces, in iterations",
)

# metrics configs
self.parser.add_argument(
"--metrics.log_freq",
type=int,
default=10,
help="how often to log metrics to TensorBoard, in iterations",
help="How often to log metrics to TensorBoard, in iterations",
)
self.parser.add_argument(
"--metrics.enable_color_printing",
default=False,
action="store_true",
help="whether to enable color printing",
help="Whether to enable color printing",
)
self.parser.add_argument(
"--metrics.enable_tensorboard",
action="store_true",
help="whether to log metrics to TensorBoard",
help="Whether to log metrics to TensorBoard",
)
self.parser.add_argument(
"--metrics.save_tb_folder",
type=str,
default="tb",
help="folder to dump tensorboard state",
help="Folder to dump TensorBoard states",
)

# model configs
self.parser.add_argument(
"--model.name",
type=str,
default="llama",
help="which model to train",
help="Which model to train",
)
self.parser.add_argument(
"--model.flavor",
type=str,
default="debugmodel",
help="which model config to train",
help="Which model config to train",
)
self.parser.add_argument(
"--model.norm_type",
type=str,
default="rmsnorm",
help="Layer Normalization type to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
help="Type of layer normalization to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
)
self.parser.add_argument(
"--model.tokenizer_path",
type=str,
default="./torchtitan/datasets/tokenizer/tokenizer.model",
help="tokenizer path",
help="Tokenizer path",
)

# optimizer configs
self.parser.add_argument(
"--optimizer.name", type=str, default="AdamW", help="optimizer to use"
"--optimizer.name", type=str, default="AdamW", help="Optimizer to use"
)
self.parser.add_argument(
"--optimizer.lr", type=float, default=8e-4, help="learning rate to use"
"--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
)

# training configs
self.parser.add_argument(
"--training.dataset", type=str, default="alpaca", help="dataset to use"
"--training.dataset", type=str, default="alpaca", help="Dataset to use"
)
self.parser.add_argument(
"--training.dataset_path",
@@ -160,28 +160,28 @@ def __init__(self):
loaded from this path instead of downloaded.""",
)
self.parser.add_argument(
"--training.batch_size", type=int, default=8, help="batch size"
"--training.batch_size", type=int, default=8, help="Batch size"
)
self.parser.add_argument(
"--training.seq_len", type=int, default=2048, help="sequence length"
"--training.seq_len", type=int, default=2048, help="Sequence length"
)
self.parser.add_argument(
"--training.warmup_steps",
type=int,
default=200,
help="steps for lr scheduler warmup",
help="Steps for lr scheduler warmup, normally 1/5 of --training.steps",
)
self.parser.add_argument(
"--training.max_norm",
type=Union[float, int],
default=1.0,
help="max norm for gradient clipping",
help="Max norm for gradient clipping",
)
self.parser.add_argument(
"--training.steps",
type=int,
default=10000,
help="how many train steps to run",
help="How many train steps to run",
)
self.parser.add_argument(
"--training.data_parallel_degree",
@@ -199,18 +199,18 @@ def __init__(self):
"--training.enable_loss_parallel",
default=True,
action="store_true",
help="whether to enable loss parallel when sequence parallel is enabled",
help="Whether to apply loss parallel when sequence parallel is enabled",
)
self.parser.add_argument(
"--training.pipeline_parallel_degree",
type=int,
default=1,
help="Pipeline Parallelism degree (default of 1 means disabled)",
help="Pipeline Parallelism degree. 1 means disabled.",
)
self.parser.add_argument(
"--training.compile",
action="store_true",
help="Whether to compile the model.",
help="Whether to compile the model",
)
self.parser.add_argument(
"--training.fp8_linear",
@@ -220,7 +220,7 @@ def __init__(self):
"dynamic",
"",
], # TODO: add "delayed" option back in when supported
help="Type of fp8 linear quantization to apply to the model",
help="Type of fp8 linear quantization to apply to the model ['', 'dynamic']",
)
self.parser.add_argument(
"--training.gc_freq",
@@ -229,7 +229,7 @@ def __init__(self):
help="Python garbage control scheduling interval, in steps",
)

- # checkpoint configs
+ # checkpointing configs
self.parser.add_argument(
"--checkpoint.enable_checkpoint",
action="store_true",
@@ -248,19 +248,13 @@ def __init__(self):
"--checkpoint.interval_type",
type=str,
default="steps",
help="""
The checkpointing interval unit of measurement.
The default value is steps.
""",
help="Checkpointing interval unit of measurement ['step', 'seconds']",
)
self.parser.add_argument(
"--checkpoint.interval",
type=int,
default=500,
help="""
Checkpointing interval. The unit of measurement is in seconds or
steps depending on --checkpoint.interval_type.
""",
help="Checkpointing interval, in steps or seconds depending on --checkpoint.interval_type",
)
self.parser.add_argument(
"--checkpoint.model_weights_only",
@@ -284,18 +278,21 @@ def __init__(self):
""",
)

- # activation checkpointing
+ # activation checkpointing configs
self.parser.add_argument(
"--activation_checkpoint.mode",
type=str,
default="selective",
help=" ['none', 'full', 'selective'] = type of activation checkpointing to use",
help="Type of activation checkpointing to use ['none', 'full', 'selective']",
)
self.parser.add_argument(
"--activation_checkpoint.selective_ac_option",
type=str,
default="2", # 2 = checkpoint every other layer
help="['int', 'op'] = selective activation checkpointing options, 'int' for every nth layer, or 'op' for op level ac.",
help="""
Selective activation checkpointing options ['int', 'op'].
'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
""",
)

# communications library settings
@@ -310,7 +307,7 @@ def __init__(self):
type=int,
default=100,
help=(
"Timeout for communication operations after the first train step-"
"Timeout for communication operations after the first train step -- "
"usually a tighter bound than during initialization."
),
)
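As context for reviewers: below is a minimal sketch of how these `--section.key` argparse options map onto a TOML config file. Section and key names mirror the prefixes above, and the values are the defaults visible in this diff; treat it as an illustrative excerpt under those assumptions, not a complete or canonical torchtitan config.

```toml
# Illustrative only: sections/keys mirror the --section.key prefixes above;
# values are the defaults shown in this diff, not a full training config.
[job]
dump_folder = "./torchtitan/outputs"
description = "default job"

[profiling]
save_traces_folder = "profile_traces"
profile_freq = 10

[metrics]
log_freq = 10
save_tb_folder = "tb"

[model]
name = "llama"
flavor = "debugmodel"
norm_type = "rmsnorm"
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
name = "AdamW"
lr = 8e-4

[training]
dataset = "alpaca"
batch_size = 8
seq_len = 2048
warmup_steps = 200
max_norm = 1.0
steps = 10000
pipeline_parallel_degree = 1
fp8_linear = ""
compile = false

[checkpoint]
enable_checkpoint = false
interval_type = "steps"
interval = 500

[activation_checkpoint]
mode = "selective"
selective_ac_option = "2"
```

Any of these keys can also be overridden on the command line via the matching `--section.key` flag; a sketch of that load-then-override pattern follows the TOML diffs below.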
2 changes: 1 addition & 1 deletion train.py
@@ -385,7 +385,7 @@ def loss_fn(pred, labels):
)

if torch.distributed.get_rank() == 0:
logger.info("Sleeping for 2 seconds for others ranks to complete ")
logger.info("Sleeping 2 seconds for other ranks to complete")
time.sleep(2)

metric_logger.close()
4 changes: 2 additions & 2 deletions train_configs/debug_model.toml
@@ -19,7 +19,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "debugmodel"
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
@@ -37,7 +37,7 @@ tensor_parallel_degree = 1
pipeline_parallel_degree = 1
fp8_linear = ""
compile = false
dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M)
dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M)

[checkpoint]
enable_checkpoint = false
2 changes: 1 addition & 1 deletion train_configs/llama_13b.toml
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "13B"
norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
2 changes: 1 addition & 1 deletion train_configs/llama_70b.toml
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "70B"
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
2 changes: 1 addition & 1 deletion train_configs/llama_7b.toml
@@ -17,7 +17,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "7B"
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
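To show how the TOML files above interact with the `--section.key` flags from config_manager.py, here is a hedged Python sketch of the load-then-override pattern. The function name, the file handling, and the two sample flags are assumptions for illustration only; this is not the actual torchtitan JobConfig implementation.

```python
# Hedged sketch of the pattern config_manager.py implements: load a TOML
# config file, then let "--section.key" CLI flags override matching entries.
# Not the actual torchtitan implementation; names and flags are illustrative.
import argparse
import tomllib  # Python 3.11+; older interpreters would need the tomli backport


def load_job_config(toml_path: str, cli_args: list[str]) -> dict:
    with open(toml_path, "rb") as f:
        config = tomllib.load(f)

    parser = argparse.ArgumentParser()
    # Two representative flags; the real parser defines many more (see diff above).
    parser.add_argument("--model.norm_type", type=str)
    parser.add_argument("--training.steps", type=int)
    args, _ = parser.parse_known_args(cli_args)

    # Every "--section.key" flag that was actually passed overrides the TOML value.
    for dotted_key, value in vars(args).items():
        if value is None:
            continue
        section, key = dotted_key.split(".", 1)
        config.setdefault(section, {})[key] = value
    return config


if __name__ == "__main__":
    # Example: CLI values win over what the TOML file specifies.
    cfg = load_job_config(
        "train_configs/debug_model.toml",
        ["--model.norm_type", "rmsnorm", "--training.steps", "1000"],
    )
    print(cfg["model"]["norm_type"], cfg["training"]["steps"])
```

The point of the pattern is that the TOML file supplies the defaults for a run, while any flag passed explicitly on the command line wins for that key.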
