polish toml files and helper messages #245

Merged 4 commits on Apr 16, 2024
79 changes: 38 additions & 41 deletions torchtitan/config_manager.py
@@ -47,110 +47,110 @@ def __init__(self):
"--job.config_file",
type=str,
default=None,
help="job config file",
help="Job config file",
)

# job level configs
self.parser.add_argument(
"--job.dump_folder",
type=str,
default="./torchtitan/outputs",
help="folder to dump job outputs",
help="Folder to dump job outputs",
)
self.parser.add_argument(
"--job.description",
type=str,
default="default job",
help="description of the job",
help="Description of the job",
)
self.parser.add_argument(
"--job.use_for_integration_test",
default=False,
action="store_true",
help="add this config to integration test suite",
help="Add this config to the integration test suite",
)

# profiling configs
self.parser.add_argument(
"--profiling.enable_profiling",
action="store_true",
help="enable pytorch profiler",
help="Whether to enable pytorch profiler",
)
self.parser.add_argument(
"--profiling.save_traces_folder",
type=str,
default="profiling/traces",
help="trace file location",
default="profile_traces",
help="Trace files location",
)
self.parser.add_argument(
"--profiling.profile_freq",
type=int,
default=10,
help="how often to collect profiler traces, in iterations",
help="How often to collect profiler traces, in iterations",
)

# metrics configs
self.parser.add_argument(
"--metrics.log_freq",
type=int,
default=10,
help="how often to log metrics to TensorBoard, in iterations",
help="How often to log metrics to TensorBoard, in iterations",
)
self.parser.add_argument(
"--metrics.enable_color_printing",
default=False,
action="store_true",
help="whether to enable color printing",
help="Whether to enable color printing",
)
self.parser.add_argument(
"--metrics.enable_tensorboard",
action="store_true",
help="whether to log metrics to TensorBoard",
help="Whether to log metrics to TensorBoard",
)
self.parser.add_argument(
"--metrics.save_tb_folder",
type=str,
default="tb",
help="folder to dump tensorboard state",
help="Folder to dump TensorBoard states",
)

# model configs
self.parser.add_argument(
"--model.name",
type=str,
default="llama",
help="which model to train",
help="Which model to train",
)
self.parser.add_argument(
"--model.flavor",
type=str,
default="debugmodel",
help="which model config to train",
help="Which model config to train",
)
self.parser.add_argument(
"--model.norm_type",
type=str,
default="rmsnorm",
help="Layer Normalization type to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
help="Type of layer normalization to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
)
self.parser.add_argument(
"--model.tokenizer_path",
type=str,
default="./torchtitan/datasets/tokenizer/tokenizer.model",
help="tokenizer path",
help="Tokenizer path",
)

# optimizer configs
self.parser.add_argument(
"--optimizer.name", type=str, default="AdamW", help="optimizer to use"
"--optimizer.name", type=str, default="AdamW", help="Optimizer to use"
)
self.parser.add_argument(
"--optimizer.lr", type=float, default=8e-4, help="learning rate to use"
"--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
)

# training configs
self.parser.add_argument(
"--training.dataset", type=str, default="alpaca", help="dataset to use"
"--training.dataset", type=str, default="alpaca", help="Dataset to use"
)
self.parser.add_argument(
"--training.dataset_path",
@@ -160,28 +160,28 @@ def __init__(self):
loaded from this path instead of downloaded.""",
)
self.parser.add_argument(
"--training.batch_size", type=int, default=8, help="batch size"
"--training.batch_size", type=int, default=8, help="Batch size"
)
self.parser.add_argument(
"--training.seq_len", type=int, default=2048, help="sequence length"
"--training.seq_len", type=int, default=2048, help="Sequence length"
)
self.parser.add_argument(
"--training.warmup_steps",
type=int,
default=200,
help="steps for lr scheduler warmup",
help="Steps for lr scheduler warmup, normally 1/5 of --training.steps",
)
self.parser.add_argument(
"--training.max_norm",
type=Union[float, int],
default=1.0,
help="max norm for gradient clipping",
help="Max norm for gradient clipping",
)
self.parser.add_argument(
"--training.steps",
type=int,
default=10000,
help="how many train steps to run",
help="How many train steps to run",
)
self.parser.add_argument(
"--training.data_parallel_degree",
@@ -199,18 +199,18 @@ def __init__(self):
"--training.enable_loss_parallel",
default=True,
action="store_true",
help="whether to enable loss parallel when sequence parallel is enabled",
help="Whether to apply loss parallel when sequence parallel is enabled",
)
self.parser.add_argument(
"--training.pipeline_parallel_degree",
type=int,
default=1,
help="Pipeline Parallelism degree (default of 1 means disabled)",
help="Pipeline Parallelism degree. 1 means disabled.",
)
self.parser.add_argument(
"--training.compile",
action="store_true",
help="Whether to compile the model.",
help="Whether to compile the model",
)
self.parser.add_argument(
"--training.fp8_linear",
@@ -220,7 +220,7 @@ def __init__(self):
"dynamic",
"",
], # TODO: add "delayed" option back in when supported
help="Type of fp8 linear quantization to apply to the model",
help="Type of fp8 linear quantization to apply to the model ['', 'dynamic']",
)
self.parser.add_argument(
"--training.gc_freq",
@@ -229,7 +229,7 @@ def __init__(self):
help="Python garbage control scheduling interval, in steps",
)

# checkpoint configs
# checkpointing configs
self.parser.add_argument(
"--checkpoint.enable_checkpoint",
action="store_true",
@@ -248,19 +248,13 @@ def __init__(self):
"--checkpoint.interval_type",
type=str,
default="steps",
help="""
The checkpointing interval unit of measurement.
The default value is steps.
""",
help="Checkpointing interval unit of measurement ['step', 'seconds']",
)
self.parser.add_argument(
"--checkpoint.interval",
type=int,
default=500,
help="""
Checkpointing interval. The unit of measurement is in seconds or
steps depending on --checkpoint.interval_type.
""",
help="Checkpointing interval, in steps or seconds depending on --checkpoint.interval_type",
)
self.parser.add_argument(
"--checkpoint.model_weights_only",
@@ -284,18 +278,21 @@ def __init__(self):
""",
)

# activation checkpointing
# activation checkpointing configs
self.parser.add_argument(
"--activation_checkpoint.mode",
type=str,
default="selective",
help=" ['none', 'full', 'selective'] = type of activation checkpointing to use",
help="Type of activation checkpointing to use ['none', 'full', 'selective']",
)
self.parser.add_argument(
"--activation_checkpoint.selective_ac_option",
type=str,
default="2", # 2 = checkpoint every other layer
help="['int', 'op'] = selective activation checkpointing options, 'int' for every nth layer, or 'op' for op level ac.",
help="""
Selective activation checkpointing options ['int', 'op'].
'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
""",
)

# communications library settings
@@ -310,7 +307,7 @@ def __init__(self):
type=int,
default=100,
help=(
"Timeout for communication operations after the first train step-"
"Timeout for communication operations after the first train step -- "
"usually a tighter bound than during initialization."
),
)
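
For context, the dotted flag names in this file ("--training.batch_size", "--checkpoint.interval", and so on) mirror the TOML tables under train_configs/. Below is a minimal Python sketch of how such flags and a --job.config_file might be combined, with explicitly passed CLI flags taking precedence over TOML values; this is illustrative only, not torchtitan's actual JobConfig logic, and tomllib assumes Python 3.11+.

import argparse
import sys
import tomllib  # Python 3.11+; an assumption, not taken from this PR


def build_parser() -> argparse.ArgumentParser:
    # A tiny subset of the flags defined in config_manager.py above.
    parser = argparse.ArgumentParser()
    parser.add_argument("--job.config_file", type=str, default=None)
    parser.add_argument("--training.batch_size", type=int, default=8)
    parser.add_argument("--training.steps", type=int, default=10000)
    return parser


def parse_config(argv=None) -> dict:
    argv = sys.argv[1:] if argv is None else argv
    config = vars(build_parser().parse_args(argv))  # argparse defaults plus CLI values

    # Flags the user actually typed; these win over the TOML file.
    passed = {a.split("=")[0].lstrip("-") for a in argv if a.startswith("--")}

    if config["job.config_file"] is not None:
        with open(config["job.config_file"], "rb") as f:
            for section, table in tomllib.load(f).items():
                for key, value in table.items():
                    dotted = f"{section}.{key}"  # e.g. "training.batch_size"
                    if dotted not in passed:
                        config[dotted] = value
    return config


if __name__ == "__main__":
    # The CLI value overrides the argparse default of 8.
    print(parse_config(["--training.batch_size", "4"]))
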
2 changes: 1 addition & 1 deletion train.py
@@ -385,7 +385,7 @@ def loss_fn(pred, labels):
)

if torch.distributed.get_rank() == 0:
logger.info("Sleeping for 2 seconds for others ranks to complete ")
logger.info("Sleeping 2 seconds for other ranks to complete")
time.sleep(2)

metric_logger.close()
4 changes: 2 additions & 2 deletions train_configs/debug_model.toml
@@ -19,7 +19,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "debugmodel"
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
@@ -37,7 +37,7 @@ tensor_parallel_degree = 1
pipeline_parallel_degree = 1
fp8_linear = ""
compile = false
dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M)
dataset = "alpaca" # supported datasets: alpaca (52K), openwebtext (8M), c4 (177M)

[checkpoint]
enable_checkpoint = false
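
As a quick sanity check of the polished config above, the debug TOML can be loaded directly; a short sketch (assumes Python 3.11+ for tomllib and that it is run from the repository root):

import tomllib  # Python 3.11+

with open("train_configs/debug_model.toml", "rb") as f:
    cfg = tomllib.load(f)

# The [model] table touched by the diff above.
model_cfg = cfg["model"]
assert model_cfg["norm_type"] in {"layernorm", "np_layernorm", "rmsnorm", "fused_rmsnorm"}
print(model_cfg["name"], model_cfg["flavor"], model_cfg["norm_type"])
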
2 changes: 1 addition & 1 deletion train_configs/llama_13b.toml
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "13B"
norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
norm_type = "fused_rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
2 changes: 1 addition & 1 deletion train_configs/llama_70b.toml
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "70B"
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
2 changes: 1 addition & 1 deletion train_configs/llama_7b.toml
@@ -17,7 +17,7 @@ save_tb_folder = "tb"
[model]
name = "llama"
flavor = "7B"
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]