diff --git a/recipes/configs/falcon3/10B_full.yaml b/recipes/configs/falcon3/10B_full.yaml new file mode 100644 index 0000000000..6bdafb4b22 --- /dev/null +++ b/recipes/configs/falcon3/10B_full.yaml @@ -0,0 +1,107 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/10B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/10B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 10B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_10b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_full_single_device.yaml b/recipes/configs/falcon3/10B_full_single_device.yaml new file mode 100644 index 0000000000..c87ff7f52d --- /dev/null +++ b/recipes/configs/falcon3/10B_full_single_device.yaml @@ -0,0 +1,108 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/10B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/10B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_10b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. 
Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_lora.yaml b/recipes/configs/falcon3/10B_lora.yaml new file mode 100644 index 0000000000..0e342bd03e --- /dev/null +++ b/recipes/configs/falcon3/10B_lora.yaml @@ -0,0 +1,117 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 10B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/10B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/10B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
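Note: across the full-finetune configs above, the number of samples contributing to a single optimizer update is the product of batch_size, gradient_accumulation_steps and, for the distributed recipe, the number of devices; optimizer_in_bwd fuses the optimizer step into the backward pass, which leaves nothing to accumulate and is why it requires gradient_accumulation_steps=1. A minimal sketch of that arithmetic, assuming the 10B_full defaults and the 4-GPU launch command from its header:

    # Illustrative arithmetic only; values taken from 10B_full.yaml and its launch command.
    batch_size = 2                   # per-device micro-batch
    gradient_accumulation_steps = 8  # micro-batches accumulated before each optimizer step
    num_devices = 4                  # --nproc_per_node 4

    effective_batch_size = batch_size * gradient_accumulation_steps * num_devices
    print(effective_batch_size)      # 64 samples per optimizer update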
+# For single device LoRA finetuning please use 10B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_10b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/10B_lora_single_device.yaml b/recipes/configs/falcon3/10B_lora_single_device.yaml new file mode 100644 index 0000000000..98e9f0dd76 --- /dev/null +++ b/recipes/configs/falcon3/10B_lora_single_device.yaml @@ -0,0 +1,116 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 10B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-10B-Base --output-dir /tmp/Falcon3-10B --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config falcon3/10B_lora_single_device +# +# You can add specific overrides through the command line. 
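Note: the LoRA configs set lora_rank: 8 and lora_alpha: 16, and the low-rank update is typically mixed into the frozen projection with a factor of alpha/rank (2.0 here). The sketch below shows a generic LoRA forward pass for reference; it is an illustration of the technique with made-up dimensions, not torchtune's LoRALinear implementation.

    import torch
    import torch.nn as nn

    class ToyLoRALinear(nn.Module):
        """Generic LoRA: y = W x + (alpha / rank) * B(A x), with W frozen."""
        def __init__(self, in_dim, out_dim, rank=8, alpha=16, dropout=0.0):
            super().__init__()
            self.base = nn.Linear(in_dim, out_dim, bias=False)
            self.base.weight.requires_grad_(False)              # frozen pretrained weight
            self.lora_a = nn.Linear(in_dim, rank, bias=False)   # trainable down-projection
            self.lora_b = nn.Linear(rank, out_dim, bias=False)  # trainable up-projection
            nn.init.zeros_(self.lora_b.weight)                  # update starts at zero
            self.dropout = nn.Dropout(dropout)
            self.scaling = alpha / rank                         # 16 / 8 = 2.0

        def forward(self, x):
            return self.base(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))

    layer = ToyLoRALinear(in_dim=3072, out_dim=3072)  # placeholder dims, not Falcon3's
    print(layer(torch.randn(1, 3072)).shape)          # torch.Size([1, 3072])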
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/10B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_10b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-10B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-10B-Base + checkpoint_files: [ + model-00001-of-00005.safetensors, + model-00002-of-00005.safetensors, + model-00003-of-00005.safetensors, + model-00004-of-00005.safetensors, + model-00005-of-00005.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-10B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-10B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/1B_full.yaml b/recipes/configs/falcon3/1B_full.yaml new file mode 100644 index 0000000000..a14d1d6c14 --- /dev/null +++ b/recipes/configs/falcon3/1B_full.yaml @@ -0,0 +1,103 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 1B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 
full_finetune_distributed --config falcon3/1B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/1B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 1B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-1B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_1b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-1B-Base + checkpoint_files: [ + model.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-1B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-1B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/1B_full_single_device.yaml b/recipes/configs/falcon3/1B_full_single_device.yaml new file mode 100644 index 0000000000..bd95d54971 --- /dev/null +++ b/recipes/configs/falcon3/1B_full_single_device.yaml @@ -0,0 +1,104 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 1B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-7B-Base --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed,
+# you can install it with
+# pip install bitsandbytes
+#
+# To launch on a single device, run the following command from root:
+# tune run full_finetune_single_device --config falcon3/1B_full_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run full_finetune_single_device --config falcon3/1B_full_single_device checkpointer.checkpoint_dir=
+#
+# This config works only for training on single device.
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.falcon3_1b
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 1
+epochs: 1
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 5e-6
+optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Training environment
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/1B_lora.yaml b/recipes/configs/falcon3/1B_lora.yaml
new file mode 100644
index 0000000000..14736e8475
--- /dev/null
+++ b/recipes/configs/falcon3/1B_lora.yaml
@@ -0,0 +1,113 @@
+# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
+# using a Falcon3 1B
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B-Base --ignore-patterns None
+#
+# To launch on 2 devices, run the following command from root:
+# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/1B_lora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/1B_lora checkpointer.checkpoint_dir=
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device LoRA finetuning please use 1B_lora_single_device.yaml
+
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.lora_falcon3_1b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+batch_size: 2
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: False
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: False # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Show case the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/1B_lora_single_device.yaml b/recipes/configs/falcon3/1B_lora_single_device.yaml
new file mode 100644
index 0000000000..1e5ec0f19f
--- /dev/null
+++ b/recipes/configs/falcon3/1B_lora_single_device.yaml
@@ -0,0 +1,112 @@
+# Config for single device LoRA finetuning in lora_finetune_single_device.py
+# using a Falcon3 1B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-1B-Base --output-dir /tmp/Falcon3-1B-Base --ignore-patterns None
+#
+# To launch on a single device, run the following command from root:
+# tune run lora_finetune_single_device --config falcon3/1B_lora_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run lora_finetune_single_device --config falcon3/1B_lora_single_device checkpointer.checkpoint_dir=
+#
+# This config works only for training on single device.
+
+
+# Model Arguments
+model:
+  _component_: torchtune.models.falcon3.lora_falcon3_1b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.falcon3.falcon3_tokenizer
+  path: /tmp/Falcon3-1B-Base/tokenizer.json
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Falcon3-1B-Base
+  checkpoint_files: [
+    model.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Falcon3-1B-Base
+  model_type: FALCON3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/Falcon3-1B-Base-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: False
+
+# Environment
+device: cuda
+dtype: bf16
+
+# Activations Offloading
+enable_activation_checkpointing: True # True reduces memory
+enable_activation_offloading: False # True reduces memory
+
+# Show case the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/falcon3/3B_full.yaml b/recipes/configs/falcon3/3B_full.yaml
new file mode 100644
index 0000000000..ca32297e7e
--- /dev/null
+++ b/recipes/configs/falcon3/3B_full.yaml
@@ -0,0 +1,104 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Falcon3 3B
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None
+#
+# To launch on 4 devices, run the following command from root:
+# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/3B_full
+#
+# You can add specific overrides through the command line.
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/3B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 3B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_full_single_device.yaml b/recipes/configs/falcon3/3B_full_single_device.yaml new file mode 100644 index 0000000000..baf63aade7 --- /dev/null +++ b/recipes/configs/falcon3/3B_full_single_device.yaml @@ -0,0 +1,105 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-7B-Base --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/3B_full_single_device +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/3B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_lora.yaml b/recipes/configs/falcon3/3B_lora.yaml new file mode 100644 index 0000000000..7b5cddc28e --- /dev/null +++ b/recipes/configs/falcon3/3B_lora.yaml @@ -0,0 +1,114 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/3B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/3B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 3B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_3b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/3B_lora_single_device.yaml b/recipes/configs/falcon3/3B_lora_single_device.yaml new file mode 100644 index 0000000000..8c86548a8a --- /dev/null +++ b/recipes/configs/falcon3/3B_lora_single_device.yaml @@ -0,0 +1,113 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-3B-Base --output-dir /tmp/Falcon3-3B --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config falcon3/3B_lora_single_device +# +# You can add specific overrides through the command line. 
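Note: the LoRA configs pair a base lr of 3e-4 with get_cosine_schedule_with_warmup and num_warmup_steps: 100. The snippet below reproduces the usual shape of such a schedule (linear warmup followed by cosine decay to zero) so the warmup length can be sanity-checked against the expected number of optimizer steps; it is a generic formula with a placeholder total step count, not torchtune's exact implementation.

    import math

    def lr_multiplier(step, num_warmup_steps=100, num_training_steps=1000):
        # num_training_steps is a placeholder; in practice it depends on dataset size,
        # batch_size and gradient_accumulation_steps.
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)           # linear warmup: 0 -> 1
        progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))    # cosine decay: 1 -> 0

    for step in (0, 50, 100, 550, 1000):
        print(step, 3e-4 * lr_multiplier(step))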
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/3B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_3b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-3B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-3B-Base + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-3B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-3B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_full.yaml b/recipes/configs/falcon3/7B_full.yaml new file mode 100644 index 0000000000..c2b41d8c33 --- /dev/null +++ b/recipes/configs/falcon3/7B_full.yaml @@ -0,0 +1,106 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/7B_full +# +# You can add specific overrides through the command line. 
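Note: the configs expose enable_activation_checkpointing and enable_activation_offloading as memory/compute trade-offs but do not describe what the first one does. As a point of reference, this is the standard PyTorch activation-checkpointing mechanism applied to a toy block; torchtune applies it per transformer layer, and the module below is only an assumed stand-in for illustration.

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    block = nn.Sequential(nn.Linear(256, 1024), nn.GELU(), nn.Linear(1024, 256))
    x = torch.randn(4, 256, requires_grad=True)

    # Intermediate activations of `block` are not stored; they are recomputed
    # during backward, trading extra compute for lower peak memory.
    y = checkpoint(block, x, use_reentrant=False)
    y.sum().backward()
    print(x.grad.shape)  # torch.Size([4, 256])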
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config falcon3/7B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 7B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_full_single_device.yaml b/recipes/configs/falcon3/7B_full_single_device.yaml new file mode 100644 index 0000000000..313ebb46fd --- /dev/null +++ b/recipes/configs/falcon3/7B_full_single_device.yaml @@ -0,0 +1,107 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config falcon3/7B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config falcon3/7B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_lora.yaml b/recipes/configs/falcon3/7B_lora.yaml new file mode 100644 index 0000000000..e8080b5476 --- /dev/null +++ b/recipes/configs/falcon3/7B_lora.yaml @@ -0,0 +1,116 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Falcon3 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/7B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config falcon3/7B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# For single device LoRA finetuning please use 7B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_7b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True + + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +batch_size: 2 +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/7B_lora_single_device.yaml b/recipes/configs/falcon3/7B_lora_single_device.yaml new file mode 100644 index 0000000000..98b56be550 --- /dev/null +++ b/recipes/configs/falcon3/7B_lora_single_device.yaml @@ -0,0 +1,115 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Falcon3 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download tiiuae/Falcon3-7B-Base --output-dir /tmp/Falcon3-7B --ignore-patterns None +# +# To launch on a single device, 
run the following command from root: +# tune run lora_finetune_single_device --config falcon3/7B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config falcon3/7B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.falcon3.lora_falcon3_7b + lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 # higher increases accuracy and memory + lora_alpha: 16 # usually alpha=2*rank + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base + model_type: FALCON3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False # True increases speed +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 # Use to increase virtual batch size +compile: False # pytorch compile, set to true for better perf/memory + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Falcon3-7B-Base-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/falcon3/eleuther_evaluation.yaml b/recipes/configs/falcon3/eleuther_evaluation.yaml new file mode 100644 index 0000000000..ad3dbc990f --- /dev/null +++ b/recipes/configs/falcon3/eleuther_evaluation.yaml @@ -0,0 +1,41 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command from root torchtune directory: +# tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"] + +# Model 
Arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +# Tokenizer +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/falcon3/generation.yaml b/recipes/configs/falcon3/generation.yaml new file mode 100644 index 0000000000..0b6cba28e9 --- /dev/null +++ b/recipes/configs/falcon3/generation.yaml @@ -0,0 +1,44 @@ +# Config for running the InferenceRecipe in generate.py to generate output from an LLM +# +# To launch, run the following command from root torchtune directory: +# tune run generate --config generation + +# Model arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +device: cuda +dtype: bf16 + +seed: 1234 + +# Tokenizer arguments +tokenizer: + _component_: torchtune.models.falcon3.falcon3_tokenizer + path: /tmp/Falcon3-7B-Base/tokenizer.json + max_seq_len: null + prompt_template: null + +# Generation arguments; defaults taken from gpt-fast +prompt: + system: null + user: "Tell me a joke." 
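Note: generation.yaml sets temperature: 0.6 and top_k: 300 but does not spell out how they interact. The snippet below is a generic sketch of temperature-scaled top-k sampling from a logits vector (random logits stand in for real model output, and the vocabulary size is an assumption), not the exact sampling code used by the generate recipe.

    import torch

    def sample_next_token(logits, temperature=0.6, top_k=300):
        logits = logits / max(temperature, 1e-5)       # <1.0 sharpens the distribution
        k = min(top_k, logits.size(-1))
        top_vals, top_idx = torch.topk(logits, k)      # restrict to the k most likely tokens
        probs = torch.softmax(top_vals, dim=-1)
        return top_idx[torch.multinomial(probs, num_samples=1)]

    vocab_size = 131072  # assumed vocabulary size, for illustration only
    print(sample_next_token(torch.randn(vocab_size)).item())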
+max_new_tokens: 300 +temperature: 0.6 # 0.8 and 0.6 are popular values to try +top_k: 300 + +enable_kv_cache: True + +quantizer: null diff --git a/recipes/configs/falcon3/quantization.yaml b/recipes/configs/falcon3/quantization.yaml new file mode 100644 index 0000000000..d798f31b06 --- /dev/null +++ b/recipes/configs/falcon3/quantization.yaml @@ -0,0 +1,30 @@ +# Config for QuantizationRecipe in quantize.py +# +# To launch, run the following command from root torchtune directory: +# tune run quantize --config quantization + +# +# Model arguments +model: + _component_: torchtune.models.falcon3.falcon3_7b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Falcon3-7B-Base/ + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Falcon3-7B-Base/ + model_type: FALCON3 + +device: cuda +dtype: bf16 +seed: 1234 + +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer + groupsize: 256 diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index faf1ec7124..8cd74c08f4 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -27,6 +27,22 @@ class Recipe: name="full_finetune_single_device", file_path="full_finetune_single_device.py", configs=[ + Config( + name="falcon3/1B_full_single_device", + file_path="falcon3/1B_full_single_device.yaml", + ), + Config( + name="falcon3/3B_full_single_device", + file_path="falcon3/3B_full_single_device.yaml", + ), + Config( + name="falcon3/7B_full_single_device", + file_path="falcon3/7B_full_single_device.yaml", + ), + Config( + name="falcon3/10B_full_single_device", + file_path="falcon3/10B_full_single_device.yaml", + ), Config( name="llama2/7B_full_low_memory", file_path="llama2/7B_full_low_memory.yaml", @@ -98,6 +114,10 @@ class Recipe: name="full_finetune_distributed", file_path="full_finetune_distributed.py", configs=[ + Config(name="falcon3/1B_full", file_path="falcon3/1B_full.yaml"), + Config(name="falcon3/3B_full", file_path="falcon3/3B_full.yaml"), + Config(name="falcon3/7B_full", file_path="falcon3/7B_full.yaml"), + Config(name="falcon3/10B_full", file_path="falcon3/10B_full.yaml"), Config(name="llama2/7B_full", file_path="llama2/7B_full.yaml"), Config(name="llama2/13B_full", file_path="llama2/13B_full.yaml"), Config(name="llama3/8B_full", file_path="llama3/8B_full.yaml"), @@ -136,6 +156,22 @@ class Recipe: name="lora_finetune_single_device", file_path="lora_finetune_single_device.py", configs=[ + Config( + name="falcon3/1B_lora_single_device", + file_path="falcon3/1B_lora_single_device.yaml", + ), + Config( + name="falcon3/3B_lora_single_device", + file_path="falcon3/3B_lora_single_device.yaml", + ), + Config( + name="falcon3/7B_lora_single_device", + file_path="falcon3/7B_lora_single_device.yaml", + ), + Config( + name="falcon3/10B_lora_single_device", + file_path="falcon3/10B_lora_single_device.yaml", + ), Config( name="llama2/7B_lora_single_device", file_path="llama2/7B_lora_single_device.yaml", @@ -340,6 +376,10 @@ class Recipe: name="lora_finetune_distributed", file_path="lora_finetune_distributed.py", configs=[ + Config(name="falcon3/1B_lora", file_path="falcon3/1B_lora.yaml"), + Config(name="falcon3/3B_lora", file_path="falcon3/3B_lora.yaml"), + Config(name="falcon3/7B_lora", file_path="falcon3/7B_lora.yaml"), + Config(name="falcon3/10B_lora", 
file_path="falcon3/10B_lora.yaml"), Config(name="llama2/7B_lora", file_path="llama2/7B_lora.yaml"), Config(name="llama2/13B_lora", file_path="llama2/13B_lora.yaml"), Config(name="llama2/70B_lora", file_path="llama2/70B_lora.yaml"), @@ -449,6 +489,10 @@ class Recipe: file_path="eleuther_eval.py", configs=[ Config(name="eleuther_evaluation", file_path="eleuther_evaluation.yaml"), + Config( + name="falcon3/evaluation", + file_path="falcon3/evaluation.yaml", + ), Config( name="llama3_2_vision/11B_evaluation", file_path="llama3_2_vision/11B_evaluation.yaml", diff --git a/torchtune/models/falcon3/__init__.py b/torchtune/models/falcon3/__init__.py new file mode 100644 index 0000000000..f8cdbdeac7 --- /dev/null +++ b/torchtune/models/falcon3/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from ._tokenizer import Falcon3Tokenizer +from ._component_builders import falcon3, lora_falcon3 +from ._convert_weights import falcon3_hf_to_tune, falcon3_tune_to_hf # noqa +from ._model_builders import ( # noqa + falcon3_tokenizer, + falcon3_1b, + falcon3_3b, + falcon3_7b, + falcon3_10b, + lora_falcon3_1b, + lora_falcon3_3b, + lora_falcon3_7b, + lora_falcon3_10b +) +from ._positional_embeddings import Falcon3RotaryPositionalEmbeddings + +__all__ = [ + "falcon3", + "falcon3_1b", + "falcon3_3b", + "falcon3_7b", + "falcon3_10b", + "falcon3_hf_to_tune", + "falcon3_tune_to_hf", + "lora_falcon3", + "lora_falcon3_1b", + "lora_falcon3_3b", + "lora_falcon3_7b", + "lora_falcon3_10b", + "Falcon3Tokenizer", + "falcon3_tokenizer", + "Falcon3RotaryPositionalEmbeddings", +] + + diff --git a/torchtune/models/falcon3/_component_builders.py b/torchtune/models/falcon3/_component_builders.py new file mode 100644 index 0000000000..21eec709e8 --- /dev/null +++ b/torchtune/models/falcon3/_component_builders.py @@ -0,0 +1,447 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +from typing import List +from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook + +from torch import nn +from torchtune.modules.transformer import TransformerDecoder +from torchtune.models.falcon3._positional_embeddings import Falcon3RotaryPositionalEmbeddings + +from torchtune.modules import ( + MultiHeadAttention, + FeedForward, + RMSNorm, + TransformerSelfAttentionLayer, + TiedLinear +) + + +from torchtune.modules.peft import DoRALinear, LORA_ATTN_MODULES, LoRALinear + +""" +Component builders for the Falcon3 model and popular variants such as LoRA. + +torchtune provides composable building blocks. Builder functions help +stitch these building blocks into higher-level components. This design has +two benefits: +- The building blocks themselves are very flexible. For example, ``MultiHeadAttention`` +can take either nn.Linear or nn.LoRALinear for ``q_proj``. +- Builder functions expose a set of configurable params which keep the constructors of +the building blocks simple. 
+""" + + +def falcon3( + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-06, + rope_base: float = 1_000_042, + tie_word_embeddings: bool = False, +) -> TransformerDecoder: + """ + Build the decoder associated with the Falcon3 model. This includes: + - Token embeddings + - num_layers number of TransformerSelfAttentionLayer blocks + - RMS Norm layer applied to the output of the transformer + - Final projection into token space + + Args: + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. + + Returns: + TransformerDecoder: Instantiation of Falcon3 model. + """ + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + + rope = Falcon3RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = MultiHeadAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False), + k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + output_proj=nn.Linear(embed_dim, embed_dim, bias=False), + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + mlp = falcon3_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + layer = TransformerSelfAttentionLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + if tie_word_embeddings: + output_proj = TiedLinear(tok_embeddings) + else: + output_proj = nn.Linear(embed_dim, vocab_size, bias=False) + return TransformerDecoder( + tok_embeddings=tok_embeddings, + layers=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + +def falcon3_mlp(dim: int, hidden_dim: int) -> FeedForward: + """ + Build the MLP layer associated with the Falcon3 model. 
+ """ + gate_proj = nn.Linear(dim, hidden_dim, bias=False) + down_proj = nn.Linear(hidden_dim, dim, bias=False) + up_proj = nn.Linear(dim, hidden_dim, bias=False) + return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj) + +# ------------------ LoRA Falcon3 ------------------ + + +def lora_falcon3( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + *, + # falcon3 args + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-06, + rope_base: float = 1_000_042, + tie_word_embeddings: bool = False, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + # Quantization args + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Return a version of Falcon3 (an instance of :func:`~torchtune.models.falcon3.transformer.Falcon3TransformerDecoder`) + with LoRA applied based on the passed in configuration. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base + weights within linear layers LoRA is applied to. The final output linear projection is not + supported for quantization currently. + + Returns: + TransformerDecoder: Instantiation of Falcon3 model with LoRA applied to + a subset of the attention projections in each layer. + + Raises: + ValueError: if ``apply_lora_to_output`` and ``tie_word_embeddings``. 
+ + """ + + self_attn = lora_falcon3_self_attention( + lora_modules=lora_attn_modules, + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + rope_base=rope_base, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + if apply_lora_to_mlp: + mlp = lora_falcon3_mlp( + dim=embed_dim, + hidden_dim=intermediate_dim, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + quantize_base=quantize_base, + use_dora=use_dora, + lora_dropout=lora_dropout, + ) + else: + mlp = falcon3_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + + layer = TransformerSelfAttentionLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + + if tie_word_embeddings: + if apply_lora_to_output: + raise ValueError( + "apply_lora_to_output is incompatible with tie_word_embeddings," + " as there would be no output to apply lora to!" + ) + output_proj = TiedLinear(tok_embeddings) + else: + # TODO: quantize_base is not applied to final output_proj currently. + adapter_cls = DoRALinear if use_dora else LoRALinear + output_proj = ( + adapter_cls(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha, dropout=lora_dropout) + if apply_lora_to_output + else nn.Linear(embed_dim, vocab_size, bias=False) + ) + model = TransformerDecoder( + tok_embeddings=tok_embeddings, + layers=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + if quantize_base: + # For QLoRA, we reparametrize 4-bit tensors to higher precision, and offload to CPU on the fly + # so as to not increase peak memory + model._register_state_dict_hook( + partial( + reparametrize_as_dtype_state_dict_post_hook, + # TODO this is clowny, figure out a better way to get what precision the rest + # of the model is in + dtype=tok_embeddings.weight.dtype, + offload_to_cpu=True, + ) + ) + + return model + + +def lora_falcon3_self_attention( + lora_modules: List[LORA_ATTN_MODULES], + *, + # MultiHeadAttention args + embed_dim: int, + num_heads: int, + num_kv_heads: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: float = 1_000_042, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> MultiHeadAttention: + """ + Return an instance of :func:`~torchtune.modules.MultiHeadAttention` with LoRA + applied to a subset of its linear layers + + Args: + lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj", + "output_proj"}``. + embed_dim (int): embedding dimension for self-attention + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. 
+ Default: 0.0 + rope_base (float): the base period of the RoPE embeddings. Default: 1_000_000.0 + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base (bool): Whether to quantize base model parameters for linear layers + LoRA is being applied to. Default is ``False``. + + Returns: + MultiHeadAttention: instantiation of self-attention module with LoRA + applied to a subset of Q, K, V, output projections. + + Raises: + ValueError: If lora_modules arg is an empty list + """ + if not lora_modules: + raise ValueError(f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules") + + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + adapter_cls = DoRALinear if use_dora else LoRALinear + q_proj = ( + adapter_cls( + embed_dim, + num_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "q_proj" in lora_modules + else nn.Linear(embed_dim, num_heads * head_dim, bias=False) + ) + k_proj = ( + adapter_cls( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "k_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + v_proj = ( + adapter_cls( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=False, + quantize_base=quantize_base, + ) + if "v_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + output_proj = ( + adapter_cls( + embed_dim, + embed_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + if "output_proj" in lora_modules + else nn.Linear(embed_dim, embed_dim, bias=False) + ) + rope = Falcon3RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = MultiHeadAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=q_proj, + k_proj=k_proj, + v_proj=v_proj, + output_proj=output_proj, + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + return self_attn + + +def lora_falcon3_mlp( + *, + dim: int, + hidden_dim: int, + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> FeedForward: + adapter_cls = DoRALinear if use_dora else LoRALinear + gate_proj = adapter_cls( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + down_proj = adapter_cls( + in_dim=hidden_dim, + out_dim=dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + up_proj = adapter_cls( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + return FeedForward( + gate_proj=gate_proj, + down_proj=down_proj, + up_proj=up_proj, + ) diff --git a/torchtune/models/falcon3/_convert_weights.py b/torchtune/models/falcon3/_convert_weights.py new file mode 100644 index 0000000000..d6d6122f5c --- /dev/null +++ b/torchtune/models/falcon3/_convert_weights.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch + +from torchtune.models.convert_weights import get_mapped_key + +# state dict key mappings from HF's format to torchtune's format +_FROM_HF = { + "model.embed_tokens.weight": "tok_embeddings.weight", + "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attn.q_proj.weight", + "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attn.k_proj.weight", + "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attn.v_proj.weight", + "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attn.output_proj.weight", + "model.layers.{}.mlp.gate_proj.weight": "layers.{}.mlp.w1.weight", + "model.layers.{}.mlp.up_proj.weight": "layers.{}.mlp.w3.weight", + "model.layers.{}.mlp.down_proj.weight": "layers.{}.mlp.w2.weight", + "model.layers.{}.input_layernorm.weight": "layers.{}.sa_norm.scale", + "model.layers.{}.post_attention_layernorm.weight": "layers.{}.mlp_norm.scale", + "model.norm.weight": "norm.scale", + "lm_head.weight": "output.weight", +} + + +FALCON3_TIED_KEY = "lm_head.weight" + + +def falcon3_hf_to_tune( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 12, + num_kv_heads: int = 4, + dim: int = 3072, + head_dim: int = None, + tie_word_embeddings: bool = False, +) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from HF's format to TorchTune's format, which contains the weights + of a Falcon3 model. + State dicts from multiple checkpoint files should be consolidated into a single state dict + before calling this function. + The logic is identical to :func:`~torchtune.models.convert_weights.hf_to_tune`, but may not load + output projection weights. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in HF's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. If not provided, it will be calculated + as dim // num_heads. + tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in torchtune's format. + """ + converted_state_dict = {} + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + if ( + tie_word_embeddings and FALCON3_TIED_KEY in key + ): # Skip loading the output projection weights + continue + if "rotary_emb.inv_freq" in key: # Skip loading the position embeddings + continue + + new_key = get_mapped_key(key, _FROM_HF) + converted_state_dict[new_key] = value + return converted_state_dict + + +def falcon3_tune_to_hf( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 12, + num_kv_heads: int = 4, + dim: int = 3072, + head_dim: int = None, + tie_word_embeddings: bool = False, +): + """ + Convert a state dict from torchtune's format to HF's format. This function + doesn't handle any sharding or splitting of state dicts. It follows the + state_dict IN -> state_dict OUT pattern. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. If not provided, it will be calculated + as dim // num_heads. 
+ tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in HF's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _FROM_HF.items()} + + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + new_key = get_mapped_key(key, inverted_mapping_dict) + converted_state_dict[new_key] = value + + return converted_state_dict diff --git a/torchtune/models/falcon3/_model_builders.py b/torchtune/models/falcon3/_model_builders.py new file mode 100644 index 0000000000..acf224920a --- /dev/null +++ b/torchtune/models/falcon3/_model_builders.py @@ -0,0 +1,375 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional + +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.falcon3._component_builders import lora_falcon3, falcon3 +from torchtune.models.falcon3._tokenizer import FALCON3_SPECIAL_TOKENS, Falcon3Tokenizer +from torchtune.modules import TransformerDecoder +from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.tokenizers import parse_hf_tokenizer_json + +""" +Model builders build specific instantiations using component builders. For example +the falcon3 model builder uses the falcon3 component builder to create the +falcon3 model series. +""" +def falcon3_tokenizer( + path: str, + merges_file: str = None, + special_tokens_path: Optional[str] = None, + max_seq_len: Optional[int] = None, + prompt_template: Optional[_TemplateType] = None, + **kwargs, +) -> Falcon3Tokenizer: + """ + Tokenizer for Falcon3. + + Args: + path (str): path to the vocab.json file. + special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face + model files that contains all registered special tokens, or a local json file + structured similarly. Default is None to use the canonical Falcon3 special tokens. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[_TemplateType]): optional specified prompt template. + If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` + class. If a dictionary, it is assumed to be a custom prompt template mapping role to the + prepend/append tags. Default is None. 
+ + Returns: + Falcon3Tokenizer: Instantiation of the Falcon3 tokenizer + """ + special_tokens = ( + parse_hf_tokenizer_json(special_tokens_path) + if special_tokens_path is not None + else FALCON3_SPECIAL_TOKENS + ) + template = ( + _get_prompt_template(prompt_template) if prompt_template is not None else None + ) + return Falcon3Tokenizer( + path=path, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=template, + **kwargs, + ) + + +''' +Models +''' +def falcon3_10b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 10B parameter values + from https://huggingface.co/tiiuae/Falcon3-10B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 10B model + """ + return falcon3( + vocab_size=131072, + num_layers=40, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + +def falcon3_7b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 7B parameter values + from https://huggingface.co/tiiuae/Falcon3-7B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model + """ + return falcon3( + vocab_size=131072, + num_layers=28, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + + +def falcon3_3b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 3B parameter values + from https://huggingface.co/tiiuae/Falcon3-3B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 3B model + """ + return falcon3( + vocab_size=131072, + num_layers=22, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=9216, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) + + +def falcon3_1b() -> TransformerDecoder: + """ + Builder for creating a Falcon3 model initialized w/ the default 1B parameter values + from https://huggingface.co/tiiuae/Falcon3-1B-Base + + Returns: + TransformerDecoder: Instantiation of Falcon3 1B model + """ + return falcon3( + vocab_size=131072, + num_layers=18, + num_heads=8, + num_kv_heads=4, + embed_dim=2048, + intermediate_dim=8192, + max_seq_len=4096, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + ) +''' +LoRA +''' + +def lora_falcon3_10b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 10B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_10b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. 
+ Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=40, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_7b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=28, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=23040, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_3b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. 
Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=22, + num_heads=12, + num_kv_heads=4, + embed_dim=3072, + intermediate_dim=9216, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_falcon3_1b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Falcon3 7B model with LoRA enabled. + + The Falcon3 defaults are the same as in :func:`~torchtune.models.falcon3.falcon3_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Falcon3 7B model with LoRA applied + """ + return lora_falcon3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=131072, + num_layers=18, + num_heads=8, + num_kv_heads=4, + embed_dim=2048, + intermediate_dim=8192, + max_seq_len=4096, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1_000_042, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) diff --git a/torchtune/models/falcon3/_positional_embeddings.py b/torchtune/models/falcon3/_positional_embeddings.py new file mode 100644 index 0000000000..12b9303571 --- /dev/null +++ b/torchtune/models/falcon3/_positional_embeddings.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import torch + +from torch import nn + + +class Falcon3RotaryPositionalEmbeddings(nn.Module): + """ + RoPE Embeddings used in the Falcon3 model. + Ref: https://huggingface.co/Qwen/Qwen2-7B-Instruct + + This class is not numerically equivalent to the RoPE Embedding module + used by Llama2 and Llama3. + + Args: + dim (int): Embedding dimension. This is usually set to the dim of each + head in the attention module computed as ``embed_dim`` // ``num_heads`` + max_seq_len (int): Maximum expected sequence length for the + model, if exceeded the cached freqs will be recomputed + base (float): The base for the geometric progression used to compute + the rotation angles + """ + + def __init__( + self, + dim: int, + max_seq_len: int = 8192, + base: float = 1_000_042, + ) -> None: + super().__init__() + self.dim = dim + self.base = base + self.max_seq_len = max_seq_len + self.rope_init() + + def rope_init(self): + theta = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim) + ) + self.register_buffer("theta", theta, persistent=False) + self.build_rope_cache(self.max_seq_len) + + def build_rope_cache(self, max_seq_len: int = 8192) -> None: + # Create position indexes `[0, 1, ..., max_seq_len - 1]` + seq_idx = torch.arange( + max_seq_len, dtype=self.theta.dtype, device=self.theta.device + ) + + # Outer product of theta and position index; output tensor has + # a shape of [max_seq_len, dim // 2] + idx_theta = torch.einsum("i, j -> ij", seq_idx, self.theta).float() + + # We cache the cos and sin embeddings instead of the IDs. This helps + # ensure we have correct behavior when training with bf16 + # Size: [max_seq_len, (dim * 2)] + freqs = torch.cat([idx_theta, idx_theta], dim=-1) + cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1) + self.register_buffer("cache", cache, persistent=False) + + def forward( + self, x: torch.Tensor, input_pos: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """ + Args: + x (torch.Tensor): input tensor with shape + [b, s, n_h, h_d] + input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids + of each token. During training, this is used to indicate the positions + of each token relative to its sample when packed, shape [b, s]. + During inference, this indicates the position of the current token. + If none, assume the index of the token is its position id. Default is None. + + Returns: + Tensor: output tensor with RoPE applied + + Notation used for tensor shapes: + - b: batch size + - s: sequence length + - n_h: num heads + - h_d: head dim + + TODO: The implementation below can be made more efficient + for inference. + """ + # input tensor has shape [b, s, n_h, h_d] + seq_len = x.size(1) + head_dim = x.size(-1) + + # extract the values based on whether input_pos is set or not. 
When + # input_pos is provided, we're in inference mode + rope_cache = ( + self.cache[:seq_len] if input_pos is None else self.cache[input_pos] + ) + + # reshape the cache for broadcasting + # tensor has shape [b, s, 1, h_d * 2] if packed samples, + # otherwise has shape [1, s, 1, h_d * 2] + rope_cache = rope_cache.view(-1, seq_len, 1, head_dim * 2) + + # [b, s, 1, h_d] + cos = rope_cache[..., :head_dim].to(x.dtype) + sin = rope_cache[..., head_dim:].to(x.dtype) + + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + rotated = torch.cat((-x2, x1), dim=-1) + + # cos: [b, s, 1, h_d] + # x: [b, s, n_h, h_d] + x_out = (x * cos) + (rotated * sin) + return x_out.type_as(x) diff --git a/torchtune/models/falcon3/_tokenizer.py b/torchtune/models/falcon3/_tokenizer.py new file mode 100644 index 0000000000..a5fc871cd0 --- /dev/null +++ b/torchtune/models/falcon3/_tokenizer.py @@ -0,0 +1,436 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import json +import unicodedata +from functools import lru_cache +from typing import Any, Dict, List, Mapping, Optional, Tuple + +import regex as re + +from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate +from torchtune.modules.tokenizers import ModelTokenizer + +PRETOKENIZE_REGEX = ( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" +) +FALCON3_SPECIAL_TOKENS = { + ">>TITLE<<": 0, + ">>ABSTRACT<<": 1, + ">>INTRODUCTION<<": 2, + ">>SUMMARY<<": 3, + ">>COMMENT<<": 4, + ">>ANSWER<<": 5, + ">>QUESTION<<": 6, + ">>DOMAIN<<": 7, + ">>EMAIL_ADDRESS<<": 8, + ">>IP_ADDRESS<<": 9, + "<|startoftext|>": 10, + "<|endoftext|>": 11, + ">>IP_ADDRESS_0<<": 14, + ">>IP_ADDRESS_1<<": 15, + ">>IP_ADDRESS_2<<": 16, + ">>IP_ADDRESS_3<<": 17, + ">>IP_ADDRESS_4<<": 18, + ">>IP_ADDRESS_5<<": 19, + ">>IP_ADDRESS_6<<": 20, + ">>IP_ADDRESS_7<<": 21, + ">>IP_ADDRESS_8<<": 22, + ">>IP_ADDRESS_9<<": 23, + ">>PASSWORD<<": 24, + ">>KEY<<": 25 +} + +ENDOFTEXT = "<|endoftext|>" + +DEFAULT_FALCON3_TOKENIZER_BPE_CACHE_SIZE = 152064 + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoid mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Falcon3Tokenizer(ModelTokenizer): + """This class construct a Falcon3 tokenizer, based on GPT-2 byte-level BPE tokenization. + + See . 
+ + Args: + path (str): Path to tokenizer.json file. + special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is FALCON3_SPECIAL_TOKENS. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used + to add structured text around the actual messages. The structured text is used in three scenarios: + + - Task-specific templates to gear models for a particular task that it will expect after training + - Model-specific templates that are required whenever the model is prompted, such as the [INST] + tags in Llama2 and in Mistral + - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate` + + The extra text will still get tokenized as normal text, not as special tokens. + Default: None + errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace". + See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted + to an ID and is set to be this token instead. Defaults to None. + bos_token (Optional[str]): The beginning of sequence token. Defaults to None. + eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``. + pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``. + bpe_cache_size (int): BPE token cache size in Falcon3Tokenizer. + NOTE: large cache size will speed up tokenization, but the cache object will get really + large for long running processes (esp. for texts of language that do not use space between + word, e.g. Chinese); technically not a memory leak but appears as one. + By default, we set the cache size equal to the size of the official Falcon3 tokenizer. 
+ + Example: + >>> tokenizer = Falcon3Tokenizer(path="/path/to/tokenizer.json", special_tokens=FALCON3_SPECIAL_TOKENS) + >>> tokenized_text = tokenizer.encode("Hello world!") + >>> print(tokenized_text) + [13955, 3139, 2024] + """ + + def __init__( + self, + path: str, + special_tokens: Dict[str, int] = FALCON3_SPECIAL_TOKENS, + max_seq_len: Optional[int] = None, + *, + prompt_template: Optional[PromptTemplate] = None, + errors: str = "replace", + unk_token: Optional[str] = None, + bos_token: Optional[str] = None, + eos_token: str = ENDOFTEXT, + pad_token: Optional[str] = ENDOFTEXT, + bpe_cache_size: int = DEFAULT_FALCON3_TOKENIZER_BPE_CACHE_SIZE, + ): + with open(path, encoding="utf-8") as tokenizer_file: + tok_dict = json.load(tokenizer_file) + + self.encoder = tok_dict['model']['vocab'] + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + merges_handle = tok_dict['model']['merges'] + for i, line in enumerate(merges_handle): + # line = line.strip() + bpe_merges.append(tuple(line)) + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + + self._bpe = lru_cache(maxsize=bpe_cache_size)(self._bpe_without_cache) + + self.pat = re.compile(PRETOKENIZE_REGEX) + + self.special_tokens = special_tokens + self._special_tokens_reversed = {v: k for k, v in self.special_tokens.items()} + + self.unk_id = None if unk_token is None else self.special_tokens[unk_token] + self.bos_id = None if bos_token is None else self.special_tokens[bos_token] + self.eos_id = None if eos_token is None else self.special_tokens[eos_token] + self.pad_id = None if pad_token is None else self.special_tokens[pad_token] + + # Pattern for special tokens. + self._pattern_split_special_tokens = re.compile(r"(" + "|".join(special_tokens.keys()) + r")") + + self.max_seq_len = max_seq_len + + self.prompt_template = prompt_template + + def _bpe_without_cache(self, token): + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self._bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.unk_id) + + def encode( + self, text: str, add_bos: bool = True, add_eos: bool = True + ) -> List[int]: + """ + Encode a string into a list of token ids. + + Args: + text (str): The string to encode. 
+ add_bos (bool): (Optional) Whether to add the beginning of sequence token. + add_eos (bool): (Optional) Whether to add the end of sequence token. + + Returns: + List[int]: The list of token ids. + """ + + text = unicodedata.normalize("NFC", text) + + tokens = self._pattern_split_special_tokens.split(text) + + tokenized_text = [] + for token in tokens: + if not token: + continue + if token in self.special_tokens: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + + # Convert tokenized text to token ids. + token_ids = [] + if add_bos and self.bos_id is not None: + token_ids.append(self.bos_id) + for token in tokenized_text: + if token in self.special_tokens: + token_id = self.special_tokens[token] + else: + token_id = self._convert_token_to_id(token) + token_ids.append(token_id) + if add_eos and self.eos_id is not None: + token_ids.append(self.eos_id) + + return token_ids + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) to a token (str) using the vocab.""" + token = self._special_tokens_reversed.get(index, None) + if token is None: + return self.decoder.get(index) + return token + + def _convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens (strings) into a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + "utf-8", errors=self.errors + ) + return text + + def decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + ) -> str: + """ + Decode a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids. + skip_special_tokens (bool): Whether the special tokens should be removed from the decoded string. + + Returns: + str: The decoded string. + """ + sub_texts = [] + current_sub_text = [] + for token_id in token_ids: + token = self._convert_id_to_token(token_id) + if token_id in self._special_tokens_reversed: + if current_sub_text: + string = self._convert_tokens_to_string(current_sub_text) + if string: + sub_texts.append(string) + current_sub_text = [] + if not skip_special_tokens: + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self._convert_tokens_to_string(current_sub_text)) + + text = "".join(sub_texts) + return text + + def tokenize_messages( + self, + messages: List[Message], + *, + add_eos: bool = True, + ) -> Tuple[List[int], List[bool]]: + """ + Given a list of messages, return a list of tokens for the concatenated + and formatted messages. + + Args: + messages (List[Message]): The message list to tokenize. + add_eos (bool): Whether to add the tokenizer's eos_id at the end of the + sequence of messages. Default is True. + + Returns: + Tuple[List[int], List[bool]]: The list of token ids and the list of masks. + + Raises: + RuntimeError: If a message contains non-text content + """ + assert not isinstance(self.prompt_template, ChatMLTemplate), ( + "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message." + "Please use a different template or set to None." + ) + templated_messages = ( + self.prompt_template(messages) + if self.prompt_template is not None + else messages + ) + + tokenized_messages = [] + mask = [] + for index, message in enumerate(templated_messages): + tokens = [] + + # message header + if message.role != "ipython": + # tokens.append(self.im_start_id) + tokens.extend( + self.encode(f"{message.role}\n", add_bos=False, add_eos=False) + ) + + # message content + for item in message.content: + if item["type"] == "text": + tokens.extend( + self.encode( + item["content"], + add_bos=False, + add_eos=False, + ) + ) + else: + raise RuntimeError( + f"Unsupported message content type: {item['type']}" + ) + + # message footer + if message.role != "ipython" and ( + message.role != "assistant" or index != len(messages) - 1 + ): + # tokens.append(self.im_end_id) + tokens.extend(self.encode("\n", add_bos=False, add_eos=False)) + + tokenized_messages.extend(tokens) + mask.extend([message.masked] * len(tokens)) + + # Break out early if we reach max_seq_len + if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len: + break + + # Add the End-Of-Sequence token + if add_eos: + tokenized_messages.append(self.eos_id) + mask.append(mask[-1]) + + # Finally, truncate if necessary + if self.max_seq_len: + tokenized_messages = truncate( + tokenized_messages, self.max_seq_len, self.eos_id if add_eos else None + ) + mask = truncate(mask, self.max_seq_len, True if add_eos else None) + + return tokenized_messages, mask + + def __call__( + self, sample: Mapping[str, Any], inference: bool = False + ) -> Mapping[str, Any]: + """ + Apply ``tokenize_messages`` to the "messages" field in the sample. + + Args: + sample (Mapping[str, Any]): A sample with a "messages" field containing + a List[Message] to tokenize + inference (bool): Whether the template is being used for inference or not. + + Returns: + Mapping[str, Any]: The sample with added "tokens" and "mask" fields + and the "messages" field removed. 
+ """ + messages = sample.pop("messages") + tokens, mask = self.tokenize_messages(messages) + sample["tokens"] = tokens + sample["mask"] = mask + return sample diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index a5d72af320..df93535dca 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -553,6 +553,16 @@ def load_checkpoint(self) -> Dict[str, Any]: num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) + elif self._model_type == ModelType.FALCON3: + from torchtune.models.falcon3._convert_weights import falcon3_hf_to_tune + + converted_state_dict[training.MODEL_KEY] = falcon3_hf_to_tune( + merged_state_dict, + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) elif self._model_type == ModelType.QWEN2: from torchtune.models.qwen2._convert_weights import qwen2_hf_to_tune @@ -662,6 +672,16 @@ def save_checkpoint( num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) + elif self._model_type == ModelType.FALCON3: + from torchtune.models.falcon3._convert_weights import falcon3_tune_to_hf + + state_dict[training.MODEL_KEY] = falcon3_tune_to_hf( + state_dict[training.MODEL_KEY], + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) elif self._model_type == ModelType.QWEN2: from torchtune.models.qwen2._convert_weights import qwen2_tune_to_hf diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py index f8dc55452b..d38156599d 100644 --- a/torchtune/training/checkpointing/_utils.py +++ b/torchtune/training/checkpointing/_utils.py @@ -103,6 +103,7 @@ class ModelType(Enum): >>> state_dict = my_custom_state_dict_mapping(state_dict) """ + FALCON3: str = "falcon3" GEMMA: str = "gemma" GEMMA2: str = "gemma2" LLAMA2: str = "llama2"